/**
 * Compiler implementation of the
 * $(LINK2 http://www.dlang.org, D programming language).
 *
 * Copyright:   Copyright (C) 1995-1998 by Symantec
 *              Copyright (C) 2000-2021 by The D Language Foundation, All Rights Reserved
 * Authors:     $(LINK2 http://www.digitalmars.com, Walter Bright)
 * License:     $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 * Source:      $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/backend/cgsched.d, backend/cgsched.d)
 */

module dmd.backend.cgsched;

version (SCPP)
    version = COMPILE;
version (MARS)
    version = COMPILE;

version (COMPILE)
{

import core.stdc.stdio;
import core.stdc.stdlib;
import core.stdc.string;

import dmd.backend.cc;
import dmd.backend.cdef;
import dmd.backend.code;
import dmd.backend.code_x86;
import dmd.backend.dlist;
import dmd.backend.global;
import dmd.backend.mem;
import dmd.backend.ty;
import dmd.backend.barray;

extern (C++):

nothrow:

int REGSIZE();
code *gen1(code *c, uint op);
code *gen2(code *c, uint op, uint rm);

/// Returns: a word in which only bit number `m` is set.
private uint mask(uint m)
{
    return 1 << m;
}

// is32bitaddr works correctly only when x is 0 or 1.  This is
// true today for the current definition of I32, but if the definition
// of I32 changes, this function will need to change as well
//
// Note: even for linux targets, CFaddrsize can be set by the inline
// assembler.
/// Returns: true when compiling for 64 bits, or when `x` differs from
/// "an address-size prefix (CFaddrsize) is present in Iflags".
private bool is32bitaddr(bool x, uint Iflags)
{
    const bool addrsizePrefix = (Iflags & CFaddrsize) != 0;
    return I64 || (x != addrsizePrefix);
}

/// Returns: true if the Pentium Pro scheduler is to be used.
private bool PRO()
{
    return config.target_cpu >= TARGET_PentiumPro;
}

/// Kind of floating point operation recorded in Cinfo.fp_op (0 means none).
private enum FP : ubyte
{
    fstp = 1,       /// FSTP mem
    fld  = 2,       /// FLD mem
    fop  = 3,       /// Fop ST0,mem or Fop ST0
}

/// Bit flags stored in Cinfo.flags.
private enum CIFL : ubyte
{
    arraybounds = 1,    /// this instruction is a jmp to array bounds
    ea          = 2,    /// this instruction has a memory-referencing
                        /// modregrm EA byte
    nostage     = 4,    /// don't stage these instructions
    push        = 8,    /// it's a push we can swap around
}

/// Everything the scheduler gathers about one instruction.
struct Cinfo
{
    code *c;            // the instruction
    ubyte pair;         // pairing information
    ubyte sz;           // operand size
    ubyte isz;          // instruction size

    // For floating point scheduling
    ubyte fxch_pre;
    ubyte fxch_post;
    FP fp_op;           /// FPxxxx

    ubyte flags;        /// CIFLxxx

    uint r;             // read mask
    uint w;             // write mask
    uint a;             // registers used in addressing mode
    ubyte reg;          // reg field of modregrm byte
    ubyte uops;         // Pentium Pro micro-ops
    uint sibmodrm;      // (sib << 8) + mod__rm byte
    uint spadjust;      // if !=0, then amount ESP changes as a result of this
                        // instruction being executed
    int fpuadjust;      // if !=0, then amount FPU stack changes as a result
                        // of this instruction being executed

    /// Dump the fields to stdout; a null receiver prints "Cinfo 0".
    nothrow void print()
    {
        Cinfo *self = &this;

        // Checked before any field is touched, so a null receiver is reported
        // instead of dereferenced.
        if (self == null)
        {
            printf("Cinfo 0\n");
            return;
        }

        enum knownFlags = CIFL.arraybounds | CIFL.ea | CIFL.nostage | CIFL.push;

        printf("Cinfo %p: c %p, pair %x, sz %d, isz %d, flags - ",
               self, c, pair, sz, isz);
        if (flags & CIFL.arraybounds)
            printf("arraybounds,");
        if (flags & CIFL.ea)
            printf("ea,");
        if (flags & CIFL.nostage)
            printf("nostage,");
        if (flags & CIFL.push)
            printf("push,");
        if (flags & ~knownFlags)
            printf("bad flag,");

        printf("\n\tr %x w %x a %x reg %x uops %x sibmodrm %x spadjust %d\n",
               cast(int)r, cast(int)w, cast(int)a, reg, uops, sibmodrm, cast(int)spadjust);

        if (fp_op)
        {
            // indexed by fp_op - 1, i.e. in FP enum order
            __gshared const(char*)[3] fpops = ["fstp","fld","fop"];

            printf("\tfp_op %s, fxch_pre %x, fxch_post %x\n",
                   fpops[fp_op-1], fxch_pre, fxch_post);
        }
    }

}


/*****************************************
 * Do Pentium optimizations.
 * Params:
 *      pc      = code list of the block, rewritten in place
 *      scratch = scratch registers we can use
 */

private void cgsched_pentium(code **pc,regm_t scratch)
{
    if (config.target_scheduler < TARGET_80486)
        return;

    if (!I64)
        *pc = peephole(*pc,0);

    if (!I32)                   // forget about 16 bit code
        return;

    const bool plainPentium = config.target_cpu == TARGET_Pentium ||
                              config.target_cpu == TARGET_PentiumMMX;
    if (plainPentium)
        *pc = simpleops(*pc,scratch);
    *pc = schedule(*pc,0);
}

/************************************
 * Entry point.
 * Schedules the code of block `b` when optimizing for speed on a Pentium
 * or later target; inline-asm blocks are left untouched.
 */
public void cgsched_block(block* b)
{
    if (!(config.flags4 & CFG4speed) ||
        config.target_cpu < TARGET_Pentium ||
        b.BC == BCasm)
        return;

    // Start from every register and strip the ones the block already relies on.
    regm_t scratch = allregs;
    scratch &= ~(b.Bregcon.used | b.Bregcon.params | mfuncreg);
    scratch &= ~(b.Bregcon.immed.mval | b.Bregcon.cse.mval);

    cgsched_pentium(&b.Bcode,scratch);
}

/// Pentium pairing classes (entries of pentcycl, result of pair_class).
enum
{
    NP    = 0,          /// not pairable
    PU    = 1,          /// pairable in U only, never executed in V
    PV    = 2,          /// pairable in V only
    UV    = (PU|PV),    /// pairable in both U and V
    PE    = 4,          /// register contention exception
    PF    = 8,          /// flags contention exception
    FX    = 0x10,       /// pairable with FXCH instruction
}

/// Default pairing class for each one-byte opcode.
extern (D) private immutable ubyte[256] pentcycl =
[
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 0
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 8
        PU,PU,PU,PU,    PU,PU,NP,NP,    // 10
        PU,PU,PU,PU,    PU,PU,NP,NP,    // 18
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 20
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 28
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 30
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 38

        UV,UV,UV,UV,    UV,UV,UV,UV,    // 40
        UV,UV,UV,UV,    UV,UV,UV,UV,    // 48
        PE|UV,PE|UV,PE|UV,PE|UV,        PE|UV,PE|UV,PE|UV,PE|UV, // 50  PUSH reg
        PE|UV,PE|UV,PE|UV,PE|UV,        PE|UV,PE|UV,PE|UV,PE|UV, // 58  POP reg
        NP,NP,NP,NP,    NP,NP,NP,NP,    // 60
        PE|UV,NP,PE|UV,NP,      NP,NP,NP,NP,    // 68
        PV|PF,PV|PF,PV|PF,PV|PF,        PV|PF,PV|PF,PV|PF,PV|PF,        // 70   Jcc rel8
        PV|PF,PV|PF,PV|PF,PV|PF,        PV|PF,PV|PF,PV|PF,PV|PF,        // 78   Jcc rel8

        NP,NP,NP,NP,    NP,NP,NP,NP,    // 80
        UV,UV,UV,UV,    NP,UV,NP,NP,    // 88
        NP,NP,NP,NP,    NP,NP,NP,NP,    // 90
        NP,NP,NP,NP,    NP,NP,NP,NP,    // 98
        UV,UV,UV,UV,    NP,NP,NP,NP,    // A0
        UV,UV,NP,NP,    NP,NP,NP,NP,    // A8
        UV,UV,UV,UV,    UV,UV,UV,UV,    // B0
        UV,UV,UV,UV,    UV,UV,UV,UV,    // B8

        NP,NP,NP,NP,    NP,NP,NP,NP,    // C0
        NP,NP,NP,NP,    NP,NP,NP,NP,    // C8
        PU,PU,NP,NP,    NP,NP,NP,NP,    // D0
        FX,NP,FX,FX,    NP,NP,FX,NP,    // D8   all floating point
        NP,NP,NP,NP,    NP,NP,NP,NP,    // E0
        PE|PV,PV,NP,PV, NP,NP,NP,NP,    // E8
        NP,NP,NP,NP,    NP,NP,NP,NP,    // F0
        NP,NP,NP,NP,    NP,NP,NP,NP,    // F8
];

/********************************************
 * For each opcode, determine read [0] and written [1] masks.
 */

/// Pseudo-register bits OR'd into the read/write masks of the tables below,
/// alongside the ordinary register masks (mAX, mCX, ...).
enum
{
    EA   = 0x100000,    /// NOTE(review): undocumented in the source; presumably
                        /// "uses the modregrm effective address" - confirm in getinfo()
    R    = 0x200000,    /// register (reg of modregrm field)
    N    = 0x400000,    /// other things modified, not swappable
    B    = 0x800000,    /// it's a byte operation
    C    = 0x1000000,   /// floating point flags
    mMEM = 0x2000000,   /// memory
    S    = 0x4000000,   /// floating point stack
    F    = 0x8000000,   /// flags
}

/// Indexed by the one-byte opcode: [op][0] is the read mask, [op][1] the
/// written mask.  Each group of eight rows is headed by its first opcode.
extern (D) private immutable uint[2][256] oprw =
[
      // 00
      [ EA|R|B, F|EA|B  ],      // ADD
      [ EA|R,   F|EA    ],
      [ EA|R|B, F|R|B   ],
      [ EA|R,   F|R     ],
      [ mAX,    F|mAX   ],
      [ mAX,    F|mAX   ],
      [ N,      N       ],      // PUSH ES
      [ N,      N       ],      // POP  ES

      // 08
      [ EA|R|B, F|EA|B  ],      // OR
      [ EA|R,   F|EA    ],
      [ EA|R|B, F|R|B   ],
      [ EA|R,   F|R     ],
      [ mAX,    F|mAX   ],
      [ mAX,    F|mAX   ],
      [ N,      N       ],      // PUSH CS
      [ N,      N       ],      // 2 byte escape

      // 10
      [ F|EA|R|B,F|EA|B ],      // ADC
      [ F|EA|R, F|EA    ],
      [ F|EA|R|B,F|R|B  ],
      [ F|EA|R, F|R     ],
      [ F|mAX,  F|mAX   ],
      [ F|mAX,  F|mAX   ],
      [ N,      N       ],      // PUSH SS
      [ N,      N       ],      // POP  SS

      // 18
      [ F|EA|R|B,F|EA|B ],      // SBB
      [ F|EA|R, F|EA    ],
      [ F|EA|R|B,F|R|B  ],
      [ F|EA|R, F|R     ],
      [ F|mAX,  F|mAX   ],
      [ F|mAX,  F|mAX   ],
      [ N,      N       ],      // PUSH DS
      [ N,      N       ],      // POP  DS

      // 20
      [ EA|R|B, F|EA|B  ],      // AND
      [ EA|R,   F|EA    ],
      [ EA|R|B, F|R|B   ],
      [ EA|R,   F|R     ],
      [ mAX,    F|mAX   ],
      [ mAX,    F|mAX   ],
      [ N,      N       ],      // SEG ES
      [ F|mAX,  F|mAX   ],      // DAA

      // 28
      [ EA|R|B, F|EA|B  ],      // SUB
      [ EA|R,   F|EA    ],
      [ EA|R|B, F|R|B   ],
      [ EA|R,   F|R     ],
      [ mAX,    F|mAX   ],
      [ mAX,    F|mAX   ],
      [ N,      N       ],      // SEG CS
      [ F|mAX,  F|mAX   ],      // DAS

      // 30
      [ EA|R|B, F|EA|B  ],      // XOR
      [ EA|R,   F|EA    ],
      [ EA|R|B, F|R|B   ],
      [ EA|R,   F|R     ],
      [ mAX,    F|mAX   ],
      [ mAX,    F|mAX   ],
      [ N,      N       ],      // SEG SS
      [ F|mAX,  F|mAX   ],      // AAA

      // 38
      [ EA|R|B, F ],            // CMP
      [ EA|R,   F ],
      [ EA|R|B, F ],
      [ EA|R,   F ],
      [ mAX,    F ],            // CMP AL,imm8
      [ mAX,    F ],            // CMP EAX,imm16/32
      [ N,      N ],            // SEG DS
      [ N,      N ],            // AAS

      // 40
      [ mAX,    F|mAX ],        // INC EAX
      [ mCX,    F|mCX ],
      [ mDX,    F|mDX ],
      [ mBX,    F|mBX ],
      [ mSP,    F|mSP ],
      [ mBP,    F|mBP ],
      [ mSI,    F|mSI ],
      [ mDI,    F|mDI ],

      // 48
      [ mAX,    F|mAX ],        // DEC EAX
      [ mCX,    F|mCX ],
      [ mDX,    F|mDX ],
      [ mBX,    F|mBX ],
      [ mSP,    F|mSP ],
      [ mBP,    F|mBP ],
      [ mSI,    F|mSI ],
      [ mDI,    F|mDI ],

      // 50
      [ mAX|mSP,        mSP|mMEM ],             // PUSH EAX
      [ mCX|mSP,        mSP|mMEM ],
      [ mDX|mSP,        mSP|mMEM ],
      [ mBX|mSP,        mSP|mMEM ],
      [ mSP|mSP,        mSP|mMEM ],
      [ mBP|mSP,        mSP|mMEM ],
      [ mSI|mSP,        mSP|mMEM ],
      [ mDI|mSP,        mSP|mMEM ],

      // 58
      [ mSP|mMEM,       mAX|mSP ],              // POP EAX
      [ mSP|mMEM,       mCX|mSP ],
      [ mSP|mMEM,       mDX|mSP ],
      [ mSP|mMEM,       mBX|mSP ],
      [ mSP|mMEM,       mSP|mSP ],
      [ mSP|mMEM,       mBP|mSP ],
      [ mSP|mMEM,       mSI|mSP ],
      [ mSP|mMEM,       mDI|mSP ],

      // 60
      [ N,      N ],            // PUSHA
      [ N,      N ],            // POPA
      [ N,      N ],            // BOUND Gv,Ma
      [ N,      N ],            // ARPL  Ew,Rw
      [ N,      N ],            // SEG FS
      [ N,      N ],            // SEG GS
      [ N,      N ],            // operand size prefix
      [ N,      N ],            // address size prefix

      // 68
      [ mSP,    mSP|mMEM ],     // PUSH immed16/32
      [ EA,     F|R      ],     // IMUL Gv,Ev,lv
      [ mSP,    mSP|mMEM ],     // PUSH immed8
      [ EA,     F|R      ],     // IMUL Gv,Ev,lb
      [ N,      N        ],     // INSB Yb,DX
      [ N,      N        ],     // INSW/D Yv,DX
      [ N,      N        ],     // OUTSB DX,Xb
      [ N,      N        ],     // OUTSW/D DX,Xv

      // 70 (the conditional jumps read the flags)
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],

      // 78
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],

      // 80 (getinfo() replaces the 0x80/0x81/0x83 rows with grprw entries)
      [ N,      N    ],
      [ N,      N    ],
      [ N,      N    ],
      [ N,      N    ],
      [ EA|R,   F    ],         // TEST EA,r8
      [ EA|R,   F    ],         // TEST EA,r16/32
      [ EA|R,   EA|R ],         // XCHG EA,r8
      [ EA|R,   EA|R ],         // XCHG EA,r16/32

      // 88
      [ R|B,            EA|B ], // MOV EA8,r8
      [ R,              EA ],   // MOV EA,r16/32
      [ EA|B,           R|B ],  // MOV r8,EA8
      [ EA,             R ],    // MOV r16/32,EA
      [ N,              N ],    // MOV EA,segreg
      [ EA,             R ],    // LEA r16/32,EA
      [ N,              N ],    // MOV segreg,EA
      [ mSP|mMEM,       EA|mSP ],       // POP mem16/32

      // 90
      [ 0,              0       ],      // NOP
      [ mAX|mCX,        mAX|mCX ],
      [ mAX|mDX,        mAX|mDX ],
      [ mAX|mBX,        mAX|mBX ],
      [ mAX|mSP,        mAX|mSP ],
      [ mAX|mBP,        mAX|mBP ],
      [ mAX|mSI,        mAX|mSI ],
      [ mAX|mDI,        mAX|mDI ],

      // 98
      [ mAX,            mAX      ],     // CBW
      [ mAX,            mDX      ],     // CWD
      [ N,              N|F      ],     // CALL far ptr
      [ N,              N        ],     // WAIT
      [ F|mSP,          mSP|mMEM ],     // PUSHF
      [ mSP|mMEM,       F|mSP    ],     // POPF
      [ mAX,            F        ],     // SAHF
      [ F,              mAX      ],     // LAHF

      // A0
      [ mMEM,           mAX  ],         // MOV AL,moffs8
      [ mMEM,           mAX  ],         // MOV EAX,moffs32
      [ mAX,            mMEM ],         // MOV moffs8,AL
      [ mAX,            mMEM ],         // MOV moffs32,EAX
      [ N,              N    ],         // MOVSB
      [ N,              N    ],         // MOVSW/D
      [ N,              N    ],         // CMPSB
      [ N,              N    ],         // CMPSW/D

      // A8
      [ mAX,    F ],                    // TEST AL,imm8
      [ mAX,    F ],                    // TEST AX,imm16
      [ N,      N ],                    // STOSB
      [ N,      N ],                    // STOSW/D
      [ N,      N ],                    // LODSB
      [ N,      N ],                    // LODSW/D
      [ N,      N ],                    // SCASB
      [ N,      N ],                    // SCASW/D

      // B0
      [ 0,      mAX ],                  // MOV AL,imm8
      [ 0,      mCX ],
      [ 0,      mDX ],
      [ 0,      mBX ],
      [ 0,      mAX ],
      [ 0,      mCX ],
      [ 0,      mDX ],
      [ 0,      mBX ],

      // B8
      [ 0,      mAX ],                  // MOV AX,imm16
      [ 0,      mCX ],
      [ 0,      mDX ],
      [ 0,      mBX ],
      [ 0,      mSP ],
      [ 0,      mBP ],
      [ 0,      mSI ],
      [ 0,      mDI ],

      // C0
      [ EA,     F|EA ],         // Shift Eb,Ib
      [ EA,     F|EA ],
      [ N,      N    ],
      [ N,      N    ],
      [ N,      N    ],
      [ N,      N    ],
      [ 0,      EA|B ],         // MOV EA8,imm8
      [ 0,      EA   ],         // MOV EA,imm16

      // C8
      [ N,      N ],            // ENTER
      [ N,      N ],            // LEAVE
      [ N,      N ],            // RETF lw
      [ N,      N ],            // RETF
      [ N,      N ],            // INT 3
      [ N,      N ],            // INT lb
      [ N,      N ],            // INTO
      [ N,      N ],            // IRET

      // D0
      [ EA,             F|EA  ],        // Shift EA,1
      [ EA,             F|EA  ],
      [ EA|mCX,         F|EA  ],        // Shift EA,CL
      [ EA|mCX,         F|EA  ],
      [ mAX,            F|mAX ],        // AAM
      [ mAX,            F|mAX ],        // AAD
      [ N,              N     ],        // reserved
      [ mAX|mBX|mMEM,   mAX   ],        // XLAT

      // D8 (floating point; getinfo() substitutes grpf1 entries when Irm < 0xC0)
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],

      // E0
      [ F|mCX|N,mCX|N ],        // LOOPNE jb
      [ F|mCX|N,mCX|N ],        // LOOPE  jb
      [ mCX|N,  mCX|N ],        // LOOP   jb
      [ mCX|N,  N     ],        // JCXZ   jb
      [ N,      N     ],        // IN AL,lb
      [ N,      N     ],        // IN EAX,lb
      [ N,      N     ],        // OUT lb,AL
      [ N,      N     ],        // OUT lb,EAX

      // E8
      [ N,              N|F   ],        // CALL jv
      [ N,              N     ],        // JMP Jv
      [ N,              N     ],        // JMP Ab
      [ N,              N     ],        // JMP jb
      [ N|mDX,          N|mAX ],        // IN AL,DX
      [ N|mDX,          N|mAX ],        // IN AX,DX
      [ N|mAX|mDX,N           ],        // OUT DX,AL
      [ N|mAX|mDX,N           ],        // OUT DX,AX

      // F0
      [ N,      N ],            // LOCK
      [ N,      N ],            // reserved
      [ N,      N ],            // REPNE
      [ N,      N ],            // REP,REPE
      [ N,      N ],            // HLT
      [ F,      F ],            // CMC
      [ N,      N ],
      [ N,      N ],

      // F8
      [ 0,      F    ],         // CLC
      [ 0,      F    ],         // STC
      [ N,      N    ],         // CLI
      [ N,      N    ],         // STI
      [ N,      N    ],         // CLD
      [ N,      N    ],         // STD
      [ EA,     F|EA ],         // INC/DEC
      [ N,      N    ],
];

/****************************************
 * Same thing, but for groups.
 */

// Layout: grprw[group][reg field of modregrm][0 = read mask, 1 = written mask].
// getinfo() uses group 0 for opcodes 0x80/0x81/0x83, group 1 for 0xF7,
// group 2 for 0xFF and group 3 for 0xF6.
// The type declares eight groups but only these four have initializers.
extern (D) private immutable uint[2][8][8] grprw =
[
    [
        // Grp 1
      [ EA,             F|EA ],         // ADD
      [ EA,             F|EA ],         // OR
      [ F|EA,           F|EA ],         // ADC
      [ F|EA,           F|EA ],         // SBB
      [ EA,             F|EA ],         // AND
      [ EA,             F|EA ],         // SUB
      [ EA,             F|EA ],         // XOR
      [ EA,             F    ],         // CMP
    ],
    [
        // Grp 3
      [ EA,             F ],            // TEST EA,imm
      [ N,              N ],            // reserved
      [ EA,             EA ],           // NOT
      [ EA,             F|EA ],         // NEG
      [ mAX|EA,         F|mAX|mDX ],    // MUL
      [ mAX|EA,         F|mAX|mDX ],    // IMUL
      [ mAX|mDX|EA,     F|mAX|mDX ],    // DIV

        // Could generate an exception we want to catch
        //mAX|mDX|EA|N,   F|mAX|mDX|N,    // IDIV

      [ mAX|mDX|EA,     F|mAX|mDX ],    // IDIV
    ],
    [
        // Grp 5
      [ EA,             F|EA ],         // INC Ev
      [ EA,             F|EA ],         // DEC Ev
      [ N|EA,           N ],            // CALL Ev
      [ N|EA,           N ],            // CALL eP
      [ N|EA,           N ],            // JMP Ev
      [ N|EA,           N ],            // JMP Ep
      [ mSP|EA,         mSP|mMEM ],     // PUSH Ev
      [ N,              N ],            // reserved
    ],
    [
        // Grp 3, byte version
      [ EA|B,           F ],            // TEST EA,imm
      [ N,              N ],            // reserved
      [ EA|B,           EA|B ],         // NOT
      [ EA|B,           F|EA|B ],       // NEG
      [ mAX|EA,         F|mAX ],        // MUL
      [ mAX|EA,         F|mAX ],        // IMUL
      [ mAX|EA,         F|mAX ],        // DIV

        // Could generate an exception we want to catch
        //mAX|EA|N,       F|mAX|N,        // IDIV

      [ mAX|EA,         F|mAX ],        // IDIV
    ]
];

/********************************************
 * For floating point opcodes 0xD8..0xDF, with Irm < 0xC0.
627 * [][][0] = read 628 * [1] = write 629 */ 630 631 extern (D) private immutable uint[2][8][8] grpf1 = 632 [ 633 [ 634 // 0xD8 635 [ EA|S, S|C ], // FADD float 636 [ EA|S, S|C ], // FMUL float 637 [ EA|S, C ], // FCOM float 638 [ EA|S, S|C ], // FCOMP float 639 [ EA|S, S|C ], // FSUB float 640 [ EA|S, S|C ], // FSUBR float 641 [ EA|S, S|C ], // FDIV float 642 [ EA|S, S|C ], // FDIVR float 643 ], 644 [ 645 // 0xD9 646 [ EA, S|C ], // FLD float 647 [ N, N ], // 648 [ S, EA|C ], // FST float 649 [ S, EA|S|C ], // FSTP float 650 [ N, N ], // FLDENV 651 [ N, N ], // FLDCW 652 [ N, N ], // FSTENV 653 [ N, N ], // FSTCW 654 ], 655 [ 656 // 0xDA 657 [ EA|S, S|C ], // FIADD long 658 [ EA|S, S|C ], // FIMUL long 659 [ EA|S, C ], // FICOM long 660 [ EA|S, S|C ], // FICOMP long 661 [ EA|S, S|C ], // FISUB long 662 [ EA|S, S|C ], // FISUBR long 663 [ EA|S, S|C ], // FIDIV long 664 [ EA|S, S|C ], // FIDIVR long 665 ], 666 [ 667 // 0xDB 668 [ EA, S|C ], // FILD long 669 [ S, EA|S|C ], // FISTTP int 670 [ S, EA|C ], // FIST long 671 [ S, EA|S|C ], // FISTP long 672 [ N, N ], // 673 [ EA, S|C ], // FLD real80 674 [ N, N ], // 675 [ S, EA|S|C ], // FSTP real80 676 ], 677 [ 678 // 0xDC 679 [ EA|S, S|C ], // FADD double 680 [ EA|S, S|C ], // FMUL double 681 [ EA|S, C ], // FCOM double 682 [ EA|S, S|C ], // FCOMP double 683 [ EA|S, S|C ], // FSUB double 684 [ EA|S, S|C ], // FSUBR double 685 [ EA|S, S|C ], // FDIV double 686 [ EA|S, S|C ], // FDIVR double 687 ], 688 [ 689 // 0xDD 690 [ EA, S|C ], // FLD double 691 [ S, EA|S|C ], // FISTTP long 692 [ S, EA|C ], // FST double 693 [ S, EA|S|C ], // FSTP double 694 [ N, N ], // FRSTOR 695 [ N, N ], // 696 [ N, N ], // FSAVE 697 [ C, EA ], // FSTSW 698 ], 699 [ 700 // 0xDE 701 [ EA|S, S|C ], // FIADD short 702 [ EA|S, S|C ], // FIMUL short 703 [ EA|S, C ], // FICOM short 704 [ EA|S, S|C ], // FICOMP short 705 [ EA|S, S|C ], // FISUB short 706 [ EA|S, S|C ], // FISUBR short 707 [ EA|S, S|C ], // FIDIV short 708 [ EA|S, S|C ], // FIDIVR 
short 709 ], 710 [ 711 // 0xDF 712 [ EA, S|C ], // FILD short 713 [ S, EA|S|C ], // FISTTP short 714 [ S, EA|C ], // FIST short 715 [ S, EA|S|C ], // FISTP short 716 [ EA, S|C ], // FBLD packed BCD 717 [ EA, S|C ], // FILD long long 718 [ S, EA|S|C ], // FBSTP packed BCD 719 [ S, EA|S|C ], // FISTP long long 720 ] 721 ]; 722 723 724 /******************************************** 725 * Micro-ops for floating point opcodes 0xD8..0xDF, with Irm < 0xC0. 726 */ 727 728 extern (D) private immutable ubyte[8][8] uopsgrpf1 = 729 [ 730 [ 731 // 0xD8 732 2, // FADD float 733 2, // FMUL float 734 2, // FCOM float 735 2, // FCOMP float 736 2, // FSUB float 737 2, // FSUBR float 738 2, // FDIV float 739 2, // FDIVR float 740 ], 741 [ 742 // 0xD9 743 1, // FLD float 744 0, // 745 2, // FST float 746 2, // FSTP float 747 5, // FLDENV 748 3, // FLDCW 749 5, // FSTENV 750 5, // FSTCW 751 ], 752 [ 753 // 0xDA 754 5, // FIADD long 755 5, // FIMUL long 756 5, // FICOM long 757 5, // FICOMP long 758 5, // FISUB long 759 5, // FISUBR long 760 5, // FIDIV long 761 5, // FIDIVR long 762 ], 763 [ 764 // 0xDB 765 4, // FILD long 766 0, // 767 4, // FIST long 768 4, // FISTP long 769 0, // 770 4, // FLD real80 771 0, // 772 5, // FSTP real80 773 ], 774 [ 775 // 0xDC 776 2, // FADD double 777 2, // FMUL double 778 2, // FCOM double 779 2, // FCOMP double 780 2, // FSUB double 781 2, // FSUBR double 782 2, // FDIV double 783 2, // FDIVR double 784 ], 785 [ 786 // 0xDD 787 1, // FLD double 788 0, // 789 2, // FST double 790 2, // FSTP double 791 5, // FRSTOR 792 0, // 793 5, // FSAVE 794 5, // FSTSW 795 ], 796 [ 797 // 0xDE 798 5, // FIADD short 799 5, // FIMUL short 800 5, // FICOM short 801 5, // FICOMP short 802 5, // FISUB short 803 5, // FISUBR short 804 5, // FIDIV short 805 5, // FIDIVR short 806 ], 807 [ 808 // 0xDF 809 4, // FILD short 810 0, // 811 4, // FIST short 812 4, // FISTP short 813 5, // FBLD packed BCD 814 4, // FILD long long 815 5, // FBSTP packed BCD 816 4, // FISTP long 
long 817 ] 818 ]; 819 820 /************************************************** 821 * Determine number of micro-ops for Pentium Pro and Pentium II processors. 822 * 0 means special case, 823 * 5 means 'complex' 824 */ 825 826 extern (D) private immutable ubyte[256] insuops = 827 [ 0,0,0,0, 1,1,4,5, /* 00 */ 828 0,0,0,0, 1,1,4,0, /* 08 */ 829 0,0,0,0, 2,2,4,5, /* 10 */ 830 0,0,0,0, 2,2,4,5, /* 18 */ 831 0,0,0,0, 1,1,0,1, /* 20 */ 832 0,0,0,0, 1,1,0,1, /* 28 */ 833 0,0,0,0, 1,1,0,1, /* 30 */ 834 0,0,0,0, 1,1,0,1, /* 38 */ 835 1,1,1,1, 1,1,1,1, /* 40 */ 836 1,1,1,1, 1,1,1,1, /* 48 */ 837 3,3,3,3, 3,3,3,3, /* 50 */ 838 2,2,2,2, 3,2,2,2, /* 58 */ 839 5,5,5,5, 0,0,0,0, /* 60 */ 840 3,3,0,0, 5,5,5,5, /* 68 */ 841 1,1,1,1, 1,1,1,1, /* 70 */ 842 1,1,1,1, 1,1,1,1, /* 78 */ 843 0,0,0,0, 0,0,0,0, /* 80 */ 844 0,0,0,0, 0,1,4,0, /* 88 */ 845 1,3,3,3, 3,3,3,3, /* 90 */ 846 1,1,5,0, 5,5,1,1, /* 98 */ 847 1,1,2,2, 5,5,5,5, /* A0 */ 848 1,1,3,3, 2,2,3,3, /* A8 */ 849 1,1,1,1, 1,1,1,1, /* B0 */ 850 1,1,1,1, 1,1,1,1, /* B8 */ 851 0,0,5,4, 0,0,0,0, /* C0 */ 852 5,3,5,5, 5,3,5,5, /* C8 */ 853 0,0,0,0, 4,3,0,2, /* D0 */ 854 0,0,0,0, 0,0,0,0, /* D8 */ 855 4,4,4,2, 5,5,5,5, /* E0 */ 856 4,1,5,1, 5,5,5,5, /* E8 */ 857 0,0,5,5, 5,1,0,0, /* F0 */ 858 1,1,5,5, 4,4,0,0, /* F8 */ 859 ]; 860 861 extern (D) private immutable ubyte[8] uopsx = [ 1,1,2,5,1,1,1,5 ]; 862 863 /************************************************ 864 * Determine number of micro-ops for Pentium Pro and Pentium II processors. 865 * 5 means 'complex'. 
866 * Doesn't currently handle: 867 * floating point 868 * MMX 869 * 0F opcodes 870 * prefix bytes 871 */ 872 873 private int uops(code *c) 874 { int n; 875 int op; 876 int op2; 877 878 op = c.Iop & 0xFF; 879 if ((c.Iop & 0xFF00) == 0x0F00) 880 op = 0x0F; 881 n = insuops[op]; 882 if (!n) // if special case 883 { ubyte irm,mod,reg,rm; 884 885 irm = c.Irm; 886 mod = (irm >> 6) & 3; 887 reg = (irm >> 3) & 7; 888 rm = irm & 7; 889 890 switch (op) 891 { 892 case 0x10: 893 case 0x11: // ADC rm,r 894 case 0x18: 895 case 0x19: // SBB rm,r 896 n = (mod == 3) ? 2 : 4; 897 break; 898 899 case 0x12: 900 case 0x13: // ADC r,rm 901 case 0x1A: 902 case 0x1B: // SBB r,rm 903 n = (mod == 3) ? 2 : 3; 904 break; 905 906 case 0x00: 907 case 0x01: // ADD rm,r 908 case 0x08: 909 case 0x09: // OR rm,r 910 case 0x20: 911 case 0x21: // AND rm,r 912 case 0x28: 913 case 0x29: // SUB rm,r 914 case 0x30: 915 case 0x31: // XOR rm,r 916 n = (mod == 3) ? 1 : 4; 917 break; 918 919 case 0x02: 920 case 0x03: // ADD r,rm 921 case 0x0A: 922 case 0x0B: // OR r,rm 923 case 0x22: 924 case 0x23: // AND r,rm 925 case 0x2A: 926 case 0x2B: // SUB r,rm 927 case 0x32: 928 case 0x33: // XOR r,rm 929 case 0x38: 930 case 0x39: // CMP rm,r 931 case 0x3A: 932 case 0x3B: // CMP r,rm 933 case 0x69: // IMUL rm,r,imm 934 case 0x6B: // IMUL rm,r,imm8 935 case 0x84: 936 case 0x85: // TEST rm,r 937 n = (mod == 3) ? 1 : 2; 938 break; 939 940 case 0x80: 941 case 0x81: 942 case 0x82: 943 case 0x83: 944 if (reg == 2 || reg == 3) // ADC/SBB rm,imm 945 n = (mod == 3) ? 2 : 4; 946 else if (reg == 7) // CMP rm,imm 947 n = (mod == 3) ? 1 : 2; 948 else 949 n = (mod == 3) ? 1 : 4; 950 break; 951 952 case 0x86: 953 case 0x87: // XCHG rm,r 954 n = (mod == 3) ? 3 : 5; 955 break; 956 957 case 0x88: 958 case 0x89: // MOV rm,r 959 n = (mod == 3) ? 1 : 2; 960 break; 961 962 case 0x8A: 963 case 0x8B: // MOV r,rm 964 n = 1; 965 break; 966 967 case 0x8C: // MOV Sreg,rm 968 n = (mod == 3) ? 
1 : 3; 969 break; 970 971 case 0x8F: 972 if (reg == 0) // POP m 973 n = 5; 974 break; 975 976 case 0xC6: 977 case 0xC7: 978 if (reg == 0) // MOV rm,imm 979 n = (mod == 3) ? 1 : 2; 980 break; 981 982 case 0xD0: 983 case 0xD1: 984 if (reg == 2 || reg == 3) // RCL/RCR rm,1 985 n = (mod == 3) ? 2 : 4; 986 else 987 n = (mod == 3) ? 1 : 4; 988 break; 989 990 case 0xC0: 991 case 0xC1: // RCL/RCR rm,imm8 992 case 0xD2: 993 case 0xD3: 994 if (reg == 2 || reg == 3) // RCL/RCR rm,CL 995 n = 5; 996 else 997 n = (mod == 3) ? 1 : 4; 998 break; 999 1000 case 0xD8: 1001 case 0xD9: 1002 case 0xDA: 1003 case 0xDB: 1004 case 0xDC: 1005 case 0xDD: 1006 case 0xDE: 1007 case 0xDF: 1008 // Floating point opcodes 1009 if (irm < 0xC0) 1010 { n = uopsgrpf1[op - 0xD8][reg]; 1011 break; 1012 } 1013 n = uopsx[op - 0xD8]; 1014 switch (op) 1015 { 1016 case 0xD9: 1017 switch (irm) 1018 { 1019 case 0xE0: // FCHS 1020 n = 3; 1021 break; 1022 case 0xE8: 1023 case 0xE9: 1024 case 0xEA: 1025 case 0xEB: 1026 case 0xEC: 1027 case 0xED: 1028 n = 2; 1029 break; 1030 case 0xF0: 1031 case 0xF1: 1032 case 0xF2: 1033 case 0xF3: 1034 case 0xF4: 1035 case 0xF5: 1036 case 0xF8: 1037 case 0xF9: 1038 case 0xFB: 1039 case 0xFC: 1040 case 0xFD: 1041 case 0xFE: 1042 case 0xFF: 1043 n = 5; 1044 break; 1045 1046 default: 1047 break; 1048 } 1049 break; 1050 case 0xDE: 1051 if (irm == 0xD9) // FCOMPP 1052 n = 2; 1053 break; 1054 1055 default: 1056 break; 1057 } 1058 break; 1059 1060 case 0xF6: 1061 if (reg == 6 || reg == 7) // DIV AL,rm8 1062 n = (mod == 3) ? 3 : 4; 1063 else if (reg == 4 || reg == 5 || reg == 0) // MUL/IMUL/TEST rm8 1064 n = (mod == 3) ? 1 : 2; 1065 else if (reg == 2 || reg == 3) // NOT/NEG rm 1066 n = (mod == 3) ? 1 : 4; 1067 break; 1068 1069 case 0xF7: 1070 if (reg == 6 || reg == 7) // DIV EAX,rm 1071 n = 4; 1072 else if (reg == 4 || reg == 5) // MUL/IMUL rm 1073 n = (mod == 3) ? 3 : 4; 1074 else if (reg == 2 || reg == 3) // NOT/NEG rm 1075 n = (mod == 3) ? 
1 : 4; 1076 break; 1077 1078 case 0xFF: 1079 if (reg == 2 || reg == 3 || // CALL rm, CALL m,rm 1080 reg == 5) // JMP seg:offset 1081 n = 5; 1082 else if (reg == 4) 1083 n = (mod == 3) ? 1 : 2; 1084 else if (reg == 0 || reg == 1) // INC/DEC rm 1085 n = (mod == 3) ? 1 : 4; 1086 else if (reg == 6) // PUSH rm 1087 n = (mod == 3) ? 3 : 4; 1088 break; 1089 1090 case 0x0F: 1091 op2 = c.Iop & 0xFF; 1092 if ((op2 & 0xF0) == 0x80) // Jcc 1093 { n = 1; 1094 break; 1095 } 1096 if ((op2 & 0xF0) == 0x90) // SETcc 1097 { n = (mod == 3) ? 1 : 3; 1098 break; 1099 } 1100 if (op2 == 0xB6 || op2 == 0xB7 || // MOVZX 1101 op2 == 0xBE || op2 == 0xBF) // MOVSX 1102 { n = 1; 1103 break; 1104 } 1105 if (op2 == 0xAF) // IMUL r,m 1106 { n = (mod == 3) ? 1 : 2; 1107 break; 1108 } 1109 break; 1110 1111 default: 1112 break; 1113 } 1114 } 1115 if (n == 0) 1116 n = 5; // copout for now 1117 return n; 1118 } 1119 1120 /****************************************** 1121 * Determine pairing classification. 1122 * Don't deal with floating point, just assume they are all NP (Not Pairable). 1123 * Returns: 1124 * NP,UV,PU,PV optionally OR'd with PE 1125 */ 1126 1127 private int pair_class(code *c) 1128 { ubyte op; 1129 ubyte irm,mod,reg,rm; 1130 uint a32; 1131 int pc; 1132 1133 // Of course, with Intel this is *never* simple, and Intel's 1134 // documentation is vague about the specifics. 
1135 1136 op = c.Iop & 0xFF; 1137 if ((c.Iop & 0xFF00) == 0x0F00) 1138 op = 0x0F; 1139 pc = pentcycl[op]; 1140 a32 = I32; 1141 if (c.Iflags & CFaddrsize) 1142 a32 ^= 1; 1143 irm = c.Irm; 1144 mod = (irm >> 6) & 3; 1145 reg = (irm >> 3) & 7; 1146 rm = irm & 7; 1147 switch (op) 1148 { 1149 case 0x0F: // 2 byte opcode 1150 if ((c.Iop & 0xF0) == 0x80) // if Jcc 1151 pc = PV | PF; 1152 break; 1153 1154 case 0x80: 1155 case 0x81: 1156 case 0x83: 1157 if (reg == 2 || // ADC EA,immed 1158 reg == 3) // SBB EA,immed 1159 { pc = PU; 1160 goto L2; 1161 } 1162 goto L1; // AND/OR/XOR/ADD/SUB/CMP EA,immed 1163 1164 case 0x84: 1165 case 0x85: // TEST EA,reg 1166 if (mod == 3) // TEST reg,reg 1167 pc = UV; 1168 break; 1169 1170 case 0xC0: 1171 case 0xC1: 1172 if (reg >= 4) 1173 pc = PU; 1174 break; 1175 1176 case 0xC6: 1177 case 0xC7: 1178 if (reg == 0) // MOV EA,immed 1179 { 1180 L1: 1181 pc = UV; 1182 L2: 1183 // if EA contains a displacement then 1184 // can't execute in V, or pair in U 1185 switch (mod) 1186 { case 0: 1187 if (a32) 1188 { if (rm == 5 || 1189 (rm == 4 && (c.Isib & 7) == 5) 1190 ) 1191 pc = NP; 1192 } 1193 else if (rm == 6) 1194 pc = NP; 1195 break; 1196 case 1: 1197 case 2: 1198 pc = NP; 1199 break; 1200 1201 default: 1202 break; 1203 } 1204 } 1205 break; 1206 1207 case 0xD9: 1208 if (irm < 0xC0) 1209 { 1210 if (reg == 0) 1211 pc = FX; 1212 } 1213 else if (irm < 0xC8) 1214 pc = FX; 1215 else if (irm < 0xD0) 1216 pc = PV; 1217 else 1218 { 1219 switch (irm) 1220 { 1221 case 0xE0: 1222 case 0xE1: 1223 case 0xE4: 1224 pc = FX; 1225 break; 1226 1227 default: 1228 break; 1229 } 1230 } 1231 break; 1232 1233 case 0xDB: 1234 if (irm < 0xC0 && (reg == 0 || reg == 5)) 1235 pc = FX; 1236 break; 1237 1238 case 0xDD: 1239 if (irm < 0xC0) 1240 { 1241 if (reg == 0) 1242 pc = FX; 1243 } 1244 else if (irm >= 0xE0 && irm < 0xF0) 1245 pc = FX; 1246 break; 1247 1248 case 0xDF: 1249 if (irm < 0xC0 && (reg == 0 || reg == 5)) 1250 pc = FX; 1251 break; 1252 1253 case 0xFE: 1254 if (reg 
== 0 || reg == 1) // INC/DEC EA 1255 pc = UV; 1256 break; 1257 case 0xFF: 1258 if (reg == 0 || reg == 1) // INC/DEC EA 1259 pc = UV; 1260 else if (reg == 2 || reg == 4) // CALL/JMP near ptr EA 1261 pc = PE|PV; 1262 else if (reg == 6 && mod == 3) // PUSH reg 1263 pc = PE | UV; 1264 break; 1265 1266 default: 1267 break; 1268 } 1269 if (c.Iflags & CFPREFIX && pc == UV) // if prefix byte 1270 pc = PU; 1271 return pc; 1272 } 1273 1274 /****************************************** 1275 * For an instruction, determine what is read 1276 * and what is written, and what is used for addressing. 1277 * Determine operand size if EA (larger is ok). 1278 */ 1279 1280 private void getinfo(Cinfo *ci,code *c) 1281 { 1282 memset(ci,0,Cinfo.sizeof); 1283 if (!c) 1284 return; 1285 ci.c = c; 1286 1287 if (PRO) 1288 { 1289 ci.uops = cast(ubyte)uops(c); 1290 ci.isz = cast(ubyte)calccodsize(c); 1291 } 1292 else 1293 ci.pair = cast(ubyte)pair_class(c); 1294 1295 ubyte op; 1296 ubyte op2; 1297 ubyte irm,mod,reg,rm; 1298 uint a32; 1299 int pc; 1300 uint r,w; 1301 int sz = I32 ? 
4 : 2; 1302 1303 ci.r = 0; 1304 ci.w = 0; 1305 ci.a = 0; 1306 op = c.Iop & 0xFF; 1307 if ((c.Iop & 0xFF00) == 0x0F00) 1308 op = 0x0F; 1309 //printf("\tgetinfo %x, op %x \n",c,op); 1310 pc = pentcycl[op]; 1311 a32 = I32; 1312 if (c.Iflags & CFaddrsize) 1313 a32 ^= 1; 1314 if (c.Iflags & CFopsize) 1315 sz ^= 2 | 4; 1316 irm = c.Irm; 1317 mod = (irm >> 6) & 3; 1318 reg = (irm >> 3) & 7; 1319 rm = irm & 7; 1320 1321 r = oprw[op][0]; 1322 w = oprw[op][1]; 1323 1324 switch (op) 1325 { 1326 case 0x50: 1327 case 0x51: 1328 case 0x52: 1329 case 0x53: 1330 case 0x55: 1331 case 0x56: 1332 case 0x57: // PUSH reg 1333 ci.flags |= CIFL.push; 1334 goto Lpush; 1335 1336 case 0x54: // PUSH ESP 1337 case 0x6A: // PUSH imm8 1338 case 0x68: // PUSH imm 1339 case 0x0E: 1340 case 0x16: 1341 case 0x1E: 1342 case 0x06: 1343 case 0x9C: 1344 Lpush: 1345 ci.spadjust = -sz; 1346 ci.a |= mSP; 1347 break; 1348 1349 case 0x58: 1350 case 0x59: 1351 case 0x5A: 1352 case 0x5B: 1353 case 0x5C: 1354 case 0x5D: 1355 case 0x5E: 1356 case 0x5F: // POP reg 1357 case 0x1F: 1358 case 0x07: 1359 case 0x17: 1360 case 0x9D: // POPF 1361 Lpop: 1362 ci.spadjust = sz; 1363 ci.a |= mSP; 1364 break; 1365 1366 case 0x80: 1367 if (reg == 7) // CMP 1368 c.Iflags |= CFpsw; 1369 r = B | grprw[0][reg][0]; // Grp 1 (byte) 1370 w = B | grprw[0][reg][1]; 1371 break; 1372 1373 case 0x81: 1374 case 0x83: 1375 if (reg == 7) // CMP 1376 c.Iflags |= CFpsw; 1377 else if (irm == modregrm(3,0,SP)) // ADD ESP,imm 1378 { 1379 assert(c.IFL2 == FLconst); 1380 ci.spadjust = (op == 0x81) ? c.IEV2.Vint : cast(byte)c.IEV2.Vint; 1381 } 1382 else if (irm == modregrm(3,5,SP)) // SUB ESP,imm 1383 { 1384 assert(c.IFL2 == FLconst); 1385 ci.spadjust = (op == 0x81) ? 
-c.IEV2.Vint : -cast(int)cast(byte)c.IEV2.Vint; 1386 } 1387 r = grprw[0][reg][0]; // Grp 1 1388 w = grprw[0][reg][1]; 1389 break; 1390 1391 case 0x8F: 1392 if (reg == 0) // POP rm 1393 goto Lpop; 1394 break; 1395 1396 case 0xA0: 1397 case 0xA1: 1398 case 0xA2: 1399 case 0xA3: 1400 // Fake having an EA to simplify code in conflict() 1401 ci.flags |= CIFL.ea; 1402 ci.reg = 0; 1403 ci.sibmodrm = a32 ? modregrm(0,0,5) : modregrm(0,0,6); 1404 c.IFL1 = c.IFL2; 1405 c.IEV1 = c.IEV2; 1406 break; 1407 1408 case 0xC2: 1409 case 0xC3: 1410 case 0xCA: 1411 case 0xCB: // RET 1412 ci.a |= mSP; 1413 break; 1414 1415 case 0xE8: 1416 if (c.Iflags & CFclassinit) // call to __j_classinit 1417 { r = 0; 1418 w = F; 1419 1420 version (CLASSINIT2) 1421 ci.pair = UV; // it is patched to CMP EAX,0 1422 else 1423 ci.pair = NP; 1424 1425 } 1426 break; 1427 1428 case 0xF6: 1429 r = grprw[3][reg][0]; // Grp 3, byte version 1430 w = grprw[3][reg][1]; 1431 break; 1432 1433 case 0xF7: 1434 r = grprw[1][reg][0]; // Grp 3 1435 w = grprw[1][reg][1]; 1436 break; 1437 1438 case 0x0F: 1439 op2 = c.Iop & 0xFF; 1440 if ((op2 & 0xF0) == 0x80) // if Jxx instructions 1441 { 1442 ci.r = F | N; 1443 ci.w = N; 1444 goto Lret; 1445 } 1446 ci.r = N; 1447 ci.w = N; // copout for now 1448 goto Lret; 1449 1450 case 0xD7: // XLAT 1451 ci.a = mAX | mBX; 1452 break; 1453 1454 case 0xFF: 1455 r = grprw[2][reg][0]; // Grp 5 1456 w = grprw[2][reg][1]; 1457 if (reg == 6) // PUSH rm 1458 goto Lpush; 1459 break; 1460 1461 case 0x38: 1462 case 0x39: 1463 case 0x3A: 1464 case 0x3B: 1465 case 0x3C: // CMP AL,imm8 1466 case 0x3D: // CMP EAX,imm32 1467 // For CMP opcodes, always test for flags 1468 c.Iflags |= CFpsw; 1469 break; 1470 1471 case ESCAPE: 1472 if (c.Iop == (ESCAPE | ESCadjfpu)) 1473 ci.fpuadjust = c.IEV1.Vint; 1474 break; 1475 1476 case 0xD0: 1477 case 0xD1: 1478 case 0xD2: 1479 case 0xD3: 1480 case 0xC0: 1481 case 0xC1: 1482 if (reg == 2 || reg == 3) // if RCL or RCR 1483 c.Iflags |= CFpsw; // always test for flags 
1484 break; 1485 1486 case 0xD8: 1487 case 0xD9: 1488 case 0xDA: 1489 case 0xDB: 1490 case 0xDC: 1491 case 0xDD: 1492 case 0xDE: 1493 case 0xDF: 1494 if (irm < 0xC0) 1495 { r = grpf1[op - 0xD8][reg][0]; 1496 w = grpf1[op - 0xD8][reg][1]; 1497 switch (op) 1498 { 1499 case 0xD8: 1500 if (reg == 3) // if FCOMP 1501 ci.fpuadjust = -1; 1502 else 1503 ci.fp_op = FP.fop; 1504 break; 1505 1506 case 0xD9: 1507 if (reg == 0) // if FLD float 1508 { ci.fpuadjust = 1; 1509 ci.fp_op = FP.fld; 1510 } 1511 else if (reg == 3) // if FSTP float 1512 { ci.fpuadjust = -1; 1513 ci.fp_op = FP.fstp; 1514 } 1515 else if (reg == 5 || reg == 7) 1516 sz = 2; 1517 else if (reg == 4 || reg == 6) 1518 sz = 28; 1519 break; 1520 case 0xDA: 1521 if (reg == 3) // if FICOMP 1522 ci.fpuadjust = -1; 1523 break; 1524 case 0xDB: 1525 if (reg == 0 || reg == 5) 1526 { ci.fpuadjust = 1; 1527 ci.fp_op = FP.fld; // FILD / FLD long double 1528 } 1529 if (reg == 3 || reg == 7) 1530 ci.fpuadjust = -1; 1531 if (reg == 7) 1532 ci.fp_op = FP.fstp; // FSTP long double 1533 if (reg == 5 || reg == 7) 1534 sz = 10; 1535 break; 1536 case 0xDC: 1537 sz = 8; 1538 if (reg == 3) // if FCOMP 1539 ci.fpuadjust = -1; 1540 else 1541 ci.fp_op = FP.fop; 1542 break; 1543 case 0xDD: 1544 if (reg == 0) // if FLD double 1545 { ci.fpuadjust = 1; 1546 ci.fp_op = FP.fld; 1547 } 1548 if (reg == 3) // if FSTP double 1549 { ci.fpuadjust = -1; 1550 ci.fp_op = FP.fstp; 1551 } 1552 if (reg == 7) 1553 sz = 2; 1554 else if (reg == 4 || reg == 6) 1555 sz = 108; 1556 else 1557 sz = 8; 1558 break; 1559 case 0xDE: 1560 sz = 2; 1561 if (reg == 3) // if FICOMP 1562 ci.fpuadjust = -1; 1563 break; 1564 case 0xDF: 1565 sz = 2; 1566 if (reg == 4 || reg == 6) 1567 sz = 10; 1568 else if (reg == 5 || reg == 7) 1569 sz = 8; 1570 if (reg == 0 || reg == 4 || reg == 5) 1571 ci.fpuadjust = 1; 1572 else if (reg == 3 || reg == 6 || reg == 7) 1573 ci.fpuadjust = -1; 1574 break; 1575 1576 default: 1577 break; 1578 } 1579 break; 1580 } 1581 else if (op == 0xDE) 1582 
{ ci.fpuadjust = -1; // pop versions of Fop's 1583 if (irm == 0xD9) 1584 ci.fpuadjust = -2; // FCOMPP 1585 } 1586 1587 // Most floating point opcodes aren't staged, but are 1588 // sent right through, in order to make use of the large 1589 // latencies with floating point instructions. 1590 if (ci.fp_op == FP.fld || 1591 (op == 0xD9 && (irm & 0xF8) == 0xC0)) 1592 { } // FLD ST(i) 1593 else 1594 ci.flags |= CIFL.nostage; 1595 1596 switch (op) 1597 { 1598 case 0xD8: 1599 r = S; 1600 w = C; 1601 if ((irm & ~7) == 0xD0) 1602 w |= S; 1603 break; 1604 case 0xD9: 1605 // FCHS or FABS or FSQRT 1606 if (irm == 0xE0 || irm == 0xE1 || irm == 0xFA) 1607 ci.fp_op = FP.fop; 1608 r = S; 1609 w = S|C; 1610 break; 1611 case 0xDA: 1612 if (irm == 0xE9) // FUCOMPP 1613 { r = S; 1614 w = S|C; 1615 break; 1616 } 1617 break; 1618 case 0xDB: 1619 if (irm == 0xE2) // FCLEX 1620 { r = 0; 1621 w = C; 1622 break; 1623 } 1624 if (irm == 0xE3) // FINIT 1625 { r = 0; 1626 w = S|C; 1627 break; 1628 } 1629 break; 1630 case 0xDC: 1631 case 0xDE: 1632 if ((irm & 0xF0) != 0xD0) 1633 { r = S; 1634 w = S|C; 1635 break; 1636 } 1637 break; 1638 case 0xDD: 1639 // Not entirely correct, but conservative 1640 r = S; 1641 w = S|C; 1642 break; 1643 case 0xDF: 1644 if (irm == 0xE0) // FSTSW AX 1645 { r = C; 1646 w = mAX; 1647 break; 1648 } 1649 break; 1650 1651 default: 1652 break; 1653 } 1654 break; 1655 1656 default: 1657 //printf("\t\tNo special case\n"); 1658 break; 1659 } 1660 1661 if ((r | w) & B) // if byte operation 1662 sz = 1; // operand size is 1 1663 1664 ci.r = r & ~(R | EA); 1665 ci.w = w & ~(R | EA); 1666 if (r & R) 1667 ci.r |= mask((r & B) ? (reg & 3) : reg); 1668 if (w & R) 1669 ci.w |= mask((w & B) ? 
(reg & 3) : reg); 1670 1671 // OR in bits for EA addressing mode 1672 if ((r | w) & EA) 1673 { ubyte sib; 1674 1675 sib = 0; 1676 switch (mod) 1677 { 1678 case 0: 1679 if (a32) 1680 { 1681 if (rm == 4) 1682 { 1683 sib = c.Isib; 1684 if ((sib & modregrm(0,7,0)) != modregrm(0,4,0)) 1685 ci.a |= mask((sib >> 3) & 7); // index register 1686 if ((sib & 7) != 5) 1687 ci.a |= mask(sib & 7); // base register 1688 } 1689 else if (rm != 5) 1690 ci.a |= mask(rm); 1691 } 1692 else 1693 { 1694 immutable ubyte[8] ea16 = [mBX|mSI,mBX|mDI,mBP|mSI,mBP|mDI,mSI,mDI,0,mBX]; 1695 ci.a |= ea16[rm]; 1696 } 1697 goto Lmem; 1698 1699 case 1: 1700 case 2: 1701 if (a32) 1702 { 1703 if (rm == 4) 1704 { 1705 sib = c.Isib; 1706 if ((sib & modregrm(0,7,0)) != modregrm(0,4,0)) 1707 ci.a |= mask((sib >> 3) & 7); // index register 1708 ci.a |= mask(sib & 7); // base register 1709 } 1710 else 1711 ci.a |= mask(rm); 1712 } 1713 else 1714 { 1715 immutable ubyte[8] ea16 = [mBX|mSI,mBX|mDI,mBP|mSI,mBP|mDI,mSI,mDI,mBP,mBX]; 1716 ci.a |= ea16[rm]; 1717 } 1718 1719 Lmem: 1720 if (r & EA) 1721 ci.r |= mMEM; 1722 if (w & EA) 1723 ci.w |= mMEM; 1724 ci.flags |= CIFL.ea; 1725 break; 1726 1727 case 3: 1728 if (r & EA) 1729 ci.r |= mask((r & B) ? (rm & 3) : rm); 1730 if (w & EA) 1731 ci.w |= mask((w & B) ? 
(rm & 3) : rm); 1732 break; 1733 1734 default: 1735 assert(0); 1736 } 1737 // Adjust sibmodrm so that addressing modes can be compared simply 1738 irm &= modregrm(3,0,7); 1739 if (a32) 1740 { 1741 if (irm != modregrm(0,0,5)) 1742 { 1743 switch (mod) 1744 { 1745 case 0: 1746 if ((sib & 7) != 5) // if not disp32[index] 1747 { 1748 c.IFL1 = FLconst; 1749 c.IEV1.Vpointer = 0; 1750 irm |= 0x80; 1751 } 1752 break; 1753 case 1: 1754 c.IEV1.Vpointer = cast(byte) c.IEV1.Vpointer; 1755 irm = modregrm(2, 0, rm); 1756 break; 1757 1758 default: 1759 break; 1760 } 1761 } 1762 } 1763 else 1764 { 1765 if (irm != modregrm(0,0,6)) 1766 { 1767 switch (mod) 1768 { 1769 case 0: 1770 c.IFL1 = FLconst; 1771 c.IEV1.Vpointer = 0; 1772 irm |= 0x80; 1773 break; 1774 case 1: 1775 c.IEV1.Vpointer = cast(byte) c.IEV1.Vpointer; 1776 irm = modregrm(2, 0, rm); 1777 break; 1778 1779 default: 1780 break; 1781 } 1782 } 1783 } 1784 1785 ci.r |= ci.a; 1786 ci.reg = reg; 1787 ci.sibmodrm = (sib << 8) | irm; 1788 } 1789 Lret: 1790 if (ci.w & mSP) // if stack pointer is modified 1791 ci.w |= mMEM; // then we are implicitly writing to memory 1792 if (op == LEA) // if LEA 1793 ci.r &= ~mMEM; // memory is not actually read 1794 ci.sz = cast(ubyte)sz; 1795 1796 //printf("\t\t"); ci.print(); 1797 } 1798 1799 /****************************************** 1800 * Determine if two instructions can pair. 1801 * Assume that in general, cu can pair in the U pipe and cv in the V. 1802 * Look for things like register contentions. 
1803 * Input: 1804 * cu instruction for U pipe 1805 * cv instruction for V pipe 1806 * Returns: 1807 * !=0 if they can pair 1808 */ 1809 1810 private int pair_test(Cinfo *cu,Cinfo *cv) 1811 { 1812 uint pcu; 1813 uint pcv; 1814 uint r1,w1; 1815 uint r2,w2; 1816 uint x; 1817 1818 pcu = cu.pair; 1819 if (!(pcu & PU)) 1820 { 1821 // See if pairs with FXCH and cv is FXCH 1822 if (pcu & FX && cv.c.Iop == 0xD9 && (cv.c.Irm & ~7) == 0xC8) 1823 goto Lpair; 1824 goto Lnopair; 1825 } 1826 pcv = cv.pair; 1827 if (!(pcv & PV)) 1828 goto Lnopair; 1829 1830 r1 = cu.r; 1831 w1 = cu.w; 1832 r2 = cv.r; 1833 w2 = cv.w; 1834 1835 x = w1 & (r2 | w2) & ~(F|mMEM); // register contention 1836 if (x && // if register contention 1837 !(x == mSP && pcu & pcv & PE) // and not exception 1838 ) 1839 goto Lnopair; 1840 1841 // Look for flags contention 1842 if (w1 & r2 & F && !(pcv & PF)) 1843 goto Lnopair; 1844 1845 Lpair: 1846 return 1; 1847 1848 Lnopair: 1849 return 0; 1850 } 1851 1852 /****************************************** 1853 * Determine if two instructions have an AGI or register contention. 1854 * Returns: 1855 * !=0 if they have an AGI 1856 */ 1857 1858 private int pair_agi(Cinfo *c1, Cinfo *c2) 1859 { 1860 uint x = c1.w & c2.a; 1861 return x && !(x == mSP && c1.pair & c2.pair & PE); 1862 } 1863 1864 /******************************************** 1865 * Determine if three instructions can decode simultaneously 1866 * in Pentium Pro and Pentium II. 1867 * Input: 1868 * c0,c1,c2 candidates for decoders 0,1,2 1869 * c2 can be null 1870 * Returns: 1871 * !=0 if they can decode simultaneously 1872 */ 1873 1874 private int triple_test(Cinfo *c0, Cinfo *c1, Cinfo *c2) 1875 { 1876 assert(c0); 1877 if (!c1) 1878 return 0; 1879 int c2isz = c2 ? 
c2.isz : 0; 1880 if (c0.isz > 7 || c1.isz > 7 || c2isz > 7 || 1881 c0.isz + c1.isz + c2isz > 16) 1882 return 0; 1883 1884 // 4-1-1 decode 1885 if (c1.uops > 1 || 1886 (c2 && c2.uops > 1)) 1887 return 0; 1888 1889 return 1; 1890 } 1891 1892 /******************************************** 1893 * Get next instruction worth looking at for scheduling. 1894 * Returns: 1895 * null no more instructions 1896 */ 1897 1898 private code * cnext(code *c) 1899 { 1900 while (1) 1901 { 1902 c = code_next(c); 1903 if (!c) 1904 break; 1905 if (c.Iflags & (CFtarg | CFtarg2)) 1906 break; 1907 if (!(c.Iop == NOP || 1908 c.Iop == (ESCAPE | ESClinnum))) 1909 break; 1910 } 1911 return c; 1912 } 1913 1914 /****************************************** 1915 * Instruction scheduler. 1916 * Input: 1917 * c list of instructions to schedule 1918 * scratch scratch registers we can use 1919 * Returns: 1920 * revised list of scheduled instructions 1921 */ 1922 1923 /////////////////////////////////// 1924 // Determine if c1 and c2 are swappable. 1925 // c1 comes before c2. 
// If they do not conflict
//      return 0
// If they do conflict
//      return 0x100 + delay_clocks
// Input:
//      fpsched         if 1, then adjust fxch_pre and fxch_post to swap,
//                      then return 0
//                      if 2, then adjust ci1 as well as ci2
//
// fpsched is only honored when both instructions have a non-zero fp_op;
// otherwise it is treated as 0.

private int conflict(Cinfo *ci1,Cinfo *ci2,int fpsched)
{
    code *c1;
    code *c2;
    uint r1,w1,a1;
    uint r2,w2,a2;
    int sz1,sz2;
    int i = 0;                  // debug tracing switch, only set by disabled code
    int delay_clocks;

    c1 = ci1.c;
    c2 = ci2.c;

    //printf("conflict %x %x\n",c1,c2);

    r1 = ci1.r;
    w1 = ci1.w;
    a1 = ci1.a;
    sz1 = ci1.sz;

    r2 = ci2.r;
    w2 = ci2.w;
    a2 = ci2.a;
    sz2 = ci2.sz;

    //printf("r1 %lx w1 %lx a1 %lx sz1 %x\n",r1,w1,a1,sz1);
    //printf("r2 %lx w2 %lx a2 %lx sz2 %x\n",r2,w2,a2,sz2);

    if ((c1.Iflags | c2.Iflags) & (CFvolatile | CFvex))
        goto Lconflict;

    // Determine if we should handle FPU register conflicts separately
    //if (fpsched) printf("fp_op %d,%d:\n",ci1.fp_op,ci2.fp_op);
    if (fpsched && ci1.fp_op && ci2.fp_op)
    {
        w1 &= ~(S|C);
        r1 &= ~(S|C);
        w2 &= ~(S|C);
        r2 &= ~(S|C);
    }
    else
        fpsched = 0;

    if ((r1 | r2) & N)
    {
        goto Lconflict;
    }

static if (0)
{
    if (c1.Iop == 0xFF && c2.Iop == 0x8B)
    {   c1.print(); c2.print(); i = 1;
        printf("r1=%lx, w1=%lx, a1=%lx, sz1=%d, r2=%lx, w2=%lx, a2=%lx, sz2=%d\n",r1,w1,a1,sz1,r2,w2,a2,sz2);
    }
}
L1:
    if (w1 & r2 || (r1 | w1) & w2)
    {   ubyte ifl1,ifl2;

        if (i) printf("test\n");

static if (0)
{
        if (c1.IFL1 != c2.IFL1) printf("t1\n");
        if ((c1.Irm & modregrm(3,0,7)) != (c2.Irm & modregrm(3,0,7))) printf("t2\n");
        if ((issib(c1.Irm) && c1.Isib != c2.Isib)) printf("t3\n");
        if (c1.IEV1.Vpointer + sz1 <= c2.IEV1.Vpointer) printf("t4\n");
        if (c2.IEV1.Vpointer + sz2 <= c1.IEV1.Vpointer) printf("t5\n");
}

        // make sure CFpsw is reliably set
        if (w1 & w2 & F &&      // if both instructions write to flags
            w1 != F &&
            w2 != F &&
            !((r1 | r2) & F) && // but neither instruction reads them
            !((c1.Iflags | c2.Iflags) & CFpsw))       // and we don't care about flags
        {
            w1 &= ~F;
            w2 &= ~F;           // remove conflict
            goto L1;            // and try again
        }

        // If other than the memory reference is a conflict
        if (w1 & r2 & ~mMEM || (r1 | w1) & w2 & ~mMEM)
        {   if (i) printf("\t1\n");
            if (i) printf("r1=%x, w1=%x, a1=%x, sz1=%d, r2=%x, w2=%x, a2=%x, sz2=%d\n",r1,w1,a1,sz1,r2,w2,a2,sz2);
            goto Lconflict;
        }

        // If referring to distinct types, then no dependency
        if (c1.Irex && c2.Irex && c1.Irex != c2.Irex)
            goto Lswap;

        ifl1 = c1.IFL1;
        ifl2 = c2.IFL1;

        // Special case: Allow indexed references using registers other than
        // ESP and EBP to be swapped with PUSH instructions
        // Here c1 is the PUSH and c2 is the instruction with the EA.
        if (((c1.Iop & ~7) == 0x50 ||           // PUSH reg
             c1.Iop == 0x6A ||                  // PUSH imm8
             c1.Iop == 0x68 ||                  // PUSH imm16/imm32
             (c1.Iop == 0xFF && ci1.reg == 6)   // PUSH EA
            ) &&
            ci2.flags & CIFL.ea && !(a2 & mSP) &&
            !(a2 & mBP && cast(int)c2.IEV1.Vpointer < 0)
           )
        {
            if (c1.Iop == 0xFF)
            {
                if (!(w2 & mMEM))
                    goto Lswap;
            }
            else
                goto Lswap;
        }

        // Special case: Allow indexed references using registers other than
        // ESP and EBP to be swapped with PUSH instructions
        // Mirror image of the case above: c2 is the PUSH and c1 is the
        // instruction with the EA, so the [EBP-n] exclusion has to be
        // tested on c1's addressing registers and displacement.
        if (((c2.Iop & ~7) == 0x50 ||           // PUSH reg
             c2.Iop == 0x6A ||                  // PUSH imm8
             c2.Iop == 0x68 ||                  // PUSH imm16/imm32
             (c2.Iop == 0xFF && ci2.reg == 6)   // PUSH EA
            ) &&
            ci1.flags & CIFL.ea && !(a1 & mSP) &&
            !(a1 & mBP && cast(int)c1.IEV1.Vpointer < 0)
           )
        {
            if (c2.Iop == 0xFF)
            {
                if (!(w1 & mMEM))
                    goto Lswap;
            }
            else
                goto Lswap;
        }

        // If not both an EA addressing mode, conflict
        if (!(ci1.flags & ci2.flags & CIFL.ea))
        {   if (i) printf("\t2\n");
            goto Lconflict;
        }

        // Same addressing mode: the references are independent if they
        // are of different kinds or their [offset, offset+size) ranges
        // do not overlap.
        if (ci1.sibmodrm == ci2.sibmodrm)
        {   if (ifl1 != ifl2)
                goto Lswap;
            switch (ifl1)
            {
                case FLconst:
                    if (c1.IEV1.Vint != c2.IEV1.Vint &&
                        (c1.IEV1.Vint + sz1 <= c2.IEV1.Vint ||
                         c2.IEV1.Vint + sz2 <= c1.IEV1.Vint))
                        goto Lswap;
                    break;
                case FLdatseg:
                    if (c1.IEV1.Vseg != c2.IEV1.Vseg ||
                        c1.IEV1.Vint + sz1 <= c2.IEV1.Vint ||
                        c2.IEV1.Vint + sz2 <= c1.IEV1.Vint)
                        goto Lswap;
                    break;

                default:
                    break;
            }
        }

        if ((c1.Iflags | c2.Iflags) & CFunambig &&
            (ifl1 != ifl2 ||
             ci1.sibmodrm != ci2.sibmodrm ||
             (c1.IEV1.Vint != c2.IEV1.Vint &&
              (c1.IEV1.Vint + sz1 <= c2.IEV1.Vint ||
               c2.IEV1.Vint + sz2 <= c1.IEV1.Vint)
             )
            )
           )
        {
            // Assume that [EBP] and [ESP] can point to the same location
            if (((a1 | a2) & (mBP | mSP)) == (mBP | mSP))
                goto Lconflict;
            goto Lswap;
        }

        if (i) printf("\t3\n");
        goto Lconflict;
    }

Lswap:
    if (fpsched)
    {
        //printf("\tfpsched %d,%d:\n",ci1.fp_op,ci2.fp_op);
        ubyte x1 = ci1.fxch_pre;
        ubyte y1 = ci1.fxch_post;
        ubyte x2 = ci2.fxch_pre;
        ubyte y2 = ci2.fxch_post;

        // Combine the two fp_op values into a single switch selector
        static uint X(uint a, uint b) { return (a << 8) | b; }
        switch (X(ci1.fp_op,ci2.fp_op))
        {
            case X(FP.fstp, FP.fld):
                if (x1 || y1)
                    goto Lconflict;
                if (x2)
                    goto Lconflict;
                if (y2 == 0)
                    ci2.fxch_post++;
                else if (y2 == 1)
                {
                    ci2.fxch_pre++;
                    ci2.fxch_post++;
                }
                else
                {
                    goto Lconflict;
                }
                break;

            case X(FP.fstp, FP.fop):
                if (x1 || y1)
                    goto Lconflict;
                ci2.fxch_pre++;
                ci2.fxch_post++;
                break;

            case X(FP.fop, FP.fop):
                if (x1 == 0 && y1 == 1 && x2 == 0 && y2 == 0)
                {   ci2.fxch_pre = 1;
                    ci2.fxch_post = 1;
                    break;
                }
                if (x1 == 0 && y1 == 0 && x2 == 1 && y2 == 1)
                    break;
                goto Lconflict;

            case X(FP.fop, FP.fld):
                if (x1 || y1)
                    goto Lconflict;
                if (x2)
                    goto Lconflict;
                if (y2)
                    break;
                else if (fpsched == 2)
                    ci1.fxch_post = 1;
                // reached whenever y2 == 0, whatever fpsched is
                ci2.fxch_post = 1;
                break;

            default:
                goto Lconflict;
        }

        //printf("\tpre = %d, post = %d\n",ci2.fxch_pre,ci2.fxch_post);
    }

    //printf("w1 = x%x, w2 = x%x\n",w1,w2);
    if (i) printf("no conflict\n\n");
    return 0;

Lconflict:
    //printf("r1=%x, w1=%x, r2=%x, w2=%x\n",r1,w1,r2,w2);
    delay_clocks = 0;

    // Determine if AGI
    if (!PRO && pair_agi(ci1,ci2))
        delay_clocks = 1;

    // Special delays for floating point
    if (fpsched)
    {   if (ci1.fp_op == FP.fld && ci2.fp_op == FP.fstp)
            delay_clocks = 1;
        else if (ci1.fp_op == FP.fop && ci2.fp_op == FP.fstp)
            delay_clocks = 3;
        else if (ci1.fp_op == FP.fop && ci2.fp_op == FP.fop)
            delay_clocks = 2;
    }
    else if (PRO)
    {
        // Look for partial register write stalls
        if (w1 & r2 & ALLREGS && sz1 < sz2)
            delay_clocks = 7;
    }
    else if ((w1 | r1) & (w2 | r2) & (C | S))
    {
        int op = c1.Iop;
        if (ci1.fp_op == FP.fld ||
            (op == 0xD9 && (c1.Irm & 0xF8) == 0xC0)
           )
        { }                             // FLD
        else if (op == 0xD9 && (c1.Irm & 0xF8) == 0xC8)
        { }                             // FXCH
        else if (c2.Iop == 0xD9 && (c2.Irm & 0xF8) == 0xC8)
        { }                             // FXCH
        else
            delay_clocks = 3;
    }

    if (i) printf("conflict %d\n\n",delay_clocks);
    return 0x100 + delay_clocks;
}

enum TBLMAX = 2*3*20;        // must be divisible by both 2 and 3
                             // (U,V pipe in Pentium, 3 decode units
                             //  in Pentium Pro)

/******************************
 * Scheduling state for one run of instructions:
 * a slot table (tbl) that instructions are placed into, the Cinfo
 * storage backing them, and a staging list of instructions that have
 * been analyzed but not yet placed in the table.
 */
struct Schedule
{
nothrow:
    Cinfo*[TBLMAX] tbl;         // even numbers are U pipe, odd numbers are V
    int tblmax;                 // max number of slots used

    Cinfo[TBLMAX] cinfo;        // storage handed out by stage()
    int cinfomax;               // number of cinfo[] entries handed out

    Barray!(Cinfo*) stagelist;  // list of instructions in staging area

    int fpustackused;           // number of slots in FPU stack that are used

    /******************************
     * Zero the whole struct (every field, stagelist included) and
     * then set the FPU stack depth.
     * Params:
     *  fpustackinit = initial value for fpustackused
     */
    void initialize(int fpustackinit)          // initialize scheduler
    {
        //printf("Schedule::initialize(fpustackinit = %d)\n", fpustackinit);
        memset(&this, 0, Schedule.sizeof);
        fpustackused = fpustackinit;
    }

    /// Release the staging list.
    void dtor()
    {
        stagelist.dtor();
    }

    /******************************
     * Link the scheduled instructions into a list, inserting FXCH
     * instructions where fxch_pre/fxch_post call for them, then append
     * whatever is left in the staging area and empty the staging list.
     * fpustackused is advanced by the fpuadjust of every emitted instruction.
     * Params:
     *  pc = where to store the first instruction; *pc must be null
     * Returns:
     *  address of the null `next` link at the end of the emitted list
     */
    code **assemble(code **pc)  // reassemble scheduled instructions
    {
        code *c;

        debug
        if (debugs) printf("assemble:\n");

        assert(!*pc);

        // Try to insert the rest of the staged instructions
        size_t sli;
        for (sli = 0; sli < stagelist.length; ++sli)
        {
            Cinfo* ci = stagelist[sli];
            if (!ci)
                continue;
            if (!insert(ci))
                break;
        }

        // Get the instructions out of the schedule table
        assert(cast(uint)tblmax <= TBLMAX);
        for (int i = 0; i < tblmax; i++)
        {
            Cinfo* ci = tbl[i];

            debug
            if (debugs)
            {
                if (PRO)
                {   immutable char[4][3] tbl = [ "0 "," 1 "," 2" ];

                    if (ci)
                        printf("%s %d ",tbl[i - ((i / 3) * 3)].ptr,ci.uops);
                    else
                        printf("%s ",tbl[i - ((i / 3) * 3)].ptr);
                }
                else
                {
                    printf((i & 1) ? " V " : "U ");
                }
                if (ci)
                    ci.c.print();
                else
                    printf("\n");
            }

            if (!ci)
                continue;
            fpustackused += ci.fpuadjust;
            //printf("stage()1: fpustackused = %d\n", fpustackused);
            c = ci.c;
            if (i == 0)
                c.Iflags |= CFtarg;     // by definition, first is always a jump target
            else
                c.Iflags &= ~CFtarg;    // the rest are not

            // Put in any FXCH prefix
            if (ci.fxch_pre)
            {   code *cf;
                assert(i);
                cf = gen2(null,0xD9,0xC8 + ci.fxch_pre);
                *pc = cf;
                pc = &cf.next;
            }

            // Append c and advance pc to the end of whatever c is linked to
            *pc = c;
            do
            {
                assert(*pc != code_next(*pc));
                pc = &(*pc).next;
            } while (*pc);

            // Put in any FXCH postfix
            if (ci.fxch_post)
            {
                // Only the next occupied slot is examined
                for (int j = i + 1; j < tblmax; j++)
                {   if (tbl[j])
                    {   if (tbl[j].fxch_pre == ci.fxch_post)
                        {
                            tbl[j].fxch_pre = 0;        // they cancel each other out
                            goto L1;
                        }
                        break;
                    }
                }
                {   code *cf;
                    cf = gen2(null,0xD9,0xC8 + ci.fxch_post);
                    *pc = cf;
                    pc = &cf.next;
                }
            }
        L1:
        }

        // Just append any instructions left in the staging area
        foreach (ci; stagelist[sli .. stagelist.length])
        {
            if (!ci)
                continue;

            debug
            if (debugs) { printf("appending: "); ci.c.print(); }

            *pc = ci.c;
            do
            {
                pc = &(*pc).next;

            } while (*pc);
            fpustackused += ci.fpuadjust;
            //printf("stage()2: fpustackused = %d\n", fpustackused);
        }
        stagelist.setLength(0);

        return pc;
    }

    /******************************
     * Insert c into scheduling table.
     * Scans backwards from tblmax for the latest entry that conflicts
     * with ci, then forwards from the first permissible slot for a hole
     * that satisfies the pairing (Pentium) or decode (Pentium Pro) rules.
     * For a MOV reg,offset[ESP] moved across PUSH reg / SUB ESP,imm
     * instructions, the displacement in the instruction is adjusted;
     * it is restored to its original value if insertion fails.
     * Returns:
     *      0       could not be scheduled; have to start a new one
     *      1       inserted
     */

    int insert(Cinfo *ci)
    {   code *c;
        int clocks;
        int i;
        int ic = 0;
        int imin;
        targ_size_t offset;
        targ_size_t vpointer;
        int movesp = 0;
        int reg2 = -1;              // avoid "may be uninitialized" warning

        //printf("insert "); ci.c.print();
        //printf("insert() %d\n", fpustackused);
        c = ci.c;
        //printf("\tc.Iop %x\n",c.Iop);
        vpointer = c.IEV1.Vpointer;         // saved so Lnoinsert can undo adjustments
        assert(cast(uint)tblmax <= TBLMAX);
        if (tblmax == TBLMAX)               // if out of space
            goto Lnoinsert;
        if (tblmax == 0)                    // if table is empty
        {   // Just stuff it in the first slot
            i = tblmax;
            goto Linsert;
        }
        else if (c.Iflags & (CFtarg | CFtarg2))
            // Jump targets can only be first in the scheduler
            goto Lnoinsert;

        // Special case of:
        //      PUSH reg1
        //      MOV  reg2,x[ESP]
        if (c.Iop == 0x8B &&
            (c.Irm & modregrm(3,0,7)) == modregrm(1,0,4) &&
            c.Isib == modregrm(0,4,SP) &&
            c.IFL1 == FLconst &&
            (cast(byte)c.IEV1.Vpointer) >= REGSIZE
           )
        {
            movesp = 1;                     // this is a MOV reg2,offset[ESP]
            offset = cast(byte)c.IEV1.Vpointer;
            reg2 = (c.Irm >> 3) & 7;
        }


        // Start at tblmax, and back up until we get a conflict
        ic = -1;
        imin = 0;
        for (i = tblmax; i >= 0; i--)
        {
            Cinfo* cit = tbl[i];
            if (!cit)
                continue;

            // Look for special case swap
            if (movesp &&
                (cit.c.Iop & ~7) == 0x50 &&             // if PUSH reg1
                (cit.c.Iop & 7) != reg2 &&              // if reg1 != reg2
                (cast(byte)c.IEV1.Vpointer) >= -cit.spadjust
               )
            {
                c.IEV1.Vpointer += cit.spadjust;
                //printf("\t1, spadjust = %d, ptr = x%x\n",cit.spadjust,c.IEV1.Vpointer);
                continue;
            }

            if (movesp &&
                cit.c.Iop == 0x83 &&
                cit.c.Irm == modregrm(3,5,SP) &&        // if SUB ESP,offset
                cit.c.IFL2 == FLconst &&
                (cast(byte)c.IEV1.Vpointer) >= -cit.spadjust
               )
            {
                //printf("\t2, spadjust = %d\n",cit.spadjust);
                c.IEV1.Vpointer += cit.spadjust;
                continue;
            }

            clocks = conflict(cit,ci,1);
            if (clocks)
            {   int j;

                ic = i;                 // where the conflict occurred
                clocks &= 0xFF;         // convert to delay count

                // Move forward the delay clocks
                if (clocks == 0)
                    j = i + 1;
                else if (PRO)
                    j = (((i + 3) / 3) * 3) + clocks * 3;
                else
                {   j = ((i + 2) & ~1) + clocks * 2;

                    // It's possible we skipped over some AGI generating
                    // instructions due to movesp.
                    int k;
                    for (k = i + 1; k < j; k++)
                    {
                        if (k >= TBLMAX)
                            goto Lnoinsert;
                        if (tbl[k] && pair_agi(tbl[k],ci))
                        {
                            k = ((k + 2) & ~1) + 1;
                        }
                    }
                    j = k;
                }

                if (j >= TBLMAX)                    // exceed table size?
                    goto Lnoinsert;
                imin = j;                           // first possible slot c can go in
                break;
            }
        }


        // Scan forward looking for a hole to put it in
        for (i = imin; i < TBLMAX; i++)
        {
            if (tbl[i])
            {
                // In case, due to movesp, we skipped over some AGI instructions
                if (!PRO && pair_agi(tbl[i],ci))
                {
                    i = ((i + 2) & ~1) + 1;
                    if (i >= TBLMAX)
                        goto Lnoinsert;
                }
            }
            else
            {
                if (PRO)
                {   int i0 = (i / 3) * 3;       // index of decode unit 0
                    Cinfo *ci0;

                    assert(((TBLMAX / 3) * 3) == TBLMAX);
                    switch (i - i0)
                    {
                        case 0:                 // i0 can handle any instruction
                            goto Linsert;
                        case 1:
                            ci0 = tbl[i0];
                            if (ci.uops > 1)
                            {
                                // Multi-uop ci takes decoder 0; L1 moves
                                // the single-uop ci0 into this slot
                                if (i0 >= imin && ci0.uops == 1)
                                    goto L1;
                                i++;
                                break;
                            }
                            if (triple_test(ci0,ci,tbl[i0 + 2]))
                                goto Linsert;
                            break;
                        case 2:
                            ci0 = tbl[i0];
                            if (ci.uops > 1)
                            {
                                if (i0 >= imin && ci0.uops == 1)
                                {
                                    if (i >= tblmax)
                                    {   if (i + 1 >= TBLMAX)
                                            goto Lnoinsert;
                                        tblmax = i + 1;
                                    }
                                    // Shift decoders 0 and 1 up by one, put ci in decoder 0
                                    tbl[i0 + 2] = tbl[i0 + 1];
                                    tbl[i0 + 1] = ci0;
                                    i = i0;
                                    goto Linsert;
                                }
                                break;
                            }
                            if (triple_test(ci0,tbl[i0 + 1],ci))
                                goto Linsert;
                            break;
                        default:
                            assert(0);
                    }
                }
                else
                {
                    assert((TBLMAX & 1) == 0);
                    if (i & 1)                  // if V pipe
                    {
                        if (pair_test(tbl[i - 1],ci))
                        {
                            goto Linsert;
                        }
                        else if (i > imin && pair_test(ci,tbl[i - 1]))
                        {
                    L1:
                            // Move the previous entry into slot i; ci takes its place
                            tbl[i] = tbl[i - 1];
                            if (i >= tblmax)
                                tblmax = i + 1;
                            i--;
                            //printf("\tswapping with x%02x\n",tbl[i + 1].c.Iop);
                            goto Linsert;
                        }
                    }
                    else                        // will always fit in U pipe
                    {
                        assert(!tbl[i + 1]);    // because V pipe should be empty
                        goto Linsert;
                    }
                }
            }
        }

    Lnoinsert:
        //printf("\tnoinsert\n");
        c.IEV1.Vpointer = vpointer;     // reset to original value
        return 0;

    Linsert:
        // Insert at location i
        assert(i < TBLMAX);
        assert(tblmax <= TBLMAX);
        tbl[i] = ci;
        //printf("\tinsert at location %d\n",i);

        // If it's a scheduled floating point code, we have to adjust
        // the FXCH values
        if (ci.fp_op)
        {
            ci.fxch_pre = 0;
            ci.fxch_post = 0;                  // start over again

            int fpu = fpustackused;
            for (int j = 0; j < tblmax; j++)
            {
                if (tbl[j])
                {
                    fpu += tbl[j].fpuadjust;
                    if (fpu >= 8)                   // if FPU stack overflow
                    {   tbl[i] = null;
                        //printf("fpu stack overflow\n");
                        goto Lnoinsert;
                    }
                }
            }

            // Called for its side effect on the fxch_pre/fxch_post fields
            for (int j = tblmax; j > i; j--)
            {
                if (j < TBLMAX && tbl[j])
                    conflict(tbl[j],ci,2);
            }
        }

        if (movesp)
        {   // Adjust [ESP] offsets

            //printf("\tic = %d, inserting at %d\n",ic,i);
            assert(cast(uint)tblmax <= TBLMAX);
            for (int j = ic + 1; j < i; j++)
            {
                Cinfo* cit = tbl[j];
                if (cit)
                {
                    c.IEV1.Vpointer -= cit.spadjust;
                    //printf("\t3, spadjust = %d, ptr = x%x\n",cit.spadjust,c.IEV1.Vpointer);
                }
            }
        }
        if (i >= tblmax)
            tblmax = i + 1;

        // Now do a hack. Look back at immediately preceding instructions,
        // and see if we can swap with a push.
        // (Disabled by the constant 0 in the condition.)
        if (0 && movesp)
        {
            while (1)
            {
                int j;
                for (j = 1; i > j; j++)
                    if (tbl[i - j])
                        break;

                if (i >= j && tbl[i - j] &&
                       (tbl[i - j].c.Iop & ~7) == 0x50 &&       // if PUSH reg1
                       (tbl[i - j].c.Iop & 7) != reg2 &&  // if reg1 != reg2
                       cast(byte)c.IEV1.Vpointer >= REGSIZE)
                {
                    //printf("\t-4 prec, i-j=%d, i=%d\n",i-j,i);
                    assert(cast(uint)i < TBLMAX);
                    assert(cast(uint)(i - j) < TBLMAX);
                    tbl[i] = tbl[i - j];
                    tbl[i - j] = ci;
                    i -= j;
                    c.IEV1.Vpointer -= REGSIZE;
                }
                else
                    break;
            }
        }

        //printf("\tinsert\n");
        return 1;
    }

    /******************************
     * Insert c into staging area.
     * Staged instructions that have an AGI against, or otherwise conflict
     * with, the new instruction are first moved into the scheduling table.
     * Instructions flagged CFtarg, CFtarg2, CFvolatile or CFvex flush the
     * whole staging list and go directly into the table, as do
     * CIFL.nostage instructions when no AGI was flushed.
     * Params:
     *  c = instruction to stage
     * Returns:
     *  false if could not be scheduled; have to start a new one
     */

    bool stage(code *c)
    {
        //printf("stage: "); c.print();
        if (cinfomax == TBLMAX)             // if out of space
            return false;
        auto ci = &cinfo[cinfomax++];
        getinfo(ci,c);

        if (c.Iflags & (CFtarg | CFtarg2 | CFvolatile | CFvex))
        {
            // Insert anything in stagelist
            foreach (ref cs; stagelist[])
            {
                if (cs)
                {
                    if (!insert(cs))
                        return false;
                    cs = null;
                }
            }
            return insert(ci) != 0;
        }

        // Look through stagelist, and insert any AGI conflicting instructions
        bool agi = false;
        foreach (ref cs; stagelist[])
        {
            if (cs)
            {
                if (pair_agi(cs,ci))
                {
                    if (!insert(cs))
                        goto Lnostage;
                    cs = null;
                    agi = true;                    // we put out an AGI
                }
            }
        }

        // Look through stagelist, and insert any other conflicting instructions
        foreach (i, ref cs; stagelist[])
        {
            if (!cs)
                continue;
            if (conflict(cs,ci,0) &&                // if conflict
                !(cs.flags & ci.flags & CIFL.push))
            {
                if (cs.spadjust)
                {
                    // We need to insert all previous adjustments to ESP
                    foreach (ref ca; stagelist[0 .. i])
                    {
                        if (ca && ca.spadjust)
                        {
                            if (!insert(ca))
                                goto Lnostage;
                            ca = null;
                        }
                    }
                }

                if (!insert(cs))
                    goto Lnostage;
                cs = null;
            }
        }

        // If floating point opcode, don't stage it, send it right out
        if (!agi && ci.flags & CIFL.nostage)
        {
            if (!insert(ci))
                goto Lnostage;
            return true;
        }

        stagelist.push(ci);         // append to staging list
        return true;

    Lnostage:
        return false;
    }

}



/********************************************
 * Snip off tail of instruction sequence.
 * Walks forward from c past NOPs, line number escapes and (when c itself
 * has CFclassinit set) CFclassinit instructions, stops at the first other
 * instruction or at one flagged CFtarg/CFtarg2, and cuts the list
 * just before it.
 * Returns:
 *      next instruction (the tail) or
 *      null for no more instructions
 */

private code * csnip(code *c)
{
    if (c)
    {
        uint iflags = c.Iflags & CFclassinit;
        code **pc;
        while (1)
        {
            pc = &c.next;
            c = *pc;
            if (!c)
                break;
            if (c.Iflags & (CFtarg | CFtarg2))
                break;
            if (!(c.Iop == NOP ||
                  c.Iop == (ESCAPE | ESClinnum) ||
                  c.Iflags & iflags))
                break;
        }
        *pc = null;             // detach the tail from what precedes it
    }
    return c;
}


/******************************
 * Schedule Pentium instructions,
 * based on Steve Russell's algorithm.
2811 */ 2812 2813 private code *schedule(code *c,regm_t scratch) 2814 { 2815 code *cresult = null; 2816 code **pctail = &cresult; 2817 Schedule sch = void; 2818 2819 sch.initialize(0); // initialize scheduling table 2820 while (c) 2821 { 2822 if ((c.Iop == NOP || 2823 ((c.Iop & ESCAPEmask) == ESCAPE && c.Iop != (ESCAPE | ESCadjfpu)) || 2824 c.Iflags & CFclassinit) && 2825 !(c.Iflags & (CFtarg | CFtarg2))) 2826 { code *cn; 2827 2828 // Just append this instruction to pctail and go to the next one 2829 *pctail = c; 2830 cn = code_next(c); 2831 c.next = null; 2832 pctail = &c.next; 2833 c = cn; 2834 continue; 2835 } 2836 2837 //printf("init\n"); 2838 sch.initialize(sch.fpustackused); // initialize scheduling table 2839 2840 while (c) 2841 { 2842 //printf("insert %p\n",c); 2843 if (!sch.stage(c)) // store c in scheduling table 2844 break; 2845 c = csnip(c); 2846 } 2847 2848 //printf("assem %d\n",sch.tblmax); 2849 pctail = sch.assemble(pctail); // reassemble instruction stream 2850 } 2851 sch.dtor(); 2852 2853 return cresult; 2854 } 2855 2856 /**************************************************************************/ 2857 2858 /******************************************** 2859 * Replace any occurrence of r1 in EA with r2. 
2860 */ 2861 2862 private void repEA(code *c,uint r1,uint r2) 2863 { 2864 uint mod,reg,rm; 2865 uint rmn; 2866 2867 rmn = c.Irm; 2868 mod = rmn & 0xC0; 2869 reg = rmn & modregrm(0,7,0); 2870 rm = rmn & 7; 2871 2872 if (mod == 0xC0 && rm == r1) 2873 { } //c.Irm = mod | reg | r2; 2874 else if (is32bitaddr(I32,c.Iflags) && 2875 // If not disp32 2876 (rmn & modregrm(3,0,7)) != modregrm(0,0,5)) 2877 { 2878 if (rm == 4) 2879 { // SIB byte addressing 2880 uint sib; 2881 uint base; 2882 uint index; 2883 2884 sib = c.Isib; 2885 base = sib & 7; 2886 index = (sib >> 3) & 7; 2887 if (base == r1 && 2888 !(r1 == 5 && mod == 0) && 2889 !(r2 == 5 && mod == 0) 2890 ) 2891 base = r2; 2892 if (index == r1) 2893 index = r2; 2894 c.Isib = cast(ubyte)((sib & 0xC0) | (index << 3) | base); 2895 } 2896 else if (rm == r1) 2897 { 2898 if (r1 == BP && r2 == SP) 2899 { // Replace [EBP] with [ESP] 2900 c.Irm = cast(ubyte)(mod | reg | 4); 2901 c.Isib = modregrm(0,4,SP); 2902 } 2903 else if (r2 == BP && mod == 0) 2904 { 2905 c.Irm = cast(ubyte)(modregrm(1,0,0) | reg | r2); 2906 c.IFL1 = FLconst; 2907 c.IEV1.Vint = 0; 2908 } 2909 else 2910 c.Irm = cast(ubyte)(mod | reg | r2); 2911 } 2912 } 2913 } 2914 2915 /****************************************** 2916 * Instruction scheduler. 2917 * Input: 2918 * c list of instructions to schedule 2919 * scratch scratch registers we can use 2920 * Returns: 2921 * revised list of scheduled instructions 2922 */ 2923 2924 /****************************************** 2925 * Swap c1 and c2. 2926 * c1 comes before c2. 
* Swap in place to not disturb addresses of jmp targets
 *
 * The contents of the two nodes are exchanged, then each node gets back its
 * own `next` link and its own CFtarg / CFtarg2 bits, so list order and
 * jump-target markings stay attached to the nodes rather than to the
 * instructions that moved.
 * Params:
 *      c1 = earlier node
 *      c2 = later node
 */

private void code_swap(code *c1,code *c2)
{   code cs;

    // Special case of:
    //  PUSH reg1
    //  MOV  reg2,x[ESP]
    // Once the MOV runs before the PUSH, its ESP-relative offset must shrink
    // by one register slot.  Only done when reg1 != reg2.
    //printf("code_swap(%x, %x)\n",c1,c2);
    if ((c1.Iop & ~7) == 0x50 &&
        c2.Iop == 0x8B &&
        (c2.Irm & modregrm(3,0,7)) == modregrm(1,0,4) &&
        c2.Isib == modregrm(0,4,SP) &&
        c2.IFL1 == FLconst &&
        (cast(byte)c2.IEV1.Vpointer) >= REGSIZE &&
        (c1.Iop & 7) != ((c2.Irm >> 3) & 7)
       )
        c2.IEV1.Vpointer -= REGSIZE;


    cs = *c2;
    *c2 = *c1;
    *c1 = cs;
    // Retain original CFtarg
    c1.Iflags = (c1.Iflags & ~(CFtarg | CFtarg2)) | (c2.Iflags & (CFtarg | CFtarg2));
    c2.Iflags = (c2.Iflags & ~(CFtarg | CFtarg2)) | (cs.Iflags & (CFtarg | CFtarg2));

    // The struct copies above also swapped the links; put them back
    c1.next = c2.next;
    c2.next = cs.next;
}

/******************************************
 * Peephole pass over adjacent instruction pairs (c, c1).
 *
 * Rewrites instructions in place; nodes are never inserted or removed
 * (a dropped instruction becomes a NOP).  A pair is skipped when c1 is a
 * jump target.
 * Params:
 *      cstart  = first instruction of the list
 *      scratch = not referenced by this function
 * Returns:
 *      cstart
 */

private code *peephole(code *cstart,regm_t scratch)
{
    // Look for cases of:
    //  MOV r1,r2
    //  OP ?,r1
    // we can replace with:
    //  MOV r1,r2
    //  OP ?,r2
    // to improve pairing
    code *c1;
    uint r1,r2;
    uint mod,reg,rm;

    //printf("peephole\n");
    for (code *c = cstart; c; c = c1)
    {
        ubyte rmn;

        //c.print();
        c1 = cnext(c);
    Ln:
        if (!c1)
            break;
        if (c1.Iflags & (CFtarg | CFtarg2))
            continue;

        // Do:
        //      PUSH    reg
        // PUSH ESP is excluded: it stores the value ESP had before the push,
        // so afterwards [ESP] and ESP differ and none of the rewrites below
        // would be valid.
        if (I32 && (c.Iop & ~7) == 0x50 && (c.Iop & 7) != SP)
        {
            uint regx = c.Iop & 7;

            //  MOV     regx,[ESP]      =>      NOP
            if (c1.Iop == 0x8B &&
                c1.Irm == modregrm(0,regx,4) &&
                c1.Isib == modregrm(0,4,SP))
            {   c1.Iop = NOP;
                continue;
            }

            //  PUSH    [ESP]           =>      PUSH    regx
            if (c1.Iop == 0xFF &&
                c1.Irm == modregrm(0,6,4) &&
                c1.Isib == modregrm(0,4,SP))
            {   c1.Iop = 0x50 + regx;
                continue;
            }

            //  CMP     [ESP],imm       =>      CMP     regx,imm
            if (c1.Iop == 0x83 &&
                c1.Irm == modregrm(0,7,4) &&
                c1.Isib == modregrm(0,4,SP))
            {   c1.Irm = modregrm(3,7,regx);
                if (c1.IFL2 == FLconst && cast(byte)c1.IEV2.Vuns == 0)
                {   // to TEST regx,regx
                    c1.Iop = (c1.Iop & 1) | 0x84;
                    c1.Irm = modregrm(3,regx,regx);
                }
                continue;
            }

        }

        // Do:
        //      MOV     reg,[ESP]       =>      POP     reg
        //      ADD     ESP,4           =>      NOP
        // c1 must be exactly 83 /0 (ADD): the reg field of c1.Irm selects the
        // operation, so it may not be masked out, otherwise SUB/AND/CMP/...
        // ESP,4 would be rewritten too.
        // reg == ESP is excluded: POP ESP leaves ESP = [old ESP], whereas the
        // original pair leaves ESP = [old ESP] + 4.
        if (I32 && c.Iop == 0x8B && (c.Irm & 0xC7) == modregrm(0,0,4) &&
            c.Isib == modregrm(0,4,SP) &&
            ((c.Irm >> 3) & 7) != SP &&
            c1.Iop == 0x83 && c1.Irm == modregrm(3,0,SP) &&
            !(c1.Iflags & CFpsw) && c1.IFL2 == FLconst && c1.IEV2.Vint == 4)
        {
            uint regx = (c.Irm >> 3) & 7;
            c.Iop = 0x58 + regx;
            c1.Iop = NOP;
            continue;
        }

        // Combine two ADD/SUB reg,imm8 of the same register into one.
        // Only done when the flags result of c1 is not needed.
        if (c.Iop == c1.Iop &&
            c.Iop == 0x83 &&
            (c.Irm & 0xC0) == 0xC0 &&
            (c.Irm & modregrm(3,0,7)) == (c1.Irm & modregrm(3,0,7)) &&
            !(c1.Iflags & CFpsw) &&
            c.IFL2 == FLconst && c1.IFL2 == FLconst
           )
        {   int i = cast(byte)c.IEV2.Vint;
            int i1 = cast(byte)c1.IEV2.Vint;
            // Key is (operation of c) << 3 | (operation of c1)
            switch ((c.Irm & modregrm(0,7,0)) | ((c1.Irm & modregrm(0,7,0)) >> 3))
            {
                case (0 << 3) | 0:              // ADD, ADD
                case (5 << 3) | 5:              // SUB, SUB
                    i += i1;
                    goto Laa;
                case (0 << 3) | 5:              // ADD, SUB
                case (5 << 3) | 0:              // SUB, ADD
                    i -= i1;
                    goto Laa;
                Laa:
                    if (cast(byte)i != i)
                        c.Iop &= ~2;            // no longer fits imm8: 0x83 => 0x81
                    c.IEV2.Vint = i;
                    c1.Iop = NOP;
                    if (i == 0)
                        c.Iop = NOP;
                    continue;

                default:
                    break;
            }
        }

        if (c.Iop == 0x8B && (c.Irm & 0xC0) == 0xC0)    // MOV r1,r2
        {   r1 = (c.Irm >> 3) & 7;
            r2 = c.Irm & 7;
        }
        else if (c.Iop == 0x89 && (c.Irm & 0xC0) == 0xC0)   // MOV r1,r2
        {   r1 = c.Irm & 7;
            r2 = (c.Irm >> 3) & 7;
        }
        else
        {
            continue;
        }

        // NOTE(review): rmn/mod/reg/rm are sampled before repEA() runs, so
        // the cases below that rebuild c1.Irm from rmn overwrite any change
        // repEA() made to c1.Irm.
        rmn = c1.Irm;
        mod = rmn & 0xC0;
        reg = rmn & modregrm(0,7,0);
        rm  = rmn & 7;
        if (cod3_EA(c1))
            repEA(c1,r1,r2);
        switch (c1.Iop)
        {
            case 0x50:
            case 0x51:
            case 0x52:
            case 0x53:
            case 0x54:
            case 0x55:
            case 0x56:
            case 0x57:                  // PUSH reg
                if ((c1.Iop & 7) == r1)
                {   c1.Iop = 0x50 | r2;
                    //printf("schedule PUSH reg\n");
                }
                break;

            case 0x81:
            case 0x83:
                // Look for CMP EA,imm
                if (reg == modregrm(0,7,0))
                {
                    if (mod == 0xC0 && rm == r1)
                        c1.Irm = cast(ubyte)(mod | reg | r2);
                }
                break;

            case 0x84:                  // TEST reg,byte ptr EA
                if (r1 >= 4 || r2 >= 4) // if not a byte register
                    break;
                if ((rmn & 0xC0) == 0xC0)
                {
                    // Compare only the low 2 bits and keep bit 2 of the
                    // field, so the high-byte form of r1 maps to the
                    // high-byte form of r2
                    if ((rmn & 3) == r1)
                    {   c1.Irm = rmn = cast(ubyte)((rmn & modregrm(3,7,4)) | r2);
                        //printf("schedule 1\n");
                    }
                }
                if ((rmn & modregrm(0,3,0)) == modregrm(0,r1,0))
                {   c1.Irm = (rmn & modregrm(3,4,7)) | modregrm(0,r2,0);
                    //printf("schedule 2\n");
                }
                break;
            case 0x85:                  // TEST reg,word ptr EA
                if ((rmn & 0xC0) == 0xC0)
                {
                    if ((rmn & 7) == r1)
                    {   c1.Irm = rmn = cast(ubyte)((rmn & modregrm(3,7,0)) | r2);
                        //printf("schedule 3\n");
                    }
                }
                if ((rmn & modregrm(0,7,0)) == modregrm(0,r1,0))
                {   c1.Irm = (rmn & modregrm(3,0,7)) | modregrm(0,r2,0);
                    //printf("schedule 4\n");
                }
                break;

            case 0x89:                  // MOV EA,reg
                if ((rmn & modregrm(0,7,0)) == modregrm(0,r1,0))
                {   c1.Irm = (rmn & modregrm(3,0,7)) | modregrm(0,r2,0);
                    //printf("schedule 5\n");
                    if (c1.Irm == modregrm(3,r2,r2))    // MOV r2,r2 does nothing
                        goto Lnop;
                }
                break;

            case 0x8B:                  // MOV reg,EA
                if ((rmn & 0xC0) == 0xC0 &&
                    (rmn & 7) == r1)            // if EA == r1
                {   c1.Irm = cast(ubyte)((rmn & modregrm(3,7,0)) | r2);
                    //printf("schedule 6\n");
                    if (c1.Irm == modregrm(3,r2,r2))    // MOV r2,r2 does nothing
                        goto Lnop;
                }
                break;

            case 0x3C:                  // CMP AL,imm8
                if (r1 == AX && r2 < 4)
                {   c1.Iop = 0x80;
                    c1.Irm = modregrm(3,7,r2);
                    //printf("schedule 7, r2 = %d\n", r2);
                }
                break;

            case 0x3D:                  // CMP AX,imm16
                if (r1 == AX)
                {   c1.Iop = 0x81;
                    c1.Irm = modregrm(3,7,r2);
                    // Use the sign-extended imm8 form when the value fits
                    if (c1.IFL2 == FLconst &&
                        c1.IEV2.Vuns == cast(byte)c1.IEV2.Vuns)
                        c1.Iop = 0x83;
                    //printf("schedule 8\n");
                }
                break;

            default:
                break;
        }
        continue;
    Lnop:
        // c1 became a no-op: blank it and retry the same c (the MOV r1,r2)
        // against the instruction that follows c1
        c1.Iop = NOP;
        c1 = cnext(c1);
        goto Ln;
    }
    return cstart;
}

/*****************************************************************/

/**********************************************
 * Replace complex instructions with simple ones more conducive
 * to scheduling.
 *
 * CMP mem,imm8 and PUSH mem are each split into a MOV reg,mem followed by
 * the register form, using a register taken from `scratch`.  Nothing is done
 * unless at least two scratch registers remain after removing fregsaved.
 * Instructions flagged CFtarg, CFtarg2 or CFopsize are left alone.
 * Params:
 *      c       = first instruction of the list
 *      scratch = mask of registers that may be clobbered
 * Returns:
 *      head of the list (new nodes are only inserted after existing ones)
 */

code *simpleops(code *c,regm_t scratch)
{   code *cstart;
    uint reg;
    code *c2;

    // Worry about using registers not saved yet by prolog
    scratch &= ~fregsaved;

    if (!(scratch & (scratch - 1)))     // if 0 or 1 registers
        return c;

    reg = findreg(scratch);

    cstart = c;
    for (code** pc = &cstart; *pc; pc = &(*pc).next)
    {
        c = *pc;
        if (c.Iflags & (CFtarg | CFtarg2 | CFopsize))
            continue;
        if (c.Iop == 0x83 &&
            (c.Irm & modregrm(0,7,0)) == modregrm(0,7,0) &&
            (c.Irm & modregrm(3,0,0)) != modregrm(3,0,0)
           )
        {   // Replace CMP mem,imm with:
            //  MOV reg,mem
            //  CMP reg,imm
            targ_long imm;

            //printf("replacing CMP\n");
            c.Iop = 0x8B;
            c.Irm = (c.Irm & modregrm(3,0,7)) | modregrm(0,reg,0);

            c2 = code_calloc();
            if (reg == AX)
                c2.Iop = 0x3D;
            else
            {   c2.Iop = 0x83;
                c2.Irm = modregrm(3,7,reg);
            }
            c2.IFL2 = c.IFL2;
            c2.IEV2 = c.IEV2;

            // See if c2 should be replaced by a TEST
            imm = c2.IEV2.Vuns;
            if (!(c2.Iop & 1))
                imm &= 0xFF;
            else if (I32 ? c.Iflags & CFopsize : !(c.Iflags & CFopsize))
                imm = cast(short) imm;
            if (imm == 0)
            {
                c2.Iop = 0x85;                  // TEST reg,reg
                c2.Irm = modregrm(3,reg,reg);
            }
            goto L1;
        }
        else if (c.Iop == 0xFF &&
            (c.Irm & modregrm(0,7,0)) == modregrm(0,6,0) &&
            (c.Irm & modregrm(3,0,0)) != modregrm(3,0,0)
           )
        {   // Replace PUSH mem with:
            //  MOV reg,mem
            //  PUSH reg

            // printf("replacing PUSH\n");
            c.Iop = 0x8B;
            c.Irm = (c.Irm & modregrm(3,0,7)) | modregrm(0,reg,0);

            c2 = gen1(null,0x50 + reg);
        L1:
            //c.print();
            //c2.print();
            // Link the new instruction in right after c
            c2.next = c.next;
            c.next = c2;

            // Switch to another reg
            if (scratch & ~mask(reg))
                reg = findreg(scratch & ~mask(reg));
        }
    }
    return cstart;
}

}