/**
 * Compiler implementation of the
 * $(LINK2 http://www.dlang.org, D programming language).
 *
 * Instruction scheduling support for the x86 code generator: per-opcode
 * pairing, read/write and micro-op tables plus the routines that use them.
 *
 * Copyright:   Copyright (C) 1995-1998 by Symantec
 *              Copyright (C) 2000-2020 by The D Language Foundation, All Rights Reserved
 * Authors:     $(LINK2 http://www.digitalmars.com, Walter Bright)
 * License:     $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 * Source:      $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/backend/cgsched.d, backend/cgsched.d)
 */

module dmd.backend.cgsched;

// The module body is compiled for both the SCPP and the MARS builds.
version (SCPP)
    version = COMPILE;
version (MARS)
    version = COMPILE;

version (COMPILE)
{

import core.stdc.stdio;
import core.stdc.stdlib;
import core.stdc.string;        // memset (was misspelled `core.stdc..string`)

import dmd.backend.cc;
import dmd.backend.cdef;
import dmd.backend.code;
import dmd.backend.code_x86;
import dmd.backend.dlist;
import dmd.backend.global;
import dmd.backend.mem;
import dmd.backend.ty;
import dmd.backend.barray;

extern (C++):

nothrow:

// Prototypes of routines implemented outside this module.
int REGSIZE();
code *gen1(code *c, uint op);
code *gen2(code *c, uint op, uint rm);

/// Returns: a mask with only bit number `m` set (`1 << m`)
private uint mask(uint m) { return 1 << m; }

// is32bitaddr works correctly only when x is 0 or 1.  This is
// true today for the current definition of I32, but if the definition
// of I32 changes, this function will need to change as well
//
// Note: even for linux targets, CFaddrsize can be set by the inline
// assembler.
private bool is32bitaddr(bool x, uint Iflags)
{
    // When I64 is set the answer is always true.
    if (I64)
        return true;

    // Otherwise the CFaddrsize flag inverts the default passed in as x.
    const bool sizeOverride = (Iflags & CFaddrsize) != 0;
    return x != sizeOverride;
}

/// Returns: true if config.target_cpu is TARGET_PentiumPro or higher,
/// i.e. the Pentium Pro scheduler is to be used
private bool PRO()
{
    return config.target_cpu >= TARGET_PentiumPro;
}

/// Floating point operation kinds recorded in Cinfo.fp_op (0 means none)
private enum FP : ubyte
{
    fstp = 1,       /// FSTP mem
    fld  = 2,       /// FLD mem
    fop  = 3,       /// Fop ST0,mem or Fop ST0
}

/// Bit flags stored in Cinfo.flags
private enum CIFL : ubyte
{
    arraybounds = 1,     /// this instruction is a jmp to array bounds
    ea          = 2,     /// this instruction has a memory-referencing
                         /// modregrm EA byte
    nostage     = 4,     /// don't stage these instructions
    push        = 8,     /// it's a push we can swap around
}

/// Everything gathered about a single instruction
struct Cinfo
{
    code *c;            // the instruction
    ubyte pair;         // pairing information
    ubyte sz;           // operand size
    ubyte isz;          // instruction size

    // For floating point scheduling
    ubyte fxch_pre;
    ubyte fxch_post;
    FP fp_op;           /// FPxxxx

    ubyte flags;        /// CIFLxxx

    uint r;             // read mask
    uint w;             // write mask
    uint a;             // registers used in addressing mode
    ubyte reg;          // reg field of modregrm byte
    ubyte uops;         // Pentium Pro micro-ops
    uint sibmodrm;      // (sib << 8) + mod__rm byte
    uint spadjust;      // if !=0, then amount ESP changes as a result of this
                        // instruction being executed
    int fpuadjust;      // if !=0, then amount FPU stack changes as a result
                        // of this instruction being executed

    /// Dump the contents of this Cinfo to stdout (debugging aid)
    nothrow void print()
    {
        Cinfo *self = &this;

        // Tolerate being invoked through a null pointer
        if (self == null)
        {
            printf("Cinfo 0\n");
            return;
        }

        printf("Cinfo %p: c %p, pair %x, sz %d, isz %d, flags - ",
                self,c,pair,sz,isz);

        // One name per known flag bit, then complain about any stray bits
        const uint fl = flags;
        if (fl & CIFL.arraybounds)
            printf("arraybounds,");
        if (fl & CIFL.ea)
            printf("ea,");
        if (fl & CIFL.nostage)
            printf("nostage,");
        if (fl & CIFL.push)
            printf("push,");
        const uint known = CIFL.arraybounds | CIFL.nostage | CIFL.push | CIFL.ea;
        if (fl & ~known)
            printf("bad flag,");

        printf("\n\tr %x w %x a %x reg %x uops %x sibmodrm %x spadjust %d\n",
                cast(int)r,cast(int)w,cast(int)a,reg,uops,sibmodrm,cast(int)spadjust);

        // Floating point details are only printed when an FP kind is recorded
        if (!fp_op)
            return;

        __gshared const(char*)[3] fpops = ["fstp","fld","fop"];

        printf("\tfp_op %s, fxch_pre %x, fxch_post %x\n",
                fpops[fp_op-1],fxch_pre,fxch_post);
    }

}


/*****************************************
 * Do Pentium optimizations.
 * Params:
 *      pc =      pointer to the head of the code list; updated in place
 *      scratch = scratch registers we can use
 */

private void cgsched_pentium(code **pc,regm_t scratch)
{
    //printf("scratch = x%02x\n",scratch);
    if (config.target_scheduler < TARGET_80486)
        return;                         // nothing to do for older schedulers

    if (!I64)
        *pc = peephole(*pc,0);

    if (!I32)                           // forget about 16 bit code
        return;

    const bool plainPentium = config.target_cpu == TARGET_Pentium ||
                              config.target_cpu == TARGET_PentiumMMX;
    if (plainPentium)
        *pc = simpleops(*pc,scratch);
    *pc = schedule(*pc,0);
}

/************************************
 * Entry point: schedule the code of block b when optimizing for speed
 * on a Pentium or better, unless the block is inline assembler.
 */
void cgsched_block(block* b)
{
    if (!(config.flags4 & CFG4speed))
        return;
    if (config.target_cpu < TARGET_Pentium)
        return;
    if (b.BC == BCasm)
        return;

    // Scratch registers are allregs minus everything the block has in use
    regm_t scratch = allregs
        & ~(b.Bregcon.used | b.Bregcon.params | mfuncreg)
        & ~(b.Bregcon.immed.mval | b.Bregcon.cse.mval);

    cgsched_pentium(&b.Bcode,scratch);
    //printf("after schedule:\n"); WRcodlst(b.Bcode);
}

/// Pairing classes stored in pentcycl[] and returned by pair_class()
enum
{
    NP    = 0,       /// not pairable
    PU    = 1,       /// pairable in U only, never executed in V
    PV    = 2,       /// pairable in V only
    UV    = (PU|PV), /// pairable in both U and V
    PE    = 4,       /// register contention exception
    PF    = 8,       /// flags contention exception
    FX    = 0x10,    /// pairable with FXCH instruction
}

/// Default pairing class for each one-byte opcode
extern (D) private immutable ubyte[256] pentcycl =
[
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 0
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 8
        PU,PU,PU,PU,    PU,PU,NP,NP,    // 10
        PU,PU,PU,PU,    PU,PU,NP,NP,    // 18
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 20
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 28
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 30
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 38

        UV,UV,UV,UV,    UV,UV,UV,UV,    // 40
        UV,UV,UV,UV,    UV,UV,UV,UV,    // 48
        PE|UV,PE|UV,PE|UV,PE|UV,        PE|UV,PE|UV,PE|UV,PE|UV, // 50  PUSH reg
        PE|UV,PE|UV,PE|UV,PE|UV,        PE|UV,PE|UV,PE|UV,PE|UV, // 58  POP reg
        NP,NP,NP,NP,    NP,NP,NP,NP,    // 60
        PE|UV,NP,PE|UV,NP,      NP,NP,NP,NP,    // 68
        PV|PF,PV|PF,PV|PF,PV|PF,        PV|PF,PV|PF,PV|PF,PV|PF,        // 70   Jcc rel8
        PV|PF,PV|PF,PV|PF,PV|PF,        PV|PF,PV|PF,PV|PF,PV|PF,        // 78   Jcc rel8

        NP,NP,NP,NP,    NP,NP,NP,NP,    // 80
        UV,UV,UV,UV,    NP,UV,NP,NP,    // 88
        NP,NP,NP,NP,    NP,NP,NP,NP,    // 90
        NP,NP,NP,NP,    NP,NP,NP,NP,    // 98
        UV,UV,UV,UV,    NP,NP,NP,NP,    // A0
        UV,UV,NP,NP,    NP,NP,NP,NP,    // A8
        UV,UV,UV,UV,    UV,UV,UV,UV,    // B0
        UV,UV,UV,UV,    UV,UV,UV,UV,    // B8

        NP,NP,NP,NP,    NP,NP,NP,NP,    // C0
        NP,NP,NP,NP,    NP,NP,NP,NP,    // C8
        PU,PU,NP,NP,    NP,NP,NP,NP,    // D0
        FX,NP,FX,FX,    NP,NP,FX,NP,    // D8   all floating point
        NP,NP,NP,NP,    NP,NP,NP,NP,    // E0
        PE|PV,PV,NP,PV, NP,NP,NP,NP,    // E8
        NP,NP,NP,NP,    NP,NP,NP,NP,    // F0
        NP,NP,NP,NP,    NP,NP,NP,NP,    // F8
];

/********************************************
 * For each opcode, determine read [0] and written [1] masks.
 */

// Flag bits that are OR'ed together with register masks (mAX, mSP, ...)
// in the read/write tables below.
enum
{
    EA    = 0x100000,   // presumably: the modregrm effective address operand - TODO confirm
    R     = 0x200000,   /// register (reg of modregrm field)
    N     = 0x400000,   /// other things modified, not swappable
    B     = 0x800000,   /// it's a byte operation
    C     = 0x1000000,  /// floating point flags
    mMEM  = 0x2000000,  /// memory
    S     = 0x4000000,  /// floating point stack
    F     = 0x8000000,  /// flags
}

// One [read, write] pair per opcode; getinfo() indexes this table with the
// low opcode byte (0x0F for two-byte opcodes).
extern (D) private immutable uint[2][256] oprw =
[
      // 00
      [ EA|R|B, F|EA|B ],       // ADD
      [ EA|R,   F|EA   ],
      [ EA|R|B, F|R|B  ],
      [ EA|R,   F|R    ],
      [ mAX,    F|mAX  ],
      [ mAX,    F|mAX  ],
      [ N,      N      ],       // PUSH ES
      [ N,      N      ],       // POP  ES

      // 08
      [ EA|R|B, F|EA|B ],       // OR
      [ EA|R,   F|EA   ],
      [ EA|R|B, F|R|B  ],
      [ EA|R,   F|R    ],
      [ mAX,    F|mAX  ],
      [ mAX,    F|mAX  ],
      [ N,      N      ],       // PUSH CS
      [ N,      N      ],       // 2 byte escape

      // 10
      [ F|EA|R|B,F|EA|B ],      // ADC
      [ F|EA|R, F|EA    ],
      [ F|EA|R|B,F|R|B  ],
      [ F|EA|R, F|R     ],
      [ F|mAX,  F|mAX   ],
      [ F|mAX,  F|mAX   ],
      [ N,      N       ],      // PUSH SS
      [ N,      N       ],      // POP  SS

      // 18
      [ F|EA|R|B,F|EA|B ],      // SBB
      [ F|EA|R, F|EA    ],
      [ F|EA|R|B,F|R|B  ],
      [ F|EA|R, F|R     ],
      [ F|mAX,  F|mAX   ],
      [ F|mAX,  F|mAX   ],
      [ N,      N       ],      // PUSH DS
      [ N,      N       ],      // POP  DS

      // 20
      [ EA|R|B, F|EA|B ],       // AND
      [ EA|R,   F|EA   ],
      [ EA|R|B, F|R|B  ],
      [ EA|R,   F|R    ],
      [ mAX,    F|mAX  ],
      [ mAX,    F|mAX  ],
      [ N,      N      ],       // SEG ES
      [ F|mAX,  F|mAX  ],       // DAA

      // 28
      [ EA|R|B, F|EA|B ],       // SUB
      [ EA|R,   F|EA   ],
      [ EA|R|B, F|R|B  ],
      [ EA|R,   F|R    ],
      [ mAX,    F|mAX  ],
      [ mAX,    F|mAX  ],
      [ N,      N      ],       // SEG CS
      [ F|mAX,  F|mAX  ],       // DAS

      // 30
      [ EA|R|B, F|EA|B ],       // XOR
      [ EA|R,   F|EA   ],
      [ EA|R|B, F|R|B  ],
      [ EA|R,   F|R    ],
      [ mAX,    F|mAX  ],
      [ mAX,    F|mAX  ],
      [ N,      N      ],       // SEG SS
      [ F|mAX,  F|mAX  ],       // AAA

      // 38
      [ EA|R|B, F ],            // CMP
      [ EA|R,   F ],
      [ EA|R|B, F ],
      [ EA|R,   F ],
      [ mAX,    F ],            // CMP AL,imm8
      [ mAX,    F ],            // CMP EAX,imm16/32
      [ N,      N ],            // SEG DS
      [ N,      N ],            // AAS

      // 40
      [ mAX,    F|mAX ],        // INC EAX
      [ mCX,    F|mCX ],
      [ mDX,    F|mDX ],
      [ mBX,    F|mBX ],
      [ mSP,    F|mSP ],
      [ mBP,    F|mBP ],
      [ mSI,    F|mSI ],
      [ mDI,    F|mDI ],

      // 48
      [ mAX,    F|mAX ],        // DEC EAX
      [ mCX,    F|mCX ],
      [ mDX,    F|mDX ],
      [ mBX,    F|mBX ],
      [ mSP,    F|mSP ],
      [ mBP,    F|mBP ],
      [ mSI,    F|mSI ],
      [ mDI,    F|mDI ],

      // 50
      [ mAX|mSP,        mSP|mMEM ],             // PUSH EAX
      [ mCX|mSP,        mSP|mMEM ],
      [ mDX|mSP,        mSP|mMEM ],
      [ mBX|mSP,        mSP|mMEM ],
      [ mSP|mSP,        mSP|mMEM ],
      [ mBP|mSP,        mSP|mMEM ],
      [ mSI|mSP,        mSP|mMEM ],
      [ mDI|mSP,        mSP|mMEM ],

      // 58
      [ mSP|mMEM,       mAX|mSP ],              // POP EAX
      [ mSP|mMEM,       mCX|mSP ],
      [ mSP|mMEM,       mDX|mSP ],
      [ mSP|mMEM,       mBX|mSP ],
      [ mSP|mMEM,       mSP|mSP ],
      [ mSP|mMEM,       mBP|mSP ],
      [ mSP|mMEM,       mSI|mSP ],
      [ mSP|mMEM,       mDI|mSP ],

      // 60
      [ N,      N ],            // PUSHA
      [ N,      N ],            // POPA
      [ N,      N ],            // BOUND Gv,Ma
      [ N,      N ],            // ARPL  Ew,Rw
      [ N,      N ],            // SEG FS
      [ N,      N ],            // SEG GS
      [ N,      N ],            // operand size prefix
      [ N,      N ],            // address size prefix

      // 68
      [ mSP,    mSP|mMEM ],     // PUSH immed16/32
      [ EA,     F|R      ],     // IMUL Gv,Ev,lv
      [ mSP,    mSP|mMEM ],     // PUSH immed8
      [ EA,     F|R      ],     // IMUL Gv,Ev,lb
      [ N,      N        ],     // INSB Yb,DX
      [ N,      N        ],     // INSW/D Yv,DX
      [ N,      N        ],     // OUTSB DX,Xb
      [ N,      N        ],     // OUTSW/D DX,Xv

      // 70
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],

      // 78
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],

      // 80
      [ N,      N    ],
      [ N,      N    ],
      [ N,      N    ],
      [ N,      N    ],
      [ EA|R,   F    ],         // TEST EA,r8
      [ EA|R,   F    ],         // TEST EA,r16/32
      [ EA|R,   EA|R ],         // XCHG EA,r8
      [ EA|R,   EA|R ],         // XCHG EA,r16/32

      // 88
      [ R|B,            EA|B ],         // MOV EA8,r8
      [ R,              EA ],           // MOV EA,r16/32
      [ EA|B,           R|B ],          // MOV r8,EA8
      [ EA,             R ],            // MOV r16/32,EA
      [ N,              N ],            // MOV EA,segreg
      [ EA,             R ],            // LEA r16/32,EA
      [ N,              N ],            // MOV segreg,EA
      [ mSP|mMEM,       EA|mSP ],       // POP mem16/32

      // 90
      [ 0,              0       ],      // NOP
      [ mAX|mCX,        mAX|mCX ],
      [ mAX|mDX,        mAX|mDX ],
      [ mAX|mBX,        mAX|mBX ],
      [ mAX|mSP,        mAX|mSP ],
      [ mAX|mBP,        mAX|mBP ],
      [ mAX|mSI,        mAX|mSI ],
      [ mAX|mDI,        mAX|mDI ],

      // 98
      [ mAX,            mAX      ],     // CBW
      [ mAX,            mDX      ],     // CWD
      [ N,              N|F      ],     // CALL far ptr
      [ N,              N        ],     // WAIT
      [ F|mSP,          mSP|mMEM ],     // PUSHF
      [ mSP|mMEM,       F|mSP    ],     // POPF
      [ mAX,            F        ],     // SAHF
      [ F,              mAX      ],     // LAHF

      // A0
      [ mMEM,           mAX  ],         // MOV AL,moffs8
      [ mMEM,           mAX  ],         // MOV EAX,moffs32
      [ mAX,            mMEM ],         // MOV moffs8,AL
      [ mAX,            mMEM ],         // MOV moffs32,EAX
      [ N,              N    ],         // MOVSB
      [ N,              N    ],         // MOVSW/D
      [ N,              N    ],         // CMPSB
      [ N,              N    ],         // CMPSW/D

      // A8
      [ mAX,    F ],                    // TEST AL,imm8
      [ mAX,    F ],                    // TEST AX,imm16
      [ N,      N ],                    // STOSB
      [ N,      N ],                    // STOSW/D
      [ N,      N ],                    // LODSB
      [ N,      N ],                    // LODSW/D
      [ N,      N ],                    // SCASB
      [ N,      N ],                    // SCASW/D

      // B0
      [ 0,      mAX ],                  // MOV AL,imm8
      [ 0,      mCX ],
      [ 0,      mDX ],
      [ 0,      mBX ],
      [ 0,      mAX ],
      [ 0,      mCX ],
      [ 0,      mDX ],
      [ 0,      mBX ],

      // B8
      [ 0,      mAX ],                  // MOV AX,imm16
      [ 0,      mCX ],
      [ 0,      mDX ],
      [ 0,      mBX ],
      [ 0,      mSP ],
      [ 0,      mBP ],
      [ 0,      mSI ],
      [ 0,      mDI ],

      // C0
      [ EA,     F|EA ],         // Shift Eb,Ib
      [ EA,     F|EA ],
      [ N,      N    ],
      [ N,      N    ],
      [ N,      N    ],
      [ N,      N    ],
      [ 0,      EA|B ],         // MOV EA8,imm8
      [ 0,      EA   ],         // MOV EA,imm16

      // C8
      [ N,      N ],            // ENTER
      [ N,      N ],            // LEAVE
      [ N,      N ],            // RETF lw
      [ N,      N ],            // RETF
      [ N,      N ],            // INT 3
      [ N,      N ],            // INT lb
      [ N,      N ],            // INTO
      [ N,      N ],            // IRET

      // D0
      [ EA,             F|EA  ],        // Shift EA,1
      [ EA,             F|EA  ],
      [ EA|mCX,         F|EA  ],        // Shift EA,CL
      [ EA|mCX,         F|EA  ],
      [ mAX,            F|mAX ],        // AAM
      [ mAX,            F|mAX ],        // AAD
      [ N,              N     ],        // reserved
      [ mAX|mBX|mMEM,   mAX   ],        // XLAT

      // D8
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],

      // E0
      [ F|mCX|N,mCX|N ],        // LOOPNE jb
      [ F|mCX|N,mCX|N ],        // LOOPE  jb
      [ mCX|N,  mCX|N ],        // LOOP   jb
      [ mCX|N,  N     ],        // JCXZ   jb
      [ N,      N     ],        // IN AL,lb
      [ N,      N     ],        // IN EAX,lb
      [ N,      N     ],        // OUT lb,AL
      [ N,      N     ],        // OUT lb,EAX

      // E8
      [ N,      N|F   ],        // CALL jv
      [ N,      N     ],        // JMP Jv
      [ N,      N     ],        // JMP Ab
      [ N,      N     ],        // JMP jb
      [ N|mDX,  N|mAX ],        // IN AL,DX
      [ N|mDX,  N|mAX ],        // IN AX,DX
      [ N|mAX|mDX,N   ],        // OUT DX,AL
      [ N|mAX|mDX,N   ],        // OUT DX,AX

      // F0
      [ N,      N ],            // LOCK
      [ N,      N ],            // reserved
      [ N,      N ],            // REPNE
      [ N,      N ],            // REP,REPE
      [ N,      N ],            // HLT
      [ F,      F ],            // CMC
      [ N,      N ],
      [ N,      N ],

      // F8
      [ 0,      F    ],         // CLC
      [ 0,      F    ],         // STC
      [ N,      N    ],         // CLI
      [ N,      N    ],         // STI
      [ N,      N    ],         // CLD
      [ N,      N    ],         // STD
      [ EA,     F|EA ],         // INC/DEC
      [ N,      N    ],
];

/****************************************
 * Same thing, but for groups.
 */

// Read/write masks for the group opcodes, indexed [group][reg field][0=read, 1=write].
// getinfo() uses group 0 for opcodes 0x80/0x81/0x83, group 1 for 0xF7,
// group 2 for 0xFF and group 3 for 0xF6.
// NOTE(review): the outer dimension is declared as 8 but only 4 groups are
// initialized below - confirm whether [4] was intended.
extern (D) private immutable uint[2][8][8] grprw =
[
    [
        // Grp 1
        [ EA,     F|EA ],           // ADD
        [ EA,     F|EA ],           // OR
        [ F|EA,   F|EA ],           // ADC
        [ F|EA,   F|EA ],           // SBB
        [ EA,     F|EA ],           // AND
        [ EA,     F|EA ],           // SUB
        [ EA,     F|EA ],           // XOR
        [ EA,     F    ],           // CMP
    ],
    [
        // Grp 3
        [ EA,     F ],              // TEST EA,imm
        [ N,      N ],              // reserved
        [ EA,     EA ],             // NOT
        [ EA,     F|EA ],           // NEG
        [ mAX|EA, F|mAX|mDX ],      // MUL
        [ mAX|EA, F|mAX|mDX ],      // IMUL
        [ mAX|mDX|EA, F|mAX|mDX ],  // DIV

        // Could generate an exception we want to catch
        //mAX|mDX|EA|N,   F|mAX|mDX|N,    // IDIV

        [ mAX|mDX|EA,     F|mAX|mDX ],    // IDIV
    ],
    [
        // Grp 5
        [ EA,     F|EA ],           // INC Ev
        [ EA,     F|EA ],           // DEC Ev
        [ N|EA,   N ],              // CALL Ev
        [ N|EA,   N ],              // CALL eP
        [ N|EA,   N ],              // JMP Ev
        [ N|EA,   N ],              // JMP Ep
        [ mSP|EA, mSP|mMEM ],       // PUSH Ev
        [ N,      N ],              // reserved
    ],
    [
        // Grp 3, byte version
        [ EA|B,   F ],              // TEST EA,imm
        [ N,      N ],              // reserved
        [ EA|B,   EA|B ],           // NOT
        [ EA|B,   F|EA|B ],         // NEG
        [ mAX|EA, F|mAX ],          // MUL
        [ mAX|EA, F|mAX ],          // IMUL
        [ mAX|EA, F|mAX ],          // DIV

        // Could generate an exception we want to catch
        //mAX|EA|N,       F|mAX|N,        // IDIV

        [ mAX|EA, F|mAX ],          // IDIV
    ]
];

/********************************************
 * For floating point opcodes 0xD8..0xDF, with Irm < 0xC0.
 *      [][][0] = read
 *          [1] = write
 */

// Indexed [opcode - 0xD8][reg field of Irm][0=read, 1=write] by getinfo().
extern (D) private immutable uint[2][8][8] grpf1 =
[
    [
        // 0xD8
        [ EA|S,   S|C ],    // FADD  float
        [ EA|S,   S|C ],    // FMUL  float
        [ EA|S,     C ],    // FCOM  float
        [ EA|S,   S|C ],    // FCOMP float
        [ EA|S,   S|C ],    // FSUB  float
        [ EA|S,   S|C ],    // FSUBR float
        [ EA|S,   S|C ],    // FDIV  float
        [ EA|S,   S|C ],    // FDIVR float
    ],
    [
        // 0xD9
        [ EA,     S|C ],    // FLD  float
        [ N,      N   ],    //
        [ S,      EA|C ],   // FST  float
        [ S,      EA|S|C ], // FSTP float
        [ N,      N   ],    // FLDENV
        [ N,      N   ],    // FLDCW
        [ N,      N   ],    // FSTENV
        [ N,      N   ],    // FSTCW
    ],
    [
        // 0xDA
        [ EA|S,   S|C ],    // FIADD  long
        [ EA|S,   S|C ],    // FIMUL  long
        [ EA|S,     C ],    // FICOM  long
        [ EA|S,   S|C ],    // FICOMP long
        [ EA|S,   S|C ],    // FISUB  long
        [ EA|S,   S|C ],    // FISUBR long
        [ EA|S,   S|C ],    // FIDIV  long
        [ EA|S,   S|C ],    // FIDIVR long
    ],
    [
        // 0xDB
        [ EA,     S|C ],    // FILD long
        [ S,      EA|S|C ], // FISTTP int
        [ S,      EA|C ],   // FIST long
        [ S,      EA|S|C ], // FISTP long
        [ N,      N   ],    //
        [ EA,     S|C ],    // FLD real80
        [ N,      N   ],    //
        [ S,      EA|S|C ], // FSTP real80
    ],
    [
        // 0xDC
        [ EA|S,   S|C ],    // FADD  double
        [ EA|S,   S|C ],    // FMUL  double
        [ EA|S,     C ],    // FCOM  double
        [ EA|S,   S|C ],    // FCOMP double
        [ EA|S,   S|C ],    // FSUB  double
        [ EA|S,   S|C ],    // FSUBR double
        [ EA|S,   S|C ],    // FDIV  double
        [ EA|S,   S|C ],    // FDIVR double
    ],
    [
        // 0xDD
        [ EA,     S|C ],    // FLD double
        [ S,      EA|S|C ], // FISTTP long
        [ S,      EA|C ],   // FST double
        [ S,      EA|S|C ], // FSTP double
        [ N,      N   ],    // FRSTOR
        [ N,      N   ],    //
        [ N,      N   ],    // FSAVE
        [ C,      EA  ],    // FSTSW
    ],
    [
        // 0xDE
        [ EA|S,   S|C ],    // FIADD  short
        [ EA|S,   S|C ],    // FIMUL  short
        [ EA|S,     C ],    // FICOM  short
        [ EA|S,   S|C ],    // FICOMP short
        [ EA|S,   S|C ],    // FISUB  short
        [ EA|S,   S|C ],    // FISUBR short
        [ EA|S,   S|C ],    // FIDIV  short
        [ EA|S,   S|C ],    // FIDIVR short
    ],
    [
        // 0xDF
        [ EA,     S|C ],    // FILD short
        [ S,      EA|S|C ], // FISTTP short
        [ S,      EA|C ],   // FIST short
        [ S,      EA|S|C ], // FISTP short
        [ EA,     S|C ],    // FBLD packed BCD
        [ EA,     S|C ],    // FILD long long
        [ S,      EA|S|C ], // FBSTP packed BCD
        [ S,      EA|S|C ], // FISTP long long
    ]
];


/********************************************
 * Micro-ops for floating point opcodes 0xD8..0xDF, with Irm < 0xC0.
 */

// Indexed [opcode - 0xD8][reg field of Irm] by uops().
extern (D) private immutable ubyte[8][8] uopsgrpf1 =
[
    [
        // 0xD8
        2,              // FADD  float
        2,              // FMUL  float
        2,              // FCOM  float
        2,              // FCOMP float
        2,              // FSUB  float
        2,              // FSUBR float
        2,              // FDIV  float
        2,              // FDIVR float
    ],
    [
        // 0xD9
        1,              // FLD  float
        0,              //
        2,              // FST  float
        2,              // FSTP float
        5,              // FLDENV
        3,              // FLDCW
        5,              // FSTENV
        5,              // FSTCW
    ],
    [
        // 0xDA
        5,              // FIADD  long
        5,              // FIMUL  long
        5,              // FICOM  long
        5,              // FICOMP long
        5,              // FISUB  long
        5,              // FISUBR long
        5,              // FIDIV  long
        5,              // FIDIVR long
    ],
    [
        // 0xDB
        4,              // FILD long
        0,              //
        4,              // FIST long
        4,              // FISTP long
        0,              //
        4,              // FLD real80
        0,              //
        5,              // FSTP real80
    ],
    [
        // 0xDC
        2,              // FADD  double
        2,              // FMUL  double
        2,              // FCOM  double
        2,              // FCOMP double
        2,              // FSUB  double
        2,              // FSUBR double
        2,              // FDIV  double
        2,              // FDIVR double
    ],
    [
        // 0xDD
        1,              // FLD double
        0,              //
        2,              // FST double
        2,              // FSTP double
        5,              // FRSTOR
        0,              //
        5,              // FSAVE
        5,              // FSTSW
    ],
    [
        // 0xDE
        5,              // FIADD  short
        5,              // FIMUL  short
        5,              // FICOM  short
        5,              // FICOMP short
        5,              // FISUB  short
        5,              // FISUBR short
        5,              // FIDIV  short
        5,              // FIDIVR short
    ],
    [
        // 0xDF
        4,              // FILD short
        0,              //
        4,              // FIST short
        4,              // FISTP short
        5,              // FBLD packed BCD
        4,              // FILD long long
        5,              // FBSTP packed BCD
        4,              // FISTP long long
    ]
];

/**************************************************
 * Determine number of micro-ops for Pentium Pro and Pentium II processors.
 * 0 means special case,
 * 5 means 'complex'
 */

// Indexed by the low opcode byte (0x0F for two-byte opcodes) in uops().
extern (D) private immutable ubyte[256] insuops =
[       0,0,0,0, 1,1,4,5,               /* 00 */
        0,0,0,0, 1,1,4,0,               /* 08 */
        0,0,0,0, 2,2,4,5,               /* 10 */
        0,0,0,0, 2,2,4,5,               /* 18 */
        0,0,0,0, 1,1,0,1,               /* 20 */
        0,0,0,0, 1,1,0,1,               /* 28 */
        0,0,0,0, 1,1,0,1,               /* 30 */
        0,0,0,0, 1,1,0,1,               /* 38 */
        1,1,1,1, 1,1,1,1,               /* 40 */
        1,1,1,1, 1,1,1,1,               /* 48 */
        3,3,3,3, 3,3,3,3,               /* 50 */
        2,2,2,2, 3,2,2,2,               /* 58 */
        5,5,5,5, 0,0,0,0,               /* 60 */
        3,3,0,0, 5,5,5,5,               /* 68 */
        1,1,1,1, 1,1,1,1,               /* 70 */
        1,1,1,1, 1,1,1,1,               /* 78 */
        0,0,0,0, 0,0,0,0,               /* 80 */
        0,0,0,0, 0,1,4,0,               /* 88 */
        1,3,3,3, 3,3,3,3,               /* 90 */
        1,1,5,0, 5,5,1,1,               /* 98 */
        1,1,2,2, 5,5,5,5,               /* A0 */
        1,1,3,3, 2,2,3,3,               /* A8 */
        1,1,1,1, 1,1,1,1,               /* B0 */
        1,1,1,1, 1,1,1,1,               /* B8 */
        0,0,5,4, 0,0,0,0,               /* C0 */
        5,3,5,5, 5,3,5,5,               /* C8 */
        0,0,0,0, 4,3,0,2,               /* D0 */
        0,0,0,0, 0,0,0,0,               /* D8 */
        4,4,4,2, 5,5,5,5,               /* E0 */
        4,1,5,1, 5,5,5,5,               /* E8 */
        0,0,5,5, 5,1,0,0,               /* F0 */
        1,1,5,5, 4,4,0,0,               /* F8 */
];

// Default micro-op counts for floating point opcodes 0xD8..0xDF when
// Irm >= 0xC0; indexed by (opcode - 0xD8) in uops().
extern (D) private immutable ubyte[8] uopsx = [ 1,1,2,5,1,1,1,5 ];

/************************************************
 * Determine number of micro-ops for Pentium Pro and Pentium II processors.
 * 5 means 'complex'.
 * Doesn't currently handle:
 *      floating point
 *      MMX
 *      0F opcodes
 *      prefix bytes
 */

// Looks up insuops[] by opcode; a table value of 0 is refined below from the
// mod and reg fields of c.Irm. Anything still 0 afterwards is reported as 5.
private int uops(code *c)
{   int n;
    int op;
    int op2;

    op = c.Iop & 0xFF;
    if ((c.Iop & 0xFF00) == 0x0F00)     // two-byte opcodes share table slot 0x0F
        op = 0x0F;
    n = insuops[op];
    if (!n)                             // if special case
    {   ubyte irm,mod,reg,rm;

        // Split the modregrm byte into its three fields
        irm = c.Irm;
        mod = (irm >> 6) & 3;
        reg = (irm >> 3) & 7;
        rm = irm & 7;

        switch (op)
        {
            case 0x10:
            case 0x11:                  // ADC rm,r
            case 0x18:
            case 0x19:                  // SBB rm,r
                n = (mod == 3) ? 2 : 4;
                break;

            case 0x12:
            case 0x13:                  // ADC r,rm
            case 0x1A:
            case 0x1B:                  // SBB r,rm
                n = (mod == 3) ? 2 : 3;
                break;

            case 0x00:
            case 0x01:                  // ADD rm,r
            case 0x08:
            case 0x09:                  // OR rm,r
            case 0x20:
            case 0x21:                  // AND rm,r
            case 0x28:
            case 0x29:                  // SUB rm,r
            case 0x30:
            case 0x31:                  // XOR rm,r
                n = (mod == 3) ? 1 : 4;
                break;

            case 0x02:
            case 0x03:                  // ADD r,rm
            case 0x0A:
            case 0x0B:                  // OR r,rm
            case 0x22:
            case 0x23:                  // AND r,rm
            case 0x2A:
            case 0x2B:                  // SUB r,rm
            case 0x32:
            case 0x33:                  // XOR r,rm
            case 0x38:
            case 0x39:                  // CMP rm,r
            case 0x3A:
            case 0x3B:                  // CMP r,rm
            case 0x69:                  // IMUL rm,r,imm
            case 0x6B:                  // IMUL rm,r,imm8
            case 0x84:
            case 0x85:                  // TEST rm,r
                n = (mod == 3) ? 1 : 2;
                break;

            case 0x80:
            case 0x81:
            case 0x82:
            case 0x83:
                if (reg == 2 || reg == 3)       // ADC/SBB rm,imm
                    n = (mod == 3) ? 2 : 4;
                else if (reg == 7)              // CMP rm,imm
                    n = (mod == 3) ? 1 : 2;
                else
                    n = (mod == 3) ? 1 : 4;
                break;

            case 0x86:
            case 0x87:                          // XCHG rm,r
                n = (mod == 3) ? 3 : 5;
                break;

            case 0x88:
            case 0x89:                          // MOV rm,r
                n = (mod == 3) ? 1 : 2;
                break;

            case 0x8A:
            case 0x8B:                          // MOV r,rm
                n = 1;
                break;

            case 0x8C:                          // MOV Sreg,rm
                n = (mod == 3) ? 1 : 3;
                break;

            case 0x8F:
                if (reg == 0)                   // POP m
                    n = 5;
                break;

            case 0xC6:
            case 0xC7:
                if (reg == 0)                   // MOV rm,imm
                    n = (mod == 3) ? 1 : 2;
                break;

            case 0xD0:
            case 0xD1:
                if (reg == 2 || reg == 3)       // RCL/RCR rm,1
                    n = (mod == 3) ? 2 : 4;
                else
                    n = (mod == 3) ? 1 : 4;
                break;

            case 0xC0:
            case 0xC1:                          // RCL/RCR rm,imm8
            case 0xD2:
            case 0xD3:
                if (reg == 2 || reg == 3)       // RCL/RCR rm,CL
                    n = 5;
                else
                    n = (mod == 3) ? 1 : 4;
                break;

            case 0xD8:
            case 0xD9:
            case 0xDA:
            case 0xDB:
            case 0xDC:
            case 0xDD:
            case 0xDE:
            case 0xDF:
                // Floating point opcodes
                if (irm < 0xC0)
                {   n = uopsgrpf1[op - 0xD8][reg];
                    break;
                }
                // irm >= 0xC0: start from the per-opcode default, then
                // override the individual encodings listed below
                n = uopsx[op - 0xD8];
                switch (op)
                {
                    case 0xD9:
                        switch (irm)
                        {
                            case 0xE0:          // FCHS
                                n = 3;
                                break;
                            case 0xE8:
                            case 0xE9:
                            case 0xEA:
                            case 0xEB:
                            case 0xEC:
                            case 0xED:
                                n = 2;
                                break;
                            case 0xF0:
                            case 0xF1:
                            case 0xF2:
                            case 0xF3:
                            case 0xF4:
                            case 0xF5:
                            case 0xF8:
                            case 0xF9:
                            case 0xFB:
                            case 0xFC:
                            case 0xFD:
                            case 0xFE:
                            case 0xFF:
                                n = 5;
                                break;

                            default:
                                break;
                        }
                        break;
                    case 0xDE:
                        if (irm == 0xD9)        // FCOMPP
                            n = 2;
                        break;

                    default:
                        break;
                }
                break;

            case 0xF6:
                if (reg == 6 || reg == 7)       // DIV AL,rm8
                    n = (mod == 3) ? 3 : 4;
                else if (reg == 4 || reg == 5 || reg == 0)      // MUL/IMUL/TEST rm8
                    n = (mod == 3) ? 1 : 2;
                else if (reg == 2 || reg == 3)  // NOT/NEG rm
                    n = (mod == 3) ? 1 : 4;
                break;

            case 0xF7:
                if (reg == 6 || reg == 7)       // DIV EAX,rm
                    n = 4;
                else if (reg == 4 || reg == 5)  // MUL/IMUL rm
                    n = (mod == 3) ? 3 : 4;
                else if (reg == 2 || reg == 3)  // NOT/NEG rm
                    n = (mod == 3) ? 1 : 4;
                break;

            case 0xFF:
                if (reg == 2 || reg == 3 ||     // CALL rm, CALL m,rm
                    reg == 5)                   // JMP seg:offset
                    n = 5;
                else if (reg == 4)
                    n = (mod == 3) ? 1 : 2;
                else if (reg == 0 || reg == 1)  // INC/DEC rm
                    n = (mod == 3) ? 1 : 4;
                else if (reg == 6)              // PUSH rm
                    n = (mod == 3) ? 3 : 4;
                break;

            case 0x0F:
                // Two-byte opcode: dispatch on the low byte of Iop
                op2 = c.Iop & 0xFF;
                if ((op2 & 0xF0) == 0x80)       // Jcc
                {   n = 1;
                    break;
                }
                if ((op2 & 0xF0) == 0x90)       // SETcc
                {   n = (mod == 3) ? 1 : 3;
                    break;
                }
                if (op2 == 0xB6 || op2 == 0xB7 ||       // MOVZX
                    op2 == 0xBE || op2 == 0xBF)         // MOVSX
                {   n = 1;
                    break;
                }
                if (op2 == 0xAF)                        // IMUL r,m
                {   n = (mod == 3) ? 1 : 2;
                    break;
                }
                break;

            default:
                break;
        }
    }
    if (n == 0)
        n = 5;                  // copout for now
    return n;
}

/******************************************
 * Determine pairing classification.
 * Don't deal with floating point, just assume they are all NP (Not Pairable).
 * Returns:
 *      NP,UV,PU,PV optionally OR'd with PE
 */

// Starts from pentcycl[opcode] and adjusts the class for the opcodes whose
// pairing depends on the modregrm byte.
private int pair_class(code *c)
{   ubyte op;
    ubyte irm,mod,reg,rm;
    uint a32;
    int pc;

    // Of course, with Intel this is *never* simple, and Intel's
    // documentation is vague about the specifics.

    op = c.Iop & 0xFF;
    if ((c.Iop & 0xFF00) == 0x0F00)     // two-byte opcodes share table slot 0x0F
        op = 0x0F;
    pc = pentcycl[op];
    a32 = I32;
    if (c.Iflags & CFaddrsize)          // CFaddrsize flips the default address size
        a32 ^= 1;
    irm = c.Irm;
    mod = (irm >> 6) & 3;
    reg = (irm >> 3) & 7;
    rm = irm & 7;
    switch (op)
    {
        case 0x0F:                              // 2 byte opcode
            if ((c.Iop & 0xF0) == 0x80)         // if Jcc
                pc = PV | PF;
            break;

        case 0x80:
        case 0x81:
        case 0x83:
            if (reg == 2 ||                     // ADC EA,immed
                reg == 3)                       // SBB EA,immed
            {   pc = PU;
                goto L2;                        // keep PU, only run the displacement check
            }
            goto L1;                            // AND/OR/XOR/ADD/SUB/CMP EA,immed

        case 0x84:
        case 0x85:                              // TEST EA,reg
            if (mod == 3)                       // TEST reg,reg
                pc = UV;
            break;

        case 0xC0:
        case 0xC1:
            if (reg >= 4)
                pc = PU;
            break;

        case 0xC6:
        case 0xC7:
            if (reg == 0)                       // MOV EA,immed
            {
        L1:                                     // also entered by goto from case 0x80/0x81/0x83
                pc = UV;
        L2:                                     // also entered by goto for ADC/SBB EA,immed
                // if EA contains a displacement then
                // can't execute in V, or pair in U
                switch (mod)
                {   case 0:
                        if (a32)
                        {   if (rm == 5 ||
                                (rm == 4 && (c.Isib & 7) == 5)
                               )
                                pc = NP;
                        }
                        else if (rm == 6)
                            pc = NP;
                        break;
                    case 1:
                    case 2:
                        pc = NP;
                        break;

                    default:
                        break;
                }
            }
            break;

        case 0xD9:
            if (irm < 0xC0)
            {
                if (reg == 0)
                    pc = FX;
            }
            else if (irm < 0xC8)
                pc = FX;
            else if (irm < 0xD0)
                pc = PV;
            else
            {
                switch (irm)
                {
                    case 0xE0:
                    case 0xE1:
                    case 0xE4:
                        pc = FX;
                        break;

                    default:
                        break;
                }
            }
            break;

        case 0xDB:
            if (irm < 0xC0 && (reg == 0 || reg == 5))
                pc = FX;
            break;

        case 0xDD:
            if (irm < 0xC0)
            {
                if (reg == 0)
                    pc = FX;
            }
            else if (irm >= 0xE0 && irm < 0xF0)
                pc = FX;
            break;

        case 0xDF:
            if (irm < 0xC0 && (reg == 0 || reg == 5))
                pc = FX;
            break;

        case 0xFE:
            if (reg == 0 || reg == 1)           // INC/DEC EA
                pc = UV;
            break;
        case 0xFF:
            if (reg == 0 || reg == 1)           // INC/DEC EA
                pc = UV;
            else if (reg == 2 || reg == 4)      // CALL/JMP near ptr EA
                pc = PE|PV;
            else if (reg == 6 && mod == 3)      // PUSH reg
                pc = PE | UV;
            break;

        default:
            break;
    }
    if (c.Iflags & CFPREFIX && pc == UV)        // if prefix byte
        pc = PU;
    return pc;
}

/******************************************
 * For an instruction, determine what is read
 * and what is written, and what is used for addressing.
 * Determine operand size if EA (larger is ok).
 */

private void getinfo(Cinfo *ci,code *c)
{
    memset(ci,0,Cinfo.sizeof);
    if (!c)
        return;
    ci.c = c;

    // Micro-op count and size are recorded for the Pentium Pro scheduler,
    // the pairing class otherwise
    if (PRO)
    {
        ci.uops = cast(ubyte)uops(c);
        ci.isz = cast(ubyte)calccodsize(c);
    }
    else
        ci.pair = cast(ubyte)pair_class(c);

    ubyte op;
    ubyte op2;
    ubyte irm,mod,reg,rm;
    uint a32;
    int pc;
    uint r,w;
    int sz = I32 ?
4 : 2; 1302 1303 ci.r = 0; 1304 ci.w = 0; 1305 ci.a = 0; 1306 op = c.Iop & 0xFF; 1307 if ((c.Iop & 0xFF00) == 0x0F00) 1308 op = 0x0F; 1309 //printf("\tgetinfo %x, op %x \n",c,op); 1310 pc = pentcycl[op]; 1311 a32 = I32; 1312 if (c.Iflags & CFaddrsize) 1313 a32 ^= 1; 1314 if (c.Iflags & CFopsize) 1315 sz ^= 2 | 4; 1316 irm = c.Irm; 1317 mod = (irm >> 6) & 3; 1318 reg = (irm >> 3) & 7; 1319 rm = irm & 7; 1320 1321 r = oprw[op][0]; 1322 w = oprw[op][1]; 1323 1324 switch (op) 1325 { 1326 case 0x50: 1327 case 0x51: 1328 case 0x52: 1329 case 0x53: 1330 case 0x55: 1331 case 0x56: 1332 case 0x57: // PUSH reg 1333 ci.flags |= CIFL.push; 1334 goto Lpush; 1335 1336 case 0x54: // PUSH ESP 1337 case 0x6A: // PUSH imm8 1338 case 0x68: // PUSH imm 1339 case 0x0E: 1340 case 0x16: 1341 case 0x1E: 1342 case 0x06: 1343 case 0x9C: 1344 Lpush: 1345 ci.spadjust = -sz; 1346 ci.a |= mSP; 1347 break; 1348 1349 case 0x58: 1350 case 0x59: 1351 case 0x5A: 1352 case 0x5B: 1353 case 0x5C: 1354 case 0x5D: 1355 case 0x5E: 1356 case 0x5F: // POP reg 1357 case 0x1F: 1358 case 0x07: 1359 case 0x17: 1360 case 0x9D: // POPF 1361 Lpop: 1362 ci.spadjust = sz; 1363 ci.a |= mSP; 1364 break; 1365 1366 case 0x80: 1367 if (reg == 7) // CMP 1368 c.Iflags |= CFpsw; 1369 r = B | grprw[0][reg][0]; // Grp 1 (byte) 1370 w = B | grprw[0][reg][1]; 1371 break; 1372 1373 case 0x81: 1374 case 0x83: 1375 if (reg == 7) // CMP 1376 c.Iflags |= CFpsw; 1377 else if (irm == modregrm(3,0,SP)) // ADD ESP,imm 1378 { 1379 assert(c.IFL2 == FLconst); 1380 ci.spadjust = (op == 0x81) ? c.IEV2.Vint : cast(byte)c.IEV2.Vint; 1381 } 1382 else if (irm == modregrm(3,5,SP)) // SUB ESP,imm 1383 { 1384 assert(c.IFL2 == FLconst); 1385 ci.spadjust = (op == 0x81) ? 
-c.IEV2.Vint : -cast(int)cast(byte)c.IEV2.Vint; 1386 } 1387 r = grprw[0][reg][0]; // Grp 1 1388 w = grprw[0][reg][1]; 1389 break; 1390 1391 case 0x8F: 1392 if (reg == 0) // POP rm 1393 goto Lpop; 1394 break; 1395 1396 case 0xA0: 1397 case 0xA1: 1398 case 0xA2: 1399 case 0xA3: 1400 // Fake having an EA to simplify code in conflict() 1401 ci.flags |= CIFL.ea; 1402 ci.reg = 0; 1403 ci.sibmodrm = a32 ? modregrm(0,0,5) : modregrm(0,0,6); 1404 c.IFL1 = c.IFL2; 1405 c.IEV1 = c.IEV2; 1406 break; 1407 1408 case 0xC2: 1409 case 0xC3: 1410 case 0xCA: 1411 case 0xCB: // RET 1412 ci.a |= mSP; 1413 break; 1414 1415 case 0xE8: 1416 if (c.Iflags & CFclassinit) // call to __j_classinit 1417 { r = 0; 1418 w = F; 1419 1420 version (CLASSINIT2) 1421 ci.pair = UV; // it is patched to CMP EAX,0 1422 else 1423 ci.pair = NP; 1424 1425 } 1426 break; 1427 1428 case 0xF6: 1429 r = grprw[3][reg][0]; // Grp 3, byte version 1430 w = grprw[3][reg][1]; 1431 break; 1432 1433 case 0xF7: 1434 r = grprw[1][reg][0]; // Grp 3 1435 w = grprw[1][reg][1]; 1436 break; 1437 1438 case 0x0F: 1439 op2 = c.Iop & 0xFF; 1440 if ((op2 & 0xF0) == 0x80) // if Jxx instructions 1441 { 1442 ci.r = F | N; 1443 ci.w = N; 1444 goto Lret; 1445 } 1446 ci.r = N; 1447 ci.w = N; // copout for now 1448 goto Lret; 1449 1450 case 0xD7: // XLAT 1451 ci.a = mAX | mBX; 1452 break; 1453 1454 case 0xFF: 1455 r = grprw[2][reg][0]; // Grp 5 1456 w = grprw[2][reg][1]; 1457 if (reg == 6) // PUSH rm 1458 goto Lpush; 1459 break; 1460 1461 case 0x38: 1462 case 0x39: 1463 case 0x3A: 1464 case 0x3B: 1465 case 0x3C: // CMP AL,imm8 1466 case 0x3D: // CMP EAX,imm32 1467 // For CMP opcodes, always test for flags 1468 c.Iflags |= CFpsw; 1469 break; 1470 1471 case ESCAPE: 1472 if (c.Iop == (ESCAPE | ESCadjfpu)) 1473 ci.fpuadjust = c.IEV1.Vint; 1474 break; 1475 1476 case 0xD0: 1477 case 0xD1: 1478 case 0xD2: 1479 case 0xD3: 1480 case 0xC0: 1481 case 0xC1: 1482 if (reg == 2 || reg == 3) // if RCL or RCR 1483 c.Iflags |= CFpsw; // always test for flags 
1484 break; 1485 1486 case 0xD8: 1487 case 0xD9: 1488 case 0xDA: 1489 case 0xDB: 1490 case 0xDC: 1491 case 0xDD: 1492 case 0xDE: 1493 case 0xDF: 1494 if (irm < 0xC0) 1495 { r = grpf1[op - 0xD8][reg][0]; 1496 w = grpf1[op - 0xD8][reg][1]; 1497 switch (op) 1498 { 1499 case 0xD8: 1500 if (reg == 3) // if FCOMP 1501 ci.fpuadjust = -1; 1502 else 1503 ci.fp_op = FP.fop; 1504 break; 1505 1506 case 0xD9: 1507 if (reg == 0) // if FLD float 1508 { ci.fpuadjust = 1; 1509 ci.fp_op = FP.fld; 1510 } 1511 else if (reg == 3) // if FSTP float 1512 { ci.fpuadjust = -1; 1513 ci.fp_op = FP.fstp; 1514 } 1515 else if (reg == 5 || reg == 7) 1516 sz = 2; 1517 else if (reg == 4 || reg == 6) 1518 sz = 28; 1519 break; 1520 case 0xDA: 1521 if (reg == 3) // if FICOMP 1522 ci.fpuadjust = -1; 1523 break; 1524 case 0xDB: 1525 if (reg == 0 || reg == 5) 1526 { ci.fpuadjust = 1; 1527 ci.fp_op = FP.fld; // FILD / FLD long double 1528 } 1529 if (reg == 3 || reg == 7) 1530 ci.fpuadjust = -1; 1531 if (reg == 7) 1532 ci.fp_op = FP.fstp; // FSTP long double 1533 if (reg == 5 || reg == 7) 1534 sz = 10; 1535 break; 1536 case 0xDC: 1537 sz = 8; 1538 if (reg == 3) // if FCOMP 1539 ci.fpuadjust = -1; 1540 else 1541 ci.fp_op = FP.fop; 1542 break; 1543 case 0xDD: 1544 if (reg == 0) // if FLD double 1545 { ci.fpuadjust = 1; 1546 ci.fp_op = FP.fld; 1547 } 1548 if (reg == 3) // if FSTP double 1549 { ci.fpuadjust = -1; 1550 ci.fp_op = FP.fstp; 1551 } 1552 if (reg == 7) 1553 sz = 2; 1554 else if (reg == 4 || reg == 6) 1555 sz = 108; 1556 else 1557 sz = 8; 1558 break; 1559 case 0xDE: 1560 sz = 2; 1561 if (reg == 3) // if FICOMP 1562 ci.fpuadjust = -1; 1563 break; 1564 case 0xDF: 1565 sz = 2; 1566 if (reg == 4 || reg == 6) 1567 sz = 10; 1568 else if (reg == 5 || reg == 7) 1569 sz = 8; 1570 if (reg == 0 || reg == 4 || reg == 5) 1571 ci.fpuadjust = 1; 1572 else if (reg == 3 || reg == 6 || reg == 7) 1573 ci.fpuadjust = -1; 1574 break; 1575 1576 default: 1577 break; 1578 } 1579 break; 1580 } 1581 else if (op == 0xDE) 1582 
{ ci.fpuadjust = -1; // pop versions of Fop's 1583 if (irm == 0xD9) 1584 ci.fpuadjust = -2; // FCOMPP 1585 } 1586 1587 // Most floating point opcodes aren't staged, but are 1588 // sent right through, in order to make use of the large 1589 // latencies with floating point instructions. 1590 if (ci.fp_op == FP.fld || 1591 (op == 0xD9 && (irm & 0xF8) == 0xC0)) 1592 { } // FLD ST(i) 1593 else 1594 ci.flags |= CIFL.nostage; 1595 1596 switch (op) 1597 { 1598 case 0xD8: 1599 r = S; 1600 w = C; 1601 if ((irm & ~7) == 0xD0) 1602 w |= S; 1603 break; 1604 case 0xD9: 1605 // FCHS or FABS or FSQRT 1606 if (irm == 0xE0 || irm == 0xE1 || irm == 0xFA) 1607 ci.fp_op = FP.fop; 1608 r = S; 1609 w = S|C; 1610 break; 1611 case 0xDA: 1612 if (irm == 0xE9) // FUCOMPP 1613 { r = S; 1614 w = S|C; 1615 break; 1616 } 1617 break; 1618 case 0xDB: 1619 if (irm == 0xE2) // FCLEX 1620 { r = 0; 1621 w = C; 1622 break; 1623 } 1624 if (irm == 0xE3) // FINIT 1625 { r = 0; 1626 w = S|C; 1627 break; 1628 } 1629 break; 1630 case 0xDC: 1631 case 0xDE: 1632 if ((irm & 0xF0) != 0xD0) 1633 { r = S; 1634 w = S|C; 1635 break; 1636 } 1637 break; 1638 case 0xDD: 1639 // Not entirely correct, but conservative 1640 r = S; 1641 w = S|C; 1642 break; 1643 case 0xDF: 1644 if (irm == 0xE0) // FSTSW AX 1645 { r = C; 1646 w = mAX; 1647 break; 1648 } 1649 break; 1650 1651 default: 1652 break; 1653 } 1654 break; 1655 1656 default: 1657 //printf("\t\tNo special case\n"); 1658 break; 1659 } 1660 1661 if ((r | w) & B) // if byte operation 1662 sz = 1; // operand size is 1 1663 1664 ci.r = r & ~(R | EA); 1665 ci.w = w & ~(R | EA); 1666 if (r & R) 1667 ci.r |= mask((r & B) ? (reg & 3) : reg); 1668 if (w & R) 1669 ci.w |= mask((w & B) ? 
(reg & 3) : reg); 1670 1671 // OR in bits for EA addressing mode 1672 if ((r | w) & EA) 1673 { ubyte sib; 1674 1675 sib = 0; 1676 switch (mod) 1677 { 1678 case 0: 1679 if (a32) 1680 { 1681 if (rm == 4) 1682 { 1683 sib = c.Isib; 1684 if ((sib & modregrm(0,7,0)) != modregrm(0,4,0)) 1685 ci.a |= mask((sib >> 3) & 7); // index register 1686 if ((sib & 7) != 5) 1687 ci.a |= mask(sib & 7); // base register 1688 } 1689 else if (rm != 5) 1690 ci.a |= mask(rm); 1691 } 1692 else 1693 { 1694 immutable ubyte[8] ea16 = [mBX|mSI,mBX|mDI,mBP|mSI,mBP|mDI,mSI,mDI,0,mBX]; 1695 ci.a |= ea16[rm]; 1696 } 1697 goto Lmem; 1698 1699 case 1: 1700 case 2: 1701 if (a32) 1702 { 1703 if (rm == 4) 1704 { 1705 sib = c.Isib; 1706 if ((sib & modregrm(0,7,0)) != modregrm(0,4,0)) 1707 ci.a |= mask((sib >> 3) & 7); // index register 1708 ci.a |= mask(sib & 7); // base register 1709 } 1710 else 1711 ci.a |= mask(rm); 1712 } 1713 else 1714 { 1715 immutable ubyte[8] ea16 = [mBX|mSI,mBX|mDI,mBP|mSI,mBP|mDI,mSI,mDI,mBP,mBX]; 1716 ci.a |= ea16[rm]; 1717 } 1718 1719 Lmem: 1720 if (r & EA) 1721 ci.r |= mMEM; 1722 if (w & EA) 1723 ci.w |= mMEM; 1724 ci.flags |= CIFL.ea; 1725 break; 1726 1727 case 3: 1728 if (r & EA) 1729 ci.r |= mask((r & B) ? (rm & 3) : rm); 1730 if (w & EA) 1731 ci.w |= mask((w & B) ? 
(rm & 3) : rm); 1732 break; 1733 1734 default: 1735 assert(0); 1736 } 1737 // Adjust sibmodrm so that addressing modes can be compared simply 1738 irm &= modregrm(3,0,7); 1739 if (a32) 1740 { 1741 if (irm != modregrm(0,0,5)) 1742 { 1743 switch (mod) 1744 { 1745 case 0: 1746 if ((sib & 7) != 5) // if not disp32[index] 1747 { 1748 c.IFL1 = FLconst; 1749 c.IEV1.Vpointer = 0; 1750 irm |= 0x80; 1751 } 1752 break; 1753 case 1: 1754 c.IEV1.Vpointer = cast(byte) c.IEV1.Vpointer; 1755 irm = modregrm(2, 0, rm); 1756 break; 1757 1758 default: 1759 break; 1760 } 1761 } 1762 } 1763 else 1764 { 1765 if (irm != modregrm(0,0,6)) 1766 { 1767 switch (mod) 1768 { 1769 case 0: 1770 c.IFL1 = FLconst; 1771 c.IEV1.Vpointer = 0; 1772 irm |= 0x80; 1773 break; 1774 case 1: 1775 c.IEV1.Vpointer = cast(byte) c.IEV1.Vpointer; 1776 irm = modregrm(2, 0, rm); 1777 break; 1778 1779 default: 1780 break; 1781 } 1782 } 1783 } 1784 1785 ci.r |= ci.a; 1786 ci.reg = reg; 1787 ci.sibmodrm = (sib << 8) | irm; 1788 } 1789 Lret: 1790 if (ci.w & mSP) // if stack pointer is modified 1791 ci.w |= mMEM; // then we are implicitly writing to memory 1792 if (op == LEA) // if LEA 1793 ci.r &= ~mMEM; // memory is not actually read 1794 ci.sz = cast(ubyte)sz; 1795 1796 //printf("\t\t"); ci.print(); 1797 } 1798 1799 /****************************************** 1800 * Determine if two instructions can pair. 1801 * Assume that in general, cu can pair in the U pipe and cv in the V. 1802 * Look for things like register contentions. 
1803 * Input: 1804 * cu instruction for U pipe 1805 * cv instruction for V pipe 1806 * Returns: 1807 * !=0 if they can pair 1808 */ 1809 1810 private int pair_test(Cinfo *cu,Cinfo *cv) 1811 { 1812 uint pcu; 1813 uint pcv; 1814 uint r1,w1; 1815 uint r2,w2; 1816 uint x; 1817 1818 pcu = cu.pair; 1819 if (!(pcu & PU)) 1820 { 1821 // See if pairs with FXCH and cv is FXCH 1822 if (pcu & FX && cv.c.Iop == 0xD9 && (cv.c.Irm & ~7) == 0xC8) 1823 goto Lpair; 1824 goto Lnopair; 1825 } 1826 pcv = cv.pair; 1827 if (!(pcv & PV)) 1828 goto Lnopair; 1829 1830 r1 = cu.r; 1831 w1 = cu.w; 1832 r2 = cv.r; 1833 w2 = cv.w; 1834 1835 x = w1 & (r2 | w2) & ~(F|mMEM); // register contention 1836 if (x && // if register contention 1837 !(x == mSP && pcu & pcv & PE) // and not exception 1838 ) 1839 goto Lnopair; 1840 1841 // Look for flags contention 1842 if (w1 & r2 & F && !(pcv & PF)) 1843 goto Lnopair; 1844 1845 Lpair: 1846 return 1; 1847 1848 Lnopair: 1849 return 0; 1850 } 1851 1852 /****************************************** 1853 * Determine if two instructions have an AGI or register contention. 1854 * Returns: 1855 * !=0 if they have an AGI 1856 */ 1857 1858 private int pair_agi(Cinfo *c1, Cinfo *c2) 1859 { 1860 uint x = c1.w & c2.a; 1861 return x && !(x == mSP && c1.pair & c2.pair & PE); 1862 } 1863 1864 /******************************************** 1865 * Determine if three instructions can decode simultaneously 1866 * in Pentium Pro and Pentium II. 1867 * Input: 1868 * c0,c1,c2 candidates for decoders 0,1,2 1869 * c2 can be null 1870 * Returns: 1871 * !=0 if they can decode simultaneously 1872 */ 1873 1874 private int triple_test(Cinfo *c0, Cinfo *c1, Cinfo *c2) 1875 { 1876 assert(c0); 1877 if (!c1) 1878 return 0; 1879 int c2isz = c2 ? 
c2.isz : 0; 1880 if (c0.isz > 7 || c1.isz > 7 || c2isz > 7 || 1881 c0.isz + c1.isz + c2isz > 16) 1882 return 0; 1883 1884 // 4-1-1 decode 1885 if (c1.uops > 1 || 1886 (c2 && c2.uops > 1)) 1887 return 0; 1888 1889 return 1; 1890 } 1891 1892 /******************************************** 1893 * Get next instruction worth looking at for scheduling. 1894 * Returns: 1895 * null no more instructions 1896 */ 1897 1898 private code * cnext(code *c) 1899 { 1900 while (1) 1901 { 1902 c = code_next(c); 1903 if (!c) 1904 break; 1905 if (c.Iflags & (CFtarg | CFtarg2)) 1906 break; 1907 if (!(c.Iop == NOP || 1908 c.Iop == (ESCAPE | ESClinnum))) 1909 break; 1910 } 1911 return c; 1912 } 1913 1914 /****************************************** 1915 * Instruction scheduler. 1916 * Input: 1917 * c list of instructions to schedule 1918 * scratch scratch registers we can use 1919 * Returns: 1920 * revised list of scheduled instructions 1921 */ 1922 1923 /////////////////////////////////// 1924 // Determine if c1 and c2 are swappable. 1925 // c1 comes before c2. 
// If they do not conflict
//      return 0
// If they do conflict
//      return 0x100 + delay_clocks
// Input:
//      fpsched         if 1, then adjust fxch_pre and fxch_post to swap,
//                      then return 0
//                      if 2, then adjust ci1 as well as ci2
// Notes:
//      The FXCH adjustment is only attempted when both ci1 and ci2 have
//      a non-zero fp_op; otherwise fpsched is treated as 0.
//      Instructions flagged CFvolatile or CFvex always conflict.

private int conflict(Cinfo *ci1,Cinfo *ci2,int fpsched)
{
    code *c1;
    code *c2;
    uint r1,w1,a1;
    uint r2,w2,a2;
    int sz1,sz2;
    int i = 0;                  // set non-zero to enable debug tracing
    int delay_clocks;

    c1 = ci1.c;
    c2 = ci2.c;

    //printf("conflict %x %x\n",c1,c2);

    r1 = ci1.r;
    w1 = ci1.w;
    a1 = ci1.a;
    sz1 = ci1.sz;

    r2 = ci2.r;
    w2 = ci2.w;
    a2 = ci2.a;
    sz2 = ci2.sz;

    //printf("r1 %lx w1 %lx a1 %lx sz1 %x\n",r1,w1,a1,sz1);
    //printf("r2 %lx w2 %lx a2 %lx sz2 %x\n",r2,w2,a2,sz2);

    if ((c1.Iflags | c2.Iflags) & (CFvolatile | CFvex))
        goto Lconflict;

    // Determine if we should handle FPU register conflicts separately
    //if (fpsched) printf("fp_op %d,%d:\n",ci1.fp_op,ci2.fp_op);
    if (fpsched && ci1.fp_op && ci2.fp_op)
    {
        // FPU stack/control dependencies are resolved with FXCH below,
        // so take them out of the register masks
        w1 &= ~(S|C);
        r1 &= ~(S|C);
        w2 &= ~(S|C);
        r2 &= ~(S|C);
    }
    else
        fpsched = 0;

    if ((r1 | r2) & N)
    {
        goto Lconflict;
    }

static if (0)
{
    if (c1.Iop == 0xFF && c2.Iop == 0x8B)
    {   c1.print(); c2.print(); i = 1;
        printf("r1=%lx, w1=%lx, a1=%lx, sz1=%d, r2=%lx, w2=%lx, a2=%lx, sz2=%d\n",r1,w1,a1,sz1,r2,w2,a2,sz2);
    }
}
L1:
    if (w1 & r2 || (r1 | w1) & w2)
    {   ubyte ifl1,ifl2;

        if (i) printf("test\n");

static if (0)
{
        if (c1.IFL1 != c2.IFL1) printf("t1\n");
        if ((c1.Irm & modregrm(3,0,7)) != (c2.Irm & modregrm(3,0,7))) printf("t2\n");
        if ((issib(c1.Irm) && c1.Isib != c2.Isib)) printf("t3\n");
        if (c1.IEV1.Vpointer + sz1 <= c2.IEV1.Vpointer) printf("t4\n");
        if (c2.IEV1.Vpointer + sz2 <= c1.IEV1.Vpointer) printf("t5\n");
}

        // make sure CFpsw is reliably set
        if (w1 & w2 & F &&      // if both instructions write to flags
            w1 != F &&
            w2 != F &&
            !((r1 | r2) & F) && // but neither instruction reads them
            !((c1.Iflags | c2.Iflags) & CFpsw))       // and we don't care about flags
        {
            w1 &= ~F;
            w2 &= ~F;           // remove conflict
            goto L1;            // and try again
        }

        // If other than the memory reference is a conflict
        if (w1 & r2 & ~mMEM || (r1 | w1) & w2 & ~mMEM)
        {   if (i) printf("\t1\n");
            if (i) printf("r1=%x, w1=%x, a1=%x, sz1=%d, r2=%x, w2=%x, a2=%x, sz2=%d\n",r1,w1,a1,sz1,r2,w2,a2,sz2);
            goto Lconflict;
        }

        // If referring to distinct types, then no dependency
        if (c1.Irex && c2.Irex && c1.Irex != c2.Irex)
            goto Lswap;

        ifl1 = c1.IFL1;
        ifl2 = c2.IFL1;

        // Special case: Allow indexed references using registers other than
        // ESP and EBP to be swapped with PUSH instructions
        // (here c1 is the PUSH and c2 holds the memory reference)
        if (((c1.Iop & ~7) == 0x50 ||                   // PUSH reg
             c1.Iop == 0x6A ||                          // PUSH imm8
             c1.Iop == 0x68 ||                          // PUSH imm16/imm32
             (c1.Iop == 0xFF && ci1.reg == 6)           // PUSH EA
            ) &&
            ci2.flags & CIFL.ea && !(a2 & mSP) &&
            !(a2 & mBP && cast(int)c2.IEV1.Vpointer < 0)
           )
        {
            if (c1.Iop == 0xFF)
            {
                if (!(w2 & mMEM))
                    goto Lswap;
            }
            else
                goto Lswap;
        }

        // Special case: Allow indexed references using registers other than
        // ESP and EBP to be swapped with PUSH instructions
        // (here c2 is the PUSH and c1 holds the memory reference, so the
        // negative-offset [EBP] test must look at c1's addressing mode)
        if (((c2.Iop & ~7) == 0x50 ||                   // PUSH reg
             c2.Iop == 0x6A ||                          // PUSH imm8
             c2.Iop == 0x68 ||                          // PUSH imm16/imm32
             (c2.Iop == 0xFF && ci2.reg == 6)           // PUSH EA
            ) &&
            ci1.flags & CIFL.ea && !(a1 & mSP) &&
            !(a1 & mBP && cast(int)c1.IEV1.Vpointer < 0)
           )
        {
            if (c2.Iop == 0xFF)
            {
                if (!(w1 & mMEM))
                    goto Lswap;
            }
            else
                goto Lswap;
        }

        // If not both an EA addressing mode, conflict
        if (!(ci1.flags & ci2.flags & CIFL.ea))
        {   if (i) printf("\t2\n");
            goto Lconflict;
        }

        // Same addressing mode: compare the offsets to see if the
        // referenced ranges [offset, offset + size) are disjoint
        if (ci1.sibmodrm == ci2.sibmodrm)
        {   if (ifl1 != ifl2)
                goto Lswap;
            switch (ifl1)
            {
                case FLconst:
                    if (c1.IEV1.Vint != c2.IEV1.Vint &&
                        (c1.IEV1.Vint + sz1 <= c2.IEV1.Vint ||
                         c2.IEV1.Vint + sz2 <= c1.IEV1.Vint))
                        goto Lswap;
                    break;
                case FLdatseg:
                    if (c1.IEV1.Vseg != c2.IEV1.Vseg ||
                        c1.IEV1.Vint + sz1 <= c2.IEV1.Vint ||
                        c2.IEV1.Vint + sz2 <= c1.IEV1.Vint)
                        goto Lswap;
                    break;

                default:
                    break;
            }
        }

        if ((c1.Iflags | c2.Iflags) & CFunambig &&
            (ifl1 != ifl2 ||
             ci1.sibmodrm != ci2.sibmodrm ||
             (c1.IEV1.Vint != c2.IEV1.Vint &&
              (c1.IEV1.Vint + sz1 <= c2.IEV1.Vint ||
               c2.IEV1.Vint + sz2 <= c1.IEV1.Vint)
             )
            )
           )
        {
            // Assume that [EBP] and [ESP] can point to the same location
            if (((a1 | a2) & (mBP | mSP)) == (mBP | mSP))
                goto Lconflict;
            goto Lswap;
        }

        if (i) printf("\t3\n");
        goto Lconflict;
    }

Lswap:
    if (fpsched)
    {
        //printf("\tfpsched %d,%d:\n",ci1.fp_op,ci2.fp_op);
        ubyte x1 = ci1.fxch_pre;
        ubyte y1 = ci1.fxch_post;
        ubyte x2 = ci2.fxch_pre;
        ubyte y2 = ci2.fxch_post;

        // Combine the two fp_op values into a single switch key
        static uint X(uint a, uint b) { return (a << 8) | b; }
        switch (X(ci1.fp_op,ci2.fp_op))
        {
            case X(FP.fstp, FP.fld):
                if (x1 || y1)
                    goto Lconflict;
                if (x2)
                    goto Lconflict;
                if (y2 == 0)
                    ci2.fxch_post++;
                else if (y2 == 1)
                {
                    ci2.fxch_pre++;
                    ci2.fxch_post++;
                }
                else
                {
                    goto Lconflict;
                }
                break;

            case X(FP.fstp, FP.fop):
                if (x1 || y1)
                    goto Lconflict;
                ci2.fxch_pre++;
                ci2.fxch_post++;
                break;

            case X(FP.fop, FP.fop):
                if (x1 == 0 && y1 == 1 && x2 == 0 && y2 == 0)
                {   ci2.fxch_pre = 1;
                    ci2.fxch_post = 1;
                    break;
                }
                if (x1 == 0 && y1 == 0 && x2 == 1 && y2 == 1)
                    break;
                goto Lconflict;

            case X(FP.fop, FP.fld):
                if (x1 || y1)
                    goto Lconflict;
                if (x2)
                    goto Lconflict;
                if (y2)
                    break;
                else if (fpsched == 2)
                    ci1.fxch_post = 1;
                ci2.fxch_post = 1;
                break;

            default:
                goto Lconflict;
        }

        //printf("\tpre = %d, post = %d\n",ci2.fxch_pre,ci2.fxch_post);
    }

    //printf("w1 = x%x, w2 = x%x\n",w1,w2);
    if (i) printf("no conflict\n\n");
    return 0;

Lconflict:
    //printf("r1=%x, w1=%x, r2=%x, w2=%x\n",r1,w1,r2,w2);
    delay_clocks = 0;

    // Determine if AGI
    if (!PRO && pair_agi(ci1,ci2))
        delay_clocks = 1;

    // Special delays for floating point
    if (fpsched)
    {   if (ci1.fp_op == FP.fld && ci2.fp_op == FP.fstp)
            delay_clocks = 1;
        else if (ci1.fp_op == FP.fop && ci2.fp_op == FP.fstp)
            delay_clocks = 3;
        else if (ci1.fp_op == FP.fop && ci2.fp_op == FP.fop)
            delay_clocks = 2;
    }
    else if (PRO)
    {
        // Look for partial register write stalls
        if (w1 & r2 & ALLREGS && sz1 < sz2)
            delay_clocks = 7;
    }
    else if ((w1 | r1) & (w2 | r2) & (C | S))
    {
        int op = c1.Iop;
        if (ci1.fp_op == FP.fld ||
            (op == 0xD9 && (c1.Irm & 0xF8) == 0xC0)
           )
        { }                             // FLD
        else if (op == 0xD9 && (c1.Irm & 0xF8) == 0xC8)
        { }                             // FXCH
        else if (c2.Iop == 0xD9 && (c2.Irm & 0xF8) == 0xC8)
        { }                             // FXCH
        else
            delay_clocks = 3;
    }

    if (i) printf("conflict %d\n\n",delay_clocks);
    return 0x100 + delay_clocks;
}

enum TBLMAX = 2*3*20;           // must be divisible by both 2 and 3
                                // (U,V pipe in Pentium, 3 decode units
                                //  in Pentium Pro)

struct Schedule
{
nothrow:
    Cinfo*[TBLMAX] tbl;         // even numbers are U pipe, odd numbers are V
    int tblmax;                 // max number of slots used

    Cinfo[TBLMAX] cinfo;
    int cinfomax;

    Barray!(Cinfo*) stagelist;  // list of instructions in staging area

    int fpustackused;           // number of slots in FPU stack that are used

    /******************************
     * Reset the scheduler: zero the whole struct, then record the
     * FPU stack depth to start from.
     * NOTE(review): the memset also zeroes stagelist; if Barray owns
     * heap storage at that point it is dropped without being freed - confirm.
     * Params:
     *      fpustackinit = initial value for fpustackused
     */
    void initialize(int fpustackinit)          // initialize scheduler
    {
        //printf("Schedule::initialize(fpustackinit = %d)\n", fpustackinit);
        memset(&this, 0, Schedule.sizeof);
        fpustackused = fpustackinit;
    }

    /******************************
     * Link the scheduled instructions back into a list.
     * Staged instructions are first inserted into the table until one
     * fails to insert; the table is then emitted in slot order (with any
     * FXCH prefix/postfix instructions generated), and the staged
     * instructions that were not inserted are appended after it.
     * fpustackused is advanced by the fpuadjust of every emitted instruction,
     * and the staging list is emptied.
     * Params:
     *      pc = where to store the first instruction; *pc must be null
     * Returns:
     *      address of the next pointer at the end of the emitted list
     */
    code **assemble(code **pc)  // reassemble scheduled instructions
    {
        code *c;

        debug
        if (debugs) printf("assemble:\n");

        assert(!*pc);

        // Try to insert the rest of the staged instructions
        // (sli is left at the first entry that could not be inserted)
        size_t sli;
        for (sli = 0; sli < stagelist.length; ++sli)
        {
            Cinfo* ci = stagelist[sli];
            if (!ci)
                continue;
            if (!insert(ci))
                break;
        }

        // Get the instructions out of the schedule table
        assert(cast(uint)tblmax <= TBLMAX);
        for (int i = 0; i < tblmax; i++)
        {
            Cinfo* ci = tbl[i];

            debug
            if (debugs)
            {
                if (PRO)
                {   immutable char[4][3] tbl = [ "0 "," 1 "," 2" ];

                    if (ci)
                        printf("%s %d ",tbl[i - ((i / 3) * 3)].ptr,ci.uops);
                    else
                        printf("%s ",tbl[i - ((i / 3) * 3)].ptr);
                }
                else
                {
                    printf((i & 1) ? " V " : "U ");
                }
                if (ci)
                    ci.c.print();
                else
                    printf("\n");
            }

            if (!ci)
                continue;
            fpustackused += ci.fpuadjust;
            //printf("stage()1: fpustackused = %d\n", fpustackused);
            c = ci.c;
            if (i == 0)
                c.Iflags |= CFtarg;     // by definition, first is always a jump target
            else
                c.Iflags &= ~CFtarg;    // the rest are not

            // Put in any FXCH prefix
            if (ci.fxch_pre)
            {   code *cf;
                assert(i);
                cf = gen2(null,0xD9,0xC8 + ci.fxch_pre);
                *pc = cf;
                pc = &cf.next;
            }

            // Append c, then advance pc to the end of whatever is linked after it
            *pc = c;
            do
            {
                assert(*pc != code_next(*pc));
                pc = &(*pc).next;
            } while (*pc);

            // Put in any FXCH postfix
            if (ci.fxch_post)
            {
                // If the next occupied slot starts with the same FXCH,
                // drop both instead of emitting them
                for (int j = i + 1; j < tblmax; j++)
                {   if (tbl[j])
                    {   if (tbl[j].fxch_pre == ci.fxch_post)
                        {
                            tbl[j].fxch_pre = 0;        // they cancel each other out
                            goto L1;
                        }
                        break;
                    }
                }
                {   code *cf;
                    cf = gen2(null,0xD9,0xC8 + ci.fxch_post);
                    *pc = cf;
                    pc = &cf.next;
                }
            }
        L1:
        }

        // Just append any instructions left in the staging area
        foreach (ci; stagelist[sli .. stagelist.length])
        {
            if (!ci)
                continue;

            debug
            if (debugs) { printf("appending: "); ci.c.print(); }

            *pc = ci.c;
            do
            {
                pc = &(*pc).next;

            } while (*pc);
            fpustackused += ci.fpuadjust;
            //printf("stage()2: fpustackused = %d\n", fpustackused);
        }
        stagelist.setLength(0);

        return pc;
    }

    /******************************
     * Insert c into scheduling table.
     * Walks back from tblmax to find the latest table entry that conflicts
     * with ci, then scans forward from the first slot allowed by that
     * conflict's delay for a free slot that satisfies the pairing
     * (Pentium) or decode (Pentium Pro) rules.
     * For a MOV reg,offset[ESP], the offset in the instruction is adjusted
     * as it is moved across instructions that change ESP; it is restored
     * if the insert fails.
     * Returns:
     *      0       could not be scheduled; have to start a new one
     */

    int insert(Cinfo *ci)
    {   code *c;
        int clocks;
        int i;
        int ic = 0;                 // slot where the conflict was found, -1 if none
        int imin;                   // first slot ci is allowed to go in
        targ_size_t offset;
        targ_size_t vpointer;       // original IEV1.Vpointer, for restoring on failure
        int movesp = 0;
        int reg2 = -1;              // avoid "may be uninitialized" warning

        //printf("insert "); ci.c.print();
        //printf("insert() %d\n", fpustackused);
        c = ci.c;
        //printf("\tc.Iop %x\n",c.Iop);
        vpointer = c.IEV1.Vpointer;
        assert(cast(uint)tblmax <= TBLMAX);
        if (tblmax == TBLMAX)               // if out of space
            goto Lnoinsert;
        if (tblmax == 0)                    // if table is empty
        {   // Just stuff it in the first slot
            i = tblmax;
            goto Linsert;
        }
        else if (c.Iflags & (CFtarg | CFtarg2))
            // Jump targets can only be first in the scheduler
            goto Lnoinsert;

        // Special case of:
        //      PUSH reg1
        //      MOV  reg2,x[ESP]
        if (c.Iop == 0x8B &&
            (c.Irm & modregrm(3,0,7)) == modregrm(1,0,4) &&
            c.Isib == modregrm(0,4,SP) &&
            c.IFL1 == FLconst &&
            (cast(byte)c.IEV1.Vpointer) >= REGSIZE
           )
        {
            movesp = 1;                     // this is a MOV reg2,offset[ESP]
            offset = cast(byte)c.IEV1.Vpointer;
            reg2 = (c.Irm >> 3) & 7;
        }


        // Start at tblmax, and back up until we get a conflict
        ic = -1;
        imin = 0;
        for (i = tblmax; i >= 0; i--)
        {
            Cinfo* cit = tbl[i];
            if (!cit)
                continue;

            // Look for special case swap
            if (movesp &&
                (cit.c.Iop & ~7) == 0x50 &&             // if PUSH reg1
                (cit.c.Iop & 7) != reg2 &&              // if reg1 != reg2
                (cast(byte)c.IEV1.Vpointer) >= -cit.spadjust
               )
            {
                // Moving the MOV ahead of the PUSH: compensate its ESP offset
                c.IEV1.Vpointer += cit.spadjust;
                //printf("\t1, spadjust = %d, ptr = x%x\n",cit.spadjust,c.IEV1.Vpointer);
                continue;
            }

            if (movesp &&
                cit.c.Iop == 0x83 &&
                cit.c.Irm == modregrm(3,5,SP) &&        // if SUB ESP,offset
                cit.c.IFL2 == FLconst &&
                (cast(byte)c.IEV1.Vpointer) >= -cit.spadjust
               )
            {
                //printf("\t2, spadjust = %d\n",cit.spadjust);
                c.IEV1.Vpointer += cit.spadjust;
                continue;
            }

            clocks = conflict(cit,ci,1);
            if (clocks)
            {   int j;

                ic = i;                 // where the conflict occurred
                clocks &= 0xFF;         // convert to delay count

                // Move forward the delay clocks
                if (clocks == 0)
                    j = i + 1;
                else if (PRO)
                    j = (((i + 3) / 3) * 3) + clocks * 3;
                else
                {   j = ((i + 2) & ~1) + clocks * 2;

                    // It's possible we skipped over some AGI generating
                    // instructions due to movesp.
                    int k;
                    for (k = i + 1; k < j; k++)
                    {
                        if (k >= TBLMAX)
                            goto Lnoinsert;
                        if (tbl[k] && pair_agi(tbl[k],ci))
                        {
                            k = ((k + 2) & ~1) + 1;
                        }
                    }
                    j = k;
                }

                if (j >= TBLMAX)                    // exceed table size?
                    goto Lnoinsert;
                imin = j;                           // first possible slot c can go in
                break;
            }
        }


        // Scan forward looking for a hole to put it in
        for (i = imin; i < TBLMAX; i++)
        {
            if (tbl[i])
            {
                // In case, due to movesp, we skipped over some AGI instructions
                if (!PRO && pair_agi(tbl[i],ci))
                {
                    i = ((i + 2) & ~1) + 1;
                    if (i >= TBLMAX)
                        goto Lnoinsert;
                }
            }
            else
            {
                if (PRO)
                {   int i0 = (i / 3) * 3;           // index of decode unit 0
                    Cinfo *ci0;

                    assert(((TBLMAX / 3) * 3) == TBLMAX);
                    switch (i - i0)
                    {
                        case 0:                     // i0 can handle any instruction
                            goto Linsert;
                        case 1:
                            ci0 = tbl[i0];
                            if (ci.uops > 1)
                            {
                                // Multi-uop ci: try to take decoder 0 by moving
                                // the single-uop instruction there into slot i
                                if (i0 >= imin && ci0.uops == 1)
                                    goto L1;
                                i++;
                                break;
                            }
                            if (triple_test(ci0,ci,tbl[i0 + 2]))
                                goto Linsert;
                            break;
                        case 2:
                            ci0 = tbl[i0];
                            if (ci.uops > 1)
                            {
                                if (i0 >= imin && ci0.uops == 1)
                                {
                                    if (i >= tblmax)
                                    {   if (i + 1 >= TBLMAX)
                                            goto Lnoinsert;
                                        tblmax = i + 1;
                                    }
                                    // Shift slots 0,1 up to 1,2 and put ci in slot 0
                                    tbl[i0 + 2] = tbl[i0 + 1];
                                    tbl[i0 + 1] = ci0;
                                    i = i0;
                                    goto Linsert;
                                }
                                break;
                            }
                            if (triple_test(ci0,tbl[i0 + 1],ci))
                                goto Linsert;
                            break;
                        default:
                            assert(0);
                    }
                }
                else
                {
                    assert((TBLMAX & 1) == 0);
                    if (i & 1)                      // if V pipe
                    {
                        if (pair_test(tbl[i - 1],ci))
                        {
                            goto Linsert;
                        }
                        else if (i > imin && pair_test(ci,tbl[i - 1]))
                        {
                L1:
                            // Move the previous slot's instruction into slot i
                            // and insert ci in the slot before it
                            tbl[i] = tbl[i - 1];
                            if (i >= tblmax)
                                tblmax = i + 1;
                            i--;
                            //printf("\tswapping with x%02x\n",tbl[i + 1].c.Iop);
                            goto Linsert;
                        }
                    }
                    else                    // will always fit in U pipe
                    {
                        assert(!tbl[i + 1]);        // because V pipe should be empty
                        goto Linsert;
                    }
                }
            }
        }

    Lnoinsert:
        //printf("\tnoinsert\n");
        c.IEV1.Vpointer = vpointer;  // reset to original value
        return 0;

    Linsert:
        // Insert at location i
        assert(i < TBLMAX);
        assert(tblmax <= TBLMAX);
        tbl[i] = ci;
        //printf("\tinsert at location %d\n",i);

        // If it's a scheduled floating point code, we have to adjust
        // the FXCH values
        if (ci.fp_op)
        {
            ci.fxch_pre = 0;
            ci.fxch_post = 0;                      // start over again

            // Sum the FPU stack adjustments of the table entries below tblmax
            // and refuse the insert if the depth would reach 8
            int fpu = fpustackused;
            for (int j = 0; j < tblmax; j++)
            {
                if (tbl[j])
                {
                    fpu += tbl[j].fpuadjust;
                    if (fpu >= 8)                   // if FPU stack overflow
                    {   tbl[i] = null;
                        //printf("fpu stack overflow\n");
                        goto Lnoinsert;
                    }
                }
            }

            // Redo the FXCH bookkeeping against every entry after slot i
            // (return value of conflict() is deliberately unused here)
            for (int j = tblmax; j > i; j--)
            {
                if (j < TBLMAX && tbl[j])
                    conflict(tbl[j],ci,2);
            }
        }

        if (movesp)
        {   // Adjust [ESP] offsets

            //printf("\tic = %d, inserting at %d\n",ic,i);
            assert(cast(uint)tblmax <= TBLMAX);
            for (int j = ic + 1; j < i; j++)
            {
                Cinfo* cit = tbl[j];
                if (cit)
                {
                    c.IEV1.Vpointer -= cit.spadjust;
                    //printf("\t3, spadjust = %d, ptr = x%x\n",cit.spadjust,c.IEV1.Vpointer);
                }
            }
        }
        if (i >= tblmax)
            tblmax = i + 1;

        // Now do a hack. Look back at immediately preceding instructions,
        // and see if we can swap with a push.
        // (disabled: the condition below is always false)
        if (0 && movesp)
        {
            while (1)
            {
                int j;
                for (j = 1; i > j; j++)
                    if (tbl[i - j])
                        break;

                if (i >= j && tbl[i - j] &&
                       (tbl[i - j].c.Iop & ~7) == 0x50 &&       // if PUSH reg1
                       (tbl[i - j].c.Iop & 7) != reg2  &&  // if reg1 != reg2
                       cast(byte)c.IEV1.Vpointer >= REGSIZE)
                {
                    //printf("\t-4 prec, i-j=%d, i=%d\n",i-j,i);
                    assert(cast(uint)i < TBLMAX);
                    assert(cast(uint)(i - j) < TBLMAX);
                    tbl[i] = tbl[i - j];
                    tbl[i - j] = ci;
                    i -= j;
                    c.IEV1.Vpointer -= REGSIZE;
                }
                else
                    break;
            }
        }

        //printf("\tinsert\n");
        return 1;
    }

    /******************************
     * Insert c into staging area.
     * Staged instructions that conflict with c are first moved from the
     * staging list into the scheduling table. c itself is inserted directly
     * into the table if it is a jump target, volatile or VEX instruction,
     * or if it is flagged CIFL.nostage and no AGI was flushed; otherwise
     * it is appended to the staging list.
     * Params:
     *      c = instruction to stage
     * Returns:
     *      false if could not be scheduled; have to start a new one
     */

    bool stage(code *c)
    {
        //printf("stage: "); c.print();
        if (cinfomax == TBLMAX)             // if out of space
            return false;
        auto ci = &cinfo[cinfomax++];
        getinfo(ci,c);

        if (c.Iflags & (CFtarg | CFtarg2 | CFvolatile | CFvex))
        {
            // Insert anything in stagelist
            foreach (ref cs;  stagelist[])
            {
                if (cs)
                {
                    if (!insert(cs))
                        return false;
                    cs = null;
                }
            }
            return insert(ci) != 0;
        }

        // Look through stagelist, and insert any AGI conflicting instructions
        bool agi = false;
        foreach (ref cs; stagelist[])
        {
            if (cs)
            {
                if (pair_agi(cs,ci))
                {
                    if (!insert(cs))
                        goto Lnostage;
                    cs = null;
                    agi = true;                    // we put out an AGI
                }
            }
        }

        // Look through stagelist, and insert any other conflicting instructions
        foreach (i, ref cs; stagelist[])
        {
            if (!cs)
                continue;
            if (conflict(cs,ci,0) &&                // if conflict
                !(cs.flags & ci.flags & CIFL.push))
            {
                if (cs.spadjust)
                {
                    // We need to insert all previous adjustments to ESP
                    foreach (ref ca; stagelist[0 .. i])
                    {
                        if (ca && ca.spadjust)
                        {
                            if (!insert(ca))
                                goto Lnostage;
                            ca = null;
                        }
                    }
                }

                if (!insert(cs))
                    goto Lnostage;
                cs = null;
            }
        }

        // If floating point opcode, don't stage it, send it right out
        if (!agi && ci.flags & CIFL.nostage)
        {
            if (!insert(ci))
                goto Lnostage;
            return true;
        }

        stagelist.push(ci);         // append to staging list
        return true;

    Lnostage:
        return false;
    }

}



/********************************************
 * Snip off tail of instruction sequence.
 * The list is cut after c and after any directly following NOPs,
 * line number escapes, or (when c itself has CFclassinit set)
 * CFclassinit instructions; the cut always happens before an
 * instruction flagged CFtarg or CFtarg2.
 * Returns:
 *      next instruction (the tail) or
 *      null for no more instructions
 */

private code * csnip(code *c)
{
    if (c)
    {
        uint iflags = c.Iflags & CFclassinit;
        code **pc;
        while (1)
        {
            pc = &c.next;
            c = *pc;
            if (!c)
                break;
            if (c.Iflags & (CFtarg | CFtarg2))
                break;
            if (!(c.Iop == NOP ||
                  c.Iop == (ESCAPE | ESClinnum) ||
                  c.Iflags & iflags))
                break;
        }
        *pc = null;             // detach the tail from the snipped-off head
    }
    return c;
}


/******************************
 * Schedule Pentium instructions,
 * based on Steve Russell's algorithm.
2806 */ 2807 2808 private code *schedule(code *c,regm_t scratch) 2809 { 2810 code *cresult = null; 2811 code **pctail = &cresult; 2812 Schedule sch = void; 2813 2814 sch.initialize(0); // initialize scheduling table 2815 while (c) 2816 { 2817 if ((c.Iop == NOP || 2818 ((c.Iop & ESCAPEmask) == ESCAPE && c.Iop != (ESCAPE | ESCadjfpu)) || 2819 c.Iflags & CFclassinit) && 2820 !(c.Iflags & (CFtarg | CFtarg2))) 2821 { code *cn; 2822 2823 // Just append this instruction to pctail and go to the next one 2824 *pctail = c; 2825 cn = code_next(c); 2826 c.next = null; 2827 pctail = &c.next; 2828 c = cn; 2829 continue; 2830 } 2831 2832 //printf("init\n"); 2833 sch.initialize(sch.fpustackused); // initialize scheduling table 2834 2835 while (c) 2836 { 2837 //printf("insert %p\n",c); 2838 if (!sch.stage(c)) // store c in scheduling table 2839 break; 2840 c = csnip(c); 2841 } 2842 2843 //printf("assem %d\n",sch.tblmax); 2844 pctail = sch.assemble(pctail); // reassemble instruction stream 2845 } 2846 2847 return cresult; 2848 } 2849 2850 /**************************************************************************/ 2851 2852 /******************************************** 2853 * Replace any occurrence of r1 in EA with r2. 
2854 */ 2855 2856 private void repEA(code *c,uint r1,uint r2) 2857 { 2858 uint mod,reg,rm; 2859 uint rmn; 2860 2861 rmn = c.Irm; 2862 mod = rmn & 0xC0; 2863 reg = rmn & modregrm(0,7,0); 2864 rm = rmn & 7; 2865 2866 if (mod == 0xC0 && rm == r1) 2867 { } //c.Irm = mod | reg | r2; 2868 else if (is32bitaddr(I32,c.Iflags) && 2869 // If not disp32 2870 (rmn & modregrm(3,0,7)) != modregrm(0,0,5)) 2871 { 2872 if (rm == 4) 2873 { // SIB byte addressing 2874 uint sib; 2875 uint base; 2876 uint index; 2877 2878 sib = c.Isib; 2879 base = sib & 7; 2880 index = (sib >> 3) & 7; 2881 if (base == r1 && 2882 !(r1 == 5 && mod == 0) && 2883 !(r2 == 5 && mod == 0) 2884 ) 2885 base = r2; 2886 if (index == r1) 2887 index = r2; 2888 c.Isib = cast(ubyte)((sib & 0xC0) | (index << 3) | base); 2889 } 2890 else if (rm == r1) 2891 { 2892 if (r1 == BP && r2 == SP) 2893 { // Replace [EBP] with [ESP] 2894 c.Irm = cast(ubyte)(mod | reg | 4); 2895 c.Isib = modregrm(0,4,SP); 2896 } 2897 else if (r2 == BP && mod == 0) 2898 { 2899 c.Irm = cast(ubyte)(modregrm(1,0,0) | reg | r2); 2900 c.IFL1 = FLconst; 2901 c.IEV1.Vint = 0; 2902 } 2903 else 2904 c.Irm = cast(ubyte)(mod | reg | r2); 2905 } 2906 } 2907 } 2908 2909 /****************************************** 2910 * Instruction scheduler. 2911 * Input: 2912 * c list of instructions to schedule 2913 * scratch scratch registers we can use 2914 * Returns: 2915 * revised list of scheduled instructions 2916 */ 2917 2918 /****************************************** 2919 * Swap c1 and c2. 2920 * c1 comes before c2. 
* Swap in place to not disturb addresses of jmp targets
 *
 * The contents of the two code structs are exchanged; each node keeps its
 * own position in the list (its next link) and its own CFtarg/CFtarg2 flags.
 *
 * Params:
 *      c1 = earlier instruction
 *      c2 = later instruction
 */

private void code_swap(code *c1,code *c2)
{   code cs;

    // Special case of:
    //  PUSH    reg1
    //  MOV     reg2,x[ESP]
    // After the swap the MOV runs before the PUSH has adjusted ESP, so its
    // displacement is reduced by REGSIZE. Only done for a disp8 [ESP] operand
    // whose displacement is at least REGSIZE and when reg1 != reg2.
    //printf("code_swap(%x, %x)\n",c1,c2);
    if ((c1.Iop & ~7) == 0x50 &&
        c2.Iop == 0x8B &&
        (c2.Irm & modregrm(3,0,7)) == modregrm(1,0,4) &&
        c2.Isib == modregrm(0,4,SP) &&
        c2.IFL1 == FLconst &&
        (cast(byte)c2.IEV1.Vpointer) >= REGSIZE &&
        (c1.Iop & 7) != ((c2.Irm >> 3) & 7)
       )
        c2.IEV1.Vpointer -= REGSIZE;


    cs = *c2;
    *c2 = *c1;
    *c1 = cs;
    // Retain original CFtarg
    c1.Iflags = (c1.Iflags & ~(CFtarg | CFtarg2)) | (c2.Iflags & (CFtarg | CFtarg2));
    c2.Iflags = (c2.Iflags & ~(CFtarg | CFtarg2)) | (cs.Iflags & (CFtarg | CFtarg2));

    // Restore the list links, which were exchanged along with the contents
    c1.next = c2.next;
    c2.next = cs.next;
}

/******************************************
 * Peephole pass over adjacent instruction pairs (c, c1).
 *
 * Rewrites performed, none of them when c1 is a jump target:
 *  - after PUSH regx, simplify a following [ESP] reference to use regx
 *  - MOV reg,[ESP] / ADD ESP,4  =>  POP reg
 *  - merge two consecutive ADD/SUB reg,imm8 on the same register
 *  - after MOV r1,r2, make the following instruction use r2 instead of r1
 *
 * Params:
 *      cstart  = first instruction of the list, modified in place
 *      scratch = scratch register mask; not referenced by this function
 * Returns:
 *      cstart
 */

code *peephole(code *cstart,regm_t scratch)
{
    // Look for cases of:
    //  MOV r1,r2
    //  OP ?,r1
    // we can replace with:
    //  MOV r1,r2
    //  OP ?,r2
    // to improve pairing
    code *c1;
    uint r1,r2;
    uint mod,reg,rm;

    //printf("peephole\n");
    for (code *c = cstart; c; c = c1)
    {
        ubyte rmn;

        //c.print();
        c1 = cnext(c);
    Ln:
        if (!c1)
            break;
        if (c1.Iflags & (CFtarg | CFtarg2))
            continue;

        // Do:
        //      PUSH    reg
        if (I32 && (c.Iop & ~7) == 0x50)
        {
            uint regx = c.Iop & 7;

            //  MOV     [ESP],regx       =>      NOP
            //  MOV     regx,[ESP]       =>      NOP
            // Right after PUSH regx, [ESP] already holds regx, so both the
            // store (0x89) and the load (0x8B) are redundant. The test used
            // to accept only 0x8B although the store form was documented.
            if ((c1.Iop == 0x89 || c1.Iop == 0x8B) &&
                c1.Irm == modregrm(0,regx,4) &&
                c1.Isib == modregrm(0,4,SP))
            {   c1.Iop = NOP;
                continue;
            }

            //  PUSH    [ESP]           =>      PUSH    regx
            if (c1.Iop == 0xFF &&
                c1.Irm == modregrm(0,6,4) &&
                c1.Isib == modregrm(0,4,SP))
            {   c1.Iop = 0x50 + regx;
                continue;
            }

            //  CMP     [ESP],imm       =>      CMP     regx,imm
            if (c1.Iop == 0x83 &&
                c1.Irm == modregrm(0,7,4) &&
                c1.Isib == modregrm(0,4,SP))
            {   c1.Irm = modregrm(3,7,regx);
                if (c1.IFL2 == FLconst && cast(byte)c1.IEV2.Vuns == 0)
                {   // to TEST regx,regx
                    c1.Iop = (c1.Iop & 1) | 0x84;
                    c1.Irm = modregrm(3,regx,regx);
                }
                continue;
            }

        }

        // Do:
        //      MOV     reg,[ESP]       =>      POP     reg
        //      ADD     ESP,4           =>      NOP
        // The modregrm byte of c1 is compared in full: for opcode 0x83 the
        // reg field selects the operation (/0 is ADD), so masking it out
        // would also match SUB/AND/OR/CMP ESP,4 and miscompile them.
        if (I32 && c.Iop == 0x8B && (c.Irm & 0xC7) == modregrm(0,0,4) &&
            c.Isib == modregrm(0,4,SP) &&
            c1.Iop == 0x83 && c1.Irm == modregrm(3,0,SP) &&
            !(c1.Iflags & CFpsw) && c1.IFL2 == FLconst && c1.IEV2.Vint == 4)
        {
            uint regx = (c.Irm >> 3) & 7;
            c.Iop = 0x58 + regx;
            c1.Iop = NOP;
            continue;
        }

        // Combine two SUBs of the same register
        // (any ADD/SUB reg,imm8 pair whose flags result is not needed)
        if (c.Iop == c1.Iop &&
            c.Iop == 0x83 &&
            (c.Irm & 0xC0) == 0xC0 &&
            (c.Irm & modregrm(3,0,7)) == (c1.Irm & modregrm(3,0,7)) &&
            !(c1.Iflags & CFpsw) &&
            c.IFL2 == FLconst && c1.IFL2 == FLconst
           )
        {   int i = cast(byte)c.IEV2.Vint;
            int i1 = cast(byte)c1.IEV2.Vint;
            switch ((c.Irm & modregrm(0,7,0)) | ((c1.Irm & modregrm(0,7,0)) >> 3))
            {
                case (0 << 3) | 0:              // ADD, ADD
                case (5 << 3) | 5:              // SUB, SUB
                    i += i1;
                    goto Laa;
                case (0 << 3) | 5:              // ADD, SUB
                case (5 << 3) | 0:              // SUB, ADD
                    i -= i1;
                    goto Laa;
                Laa:
                    // Combined immediate no longer fits in a signed byte:
                    // switch from opcode 0x83 to 0x81
                    if (cast(byte)i != i)
                        c.Iop &= ~2;
                    c.IEV2.Vint = i;
                    c1.Iop = NOP;
                    if (i == 0)
                        c.Iop = NOP;
                    continue;

                default:
                    break;
            }
        }

        if (c.Iop == 0x8B && (c.Irm & 0xC0) == 0xC0)    // MOV r1,r2
        {   r1 = (c.Irm >> 3) & 7;
            r2 = c.Irm & 7;
        }
        else if (c.Iop == 0x89 && (c.Irm & 0xC0) == 0xC0)   // MOV r1,r2
        {   r1 = c.Irm & 7;
            r2 = (c.Irm >> 3) & 7;
        }
        else
        {
            continue;
        }

        // c is MOV r1,r2: try to make c1 read r2 instead of r1.
        // rmn/mod/reg/rm are sampled before repEA() may rewrite c1.Irm.
        rmn = c1.Irm;
        mod = rmn & 0xC0;
        reg = rmn & modregrm(0,7,0);
        rm =  rmn & 7;
        if (cod3_EA(c1))
            repEA(c1,r1,r2);
        switch (c1.Iop)
        {
            case 0x50:
            case 0x51:
            case 0x52:
            case 0x53:
            case 0x54:
            case 0x55:
            case 0x56:
            case 0x57:                          // PUSH reg
                if ((c1.Iop & 7) == r1)
                {   c1.Iop = 0x50 | r2;
                    //printf("schedule PUSH reg\n");
                }
                break;

            case 0x81:
            case 0x83:
                // Look for CMP EA,imm
                if (reg == modregrm(0,7,0))
                {
                    if (mod == 0xC0 && rm == r1)
                        c1.Irm = cast(ubyte)(mod | reg | r2);
                }
                break;

            case 0x84:                  // TEST reg,byte ptr EA
                if (r1 >= 4 || r2 >= 4) // if not a byte register
                    break;
                // Only the low 2 bits of each register field are compared
                // and replaced; bit 2 is carried over unchanged.
                if ((rmn & 0xC0) == 0xC0)
                {
                    if ((rmn & 3) == r1)
                    {   c1.Irm = rmn = cast(ubyte)((rmn & modregrm(3,7,4)) | r2);
                        //printf("schedule 1\n");
                    }
                }
                if ((rmn & modregrm(0,3,0)) == modregrm(0,r1,0))
                {   c1.Irm = (rmn & modregrm(3,4,7)) | modregrm(0,r2,0);
                    //printf("schedule 2\n");
                }
                break;
            case 0x85:                  // TEST reg,word ptr EA
                if ((rmn & 0xC0) == 0xC0)
                {
                    if ((rmn & 7) == r1)
                    {   c1.Irm = rmn = cast(ubyte)((rmn & modregrm(3,7,0)) | r2);
                        //printf("schedule 3\n");
                    }
                }
                if ((rmn & modregrm(0,7,0)) == modregrm(0,r1,0))
                {   c1.Irm = (rmn & modregrm(3,0,7)) | modregrm(0,r2,0);
                    //printf("schedule 4\n");
                }
                break;

            case 0x89:                  // MOV EA,reg
                if ((rmn & modregrm(0,7,0)) == modregrm(0,r1,0))
                {   c1.Irm = (rmn & modregrm(3,0,7)) | modregrm(0,r2,0);
                    //printf("schedule 5\n");
                    if (c1.Irm == modregrm(3,r2,r2))    // became MOV r2,r2
                        goto Lnop;
                }
                break;

            case 0x8B:                  // MOV reg,EA
                if ((rmn & 0xC0) == 0xC0 &&
                    (rmn & 7) == r1)            // if EA == r1
                {   c1.Irm = cast(ubyte)((rmn & modregrm(3,7,0)) | r2);
                    //printf("schedule 6\n");
                    if (c1.Irm == modregrm(3,r2,r2))    // became MOV r2,r2
                        goto Lnop;
                }
                break;

            case 0x3C:                  // CMP AL,imm8
                if (r1 == AX && r2 < 4)
                {   c1.Iop = 0x80;
                    c1.Irm = modregrm(3,7,r2);
                    //printf("schedule 7, r2 = %d\n", r2);
                }
                break;

            case 0x3D:                  // CMP AX,imm16
                if (r1 == AX)
                {   c1.Iop = 0x81;
                    c1.Irm = modregrm(3,7,r2);
                    // Use the sign-extended imm8 form when the value fits
                    if (c1.IFL2 == FLconst &&
                        c1.IEV2.Vuns == cast(byte)c1.IEV2.Vuns)
                        c1.Iop = 0x83;
                    //printf("schedule 8\n");
                }
                break;

            default:
                break;
        }
        continue;
Lnop:
        // c1 became a no-op; keep c and examine the instruction after c1
        c1.Iop = NOP;
        c1 = cnext(c1);
        goto Ln;
    }
    return cstart;
}

/*****************************************************************/

/**********************************************
 * Replace complex instructions with simple ones more conducive
 * to scheduling.
 *
 * CMP mem,imm8 (opcode 0x83 /7) and PUSH mem (opcode 0xFF /6) are split into
 * a MOV reg,mem followed by the register form, using a register taken from
 * scratch. Jump targets and CFopsize instructions are skipped.
 *
 * Params:
 *      c       = first instruction of the list, modified in place
 *      scratch = mask of registers that may be used as temporaries;
 *                registers in fregsaved are excluded
 * Returns:
 *      first instruction of the list; c is returned untouched when fewer
 *      than two usable scratch registers remain
 */

code *simpleops(code *c,regm_t scratch)
{   code *cstart;
    uint reg;
    code *c2;

    // Worry about using registers not saved yet by prolog
    scratch &= ~fregsaved;

    if (!(scratch & (scratch - 1)))     // if 0 or 1 registers
        return c;

    reg = findreg(scratch);

    cstart = c;
    for (code** pc = &cstart; *pc; pc = &(*pc).next)
    {
        c = *pc;
        if (c.Iflags & (CFtarg | CFtarg2 | CFopsize))
            continue;
        if (c.Iop == 0x83 &&
            (c.Irm & modregrm(0,7,0)) == modregrm(0,7,0) &&
            (c.Irm & modregrm(3,0,0)) != modregrm(3,0,0)
           )
        {   // Replace CMP mem,imm with:
            //  MOV reg,mem
            //  CMP reg,imm
            targ_long imm;

            //printf("replacing CMP\n");
            c.Iop = 0x8B;
            c.Irm = (c.Irm & modregrm(3,0,7)) | modregrm(0,reg,0);

            c2 = code_calloc();
            if (reg == AX)
                c2.Iop = 0x3D;
            else
            {   c2.Iop = 0x83;
                c2.Irm = modregrm(3,7,reg);
            }
            c2.IFL2 = c.IFL2;
            c2.IEV2 = c.IEV2;

            // See if c2 should be replaced by a TEST
            imm = c2.IEV2.Vuns;
            if (!(c2.Iop & 1))
                imm &= 0xFF;
            else if (I32 ? c.Iflags & CFopsize : !(c.Iflags & CFopsize))
                imm = cast(short) imm;
            if (imm == 0)
            {
                c2.Iop = 0x85;              // TEST reg,reg
                c2.Irm = modregrm(3,reg,reg);
            }
            goto L1;
        }
        else if (c.Iop == 0xFF &&
                 (c.Irm & modregrm(0,7,0)) == modregrm(0,6,0) &&
                 (c.Irm & modregrm(3,0,0)) != modregrm(3,0,0)
                )
        {   // Replace PUSH mem with:
            //  MOV reg,mem
            //  PUSH reg

            // printf("replacing PUSH\n");
            c.Iop = 0x8B;
            c.Irm = (c.Irm & modregrm(3,0,7)) | modregrm(0,reg,0);

            c2 = gen1(null,0x50 + reg);
        L1:
            // Shared tail of both rewrites: link c2 in right after c
            //c.print();
            //c2.print();
            c2.next = c.next;
            c.next = c2;

            // Switch to another reg
            if (scratch & ~mask(reg))
                reg = findreg(scratch & ~mask(reg));
        }
    }
    return cstart;
}

}