1 /**
2  * Compiler implementation of the
3  * $(LINK2 http://www.dlang.org, D programming language).
4  *
5  * Copyright:   Copyright (C) 1995-1998 by Symantec
6  *              Copyright (C) 2000-2020 by The D Language Foundation, All Rights Reserved
7  * Authors:     $(LINK2 http://www.digitalmars.com, Walter Bright)
8  * License:     $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 * Source:      $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/backend/cgsched.d, backend/cgsched.d)
10  */
11 
12 module dmd.backend.cgsched;
13 
14 version (SCPP)
15     version = COMPILE;
16 version (MARS)
17     version = COMPILE;
18 
19 version (COMPILE)
20 {
21 
22 import core.stdc.stdio;
23 import core.stdc.stdlib;
24 import core.stdc.string;
25 
26 import dmd.backend.cc;
27 import dmd.backend.cdef;
28 import dmd.backend.code;
29 import dmd.backend.code_x86;
30 import dmd.backend.dlist;
31 import dmd.backend.global;
32 import dmd.backend.mem;
33 import dmd.backend.ty;
34 import dmd.backend.barray;
35 
36 extern (C++):
37 
38 nothrow:
39 
// Prototypes of routines used by the scheduler.  Only the declarations
// appear here; their bodies are not part of this section of the file.
int REGSIZE();                          // NOTE(review): presumably the register size in bytes - confirm
code *gen1(code *c, uint op);           // NOTE(review): presumably appends an instruction with opcode op to list c - confirm
code *gen2(code *c, uint op, uint rm);  // NOTE(review): as gen1, with rm presumably the modregrm byte - confirm
43 
/**
 * Build a mask with a single bit set.
 * Params:
 *      m = bit position
 * Returns:
 *      value with only bit `m` set
 */
private uint mask(uint m)
{
    return 1u << m;
}
45 
/**
 * Determine whether 32 bit addressing is in effect.
 *
 * When I64 is set the answer is always true.  Otherwise the result is
 * `x` toggled by the CFaddrsize bit of `Iflags`.
 * The original formulation relied on `x` being exactly 0 or 1 (true for
 * the current definition of I32); typing `x` as bool keeps that invariant.
 *
 * Note: even for linux targets, CFaddrsize can be set by the inline
 * assembler.
 * Params:
 *      x = default addressing choice to be toggled
 *      Iflags = instruction flags, examined for CFaddrsize
 * Returns:
 *      true for 32 bit addressing
 */
private bool is32bitaddr(bool x, uint Iflags)
{
    if (I64)
        return true;

    const bool addrSizeToggled = (Iflags & CFaddrsize) != 0;
    return x != addrSizeToggled;
}
53 
/**
 * Returns:
 *      true if the Pentium Pro scheduler is to be used, i.e. when
 *      config.target_cpu is TARGET_PentiumPro or above
 */
private bool PRO()
{
    return config.target_cpu >= TARGET_PentiumPro;
}
56 
/// Kind of floating point instruction, stored in Cinfo.fp_op.
/// Members start at 1, so a zero fp_op tests as false.
private enum FP : ubyte
{
    fstp = 1,   /// FSTP mem
    fld,        /// FLD mem
    fop,        /// Fop ST0,mem or Fop ST0
}
63 
/// Bit flags stored in Cinfo.flags
private enum CIFL : ubyte
{
    arraybounds = 1 << 0,   /// this instruction is a jmp to array bounds
    ea          = 1 << 1,   /// this instruction has a memory-referencing
                            /// modregrm EA byte
    nostage     = 1 << 2,   /// don't stage these instructions
    push        = 1 << 3,   /// it's a push we can swap around
}
72 
/// Everything the scheduler gathers about a single instruction
struct Cinfo
{
    code *c;            /// the instruction
    ubyte pair;         /// pairing information
    ubyte sz;           /// operand size
    ubyte isz;          /// instruction size

    // Floating point scheduling
    ubyte fxch_pre;
    ubyte fxch_post;
    FP fp_op;           /// one of FP.xxx, 0 if none

    ubyte flags;        /// CIFL.xxx bits

    uint r;             /// read mask
    uint w;             /// write mask
    uint a;             /// registers used in addressing mode
    ubyte reg;          /// reg field of modregrm byte
    ubyte uops;         /// Pentium Pro micro-ops
    uint sibmodrm;      /// (sib << 8) + mod__rm byte
    uint spadjust;      /// if !=0, amount ESP changes as a result of
                        /// executing this instruction
    int fpuadjust;      /// if !=0, amount FPU stack changes as a result
                        /// of executing this instruction

    /**
     * Pretty-print the contents of this Cinfo to stdout.
     * Prints "Cinfo 0" if invoked through a null pointer.
     */
    nothrow void print()
    {
        Cinfo *self = &this;
        if (self is null)
        {
            printf("Cinfo 0\n");
            return;
        }

        printf("Cinfo %p:  c %p, pair %x, sz %d, isz %d, flags - ",
               self, c, pair, sz, isz);

        // Decode the flag bits by name
        if (flags & CIFL.arraybounds)
            printf("arraybounds,");
        if (flags & CIFL.ea)
            printf("ea,");
        if (flags & CIFL.nostage)
            printf("nostage,");
        if (flags & CIFL.push)
            printf("push,");

        // Any bit outside the known set is reported as an error
        if (flags & ~(CIFL.arraybounds|CIFL.nostage|CIFL.push|CIFL.ea))
            printf("bad flag,");

        printf("\n\tr %x w %x a %x reg %x uops %x sibmodrm %x spadjust %d\n",
                cast(int)r, cast(int)w, cast(int)a, reg, uops, sibmodrm, cast(int)spadjust);

        if (!fp_op)
            return;

        // FP members start at 1, hence the -1 when indexing the name table
        __gshared const(char*)[3] fpops = ["fstp","fld","fop"];
        printf("\tfp_op %s, fxch_pre %x, fxch_post %x\n",
                fpops[fp_op-1], fxch_pre, fxch_post);
    }

}
133 
134 
/*****************************************
 * Do Pentium optimizations on a code list.
 *
 * Runs peephole() (unless I64), then for I32 runs simpleops() (only when
 * the target cpu is Pentium or Pentium MMX) followed by schedule().
 * Nothing is done if config.target_scheduler is below TARGET_80486.
 * Params:
 *      pc = pointer to the code list; updated in place
 *      scratch = scratch registers we can use
 */

private void cgsched_pentium(code **pc,regm_t scratch)
{
    //printf("scratch = x%02x\n",scratch);
    if (config.target_scheduler < TARGET_80486)
        return;

    if (!I64)
        *pc = peephole(*pc, 0);

    if (!I32)                           // forget about 16 bit code
        return;

    const cpu = config.target_cpu;
    if (cpu == TARGET_Pentium || cpu == TARGET_PentiumMMX)
        *pc = simpleops(*pc, scratch);

    *pc = schedule(*pc, 0);
}
157 
/************************************
 * Entry point: schedule the code of one block.
 *
 * Only acts when optimizing for speed (CFG4speed), the target cpu is at
 * least a Pentium, and the block is not an asm block (BCasm).
 * Params:
 *      b = block whose Bcode is rewritten in place
 */
void cgsched_block(block* b)
{
    if (!(config.flags4 & CFG4speed) ||
        config.target_cpu < TARGET_Pentium ||
        b.BC == BCasm)
        return;

    // Registers that must not be used as scratch
    const occupied = b.Bregcon.used | b.Bregcon.params | mfuncreg;
    const tracked  = b.Bregcon.immed.mval | b.Bregcon.cse.mval;

    regm_t scratch = allregs;
    scratch &= ~occupied;
    scratch &= ~tracked;

    cgsched_pentium(&b.Bcode, scratch);
    //printf("after schedule:\n"); WRcodlst(b.Bcode);
}
175 
/// Pairing attribute bits, used as the entries of pentcycl[]
enum
{
    NP    = 0,          /// not pairable
    PU    = 1 << 0,     /// pairable in U only, never executed in V
    PV    = 1 << 1,     /// pairable in V only
    UV    = PU | PV,    /// pairable in both U and V
    PE    = 1 << 2,     /// register contention exception
    PF    = 1 << 3,     /// flags contention exception
    FX    = 1 << 4,     /// pairable with FXCH instruction
}
186 
/// Pairing attributes (NP/PU/PV/UV, optionally with PE/PF, or FX) for each
/// of the 256 one-byte opcode values.  Laid out 8 entries per row; the
/// trailing comment on each row is the hex value of the row's first opcode.
extern (D) private immutable ubyte[256] pentcycl =
[
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 0
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 8
        PU,PU,PU,PU,    PU,PU,NP,NP,    // 10
        PU,PU,PU,PU,    PU,PU,NP,NP,    // 18
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 20
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 28
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 30
        UV,UV,UV,UV,    UV,UV,NP,NP,    // 38

        UV,UV,UV,UV,    UV,UV,UV,UV,    // 40
        UV,UV,UV,UV,    UV,UV,UV,UV,    // 48
        PE|UV,PE|UV,PE|UV,PE|UV,        PE|UV,PE|UV,PE|UV,PE|UV, // 50  PUSH reg
        PE|UV,PE|UV,PE|UV,PE|UV,        PE|UV,PE|UV,PE|UV,PE|UV, // 58  POP reg
        NP,NP,NP,NP,    NP,NP,NP,NP,    // 60
        PE|UV,NP,PE|UV,NP,      NP,NP,NP,NP,    // 68
        PV|PF,PV|PF,PV|PF,PV|PF,        PV|PF,PV|PF,PV|PF,PV|PF,        // 70   Jcc rel8
        PV|PF,PV|PF,PV|PF,PV|PF,        PV|PF,PV|PF,PV|PF,PV|PF,        // 78   Jcc rel8

        NP,NP,NP,NP,    NP,NP,NP,NP,    // 80
        UV,UV,UV,UV,    NP,UV,NP,NP,    // 88
        NP,NP,NP,NP,    NP,NP,NP,NP,    // 90
        NP,NP,NP,NP,    NP,NP,NP,NP,    // 98
        UV,UV,UV,UV,    NP,NP,NP,NP,    // A0
        UV,UV,NP,NP,    NP,NP,NP,NP,    // A8
        UV,UV,UV,UV,    UV,UV,UV,UV,    // B0
        UV,UV,UV,UV,    UV,UV,UV,UV,    // B8

        NP,NP,NP,NP,    NP,NP,NP,NP,    // C0
        NP,NP,NP,NP,    NP,NP,NP,NP,    // C8
        PU,PU,NP,NP,    NP,NP,NP,NP,    // D0
        FX,NP,FX,FX,    NP,NP,FX,NP,    // D8   all floating point
        NP,NP,NP,NP,    NP,NP,NP,NP,    // E0
        PE|PV,PV,NP,PV, NP,NP,NP,NP,    // E8
        NP,NP,NP,NP,    NP,NP,NP,NP,    // F0
        NP,NP,NP,NP,    NP,NP,NP,NP,    // F8
];
225 
226 /********************************************
227  * For each opcode, determine read [0] and written [1] masks.
228  */
229 
/// Flag bits that are OR'ed together (and with register masks such as mAX)
/// to form the read and write masks in the opcode tables.
enum
{
    EA    = 1 << 20,    /// effective address operand
    R     = 1 << 21,    /// register (reg of modregrm field)
    N     = 1 << 22,    /// other things modified, not swappable
    B     = 1 << 23,    /// it's a byte operation
    C     = 1 << 24,    /// floating point flags
    mMEM  = 1 << 25,    /// memory
    S     = 1 << 26,    /// floating point stack
    F     = 1 << 27,    /// flags
}
241 
/// Read/write masks for each of the 256 one-byte opcode values.
/// For opcode `op`, oprw[op][0] is what the instruction reads and
/// oprw[op][1] is what it writes.  Each mask is a combination of register
/// masks (mAX, mCX, ...) and the EA/R/N/B/C/mMEM/S/F flag bits.
/// Entries are grouped 8 per section, each headed by the hex value of its
/// first opcode.
extern (D) private immutable uint[2][256] oprw =
[
      // 00
      [ EA|R|B, F|EA|B ],       // ADD
      [ EA|R,   F|EA   ],
      [ EA|R|B, F|R|B  ],
      [ EA|R,   F|R    ],
      [ mAX,    F|mAX  ],
      [ mAX,    F|mAX  ],
      [ N,      N      ],       // PUSH ES
      [ N,      N      ],       // POP  ES

      // 08
      [ EA|R|B, F|EA|B ],       // OR
      [ EA|R,   F|EA   ],
      [ EA|R|B, F|R|B  ],
      [ EA|R,   F|R    ],
      [ mAX,    F|mAX  ],
      [ mAX,    F|mAX  ],
      [ N,      N      ],       // PUSH CS
      [ N,      N      ],       // 2 byte escape

      // 10
      [ F|EA|R|B,F|EA|B ],      // ADC
      [ F|EA|R, F|EA    ],
      [ F|EA|R|B,F|R|B  ],
      [ F|EA|R, F|R     ],
      [ F|mAX,  F|mAX   ],
      [ F|mAX,  F|mAX   ],
      [ N,      N       ],      // PUSH SS
      [ N,      N       ],      // POP  SS

      // 18
      [ F|EA|R|B,F|EA|B ],      // SBB
      [ F|EA|R, F|EA    ],
      [ F|EA|R|B,F|R|B  ],
      [ F|EA|R, F|R     ],
      [ F|mAX,  F|mAX   ],
      [ F|mAX,  F|mAX   ],
      [ N,      N       ],      // PUSH DS
      [ N,      N       ],      // POP  DS

      // 20
      [ EA|R|B, F|EA|B ],       // AND
      [ EA|R,   F|EA   ],
      [ EA|R|B, F|R|B  ],
      [ EA|R,   F|R    ],
      [ mAX,    F|mAX  ],
      [ mAX,    F|mAX  ],
      [ N,      N      ],       // SEG ES
      [ F|mAX,  F|mAX  ],       // DAA

      // 28
      [ EA|R|B, F|EA|B ],       // SUB
      [ EA|R,   F|EA   ],
      [ EA|R|B, F|R|B  ],
      [ EA|R,   F|R    ],
      [ mAX,    F|mAX  ],
      [ mAX,    F|mAX  ],
      [ N,      N      ],       // SEG CS
      [ F|mAX,  F|mAX  ],       // DAS

      // 30
      [ EA|R|B, F|EA|B ],       // XOR
      [ EA|R,   F|EA   ],
      [ EA|R|B, F|R|B  ],
      [ EA|R,   F|R    ],
      [ mAX,    F|mAX  ],
      [ mAX,    F|mAX  ],
      [ N,      N      ],       // SEG SS
      [ F|mAX,  F|mAX  ],       // AAA

      // 38
      [ EA|R|B, F ],            // CMP
      [ EA|R,   F ],
      [ EA|R|B, F ],
      [ EA|R,   F ],
      [ mAX,    F ],            // CMP AL,imm8
      [ mAX,    F ],            // CMP EAX,imm16/32
      [ N,      N ],            // SEG DS
      [ N,      N ],            // AAS

      // 40
      [ mAX,    F|mAX ],        // INC EAX
      [ mCX,    F|mCX ],
      [ mDX,    F|mDX ],
      [ mBX,    F|mBX ],
      [ mSP,    F|mSP ],
      [ mBP,    F|mBP ],
      [ mSI,    F|mSI ],
      [ mDI,    F|mDI ],

      // 48
      [ mAX,    F|mAX ],        // DEC EAX
      [ mCX,    F|mCX ],
      [ mDX,    F|mDX ],
      [ mBX,    F|mBX ],
      [ mSP,    F|mSP ],
      [ mBP,    F|mBP ],
      [ mSI,    F|mSI ],
      [ mDI,    F|mDI ],

      // 50
      [ mAX|mSP,        mSP|mMEM ],             // PUSH EAX
      [ mCX|mSP,        mSP|mMEM ],
      [ mDX|mSP,        mSP|mMEM ],
      [ mBX|mSP,        mSP|mMEM ],
      [ mSP|mSP,        mSP|mMEM ],
      [ mBP|mSP,        mSP|mMEM ],
      [ mSI|mSP,        mSP|mMEM ],
      [ mDI|mSP,        mSP|mMEM ],

      // 58
      [ mSP|mMEM,       mAX|mSP ],              // POP EAX
      [ mSP|mMEM,       mCX|mSP ],
      [ mSP|mMEM,       mDX|mSP ],
      [ mSP|mMEM,       mBX|mSP ],
      [ mSP|mMEM,       mSP|mSP ],
      [ mSP|mMEM,       mBP|mSP ],
      [ mSP|mMEM,       mSI|mSP ],
      [ mSP|mMEM,       mDI|mSP ],

      // 60
      [ N,      N ],            // PUSHA
      [ N,      N ],            // POPA
      [ N,      N ],            // BOUND Gv,Ma
      [ N,      N ],            // ARPL  Ew,Rw
      [ N,      N ],            // SEG FS
      [ N,      N ],            // SEG GS
      [ N,      N ],            // operand size prefix
      [ N,      N ],            // address size prefix

      // 68
      [ mSP,    mSP|mMEM ],     // PUSH immed16/32
      [ EA,     F|R      ],     // IMUL Gv,Ev,lv
      [ mSP,    mSP|mMEM ],     // PUSH immed8
      [ EA,     F|R      ],     // IMUL Gv,Ev,lb
      [ N,      N        ],     // INSB Yb,DX
      [ N,      N        ],     // INSW/D Yv,DX
      [ N,      N        ],     // OUTSB DX,Xb
      [ N,      N        ],     // OUTSW/D DX,Xv

      // 70                     Jcc rel8: reads flags, not swappable
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],

      // 78                     Jcc rel8, continued
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],
      [ F|N,    N ],

      // 80
      [ N,      N    ],
      [ N,      N    ],
      [ N,      N    ],
      [ N,      N    ],
      [ EA|R,   F    ],         // TEST EA,r8
      [ EA|R,   F    ],         // TEST EA,r16/32
      [ EA|R,   EA|R ],         // XCHG EA,r8
      [ EA|R,   EA|R ],         // XCHG EA,r16/32

      // 88
      [ R|B,    EA|B ],         // MOV EA8,r8
      [ R,      EA ],           // MOV EA,r16/32
      [ EA|B,   R|B ],          // MOV r8,EA8
      [ EA,     R ],            // MOV r16/32,EA
      [ N,      N ],            // MOV EA,segreg
      [ EA,     R ],            // LEA r16/32,EA
      [ N,      N ],            // MOV segreg,EA
      [ mSP|mMEM, EA|mSP ],     // POP mem16/32

      // 90
      [ 0,              0       ],      // NOP
      [ mAX|mCX,        mAX|mCX ],
      [ mAX|mDX,        mAX|mDX ],
      [ mAX|mBX,        mAX|mBX ],
      [ mAX|mSP,        mAX|mSP ],
      [ mAX|mBP,        mAX|mBP ],
      [ mAX|mSI,        mAX|mSI ],
      [ mAX|mDI,        mAX|mDI ],

      // 98
      [ mAX,            mAX      ],     // CBW
      [ mAX,            mDX      ],     // CWD
      [ N,              N|F      ],     // CALL far ptr
      [ N,              N        ],     // WAIT
      [ F|mSP,          mSP|mMEM ],     // PUSHF
      [ mSP|mMEM,       F|mSP    ],     // POPF
      [ mAX,            F        ],     // SAHF
      [ F,              mAX      ],     // LAHF

      // A0
      [ mMEM,           mAX  ],         // MOV AL,moffs8
      [ mMEM,           mAX  ],         // MOV EAX,moffs32
      [ mAX,            mMEM ],         // MOV moffs8,AL
      [ mAX,            mMEM ],         // MOV moffs32,EAX
      [ N,              N    ],         // MOVSB
      [ N,              N    ],         // MOVSW/D
      [ N,              N    ],         // CMPSB
      [ N,              N    ],         // CMPSW/D

      // A8
      [ mAX,    F ],                    // TEST AL,imm8
      [ mAX,    F ],                    // TEST AX,imm16
      [ N,      N ],                    // STOSB
      [ N,      N ],                    // STOSW/D
      [ N,      N ],                    // LODSB
      [ N,      N ],                    // LODSW/D
      [ N,      N ],                    // SCASB
      [ N,      N ],                    // SCASW/D

      // B0
      [ 0,      mAX ],                  // MOV AL,imm8
      [ 0,      mCX ],
      [ 0,      mDX ],
      [ 0,      mBX ],
      [ 0,      mAX ],
      [ 0,      mCX ],
      [ 0,      mDX ],
      [ 0,      mBX ],

      // B8
      [ 0,      mAX ],                  // MOV AX,imm16
      [ 0,      mCX ],
      [ 0,      mDX ],
      [ 0,      mBX ],
      [ 0,      mSP ],
      [ 0,      mBP ],
      [ 0,      mSI ],
      [ 0,      mDI ],

      // C0
      [ EA,     F|EA ],         // Shift Eb,Ib
      [ EA,     F|EA ],
      [ N,      N    ],
      [ N,      N    ],
      [ N,      N    ],
      [ N,      N    ],
      [ 0,      EA|B ],         // MOV EA8,imm8
      [ 0,      EA   ],         // MOV EA,imm16

      // C8
      [ N,      N ],            // ENTER
      [ N,      N ],            // LEAVE
      [ N,      N ],            // RETF lw
      [ N,      N ],            // RETF
      [ N,      N ],            // INT 3
      [ N,      N ],            // INT lb
      [ N,      N ],            // INTO
      [ N,      N ],            // IRET

      // D0
      [ EA,             F|EA  ],        // Shift EA,1
      [ EA,             F|EA  ],
      [ EA|mCX,         F|EA  ],        // Shift EA,CL
      [ EA|mCX,         F|EA  ],
      [ mAX,            F|mAX ],        // AAM
      [ mAX,            F|mAX ],        // AAD
      [ N,              N     ],        // reserved
      [ mAX|mBX|mMEM,   mAX   ],        // XLAT

      // D8                     floating point opcodes; see grpf1[] for the
      //                        masks used when Irm < 0xC0
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],
      [ N,      N ],

      // E0
      [ F|mCX|N,mCX|N ],        // LOOPNE jb
      [ F|mCX|N,mCX|N ],        // LOOPE  jb
      [ mCX|N,  mCX|N ],        // LOOP   jb
      [ mCX|N,  N     ],        // JCXZ   jb
      [ N,      N     ],        // IN AL,lb
      [ N,      N     ],        // IN EAX,lb
      [ N,      N     ],        // OUT lb,AL
      [ N,      N     ],        // OUT lb,EAX

      // E8
      [ N,      N|F   ],        // CALL jv
      [ N,      N     ],        // JMP Jv
      [ N,      N     ],        // JMP Ab
      [ N,      N     ],        // JMP jb
      [ N|mDX,  N|mAX ],        // IN AL,DX
      [ N|mDX,  N|mAX ],        // IN AX,DX
      [ N|mAX|mDX,N   ],        // OUT DX,AL
      [ N|mAX|mDX,N   ],        // OUT DX,AX

      // F0
      [ N,      N ],            // LOCK
      [ N,      N ],            // reserved
      [ N,      N ],            // REPNE
      [ N,      N ],            // REP,REPE
      [ N,      N ],            // HLT
      [ F,      F ],            // CMC
      [ N,      N ],
      [ N,      N ],

      // F8
      [ 0,      F    ],         // CLC
      [ 0,      F    ],         // STC
      [ N,      N    ],         // CLI
      [ N,      N    ],         // STI
      [ N,      N    ],         // CLD
      [ N,      N    ],         // STD
      [ EA,     F|EA ],         // INC/DEC
      [ N,      N    ],
];
564 
/****************************************
 * Read/write masks for the opcode groups, in the same form as oprw[]:
 *      [group][member][0] = read
 *                     [1] = write
 * The groups present are, in order: Grp 1, Grp 3, Grp 5 and the byte
 * version of Grp 3.  Members are listed in the order ADD, OR, ADC, ...
 * (presumably selected by the reg field of the modregrm byte - confirm
 * against the code that indexes this table).
 *
 * NOTE(review): the outermost dimension is declared as 8 but only 4 groups
 * are initialized here; confirm whether the remaining entries are meant to
 * be default (zero) or whether the dimension should be 4.
 */

extern (D) private immutable uint[2][8][8] grprw =
[
    [
        // Grp 1
      [ EA,     F|EA ],           // ADD
      [ EA,     F|EA ],           // OR
      [ F|EA,   F|EA ],           // ADC
      [ F|EA,   F|EA ],           // SBB
      [ EA,     F|EA ],           // AND
      [ EA,     F|EA ],           // SUB
      [ EA,     F|EA ],           // XOR
      [ EA,     F    ],           // CMP
    ],
    [
        // Grp 3
      [ EA,     F ],              // TEST EA,imm
      [ N,      N ],              // reserved
      [ EA,     EA ],             // NOT
      [ EA,     F|EA ],           // NEG
      [ mAX|EA, F|mAX|mDX ],      // MUL
      [ mAX|EA, F|mAX|mDX ],      // IMUL
      [ mAX|mDX|EA, F|mAX|mDX ],  // DIV

        // Could generate an exception we want to catch
        //mAX|mDX|EA|N,   F|mAX|mDX|N,    // IDIV

      [ mAX|mDX|EA,     F|mAX|mDX ],      // IDIV
    ],
    [
        // Grp 5
      [ EA,     F|EA ],           // INC Ev
      [ EA,     F|EA ],           // DEC Ev
      [ N|EA,   N ],              // CALL Ev
      [ N|EA,   N ],              // CALL eP
      [ N|EA,   N ],              // JMP Ev
      [ N|EA,   N ],              // JMP Ep
      [ mSP|EA, mSP|mMEM ],       // PUSH Ev
      [ N,      N ],              // reserved
    ],
    [
        // Grp 3, byte version
      [ EA|B,   F ],              // TEST EA,imm
      [ N,      N ],              // reserved
      [ EA|B,   EA|B ],           // NOT
      [ EA|B,   F|EA|B ],         // NEG
      [ mAX|EA, F|mAX ],          // MUL
      [ mAX|EA, F|mAX ],          // IMUL
      [ mAX|EA, F|mAX ],          // DIV

        // Could generate an exception we want to catch
        //mAX|EA|N,       F|mAX|N,        // IDIV

      [ mAX|EA, F|mAX ],          // IDIV
    ]
];
624 
/********************************************
 * Read/write masks for floating point opcodes 0xD8..0xDF, with Irm < 0xC0.
 * The outer index is the opcode (first row is 0xD8, last is 0xDF); each
 * row holds 8 members.
 *      [][][0] = read
 *          [1] = write
 * S is the floating point stack and C the floating point flags; entries
 * of N,N mark instructions that are not swappable.
 */

extern (D) private immutable uint[2][8][8] grpf1 =
[
    [
        // 0xD8
      [ EA|S,   S|C ],    // FADD  float
      [ EA|S,   S|C ],    // FMUL  float
      [ EA|S,   C ],      // FCOM  float
      [ EA|S,   S|C ],    // FCOMP float
      [ EA|S,   S|C ],    // FSUB  float
      [ EA|S,   S|C ],    // FSUBR float
      [ EA|S,   S|C ],    // FDIV  float
      [ EA|S,   S|C ],    // FDIVR float
    ],
    [
        // 0xD9
      [ EA,     S|C ],    // FLD  float
      [ N,      N ],      //
      [ S,      EA|C ],   // FST  float
      [ S,      EA|S|C ], // FSTP float
      [ N,      N ],      // FLDENV
      [ N,      N ],      // FLDCW
      [ N,      N ],      // FSTENV
      [ N,      N ],      // FSTCW
    ],
    [
        // 0xDA
      [ EA|S,   S|C ],    // FIADD  long
      [ EA|S,   S|C ],    // FIMUL  long
      [ EA|S,   C ],      // FICOM  long
      [ EA|S,   S|C ],    // FICOMP long
      [ EA|S,   S|C ],    // FISUB  long
      [ EA|S,   S|C ],    // FISUBR long
      [ EA|S,   S|C ],    // FIDIV  long
      [ EA|S,   S|C ],    // FIDIVR long
    ],
    [
        // 0xDB
      [ EA,     S|C ],    // FILD long
      [ S,      EA|S|C ], // FISTTP int
      [ S,      EA|C ],   // FIST long
      [ S,      EA|S|C ], // FISTP long
      [ N,      N ],      //
      [ EA,     S|C ],    // FLD real80
      [ N,      N ],      //
      [ S,      EA|S|C ], // FSTP real80
    ],
    [
        // 0xDC
      [ EA|S,   S|C ],    // FADD  double
      [ EA|S,   S|C ],    // FMUL  double
      [ EA|S,   C ],      // FCOM  double
      [ EA|S,   S|C ],    // FCOMP double
      [ EA|S,   S|C ],    // FSUB  double
      [ EA|S,   S|C ],    // FSUBR double
      [ EA|S,   S|C ],    // FDIV  double
      [ EA|S,   S|C ],    // FDIVR double
    ],
    [
        // 0xDD
      [ EA,     S|C ],    // FLD double
      [ S,      EA|S|C ], // FISTTP long
      [ S,      EA|C ],   // FST double
      [ S,      EA|S|C ], // FSTP double
      [ N,      N ],      // FRSTOR
      [ N,      N ],      //
      [ N,      N ],      // FSAVE
      [ C,      EA ],     // FSTSW
    ],
    [
        // 0xDE
      [ EA|S,   S|C ],    // FIADD  short
      [ EA|S,   S|C ],    // FIMUL  short
      [ EA|S,   C ],      // FICOM  short
      [ EA|S,   S|C ],    // FICOMP short
      [ EA|S,   S|C ],    // FISUB  short
      [ EA|S,   S|C ],    // FISUBR short
      [ EA|S,   S|C ],    // FIDIV  short
      [ EA|S,   S|C ],    // FIDIVR short
    ],
    [
        // 0xDF
      [ EA,     S|C ],    // FILD short
      [ S,      EA|S|C ], // FISTTP short
      [ S,      EA|C ],   // FIST short
      [ S,      EA|S|C ], // FISTP short
      [ EA,     S|C ],    // FBLD packed BCD
      [ EA,     S|C ],    // FILD long long
      [ S,      EA|S|C ], // FBSTP packed BCD
      [ S,      EA|S|C ], // FISTP long long
    ]
];
722 
723 
/********************************************
 * Micro-ops for floating point opcodes 0xD8..0xDF, with Irm < 0xC0.
 * Laid out like grpf1[]: the outer index is the opcode (first row is 0xD8),
 * the inner index selects one of the 8 members of that opcode.
 * NOTE(review): the meaning of a 0 entry is not stated here; in insuops[]
 * 0 means 'special case' and 5 means 'complex' - confirm the same applies.
 */

extern (D) private immutable ubyte[8][8] uopsgrpf1 =
[
    [
        // 0xD8
        2,              // FADD  float
        2,              // FMUL  float
        2,              // FCOM  float
        2,              // FCOMP float
        2,              // FSUB  float
        2,              // FSUBR float
        2,              // FDIV  float
        2,              // FDIVR float
    ],
    [
        // 0xD9
        1,              // FLD  float
        0,              //
        2,              // FST  float
        2,              // FSTP float
        5,              // FLDENV
        3,              // FLDCW
        5,              // FSTENV
        5,              // FSTCW
    ],
    [
        // 0xDA
        5,              // FIADD  long
        5,              // FIMUL  long
        5,              // FICOM  long
        5,              // FICOMP long
        5,              // FISUB  long
        5,              // FISUBR long
        5,              // FIDIV  long
        5,              // FIDIVR long
    ],
    [
        // 0xDB
        4,              // FILD long
        0,              // (FISTTP int in grpf1)
        4,              // FIST long
        4,              // FISTP long
        0,              //
        4,              // FLD real80
        0,              //
        5,              // FSTP real80
    ],
    [
        // 0xDC
        2,              // FADD  double
        2,              // FMUL  double
        2,              // FCOM  double
        2,              // FCOMP double
        2,              // FSUB  double
        2,              // FSUBR double
        2,              // FDIV  double
        2,              // FDIVR double
    ],
    [
        // 0xDD
        1,              // FLD double
        0,              // (FISTTP long in grpf1)
        2,              // FST double
        2,              // FSTP double
        5,              // FRSTOR
        0,              //
        5,              // FSAVE
        5,              // FSTSW
    ],
    [
        // 0xDE
        5,              // FIADD  short
        5,              // FIMUL  short
        5,              // FICOM  short
        5,              // FICOMP short
        5,              // FISUB  short
        5,              // FISUBR short
        5,              // FIDIV  short
        5,              // FIDIVR short
    ],
    [
        // 0xDF
        4,              // FILD short
        0,              // (FISTTP short in grpf1)
        4,              // FIST short
        4,              // FISTP short
        5,              // FBLD packed BCD
        4,              // FILD long long
        5,              // FBSTP packed BCD
        4,              // FISTP long long
    ]
];
819 
/**************************************************
 * Number of micro-ops for Pentium Pro and Pentium II processors, one entry
 * per one-byte opcode value (8 entries per row; the trailing comment is the
 * hex value of the row's first opcode).
 * 0 means special case (uops() then works the count out from the opcode
 *   and the modregrm byte),
 * 5 means 'complex'
 *
 * NOTE(review): entry 0x69 holds 3 while uops() has a `case 0x69` in its
 * special-case switch, which is only reached for 0 entries - confirm
 * whether 0x69 was meant to be 0.
 */

extern (D) private immutable ubyte[256] insuops =
[       0,0,0,0,        1,1,4,5,                /* 00 */
        0,0,0,0,        1,1,4,0,                /* 08 */
        0,0,0,0,        2,2,4,5,                /* 10 */
        0,0,0,0,        2,2,4,5,                /* 18 */
        0,0,0,0,        1,1,0,1,                /* 20 */
        0,0,0,0,        1,1,0,1,                /* 28 */
        0,0,0,0,        1,1,0,1,                /* 30 */
        0,0,0,0,        1,1,0,1,                /* 38 */
        1,1,1,1,        1,1,1,1,                /* 40 */
        1,1,1,1,        1,1,1,1,                /* 48 */
        3,3,3,3,        3,3,3,3,                /* 50 */
        2,2,2,2,        3,2,2,2,                /* 58 */
        5,5,5,5,        0,0,0,0,                /* 60 */
        3,3,0,0,        5,5,5,5,                /* 68 */
        1,1,1,1,        1,1,1,1,                /* 70 */
        1,1,1,1,        1,1,1,1,                /* 78 */
        0,0,0,0,        0,0,0,0,                /* 80 */
        0,0,0,0,        0,1,4,0,                /* 88 */
        1,3,3,3,        3,3,3,3,                /* 90 */
        1,1,5,0,        5,5,1,1,                /* 98 */
        1,1,2,2,        5,5,5,5,                /* A0 */
        1,1,3,3,        2,2,3,3,                /* A8 */
        1,1,1,1,        1,1,1,1,                /* B0 */
        1,1,1,1,        1,1,1,1,                /* B8 */
        0,0,5,4,        0,0,0,0,                /* C0 */
        5,3,5,5,        5,3,5,5,                /* C8 */
        0,0,0,0,        4,3,0,2,                /* D0 */
        0,0,0,0,        0,0,0,0,                /* D8 */
        4,4,4,2,        5,5,5,5,                /* E0 */
        4,1,5,1,        5,5,5,5,                /* E8 */
        0,0,5,5,        5,1,0,0,                /* F0 */
        1,1,5,5,        4,4,0,0,                /* F8 */
];
860 
/// Eight-entry micro-op count table.
/// NOTE(review): its use is not visible in this part of the file; presumably
/// indexed by a 3 bit field, with 5 meaning 'complex' as in insuops[] - confirm.
extern (D) private immutable ubyte[8] uopsx = [ 1,1,2,5,1,1,1,5 ];
862 
863 /************************************************
864  * Determine number of micro-ops for Pentium Pro and Pentium II processors.
865  * 5 means 'complex'.
866  * Doesn't currently handle:
867  *      floating point
868  *      MMX
869  *      0F opcodes
870  *      prefix bytes
871  */
872 
private int uops(code *c)
{
    // Every two-byte (0F xx) opcode shares table slot 0x0F.
    const bool twoByte = (c.Iop & 0xFF00) == 0x0F00;
    const int op = twoByte ? 0x0F : (c.Iop & 0xFF);

    int n = insuops[op];
    if (n)
        return n;                       // table already has the answer

    // A table entry of 0 marks a special case: the count depends on ModRM.
    const ubyte irm = c.Irm;
    const bool isReg = ((irm >> 6) & 3) == 3;   // mod == 3: register operand
    const int reg = (irm >> 3) & 7;

    switch (op)
    {
        case 0x10: case 0x11:           // ADC rm,r
        case 0x18: case 0x19:           // SBB rm,r
            n = isReg ? 2 : 4;
            break;

        case 0x12: case 0x13:           // ADC r,rm
        case 0x1A: case 0x1B:           // SBB r,rm
            n = isReg ? 2 : 3;
            break;

        case 0x00: case 0x01:           // ADD rm,r
        case 0x08: case 0x09:           // OR rm,r
        case 0x20: case 0x21:           // AND rm,r
        case 0x28: case 0x29:           // SUB rm,r
        case 0x30: case 0x31:           // XOR rm,r
            n = isReg ? 1 : 4;
            break;

        case 0x02: case 0x03:           // ADD r,rm
        case 0x0A: case 0x0B:           // OR r,rm
        case 0x22: case 0x23:           // AND r,rm
        case 0x2A: case 0x2B:           // SUB r,rm
        case 0x32: case 0x33:           // XOR r,rm
        case 0x38: case 0x39:           // CMP rm,r
        case 0x3A: case 0x3B:           // CMP r,rm
        case 0x69:                      // IMUL rm,r,imm
        case 0x6B:                      // IMUL rm,r,imm8
        case 0x84: case 0x85:           // TEST rm,r
        case 0x88: case 0x89:           // MOV rm,r
            n = isReg ? 1 : 2;
            break;

        case 0x80: .. case 0x83:        // Grp 1: op rm,imm
            switch (reg)
            {
                case 2: case 3:         // ADC/SBB rm,imm
                    n = isReg ? 2 : 4;
                    break;
                case 7:                 // CMP rm,imm
                    n = isReg ? 1 : 2;
                    break;
                default:
                    n = isReg ? 1 : 4;
                    break;
            }
            break;

        case 0x86: case 0x87:           // XCHG rm,r
            n = isReg ? 3 : 5;
            break;

        case 0x8A: case 0x8B:           // MOV r,rm
            n = 1;
            break;

        case 0x8C:                      // MOV Sreg,rm
            n = isReg ? 1 : 3;
            break;

        case 0x8F:
            if (reg == 0)               // POP m
                n = 5;
            break;

        case 0xC6: case 0xC7:
            if (reg == 0)               // MOV rm,imm
                n = isReg ? 1 : 2;
            break;

        case 0xD0: case 0xD1:
            if (!isReg)
                n = 4;
            else
                n = (reg == 2 || reg == 3) ? 2 : 1;     // RCL/RCR rm,1 cost 2
            break;

        case 0xC0: case 0xC1:           // RCL/RCR rm,imm8
        case 0xD2: case 0xD3:
            if (reg == 2 || reg == 3)   // RCL/RCR rm,CL
                n = 5;
            else
                n = isReg ? 1 : 4;
            break;

        case 0xD8: .. case 0xDF:
            // Floating point opcodes
            if (irm < 0xC0)
                n = uopsgrpf1[op - 0xD8][reg];
            else
            {
                // Start from the per-opcode default, then patch the exceptions
                n = uopsx[op - 0xD8];
                if (op == 0xD9)
                {
                    switch (irm)
                    {
                        case 0xE0:                      // FCHS
                            n = 3;
                            break;
                        case 0xE8: .. case 0xED:
                            n = 2;
                            break;
                        case 0xF0: .. case 0xF5:
                        case 0xF8: case 0xF9:
                        case 0xFB: .. case 0xFF:
                            n = 5;
                            break;
                        default:
                            break;
                    }
                }
                else if (op == 0xDE && irm == 0xD9)     // FCOMPP
                    n = 2;
            }
            break;

        case 0xF6:
            switch (reg)
            {
                case 6: case 7:                 // DIV AL,rm8
                    n = isReg ? 3 : 4;
                    break;
                case 0: case 4: case 5:         // MUL/IMUL/TEST rm8
                    n = isReg ? 1 : 2;
                    break;
                case 2: case 3:                 // NOT/NEG rm
                    n = isReg ? 1 : 4;
                    break;
                default:
                    break;
            }
            break;

        case 0xF7:
            switch (reg)
            {
                case 6: case 7:                 // DIV EAX,rm
                    n = 4;
                    break;
                case 4: case 5:                 // MUL/IMUL rm
                    n = isReg ? 3 : 4;
                    break;
                case 2: case 3:                 // NOT/NEG rm
                    n = isReg ? 1 : 4;
                    break;
                default:
                    break;
            }
            break;

        case 0xFF:
            switch (reg)
            {
                case 2: case 3:                 // CALL rm, CALL m,rm
                case 5:                         // JMP seg:offset
                    n = 5;
                    break;
                case 4:
                    n = isReg ? 1 : 2;
                    break;
                case 0: case 1:                 // INC/DEC rm
                    n = isReg ? 1 : 4;
                    break;
                case 6:                         // PUSH rm
                    n = isReg ? 3 : 4;
                    break;
                default:
                    break;
            }
            break;

        case 0x0F:
        {
            const int op2 = c.Iop & 0xFF;       // second opcode byte
            if ((op2 & 0xF0) == 0x80)           // Jcc
                n = 1;
            else if ((op2 & 0xF0) == 0x90)      // SETcc
                n = isReg ? 1 : 3;
            else if (op2 == 0xB6 || op2 == 0xB7 ||      // MOVZX
                     op2 == 0xBE || op2 == 0xBF)        // MOVSX
                n = 1;
            else if (op2 == 0xAF)               // IMUL r,m
                n = isReg ? 1 : 2;
            break;
        }

        default:
            break;
    }

    // Anything still undecided is treated as 'complex' (copout for now)
    return n ? n : 5;
}
1119 
/******************************************
 * Determine pairing classification.
 * Floating point is only partially handled: selected x87 opcodes are
 * classified FX, and D9 C8..CF as PV; all others keep the value from the
 * pentcycl[] table.
 * Returns:
 *      NP,UV,PU,PV or FX, optionally OR'd with PE or PF
 */
1126 
private int pair_class(code *c)
{
    // Of course, with Intel this is *never* simple, and Intel's
    // documentation is vague about the specifics.

    // Two-byte (0F xx) opcodes all use table slot 0x0F.
    const bool twoByte = (c.Iop & 0xFF00) == 0x0F00;
    const ubyte op = cast(ubyte)(twoByte ? 0x0F : (c.Iop & 0xFF));

    // Start from the table value; the switch below refines it.
    int pc = pentcycl[op];

    // Effective address size: the CFaddrsize override flips the default.
    uint a32 = I32;
    if (c.Iflags & CFaddrsize)
        a32 ^= 1;

    const ubyte irm = c.Irm;
    const ubyte mod = (irm >> 6) & 3;
    const ubyte reg = (irm >> 3) & 7;
    const ubyte rm = irm & 7;

    // Set for the immediate-operand forms whose class degrades to NP
    // when the EA also carries a displacement.
    bool checkDisp = false;

    switch (op)
    {
        case 0x0F:                              // 2 byte opcode
            if ((c.Iop & 0xF0) == 0x80)         // if Jcc
                pc = PV | PF;
            break;

        case 0x80:
        case 0x81:
        case 0x83:
            // ADC/SBB EA,immed are U-pipe only;
            // AND/OR/XOR/ADD/SUB/CMP EA,immed go in either pipe
            pc = (reg == 2 || reg == 3) ? PU : UV;
            checkDisp = true;
            break;

        case 0x84:
        case 0x85:                              // TEST EA,reg
            if (mod == 3)                       // TEST reg,reg
                pc = UV;
            break;

        case 0xC0:
        case 0xC1:
            if (reg >= 4)
                pc = PU;
            break;

        case 0xC6:
        case 0xC7:
            if (reg == 0)                       // MOV EA,immed
            {
                pc = UV;
                checkDisp = true;
            }
            break;

        case 0xD9:
            if (irm >= 0xC8 && irm < 0xD0)
                pc = PV;
            else if ((irm < 0xC0 && reg == 0) ||
                     (irm >= 0xC0 && irm < 0xC8) ||
                     irm == 0xE0 || irm == 0xE1 || irm == 0xE4)
                pc = FX;
            break;

        case 0xDB:
        case 0xDF:
            if (irm < 0xC0 && (reg == 0 || reg == 5))
                pc = FX;
            break;

        case 0xDD:
            if ((irm < 0xC0 && reg == 0) ||
                (irm >= 0xE0 && irm < 0xF0))
                pc = FX;
            break;

        case 0xFE:
            if (reg == 0 || reg == 1)           // INC/DEC EA
                pc = UV;
            break;

        case 0xFF:
            switch (reg)
            {
                case 0:
                case 1:                         // INC/DEC EA
                    pc = UV;
                    break;
                case 2:
                case 4:                         // CALL/JMP near ptr EA
                    pc = PE|PV;
                    break;
                case 6:
                    if (mod == 3)               // PUSH reg
                        pc = PE | UV;
                    break;
                default:
                    break;
            }
            break;

        default:
            break;
    }

    if (checkDisp)
    {
        // if EA contains a displacement then
        // can't execute in V, or pair in U
        bool hasDisp;
        if (mod == 1 || mod == 2)
            hasDisp = true;
        else if (mod == 0)
        {
            if (a32)
                hasDisp = rm == 5 || (rm == 4 && (c.Isib & 7) == 5);
            else
                hasDisp = rm == 6;
        }
        if (hasDisp)
            pc = NP;
    }

    if (pc == UV && (c.Iflags & CFPREFIX))      // if prefix byte
        pc = PU;
    return pc;
}
1273 
1274 /******************************************
1275  * For an instruction, determine what is read
1276  * and what is written, and what is used for addressing.
1277  * Determine operand size if EA (larger is ok).
1278  */
1279 
private void getinfo(Cinfo *ci,code *c)
{
    // Overview:
    //  1. Zero *ci; a null c leaves it all zero.
    //  2. Fill the scheduling class: uops/isz when PRO, otherwise pair.
    //  3. Look up the default read/write sets (r,w) in oprw[] and refine
    //     them per opcode, recording stack (spadjust) and FPU stack
    //     (fpuadjust) effects as well as fp_op and flags.
    //  4. Translate the symbolic R and EA bits in r,w into real register
    //     masks using the ModRM/SIB bytes, collecting address registers
    //     in ci.a.
    //
    // Note that this also modifies *c: CFpsw is set for CMP and RCL/RCR,
    // and IFL1/IEV1 are rewritten for A0..A3 and when the EA is normalized.
    memset(ci,0,Cinfo.sizeof);
    if (!c)
        return;
    ci.c = c;

    if (PRO)
    {
        ci.uops = cast(ubyte)uops(c);
        ci.isz = cast(ubyte)calccodsize(c);
    }
    else
        ci.pair = cast(ubyte)pair_class(c);

    ubyte op;
    ubyte op2;
    ubyte irm,mod,reg,rm;
    uint a32;
    uint r,w;
    int sz = I32 ? 4 : 2;               // default operand size

    ci.r = 0;
    ci.w = 0;
    ci.a = 0;
    op = c.Iop & 0xFF;
    if ((c.Iop & 0xFF00) == 0x0F00)     // two-byte opcodes share slot 0x0F
        op = 0x0F;
    //printf("\tgetinfo %x, op %x \n",c,op);
    a32 = I32;
    if (c.Iflags & CFaddrsize)          // address size override flips a32
        a32 ^= 1;
    if (c.Iflags & CFopsize)            // operand size override: 2 <-> 4
        sz ^= 2 | 4;
    irm = c.Irm;
    mod = (irm >> 6) & 3;
    reg = (irm >> 3) & 7;
    rm = irm & 7;

    // Table defaults; the switch below overrides them where needed
    r = oprw[op][0];
    w = oprw[op][1];

    switch (op)
    {
        case 0x50:
        case 0x51:
        case 0x52:
        case 0x53:
        case 0x55:
        case 0x56:
        case 0x57:                              // PUSH reg
            ci.flags |= CIFL.push;
            goto Lpush;

        case 0x54:                              // PUSH ESP
        case 0x6A:                              // PUSH imm8
        case 0x68:                              // PUSH imm
        case 0x0E:
        case 0x16:
        case 0x1E:
        case 0x06:
        case 0x9C:
        Lpush:
            ci.spadjust = -sz;
            ci.a |= mSP;
            break;

        case 0x58:
        case 0x59:
        case 0x5A:
        case 0x5B:
        case 0x5C:
        case 0x5D:
        case 0x5E:
        case 0x5F:                              // POP reg
        case 0x1F:
        case 0x07:
        case 0x17:
        case 0x9D:                              // POPF
        Lpop:
            ci.spadjust = sz;
            ci.a |= mSP;
            break;

        case 0x80:
            if (reg == 7)                       // CMP
                c.Iflags |= CFpsw;
            r = B | grprw[0][reg][0];           // Grp 1 (byte)
            w = B | grprw[0][reg][1];
            break;

        case 0x81:
        case 0x83:
            if (reg == 7)                       // CMP
                c.Iflags |= CFpsw;
            else if (irm == modregrm(3,0,SP))   // ADD ESP,imm
            {
                assert(c.IFL2 == FLconst);
                // 0x83 carries a sign-extended 8 bit immediate
                ci.spadjust = (op == 0x81) ? c.IEV2.Vint : cast(byte)c.IEV2.Vint;
            }
            else if (irm == modregrm(3,5,SP))   // SUB ESP,imm
            {
                assert(c.IFL2 == FLconst);
                ci.spadjust = (op == 0x81) ? -c.IEV2.Vint : -cast(int)cast(byte)c.IEV2.Vint;
            }
            r = grprw[0][reg][0];               // Grp 1
            w = grprw[0][reg][1];
            break;

        case 0x8F:
            if (reg == 0)                       // POP rm
                goto Lpop;
            break;

        case 0xA0:
        case 0xA1:
        case 0xA2:
        case 0xA3:
            // Fake having an EA to simplify code in conflict()
            ci.flags |= CIFL.ea;
            ci.reg = 0;
            ci.sibmodrm = a32 ? modregrm(0,0,5) : modregrm(0,0,6);
            c.IFL1 = c.IFL2;
            c.IEV1 = c.IEV2;
            break;

        case 0xC2:
        case 0xC3:
        case 0xCA:
        case 0xCB:                              // RET
            ci.a |= mSP;
            break;

        case 0xE8:
            if (c.Iflags & CFclassinit)        // call to __j_classinit
            {   r = 0;
                w = F;

version (CLASSINIT2)
                ci.pair = UV;                  // it is patched to CMP EAX,0
else
                ci.pair = NP;

            }
            break;

        case 0xF6:
            r = grprw[3][reg][0];               // Grp 3, byte version
            w = grprw[3][reg][1];
            break;

        case 0xF7:
            r = grprw[1][reg][0];               // Grp 3
            w = grprw[1][reg][1];
            break;

        case 0x0F:
            // Two-byte opcodes store ci.r/ci.w directly and jump to Lret,
            // skipping the R/EA translation below.
            op2 = c.Iop & 0xFF;
            if ((op2 & 0xF0) == 0x80)           // if Jxx instructions
            {
                ci.r = F | N;
                ci.w = N;
                goto Lret;
            }
            ci.r = N;
            ci.w = N;          // copout for now
            goto Lret;

        case 0xD7:                              // XLAT
            ci.a = mAX | mBX;
            break;

        case 0xFF:
            r = grprw[2][reg][0];               // Grp 5
            w = grprw[2][reg][1];
            if (reg == 6)                       // PUSH rm
                goto Lpush;
            break;

        case 0x38:
        case 0x39:
        case 0x3A:
        case 0x3B:
        case 0x3C:                              // CMP AL,imm8
        case 0x3D:                              // CMP EAX,imm32
            // For CMP opcodes, always test for flags
            c.Iflags |= CFpsw;
            break;

        case ESCAPE:
            if (c.Iop == (ESCAPE | ESCadjfpu))
                ci.fpuadjust = c.IEV1.Vint;
            break;

        case 0xD0:
        case 0xD1:
        case 0xD2:
        case 0xD3:
        case 0xC0:
        case 0xC1:
            if (reg == 2 || reg == 3)           // if RCL or RCR
                c.Iflags |= CFpsw;             // always test for flags
            break;

        case 0xD8:
        case 0xD9:
        case 0xDA:
        case 0xDB:
        case 0xDC:
        case 0xDD:
        case 0xDE:
        case 0xDF:
            if (irm < 0xC0)
            {
                // Memory operand form: r,w come from grpf1[]; also record
                // the FPU stack adjustment and the operand size in bytes.
                r = grpf1[op - 0xD8][reg][0];
                w = grpf1[op - 0xD8][reg][1];
                switch (op)
                {
                    case 0xD8:
                        if (reg == 3)           // if FCOMP
                            ci.fpuadjust = -1;
                        else
                            ci.fp_op = FP.fop;
                        break;

                    case 0xD9:
                        if (reg == 0)           // if FLD float
                        {   ci.fpuadjust = 1;
                            ci.fp_op = FP.fld;
                        }
                        else if (reg == 3)      // if FSTP float
                        {   ci.fpuadjust = -1;
                            ci.fp_op = FP.fstp;
                        }
                        else if (reg == 5 || reg == 7)
                            sz = 2;
                        else if (reg == 4 || reg == 6)
                            sz = 28;
                        break;
                    case 0xDA:
                        if (reg == 3)           // if FICOMP
                            ci.fpuadjust = -1;
                        break;
                    case 0xDB:
                        if (reg == 0 || reg == 5)
                        {   ci.fpuadjust = 1;
                            ci.fp_op = FP.fld;  // FILD / FLD long double
                        }
                        if (reg == 3 || reg == 7)
                            ci.fpuadjust = -1;
                        if (reg == 7)
                            ci.fp_op = FP.fstp; // FSTP long double
                        if (reg == 5 || reg == 7)
                            sz = 10;
                        break;
                    case 0xDC:
                        sz = 8;
                        if (reg == 3)           // if FCOMP
                            ci.fpuadjust = -1;
                        else
                            ci.fp_op = FP.fop;
                        break;
                    case 0xDD:
                        if (reg == 0)           // if FLD double
                        {   ci.fpuadjust = 1;
                            ci.fp_op = FP.fld;
                        }
                        if (reg == 3)           // if FSTP double
                        {   ci.fpuadjust = -1;
                            ci.fp_op = FP.fstp;
                        }
                        if (reg == 7)
                            sz = 2;
                        else if (reg == 4 || reg == 6)
                            sz = 108;
                        else
                            sz = 8;
                        break;
                    case 0xDE:
                        sz = 2;
                        if (reg == 3)           // if FICOMP
                            ci.fpuadjust = -1;
                        break;
                    case 0xDF:
                        sz = 2;
                        if (reg == 4 || reg == 6)
                            sz = 10;
                        else if (reg == 5 || reg == 7)
                            sz = 8;
                        if (reg == 0 || reg == 4 || reg == 5)
                            ci.fpuadjust = 1;
                        else if (reg == 3 || reg == 6 || reg == 7)
                            ci.fpuadjust = -1;
                        break;

                    default:
                        break;
                }
                break;
            }
            else if (op == 0xDE)
            {   ci.fpuadjust = -1;             // pop versions of Fop's
                if (irm == 0xD9)
                    ci.fpuadjust = -2;         // FCOMPP
            }

            // Most floating point opcodes aren't staged, but are
            // sent right through, in order to make use of the large
            // latencies with floating point instructions.
            if (ci.fp_op == FP.fld ||
                (op == 0xD9 && (irm & 0xF8) == 0xC0))
            { }                                // FLD ST(i)
            else
                ci.flags |= CIFL.nostage;

            // Register forms (irm >= 0xC0): read/write sets by opcode
            switch (op)
            {
                case 0xD8:
                    r = S;
                    w = C;
                    if ((irm & ~7) == 0xD0)
                        w |= S;
                    break;
                case 0xD9:
                    // FCHS or FABS or FSQRT
                    if (irm == 0xE0 || irm == 0xE1 || irm == 0xFA)
                        ci.fp_op = FP.fop;
                    r = S;
                    w = S|C;
                    break;
                case 0xDA:
                    if (irm == 0xE9)    // FUCOMPP
                    {   r = S;
                        w = S|C;
                        break;
                    }
                    break;
                case 0xDB:
                    if (irm == 0xE2)    // FCLEX
                    {   r = 0;
                        w = C;
                        break;
                    }
                    if (irm == 0xE3)    // FINIT
                    {   r = 0;
                        w = S|C;
                        break;
                    }
                    break;
                case 0xDC:
                case 0xDE:
                    if ((irm & 0xF0) != 0xD0)
                    {   r = S;
                        w = S|C;
                        break;
                    }
                    break;
                case 0xDD:
                    // Not entirely correct, but conservative
                    r = S;
                    w = S|C;
                    break;
                case 0xDF:
                    if (irm == 0xE0)    // FSTSW AX
                    {   r = C;
                        w = mAX;
                        break;
                    }
                    break;

                default:
                    break;
            }
            break;

        default:
            //printf("\t\tNo special case\n");
            break;
    }

    if ((r | w) & B)                            // if byte operation
        sz = 1;                                 // operand size is 1

    // Replace the symbolic R bit with the register named by the reg field.
    // For byte operations only the low two bits of the number are used.
    ci.r = r & ~(R | EA);
    ci.w = w & ~(R | EA);
    if (r & R)
        ci.r |= mask((r & B) ? (reg & 3) : reg);
    if (w & R)
        ci.w |= mask((w & B) ? (reg & 3) : reg);

    // OR in bits for EA addressing mode
    if ((r | w) & EA)
    {   ubyte sib;

        sib = 0;
        switch (mod)
        {
            case 0:
                if (a32)
                {
                    if (rm == 4)
                    {
                        sib = c.Isib;
                        if ((sib & modregrm(0,7,0)) != modregrm(0,4,0))
                            ci.a |= mask((sib >> 3) & 7);      // index register
                        if ((sib & 7) != 5)
                            ci.a |= mask(sib & 7);             // base register
                    }
                    else if (rm != 5)
                        ci.a |= mask(rm);
                }
                else
                {
                    // 16 bit addressing, mod 0: rm == 6 uses no register
                    immutable ubyte[8] ea16 = [mBX|mSI,mBX|mDI,mBP|mSI,mBP|mDI,mSI,mDI,0,mBX];
                    ci.a |= ea16[rm];
                }
                goto Lmem;

            case 1:
            case 2:
                if (a32)
                {
                    if (rm == 4)
                    {
                        sib = c.Isib;
                        if ((sib & modregrm(0,7,0)) != modregrm(0,4,0))
                            ci.a |= mask((sib >> 3) & 7);      // index register
                        ci.a |= mask(sib & 7);                 // base register
                    }
                    else
                        ci.a |= mask(rm);
                }
                else
                {
                    // 16 bit addressing, mod 1 or 2: rm == 6 is based on BP
                    immutable ubyte[8] ea16 = [mBX|mSI,mBX|mDI,mBP|mSI,mBP|mDI,mSI,mDI,mBP,mBX];
                    ci.a |= ea16[rm];
                }

            Lmem:
                if (r & EA)
                    ci.r |= mMEM;
                if (w & EA)
                    ci.w |= mMEM;
                ci.flags |= CIFL.ea;
                break;

            case 3:
                // Register operand: the EA is the register named by rm
                if (r & EA)
                    ci.r |= mask((r & B) ? (rm & 3) : rm);
                if (w & EA)
                    ci.w |= mask((w & B) ? (rm & 3) : rm);
                break;

            default:
                assert(0);
        }
        // Adjust sibmodrm so that addressing modes can be compared simply
        irm &= modregrm(3,0,7);
        if (a32)
        {
            if (irm != modregrm(0,0,5))
            {
                switch (mod)
                {
                case 0:
                    if ((sib & 7) != 5)     // if not disp32[index]
                    {
                        c.IFL1 = FLconst;
                        c.IEV1.Vpointer = 0;
                        irm |= 0x80;
                    }
                    break;
                case 1:
                    c.IEV1.Vpointer = cast(byte) c.IEV1.Vpointer;
                    irm = modregrm(2, 0, rm);
                    break;

                default:
                    break;
                }
            }
        }
        else
        {
            if (irm != modregrm(0,0,6))
            {
                switch (mod)
                {
                    case 0:
                        c.IFL1 = FLconst;
                        c.IEV1.Vpointer = 0;
                        irm |= 0x80;
                        break;
                    case 1:
                        c.IEV1.Vpointer = cast(byte) c.IEV1.Vpointer;
                        irm = modregrm(2, 0, rm);
                        break;

                    default:
                        break;
                }
            }
        }

        ci.r |= ci.a;                   // address registers are read too
        ci.reg = reg;
        ci.sibmodrm = (sib << 8) | irm;
    }
Lret:
    if (ci.w & mSP)                    // if stack pointer is modified
        ci.w |= mMEM;                  // then we are implicitly writing to memory
    if (op == LEA)                     // if LEA
        ci.r &= ~mMEM;                 // memory is not actually read
    ci.sz = cast(ubyte)sz;

    //printf("\t\t"); ci.print();
}
1798 
1799 /******************************************
1800  * Determine if two instructions can pair.
1801  * Assume that in general, cu can pair in the U pipe and cv in the V.
1802  * Look for things like register contentions.
1803  * Input:
1804  *      cu      instruction for U pipe
1805  *      cv      instruction for V pipe
1806  * Returns:
1807  *      !=0 if they can pair
1808  */
1809 
private int pair_test(Cinfo *cu,Cinfo *cv)
{
    const uint pcu = cu.pair;
    if (!(pcu & PU))
    {
        // cu cannot go in the U pipe. The only pairing left is an
        // FX-class cu followed by FXCH (D9 C8..CF) in cv.
        const bool withFxch = (pcu & FX) &&
                              cv.c.Iop == 0xD9 &&
                              (cv.c.Irm & ~7) == 0xC8;
        return withFxch ? 1 : 0;
    }

    const uint pcv = cv.pair;
    if (!(pcv & PV))
        return 0;                       // cv cannot go in the V pipe

    // Registers written by cu that cv reads or writes. Flags and
    // memory are excluded here.
    const uint clash = cu.w & (cv.r | cv.w) & ~(F|mMEM);
    if (clash)
    {
        // Exception: contention on SP alone when both carry PE
        const bool spException = clash == mSP && (pcu & pcv & PE);
        if (!spException)
            return 0;
    }

    // Flags contention: cu writes flags that cv reads, and cv lacks PF
    if ((cu.w & cv.r & F) && !(pcv & PF))
        return 0;

    return 1;
}
1851 
/******************************************
 * Determine if c2 uses, for address generation, a register that c1
 * writes (an AGI).
 * An overlap consisting of exactly SP is not counted when both
 * instructions carry the PE pairing bit.
 * Params:
 *      c1 = earlier instruction
 *      c2 = later instruction
 * Returns:
 *      1 if there is an AGI, 0 if not
 */

private int pair_agi(Cinfo *c1, Cinfo *c2)
{
    const uint overlap = c1.w & c2.a;
    if (!overlap)
        return 0;
    if (overlap == mSP && (c1.pair & c2.pair & PE))
        return 0;
    return 1;
}
1863 
/********************************************
 * Determine if up to three instructions can be decoded in the same
 * clock on Pentium Pro and Pentium II.
 * Params:
 *      c0 = candidate for decoder 0, must not be null
 *      c1 = candidate for decoder 1
 *      c2 = candidate for decoder 2, may be null
 * Returns:
 *      1 if they can decode simultaneously, 0 if not
 *      (always 0 when c1 is null)
 */

private int triple_test(Cinfo *c0, Cinfo *c1, Cinfo *c2)
{
    assert(c0);
    if (c1 is null)
        return 0;

    const int isz2 = c2 ? c2.isz : 0;

    // No single instruction longer than 7 bytes, and no more than
    // 16 bytes in total
    const bool oneTooLong = c0.isz > 7 || c1.isz > 7 || isz2 > 7;
    const bool sumTooLong = c0.isz + c1.isz + isz2 > 16;
    if (oneTooLong || sumTooLong)
        return 0;

    // 4-1-1 decode: decoders 1 and 2 only take single uop instructions
    const bool multiUop = c1.uops > 1 || (c2 && c2.uops > 1);
    return multiUop ? 0 : 1;
}
1891 
/********************************************
 * Advance to the next instruction that matters for scheduling,
 * stepping over NOPs and line number escapes unless they are
 * jump targets.
 * Params:
 *      c = current instruction
 * Returns:
 *      the next interesting instruction, or null if there are no more
 */

private code * cnext(code *c)
{
    for (c = code_next(c); c; c = code_next(c))
    {
        // A jump target is always returned, whatever its opcode
        if (c.Iflags & (CFtarg | CFtarg2))
            break;

        const bool filler = c.Iop == NOP ||
                            c.Iop == (ESCAPE | ESClinnum);
        if (!filler)
            break;
    }
    return c;
}
1913 
1914 /******************************************
1915  * Instruction scheduler.
1916  * Input:
1917  *      c               list of instructions to schedule
1918  *      scratch         scratch registers we can use
1919  * Returns:
1920  *      revised list of scheduled instructions
1921  */
1922 
/******************************************
 * Determine if ci1 and ci2 are swappable, where ci1 comes before ci2.
 *
 * The register/flag/memory read and write sets of the two instructions
 * are compared. A clash that involves only memory is then examined
 * more closely (PUSH special cases, identical addressing modes with
 * non-overlapping offsets, CFunambig references) to see whether the two
 * memory references can be proven distinct.
 *
 * Params:
 *      ci1 = info for the earlier instruction
 *      ci2 = info for the later instruction
 *      fpsched = if 1, and both instructions have fp_op set, then the
 *                FPU stack (S) and C bits are ignored for the dependency
 *                test and ci2.fxch_pre / ci2.fxch_post are adjusted so
 *                that the two can be swapped;
 *                if 2, ci1 is adjusted as well as ci2;
 *                if 0, or if either instruction has no fp_op, no FXCH
 *                adjustment is attempted
 * Returns:
 *      0 if they do not conflict,
 *      0x100 + delay_clocks if they do conflict
 */

private int conflict(Cinfo *ci1,Cinfo *ci2,int fpsched)
{
    code *c1;
    code *c2;
    uint r1,w1,a1;
    uint r2,w2,a2;
    int sz1,sz2;
    int i = 0;                  // debug tracing switch, only set by disabled code
    int delay_clocks;

    c1 = ci1.c;
    c2 = ci2.c;

    //printf("conflict %x %x\n",c1,c2);

    r1 = ci1.r;
    w1 = ci1.w;
    a1 = ci1.a;
    sz1 = ci1.sz;

    r2 = ci2.r;
    w2 = ci2.w;
    a2 = ci2.a;
    sz2 = ci2.sz;

    //printf("r1 %lx w1 %lx a1 %lx sz1 %x\n",r1,w1,a1,sz1);
    //printf("r2 %lx w2 %lx a2 %lx sz2 %x\n",r2,w2,a2,sz2);

    // Volatile and VEX instructions are never reordered
    if ((c1.Iflags | c2.Iflags) & (CFvolatile | CFvex))
        goto Lconflict;

    // Determine if we should handle FPU register conflicts separately
    //if (fpsched) printf("fp_op %d,%d:\n",ci1.fp_op,ci2.fp_op);
    if (fpsched && ci1.fp_op && ci2.fp_op)
    {
        w1 &= ~(S|C);
        r1 &= ~(S|C);
        w2 &= ~(S|C);
        r2 &= ~(S|C);
    }
    else
        fpsched = 0;

    if ((r1 | r2) & N)
    {
        goto Lconflict;
    }

static if (0)
{
    if (c1.Iop == 0xFF && c2.Iop == 0x8B)
    {   c1.print(); c2.print(); i = 1;
        printf("r1=%lx, w1=%lx, a1=%lx, sz1=%d, r2=%lx, w2=%lx, a2=%lx, sz2=%d\n",r1,w1,a1,sz1,r2,w2,a2,sz2);
    }
}
L1:
    // Read-after-write, write-after-read or write-after-write
    if (w1 & r2 || (r1 | w1) & w2)
    {   ubyte ifl1,ifl2;

if (i) printf("test\n");

static if (0)
{
if (c1.IFL1 != c2.IFL1) printf("t1\n");
if ((c1.Irm & modregrm(3,0,7)) != (c2.Irm & modregrm(3,0,7))) printf("t2\n");
if ((issib(c1.Irm) && c1.Isib != c2.Isib)) printf("t3\n");
if (c1.IEV1.Vpointer + sz1 <= c2.IEV1.Vpointer) printf("t4\n");
if (c2.IEV1.Vpointer + sz2 <= c1.IEV1.Vpointer) printf("t5\n");
}

        // make sure CFpsw is reliably set
        if (w1 & w2 & F &&              // if both instructions write to flags
            w1 != F &&
            w2 != F &&
            !((r1 | r2) & F) &&         // but neither instruction reads them
            !((c1.Iflags | c2.Iflags) & CFpsw))       // and we don't care about flags
        {
            w1 &= ~F;
            w2 &= ~F;                   // remove conflict
            goto L1;                    // and try again
        }

        // If other than the memory reference is a conflict
        if (w1 & r2 & ~mMEM || (r1 | w1) & w2 & ~mMEM)
        {   if (i) printf("\t1\n");
            if (i) printf("r1=%x, w1=%x, a1=%x, sz1=%d, r2=%x, w2=%x, a2=%x, sz2=%d\n",r1,w1,a1,sz1,r2,w2,a2,sz2);
            goto Lconflict;
        }

        // If referring to distinct types, then no dependency
        if (c1.Irex && c2.Irex && c1.Irex != c2.Irex)
            goto Lswap;

        ifl1 = c1.IFL1;
        ifl2 = c2.IFL1;

        // Special case: c1 is a PUSH and c2 is an indexed reference.
        // Allow indexed references using registers other than
        // ESP and EBP to be swapped with PUSH instructions
        if (((c1.Iop & ~7) == 0x50 ||          // PUSH reg
             c1.Iop == 0x6A ||                 // PUSH imm8
             c1.Iop == 0x68 ||                 // PUSH imm16/imm32
             (c1.Iop == 0xFF && ci1.reg == 6) // PUSH EA
            ) &&
            ci2.flags & CIFL.ea && !(a2 & mSP) &&
            !(a2 & mBP && cast(int)c2.IEV1.Vpointer < 0)
           )
        {
            if (c1.Iop == 0xFF)
            {
                // PUSH EA reads memory, so c2 must not write memory
                if (!(w2 & mMEM))
                    goto Lswap;
            }
            else
                goto Lswap;
        }

        // Mirror image of the above: c2 is the PUSH and c1 is the indexed
        // reference, so the addressing mode being examined is c1's.
        // The negative-offset-from-EBP exclusion must therefore look at
        // a1 and c1's displacement, not at the PUSH's fields.
        if (((c2.Iop & ~7) == 0x50 ||          // PUSH reg
             c2.Iop == 0x6A ||                 // PUSH imm8
             c2.Iop == 0x68 ||                 // PUSH imm16/imm32
             (c2.Iop == 0xFF && ci2.reg == 6) // PUSH EA
            ) &&
            ci1.flags & CIFL.ea && !(a1 & mSP) &&
            !(a1 & mBP && cast(int)c1.IEV1.Vpointer < 0)
           )
        {
            if (c2.Iop == 0xFF)
            {
                // PUSH EA reads memory, so c1 must not write memory
                if (!(w1 & mMEM))
                    goto Lswap;
            }
            else
                goto Lswap;
        }

        // If not both an EA addressing mode, conflict
        if (!(ci1.flags & ci2.flags & CIFL.ea))
        {   if (i) printf("\t2\n");
            goto Lconflict;
        }

        // Same base/index encoding: the references are distinct if they
        // are of different kinds, or if their byte ranges do not overlap
        if (ci1.sibmodrm == ci2.sibmodrm)
        {   if (ifl1 != ifl2)
                goto Lswap;
            switch (ifl1)
            {
                case FLconst:
                    if (c1.IEV1.Vint != c2.IEV1.Vint &&
                        (c1.IEV1.Vint + sz1 <= c2.IEV1.Vint ||
                         c2.IEV1.Vint + sz2 <= c1.IEV1.Vint))
                        goto Lswap;
                    break;
                case FLdatseg:
                    if (c1.IEV1.Vseg != c2.IEV1.Vseg ||
                        c1.IEV1.Vint + sz1 <= c2.IEV1.Vint ||
                        c2.IEV1.Vint + sz2 <= c1.IEV1.Vint)
                        goto Lswap;
                    break;

                default:
                    break;
            }
        }

        // At least one reference is marked CFunambig, and the two differ
        // in kind, in addressing mode, or in (non-overlapping) offset
        if ((c1.Iflags | c2.Iflags) & CFunambig &&
            (ifl1 != ifl2 ||
             ci1.sibmodrm != ci2.sibmodrm ||
             (c1.IEV1.Vint != c2.IEV1.Vint &&
              (c1.IEV1.Vint + sz1 <= c2.IEV1.Vint ||
               c2.IEV1.Vint + sz2 <= c1.IEV1.Vint)
             )
            )
           )
        {
            // Assume that [EBP] and [ESP] can point to the same location
            if (((a1 | a2) & (mBP | mSP)) == (mBP | mSP))
                goto Lconflict;
            goto Lswap;
        }

        if (i) printf("\t3\n");
        goto Lconflict;
    }

Lswap:
    // No ordinary dependency. For a pair of scheduled FPU instructions,
    // see whether FXCH prefixes/postfixes can make the swap legal.
    if (fpsched)
    {
        //printf("\tfpsched %d,%d:\n",ci1.fp_op,ci2.fp_op);
        ubyte x1 = ci1.fxch_pre;
        ubyte y1 = ci1.fxch_post;
        ubyte x2 = ci2.fxch_pre;
        ubyte y2 = ci2.fxch_post;

        // Pack the two fp_op kinds into one switchable value
        static uint X(uint a, uint b) { return (a << 8) | b; }
        switch (X(ci1.fp_op,ci2.fp_op))
        {
            case X(FP.fstp, FP.fld):
                if (x1 || y1)
                    goto Lconflict;
                if (x2)
                    goto Lconflict;
                if (y2 == 0)
                    ci2.fxch_post++;
                else if (y2 == 1)
                {
                    ci2.fxch_pre++;
                    ci2.fxch_post++;
                }
                else
                {
                    goto Lconflict;
                }
                break;

            case X(FP.fstp, FP.fop):
                if (x1 || y1)
                    goto Lconflict;
                ci2.fxch_pre++;
                ci2.fxch_post++;
                break;

            case X(FP.fop, FP.fop):
                if (x1 == 0 && y1 == 1 && x2 == 0 && y2 == 0)
                {   ci2.fxch_pre = 1;
                    ci2.fxch_post = 1;
                    break;
                }
                if (x1 == 0 && y1 == 0 && x2 == 1 && y2 == 1)
                    break;
                goto Lconflict;

            case X(FP.fop, FP.fld):
                if (x1 || y1)
                    goto Lconflict;
                if (x2)
                    goto Lconflict;
                if (y2)
                    break;
                else if (fpsched == 2)
                    ci1.fxch_post = 1;
                ci2.fxch_post = 1;
                break;

            default:
                goto Lconflict;
        }

        //printf("\tpre = %d, post = %d\n",ci2.fxch_pre,ci2.fxch_post);
    }

    //printf("w1 = x%x, w2 = x%x\n",w1,w2);
    if (i) printf("no conflict\n\n");
    return 0;

Lconflict:
    //printf("r1=%x, w1=%x, r2=%x, w2=%x\n",r1,w1,r2,w2);
    delay_clocks = 0;

    // Determine if AGI
    if (!PRO && pair_agi(ci1,ci2))
        delay_clocks = 1;

    // Special delays for floating point
    if (fpsched)
    {   if (ci1.fp_op == FP.fld && ci2.fp_op == FP.fstp)
            delay_clocks = 1;
        else if (ci1.fp_op == FP.fop && ci2.fp_op == FP.fstp)
            delay_clocks = 3;
        else if (ci1.fp_op == FP.fop && ci2.fp_op == FP.fop)
            delay_clocks = 2;
    }
    else if (PRO)
    {
        // Look for partial register write stalls
        if (w1 & r2 & ALLREGS && sz1 < sz2)
            delay_clocks = 7;
    }
    else if ((w1 | r1) & (w2 | r2) & (C | S))
    {
        // Both touch the FPU stack or C bits; no extra delay when c1 is
        // a load or either one is an FXCH
        int op = c1.Iop;
        int reg = c1.Irm & modregrm(0,7,0);
        if (ci1.fp_op == FP.fld ||
            (op == 0xD9 && (c1.Irm & 0xF8) == 0xC0)
           )
        { }                             // FLD
        else if (op == 0xD9 && (c1.Irm & 0xF8) == 0xC8)
        { }                             // FXCH
        else if (c2.Iop == 0xD9 && (c2.Irm & 0xF8) == 0xC8)
        { }                             // FXCH
        else
            delay_clocks = 3;
    }

    if (i) printf("conflict %d\n\n",delay_clocks);
    return 0x100 + delay_clocks;
}
2232 
/// Number of slots in the scheduling table (Schedule.tbl), and also the
/// number of Cinfo records available per scheduling run (Schedule.cinfo).
enum TBLMAX = 2*3*20;        // must be divisible by both 2 and 3
                             // (U,V pipe in Pentium, 3 decode units
                             //  in Pentium Pro)
2236 
/******************************
 * State of one scheduling run: a table of issue slots (tbl), the pool of
 * Cinfo records describing the instructions seen so far (cinfo), and a
 * staging list of instructions that have been analyzed but not yet
 * placed into the table.
 */
struct Schedule
{
nothrow:
    Cinfo*[TBLMAX] tbl;         // Pentium: even numbers are U pipe, odd numbers are V
                                // Pentium Pro: index modulo 3 is the decode unit
    int tblmax;                 // max number of slots used

    Cinfo[TBLMAX] cinfo;        // storage for the Cinfo of each staged instruction
    int cinfomax;               // number of entries of cinfo[] in use

    Barray!(Cinfo*) stagelist;  // list of instructions in staging area
                                // (entries are set to null once inserted into tbl[])

    int fpustackused;           // number of slots in FPU stack that are used

    /******************************
     * Reset the scheduler to empty.
     * Params:
     *      fpustackinit = initial value for fpustackused
     */
    void initialize(int fpustackinit)          // initialize scheduler
    {
        //printf("Schedule::initialize(fpustackinit = %d)\n", fpustackinit);
        // Zeroes every member, stagelist included.
        // NOTE(review): if Barray owns allocated storage, clearing it this
        // way on a re-initialize would drop that storage - confirm against
        // the Barray implementation.
        memset(&this, 0, Schedule.sizeof);
        fpustackused = fpustackinit;
    }

/******************************
 * Reassemble the scheduled instructions into a linked list.
 * Any instructions still in the staging area are first offered to
 * insert(); those that do not fit are appended after the table contents
 * in their staged order. FXCH instructions are generated for any
 * fxch_pre/fxch_post requests, and fpustackused is updated by the
 * fpuadjust of every emitted instruction.
 * Params:
 *      pc = where to store the start of the list; *pc must be null
 * Returns:
 *      address of the null `next` link that terminates the emitted list
 */
code **assemble(code **pc)  // reassemble scheduled instructions
{
    code *c;

    debug
    if (debugs) printf("assemble:\n");

    assert(!*pc);

    // Try to insert the rest of the staged instructions
    size_t sli;
    for (sli = 0; sli < stagelist.length; ++sli)
    {
        Cinfo* ci = stagelist[sli];
        if (!ci)
            continue;
        if (!insert(ci))
            break;              // sli now indexes the first one that did not fit
    }

    // Get the instructions out of the schedule table
    assert(cast(uint)tblmax <= TBLMAX);
    for (int i = 0; i < tblmax; i++)
    {
        Cinfo* ci = tbl[i];

        debug
        if (debugs)
        {
            if (PRO)
            {   immutable char[4][3] tbl = [ "0  "," 1 ","  2" ];

                // i - ((i / 3) * 3) is the decode unit number
                if (ci)
                    printf("%s %d ",tbl[i - ((i / 3) * 3)].ptr,ci.uops);
                else
                    printf("%s   ",tbl[i - ((i / 3) * 3)].ptr);
            }
            else
            {
                printf((i & 1) ? " V " : "U  ");
            }
            if (ci)
                ci.c.print();
            else
                printf("\n");
        }

        if (!ci)
            continue;
        fpustackused += ci.fpuadjust;
        //printf("assemble()1: fpustackused = %d\n", fpustackused);
        c = ci.c;
        if (i == 0)
            c.Iflags |= CFtarg;        // by definition, first is always a jump target
        else
            c.Iflags &= ~CFtarg;       // the rest are not

        // Put in any FXCH prefix
        if (ci.fxch_pre)
        {   code *cf;
            assert(i);
            cf = gen2(null,0xD9,0xC8 + ci.fxch_pre);
            *pc = cf;
            pc = &cf.next;
        }

        // Link in c, then advance pc to the end of whatever is chained to it
        *pc = c;
        do
        {
            assert(*pc != code_next(*pc));
            pc = &(*pc).next;
        } while (*pc);

        // Put in any FXCH postfix
        if (ci.fxch_post)
        {
            // If the next occupied slot wants the same FXCH as a prefix,
            // drop both
            for (int j = i + 1; j < tblmax; j++)
            {   if (tbl[j])
                {   if (tbl[j].fxch_pre == ci.fxch_post)
                    {
                        tbl[j].fxch_pre = 0;           // they cancel each other out
                        goto L1;
                    }
                    break;
                }
            }
            {   code *cf;
                cf = gen2(null,0xD9,0xC8 + ci.fxch_post);
                *pc = cf;
                pc = &cf.next;
            }
        }
    L1:
    }

    // Just append any instructions left in the staging area
    foreach (ci; stagelist[sli .. stagelist.length])
    {
        if (!ci)
            continue;

        debug
        if (debugs) { printf("appending: "); ci.c.print(); }

        *pc = ci.c;
        do
        {
            pc = &(*pc).next;

        } while (*pc);
        fpustackused += ci.fpuadjust;
        //printf("assemble()2: fpustackused = %d\n", fpustackused);
    }
    stagelist.setLength(0);

    return pc;
}

/******************************
 * Insert c into scheduling table.
 * Scans backwards from the end of the table for the latest instruction
 * that conflicts with ci, then scans forwards from the earliest
 * permitted slot for a hole that satisfies the pairing (Pentium) or
 * decode (Pentium Pro) rules.
 * A MOV reg,offset[ESP] may be moved across PUSH reg and SUB ESP,imm8
 * instructions; its displacement in c.IEV1.Vpointer is adjusted to
 * compensate, and restored to its original value if the insert fails.
 * Params:
 *      ci = instruction to insert
 * Returns:
 *      0       could not be scheduled; have to start a new one
 *      1       ci was placed in tbl[]
 */

int insert(Cinfo *ci)
{   code *c;
    int clocks;
    int i;
    int ic = 0;                 // slot of the conflicting instruction, -1 if none
    int imin;                   // first slot ci is allowed to occupy
    targ_size_t offset;         // written below but not read again in this function
    targ_size_t vpointer;       // original displacement, restored on failure
    int movesp = 0;
    int reg2 = -1;              // avoid "may be uninitialized" warning

    //printf("insert "); ci.c.print();
    //printf("insert() %d\n", fpustackused);
    c = ci.c;
    //printf("\tc.Iop %x\n",c.Iop);
    vpointer = c.IEV1.Vpointer;
    assert(cast(uint)tblmax <= TBLMAX);
    if (tblmax == TBLMAX)               // if out of space
        goto Lnoinsert;
    if (tblmax == 0)                    // if table is empty
    {   // Just stuff it in the first slot
        i = tblmax;
        goto Linsert;
    }
    else if (c.Iflags & (CFtarg | CFtarg2))
        // Jump targets can only be first in the scheduler
        goto Lnoinsert;

    // Special case of:
    //  PUSH reg1
    //  MOV  reg2,x[ESP]
    if (c.Iop == 0x8B &&
        (c.Irm & modregrm(3,0,7)) == modregrm(1,0,4) &&
        c.Isib == modregrm(0,4,SP) &&
        c.IFL1 == FLconst &&
        (cast(byte)c.IEV1.Vpointer) >= REGSIZE
       )
    {
        movesp = 1;                     // this is a MOV reg2,offset[ESP]
        offset = cast(byte)c.IEV1.Vpointer;
        reg2 = (c.Irm >> 3) & 7;
    }


    // Start at tblmax, and back up until we get a conflict
    ic = -1;
    imin = 0;
    for (i = tblmax; i >= 0; i--)
    {
        Cinfo* cit = tbl[i];
        if (!cit)
            continue;

        // Look for special case swap
        if (movesp &&
            (cit.c.Iop & ~7) == 0x50 &&               // if PUSH reg1
            (cit.c.Iop & 7) != reg2 &&                // if reg1 != reg2
            (cast(byte)c.IEV1.Vpointer) >= -cit.spadjust
           )
        {
            // Moving ahead of the PUSH: compensate the [ESP] displacement
            c.IEV1.Vpointer += cit.spadjust;
            //printf("\t1, spadjust = %d, ptr = x%x\n",cit.spadjust,c.IEV1.Vpointer);
            continue;
        }

        if (movesp &&
            cit.c.Iop == 0x83 &&
            cit.c.Irm == modregrm(3,5,SP) &&          // if SUB ESP,offset
            cit.c.IFL2 == FLconst &&
            (cast(byte)c.IEV1.Vpointer) >= -cit.spadjust
           )
        {
            //printf("\t2, spadjust = %d\n",cit.spadjust);
            c.IEV1.Vpointer += cit.spadjust;
            continue;
        }

        clocks = conflict(cit,ci,1);
        if (clocks)
        {   int j;

            ic = i;                     // where the conflict occurred
            clocks &= 0xFF;             // convert to delay count

            // Move forward the delay clocks
            if (clocks == 0)
                j = i + 1;
            else if (PRO)
                // start of the next group of 3, plus 3 slots per clock
                j = (((i + 3) / 3) * 3) + clocks * 3;
            else
            {   // next U pipe slot, plus 2 slots per clock
                j = ((i + 2) & ~1) + clocks * 2;

                // It's possible we skipped over some AGI generating
                // instructions due to movesp.
                int k;
                for (k = i + 1; k < j; k++)
                {
                    if (k >= TBLMAX)
                        goto Lnoinsert;
                    if (tbl[k] && pair_agi(tbl[k],ci))
                    {
                        k = ((k + 2) & ~1) + 1;
                    }
                }
                j = k;
            }

            if (j >= TBLMAX)                    // exceed table size?
                goto Lnoinsert;
            imin = j;                           // first possible slot c can go in
            break;
        }
    }


    // Scan forward looking for a hole to put it in
    for (i = imin; i < TBLMAX; i++)
    {
        if (tbl[i])
        {
            // In case, due to movesp, we skipped over some AGI instructions
            if (!PRO && pair_agi(tbl[i],ci))
            {
                i = ((i + 2) & ~1) + 1;
                if (i >= TBLMAX)
                    goto Lnoinsert;
            }
        }
        else
        {
            if (PRO)
            {   int i0 = (i / 3) * 3;           // index of decode unit 0
                Cinfo *ci0;

                assert(((TBLMAX / 3) * 3) == TBLMAX);
                switch (i - i0)
                {
                    case 0:                     // i0 can handle any instruction
                        goto Linsert;
                    case 1:
                        ci0 = tbl[i0];
                        if (ci.uops > 1)
                        {
                            // Multi-uop ci needs decode unit 0: swap with
                            // its single-uop occupant if that is allowed
                            if (i0 >= imin && ci0.uops == 1)
                                goto L1;
                            i++;
                            break;
                        }
                        if (triple_test(ci0,ci,tbl[i0 + 2]))
                            goto Linsert;
                        break;
                    case 2:
                        ci0 = tbl[i0];
                        if (ci.uops > 1)
                        {
                            if (i0 >= imin && ci0.uops == 1)
                            {
                                if (i >= tblmax)
                                {   if (i + 1 >= TBLMAX)
                                        goto Lnoinsert;
                                    tblmax = i + 1;
                                }
                                // Shift units 0 and 1 up by one so that
                                // ci can take unit 0
                                tbl[i0 + 2] = tbl[i0 + 1];
                                tbl[i0 + 1] = ci0;
                                i = i0;
                                goto Linsert;
                            }
                            break;
                        }
                        if (triple_test(ci0,tbl[i0 + 1],ci))
                            goto Linsert;
                        break;
                    default:
                        assert(0);
                }
            }
            else
            {
                assert((TBLMAX & 1) == 0);
                if (i & 1)                      // if V pipe
                {
                    if (pair_test(tbl[i - 1],ci))
                    {
                        goto Linsert;
                    }
                    else if (i > imin && pair_test(ci,tbl[i - 1]))
                    {
                        // Move the previous slot's occupant into slot i
                        // and give ci the previous slot.
                        // (Also reached by goto from the Pentium Pro case above.)
                L1:
                        tbl[i] = tbl[i - 1];
                        if (i >= tblmax)
                            tblmax = i + 1;
                        i--;
                        //printf("\tswapping with x%02x\n",tbl[i + 1].c.Iop);
                        goto Linsert;
                    }
                }
                else                    // will always fit in U pipe
                {
                    assert(!tbl[i + 1]);        // because V pipe should be empty
                    goto Linsert;
                }
            }
        }
    }

Lnoinsert:
    //printf("\tnoinsert\n");
    c.IEV1.Vpointer = vpointer;  // reset to original value
    return 0;

Linsert:
    // Insert at location i
    assert(i < TBLMAX);
    assert(tblmax <= TBLMAX);
    tbl[i] = ci;
    //printf("\tinsert at location %d\n",i);

    // If it's a scheduled floating point code, we have to adjust
    // the FXCH values
    if (ci.fp_op)
    {
        ci.fxch_pre = 0;
        ci.fxch_post = 0;                      // start over again

        // Running FPU stack depth over the slots below tblmax
        int fpu = fpustackused;
        for (int j = 0; j < tblmax; j++)
        {
            if (tbl[j])
            {
                fpu += tbl[j].fpuadjust;
                if (fpu >= 8)                   // if FPU stack overflow
                {   tbl[i] = null;
                    //printf("fpu stack overflow\n");
                    goto Lnoinsert;
                }
            }
        }

        // Recompute the FXCH adjustments against every instruction that
        // now follows ci in the table; the return value is not used
        for (int j = tblmax; j > i; j--)
        {
            if (j < TBLMAX && tbl[j])
                conflict(tbl[j],ci,2);
        }
    }

    if (movesp)
    {   // Adjust [ESP] offsets

        // Undo the compensation for the stack adjusting instructions
        // that ended up before slot i after all
        //printf("\tic = %d, inserting at %d\n",ic,i);
        assert(cast(uint)tblmax <= TBLMAX);
        for (int j = ic + 1; j < i; j++)
        {
            Cinfo* cit = tbl[j];
            if (cit)
            {
                c.IEV1.Vpointer -= cit.spadjust;
                //printf("\t3, spadjust = %d, ptr = x%x\n",cit.spadjust,c.IEV1.Vpointer);
            }
        }
    }
    if (i >= tblmax)
        tblmax = i + 1;

    // Now do a hack. Look back at immediately preceding instructions,
    // and see if we can swap with a push.
    // (Disabled: the condition below is always false.)
    if (0 && movesp)
    {
        while (1)
        {
            int j;
            for (j = 1; i > j; j++)
                if (tbl[i - j])
                    break;

            if (i >= j && tbl[i - j] &&
                   (tbl[i - j].c.Iop & ~7) == 0x50 &&       // if PUSH reg1
                   (tbl[i - j].c.Iop & 7) != reg2 &&  // if reg1 != reg2
                   cast(byte)c.IEV1.Vpointer >= REGSIZE)
            {
                //printf("\t-4 prec, i-j=%d, i=%d\n",i-j,i);
                assert(cast(uint)i < TBLMAX);
                assert(cast(uint)(i - j) < TBLMAX);
                tbl[i] = tbl[i - j];
                tbl[i - j] = ci;
                i -= j;
                c.IEV1.Vpointer -= REGSIZE;
            }
            else
                break;
        }
    }

    //printf("\tinsert\n");
    return 1;
}

/******************************
 * Insert c into staging area.
 * A Cinfo is filled in for c. Staged instructions that c depends on
 * are first moved from the staging area into the scheduling table, so
 * that they cannot end up after c. c itself is then either appended to
 * the staging list or inserted into the table right away.
 * Params:
 *      c = instruction to stage
 * Returns:
 *      false if could not be scheduled; have to start a new one
 */

bool stage(code *c)
{
    //printf("stage: "); c.print();
    if (cinfomax == TBLMAX)             // if out of space
        return false;
    auto ci = &cinfo[cinfomax++];
    getinfo(ci,c);

    // Jump targets, volatile and VEX instructions are not staged:
    // flush the staging area, then insert c directly
    if (c.Iflags & (CFtarg | CFtarg2 | CFvolatile | CFvex))
    {
        // Insert anything in stagelist
        foreach (ref cs;  stagelist[])
        {
            if (cs)
            {
                if (!insert(cs))
                    return false;
                cs = null;
            }
        }
        return insert(ci) != 0;
    }

    // Look through stagelist, and insert any AGI conflicting instructions
    bool agi = false;
    foreach (ref cs; stagelist[])
    {
        if (cs)
        {
            if (pair_agi(cs,ci))
            {
                if (!insert(cs))
                    goto Lnostage;
                cs = null;
                agi = true;                    // we put out an AGI
            }
        }
    }

    // Look through stagelist, and insert any other conflicting instructions
    foreach (i, ref cs; stagelist[])
    {
        if (!cs)
            continue;
        if (conflict(cs,ci,0) &&                // if conflict
            !(cs.flags & ci.flags & CIFL.push)) // unless both have CIFL.push set
        {
            if (cs.spadjust)
            {
                // We need to insert all previous adjustments to ESP
                foreach (ref ca; stagelist[0 .. i])
                {
                    if (ca && ca.spadjust)
                    {
                        if (!insert(ca))
                            goto Lnostage;
                        ca = null;
                    }
                }
            }

            if (!insert(cs))
                goto Lnostage;
            cs = null;
        }
    }

    // If floating point opcode, don't stage it, send it right out
    // (only when no AGI conflicting instruction was just inserted)
    if (!agi && ci.flags & CIFL.nostage)
    {
        if (!insert(ci))
            goto Lnostage;
        return true;
    }

    stagelist.push(ci);         // append to staging list
    return true;

Lnostage:
    return false;
}

}
2768 
2769 
2770 
/********************************************
 * Detach an instruction from the ones that follow it.
 * Any NOPs, line number escapes, and instructions sharing c's
 * CFclassinit flag that immediately follow c stay attached to it,
 * provided they are not jump targets. The list is cut in front of the
 * first instruction that does not qualify.
 * Params:
 *      c = head of the instruction sequence, may be null
 * Returns:
 *      next instruction (the tail) or
 *      null for no more instructions
 */

private code * csnip(code *c)
{
    if (!c)
        return null;

    const uint classinit = c.Iflags & CFclassinit;

    // link always addresses the `next` field of the last instruction kept
    code **link = &c.next;
    for (c = *link; c; c = *link)
    {
        if (c.Iflags & (CFtarg | CFtarg2))
            break;                      // jump targets always start the tail

        const bool trailer = c.Iop == NOP ||
                             c.Iop == (ESCAPE | ESClinnum) ||
                             (c.Iflags & classinit);
        if (!trailer)
            break;

        link = &c.next;
    }
    *link = null;                       // sever the kept part from the tail
    return c;
}
2801 
2802 
2803 /******************************
2804  * Schedule Pentium instructions,
2805  * based on Steve Russell's algorithm.
2806  */
2807 
2808 private code *schedule(code *c,regm_t scratch)
2809 {
2810     code *cresult = null;
2811     code **pctail = &cresult;
2812     Schedule sch = void;
2813 
2814     sch.initialize(0);                  // initialize scheduling table
2815     while (c)
2816     {
2817         if ((c.Iop == NOP ||
2818              ((c.Iop & ESCAPEmask) == ESCAPE && c.Iop != (ESCAPE | ESCadjfpu)) ||
2819              c.Iflags & CFclassinit) &&
2820             !(c.Iflags & (CFtarg | CFtarg2)))
2821         {   code *cn;
2822 
2823             // Just append this instruction to pctail and go to the next one
2824             *pctail = c;
2825             cn = code_next(c);
2826             c.next = null;
2827             pctail = &c.next;
2828             c = cn;
2829             continue;
2830         }
2831 
2832         //printf("init\n");
2833         sch.initialize(sch.fpustackused);       // initialize scheduling table
2834 
2835         while (c)
2836         {
2837             //printf("insert %p\n",c);
2838             if (!sch.stage(c))          // store c in scheduling table
2839                 break;
2840             c = csnip(c);
2841         }
2842 
2843         //printf("assem %d\n",sch.tblmax);
2844         pctail = sch.assemble(pctail);  // reassemble instruction stream
2845     }
2846 
2847     return cresult;
2848 }
2849 
2850 /**************************************************************************/
2851 
2852 /********************************************
2853  * Replace any occurrence of r1 in EA with r2.
2854  */
2855 
2856 private void repEA(code *c,uint r1,uint r2)
2857 {
2858     uint mod,reg,rm;
2859     uint rmn;
2860 
2861     rmn = c.Irm;
2862     mod = rmn & 0xC0;
2863     reg = rmn & modregrm(0,7,0);
2864     rm =  rmn & 7;
2865 
2866     if (mod == 0xC0 && rm == r1)
2867     { }    //c.Irm = mod | reg | r2;
2868     else if (is32bitaddr(I32,c.Iflags) &&
2869         // If not disp32
2870         (rmn & modregrm(3,0,7)) != modregrm(0,0,5))
2871     {
2872         if (rm == 4)
2873         {   // SIB byte addressing
2874             uint sib;
2875             uint base;
2876             uint index;
2877 
2878             sib = c.Isib;
2879             base = sib & 7;
2880             index = (sib >> 3) & 7;
2881             if (base == r1 &&
2882                 !(r1 == 5 && mod == 0) &&
2883                 !(r2 == 5 && mod == 0)
2884                )
2885                 base = r2;
2886             if (index == r1)
2887                 index = r2;
2888             c.Isib = cast(ubyte)((sib & 0xC0) | (index << 3) | base);
2889         }
2890         else if (rm == r1)
2891         {
2892             if (r1 == BP && r2 == SP)
2893             {   // Replace [EBP] with [ESP]
2894                 c.Irm = cast(ubyte)(mod | reg | 4);
2895                 c.Isib = modregrm(0,4,SP);
2896             }
2897             else if (r2 == BP && mod == 0)
2898             {
2899                 c.Irm = cast(ubyte)(modregrm(1,0,0) | reg | r2);
2900                 c.IFL1 = FLconst;
2901                 c.IEV1.Vint = 0;
2902             }
2903             else
2904                 c.Irm = cast(ubyte)(mod | reg | r2);
2905         }
2906     }
2907 }
2908 
2909 /******************************************
2910  * Instruction scheduler.
2911  * Input:
2912  *      c               list of instructions to schedule
2913  *      scratch         scratch registers we can use
2914  * Returns:
2915  *      revised list of scheduled instructions
2916  */
2917 
2918 /******************************************
2919  * Swap c1 and c2.
2920  * c1 comes before c2.
2921  * Swap in place to not disturb addresses of jmp targets
2922  */
2923 
2924 private void code_swap(code *c1,code *c2)
2925 {   code cs;
2926 
2927     // Special case of:
2928     //  PUSH reg1
2929     //  MOV  reg2,x[ESP]
2930     //printf("code_swap(%x, %x)\n",c1,c2);
2931     if ((c1.Iop & ~7) == 0x50 &&
2932         c2.Iop == 0x8B &&
2933         (c2.Irm & modregrm(3,0,7)) == modregrm(1,0,4) &&
2934         c2.Isib == modregrm(0,4,SP) &&
2935         c2.IFL1 == FLconst &&
2936         (cast(byte)c2.IEV1.Vpointer) >= REGSIZE &&
2937         (c1.Iop & 7) != ((c2.Irm >> 3) & 7)
2938        )
2939         c2.IEV1.Vpointer -= REGSIZE;
2940 
2941 
2942     cs = *c2;
2943     *c2 = *c1;
2944     *c1 = cs;
2945     // Retain original CFtarg
2946     c1.Iflags = (c1.Iflags & ~(CFtarg | CFtarg2)) | (c2.Iflags & (CFtarg | CFtarg2));
2947     c2.Iflags = (c2.Iflags & ~(CFtarg | CFtarg2)) | (cs.Iflags & (CFtarg | CFtarg2));
2948 
2949     c1.next = c2.next;
2950     c2.next = cs.next;
2951 }
2952 
/******************************************
 * Peephole optimizer run over adjacent instruction pairs (c, c1).
 *
 * Pairs where c1 is a jump target are never touched. The rewrites are:
 *  - after PUSH reg, simplify accesses to [ESP] to use reg directly
 *  - MOV reg,[ESP] / ADD ESP,4  =>  POP reg / NOP
 *  - merge two adjacent ADD/SUB imm on the same register
 *  - after MOV r1,r2, replace uses of r1 in c1 with r2 to improve pairing
 * Instructions are modified in place; removed ones become NOPs.
 * Params:
 *      cstart = list of instructions to optimize
 *      scratch = not referenced by this function
 * Returns:
 *      cstart
 */
code *peephole(code *cstart,regm_t scratch)
{
    // Look for cases of:
    //  MOV r1,r2
    //  OP ?,r1
    // we can replace with:
    //  MOV r1,r2
    //  OP ?,r2
    // to improve pairing
    code *c1;
    uint r1,r2;
    uint mod,reg,rm;

    //printf("peephole\n");
    for (code *c = cstart; c; c = c1)
    {
        ubyte rmn;

        //c.print();
        c1 = cnext(c);
    Ln:
        if (!c1)
            break;
        if (c1.Iflags & (CFtarg | CFtarg2))
            continue;

        // Do:
        //      PUSH    reg
        if (I32 && (c.Iop & ~7) == 0x50)
        {
            uint regx = c.Iop & 7;

            //  MOV     regx,[ESP]      =>      NOP
            if (c1.Iop == 0x8B &&
                c1.Irm == modregrm(0,regx,4) &&
                c1.Isib == modregrm(0,4,SP))
            {   c1.Iop = NOP;
                continue;
            }

            //  PUSH    [ESP]           =>      PUSH    regx
            if (c1.Iop == 0xFF &&
                c1.Irm == modregrm(0,6,4) &&
                c1.Isib == modregrm(0,4,SP))
            {   c1.Iop = 0x50 + regx;
                continue;
            }

            //  CMP     [ESP],imm       =>      CMP     regx,imm
            if (c1.Iop == 0x83 &&
                c1.Irm == modregrm(0,7,4) &&
                c1.Isib == modregrm(0,4,SP))
            {   c1.Irm = modregrm(3,7,regx);
                if (c1.IFL2 == FLconst && cast(byte)c1.IEV2.Vuns == 0)
                {   // to TEST regx,regx
                    c1.Iop = (c1.Iop & 1) | 0x84;
                    c1.Irm = modregrm(3,regx,regx);
                }
                continue;
            }

        }

        // Do:
        //      MOV     reg,[ESP]       =>      POP     reg
        //      ADD     ESP,4           =>      NOP
        // The whole ModRM byte of c1 must match: for opcode 0x83 the reg
        // field selects the operation (0 = ADD), so masking it out would
        // also match SUB/AND/OR/XOR/CMP ESP,4 and wrongly delete them.
        // reg must not be ESP itself, because POP ESP leaves ESP equal to
        // the loaded value rather than the loaded value plus 4.
        if (I32 && c.Iop == 0x8B && (c.Irm & 0xC7) == modregrm(0,0,4) &&
            ((c.Irm >> 3) & 7) != SP &&
            c.Isib == modregrm(0,4,SP) &&
            c1.Iop == 0x83 && c1.Irm == modregrm(3,0,SP) &&
            !(c1.Iflags & CFpsw) && c1.IFL2 == FLconst && c1.IEV2.Vint == 4)
        {
            uint regx = (c.Irm >> 3) & 7;
            c.Iop = 0x58 + regx;
            c1.Iop = NOP;
            continue;
        }

        // Combine two ADD/SUB imm of the same register
        if (c.Iop == c1.Iop &&
            c.Iop == 0x83 &&
            (c.Irm & 0xC0) == 0xC0 &&
            (c.Irm & modregrm(3,0,7)) == (c1.Irm & modregrm(3,0,7)) &&
            !(c1.Iflags & CFpsw) &&
            c.IFL2 == FLconst && c1.IFL2 == FLconst
           )
        {   int i = cast(byte)c.IEV2.Vint;
            int i1 = cast(byte)c1.IEV2.Vint;
            // Key is (operation of c << 3) | operation of c1
            switch ((c.Irm & modregrm(0,7,0)) | ((c1.Irm & modregrm(0,7,0)) >> 3))
            {
                case (0 << 3) | 0:              // ADD, ADD
                case (5 << 3) | 5:              // SUB, SUB
                    i += i1;
                    goto Laa;
                case (0 << 3) | 5:              // ADD, SUB
                case (5 << 3) | 0:              // SUB, ADD
                    i -= i1;
                    goto Laa;
                Laa:
                    // Result no longer fits in imm8: switch 0x83 to 0x81
                    if (cast(byte)i != i)
                        c.Iop &= ~2;
                    c.IEV2.Vint = i;
                    c1.Iop = NOP;
                    if (i == 0)
                        c.Iop = NOP;
                    continue;

                default:
                    break;
            }
        }

        if (c.Iop == 0x8B && (c.Irm & 0xC0) == 0xC0)    // MOV r1,r2
        {   r1 = (c.Irm >> 3) & 7;
            r2 = c.Irm & 7;
        }
        else if (c.Iop == 0x89 && (c.Irm & 0xC0) == 0xC0)   // MOV r1,r2
        {   r1 = c.Irm & 7;
            r2 = (c.Irm >> 3) & 7;
        }
        else
        {
            continue;
        }

        // Note: rmn/mod/reg/rm are sampled before repEA() may rewrite c1.Irm
        rmn = c1.Irm;
        mod = rmn & 0xC0;
        reg = rmn & modregrm(0,7,0);
        rm =  rmn & 7;
        if (cod3_EA(c1))
            repEA(c1,r1,r2);
        switch (c1.Iop)
        {
            case 0x50:
            case 0x51:
            case 0x52:
            case 0x53:
            case 0x54:
            case 0x55:
            case 0x56:
            case 0x57:                          // PUSH reg
                if ((c1.Iop & 7) == r1)
                {   c1.Iop = 0x50 | r2;
                    //printf("schedule PUSH reg\n");
                }
                break;

            case 0x81:
            case 0x83:
                // Look for CMP EA,imm
                if (reg == modregrm(0,7,0))
                {
                    if (mod == 0xC0 && rm == r1)
                        c1.Irm = cast(ubyte)(mod | reg | r2);
                }
                break;

            case 0x84:                  // TEST reg,byte ptr EA
                if (r1 >= 4 || r2 >= 4) // if not a byte register
                    break;
                // Only the low 2 bits are compared and replaced, so bit 2
                // of each register field is carried over unchanged
                if ((rmn & 0xC0) == 0xC0)
                {
                    if ((rmn & 3) == r1)
                    {   c1.Irm = rmn = cast(ubyte)((rmn & modregrm(3,7,4)) | r2);
                        //printf("schedule 1\n");
                    }
                }
                if ((rmn & modregrm(0,3,0)) == modregrm(0,r1,0))
                {   c1.Irm = (rmn & modregrm(3,4,7)) | modregrm(0,r2,0);
                    //printf("schedule 2\n");
                }
                break;
            case 0x85:                  // TEST reg,word ptr EA
                if ((rmn & 0xC0) == 0xC0)
                {
                    if ((rmn & 7) == r1)
                    {   c1.Irm = rmn = cast(ubyte)((rmn & modregrm(3,7,0)) | r2);
                        //printf("schedule 3\n");
                    }
                }
                if ((rmn & modregrm(0,7,0)) == modregrm(0,r1,0))
                {   c1.Irm = (rmn & modregrm(3,0,7)) | modregrm(0,r2,0);
                    //printf("schedule 4\n");
                }
                break;

            case 0x89:                  // MOV EA,reg
                if ((rmn & modregrm(0,7,0)) == modregrm(0,r1,0))
                {   c1.Irm = (rmn & modregrm(3,0,7)) | modregrm(0,r2,0);
                    //printf("schedule 5\n");
                    if (c1.Irm == modregrm(3,r2,r2))    // MOV r2,r2
                        goto Lnop;
                }
                break;

            case 0x8B:                  // MOV reg,EA
                if ((rmn & 0xC0) == 0xC0 &&
                    (rmn & 7) == r1)            // if EA == r1
                {   c1.Irm = cast(ubyte)((rmn & modregrm(3,7,0)) | r2);
                    //printf("schedule 6\n");
                    if (c1.Irm == modregrm(3,r2,r2))    // MOV r2,r2
                        goto Lnop;
                }
                break;

            case 0x3C:                  // CMP AL,imm8
                if (r1 == AX && r2 < 4)
                {   c1.Iop = 0x80;
                    c1.Irm = modregrm(3,7,r2);
                    //printf("schedule 7, r2 = %d\n", r2);
                }
                break;

            case 0x3D:                  // CMP AX,imm16
                if (r1 == AX)
                {   c1.Iop = 0x81;
                    c1.Irm = modregrm(3,7,r2);
                    // Use the sign-extended imm8 form when the constant fits
                    if (c1.IFL2 == FLconst &&
                        c1.IEV2.Vuns == cast(byte)c1.IEV2.Vuns)
                        c1.Iop = 0x83;
                    //printf("schedule 8\n");
                }
                break;

            default:
                break;
        }
        continue;
Lnop:
        // c1 became a no-op; retry the same c against the instruction after it
        c1.Iop = NOP;
        c1 = cnext(c1);
        goto Ln;
    }
    return cstart;
}
3187 
3188 /*****************************************************************/
3189 
3190 /**********************************************
3191  * Replace complex instructions with simple ones more conducive
3192  * to scheduling.
3193  */
3194 
3195 code *simpleops(code *c,regm_t scratch)
3196 {   code *cstart;
3197     uint reg;
3198     code *c2;
3199 
3200     // Worry about using registers not saved yet by prolog
3201     scratch &= ~fregsaved;
3202 
3203     if (!(scratch & (scratch - 1)))     // if 0 or 1 registers
3204         return c;
3205 
3206     reg = findreg(scratch);
3207 
3208     cstart = c;
3209     for (code** pc = &cstart; *pc; pc = &(*pc).next)
3210     {
3211         c = *pc;
3212         if (c.Iflags & (CFtarg | CFtarg2 | CFopsize))
3213             continue;
3214         if (c.Iop == 0x83 &&
3215             (c.Irm & modregrm(0,7,0)) == modregrm(0,7,0) &&
3216             (c.Irm & modregrm(3,0,0)) != modregrm(3,0,0)
3217            )
3218         {   // Replace CMP mem,imm with:
3219             //  MOV reg,mem
3220             //  CMP reg,imm
3221             targ_long imm;
3222 
3223             //printf("replacing CMP\n");
3224             c.Iop = 0x8B;
3225             c.Irm = (c.Irm & modregrm(3,0,7)) | modregrm(0,reg,0);
3226 
3227             c2 = code_calloc();
3228             if (reg == AX)
3229                 c2.Iop = 0x3D;
3230             else
3231             {   c2.Iop = 0x83;
3232                 c2.Irm = modregrm(3,7,reg);
3233             }
3234             c2.IFL2 = c.IFL2;
3235             c2.IEV2 = c.IEV2;
3236 
3237             // See if c2 should be replaced by a TEST
3238             imm = c2.IEV2.Vuns;
3239             if (!(c2.Iop & 1))
3240                 imm &= 0xFF;
3241             else if (I32 ? c.Iflags & CFopsize : !(c.Iflags & CFopsize))
3242                 imm = cast(short) imm;
3243             if (imm == 0)
3244             {
3245                 c2.Iop = 0x85;                 // TEST reg,reg
3246                 c2.Irm = modregrm(3,reg,reg);
3247             }
3248             goto L1;
3249         }
3250         else if (c.Iop == 0xFF &&
3251             (c.Irm & modregrm(0,7,0)) == modregrm(0,6,0) &&
3252             (c.Irm & modregrm(3,0,0)) != modregrm(3,0,0)
3253            )
3254         {   // Replace PUSH mem with:
3255             //  MOV reg,mem
3256             //  PUSH reg
3257 
3258            // printf("replacing PUSH\n");
3259             c.Iop = 0x8B;
3260             c.Irm = (c.Irm & modregrm(3,0,7)) | modregrm(0,reg,0);
3261 
3262             c2 = gen1(null,0x50 + reg);
3263         L1:
3264 //c.print();
3265 //c2.print();
3266             c2.next = c.next;
3267             c.next = c2;
3268 
3269             // Switch to another reg
3270             if (scratch & ~mask(reg))
3271                 reg = findreg(scratch & ~mask(reg));
3272         }
3273     }
3274     return cstart;
3275 }
3276 
3277 }