1 /**
2  * Compiler implementation of the
3  * $(LINK2 http://www.dlang.org, D programming language).
4  *
5  * Copyright:   Copyright (C) 1994-1998 by Symantec
6  *              Copyright (C) 2000-2020 by The D Language Foundation, All Rights Reserved
7  * Authors:     $(LINK2 http://www.digitalmars.com, Walter Bright)
8  * License:     $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
9  * Source:      $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/backend/cod3.d, backend/cod3.d)
10  * Documentation:  https://dlang.org/phobos/dmd_backend_cod3.html
11  * Coverage:    https://codecov.io/gh/dlang/dmd/src/master/src/dmd/backend/cod3.d
12  */
13 
14 module dmd.backend.cod3;
15 
16 version (SCPP)
17     version = COMPILE;
18 version (MARS)
19     version = COMPILE;
20 
21 version (COMPILE)
22 {
23 
24 import core.stdc.stdio;
25 import core.stdc.stdlib;
26 import core.stdc.string;
27 
28 import dmd.backend.backend;
29 import dmd.backend.cc;
30 import dmd.backend.cdef;
31 import dmd.backend.cgcse;
32 import dmd.backend.code;
33 import dmd.backend.code_x86;
34 import dmd.backend.codebuilder;
35 import dmd.backend.dlist;
36 import dmd.backend.dvec;
37 import dmd.backend.melf;
38 import dmd.backend.mem;
39 import dmd.backend.el;
40 import dmd.backend.exh;
41 import dmd.backend.global;
42 import dmd.backend.obj;
43 import dmd.backend.oper;
44 import dmd.backend.outbuf;
45 import dmd.backend.rtlsym;
46 import dmd.backend.symtab;
47 import dmd.backend.ty;
48 import dmd.backend.type;
49 import dmd.backend.xmm;
50 
51 version (SCPP)
52 {
53     import parser;
54     import precomp;
55 }
56 
57 extern (C++):
58 
59 nothrow:
60 
// MARS is a compile-time boolean that mirrors `version (MARS)`, so the
// setting can be tested inside ordinary expressions as well.
version (MARS)
    enum MARS = true;
else
    enum MARS = false;

// Declared here without a body; the definition lives elsewhere.
// This file uses its value as the byte size of one general purpose
// register slot (see REGSAVE_save()).
int REGSIZE();

// The globals below are only declared here (`extern`); they are defined
// in other modules. The arrays each have FLMAX elements.
extern __gshared CGstate cgstate;
extern __gshared ubyte[FLMAX] segfl;
extern __gshared bool[FLMAX] stackfl, flinsymtab;
71 
/// Returns: a value with only bit number `m` set
private extern (D) uint mask(uint m)
{
    const uint lowBit = 1;
    return lowBit << m;
}
73 
74 //private void genorreg(ref CodeBuilder c, uint t, uint f) { genregs(c, 0x09, f, t); }
75 
// Declared here (`extern`), defined in another module.
extern __gshared targ_size_t retsize;

enum JMPJMPTABLE = false;               // benchmarking shows it's slower

// 64-bit patterns: MINLL has only the top (sign) bit set,
// MAXLL has every bit except the top one set.
enum MINLL =           0x8000_0000_0000_0000L;
enum MAXLL =           0x7FFF_FFFF_FFFF_FFFFL;
82 
83 /*************
84  * Size in bytes of each instruction.
85  * 0 means illegal instruction.
86  * bit  M:      if there is a modregrm field (EV1 is reserved for modregrm)
87  * bit  T:      if there is a second operand (EV2)
88  * bit  E:      if second operand is only 8 bits
89  * bit  A:      a short version exists for the AX reg
90  * bit  R:      a short version exists for regs
91  * bits 2..0:   size of instruction (excluding optional bytes)
92  */
93 
// Flag bits that are OR'ed together with the instruction size to form the
// entries of the inssize[] and inssize2[] tables.
enum
{
    M = 0x80,   // there is a modregrm field
    T = 0x40,   // there is a second operand (EV2)
    E = 0x20,   // second operand is only 8 bits
    A = 0x10,   // a short version exists for the AX reg
    R = 0x08,   // a short version exists for regs
    W = 0,      // contributes no bits; appears in the 0F 8x entries of inssize2[]
}
103 
// Size/flag table for one byte opcodes, indexed by opcode value.
// Each entry is the instruction size OR'ed with the flag bits M, T, E, A, R;
// an entry of 0 means illegal instruction.
// The table is mutable because cod3_set32() and cod3_set64() patch entries.
private __gshared ubyte[256] inssize =
[       M|2,M|2,M|2,M|2,        T|E|2,T|3,1,1,          /* 00 */
        M|2,M|2,M|2,M|2,        T|E|2,T|3,1,1,          /* 08 */
        M|2,M|2,M|2,M|2,        T|E|2,T|3,1,1,          /* 10 */
        M|2,M|2,M|2,M|2,        T|E|2,T|3,1,1,          /* 18 */
        M|2,M|2,M|2,M|2,        T|E|2,T|3,1,1,          /* 20 */
        M|2,M|2,M|2,M|2,        T|E|2,T|3,1,1,          /* 28 */
        M|2,M|2,M|2,M|2,        T|E|2,T|3,1,1,          /* 30 */
        M|2,M|2,M|2,M|2,        T|E|2,T|3,1,1,          /* 38 */
        1,1,1,1,                1,1,1,1,                /* 40 */
        1,1,1,1,                1,1,1,1,                /* 48 */
        1,1,1,1,                1,1,1,1,                /* 50 */
        1,1,1,1,                1,1,1,1,                /* 58 */
        1,1,M|2,M|2,            1,1,1,1,                /* 60 */
        T|3,M|T|4,T|E|2,M|T|E|3, 1,1,1,1,               /* 68 */
        T|E|2,T|E|2,T|E|2,T|E|2, T|E|2,T|E|2,T|E|2,T|E|2,       /* 70 */
        T|E|2,T|E|2,T|E|2,T|E|2, T|E|2,T|E|2,T|E|2,T|E|2,       /* 78 */
        M|T|E|A|3,M|T|A|4,M|T|E|3,M|T|E|3,      M|2,M|2,M|2,M|A|R|2, /* 80 */
        M|A|2,M|A|2,M|A|2,M|A|2,        M|2,M|2,M|2,M|R|2,      /* 88 */
        1,1,1,1,                1,1,1,1,                /* 90 */
        1,1,T|5,1,              1,1,1,1,                /* 98 */

     // cod3_set32() and cod3_set64() patch the first four entries of this row to T|5
    //  T|5,T|5,T|5,T|5,        1,1,1,1,                /* A0 */
        T|3,T|3,T|3,T|3,        1,1,1,1,                /* A0 */

        T|E|2,T|3,1,1,          1,1,1,1,                /* A8 */
        T|E|2,T|E|2,T|E|2,T|E|2, T|E|2,T|E|2,T|E|2,T|E|2,       /* B0 */
        T|3,T|3,T|3,T|3,        T|3,T|3,T|3,T|3,                /* B8 */
        M|T|E|3,M|T|E|3,T|3,1,  M|2,M|2,M|T|E|R|3,M|T|R|4,      /* C0 */
        T|E|4,1,T|3,1,          1,T|E|2,1,1,            /* C8 */
        M|2,M|2,M|2,M|2,        T|E|2,T|E|2,0,1,        /* D0 */
        /* For the floating instructions, allow room for the FWAIT      */
        M|2,M|2,M|2,M|2,        M|2,M|2,M|2,M|2,        /* D8 */
        T|E|2,T|E|2,T|E|2,T|E|2, T|E|2,T|E|2,T|E|2,T|E|2,       /* E0 */
        T|3,T|3,T|5,T|E|2,              1,1,1,1,                /* E8 */
        1,0,1,1,                1,1,M|A|2,M|A|2,                /* F0 */
        1,1,1,1,                1,1,M|2,M|R|2                   /* F8 */
];
143 
// Read-only table of plain byte counts for one byte opcodes, indexed by
// opcode value. Unlike inssize[], the entries carry no flag bits; 0 appears
// at the same opcodes that inssize[] marks as illegal.
// NOTE(review): presumably the sizes that apply with 32 bit operands and
// addresses -- confirm against the users of this table.
private __gshared const ubyte[256] inssize32 =
[       2,2,2,2,        2,5,1,1,                /* 00 */
        2,2,2,2,        2,5,1,1,                /* 08 */
        2,2,2,2,        2,5,1,1,                /* 10 */
        2,2,2,2,        2,5,1,1,                /* 18 */
        2,2,2,2,        2,5,1,1,                /* 20 */
        2,2,2,2,        2,5,1,1,                /* 28 */
        2,2,2,2,        2,5,1,1,                /* 30 */
        2,2,2,2,        2,5,1,1,                /* 38 */
        1,1,1,1,        1,1,1,1,                /* 40 */
        1,1,1,1,        1,1,1,1,                /* 48 */
        1,1,1,1,        1,1,1,1,                /* 50 */
        1,1,1,1,        1,1,1,1,                /* 58 */
        1,1,2,2,        1,1,1,1,                /* 60 */
        5,6,2,3,        1,1,1,1,                /* 68 */
        2,2,2,2,        2,2,2,2,                /* 70 */
        2,2,2,2,        2,2,2,2,                /* 78 */
        3,6,3,3,        2,2,2,2,                /* 80 */
        2,2,2,2,        2,2,2,2,                /* 88 */
        1,1,1,1,        1,1,1,1,                /* 90 */
        1,1,7,1,        1,1,1,1,                /* 98 */
        5,5,5,5,        1,1,1,1,                /* A0 */
        2,5,1,1,        1,1,1,1,                /* A8 */
        2,2,2,2,        2,2,2,2,                /* B0 */
        5,5,5,5,        5,5,5,5,                /* B8 */
        3,3,3,1,        2,2,3,6,                /* C0 */
        4,1,3,1,        1,2,1,1,                /* C8 */
        2,2,2,2,        2,2,0,1,                /* D0 */
        /* For the floating instructions, don't need room for the FWAIT */
        2,2,2,2,        2,2,2,2,                /* D8 */

        2,2,2,2,        2,2,2,2,                /* E0 */
        5,5,7,2,        1,1,1,1,                /* E8 */
        1,0,1,1,        1,1,2,2,                /* F0 */
        1,1,1,1,        1,1,2,2                 /* F8 */
];
180 
/* Size/flag table for 2 byte opcodes starting with 0x0F, indexed by the
 * second opcode byte. Entries use the same encoding as inssize[]: the size
 * OR'ed with the flag bits M, T, E (W contributes no bits).
 * Mutable because cod3_set32() and cod3_set64() rewrite entries 0x80..0x8F
 * to W|T|6.
 */
private __gshared ubyte[256] inssize2 =
[       M|3,M|3,M|3,M|3,        2,2,2,2,                // 00
        2,2,M|3,2,              2,M|3,2,M|T|E|4,        // 08
        M|3,M|3,M|3,M|3,        M|3,M|3,M|3,M|3,        // 10
        M|3,2,2,2,              2,2,2,2,                // 18
        M|3,M|3,M|3,M|3,        M|3,2,M|3,2,            // 20
        M|3,M|3,M|3,M|3,        M|3,M|3,M|3,M|3,        // 28
        2,2,2,2,                2,2,2,2,                // 30
        M|4,2,M|T|E|5,2,        2,2,2,2,                // 38
        M|3,M|3,M|3,M|3,        M|3,M|3,M|3,M|3,        // 40
        M|3,M|3,M|3,M|3,        M|3,M|3,M|3,M|3,        // 48
        M|3,M|3,M|3,M|3,        M|3,M|3,M|3,M|3,        // 50
        M|3,M|3,M|3,M|3,        M|3,M|3,M|3,M|3,        // 58
        M|3,M|3,M|3,M|3,        M|3,M|3,M|3,M|3,        // 60
        M|3,M|3,M|3,M|3,        M|3,M|3,M|3,M|3,        // 68
        M|T|E|4,M|T|E|4,M|T|E|4,M|T|E|4, M|3,M|3,M|3,2, // 70
        2,2,2,2,                M|3,M|3,M|3,M|3,        // 78
        W|T|4,W|T|4,W|T|4,W|T|4, W|T|4,W|T|4,W|T|4,W|T|4, // 80
        W|T|4,W|T|4,W|T|4,W|T|4, W|T|4,W|T|4,W|T|4,W|T|4, // 88
        M|3,M|3,M|3,M|3, M|3,M|3,M|3,M|3,       // 90
        M|3,M|3,M|3,M|3, M|3,M|3,M|3,M|3,       // 98
        2,2,2,M|3,      M|T|E|4,M|3,2,2,        // A0
        2,2,2,M|3,      M|T|E|4,M|3,M|3,M|3,    // A8
        M|E|3,M|3,M|3,M|3, M|3,M|3,M|3,M|3,     // B0
        M|3,2,M|T|E|4,M|3, M|3,M|3,M|3,M|3,     // B8
        M|3,M|3,M|T|E|4,M|3, M|T|E|4,M|T|E|4,M|T|E|4,M|3,       // C0
        2,2,2,2,        2,2,2,2,                // C8
        M|3,M|3,M|3,M|3, M|3,M|3,M|3,M|3,       // D0
        M|3,M|3,M|3,M|3, M|3,M|3,M|3,M|3,       // D8
        M|3,M|3,M|3,M|3, M|3,M|3,M|3,M|3,       // E0
        M|3,M|3,M|3,M|3, M|3,M|3,M|3,M|3,       // E8
        M|3,M|3,M|3,M|3, M|3,M|3,M|3,M|3,       // F0
        M|3,M|3,M|3,M|3, M|3,M|3,M|3,2          // F8
];
216 
217 /*************************************************
218  * Generate code to save `reg` in `regsave` stack area.
219  * Params:
 *      regsave = register save area on stack
221  *      cdb = where to write generated code
222  *      reg = register to save
223  *      idx = set to location in regsave for use in REGSAVE_restore()
224  */
225 
void REGSAVE_save(ref REGSAVE regsave, ref CodeBuilder cdb, reg_t reg, out uint idx)
{
    if (!isXMMreg(reg))
    {
        // General purpose register: takes the next REGSIZE bytes as they come.
        // Only record an alignment if none has been set yet.
        if (regsave.alignment == 0)
            regsave.alignment = REGSIZE;

        idx = regsave.idx;
        regsave.idx = idx + REGSIZE;

        // MOV idx[RBP],reg
        cdb.genc1(0x89, modregxrm(2, reg, BPRM), FLregsave, cast(targ_uns) idx);
        if (I64)
            code_orrex(cdb.last(), REX_W);
    }
    else
    {
        // XMM register: takes a 16 byte slot starting on a 16 byte boundary
        regsave.alignment = 16;
        idx = (regsave.idx + 15) & ~15;
        regsave.idx = idx + 16;

        // Store xmm to idx[RBP].
        // Haven't yet figured out why stack is not aligned to 16 on 32 bit
        // Linux, so the unaligned store is used there.
        const opcode_t store = (TARGET_LINUX && I32) ? STOUPD : STOAPD;
        cdb.genc1(store, modregxrm(2, reg - XMM0, BPRM), FLregsave, cast(targ_uns) idx);
    }

    reflocal = true;

    // keep high water mark
    if (regsave.top < regsave.idx)
        regsave.top = regsave.idx;
}
256 
257 /*******************************
258  * Restore `reg` from `regsave` area.
259  * Complement REGSAVE_save().
260  */
261 
void REGSAVE_restore(const ref REGSAVE regsave, ref CodeBuilder cdb, reg_t reg, uint idx)
{
    if (!isXMMreg(reg))
    {
        // MOV reg,idx[RBP]
        cdb.genc1(0x8B, modregxrm(2, reg, BPRM), FLregsave, cast(targ_uns) idx);
        if (I64)
            code_orrex(cdb.last(), REX_W);
        return;
    }

    // An XMM save always sets the area's alignment to 16
    assert(regsave.alignment == 16);

    // Load xmm from idx[RBP].
    // Haven't yet figured out why stack is not aligned to 16 on 32 bit
    // Linux, so the unaligned load is used there.
    const opcode_t load = (TARGET_LINUX && I32) ? LODUPD : LODAPD;
    cdb.genc1(load, modregxrm(2, reg - XMM0, BPRM), FLregsave, cast(targ_uns) idx);
}
281 
282 /************************************
283  * Size for vex encoded instruction.
284  */
285 
ubyte vex_inssize(code *c)
{
    assert(c.Iflags & CFvex && c.Ivex.pfx == 0xC4);

    // Not flagged CFvex3: table entry for the opcode plus one byte
    if (!(c.Iflags & CFvex3))
        return cast(ubyte)(inssize2[c.Ivex.op] + 1);

    // Flagged CFvex3: the mmmm field selects the opcode map
    switch (c.Ivex.mmmm)
    {
        case 0: // no prefix
        case 1: // 0F
            return cast(ubyte)(inssize2[c.Ivex.op] + 2);

        case 2: // 0F 38
            return cast(ubyte)(inssize2[0x38] + 1);

        case 3: // 0F 3A
            return cast(ubyte)(inssize2[0x3A] + 1);

        default:
            printf("Iop = %x mmmm = %x\n", c.Iop, c.Ivex.mmmm);
            assert(0);
    }
}
315 
316 /************************************
317  * Determine if there is a modregrm byte for code.
318  */
319 
int cod3_EA(code *c)
{
    const opcode_t lowByte = c.Iop & 0xFF;

    // ESCAPE pseudo ops never have a modregrm byte
    if (lowByte == ESCAPE)
        return 0;

    // Choose the table entry that describes this opcode
    uint entry;
    if ((c.Iop & 0xFFFD00) == 0x0F3800)         // 0F 38 xx and 0F 3A xx
        entry = inssize2[(c.Iop >> 8) & 0xFF];
    else if ((c.Iop & 0xFF00) == 0x0F00)        // 0F xx
        entry = inssize2[lowByte];
    else                                        // one byte opcode
        entry = inssize[lowByte];

    // Non-zero exactly when the M flag is set in the entry
    return entry & M;
}
334 
335 /********************************
336  * setup ALLREGS and BYTEREGS
337  * called by: codgen
338  */
339 
void cod3_initregs()
{
    if (!I64)
    {
        // 16 and 32 bit targets start from the predefined initial sets
        ALLREGS = ALLREGS_INIT;
        BYTEREGS = BYTEREGS_INIT;
        return;
    }

    // 64 bit: the listed registers are all allocatable, and every one of
    // them is also usable as a byte register
    ALLREGS = mAX|mBX|mCX|mDX|mSI|mDI| mR8|mR9|mR10|mR11|mR12|mR13|mR14|mR15;
    BYTEREGS = ALLREGS;
}
353 
354 /********************************
355  * set initial global variable values
356  */
357 
void cod3_setdefault()
{
    // Default set of registers saved across function calls;
    // cod3_set32() and cod3_set64() replace it for their targets.
    fregsaved = mSI | mDI | mBP;
}
362 
363 /********************************
364  * Fix global variables for 386.
365  */
366 
void cod3_set32()
{
    // Opcodes A0..A3 get the size 5 entry instead of the size 3 one in the table
    foreach (op; 0xA0 .. 0xA4)
        inssize[op] = T|5;

    // Every 0F 80..0F 8F entry becomes size 6
    foreach (op; 0x80 .. 0x90)
        inssize2[op] = W|T|6;

    BPRM = 5;                       /* [EBP] addressing mode        */

    // Registers saved across function calls
    regm_t saved = mBP | mBX | mSI | mDI;
    if (config.flags3 & CFG3eseqds)
        saved |= mES;
    fregsaved = saved;

    FLOATREGS = FLOATREGS_32;
    FLOATREGS2 = FLOATREGS2_32;
    DOUBLEREGS = DOUBLEREGS_32;

    // Stack alignment is 16 when config.fpxmmregs is set, otherwise 4
    if (config.fpxmmregs)
        TARGET_STACKALIGN = 16;
    else
        TARGET_STACKALIGN = 4;
}
386 
387 /********************************
388  * Fix global variables for I64.
389  */
390 
void cod3_set64()
{
    // MOV AL,mem / MOV RAX,mem / MOV mem,AL / MOV mem,RAX
    foreach (op; 0xA0 .. 0xA4)
        inssize[op] = T|5;

    // Every 0F 80..0F 8F entry becomes size 6
    foreach (op; 0x80 .. 0x90)
        inssize2[op] = W|T|6;

    BPRM = 5;                           // [RBP] addressing mode

    // Registers saved across function calls
    static if (TARGET_WINDOS)
        fregsaved = mBP | mBX | mDI | mSI | mR12 | mR13 | mR14 | mR15 | mES | mXMM6 | mXMM7; // also XMM8..15;
    else
        fregsaved = mBP | mBX | mR12 | mR13 | mR14 | mR15 | mES;

    FLOATREGS = FLOATREGS_64;
    FLOATREGS2 = FLOATREGS2_64;
    DOUBLEREGS = DOUBLEREGS_64;

    ALLREGS = mAX|mBX|mCX|mDX|mSI|mDI|  mR8|mR9|mR10|mR11|mR12|mR13|mR14|mR15;
    BYTEREGS = ALLREGS;

    // Stack alignment is 16 when config.fpxmmregs is set, otherwise 8
    if (config.fpxmmregs)
        TARGET_STACKALIGN = 16;
    else
        TARGET_STACKALIGN = 8;
}
419 
420 /*********************************
 * Write NOP padding bytes to a segment, e.g. to word or dword align the start of a function.
422  * Params:
423  *      seg = segment to write alignment bytes to
424  *      nbytes = number of alignment bytes to write
425  */
void cod3_align_bytes(int seg, size_t nbytes)
{
    /* Multi-byte NOP encodings, indexed by their length in bytes.
     * Table 4-2 from Intel Instruction Set Reference M-Z
     * only for CPUs: CPUID.01H.EAX[Bytes 11:8] = 0110B or 1111B
     * Entries 0 and 1 exist only to make the index equal the length;
     * they are never selected below.
     */
    static immutable string[10] longNop =
    [
        "",
        "\x90",                                     // NOP
        "\x66\x90",                                 // 66 NOP
        "\x0F\x1F\x00",                             // NOP DWORD ptr [EAX]
        "\x0F\x1F\x40\x00",                         // NOP DWORD ptr [EAX + 00H]
        "\x0F\x1F\x44\x00\x00",                     // NOP DWORD ptr [EAX + EAX*1 + 00H]
        "\x66\x0F\x1F\x44\x00\x00",                 // 66 NOP DWORD ptr [EAX + EAX*1 + 00H]
        "\x0F\x1F\x80\x00\x00\x00\x00",             // NOP DWORD ptr [EAX + 00000000H]
        "\x0F\x1F\x84\x00\x00\x00\x00\x00",         // NOP DWORD ptr [EAX + EAX*1 + 00000000H]
        "\x66\x0F\x1F\x84\x00\x00\x00\x00\x00",     // 66 NOP DWORD ptr [EAX + EAX*1 + 00000000H]
    ];

    // A run of single byte NOPs (XCHG AX,AX)
    static immutable ubyte[15] plainNops = [
        0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90,0x90
    ];

    assert(SegData[seg].SDseg == seg);

    for (size_t remaining = nbytes; remaining != 0; )
    {
        size_t chunk;           // number of bytes written this iteration
        const(char)* bytes;     // the bytes to write

        if (remaining > 1 && (I64 || config.fpxmmregs))
        {
            // One multi-byte NOP: exactly what is left, capped at the longest (9 bytes)
            chunk = remaining < longNop.length ? remaining : longNop.length - 1;
            bytes = longNop[chunk].ptr;
        }
        else
        {
            // Single byte NOPs, at most plainNops.length of them per write
            chunk = remaining < plainNops.length ? remaining : plainNops.length;
            bytes = cast(const(char)*) plainNops.ptr;
        }

        objmod.write_bytes(SegData[seg], cast(uint) chunk, cast(char*) bytes);
        remaining -= chunk;
    }
}
474 
475 /****************************
476  * Align start of function.
477  * Params:
478  *      seg = segment of function
479  */
void cod3_align(int seg)
{
    static if (TARGET_WINDOS)
    {
        // Only pad when optimizing for speed
        if (!(config.flags4 & CFG4speed))
            return;

        // Pick alignment based on CPU target
        const bool wants16 = config.target_cpu == TARGET_80486 ||
                             config.target_cpu >= TARGET_PentiumPro;
        if (!wants16)
            return;

        // 486 does reads on 16 byte boundaries, so if we are near
        // such a boundary (fewer than 8 bytes away), align us to it
        const uint pad = -Offset(seg) & 15;
        if (pad < 8)
            cod3_align_bytes(seg, pad);
    }
    else
    {
        // Always pad up to the next 8 byte boundary
        const uint pad = -Offset(seg) & 7;
        cod3_align_bytes(seg, pad);
    }
}
505 
506 
507 /**********************************
508  * Generate code to adjust the stack pointer by `nbytes`
509  * Params:
510  *      cdb = code builder
511  *      nbytes = number of bytes to adjust stack pointer
512  */
void cod3_stackadj(ref CodeBuilder cdb, int nbytes)
{
    //printf("cod3_stackadj(%d)\n", nbytes);
    const uint grex = I64 ? REX_W << 16 : 0;

    // Positive nbytes: SUB ESP,nbytes
    // Otherwise:       ADD ESP,-nbytes
    const bool subtract = nbytes > 0;
    const uint rm = subtract ? modregrm(3,5,SP) : modregrm(3,0,SP);
    const int amount = subtract ? nbytes : -nbytes;

    cdb.genc2(0x81, grex | rm, amount);
}
527 
528 /**********************************
529  * Generate code to align the stack pointer at `nbytes`
530  * Params:
531  *      cdb = code builder
532  *      nbytes = number of bytes to align stack pointer
533  */
void cod3_stackalign(ref CodeBuilder cdb, int nbytes)
{
    //printf("cod3_stackalign(%d)\n", nbytes);

    // AND ESP,-nbytes  (with REX.W added for 64 bit code)
    const andStackPtr = modregrm(3, 4, SP);
    const rexBits = I64 ? REX_W << 16 : 0;
    cdb.genc2(0x81, rexBits | andStackPtr, -nbytes);
}
541 
542 static if (ELFOBJ)
543 {
544 /* Constructor that links the ModuleReference to the head of
545  * the list pointed to by _Dmoduleref
546  */
void cod3_buildmodulector(Outbuffer* buf, int codeOffset, int refOffset)
{
    /* Bytes written to buf, in this order:
     *      mov     EAX,&ModuleReference    (LEA RAX,ModuleReference[RIP] for 64 bit PIC)
     *      mov     ECX,_Dmodule_ref        (MOV RCX,_Dmodule_ref@GOTPCREL[RIP] for 64 bit PIC)
     *      mov     EDX,[ECX]
     *      mov     [EAX],EDX
     *      mov     [ECX],EAX
     *      ret
     * The relocations for the two address operands are emitted through
     * Obj.writerel(), whose return value advances codeOffset.
     */

    const int codeSeg = CODE;
    enum dataSectionSym = 3;            // STI_DATA

    const bool ripRelative = I64 && (config.flags3 & CFG3pic);
    if (!ripRelative)
    {
        const uint reltype = I64 ? R_X86_64_32 : R_386_32;

        /* movl ModuleReference*, %eax */
        buf.writeByte(0xB8);
        codeOffset += 1;
        codeOffset += Obj.writerel(codeSeg, codeOffset, reltype, dataSectionSym, refOffset);

        /* movl _Dmodule_ref, %ecx */
        buf.writeByte(0xB9);
        codeOffset += 1;
        codeOffset += Obj.writerel(codeSeg, codeOffset, reltype, Obj.external_def("_Dmodule_ref"), 0);
    }
    else
    {
        // LEA RAX,ModuleReference[RIP]
        buf.writeByte(REX | REX_W);
        buf.writeByte(LEA);
        buf.writeByte(modregrm(0,AX,5));
        codeOffset += 3;
        codeOffset += Obj.writerel(codeSeg, codeOffset, R_X86_64_PC32, dataSectionSym, refOffset - 4);

        // MOV RCX,_Dmodule_ref@GOTPCREL[RIP]
        buf.writeByte(REX | REX_W);
        buf.writeByte(0x8B);
        buf.writeByte(modregrm(0,CX,5));
        codeOffset += 3;
        codeOffset += Obj.writerel(codeSeg, codeOffset, R_X86_64_GOTPCREL, Obj.external_def("_Dmodule_ref"), -4);
    }

    // The three linking moves as { opcode, modregrm } pairs;
    // each gets a REX.W prefix in 64 bit code
    static immutable ubyte[2][3] linkMoves =
    [
        [0x8B, 0x11],   /* movl (%ecx), %edx */
        [0x89, 0x10],   /* movl %edx, (%eax) */
        [0x89, 0x01],   /* movl %eax, (%ecx) */
    ];
    foreach (move; linkMoves)
    {
        if (I64)
            buf.writeByte(REX | REX_W);
        buf.writeByte(move[0]);
        buf.writeByte(move[1]);
    }

    buf.writeByte(0xC3); /* ret */
}
604 
605 }
606 
607 
/*****************************
 * Given a type, return a mask of
 * registers to hold that type.
 * Params:
 *      tym     type to get the registers for
 *      tyf     function type
 * Returns:
 *      mask of registers; 0 for TYvoid, TYstruct and TYarray
 */
614 
regm_t regmask(tym_t tym, tym_t tyf)
{
    switch (tybasic(tym))
    {
        // No register at all
        case TYvoid:
        case TYstruct:
        case TYarray:
            return 0;

        // Types that always live in AX
        case TYbool:
        case TYwchar_t:
        case TYchar16:
        case TYchar:
        case TYschar:
        case TYuchar:
        case TYshort:
        case TYushort:
        case TYint:
        case TYuint:
        case TYnullptr:
        case TYnptr:
        case TYnref:
        case TYsptr:
        case TYcptr:
        case TYimmutPtr:
        case TYsharePtr:
        case TYrestrictPtr:
        case TYfgPtr:
            return mAX;

        case TYfloat:
        case TYifloat:
            if (I64)
                return mXMM0;
            if (config.exe & EX_flat)
                return mST0;
            // otherwise the same registers as TYlong
            return I16 ? (mDX | mAX) : mAX;

        case TYlong:
        case TYulong:
        case TYdchar:
            // needs the DX:AX pair only for 16 bit code
            return I16 ? (mDX | mAX) : mAX;

        case TYfptr:
        case TYhptr:
            return mDX | mAX;

        case TYcent:
        case TYucent:
            assert(I64);
            return mDX | mAX;

        case TYvptr:
            return mDX | mBX;

        case TYdouble:
        case TYdouble_alias:
        case TYidouble:
            if (I64)
                return mXMM0;
            if (config.exe & EX_flat)
                return mST0;
            return DOUBLEREGS;

        case TYllong:
        case TYullong:
            if (I64)
                return mAX;
            if (I32)
                return mDX | mAX;
            return DOUBLEREGS;

        case TYldouble:
        case TYildouble:
            return mST0;

        case TYcfloat:
            static if (TARGET_LINUX || TARGET_OSX || TARGET_FREEBSD || TARGET_OPENBSD || TARGET_DRAGONFLYBSD || TARGET_SOLARIS)
            {
                // on these targets, DX:AX for 32 bit code when tyf is TYnfunc
                if (I32 && tybasic(tyf) == TYnfunc)
                    return mDX | mAX;
            }
            // otherwise the same registers as TYcdouble
            goto case;

        case TYcdouble:
            if (I64)
                return mXMM0 | mXMM1;
            return mST01;

        case TYcldouble:
            return mST01;

        // SIMD vector types: 128 bit group
        case TYfloat4:
        case TYdouble2:
        case TYschar16:
        case TYuchar16:
        case TYshort8:
        case TYushort8:
        case TYlong4:
        case TYulong4:
        case TYllong2:
        case TYullong2:

        // SIMD vector types: 256 bit group
        case TYfloat8:
        case TYdouble4:
        case TYschar32:
        case TYuchar32:
        case TYshort16:
        case TYushort16:
        case TYlong8:
        case TYulong8:
        case TYllong4:
        case TYullong4:
            if (!config.fpxmmregs)
            {
                // no XMM registers available: report and terminate
                printf("SIMD operations not supported on this platform\n");
                exit(1);
            }
            return mXMM0;

        default:
            debug WRTYxx(tym);
            assert(0);
    }
}
738 
739 /*******************************
740  * setup register allocator parameters with platform specific data
741  */
void cgreg_dst_regs(reg_t* dst_integer_reg, reg_t* dst_float_reg)
{
    // Report XMM0 as the float destination register and AX as the integer one
    *dst_float_reg   = XMM0;
    *dst_integer_reg = AX;
}
747 
/* Select the register sequence(s) for a variable of type `ty`.
 * `*pseq` always receives a NOREG terminated list of registers.
 * `*pseqmsw` is written only for 32 and 64 bit targets when the type is
 * twice the register size; it then receives the list for the most
 * significant half, and `*pseq` the list for the least significant half.
 */
void cgreg_set_priorities(tym_t ty, const(reg_t)** pseq, const(reg_t)** pseqmsw)
{
    // All lists end with NOREG
    static immutable ubyte[9]  xmmOrder     = [XMM0,XMM1,XMM2,XMM3,XMM4,XMM5,XMM6,XMM7,NOREG];
    static immutable ubyte[5]  pairLsw      = [AX,BX,SI,DI,NOREG];
    static immutable ubyte[3]  pairMsw      = [CX,DX,NOREG];
    // R10 is reserved for the static link
    static immutable ubyte[15] order64      = [AX,CX,DX,SI,DI,R8,R9,R11,BX,R12,R13,R14,R15,BP,NOREG];
    // index registers last
    static immutable ubyte[8]  defaultOrder = [AX,CX,DX,BX,SI,DI,BP,NOREG];
    // index registers first
    static immutable ubyte[8]  indexFirst   = [BX,SI,DI,AX,CX,DX,BP,NOREG];

    const sz = tysize(ty);

    if (tyxmmreg(ty))
    {
        *pseq = xmmOrder.ptr;
        return;
    }

    if (I64 || I32)
    {
        if (sz == REGSIZE * 2)
        {
            // the value occupies a register pair
            *pseq = pairLsw.ptr;
            *pseqmsw = pairMsw.ptr;
        }
        else
        {
            *pseq = I64 ? order64.ptr : defaultOrder.ptr;
        }
        return;
    }

    assert(I16);
    // For pointer types, try to pick index register first;
    // otherwise, try to pick index registers last
    *pseq = typtr(ty) ? indexFirst.ptr : defaultOrder.ptr;
}
803 
804 /*******************************************
805  * Call finally block.
806  * Params:
807  *      bf = block to call
808  *      retregs = registers to preserve across call
809  * Returns:
810  *      code generated
811  */
private code *callFinallyBlock(block *bf, regm_t retregs)
{
    CodeBuilder cdbs; cdbs.ctor();      // save code, then the call itself
    CodeBuilder cdbr; cdbr.ctor();      // restore code, appended at the end

    calledFinally = true;
    uint npush = gensaverestore(retregs,cdbs,cdbr);

    // Padding needed so that npush plus one more REGSIZE is a multiple of STACKALIGN
    int nalign = 0;
    if (STACKALIGN >= 16)
    {
        npush += REGSIZE;
        const misalign = npush & (STACKALIGN - 1);
        if (misalign)
        {
            nalign = STACKALIGN - misalign;
            cod3_stackadj(cdbs, nalign);
        }
    }

    // CALL bf
    cdbs.genc(0xE8,0,0,0,FLblock,cast(targ_size_t)bf);
    regcon.immed.mval = 0;

    // Undo the padding, if any, before the restore code runs
    if (nalign != 0)
        cod3_stackadj(cdbs, -nalign);

    cdbs.append(cdbr);
    return cdbs.finish();
}
835 
836 /*******************************
837  * Generate block exit code
838  */
839 void outblkexitcode(ref CodeBuilder cdb, block *bl, ref int anyspill, const(char)* sflsave, Symbol** retsym, const regm_t mfuncregsave)
840 {
841     CodeBuilder cdb2; cdb2.ctor();
842     elem *e = bl.Belem;
843     block *nextb;
844     regm_t retregs = 0;
845 
846     if (bl.BC != BCasm)
847         assert(bl.Bcode == null);
848 
849     switch (bl.BC)                     /* block exit condition         */
850     {
851         case BCiftrue:
852         {
853             bool jcond = true;
854             block *bs1 = bl.nthSucc(0);
855             block *bs2 = bl.nthSucc(1);
856             if (bs1 == bl.Bnext)
857             {   // Swap bs1 and bs2
858                 block *btmp;
859 
860                 jcond ^= 1;
861                 btmp = bs1;
862                 bs1 = bs2;
863                 bs2 = btmp;
864             }
865             logexp(cdb,e,jcond,FLblock,cast(code *) bs1);
866             nextb = bs2;
867         }
868         L5:
869             if (configv.addlinenumbers && bl.Bsrcpos.Slinnum &&
870                 !(funcsym_p.ty() & mTYnaked))
871             {
872                 //printf("BCiftrue: %s(%u)\n", bl.Bsrcpos.Sfilename ? bl.Bsrcpos.Sfilename : "", bl.Bsrcpos.Slinnum);
873                 cdb.genlinnum(bl.Bsrcpos);
874             }
875             if (nextb != bl.Bnext)
876             {
877                 assert(!(bl.Bflags & BFLepilog));
878                 genjmp(cdb,JMP,FLblock,nextb);
879             }
880             break;
881 
882         case BCjmptab:
883         case BCifthen:
884         case BCswitch:
885         {
886             assert(!(bl.Bflags & BFLepilog));
887             doswitch(cdb,bl);               // hide messy details
888             break;
889         }
890 version (MARS)
891 {
892         case BCjcatch:          // D catch clause of try-catch
893             assert(ehmethod(funcsym_p) != EHmethod.EH_NONE);
894             // Mark all registers as destroyed. This will prevent
895             // register assignments to variables used in catch blocks.
896             getregs(cdb,lpadregs());
897 
898             if (config.ehmethod == EHmethod.EH_DWARF)
899             {
900                 /* Each block must have ESP set to the same value it was at the end
901                  * of the prolog. But the unwinder calls catch blocks with ESP set
902                  * at the value it was when the throwing function was called, which
903                  * may have arguments pushed on the stack.
904                  * This instruction will reset ESP to the correct offset from EBP.
905                  */
906                 cdb.gen1(ESCAPE | ESCfixesp);
907             }
908             goto case_goto;
909 }
910 version (SCPP)
911 {
912         case BCcatch:           // C++ catch clause of try-catch
913             // Mark all registers as destroyed. This will prevent
914             // register assignments to variables used in catch blocks.
915             getregs(cdb,allregs | mES);
916             goto case_goto;
917 
918         case BCtry:
919             usednteh |= EHtry;
920             if (config.exe == EX_WIN32)
921                 usednteh |= NTEHtry;
922             goto case_goto;
923 }
924         case BCgoto:
925             nextb = bl.nthSucc(0);
926             if ((MARS ||
927                  funcsym_p.Sfunc.Fflags3 & Fnteh) &&
928                 ehmethod(funcsym_p) != EHmethod.EH_DWARF &&
929                 bl.Btry != nextb.Btry &&
930                 nextb.BC != BC_finally)
931             {
932                 regm_t retregsx = 0;
933                 gencodelem(cdb,e,&retregsx,true);
934                 int toindex = nextb.Btry ? nextb.Btry.Bscope_index : -1;
935                 assert(bl.Btry);
936                 int fromindex = bl.Btry.Bscope_index;
937 version (MARS)
938 {
939                 if (toindex + 1 == fromindex)
940                 {   // Simply call __finally
941                     if (bl.Btry &&
942                         bl.Btry.nthSucc(1).BC == BCjcatch)
943                     {
944                         goto L5;        // it's a try-catch, not a try-finally
945                     }
946                 }
947 }
948                 if (config.ehmethod == EHmethod.EH_WIN32 && !(funcsym_p.Sfunc.Fflags3 & Feh_none) ||
949                     config.ehmethod == EHmethod.EH_SEH)
950                 {
951                     nteh_unwind(cdb,0,toindex);
952                 }
953                 else
954                 {
955 version (MARS)
956 {
957                 if (toindex + 1 <= fromindex)
958                 {
959                     //c = cat(c, linux_unwind(0, toindex));
960                     block *bt;
961 
962                     //printf("B%d: fromindex = %d, toindex = %d\n", bl.Bdfoidx, fromindex, toindex);
963                     bt = bl;
964                     while ((bt = bt.Btry) != null && bt.Bscope_index != toindex)
965                     {   block *bf;
966 
967                         //printf("\tbt.Bscope_index = %d, bt.Blast_index = %d\n", bt.Bscope_index, bt.Blast_index);
968                         bf = bt.nthSucc(1);
969                         // Only look at try-finally blocks
970                         if (bf.BC == BCjcatch)
971                             continue;
972 
973                         if (bf == nextb)
974                             continue;
975                         //printf("\tbf = B%d, nextb = B%d\n", bf.Bdfoidx, nextb.Bdfoidx);
976                         if (nextb.BC == BCgoto &&
977                             !nextb.Belem &&
978                             bf == nextb.nthSucc(0))
979                             continue;
980 
981                         // call __finally
982                         cdb.append(callFinallyBlock(bf.nthSucc(0), retregsx));
983                     }
984                 }
985 }
986                 }
987                 goto L5;
988             }
989         case_goto:
990         {
991             regm_t retregsx = 0;
992             gencodelem(cdb,e,&retregsx,true);
993             if (anyspill)
994             {   // Add in the epilog code
995                 CodeBuilder cdbstore; cdbstore.ctor();
996                 CodeBuilder cdbload;  cdbload.ctor();
997 
998                 for (int i = 0; i < anyspill; i++)
999                 {   Symbol *s = globsym[i];
1000 
1001                     if (s.Sflags & SFLspill &&
1002                         vec_testbit(dfoidx,s.Srange))
1003                     {
1004                         s.Sfl = sflsave[i];    // undo block register assignments
1005                         cgreg_spillreg_epilog(bl,s,cdbstore,cdbload);
1006                     }
1007                 }
1008                 cdb.append(cdbstore);
1009                 cdb.append(cdbload);
1010             }
1011             nextb = bl.nthSucc(0);
1012             goto L5;
1013         }
1014 
1015         case BC_try:
1016             if (config.ehmethod == EHmethod.EH_NONE || funcsym_p.Sfunc.Fflags3 & Feh_none)
1017             {
1018                 /* Need to use frame pointer to access locals, not the stack pointer,
1019                  * because we'll be calling the BC_finally blocks and the stack will be off.
1020                  */
1021                 needframe = 1;
1022             }
1023             else if (config.ehmethod == EHmethod.EH_SEH || config.ehmethod == EHmethod.EH_WIN32)
1024             {
1025                 usednteh |= NTEH_try;
1026                 nteh_usevars();
1027             }
1028             else
1029                 usednteh |= EHtry;
1030             goto case_goto;
1031 
1032         case BC_finally:
1033             if (ehmethod(funcsym_p) == EHmethod.EH_DWARF)
1034             {
1035                 // Mark scratch registers as destroyed.
1036                 getregsNoSave(lpadregs());
1037 
1038                 regm_t retregsx = 0;
1039                 gencodelem(cdb,bl.Belem,&retregsx,true);
1040 
1041                 // JMP bl.nthSucc(1)
1042                 nextb = bl.nthSucc(1);
1043 
1044                 goto L5;
1045             }
1046             else
1047             {
1048                 if (config.ehmethod == EHmethod.EH_SEH ||
1049                     config.ehmethod == EHmethod.EH_WIN32 && !(funcsym_p.Sfunc.Fflags3 & Feh_none))
1050                 {
1051                     // Mark all registers as destroyed. This will prevent
1052                     // register assignments to variables used in finally blocks.
1053                     getregsNoSave(lpadregs());
1054                 }
1055 
1056                 assert(!e);
1057                 // Generate CALL to finalizer code
1058                 cdb.append(callFinallyBlock(bl.nthSucc(0), 0));
1059 
1060                 // JMP bl.nthSucc(1)
1061                 nextb = bl.nthSucc(1);
1062 
1063                 goto L5;
1064             }
1065 
1066         case BC_lpad:
1067         {
1068             assert(ehmethod(funcsym_p) == EHmethod.EH_DWARF);
1069             // Mark all registers as destroyed. This will prevent
1070             // register assignments to variables used in finally blocks.
1071             getregsNoSave(lpadregs());
1072 
1073             regm_t retregsx = 0;
1074             gencodelem(cdb,bl.Belem,&retregsx,true);
1075 
1076             // JMP bl.nthSucc(0)
1077             nextb = bl.nthSucc(0);
1078             goto L5;
1079         }
1080 
1081         case BC_ret:
1082         {
1083             regm_t retregsx = 0;
1084             gencodelem(cdb,e,&retregsx,true);
1085             if (ehmethod(funcsym_p) == EHmethod.EH_DWARF)
1086             {
1087             }
1088             else
1089                 cdb.gen1(0xC3);   // RET
1090             break;
1091         }
1092 
1093 static if (NTEXCEPTIONS)
1094 {
1095         case BC_except:
1096         {
1097             assert(!e);
1098             usednteh |= NTEH_except;
1099             nteh_setsp(cdb,0x8B);
1100             getregsNoSave(allregs);
1101             nextb = bl.nthSucc(0);
1102             goto L5;
1103         }
1104         case BC_filter:
1105         {
1106             nteh_filter(cdb, bl);
1107             // Mark all registers as destroyed. This will prevent
1108             // register assignments to variables used in filter blocks.
1109             getregsNoSave(allregs);
1110             regm_t retregsx = regmask(e.Ety, TYnfunc);
1111             gencodelem(cdb,e,&retregsx,true);
1112             cdb.gen1(0xC3);   // RET
1113             break;
1114         }
1115 }
1116 
1117         case BCretexp:
1118             reg_t reg1, reg2, lreg, mreg;
1119             reg1 = reg2 = NOREG;
1120             if (config.exe == EX_WIN64) // broken
1121                 retregs = regmask(e.Ety, funcsym_p.ty());
1122             else
1123             {
1124                 retregs = allocretregs(e.Ety, e.ET, funcsym_p.ty(), &reg1, &reg2);
1125                 assert(reg1 != NOREG || !retregs);
1126             }
1127 
1128             lreg = mreg = NOREG;
1129             if (reg1 == NOREG)
1130             {}
1131             else if (tybasic(e.Ety) == TYcfloat)
1132                 lreg = ST01;
1133             else if (mask(reg1) & (mST0 | mST01))
1134                 lreg = reg1;
1135             else if (reg2 == NOREG)
1136                 lreg = reg1;
1137             else if (mask(reg1) & XMMREGS)
1138             {
1139                 lreg = XMM0;
1140                 mreg = XMM1;
1141             }
1142             else
1143             {
1144                 lreg = mask(reg1) & mLSW ? reg1 : AX;
1145                 mreg = mask(reg2) & mMSW ? reg2 : DX;
1146             }
1147             if (reg1 != NOREG)
1148                 retregs = (mask(lreg) | mask(mreg)) & ~mask(NOREG);
1149 
1150             // For the final load into the return regs, don't set regcon.used,
1151             // so that the optimizer can potentially use retregs for register
1152             // variable assignments.
1153 
1154             if (config.flags4 & CFG4optimized)
1155             {   regm_t usedsave;
1156 
1157                 docommas(cdb,&e);
1158                 usedsave = regcon.used;
1159                 if (!OTleaf(e.Eoper))
1160                     gencodelem(cdb,e,&retregs,true);
1161                 else
1162                 {
1163                     if (e.Eoper == OPconst)
1164                         regcon.mvar = 0;
1165                     gencodelem(cdb,e,&retregs,true);
1166                     regcon.used = usedsave;
1167                     if (e.Eoper == OPvar)
1168                     {   Symbol *s = e.EV.Vsym;
1169 
1170                         if (s.Sfl == FLreg && s.Sregm != mAX)
1171                             *retsym = s;
1172                     }
1173                 }
1174             }
1175             else
1176             {
1177                 gencodelem(cdb,e,&retregs,true);
1178             }
1179 
1180             if (reg1 == NOREG)
1181             {
1182             }
1183             else if ((mask(reg1) | mask(reg2)) & (mST0 | mST01))
1184             {
1185                 assert(reg1 == lreg && reg2 == NOREG);
1186             }
1187             // fix return registers
1188             else if (tybasic(e.Ety) == TYcfloat)
1189             {
1190                 assert(lreg == ST01);
1191                 if (I64)
1192                 {
1193                     assert(reg2 == NOREG);
1194                     // spill
1195                     pop87();
1196                     pop87();
1197                     cdb.genfltreg(0xD9, 3, tysize(TYfloat));
1198                     genfwait(cdb);
1199                     cdb.genfltreg(0xD9, 3, 0);
1200                     genfwait(cdb);
1201                     // reload
1202                     if (config.exe == EX_WIN64)
1203                     {
1204                         assert(reg1 == AX);
1205                         cdb.genfltreg(LOD, reg1, 0);
1206                         code_orrex(cdb.last(), REX_W);
1207                     }
1208                     else
1209                     {
1210                         assert(reg1 == XMM0);
1211                         cdb.genxmmreg(xmmload(TYdouble), reg1, 0, TYdouble);
1212                     }
1213                 }
1214                 else
1215                 {
1216                     assert(reg1 == AX && reg2 == DX);
1217                     regm_t pretregs = mask(reg1) | mask(reg2);
1218                     fixresult_complex87(cdb, e, retregs, &pretregs);
1219                 }
1220             }
1221             else if (reg2 == NOREG)
1222                 assert(lreg == reg1);
1223             else for (int v = 0; v < 2; v++)
1224             {
1225                 if (v ^ (reg1 != mreg))
1226                     genmovreg(cdb, reg1, lreg);
1227                 else
1228                     genmovreg(cdb, reg2, mreg);
1229             }
1230             if (reg1 != NOREG)
1231                 retregs = (mask(reg1) | mask(reg2)) & ~mask(NOREG);
1232             goto L4;
1233 
1234         case BCret:
1235         case BCexit:
1236             retregs = 0;
1237             gencodelem(cdb,e,&retregs,true);
1238         L4:
1239             if (retregs == mST0)
1240             {   assert(global87.stackused == 1);
1241                 pop87();                // account for return value
1242             }
1243             else if (retregs == mST01)
1244             {   assert(global87.stackused == 2);
1245                 pop87();
1246                 pop87();                // account for return value
1247             }
1248 
1249             if (bl.BC == BCexit)
1250             {
1251                 if (config.flags4 & CFG4optimized)
1252                     mfuncreg = mfuncregsave;
1253             }
1254             else if (MARS || usednteh & NTEH_try)
1255             {
1256                 block *bt = bl;
1257                 while ((bt = bt.Btry) != null)
1258                 {
1259                     block *bf = bt.nthSucc(1);
1260 version (MARS)
1261 {
1262                     // Only look at try-finally blocks
1263                     if (bf.BC == BCjcatch)
1264                     {
1265                         continue;
1266                     }
1267 }
1268                     if (config.ehmethod == EHmethod.EH_WIN32 && !(funcsym_p.Sfunc.Fflags3 & Feh_none) ||
1269                         config.ehmethod == EHmethod.EH_SEH)
1270                     {
1271                         if (bt.Bscope_index == 0)
1272                         {
1273                             // call __finally
1274                             CodeBuilder cdbs; cdbs.ctor();
1275                             CodeBuilder cdbr; cdbr.ctor();
1276 
1277                             nteh_gensindex(cdb,-1);
1278                             gensaverestore(retregs,cdbs,cdbr);
1279                             cdb.append(cdbs);
1280                             cdb.genc(0xE8,0,0,0,FLblock,cast(targ_size_t)bf.nthSucc(0));
1281                             regcon.immed.mval = 0;
1282                             cdb.append(cdbr);
1283                         }
1284                         else
1285                         {
1286                             nteh_unwind(cdb,retregs,~0);
1287                         }
1288                         break;
1289                     }
1290                     else
1291                     {
1292                         // call __finally
1293                         cdb.append(callFinallyBlock(bf.nthSucc(0), retregs));
1294                     }
1295                 }
1296             }
1297             break;
1298 
1299         case BCasm:
1300         {
1301             assert(!e);
1302             // Mark destroyed registers
1303             CodeBuilder cdbx; cdbx.ctor();
1304             getregs(cdbx,iasm_regs(bl));         // mark destroyed registers
1305             code *c = cdbx.finish();
1306             if (bl.Bsucc)
1307             {   nextb = bl.nthSucc(0);
1308                 if (!bl.Bnext)
1309                 {
1310                     cdb.append(bl.Bcode);
1311                     cdb.append(c);
1312                     goto L5;
1313                 }
1314                 if (nextb != bl.Bnext &&
1315                     bl.Bnext &&
1316                     !(bl.Bnext.BC == BCgoto &&
1317                      !bl.Bnext.Belem &&
1318                      nextb == bl.Bnext.nthSucc(0)))
1319                 {
1320                     // See if already have JMP at end of block
1321                     code *cl = code_last(bl.Bcode);
1322                     if (!cl || cl.Iop != JMP)
1323                     {
1324                         cdb.append(bl.Bcode);
1325                         cdb.append(c);
1326                         goto L5;        // add JMP at end of block
1327                     }
1328                 }
1329             }
1330             cdb.append(bl.Bcode);
1331             break;
1332         }
1333 
1334         default:
1335             debug
1336             printf("bl.BC = %d\n",bl.BC);
1337             assert(0);
1338     }
1339 }
1340 
/***************************
 * Allocate registers for function return values.
 *
 * The return type is first reduced to at most two scalar parts (`ty1`, `ty2`),
 * then each part is given either a general purpose register, an XMM register
 * or an x87 stack register.
 *
 * Params:
 *    ty    = return type
 *    t     = return type extended info; must be non-null when `ty` is a struct
 *    tyf   = function type
 *    reg1  = output for the first part register, NOREG if there is none
 *    reg2  = output for the second part register, NOREG if there is none
 *
 * Returns:
 *    a bit mask of return registers.
 *    0 if function returns on the stack or returns void.
 */
regm_t allocretregs(tym_t ty, type *t, tym_t tyf, reg_t *reg1, reg_t *reg2)
{
    tym_t ty1 = ty;
    tym_t ty2 = TYMAX;          // TYMAX means "there is no second part"

    *reg1 = *reg2 = NOREG;

    if (tybasic(ty) == TYvoid)
        return 0;

    // Mixed XMM/GPR pairs are flagged on the type itself
    if (ty & mTYxmmgpr)
    {
        ty1 = TYdouble;
        ty2 = TYllong;
    }
    else if (ty & mTYgprxmm)
    {
        ty1 = TYllong;
        ty2 = TYdouble;
    }

    if (tybasic(ty) == TYstruct)
    {
        assert(t);
        ty1 = t.Tty;
    }

    // Reduce aggregate/complex/wide types to their register-sized parts
    switch (tyrelax(ty1))
    {
        case TYcent:
            if (!I64 || config.exe == EX_WIN64)
                return 0;
            ty1 = ty2 = TYllong;
            break;

        case TYcdouble:
            if (tybasic(tyf) == TYjfunc && I32)
                break;
            if (!I64 || config.exe == EX_WIN64)
                return 0;
            ty1 = ty2 = TYdouble;
            break;

        case TYcfloat:
            if (tybasic(tyf) == TYjfunc && I32)
                break;
            if (!I64)
                goto case TYllong;
            if (config.exe == EX_WIN64)
                ty1 = TYllong;
            else
                ty1 = TYdouble;
            break;

        case TYcldouble:
            if (tybasic(tyf) == TYjfunc && I32)
                break;
            if (!I64 || config.exe == EX_WIN64)
                return 0;
            break;

        case TYllong:
            if (!I64)
                ty1 = ty2 = TYlong;
            break;

        case TYarray:
            type* targ1, targ2;
            argtypes(t, targ1, targ2);
            if (targ1)
                ty1 = targ1.Tty;
            else
                return 0;
            if (targ2)
                ty2 = targ2.Tty;
            break;

        case TYstruct:
            assert(t);
            if (I64 && config.exe != EX_WIN64)
            {
                assert(tybasic(t.Tty) == TYstruct);
                type *targ1 = t.Ttag.Sstruct.Sarg1type;
                type *targ2 = t.Ttag.Sstruct.Sarg2type;
                if (targ1)
                    ty1 = targ1.Tty;
                else
                    return 0;
                if (targ2)
                    ty2 = targ2.Tty;
                break;
            }
            else if (!(t.Ttag.Sstruct.Sflags & STRnotpod))
            {
                // windows only, return POD of 1, 2, 4, or 8 bytes on EAX(:EDX)
                if (!(config.exe & (EX_WIN64 | EX_WIN32)))
                    return 0;

                uint sz = cast(uint) type_size(t);

                if (sz > 8 || sz == 0)
                    return 0;

                if (sz == 8)
                {
                    if (config.exe == EX_WIN64)
                        ty1 = TYllong;
                    else
                        ty1 = ty2 = TYlong;
                }
                else if (sz == 4 || sz == 2 || sz == 1)
                    ty1 = TYlong;
                else
                    return 0;

                break;
            }
            return 0;

        default:
            break;
    }


    /* Hands out return registers in order: AX then DX for general purpose
     * values, XMM0 then XMM1 for XMM values.
     */
    static struct RetRegsAllocator
    {
    nothrow:
        // Read-only lookup tables; immutable so they are not mutable per-thread state
        static immutable reg_t[2] gp_regs = [AX, DX];
        static immutable reg_t[2] xmm_regs = [XMM0, XMM1];

        uint cntgpr = 0,
             cntxmm = 0;

        reg_t gpr() { return gp_regs[cntgpr++]; }
        reg_t xmm() { return xmm_regs[cntxmm++]; }
    }

    // First pass assigns *reg1 from ty1, second pass assigns *reg2 from ty2
    tym_t tym = ty1;
    reg_t *reg = reg1;
    RetRegsAllocator rralloc;
    for (int v = 0; v < 2; ++v)
    {
        if (tym == TYMAX) continue;
        switch (tysize(tym))
        {
        case 1:
        case 2:
        case 4:
            if (tyfloating(tym))
            {
                if (I64)
                    *reg = rralloc.xmm();
                else
                    *reg = ST0;
            }
            else
                *reg = rralloc.gpr();
            break;

        case 8:
            if (tycomplex(tym))
            {
                assert(tybasic(tyf) == TYjfunc && I32);
                *reg = ST01;
                break;
            }
            assert(I64 || tyfloating(tym));
            goto case 4;

        default:
            if (tybasic(tym) == TYldouble || tybasic(tym) == TYildouble)
            {
                *reg = ST0;
                break;
            }
            else if (tybasic(tym) == TYcldouble)
            {
                *reg = ST01;
                break;
            }
            else if (tycomplex(tym) && tybasic(tyf) == TYjfunc && I32)
            {
                *reg = ST01;
                break;
            }
            else if (tysimd(tym))
            {
                *reg = rralloc.xmm();
                break;
            }

            debug WRTYxx(tym);
            assert(0);
        }
        tym = ty2;
        reg = reg2;
    }

    /* Only registers that were actually assigned contribute to the mask.
     * NOREG is a sentinel, not a register number, so it must never be used
     * as a shift count in mask().
     */
    regm_t retregs = 0;
    if (*reg1 != NOREG)
        retregs |= mask(*reg1);
    if (*reg2 != NOREG)
        retregs |= mask(*reg2);
    return retregs;
}
1554 
1555 /***********************************************
1556  * Struct necessary for sorting switch cases.
1557  */
1558 
1559 alias _compare_fp_t = extern(C) nothrow int function(const void*, const void*);
1560 extern(C) void qsort(void* base, size_t nmemb, size_t size, _compare_fp_t compar);
1561 
extern (C)  // qsort cmp functions need to be "C"
{
/* One switch case: the case value paired with the block control
 * transfers to when the switch operand equals that value.
 */
struct CaseVal
{
    targ_ullong val;    // case value, ordered as an unsigned quantity by cmp()
    block *target;      // block to jump to on a match

    /* qsort() comparator giving ascending order of `val`.
     * Returns -1, 0 or 1 for less than, equal and greater than.
     */
    extern (C) static nothrow int cmp(scope const(void*) p, scope const(void*) q)
    {
        const lhs = (cast(const(CaseVal)*)p).val;
        const rhs = (cast(const(CaseVal)*)q).val;

        if (lhs < rhs)
            return -1;
        if (lhs == rhs)
            return 0;
        return 1;
    }
}
}
1578 
/***
 * Generate comparison of [reg2,reg] with val.
 *
 * Params:
 *      cdb  = code builder the instructions are appended to
 *      val  = constant to compare against
 *      sz   = size in bytes of the value being compared
 *      reg  = register holding the value (its low half when reg2 is in use)
 *      reg2 = register holding the high half of the value, or NOREG
 *      sreg = scratch register, needed when a 64 bit constant is not a
 *             sign-extended 32 bit value
 */
private void cmpval(ref CodeBuilder cdb, targ_llong val, uint sz, reg_t reg, reg_t reg2, reg_t sreg)
{
    const targ_size_t imm = cast(targ_size_t)val;

    if (I64 && sz == 8)
    {
        // A 64 bit operand always lives in a single register
        assert(reg2 == NOREG);

        if (val != cast(int)val)
        {
            // Constant does not fit a sign-extended imm32: go through the scratch register
            assert(sreg != NOREG);
            movregconst(cdb,sreg,imm,64);               // MOV sreg,val64
            genregs(cdb,0x3B,reg,sreg);                 // CMP reg,sreg
            code_orrex(cdb.last(), REX_W);
            getregsNoSave(mask(sreg));                  // don't remember we loaded this constant
            return;
        }

        cdb.genc2(0x81,modregrmx(3,7,reg),imm);         // CMP reg,value32
        cdb.last().Irex |= REX_W;                       // 64 bit operand
        return;
    }

    if (reg2 == NOREG)
    {
        cdb.genc2(0x81,modregrmx(3,7,reg),imm);         // CMP reg,casevalue
        return;
    }

    // Register pair: compare the high halves first and only look at the
    // low halves when the high halves are equal.
    cdb.genc2(0x81,modregrm(3,7,reg2),cast(targ_size_t)MSREG(val));  // CMP reg2,MSREG(casevalue)
    code *skip = gennop(null);
    genjmp(cdb,JNE,FLcode,cast(block *) skip);          // JNE skip
    cdb.genc2(0x81,modregrm(3,7,reg),imm);              // CMP reg,casevalue
    cdb.append(skip);
}
1612 
/***
 * Generate the compare-and-branch sequence that dispatches a switch value
 * to one of the targets in casevals[0..ncases].
 *
 * With 4 or more cases and CFG4speed set, the comparisons are arranged as a
 * binary search around the middle case; otherwise a linear run of CMP/JE
 * is produced.
 *
 * Params:
 *      cdb      = code builder the instructions are appended to
 *      casevals = cases to test (the caller in this file sorts them with
 *                 CaseVal.cmp before calling)
 *      ncases   = number of entries in casevals
 *      sz       = size in bytes of the switch value
 *      reg      = register holding the switch value (low half if reg2 is used)
 *      reg2     = register holding the high half, or NOREG
 *      sreg     = scratch register handed on to cmpval(), or NOREG
 *      bdefault = block to go to when no case matches
 *      last     = if true, finish the linear sequence with a JMP to bdefault
 */
private void ifthen(ref CodeBuilder cdb, CaseVal *casevals, size_t ncases,
        uint sz, reg_t reg, reg_t reg2, reg_t sreg, block *bdefault, bool last)
{
    const bool bisect = ncases >= 4 && (config.flags4 & CFG4speed);

    if (!bisect)
    {
        // Not worth doing a binary search, just do a sequence of CMP/JE
        foreach (i; 0 .. ncases)
        {
            CaseVal *cv = casevals + i;
            targ_llong caseval = cv.val;

            cmpval(cdb, caseval, sz, reg, reg2, sreg);
            code *skip = null;
            if (reg2 != NOREG)
            {
                skip = gennop(null);
                genjmp(cdb,JNE,FLcode,cast(block *) skip);      // JNE skip
                cdb.genc2(0x81,modregrm(3,7,reg2),cast(targ_size_t)MSREG(caseval));   // CMP reg2,MSREG(casevalue)
            }
            genjmp(cdb,JE,FLblock,cv.target);                   // JE caseaddr
            cdb.append(skip);
        }

        if (last)       // if default is not next block
            genjmp(cdb,JMP,FLblock,bdefault);
        return;
    }

    const size_t mid = ncases >> 1;
    CaseVal *pivot = casevals + mid;

    // Comparisons for the cases below the pivot. This half is followed by
    // more code, so it must always end with an explicit jump to the default.
    CodeBuilder lower; lower.ctor();
    ifthen(lower, casevals, mid, sz, reg, reg2, sreg, bdefault, true);

    // Comparisons for the cases above the pivot
    CodeBuilder upper; upper.ctor();
    ifthen(upper, pivot + 1, ncases - mid - 1, sz, reg, reg2, sreg, bdefault, last);
    code *upperEntry = gennop(null);

    // Test the pivot itself, then pick a half
    cmpval(cdb, pivot.val, sz, reg, reg2, sreg);
    genjmp(cdb,JE,FLblock,pivot.target);                // JE target
    // Unsigned jump, because the case values are compared as unsigned in CaseVal.cmp
    genjmp(cdb,JA,FLcode,cast(block *) upperEntry);     // JA upperEntry

    cdb.append(lower);
    cdb.append(upperEntry);
    cdb.append(upper);
}
1660 
1661 /*******************************
1662  * Generate code for blocks ending in a switch statement.
1663  * Take BCswitch and decide on
1664  *      BCifthen        use if - then code
1665  *      BCjmptab        index into jump table
1666  *      BCswitch        search table for match
1667  */
1668 
1669 void doswitch(ref CodeBuilder cdb, block *b)
1670 {
1671     targ_ulong msw;
1672 
1673     // If switch tables are in code segment and we need a CS: override to get at them
1674     bool csseg = cast(bool)(config.flags & CFGromable);
1675 
1676     //printf("doswitch(%d)\n", b.BC);
1677     elem *e = b.Belem;
1678     elem_debug(e);
1679     docommas(cdb,&e);
1680     cgstate.stackclean++;
1681     tym_t tys = tybasic(e.Ety);
1682     int sz = _tysize[tys];
1683     bool dword = (sz == 2 * REGSIZE);
1684     bool mswsame = true;                // assume all msw's are the same
1685     targ_llong *p = b.Bswitch;          // pointer to case data
1686     assert(p);
1687     uint ncases = cast(uint)*p++;       // number of cases
1688 
1689     targ_llong vmax = MINLL;            // smallest possible llong
1690     targ_llong vmin = MAXLL;            // largest possible llong
1691     for (uint n = 0; n < ncases; n++)   // find max and min case values
1692     {
1693         targ_llong val = *p++;
1694         if (val > vmax) vmax = val;
1695         if (val < vmin) vmin = val;
1696         if (REGSIZE == 2)
1697         {
1698             ushort ms = (val >> 16) & 0xFFFF;
1699             if (n == 0)
1700                 msw = ms;
1701             else if (msw != ms)
1702                 mswsame = 0;
1703         }
1704         else // REGSIZE == 4
1705         {
1706             targ_ulong ms = (val >> 32) & 0xFFFFFFFF;
1707             if (n == 0)
1708                 msw = ms;
1709             else if (msw != ms)
1710                 mswsame = 0;
1711         }
1712     }
1713     p -= ncases;
1714     //dbg_printf("vmax = x%lx, vmin = x%lx, vmax-vmin = x%lx\n",vmax,vmin,vmax - vmin);
1715 
1716     /* Three kinds of switch strategies - pick one
1717      */
1718     if (ncases <= 3)
1719         goto Lifthen;
1720     else if (I16 && cast(targ_ullong)(vmax - vmin) <= ncases * 2)
1721         goto Ljmptab;           // >=50% of the table is case values, rest is default
1722     else if (cast(targ_ullong)(vmax - vmin) <= ncases * 3)
1723         goto Ljmptab;           // >= 33% of the table is case values, rest is default
1724     else if (I16)
1725         goto Lswitch;
1726     else
1727         goto Lifthen;
1728 
1729     /*************************************************************************/
1730     {   // generate if-then sequence
1731     Lifthen:
1732         regm_t retregs = ALLREGS;
1733         b.BC = BCifthen;
1734         scodelem(cdb,e,&retregs,0,true);
1735         reg_t reg, reg2;
1736         if (dword)
1737         {   reg = findreglsw(retregs);
1738             reg2 = findregmsw(retregs);
1739         }
1740         else
1741         {
1742             reg = findreg(retregs);     // reg that result is in
1743             reg2 = NOREG;
1744         }
1745         list_t bl = b.Bsucc;
1746         block *bdefault = b.nthSucc(0);
1747         if (dword && mswsame)
1748         {
1749             cdb.genc2(0x81,modregrm(3,7,reg2),msw);   // CMP reg2,MSW
1750             genjmp(cdb,JNE,FLblock,bdefault);  // JNE default
1751             reg2 = NOREG;
1752         }
1753 
1754         reg_t sreg = NOREG;                          // may need a scratch register
1755 
1756         // Put into casevals[0..ncases] so we can sort then slice
1757         CaseVal *casevals = cast(CaseVal *)malloc(ncases * CaseVal.sizeof);
1758         assert(casevals);
1759         for (uint n = 0; n < ncases; n++)
1760         {
1761             casevals[n].val = p[n];
1762             bl = list_next(bl);
1763             casevals[n].target = list_block(bl);
1764 
1765             // See if we need a scratch register
1766             if (sreg == NOREG && I64 && sz == 8 && p[n] != cast(int)p[n])
1767             {   regm_t regm = ALLREGS & ~mask(reg);
1768                 allocreg(cdb,&regm, &sreg, TYint);
1769             }
1770         }
1771 
1772         // Sort cases so we can do a runtime binary search
1773         qsort(casevals, ncases, CaseVal.sizeof, &CaseVal.cmp);
1774 
1775         //for (uint n = 0; n < ncases; n++)
1776             //printf("casevals[%lld] = x%x\n", n, casevals[n].val);
1777 
1778         // Generate binary tree of comparisons
1779         ifthen(cdb, casevals, ncases, sz, reg, reg2, sreg, bdefault, bdefault != b.Bnext);
1780 
1781         free(casevals);
1782 
1783         cgstate.stackclean--;
1784         return;
1785     }
1786 
1787     /*************************************************************************/
1788     {
1789         // Use switch value to index into jump table
1790     Ljmptab:
1791         //printf("Ljmptab:\n");
1792 
1793         b.BC = BCjmptab;
1794 
1795         /* If vmin is small enough, we can just set it to 0 and the jump
1796          * table entries from 0..vmin-1 can be set with the default target.
1797          * This saves the SUB instruction.
1798          * Must be same computation as used in outjmptab().
1799          */
1800         if (vmin > 0 && vmin <= _tysize[TYint])
1801             vmin = 0;
1802 
1803         b.Btablesize = cast(int) (vmax - vmin + 1) * tysize(TYnptr);
1804         regm_t retregs = IDXREGS;
1805         if (dword)
1806             retregs |= mMSW;
1807 static if (TARGET_LINUX || TARGET_FREEBSD || TARGET_OPENBSD || TARGET_DRAGONFLYBSD || TARGET_SOLARIS)
1808 {
1809         if (I32 && config.flags3 & CFG3pic)
1810             retregs &= ~mBX;                            // need EBX for GOT
1811 }
1812         bool modify = (I16 || I64 || vmin);
1813         scodelem(cdb,e,&retregs,0,!modify);
1814         reg_t reg = findreg(retregs & IDXREGS); // reg that result is in
1815         reg_t reg2;
1816         if (dword)
1817             reg2 = findregmsw(retregs);
1818         if (modify)
1819         {
1820             assert(!(retregs & regcon.mvar));
1821             getregs(cdb,retregs);
1822         }
1823         if (vmin)                       // if there is a minimum
1824         {
1825             cdb.genc2(0x81,modregrm(3,5,reg),cast(targ_size_t)vmin); // SUB reg,vmin
1826             if (dword)
1827             {   cdb.genc2(0x81,modregrm(3,3,reg2),cast(targ_size_t)MSREG(vmin)); // SBB reg2,vmin
1828                 genjmp(cdb,JNE,FLblock,b.nthSucc(0)); // JNE default
1829             }
1830         }
1831         else if (dword)
1832         {   gentstreg(cdb,reg2);              // TEST reg2,reg2
1833             genjmp(cdb,JNE,FLblock,b.nthSucc(0)); // JNE default
1834         }
1835         if (vmax - vmin != REGMASK)     // if there is a maximum
1836         {                               // CMP reg,vmax-vmin
1837             cdb.genc2(0x81,modregrm(3,7,reg),cast(targ_size_t)(vmax-vmin));
1838             if (I64 && sz == 8)
1839                 code_orrex(cdb.last(), REX_W);
1840             genjmp(cdb,JA,FLblock,b.nthSucc(0));  // JA default
1841         }
1842         if (I64)
1843         {
1844             if (!vmin)
1845             {   // Need to clear out high 32 bits of reg
1846                 // Use 8B instead of 89, as 89 will be optimized away as a NOP
1847                 genregs(cdb,0x8B,reg,reg);                 // MOV reg,reg
1848             }
1849             if (config.flags3 & CFG3pic || config.exe == EX_WIN64)
1850             {
1851                 /* LEA    R1,disp[RIP]          48 8D 05 00 00 00 00
1852                  * MOVSXD R2,[reg*4][R1]        48 63 14 B8
1853                  * LEA    R1,[R1][R2]           48 8D 04 02
1854                  * JMP    R1                    FF E0
1855                  */
1856                 reg_t r1;
1857                 regm_t scratchm = ALLREGS & ~mask(reg);
1858                 allocreg(cdb,&scratchm,&r1,TYint);
1859                 reg_t r2;
1860                 scratchm = ALLREGS & ~(mask(reg) | mask(r1));
1861                 allocreg(cdb,&scratchm,&r2,TYint);
1862 
1863                 CodeBuilder cdbe; cdbe.ctor();
1864                 cdbe.genc1(LEA,(REX_W << 16) | modregxrm(0,r1,5),FLswitch,0);        // LEA R1,disp[RIP]
1865                 cdbe.last().IEV1.Vswitch = b;
1866                 cdbe.gen2sib(0x63,(REX_W << 16) | modregxrm(0,r2,4), modregxrmx(2,reg,r1)); // MOVSXD R2,[reg*4][R1]
1867                 cdbe.gen2sib(LEA,(REX_W << 16) | modregxrm(0,r1,4),modregxrmx(0,r1,r2));    // LEA R1,[R1][R2]
1868                 cdbe.gen2(0xFF,modregrmx(3,4,r1));                                          // JMP R1
1869 
1870                 b.Btablesize = cast(int) (vmax - vmin + 1) * 4;
1871                 code *ce = cdbe.finish();
1872                 pinholeopt(ce, null);
1873 
1874                 cdb.append(cdbe);
1875             }
1876             else
1877             {
1878                 cdb.genc1(0xFF,modregrm(0,4,4),FLswitch,0);   // JMP disp[reg*8]
1879                 cdb.last().IEV1.Vswitch = b;
1880                 cdb.last().Isib = modregrm(3,reg & 7,5);
1881                 if (reg & 8)
1882                     cdb.last().Irex |= REX_X;
1883             }
1884         }
1885         else if (I32)
1886         {
1887 static if (JMPJMPTABLE)
1888 {
1889             /* LEA jreg,offset ctable[reg][reg * 4]
1890                JMP jreg
1891               ctable:
1892                JMP case0
1893                JMP case1
1894                ...
1895              */
1896             CodeBuilder ctable; ctable.ctor();
1897             block *bdef = b.nthSucc(0);
1898             targ_llong u;
1899             for (u = vmin; ; u++)
1900             {   block *targ = bdef;
1901                 for (n = 0; n < ncases; n++)
1902                 {
1903                     if (p[n] == u)
1904                     {   targ = b.nthSucc(n + 1);
1905                         break;
1906                     }
1907                 }
1908                 genjmp(ctable,JMP,FLblock,targ);
1909                 ctable.last().Iflags |= CFjmp5;           // don't shrink these
1910                 if (u == vmax)
1911                     break;
1912             }
1913 
1914             // Allocate scratch register jreg
1915             regm_t scratchm = ALLREGS & ~mask(reg);
1916             uint jreg = AX;
1917             allocreg(cdb,&scratchm,&jreg,TYint);
1918 
1919             // LEA jreg, offset ctable[reg][reg*4]
1920             cdb.genc1(LEA,modregrm(2,jreg,4),FLcode,6);
1921             cdb.last().Isib = modregrm(2,reg,reg);
1922             cdb.gen2(0xFF,modregrm(3,4,jreg));      // JMP jreg
1923             cdb.append(ctable);
1924             b.Btablesize = 0;
1925             cgstate.stackclean--;
1926             return;
1927 }
1928 else static if (TARGET_OSX)
1929 {
1930             /*     CALL L1
1931              * L1: POP  R1
1932              *     ADD  R1,disp[reg*4][R1]
1933              *     JMP  R1
1934              */
1935             // Allocate scratch register r1
1936             regm_t scratchm = ALLREGS & ~mask(reg);
1937             reg_t r1;
1938             allocreg(cdb,&scratchm,&r1,TYint);
1939 
1940             cdb.genc2(CALL,0,0);                           //     CALL L1
1941             cdb.gen1(0x58 + r1);                           // L1: POP R1
1942             cdb.genc1(0x03,modregrm(2,r1,4),FLswitch,0);   // ADD R1,disp[reg*4][EBX]
1943             cdb.last().IEV1.Vswitch = b;
1944             cdb.last().Isib = modregrm(2,reg,r1);
1945             cdb.gen2(0xFF,modregrm(3,4,r1));               // JMP R1
1946 }
1947 else
1948 {
1949             if (config.flags3 & CFG3pic)
1950             {
1951                 /* MOV  R1,EBX
1952                  * SUB  R1,funcsym_p@GOTOFF[offset][reg*4][EBX]
1953                  * JMP  R1
1954                  */
1955 
1956                 // Load GOT in EBX
1957                 load_localgot(cdb);
1958 
1959                 // Allocate scratch register r1
1960                 regm_t scratchm = ALLREGS & ~(mask(reg) | mBX);
1961                 reg_t r1;
1962                 allocreg(cdb,&scratchm,&r1,TYint);
1963 
1964                 genmovreg(cdb,r1,BX);              // MOV R1,EBX
1965                 cdb.genc1(0x2B,modregxrm(2,r1,4),FLswitch,0);   // SUB R1,disp[reg*4][EBX]
1966                 cdb.last().IEV1.Vswitch = b;
1967                 cdb.last().Isib = modregrm(2,reg,BX);
1968                 cdb.gen2(0xFF,modregrmx(3,4,r1));               // JMP R1
1969             }
1970             else
1971             {
1972                 cdb.genc1(0xFF,modregrm(0,4,4),FLswitch,0);     // JMP disp[idxreg*4]
1973                 cdb.last().IEV1.Vswitch = b;
1974                 cdb.last().Isib = modregrm(2,reg,5);
1975             }
1976 }
1977         }
1978         else if (I16)
1979         {
1980             cdb.gen2(0xD1,modregrm(3,4,reg));                   // SHL reg,1
1981             uint rm = getaddrmode(retregs) | modregrm(0,4,0);
1982             cdb.genc1(0xFF,rm,FLswitch,0);                  // JMP [CS:]disp[idxreg]
1983             cdb.last().IEV1.Vswitch = b;
1984             cdb.last().Iflags |= csseg ? CFcs : 0;                       // segment override
1985         }
1986         else
1987             assert(0);
1988         cgstate.stackclean--;
1989         return;
1990     }
1991 
1992     /*************************************************************************/
1993     {
1994         /* Scan a table of case values, and jump to corresponding address.
1995          * Since it relies on REPNE SCASW, it has really nothing to recommend it
1996          * over Lifthen for 32 and 64 bit code.
1997          * Note that it has not been tested with MACHOBJ (OSX).
1998          */
1999     Lswitch:
2000         regm_t retregs = mAX;                  // SCASW requires AX
2001         if (dword)
2002             retregs |= mDX;
2003         else if (ncases <= 6 || config.flags4 & CFG4speed)
2004             goto Lifthen;
2005         scodelem(cdb,e,&retregs,0,true);
2006         if (dword && mswsame)
2007         {   /* CMP DX,MSW       */
2008             cdb.genc2(0x81,modregrm(3,7,DX),msw);
2009             genjmp(cdb,JNE,FLblock,b.nthSucc(0)); // JNE default
2010         }
2011         getregs(cdb,mCX|mDI);
2012 static if (TARGET_LINUX || TARGET_OSX || TARGET_FREEBSD || TARGET_OPENBSD || TARGET_DRAGONFLYBSD || TARGET_SOLARIS)
2013 {
2014         if (config.flags3 & CFG3pic)
2015         {   // Add in GOT
2016             getregs(cdb,mDX);
2017             cdb.genc2(CALL,0,0);        //     CALL L1
2018             cdb.gen1(0x58 + DI);        // L1: POP EDI
2019 
2020                                         //     ADD EDI,_GLOBAL_OFFSET_TABLE_+3
2021             Symbol *gotsym = Obj.getGOTsym();
2022             cdb.gencs(0x81,modregrm(3,0,DI),FLextern,gotsym);
2023             cdb.last().Iflags = CFoff;
2024             cdb.last().IEV2.Voffset = 3;
2025 
2026             makeitextern(gotsym);
2027 
2028             genmovreg(cdb, DX, DI);    // MOV EDX, EDI
2029                                         // ADD EDI,offset of switch table
2030             cdb.gencs(0x81,modregrm(3,0,DI),FLswitch,null);
2031             cdb.last().IEV2.Vswitch = b;
2032         }
2033 }
2034         if (!(config.flags3 & CFG3pic))
2035         {
2036                                         // MOV DI,offset of switch table
2037             cdb.gencs(0xC7,modregrm(3,0,DI),FLswitch,null);
2038             cdb.last().IEV2.Vswitch = b;
2039         }
2040         movregconst(cdb,CX,ncases,0);    // MOV CX,ncases
2041 
2042         /* The switch table will be accessed through ES:DI.
2043          * Therefore, load ES with proper segment value.
2044          */
2045         if (config.flags3 & CFG3eseqds)
2046         {
2047             assert(!csseg);
2048             getregs(cdb,mCX);           // allocate CX
2049         }
2050         else
2051         {
2052             getregs(cdb,mES|mCX);       // allocate ES and CX
2053             cdb.gen1(csseg ? 0x0E : 0x1E);      // PUSH CS/DS
2054             cdb.gen1(0x07);                     // POP  ES
2055         }
2056 
2057         targ_size_t disp = (ncases - 1) * _tysize[TYint];  // displacement to jump table
2058         if (dword && !mswsame)
2059         {
2060 
2061             /* Build the following:
2062                 L1:     SCASW
2063                         JNE     L2
2064                         CMP     DX,[CS:]disp[DI]
2065                 L2:     LOOPNE  L1
2066              */
2067 
2068             const int mod = (disp > 127) ? 2 : 1;         // displacement size
2069             code *cloop = genc2(null,0xE0,0,-7 - mod - csseg);   // LOOPNE scasw
2070             cdb.gen1(0xAF);                                      // SCASW
2071             code_orflag(cdb.last(),CFtarg2);                     // target of jump
2072             genjmp(cdb,JNE,FLcode,cast(block *) cloop); // JNE loop
2073                                                                  // CMP DX,[CS:]disp[DI]
2074             cdb.genc1(0x39,modregrm(mod,DX,5),FLconst,disp);
2075             cdb.last().Iflags |= csseg ? CFcs : 0;              // possible seg override
2076             cdb.append(cloop);
2077             disp += ncases * _tysize[TYint];           // skip over msw table
2078         }
2079         else
2080         {
2081             cdb.gen1(0xF2);              // REPNE
2082             cdb.gen1(0xAF);              // SCASW
2083         }
2084         genjmp(cdb,JNE,FLblock,b.nthSucc(0)); // JNE default
2085         const int mod = (disp > 127) ? 2 : 1;     // 1 or 2 byte displacement
2086         if (csseg)
2087             cdb.gen1(SEGCS);            // table is in code segment
2088 static if (TARGET_LINUX || TARGET_OSX || TARGET_FREEBSD || TARGET_OPENBSD || TARGET_DRAGONFLYBSD || TARGET_SOLARIS)
2089 {
2090         if (config.flags3 & CFG3pic)
2091         {                               // ADD EDX,(ncases-1)*2[EDI]
2092             cdb.genc1(0x03,modregrm(mod,DX,7),FLconst,disp);
2093                                         // JMP EDX
2094             cdb.gen2(0xFF,modregrm(3,4,DX));
2095         }
2096 }
2097         if (!(config.flags3 & CFG3pic))
2098         {                               // JMP (ncases-1)*2[DI]
2099             cdb.genc1(0xFF,modregrm(mod,4,(I32 ? 7 : 5)),FLconst,disp);
2100             cdb.last().Iflags |= csseg ? CFcs : 0;
2101         }
2102         b.Btablesize = disp + _tysize[TYint] + ncases * tysize(TYnptr);
2103         //assert(b.Bcode);
2104         cgstate.stackclean--;
2105         return;
2106     }
2107 }
2108 
/******************************
 * Emit the data for a jump table block (BCjmptab).
 *
 * One table entry is written for every value in the closed range
 * [vmin, vmax]. A value that matches no case (a 'hole') receives the
 * address of the default successor, b.nthSucc(0).
 *
 * Nothing is emitted when `JMPJMPTABLE && I32` holds.
 *
 * Params:
 *      b = switch block. b.Bswitch[0] is the number of cases and is
 *          followed by that many case values; successor n + 1 of the
 *          block is the target of case n.
 */

void outjmptab(block *b)
{
    if (JMPJMPTABLE && I32)
        return;

    targ_llong* caseData = b.Bswitch;               // [0] = count, [1..] = case values
    const size_t ncases = cast(size_t) caseData[0];
    targ_llong* caseVals = caseData + 1;

    /* Determine the range of case values covered by the table.
     * This has to be the same computation that doswitch() performs.
     */
    targ_llong vmax = MINLL;                        // start from the smallest llong
    targ_llong vmin = MAXLL;                        // start from the largest llong
    foreach (n; 0 .. ncases)
    {
        const targ_llong v = caseVals[n];
        if (v > vmax)
            vmax = v;
        if (v < vmin)
            vmin = v;
    }
    // A small positive minimum is treated as 0; the leading entries
    // then simply get the default target.
    if (vmin > 0 && vmin <= _tysize[TYint])
        vmin = 0;
    assert(vmin <= vmax);

    // Segment the table goes into, and the running offset within it
    int jmpseg = objmod.jmpTableSegment(funcsym_p);
    targ_size_t *poffset = &Offset(jmpseg);

    // Pad up to the table's alignment
    targ_size_t alignbytes = _align(0,*poffset) - *poffset;
    objmod.lidata(jmpseg,*poffset,alignbytes);
    assert(*poffset == b.Btableoffset);             // must agree with the precomputed offset

    Symbol *gotsym = null;                          // fetched on first use (32 bit PIC only)
    const targ_size_t defaultAddr = b.nthSucc(0).Boffset;

    targ_llong u = vmin;
    while (true)
    {
        // Address for value u: the first matching case, else the default
        targ_size_t dest = defaultAddr;
        foreach (n; 0 .. ncases)
        {
            if (caseVals[n] == u)
            {
                dest = b.nthSucc(cast(int)(n + 1)).Boffset;
                break;
            }
        }

static if (TARGET_LINUX || TARGET_FREEBSD || TARGET_OPENBSD || TARGET_DRAGONFLYBSD || TARGET_SOLARIS)
{
        if (I64)
        {
            if (config.flags3 & CFG3pic)
            {
                // 4 byte entry
                objmod.reftodatseg(jmpseg,*poffset,dest + (u - vmin) * 4,funcsym_p.Sseg,CFswitch);
                *poffset += 4;
            }
            else
            {
                // 8 byte entry
                objmod.reftodatseg(jmpseg,*poffset,dest,funcsym_p.Sxtrnnum,CFoffset64 | CFswitch);
                *poffset += 8;
            }
        }
        else
        {
            if (config.flags3 & CFG3pic)
            {
                assert(config.flags & CFGromable);
                // Want a GOTPC fixup to _GLOBAL_OFFSET_TABLE_
                if (!gotsym)
                    gotsym = Obj.getGOTsym();
                objmod.reftoident(jmpseg,*poffset,gotsym,*poffset - dest,CFswitch);
            }
            else
                objmod.reftocodeseg(jmpseg,*poffset,dest);
            *poffset += 4;
        }
}
else static if (TARGET_OSX)
{
        // Entry is a 4 byte difference from the table offset (64 bit)
        // or from the table base (32 bit)
        targ_size_t val;
        if (I64)
            val = dest - b.Btableoffset;
        else
            val = dest - b.Btablebase;
        objmod.write_bytes(SegData[jmpseg],4,&val);
}
else static if (TARGET_WINDOS)
{
        if (I64)
        {
            // Entry is a 4 byte difference from the table offset
            targ_size_t val = dest - b.Btableoffset;
            objmod.write_bytes(SegData[jmpseg],4,&val);
        }
        else
        {
            objmod.reftocodeseg(jmpseg,*poffset,dest);
            *poffset += tysize(TYnptr);
        }
}
else
        assert(0);

        // Test before incrementing, so that vmax == ~0 terminates too
        if (u == vmax)
            break;
        ++u;
    }
}
2217 
2218 
/******************************
 * Emit the data for a switch table block.
 *
 * Layout, in order:
 *   1. the case values, _tysize[TYint] bytes each
 *   2. (only when b.Btablesize == ncases * (REGSIZE * 2 + tysize(TYnptr)))
 *      the most significant words of the case values, REGSIZE bytes each
 *   3. the code addresses of the case targets, tysize(TYnptr) bytes each
 *
 * Params:
 *      b = switch block. b.Bswitch[0] is the number of cases and is
 *          followed by that many case values; the case targets are the
 *          successors of b after the first one.
 */

void outswitab(block *b)
{
    //printf("outswitab()\n");
    targ_llong* caseData = b.Bswitch;            // [0] = count, [1..] = case values
    const uint ncases = cast(uint) caseData[0];
    targ_llong* caseVals = caseData + 1;

    const int seg = objmod.jmpTableSegment(funcsym_p);
    targ_size_t *poffset = &Offset(seg);

    // Emit whatever padding is needed to align the table
    const targ_size_t start = *poffset;
    const targ_size_t alignbytes = _align(0,start) - start;
    objmod.lidata(seg,*poffset,alignbytes);
    assert(*poffset == start + alignbytes);

    // 1. Value table
    const uint sz = _tysize[TYint];
    assert(SegData[seg].SDseg == seg);
    foreach (n; 0 .. ncases)
    {
        //printf("\tcase %d, offset = x%x\n", n, *poffset);
        objmod.write_bytes(SegData[seg],sz,&caseVals[n]);
    }
    targ_size_t expected = start + alignbytes + sz * ncases;   // where *poffset should be now
    assert(*poffset == expected);

    // 2. MSW table, present only for the table size that includes it
    if (b.Btablesize == ncases * (REGSIZE * 2 + tysize(TYnptr)))
    {
        foreach (n; 0 .. ncases)
        {
            targ_size_t msw = cast(targ_size_t)MSREG(caseVals[n]);
            objmod.write_bytes(SegData[seg],REGSIZE,&msw);
        }
        expected += REGSIZE * ncases;
        assert(*poffset == expected);
    }

    // 3. Address table; the first successor is skipped
    list_t succ = b.Bsucc;
    foreach (n; 0 .. ncases)
    {
        succ = list_next(succ);
        objmod.reftocodeseg(seg,*poffset,list_block(succ).Boffset);
        *poffset += tysize(TYnptr);
    }
    assert(*poffset == expected + ncases * tysize(TYnptr));
}
2272 
/*****************************
 * Pick the conditional jump opcode to use for a "jump if true" on the
 * result of evaluating an elem.
 *
 * Params:
 *      e = the elem being tested; must not be null
 * Returns:
 *      A Jcc opcode (0x7x) in the low byte. For floating point tests the
 *      value may additionally carry JP or JNP in bits 8..15 (the XP / XNP
 *      markers below).
 */

int jmpopcode(elem *e)
{
    // Indexed [unsigned?][compare against zero?][op - OPle]
    static immutable ubyte[6][2][2] jops =
    [   /* <=  >   <   >=  ==  !=    <=0 >0  <0  >=0 ==0 !=0    */
       [ [JLE,JG ,JL ,JGE,JE ,JNE],[JLE,JG ,JS ,JNS,JE ,JNE] ], /* signed   */
       [ [JBE,JA ,JB ,JAE,JE ,JNE],[JE ,JNE,JB ,JAE,JE ,JNE] ], /* uint */
/+
       [ [JLE,JG ,JL ,JGE,JE ,JNE],[JLE,JG ,JL ,JGE,JE ,JNE] ], /* real     */
       [ [JBE,JA ,JB ,JAE,JE ,JNE],[JBE,JA ,JB ,JAE,JE ,JNE] ], /* 8087     */
       [ [JA ,JBE,JAE,JB ,JE ,JNE],[JBE,JA ,JB ,JAE,JE ,JNE] ], /* 8087 R   */
+/
    ];

    // Parity jump stored in the second byte of the result
    enum
    {
        XP     = (JP  << 8),
        XNP    = (JNP << 8),
    }

    // Floating point relationals, indexed [0][op - OPle]
    static immutable uint[26][1] jfops =
    /*   le     gt lt     ge  eqeq    ne     unord lg  leg  ule ul uge  */
    [
      [ XNP|JBE,JA,XNP|JB,JAE,XNP|JE, XP|JNE,JP,   JNE,JNP, JBE,JC,XP|JAE,

    /*  ug    ue ngt nge nlt    nle    ord nlg nleg nule nul nuge    nug     nue */
        XP|JA,JE,JBE,JB, XP|JAE,XP|JA, JNP,JE, JP,  JA,  JNC,XNP|JB, XNP|JBE,JNE        ], /* 8087     */
    ];

    assert(e);

    /* The right operand decides the result for comma expressions and for
     * assignments whose left side is a leaf. The leaf test mirrors the
     * case in cdeq() that decides whether mPSW is passed on when
     * evaluating E2.
     */
    for (;;)
    {
        const bool rightDecides = e.Eoper == OPcomma ||
                                  (e.Eoper == OPeq && OTleaf(e.EV.E1.Eoper));
        if (!rightDecides)
            break;
        e = e.EV.E2;
    }

    int op = e.Eoper;
    const tym_t tymx = tybasic(e.Ety);
    // NOTE(review): the second argument to regmask() below is built from
    // E1's Eoper rather than from a type - confirm this is intended.
    const bool needsNanCheck = tyfloating(tymx) && config.inline8087 &&
        (tymx == TYldouble || tymx == TYildouble || tymx == TYcldouble ||
         tymx == TYcdouble || tymx == TYcfloat ||
         (tyxmmreg(tymx) && config.fpxmmregs && e.Ecount != e.Ecomsub) ||
         op == OPind ||
         (OTcall(op) && (regmask(tymx, tybasic(e.EV.E1.Eoper)) & (mST0 | XMMREGS))));

    // Common subexpressions only get the Z bit set, except that floating
    // point values needing a NaN check also test parity.
    if (e.Ecount != e.Ecomsub)
        return needsNanCheck ? (XP|JNE) : JNE;

    if (!OTrel(op))                       // not a relational operator
    {
        if (needsNanCheck)
            return XP|JNE;

        // Look through unsigned widening conversions at the operator beneath
        if (op == OPu32_64) { e = e.EV.E1; op = e.Eoper; }
        if (op == OPu16_32) { e = e.EV.E1; op = e.Eoper; }
        if (op == OPu8_16) op = e.EV.E1.Eoper;

        // The bit test operators select JC; everything else selects JNE
        const bool isBitTest = (op >= OPbt && op <= OPbts) || op == OPbtst;
        return isBitTest ? JC : JNE;
    }

    // Is this a comparison against a constant that evaluates to false (zero)?
    const int zero = (e.EV.E2.Eoper == OPconst) ? !boolres(e.EV.E2) : 0;

    const tym_t tym = e.EV.E1.Ety;
    int i = 0;                            // table row; also reported by the debug print
    int jp;
    if (tyfloating(tym))
    {
        if (config.inline8087)
        {
            i = 1;

            // Decide whether the relation must be replaced by swaprel(op)
            bool swap;
            if (rel_exception(op) || config.flags4 & CFG4fastfloat)
            {
                const bool NOSAHF = (I64 || config.fpxmmregs);
                swap = NOSAHF || (!zero && cmporder87(e.EV.E2));
            }
            else
                swap = !(zero && config.target_cpu < TARGET_80386);

            if (swap)
                op = swaprel(op);
        }
        jp = jfops[0][op - OPle];
    }
    else
    {
        if (tyuns(tym) || tyuns(e.EV.E2.Ety))
            i = 1;
        else if (tyintegral(tym) || typtr(tym))
            i = 0;
        else
        {
            debug
            elem_print(e);
            WRTYxx(tym);
            assert(0);
        }

        jp = jops[i][zero][op - OPle];        /* table starts with OPle       */

        /* Try to rewrite uint comparisons so they rely on just the Carry flag
         */
        if (i == 1 && (jp == JA || jp == JBE) &&
            (e.EV.E2.Eoper != OPconst && e.EV.E2.Eoper != OPrelconst))
        {
            jp = (jp == JA) ? JC : JNC;
        }
    }

    debug
    if ((jp & 0xF0) != 0x70)
    {
        WROP(op);
        printf("i %d zero %d op x%x jp x%x\n",i,zero,op,jp);
    }

    assert((jp & 0xF0) == 0x70);
    return jp;
}
2431 
/**********************************
 * Append to cdb the code that validates the pointer described by the
 * addressing mode in *pcs, by pushing the address and calling the
 * RTLSYM_PTRCHK runtime function. The addressing mode in *pcs is
 * rewritten to be register based with a zero offset.
 *
 * Not available for 64 bit code. For non-16 bit code with a segment
 * override in pcs.Iflags nothing is generated.
 *
 * Params:
 *    cdb = append generated code to this
 *    pcs = original addressing mode to be updated
 *    keepmsk = mask of registers we must not destroy or use
 *              if (keepmsk & RMstore), this will be only a store operation
 *              into the lvalue
 */

void cod3_ptrchk(ref CodeBuilder cdb,code *pcs,regm_t keepmsk)
{
    enum SEGOVERRIDES = CFes | CFss | CFcs | CFds | CFfs | CFgs;

    assert(!I64);
    if (!I16 && pcs.Iflags & SEGOVERRIDES)
        return;         // not designed to deal with 48 bit far pointers

    const ubyte rm = pcs.Irm;
    assert(!(rm & 0x40));       // no disp8 or reg addressing modes

    // Register named by the r/m field
    reg_t reg = rm & 7;
    if (I16)
    {
        // 16 bit r/m encodings name addressing forms; convert [SI] to SI, etc.
        static immutable ubyte[8] imode = [ BP,BP,BP,BP,SI,DI,BP,BX ];
        reg = imode[reg];
    }

    // NOTE(review): idxregs is not refreshed when allocreg() below picks a
    // register, yet it is what setaddrmode() receives - confirm intended.
    regm_t idxregs = mask(reg);

    const bool hasOffset = rm & 0x80 && (pcs.IFL1 != FLoffset || pcs.IEV1.Vuns);
    if (hasOffset || !(idxregs & ALLREGS))
    {
        // Load the effective address into a register, so it can be pushed
        regm_t candidates = (I16 ? IDXREGS : ALLREGS) & ~keepmsk; // only these can be index regs
        assert(candidates);
        allocreg(cdb,&candidates,&reg,TYoffset);

        // Temporarily turn *pcs into LEA reg,EA, emit it, then restore
        const savedOp = pcs.Iop;
        const savedFlags = pcs.Iflags;
        pcs.Iop = LEA;
        pcs.Irm |= modregrm(0,reg,0);
        pcs.Iflags &= ~(CFopsize | CFss | CFes | CFcs);        // no prefix bytes needed
        cdb.gen(pcs);                 // LEA reg,EA

        pcs.Iflags = savedFlags;
        pcs.Iop = savedOp;
    }

    // Registers destroyed by the function call.
    // Alternative: (mBP | ALLREGS | mES) & ~fregsaved
    // With 0 the save/restore loop below emits nothing.
    regm_t used = 0;                           // much less code generated this way

    code *restores = null;                     // POPs, in reverse order of the PUSHes
    regm_t tosave = used & (keepmsk | idxregs);
    for (int i = 0; tosave; i++)
    {
        assert(i < REGMAX);

        const regm_t mi = mask(i);
        if (!(mi & tosave))
            continue;

        // i = register to save
        stackchanged = 1;
        const int push = (i == ES) ? 0x06 : 0x50 + i;
        const int pop  = (i == ES) ? 0x07 : (push | 8);
        cdb.gen1(push);                             // PUSH i
        restores = cat(gen1(null,pop),restores);    // POP i
        tosave &= ~mi;
    }

    // For 16 bit models, push a far pointer: segment first
    if (I16)
    {
        int pushSeg;
        switch (pcs.Iflags & SEGOVERRIDES)
        {
            case CFes:  pushSeg = 0x06;  break;
            case CFss:  pushSeg = 0x16;  break;
            case CFcs:  pushSeg = 0x0E;  break;
            case 0:     pushSeg = 0x1E;  break;  // DS
            default:
                assert(0);
        }

        // See if we should default to SS:
        // (Happens when BP is part of the addressing mode)
        const bool usesBP = (rm & 0xC0) != 0xC0 && rm & 2 && (rm & 7) != 7;
        if (pushSeg == 0x1E && usesBP)
        {
            pushSeg = 0x16;
            if (config.wflags & WFssneds)
                pcs.Iflags |= CFss;    // because BP won't be there anymore
        }
        cdb.gen1(pushSeg);              // PUSH segreg
    }

    cdb.gen1(0x50 + reg);               // PUSH reg

    // Rewrite the addressing mode in *pcs so it is just 0[reg]
    setaddrmode(pcs, idxregs);
    pcs.IFL1 = FLoffset;
    pcs.IEV1.Vuns = 0;

    // Call the validation function
    makeitextern(getRtlsym(RTLSYM_PTRCHK));

    used &= ~(keepmsk | idxregs);           // regs destroyed by this exercise
    getregs(cdb,used);
                                            // CALL __ptrchk
    cdb.gencs((LARGECODE) ? 0x9A : CALL,0,FLfunc,getRtlsym(RTLSYM_PTRCHK));

    cdb.append(restores);
}
2559 
/***********************************
 * Decide whether BP is free to be used as a general purpose register
 * in the current function (funcsym_p).
 * Note parallels between this routine and prolog().
 *
 * Unless one of the early rejections applies, this calls stackoffsets(0)
 * and stores an estimate into `localsize`.
 *
 * Returns:
 *      0       can't be used, needed for frame
 *      mBP     can be used
 */

regm_t cod3_useBP()
{
    // Note that DOSX memory model cannot use EBP as a general purpose
    // register, as SS != DS.
    if (!(config.exe & EX_flat) || config.flags & (CFGalwaysframe | CFGnoebp))
        return 0;

    if (anyiasm)
        return 0;

    const tym_t tyf = funcsym_p.ty();
    if (tyf & mTYnaked)                 // if no prolog/epilog for function
        return 0;

    if (funcsym_p.Sfunc.Fflags3 & Ffakeeh)
        return 0;                       // need consistent stack frame

    const tym_t tym = tybasic(tyf);
    if (tym == TYifunc)
        return 0;

    stackoffsets(0);
    localsize = Auto.offset + Fast.offset;                // an estimate only

    // Any one of these means BP stays reserved for the frame
    const bool needFrame =
        !(config.flags4 & CFG4speed) ||
        config.target_cpu < TARGET_Pentium ||
        tyfarfunc(tym) ||
        config.flags & CFGstack ||
        localsize >= 0x100 ||       // arbitrary value < 0x1000
        (usednteh & (NTEH_try | NTEH_except | NTEHcpp | EHcleanup | EHtry | NTEHpassthru)) ||
        calledFinally ||
        Alloca.size;
    if (needFrame)
        return 0;

    return mBP;
}
2614 
/*************************************************
 * Try to build, in *c, a single instruction that can be used later to
 * restore a common subexpression.
 *
 * Two shapes are recognized, both requiring the operand to be a
 * register variable that is not marked SFLspill:
 *   - regvar + constant, REGSIZE wide, not for 16 bit code: an LEA
 *   - *regvar, at most REGSIZE wide: a MOV load
 *
 * Params:
 *      c = instruction to fill in; it is zeroed first, and is only
 *          written when true is returned
 *      e = the common subexpression
 * Returns:
 *      true if *c was filled in, false if e has neither shape
 */

bool cse_simple(code *c, elem *e)
{
    regm_t regm;
    reg_t reg;
    const int sz = tysize(e.Ety);

    if (e.Eoper == OPadd)
    {
        const bool usable =
            !I16 &&                                  // don't bother with 16 bit code
            sz == REGSIZE &&
            e.EV.E2.Eoper == OPconst &&
            e.EV.E1.Eoper == OPvar &&
            isregvar(e.EV.E1,&regm,&reg) &&
            !(e.EV.E1.EV.Vsym.Sflags & SFLspill);
        if (!usable)
            return false;

        memset(c,0,(*c).sizeof);

        // LEA with the register as base and the constant as displacement
        c.Iop = LEA;
        buildEA(c,reg,-1,1,e.EV.E2.EV.Vuns);
        if (I64 && sz == 8)
            c.Irex |= REX_W;
        return true;
    }

    if (e.Eoper == OPind)
    {
        const bool usable =
            sz <= REGSIZE &&
            e.EV.E1.Eoper == OPvar &&
            isregvar(e.EV.E1,&regm,&reg) &&
            (I32 || I64 || regm & IDXREGS) &&
            !(e.EV.E1.EV.Vsym.Sflags & SFLspill);
        if (!usable)
            return false;

        memset(c,0,(*c).sizeof);

        // MOV reg,EA with the register as base and no displacement
        c.Iop = (sz == 1) ? 0x8A : 0x8B;
        buildEA(c,reg,-1,1,0);
        if (sz == 2 && I32)
            c.Iflags |= CFopsize;
        else if (I64 && sz == 8)
            c.Irex |= REX_W;
        return true;
    }

    return false;
}
2670 
2671 /**************************
2672  * Store `reg` to the common subexpression save area in index `slot`.
2673  * Params:
2674  *      cdb = where to write code to
2675  *      tym = type of value that's in `reg`
2676  *      reg = register to save
2677  *      slot = index into common subexpression save area
2678  */
2679 void gen_storecse(ref CodeBuilder cdb, tym_t tym, reg_t reg, size_t slot)
2680 {
2681     // MOV slot[BP],reg
2682     if (isXMMreg(reg) && config.fpxmmregs) // watch out for ES
2683     {
2684         const aligned = tyvector(tym) ? STACKALIGN >= 16 : true;
2685         const op = xmmstore(tym, aligned);
2686         cdb.genc1(op,modregxrm(2, reg - XMM0, BPRM),FLcs,cast(targ_size_t)slot);
2687         return;
2688     }
2689     opcode_t op = STO;              // normal mov
2690     if (reg == ES)
2691     {
2692         reg = 0;            // the real reg number
2693         op = 0x8C;          // segment reg mov
2694     }
2695     cdb.genc1(op,modregxrm(2, reg, BPRM),FLcs,cast(targ_uns)slot);
2696     if (I64)
2697         code_orrex(cdb.last(), REX_W);
2698 }
2699 
/**************************
 * Compare the common subexpression save area entry `slot` against 0.
 * Params:
 *      cdb = where to write code to
 *      tym = type of the saved value (not used by this function)
 *      sz = size in bytes of the saved value; selects the byte form of CMP
 *           for 1, the operand size prefix for 2, and REX_W for 8
 *      slot = index into common subexpression save area
 */
void gen_testcse(ref CodeBuilder cdb, tym_t tym, uint sz, size_t slot)
{
    // CMP slot[BP],0
    const opcode_t cmpop = (sz == 1) ? 0x80 : 0x81;
    cdb.genc(cmpop,modregrm(2,7,BPRM),
                FLcs,cast(targ_uns)slot, FLconst,cast(targ_uns) 0);

    if (sz == 2 && (I64 || I32))
        cdb.last().Iflags |= CFopsize;
    if (sz == 8 && I64)
        code_orrex(cdb.last(), REX_W);
}
2710 
/**************************
 * Load `reg` from the common subexpression save area entry `slot`.
 * This mirrors gen_storecse(): XMM registers use the opcode chosen by
 * xmmload(), ES uses the segment register form of MOV, everything else
 * uses LOD with REX_W added for 64 bit code.
 * Params:
 *      cdb = where to write code to
 *      tym = type of the saved value
 *      reg = register to load
 *      slot = index into common subexpression save area
 */
void gen_loadcse(ref CodeBuilder cdb, tym_t tym, reg_t reg, size_t slot)
{
    // MOV reg,slot[BP]
    if (isXMMreg(reg) && config.fpxmmregs)
    {
        // only vector types depend on the stack alignment for the aligned form
        const bool aligned = !tyvector(tym) || STACKALIGN >= 16;
        const xop = xmmload(tym, aligned);
        cdb.genc1(xop,modregxrm(2, reg - XMM0, BPRM),FLcs,cast(targ_size_t)slot);
        return;
    }

    opcode_t op;
    reg_t r;
    if (reg == ES)
    {
        op = 0x8E;          // segment reg mov
        r = 0;              // the real reg number
    }
    else
    {
        op = LOD;           // normal mov
        r = reg;
    }
    cdb.genc1(op,modregxrm(2,r,BPRM),FLcs,cast(targ_uns)slot);
    if (I64)
        code_orrex(cdb.last(), REX_W);
}
2731 
2732 /***************************************
2733  * Gen code for OPframeptr
2734  */
2735 
2736 void cdframeptr(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
2737 {
2738     regm_t retregs = *pretregs & allregs;
2739     if  (!retregs)
2740         retregs = allregs;
2741     reg_t reg;
2742     allocreg(cdb,&retregs, &reg, TYint);
2743 
2744     code cs;
2745     cs.Iop = ESCAPE | ESCframeptr;
2746     cs.Iflags = 0;
2747     cs.Irex = 0;
2748     cs.Irm = cast(ubyte)reg;
2749     cdb.gen(&cs);
2750     fixresult(cdb,e,retregs,pretregs);
2751 }
2752 
2753 /***************************************
2754  * Gen code for load of _GLOBAL_OFFSET_TABLE_.
2755  * This value gets cached in the local variable 'localgot'.
2756  */
2757 
2758 void cdgot(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
2759 {
2760     static if (TARGET_OSX)
2761     {
2762         regm_t retregs = *pretregs & allregs;
2763         if  (!retregs)
2764             retregs = allregs;
2765         reg_t reg;
2766         allocreg(cdb,&retregs, &reg, TYnptr);
2767 
2768         cdb.genc(CALL,0,0,0,FLgot,0);     //     CALL L1
2769         cdb.gen1(0x58 + reg);             // L1: POP reg
2770 
2771         fixresult(cdb,e,retregs,pretregs);
2772     }
2773     else static if (TARGET_LINUX || TARGET_FREEBSD || TARGET_OPENBSD || TARGET_DRAGONFLYBSD || TARGET_SOLARIS)
2774     {
2775         regm_t retregs = *pretregs & allregs;
2776         if  (!retregs)
2777             retregs = allregs;
2778         reg_t reg;
2779         allocreg(cdb,&retregs, &reg, TYnptr);
2780 
2781         cdb.genc2(CALL,0,0);        //     CALL L1
2782         cdb.gen1(0x58 + reg);       // L1: POP reg
2783 
2784                                     //     ADD reg,_GLOBAL_OFFSET_TABLE_+3
2785         Symbol *gotsym = Obj.getGOTsym();
2786         cdb.gencs(0x81,modregrm(3,0,reg),FLextern,gotsym);
2787         /* Because the 2:3 offset from L1: is hardcoded,
2788          * this sequence of instructions must not
2789          * have any instructions in between,
2790          * so set CFvolatile to prevent the scheduler from rearranging it.
2791          */
2792         code *cgot = cdb.last();
2793         cgot.Iflags = CFoff | CFvolatile;
2794         cgot.IEV2.Voffset = (reg == AX) ? 2 : 3;
2795 
2796         makeitextern(gotsym);
2797         fixresult(cdb,e,retregs,pretregs);
2798     }
2799     else
2800         assert(0);
2801 }
2802 
2803 /**************************************************
2804  * Load contents of localgot into EBX.
2805  */
2806 
2807 void load_localgot(ref CodeBuilder cdb)
2808 {
2809     static if (TARGET_LINUX || TARGET_FREEBSD || TARGET_OPENBSD || TARGET_DRAGONFLYBSD || TARGET_SOLARIS)
2810     {
2811         if (config.flags3 & CFG3pic && I32)
2812         {
2813             if (localgot && !(localgot.Sflags & SFLdead))
2814             {
2815                 localgot.Sflags &= ~GTregcand;     // because this hack doesn't work with reg allocator
2816                 elem *e = el_var(localgot);
2817                 regm_t retregs = mBX;
2818                 codelem(cdb,e,&retregs,false);
2819                 el_free(e);
2820             }
2821             else
2822             {
2823                 elem *e = el_long(TYnptr, 0);
2824                 e.Eoper = OPgot;
2825                 regm_t retregs = mBX;
2826                 codelem(cdb,e,&retregs,false);
2827                 el_free(e);
2828             }
2829         }
2830     }
2831 }
2832 
2833 static if (TARGET_LINUX || TARGET_OSX || TARGET_FREEBSD || TARGET_OPENBSD || TARGET_DRAGONFLYBSD || TARGET_SOLARIS)
2834 {
2835 /*****************************
2836  * Returns:
2837  *      # of bytes stored
2838  */
2839 
2840 
2841 private int obj_namestring(char *p,const(char)* name)
2842 {
2843     size_t len = strlen(name);
2844     if (len > 255)
2845     {
2846         short *ps = cast(short *)p;
2847         p[0] = 0xFF;
2848         p[1] = 0;
2849         ps[1] = cast(short)len;
2850         memcpy(p + 4,name,len);
2851         const int ONS_OHD = 4;           // max # of extra bytes added by obj_namestring()
2852         len += ONS_OHD;
2853     }
2854     else
2855     {
2856         p[0] = cast(char)len;
2857         memcpy(p + 1,name,len);
2858         len++;
2859     }
2860     return cast(int)len;
2861 }
2862 }
2863 
/**************************
 * Generate a two operand register-register instruction.
 * Params:
 *      cdb = where to write code to
 *      op = opcode
 *      dstreg = passed as the first register operand of modregxrmx()
 *      srcreg = passed as the second register operand of modregxrmx()
 */
void genregs(ref CodeBuilder cdb,opcode_t op,uint dstreg,uint srcreg)
{
    const modrm = modregxrmx(3,dstreg,srcreg);  // mod = 3: register direct
    cdb.gen2(op,modrm);
}
2868 
/**************************
 * Generate TEST t,t and flag the instruction with CFpsw.
 * Params:
 *      cdb = where to write code to
 *      t = register to test against itself
 */
void gentstreg(ref CodeBuilder cdb, uint t)
{
    enum opcode_t TEST = 0x85;
    cdb.gen2(TEST,modregxrmx(3,t,t));   // TEST t,t
    code_orflag(cdb.last(),CFpsw);
}
2874 
/**************************
 * Generate PUSH reg.
 * The low 3 bits of `reg` go into the opcode; bit 3 is carried by REX_B.
 * Params:
 *      cdb = where to write code to
 *      reg = register to push
 */
void genpush(ref CodeBuilder cdb, reg_t reg)
{
    const lowbits = reg & 7;
    const bool extended = (reg & 8) != 0;

    cdb.gen1(0x50 + lowbits);
    if (extended)
        code_orrex(cdb.last(), REX_B);
}
2881 
/**************************
 * Generate POP reg.
 * The low 3 bits of `reg` go into the opcode; bit 3 is carried by REX_B.
 * Params:
 *      cdb = where to write code to
 *      reg = register to pop into
 */
void genpop(ref CodeBuilder cdb, reg_t reg)
{
    const lowbits = reg & 7;
    const bool extended = (reg & 8) != 0;

    cdb.gen1(0x58 + lowbits);
    if (extended)
        code_orrex(cdb.last(), REX_B);
}
2888 
2889 /**************************
2890  * Generate a MOV to,from register instruction.
2891  * Smart enough to dump redundant register moves, and segment
2892  * register moves.
2893  */
2894 
2895 code *genmovreg(uint to,uint from)
2896 {
2897     CodeBuilder cdb; cdb.ctor();
2898     genmovreg(cdb, to, from);
2899     return cdb.finish();
2900 }
2901 
/**************************
 * Generate a MOV to,from register instruction without specifying a type.
 * Forwards to the overload taking a type, passing TYMAX.
 * Params:
 *      cdb = where to write code to
 *      to = destination register
 *      from = source register
 */
void genmovreg(ref CodeBuilder cdb,uint to,uint from)
{
    enum tym_t unspecified = TYMAX;
    genmovreg(cdb, to, from, unspecified);
}
2906 
/**************************
 * Generate a MOV to,from register instruction for a value of type `tym`.
 * Nothing is generated when `to` and `from` are the same register.
 * The instruction form is picked from the kinds of the two registers
 * (general purpose, XMM, or ES); any other combination asserts.
 * Params:
 *      cdb = where to write code to
 *      to = destination register
 *      from = source register
 *      tym = type of the value moved; TYMAX is replaced by TYsize_t
 */
void genmovreg(ref CodeBuilder cdb, uint to, uint from, tym_t tym)
{
    // register kind, named after a representative register. ex: GPR,XMM,SEG
    static uint kindOf(uint reg)
    {
        switch (reg)
        {
        case ES:                   return ES;
        case XMM15:
        case XMM0: .. case XMM7:   return XMM0;
        case AX:   .. case R15:    return AX;
        default:                   return reg;
        }
    }

    // kind combination (order kept): destination kind in the high byte
    static uint kinds(uint dst, uint src) { return (kindOf(dst) << 8) + kindOf(src); }

    if (to == from)
        return;                             // redundant move

    if (tym == TYMAX)
        tym = TYsize_t;                     // avoid register slicing

    switch (kinds(to, from))
    {
        case kinds(AX, AX):
            genregs(cdb, 0x89, from, to);    // MOV to,from
            if (I64 && tysize(tym) >= 8)
                code_orrex(cdb.last(), REX_W);
            break;

        case kinds(XMM0, XMM0):             // MOVD/Q to,from
            genregs(cdb, xmmload(tym), to-XMM0, from-XMM0);
            checkSetVex(cdb.last(), tym);
            break;

        case kinds(AX, XMM0):               // MOVD/Q to,from
            genregs(cdb, STOD, from-XMM0, to);
            if (I64 && tysize(tym) >= 8)
                code_orrex(cdb.last(), REX_W);
            checkSetVex(cdb.last(), tym);
            break;

        case kinds(XMM0, AX):               // MOVD/Q to,from
            genregs(cdb, LODD, to-XMM0, from);
            if (I64 && tysize(tym) >= 8)
                code_orrex(cdb.last(),  REX_W);
            checkSetVex(cdb.last(), tym);
            break;

        case kinds(ES, AX):                 // general register to ES
            assert(tysize(tym) <= REGSIZE);
            genregs(cdb, 0x8E, 0, from);
            break;

        case kinds(AX, ES):                 // ES to general register
            assert(tysize(tym) <= REGSIZE);
            genregs(cdb, 0x8C, 0, to);
            break;

        default:
            debug printf("genmovreg(to = %s, from = %s)\n"
                , regm_str(mask(to)), regm_str(mask(from)));
            assert(0);
    }
}
2972 
2973 /***************************************
2974  * Generate immediate multiply instruction for r1=r2*imm.
2975  * Optimize it into LEA's if we can.
2976  */
2977 
2978 void genmulimm(ref CodeBuilder cdb,uint r1,uint r2,targ_int imm)
2979 {
2980     // These optimizations should probably be put into pinholeopt()
2981     switch (imm)
2982     {
2983         case 1:
2984             genmovreg(cdb,r1,r2);
2985             break;
2986 
2987         case 5:
2988         {
2989             code cs;
2990             cs.Iop = LEA;
2991             cs.Iflags = 0;
2992             cs.Irex = 0;
2993             buildEA(&cs,r2,r2,4,0);
2994             cs.orReg(r1);
2995             cdb.gen(&cs);
2996             break;
2997         }
2998 
2999         default:
3000             cdb.genc2(0x69,modregxrmx(3,r1,r2),imm);    // IMUL r1,r2,imm
3001             break;
3002     }
3003 }
3004 
3005 /******************************
3006  * Load CX with the value of _AHSHIFT.
3007  */
3008 
3009 void genshift(ref CodeBuilder cdb)
3010 {
3011     version (SCPP)
3012     {
3013         // Set up ahshift to trick ourselves into giving the right fixup,
3014         // which must be seg-relative, external frame, external target.
3015         cdb.gencs(0xC7,modregrm(3,0,CX),FLfunc,getRtlsym(RTLSYM_AHSHIFT));
3016         cdb.last().Iflags |= CFoff;
3017     }
3018     else
3019         assert(0);
3020 }
3021 
3022 /******************************
3023  * Move constant value into reg.
3024  * Take advantage of existing values in registers.
3025  * If flags & mPSW
3026  *      set flags based on result
3027  * Else if flags & 8
3028  *      do not disturb flags
3029  * Else
3030  *      don't care about flags
3031  * If flags & 1 then byte move
3032  * If flags & 2 then short move (for I32 and I64)
3033  * If flags & 4 then don't disturb unused portion of register
3034  * If flags & 16 then reg is a byte register AL..BH
3035  * If flags & 64 (0x40) then 64 bit move (I64 only)
3036  * Returns:
3037  *      code (if any) generated
3038  */
3039 
3040 void movregconst(ref CodeBuilder cdb,reg_t reg,targ_size_t value,regm_t flags)
3041 {
3042     reg_t r;
3043     regm_t mreg;
3044 
3045     //printf("movregconst(reg=%s, value= %lld (%llx), flags=%x)\n", regm_str(mask(reg)), value, value, flags);
3046 
3047     regm_t regm = regcon.immed.mval & mask(reg);
3048     targ_size_t regv = regcon.immed.value[reg];
3049 
3050     if (flags & 1)      // 8 bits
3051     {
3052         value &= 0xFF;
3053         regm &= BYTEREGS;
3054 
3055         // If we already have the right value in the right register
3056         if (regm && (regv & 0xFF) == value)
3057             goto L2;
3058 
3059         if (flags & 16 && reg & 4 &&    // if an H byte register
3060             regcon.immed.mval & mask(reg & 3) &&
3061             (((regv = regcon.immed.value[reg & 3]) >> 8) & 0xFF) == value)
3062             goto L2;
3063 
3064         /* Avoid byte register loads to avoid dependency stalls.
3065          */
3066         if ((I32 || I64) &&
3067             config.target_cpu >= TARGET_PentiumPro && !(flags & 4))
3068             goto L3;
3069 
3070         // See if another register has the right value
3071         r = 0;
3072         for (mreg = (regcon.immed.mval & BYTEREGS); mreg; mreg >>= 1)
3073         {
3074             if (mreg & 1)
3075             {
3076                 if ((regcon.immed.value[r] & 0xFF) == value)
3077                 {
3078                     genregs(cdb,0x8A,reg,r);          // MOV regL,rL
3079                     if (I64 && reg >= 4 || r >= 4)
3080                         code_orrex(cdb.last(), REX);
3081                     goto L2;
3082                 }
3083                 if (!(I64 && reg >= 4) &&
3084                     r < 4 && ((regcon.immed.value[r] >> 8) & 0xFF) == value)
3085                 {
3086                     genregs(cdb,0x8A,reg,r | 4);      // MOV regL,rH
3087                     goto L2;
3088                 }
3089             }
3090             r++;
3091         }
3092 
3093         if (value == 0 && !(flags & 8))
3094         {
3095             if (!(flags & 4) &&                 // if we can set the whole register
3096                 !(flags & 16 && reg & 4))       // and reg is not an H register
3097             {
3098                 genregs(cdb,0x31,reg,reg);      // XOR reg,reg
3099                 regimmed_set(reg,value);
3100                 regv = 0;
3101             }
3102             else
3103                 genregs(cdb,0x30,reg,reg);      // XOR regL,regL
3104             flags &= ~mPSW;                     // flags already set by XOR
3105         }
3106         else
3107         {
3108             cdb.genc2(0xC6,modregrmx(3,0,reg),value);  // MOV regL,value
3109             if (reg >= 4 && I64)
3110             {
3111                 code_orrex(cdb.last(), REX);
3112             }
3113         }
3114     L2:
3115         if (flags & mPSW)
3116             genregs(cdb,0x84,reg,reg);            // TEST regL,regL
3117 
3118         if (regm)
3119             // Set just the 'L' part of the register value
3120             regimmed_set(reg,(regv & ~cast(targ_size_t)0xFF) | value);
3121         else if (flags & 16 && reg & 4 && regcon.immed.mval & mask(reg & 3))
3122             // Set just the 'H' part of the register value
3123             regimmed_set((reg & 3),(regv & ~cast(targ_size_t)0xFF00) | (value << 8));
3124         return;
3125     }
3126 L3:
3127     if (I16)
3128         value = cast(targ_short) value;             // sign-extend MSW
3129     else if (I32)
3130         value = cast(targ_int) value;
3131 
3132     if (!I16 && flags & 2)                      // load 16 bit value
3133     {
3134         value &= 0xFFFF;
3135         if (value && !(flags & mPSW))
3136         {
3137             cdb.genc2(0xC7,modregrmx(3,0,reg),value); // MOV reg,value
3138             regimmed_set(reg, value);
3139             return;
3140         }
3141     }
3142 
3143     // If we already have the right value in the right register
3144     if (regm && (regv & 0xFFFFFFFF) == (value & 0xFFFFFFFF) && !(flags & 64))
3145     {
3146         if (flags & mPSW)
3147             gentstreg(cdb,reg);
3148     }
3149     else if (flags & 64 && regm && regv == value)
3150     {   // Look at the full 64 bits
3151         if (flags & mPSW)
3152         {
3153             gentstreg(cdb,reg);
3154             code_orrex(cdb.last(), REX_W);
3155         }
3156     }
3157     else
3158     {
3159         if (flags & mPSW)
3160         {
3161             switch (value)
3162             {
3163                 case 0:
3164                     genregs(cdb,0x31,reg,reg);
3165                     break;
3166 
3167                 case 1:
3168                     if (I64)
3169                         goto L4;
3170                     genregs(cdb,0x31,reg,reg);
3171                     goto inc;
3172 
3173                 case ~cast(targ_size_t)0:
3174                     if (I64)
3175                         goto L4;
3176                     genregs(cdb,0x31,reg,reg);
3177                     goto dec;
3178 
3179                 default:
3180                 L4:
3181                     if (flags & 64)
3182                     {
3183                         cdb.genc2(0xB8 + (reg&7),REX_W << 16 | (reg&8) << 13,value); // MOV reg,value64
3184                         gentstreg(cdb,reg);
3185                         code_orrex(cdb.last(), REX_W);
3186                     }
3187                     else
3188                     {
3189                         value &= 0xFFFFFFFF;
3190                         cdb.genc2(0xB8 + (reg&7),(reg&8) << 13,value); // MOV reg,value
3191                         gentstreg(cdb,reg);
3192                     }
3193                     break;
3194             }
3195         }
3196         else
3197         {
3198             // Look for single byte conversion
3199             if (regcon.immed.mval & mAX)
3200             {
3201                 if (I32)
3202                 {
3203                     if (reg == AX && value == cast(targ_short) regv)
3204                     {
3205                         cdb.gen1(0x98);               // CWDE
3206                         goto done;
3207                     }
3208                     if (reg == DX &&
3209                         value == (regcon.immed.value[AX] & 0x80000000 ? 0xFFFFFFFF : 0) &&
3210                         !(config.flags4 & CFG4speed && config.target_cpu >= TARGET_Pentium)
3211                        )
3212                     {
3213                         cdb.gen1(0x99);               // CDQ
3214                         goto done;
3215                     }
3216                 }
3217                 else if (I16)
3218                 {
3219                     if (reg == AX &&
3220                         cast(targ_short) value == cast(byte) regv)
3221                     {
3222                         cdb.gen1(0x98);               // CBW
3223                         goto done;
3224                     }
3225 
3226                     if (reg == DX &&
3227                         cast(targ_short) value == (regcon.immed.value[AX] & 0x8000 ? cast(targ_short) 0xFFFF : cast(targ_short) 0) &&
3228                         !(config.flags4 & CFG4speed && config.target_cpu >= TARGET_Pentium)
3229                        )
3230                     {
3231                         cdb.gen1(0x99);               // CWD
3232                         goto done;
3233                     }
3234                 }
3235             }
3236             if (value == 0 && !(flags & 8) && config.target_cpu >= TARGET_80486)
3237             {
3238                 genregs(cdb,0x31,reg,reg);              // XOR reg,reg
3239                 goto done;
3240             }
3241 
3242             if (!I64 && regm && !(flags & 8))
3243             {
3244                 if (regv + 1 == value ||
3245                     // Catch case of (0xFFFF+1 == 0) for 16 bit compiles
3246                     (I16 && cast(targ_short)(regv + 1) == cast(targ_short)value))
3247                 {
3248                 inc:
3249                     cdb.gen1(0x40 + reg);     // INC reg
3250                     goto done;
3251                 }
3252                 if (regv - 1 == value)
3253                 {
3254                 dec:
3255                     cdb.gen1(0x48 + reg);     // DEC reg
3256                     goto done;
3257                 }
3258             }
3259 
3260             // See if another register has the right value
3261             r = 0;
3262             for (mreg = regcon.immed.mval; mreg; mreg >>= 1)
3263             {
3264                 debug
3265                 assert(!I16 || regcon.immed.value[r] == cast(targ_short)regcon.immed.value[r]);
3266 
3267                 if (mreg & 1 && regcon.immed.value[r] == value)
3268                 {
3269                     genmovreg(cdb,reg,r);
3270                     goto done;
3271                 }
3272                 r++;
3273             }
3274 
3275             if (value == 0 && !(flags & 8))
3276             {
3277                 genregs(cdb,0x31,reg,reg);              // XOR reg,reg
3278             }
3279             else
3280             {   // See if we can just load a byte
3281                 if (regm & BYTEREGS &&
3282                     !(config.flags4 & CFG4speed && config.target_cpu >= TARGET_PentiumPro)
3283                    )
3284                 {
3285                     if ((regv & ~cast(targ_size_t)0xFF) == (value & ~cast(targ_size_t)0xFF))
3286                     {
3287                         movregconst(cdb,reg,value,(flags & 8) |4|1);  // load regL
3288                         return;
3289                     }
3290                     if (regm & (mAX|mBX|mCX|mDX) &&
3291                         (regv & ~cast(targ_size_t)0xFF00) == (value & ~cast(targ_size_t)0xFF00) &&
3292                         !I64)
3293                     {
3294                         movregconst(cdb,4|reg,value >> 8,(flags & 8) |4|1|16); // load regH
3295                         return;
3296                     }
3297                 }
3298                 if (flags & 64)
3299                     cdb.genc2(0xB8 + (reg&7),REX_W << 16 | (reg&8) << 13,value); // MOV reg,value64
3300                 else
3301                 {
3302                     value &= 0xFFFFFFFF;
3303                     cdb.genc2(0xB8 + (reg&7),(reg&8) << 13,value); // MOV reg,value
3304                 }
3305             }
3306         }
3307     done:
3308         regimmed_set(reg,value);
3309     }
3310 }
3311 
3312 /**************************
3313  * Generate a jump instruction.
3314  */
3315 
3316 void genjmp(ref CodeBuilder cdb,opcode_t op,uint fltarg,block *targ)
3317 {
3318     code cs;
3319     cs.Iop = op & 0xFF;
3320     cs.Iflags = 0;
3321     cs.Irex = 0;
3322     if (op != JMP && op != 0xE8)        // if not already long branch
3323           cs.Iflags = CFjmp16;          // assume long branch for op = 0x7x
3324     cs.IFL2 = cast(ubyte)fltarg;        // FLblock (or FLcode)
3325     cs.IEV2.Vblock = targ;              // target block (or code)
3326     if (fltarg == FLcode)
3327         (cast(code *)targ).Iflags |= CFtarg;
3328 
3329     if (config.flags4 & CFG4fastfloat)  // if fast floating point
3330     {
3331         cdb.gen(&cs);
3332         return;
3333     }
3334 
3335     switch (op & 0xFF00)                // look at second jump opcode
3336     {
3337         // The JP and JNP come from floating point comparisons
3338         case JP << 8:
3339             cdb.gen(&cs);
3340             cs.Iop = JP;
3341             cdb.gen(&cs);
3342             break;
3343 
3344         case JNP << 8:
3345         {
3346             // Do a JP around the jump instruction
3347             code *cnop = gennop(null);
3348             genjmp(cdb,JP,FLcode,cast(block *) cnop);
3349             cdb.gen(&cs);
3350             cdb.append(cnop);
3351             break;
3352         }
3353 
3354         case 1 << 8:                    // toggled no jump
3355         case 0 << 8:
3356             cdb.gen(&cs);
3357             break;
3358 
3359         default:
3360             debug
3361             printf("jop = x%x\n",op);
3362             assert(0);
3363     }
3364 }
3365 
3366 /*********************************************
3367  * Generate first part of prolog for interrupt function.
3368  */
3369 void prolog_ifunc(ref CodeBuilder cdb, tym_t* tyf)
3370 {
3371     static immutable ubyte[4] ops2 = [ 0x60,0x1E,0x06,0 ];
3372     static immutable ubyte[11] ops0 = [ 0x50,0x51,0x52,0x53,
3373                                     0x54,0x55,0x56,0x57,
3374                                     0x1E,0x06,0 ];
3375 
3376     immutable(ubyte)* p = (config.target_cpu >= TARGET_80286) ? ops2.ptr : ops0.ptr;
3377     do
3378         cdb.gen1(*p);
3379     while (*++p);
3380 
3381     genregs(cdb,0x8B,BP,SP);     // MOV BP,SP
3382     if (localsize)
3383         cod3_stackadj(cdb, cast(int)localsize);
3384 
3385     *tyf |= mTYloadds;
3386 }
3387 
/*********************************************
 * Generate the part of the prolog that reloads DS, and CLD for
 * interrupt functions.
 * Params:
 *      cdb = where to write code to
 *      tyf = function type flags; DS is reloaded if mTYloadds is set
 *      tym = basic function type; TYifunc gets a CLD
 *      pushds = true if DS has already been pushed
 */
void prolog_ifunc2(ref CodeBuilder cdb, tym_t tyf, tym_t tym, bool pushds)
{
    /* Determine if we need to reload DS        */
    const bool reloadDS = (tyf & mTYloadds) != 0;
    if (reloadDS)
    {
        if (!pushds)                           // if not already pushed
            cdb.gen1(0x1E);                    // PUSH DS
        spoff += _tysize[TYint];

        cdb.genc(0xC7,modregrm(3,0,AX),0,0,FLdatseg,cast(targ_uns) 0); // MOV  AX,DGROUP
        code *movax = cdb.last();
        movax.IEV2.Vseg = DATA;
        movax.Iflags ^= CFseg | CFoff;         // turn off CFoff, on CFseg

        cdb.gen2(0x8E,modregrm(3,3,AX));       // MOV  DS,AX
        useregs(mAX);
    }

    const bool isInterrupt = (tym == TYifunc);
    if (isInterrupt)
        cdb.gen1(0xFC);                        // CLD
}
3407 
/*********************************************
 * Generate prolog for a far function in 16 bit Windows code.
 * Depending on config.wflags this loads AX with the new DS value (from
 * DGROUP, SS or DS), sets up the BP frame, pushes the old DS and loads DS
 * from AX.
 * Params:
 *      cdb = where to write code to
 *      tyf = function type flags; mTYloadds is cleared for the
 *            WFdgroup with WFreduced case
 *      pushds = set to true if a PUSH DS was generated, otherwise untouched
 */
void prolog_16bit_windows_farfunc(ref CodeBuilder cdb, tym_t* tyf, bool* pushds)
{
    int wflags = config.wflags;
    if (wflags & WFreduced && !(*tyf & mTYexport))
        wflags &= ~(WFdgroup | WFds | WFss);    // reduced prolog/epilog for non-exported functions

    getregsNoSave(mAX);                     // should not have any value in AX

    const int dssource = wflags & (WFdgroup | WFds | WFss);
    switch (dssource)
    {
        case WFdgroup:                      // MOV  AX,DGROUP
        {
            if (wflags & WFreduced)
                *tyf &= ~mTYloadds;          // remove redundancy
            cdb.genc(0xC7,modregrm(3,0,AX),0,0,FLdatseg,cast(targ_uns) 0);
            code *movax = cdb.last();
            movax.IEV2.Vseg = DATA;
            movax.Iflags ^= CFseg | CFoff;     // turn off CFoff, on CFseg
            break;
        }

        case WFss:
        case WFds:
        {
            const int segreg = (dssource == WFss) ? 2       // SS
                                                  : 3;      // DS
            cdb.gen2(0x8C,modregrm(3,segreg,AX)); // MOV AX,segreg
            if (wflags & WFds)
                cdb.gen1(0x90);             // NOP
            break;
        }

        case 0:
            break;

        default:
            debug
            printf("config.wflags = x%x\n",config.wflags);
            assert(0);
    }

    if (wflags & WFincbp)
        cdb.gen1(0x40 + BP);              // INC  BP
    cdb.gen1(0x50 + BP);                  // PUSH BP
    genregs(cdb,0x8B,BP,SP);              // MOV  BP,SP

    const bool saveDS = (wflags & (WFsaveds | WFds | WFss | WFdgroup)) != 0;
    if (saveDS)
    {
        cdb.gen1(0x1E);                       // PUSH DS
        *pushds = true;
        BPoff = -REGSIZE;
    }

    const bool loadDS = (wflags & (WFds | WFss | WFdgroup)) != 0;
    if (loadDS)
        cdb.gen2(0x8E,modregrm(3,3,AX));      // MOV  DS,AX
}
3465 
3466 /**********************************************
3467  * Set up frame register.
3468  * Params:
3469  *      cdb        = write generated code here
3470  *      farfunc    = true if a far function
3471  *      enter      = set to true if ENTER instruction can be used, false otherwise
3472  *      xlocalsize = amount of local variables, set to amount to be subtracted from stack pointer
3473  *      cfa_offset = set to frame pointer's offset from the CFA
3474  * Returns:
3475  *      generated code
3476  */
3477 void prolog_frame(ref CodeBuilder cdb, bool farfunc, ref uint xlocalsize, out bool enter, out int cfa_offset)
3478 {
3479     //printf("prolog_frame\n");
3480     cfa_offset = 0;
3481 
3482     if (0 && config.exe == EX_WIN64)
3483     {
3484         // PUSH RBP
3485         // LEA RBP,0[RSP]
3486         cdb. gen1(0x50 + BP);
3487         cdb.genc1(LEA,(REX_W<<16) | (modregrm(0,4,SP)<<8) | modregrm(2,BP,4),FLconst,0);
3488         enter = false;
3489         return;
3490     }
3491 
3492     if (config.wflags & WFincbp && farfunc)
3493         cdb.gen1(0x40 + BP);      // INC  BP
3494     if (config.target_cpu < TARGET_80286 ||
3495         config.exe & (EX_LINUX | EX_LINUX64 | EX_OSX | EX_OSX64 | EX_FREEBSD | EX_FREEBSD64 | EX_DRAGONFLYBSD64 | EX_SOLARIS | EX_SOLARIS64 | EX_WIN64) ||
3496         !localsize ||
3497         config.flags & CFGstack ||
3498         (xlocalsize >= 0x1000 && config.exe & EX_flat) ||
3499         localsize >= 0x10000 ||
3500         (NTEXCEPTIONS == 2 &&
3501          (usednteh & (NTEH_try | NTEH_except | NTEHcpp | EHcleanup | EHtry | NTEHpassthru) && (config.ehmethod == EHmethod.EH_WIN32 && !(funcsym_p.Sfunc.Fflags3 & Feh_none) || config.ehmethod == EHmethod.EH_SEH))) ||
3502         (config.target_cpu >= TARGET_80386 &&
3503          config.flags4 & CFG4speed)
3504        )
3505     {
3506         cdb.gen1(0x50 + BP);      // PUSH BP
3507         genregs(cdb,0x8B,BP,SP);      // MOV  BP,SP
3508         if (I64)
3509             code_orrex(cdb.last(), REX_W);   // MOV RBP,RSP
3510         if ((config.objfmt & (OBJ_ELF | OBJ_MACH)) && config.fulltypes)
3511             // Don't reorder instructions, as dwarf CFA relies on it
3512             code_orflag(cdb.last(), CFvolatile);
3513 static if (NTEXCEPTIONS == 2)
3514 {
3515         if (usednteh & (NTEH_try | NTEH_except | NTEHcpp | EHcleanup | EHtry | NTEHpassthru) && (config.ehmethod == EHmethod.EH_WIN32 && !(funcsym_p.Sfunc.Fflags3 & Feh_none) || config.ehmethod == EHmethod.EH_SEH))
3516         {
3517             nteh_prolog(cdb);
3518             int sz = nteh_contextsym_size();
3519             assert(sz != 0);        // should be 5*4, not 0
3520             xlocalsize -= sz;      // sz is already subtracted from ESP
3521                                     // by nteh_prolog()
3522         }
3523 }
3524         if (config.fulltypes == CVDWARF_C || config.fulltypes == CVDWARF_D ||
3525             config.ehmethod == EHmethod.EH_DWARF)
3526         {
3527             int off = 2 * REGSIZE;      // 1 for the return address + 1 for the PUSH EBP
3528             dwarf_CFA_set_loc(1);           // address after PUSH EBP
3529             dwarf_CFA_set_reg_offset(SP, off); // CFA is now 8[ESP]
3530             dwarf_CFA_offset(BP, -off);       // EBP is at 0[ESP]
3531             dwarf_CFA_set_loc(I64 ? 4 : 3);   // address after MOV EBP,ESP
3532             /* Oddly, the CFA is not the same as the frame pointer,
3533              * which is why the offset of BP is set to 8
3534              */
3535             dwarf_CFA_set_reg_offset(BP, off);        // CFA is now 0[EBP]
3536             cfa_offset = off;  // remember the difference between the CFA and the frame pointer
3537         }
3538         enter = false;              /* do not use ENTER instruction */
3539     }
3540     else
3541         enter = true;
3542 }
3543 
3544 /**********************************************
3545  * Enforce stack alignment.
3546  * Input:
3547  *      cdb     code builder.
3548  * Returns:
3549  *      generated code
3550  */
3551 void prolog_stackalign(ref CodeBuilder cdb)
3552 {
3553     if (!enforcealign)
3554         return;
3555 
3556     const offset = (hasframe ? 2 : 1) * REGSIZE;   // 1 for the return address + 1 for the PUSH EBP
3557     if (offset & (STACKALIGN - 1) || TARGET_STACKALIGN < STACKALIGN)
3558         cod3_stackalign(cdb, STACKALIGN);
3559 }
3560 
/**********************************************
 * Allocate `xlocalsize` bytes of stack for the function being compiled.
 *
 * Depending on the configuration this is done with a stack probing
 * sequence (a call to the chkstk runtime function for 16 bit code, an inline
 * page-touching loop otherwise), an ENTER instruction, a single PUSH,
 * or a plain stack adjustment.
 * Params:
 *      cdb = code builder to append to
 *      tyf = function type; TYmfunc selects CX instead of AX as the register to PUSH
 *      xlocalsize = number of bytes to allocate
 *      enter = true to allocate with an ENTER instruction
 *      pushalloc = set to true when the allocation was done with a PUSH
 */
void prolog_frameadj(ref CodeBuilder cdb, tym_t tyf, uint xlocalsize, bool enter, bool* pushalloc)
{
    const uint pushreg = (tyf == TYmfunc) ? CX : AX;   // register used for a PUSH allocation

static if (TARGET_LINUX)
{
    bool probe = false;               // seems that Linux doesn't need to fault in stack pages
}
else
{
    // Probe when stack overflow checking is on, or for Windows flat model frames of a page or more
    bool probe = (config.flags & CFGstack && !(I32 && xlocalsize < 0x1000)) // if stack overflow check
        || (TARGET_WINDOS && xlocalsize >= 0x1000 && config.exe & EX_flat);
}

    if (!probe)
    {
        if (enter)
        {
            // ENTER xlocalsize,0
            cdb.genc(ENTER,0,FLconst,xlocalsize,FLconst,cast(targ_uns) 0);
            // No Dwarf data was emitted for this form of frame setup
            assert(!(config.fulltypes == CVDWARF_C || config.fulltypes == CVDWARF_D));
        }
        else if (xlocalsize == REGSIZE && config.flags4 & CFG4optimized)
        {
            cdb.gen1(0x50 + pushreg);           // PUSH AX (or CX)
            // Marked volatile so that an -x[EBP] reference does not get
            // moved in front of the push.
            code_orflag(cdb.last(),CFvolatile);
            *pushalloc = true;
        }
        else
            cod3_stackadj(cdb, xlocalsize);
    }
    else if (I16)
    {
        // BUG: Won't work if parameter is passed in AX
        movregconst(cdb,AX,xlocalsize,false);   // MOV AX,localsize
        Symbol* chkstk = getRtlsym(RTLSYM_CHKSTK);
        makeitextern(chkstk);
        // CALL _chkstk
        cdb.gencs((LARGECODE) ? 0x9A : CALL,0,FLfunc,chkstk);
        useregs((ALLREGS | mBP | mES) & ~chkstk.Sregsaved);
    }
    else
    {
        /* Touch the stack one 0x1000 byte page at a time:
         *
         *      MOV     EDX, xlocalsize/0x1000
         *  L1: SUB     ESP, 0x1000
         *      TEST    [ESP],ESP
         *      DEC     EDX
         *      JNE     L1
         *      SUB     ESP, xlocalsize % 0x1000
         *
         * Watch out for 64 bit code where EDX is passed as a register
         * parameter: R11 is the loop counter there.
         */
        const reg_t counter = I64 ? R11 : DX;

        movregconst(cdb, counter, xlocalsize / 0x1000, false);
        cod3_stackadj(cdb, 0x1000);                             // L1: SUB ESP,0x1000
        code_orflag(cdb.last(), CFtarg2);                       // this is the target of the JNE
        cdb.gen2sib(0x85, modregrm(0,SP,4),modregrm(0,4,SP));   // TEST [ESP],ESP
        if (!I64)
        {
            cdb.gen1(0x48 + DX);                    // DEC EDX
            cdb.genc2(JNE,0,cast(targ_uns)-12);     // JNE L1
        }
        else
        {
            cdb.gen2(0xFF, modregrmx(3,1,R11));     // DEC R11D
            cdb.genc2(JNE,0,cast(targ_uns)-15);     // JNE L1
        }
        regimmed_set(counter,0);                    // counter is now 0
        cod3_stackadj(cdb, xlocalsize & 0xFFF);     // allocate what is left over
        useregs(mask(counter));
    }
}
3633 
/**********************************************
 * Allocate `xlocalsize` bytes of stack, using one or two PUSH instructions
 * when the size is exactly one or two registers, and cod3_stackadj() otherwise.
 * Params:
 *      cdb = code builder to append to
 *      tyf = function type; TYmfunc selects CX instead of AX as the register to PUSH
 *      xlocalsize = number of bytes to allocate
 *      pushalloc = set to true when the allocation was done with PUSH
 */
void prolog_frameadj2(ref CodeBuilder cdb, tym_t tyf, uint xlocalsize, bool* pushalloc)
{
    const uint pushreg = (tyf == TYmfunc) ? CX : AX;

    const bool onePush   = xlocalsize == REGSIZE;
    const bool twoPushes = !onePush && xlocalsize == 2 * REGSIZE;

    if (!onePush && !twoPushes)
    {
        // Any other size: adjust the stack pointer directly
        cod3_stackadj(cdb, xlocalsize);
        return;
    }

    cdb.gen1(0x50 + pushreg);           // PUSH AX (or CX)
    if (twoPushes)
        cdb.gen1(0x50 + pushreg);       // PUSH AX (or CX) again
    *pushalloc = true;
}
3651 
/**********************************************
 * Store the magic parameter used by alloca() into the frame slot at
 * Alloca.offset.
 * Params:
 *      cdb = code builder to append to
 */
void prolog_setupalloca(ref CodeBuilder cdb)
{
    //printf("prolog_setupalloca() offset x%x size x%x alignment x%x\n",
        //cast(int)Alloca.offset, cast(int)Alloca.size, cast(int)Alloca.alignment);

    const slot  = Alloca.offset + BPoff;    // BP relative displacement of the slot
    const magic = localsize - BPoff;        // value alloca() expects to find there

    // MOV slot[BP],magic
    cdb.genc(0xC7, modregrm(2,0,BPRM), FLconst, slot, FLconst, magic);
    if (I64)
        code_orrex(cdb.last(), REX_W);      // make it a 64 bit store
}
3664 
3665 /**************************************
3666  * Save registers that the function destroys,
3667  * but that the ABI says should be preserved across
3668  * function calls.
3669  *
3670  * Emit Dwarf info for these saves.
3671  * Params:
3672  *      cdb = append generated instructions to this
3673  *      topush = mask of registers to push
3674  *      cfa_offset = offset of frame pointer from CFA
3675  */
3676 
3677 void prolog_saveregs(ref CodeBuilder cdb, regm_t topush, int cfa_offset)
3678 {
3679     if (pushoffuse)
3680     {
3681         // Save to preallocated section in the stack frame
3682         int xmmtopush = numbitsset(topush & XMMREGS);   // XMM regs take 16 bytes
3683         int gptopush = numbitsset(topush) - xmmtopush;  // general purpose registers to save
3684         targ_size_t xmmoffset = pushoff + BPoff;
3685         if (!hasframe || enforcealign)
3686             xmmoffset += EBPtoESP;
3687         targ_size_t gpoffset = xmmoffset + xmmtopush * 16;
3688         while (topush)
3689         {
3690             reg_t reg = findreg(topush);
3691             topush &= ~mask(reg);
3692             if (isXMMreg(reg))
3693             {
3694                 if (hasframe && !enforcealign)
3695                 {
3696                     // MOVUPD xmmoffset[EBP],xmm
3697                     cdb.genc1(STOUPD,modregxrm(2,reg-XMM0,BPRM),FLconst,xmmoffset);
3698                 }
3699                 else
3700                 {
3701                     // MOVUPD xmmoffset[ESP],xmm
3702                     cdb.genc1(STOUPD,modregxrm(2,reg-XMM0,4) + 256*modregrm(0,4,SP),FLconst,xmmoffset);
3703                 }
3704                 xmmoffset += 16;
3705             }
3706             else
3707             {
3708                 if (hasframe && !enforcealign)
3709                 {
3710                     // MOV gpoffset[EBP],reg
3711                     cdb.genc1(0x89,modregxrm(2,reg,BPRM),FLconst,gpoffset);
3712                 }
3713                 else
3714                 {
3715                     // MOV gpoffset[ESP],reg
3716                     cdb.genc1(0x89,modregxrm(2,reg,4) + 256*modregrm(0,4,SP),FLconst,gpoffset);
3717                 }
3718                 if (I64)
3719                     code_orrex(cdb.last(), REX_W);
3720                 if (config.fulltypes == CVDWARF_C || config.fulltypes == CVDWARF_D ||
3721                     config.ehmethod == EHmethod.EH_DWARF)
3722                 {   // Emit debug_frame data giving location of saved register
3723                     code *c = cdb.finish();
3724                     pinholeopt(c, null);
3725                     dwarf_CFA_set_loc(calcblksize(c));  // address after save
3726                     dwarf_CFA_offset(reg, cast(int)(gpoffset - cfa_offset));
3727                     cdb.reset();
3728                     cdb.append(c);
3729                 }
3730                 gpoffset += REGSIZE;
3731             }
3732         }
3733     }
3734     else
3735     {
3736         while (topush)                      /* while registers to push      */
3737         {
3738             reg_t reg = findreg(topush);
3739             topush &= ~mask(reg);
3740             if (isXMMreg(reg))
3741             {
3742                 // SUB RSP,16
3743                 cod3_stackadj(cdb, 16);
3744                 // MOVUPD 0[RSP],xmm
3745                 cdb.genc1(STOUPD,modregxrm(2,reg-XMM0,4) + 256*modregrm(0,4,SP),FLconst,0);
3746                 EBPtoESP += 16;
3747                 spoff += 16;
3748             }
3749             else
3750             {
3751                 genpush(cdb, reg);
3752                 EBPtoESP += REGSIZE;
3753                 spoff += REGSIZE;
3754                 if (config.fulltypes == CVDWARF_C || config.fulltypes == CVDWARF_D ||
3755                     config.ehmethod == EHmethod.EH_DWARF)
3756                 {   // Emit debug_frame data giving location of saved register
3757                     // relative to 0[EBP]
3758                     code *c = cdb.finish();
3759                     pinholeopt(c, null);
3760                     dwarf_CFA_set_loc(calcblksize(c));  // address after PUSH reg
3761                     dwarf_CFA_offset(reg, -EBPtoESP - cfa_offset);
3762                     cdb.reset();
3763                     cdb.append(c);
3764                 }
3765             }
3766         }
3767     }
3768 }
3769 
3770 /**************************************
3771  * Undo prolog_saveregs()
3772  */
3773 
3774 private void epilog_restoreregs(ref CodeBuilder cdb, regm_t topop)
3775 {
3776     debug
3777     if (topop & ~(XMMREGS | 0xFFFF))
3778         printf("fregsaved = %s, mfuncreg = %s\n",regm_str(fregsaved),regm_str(mfuncreg));
3779 
3780     assert(!(topop & ~(XMMREGS | 0xFFFF)));
3781     if (pushoffuse)
3782     {
3783         // Save to preallocated section in the stack frame
3784         int xmmtopop = numbitsset(topop & XMMREGS);   // XMM regs take 16 bytes
3785         int gptopop = numbitsset(topop) - xmmtopop;   // general purpose registers to save
3786         targ_size_t xmmoffset = pushoff + BPoff;
3787         if (!hasframe || enforcealign)
3788             xmmoffset += EBPtoESP;
3789         targ_size_t gpoffset = xmmoffset + xmmtopop * 16;
3790         while (topop)
3791         {
3792             reg_t reg = findreg(topop);
3793             topop &= ~mask(reg);
3794             if (isXMMreg(reg))
3795             {
3796                 if (hasframe && !enforcealign)
3797                 {
3798                     // MOVUPD xmm,xmmoffset[EBP]
3799                     cdb.genc1(LODUPD,modregxrm(2,reg-XMM0,BPRM),FLconst,xmmoffset);
3800                 }
3801                 else
3802                 {
3803                     // MOVUPD xmm,xmmoffset[ESP]
3804                     cdb.genc1(LODUPD,modregxrm(2,reg-XMM0,4) + 256*modregrm(0,4,SP),FLconst,xmmoffset);
3805                 }
3806                 xmmoffset += 16;
3807             }
3808             else
3809             {
3810                 if (hasframe && !enforcealign)
3811                 {
3812                     // MOV reg,gpoffset[EBP]
3813                     cdb.genc1(0x8B,modregxrm(2,reg,BPRM),FLconst,gpoffset);
3814                 }
3815                 else
3816                 {
3817                     // MOV reg,gpoffset[ESP]
3818                     cdb.genc1(0x8B,modregxrm(2,reg,4) + 256*modregrm(0,4,SP),FLconst,gpoffset);
3819                 }
3820                 if (I64)
3821                     code_orrex(cdb.last(), REX_W);
3822                 gpoffset += REGSIZE;
3823             }
3824         }
3825     }
3826     else
3827     {
3828         reg_t reg = I64 ? XMM7 : DI;
3829         if (!(topop & XMMREGS))
3830             reg = R15;
3831         regm_t regm = 1 << reg;
3832 
3833         while (topop)
3834         {   if (topop & regm)
3835             {
3836                 if (isXMMreg(reg))
3837                 {
3838                     // MOVUPD xmm,0[RSP]
3839                     cdb.genc1(LODUPD,modregxrm(2,reg-XMM0,4) + 256*modregrm(0,4,SP),FLconst,0);
3840                     // ADD RSP,16
3841                     cod3_stackadj(cdb, -16);
3842                 }
3843                 else
3844                 {
3845                     cdb.gen1(0x58 + (reg & 7));         // POP reg
3846                     if (reg & 8)
3847                         code_orrex(cdb.last(), REX_B);
3848                 }
3849                 topop &= ~regm;
3850             }
3851             regm >>= 1;
3852             reg--;
3853         }
3854     }
3855 }
3856 
version (SCPP)
{
/**************************************
 * Generate a call to the trace prolog runtime function, followed inline
 * by the name of the function being compiled.
 * Params:
 *      cdb = code builder to append to
 *      farfunc = selects RTLSYM_TRACE_PRO_F rather than RTLSYM_TRACE_PRO_N
 *      regsaved = set to the Sregsaved mask of the runtime function called
 */
void prolog_trace(ref CodeBuilder cdb, bool farfunc, uint* regsaved)
{
    Symbol *s = getRtlsym(farfunc ? RTLSYM_TRACE_PRO_F : RTLSYM_TRACE_PRO_N);
    makeitextern(s);
    cdb.gencs(I16 ? 0x9A : CALL,0,FLfunc,s);      // CALL _trace
    if (!I16)
        code_orflag(cdb.last(),CFoff | CFselfrel);
    /* Embedding the function name inline after the call works, but it
     * makes disassembling the code annoying.
     */
    static if (ELFOBJ || MACHOBJ)
    {
        // Generate length prefixed name that is recognized by profiler
        size_t len = strlen(funcsym_p.Sident);
        // +4 leaves room for the longer of the two prefixes below
        char *buffer = cast(char *)malloc(len + 4);
        assert(buffer);
        if (len <= 254)
        {
            // Short form: one length byte, then the name
            buffer[0] = len;
            memcpy(buffer + 1, funcsym_p.Sident, len);
            len++;
        }
        else
        {
            // Long form: 0xFF, 0, then the length as two bytes (low byte first), then the name
            buffer[0] = 0xFF;
            buffer[1] = 0;
            buffer[2] = len & 0xFF;
            buffer[3] = len >> 8;
            memcpy(buffer + 4, funcsym_p.Sident, len);
            len += 4;
        }
        cdb.genasm(buffer, len);         // append func name
        free(buffer);
    }
    else
    {
        // Other object formats: append whatever objmod.mangle() writes for the function
        char [IDMAX+IDOHD+1] name = void;
        size_t len = objmod.mangle(funcsym_p,name.ptr);
        assert(len < name.length);
        cdb.genasm(name.ptr,len);             // append func name
    }
    *regsaved = s.Sregsaved;
}
}
3903 
3904 /******************************
3905  * Generate special varargs prolog for Posix 64 bit systems.
3906  * Params:
3907  *      cdb = sink for generated code
3908  *      sv = symbol for __va_argsave
3909  *      namedargs = registers that named parameters (not ... arguments) were passed in.
3910  */
3911 void prolog_genvarargs(ref CodeBuilder cdb, Symbol* sv, regm_t namedargs)
3912 {
3913     /* Generate code to move any arguments passed in registers into
3914      * the stack variable __va_argsave,
3915      * so we can reference it via pointers through va_arg().
3916      *   struct __va_argsave_t {
3917      *     size_t[6] regs;
3918      *     real[8] fpregs;
3919      *     uint offset_regs;
3920      *     uint offset_fpregs;
3921      *     void* stack_args;
3922      *     void* reg_args;
3923      *   }
3924      * The MOVAPS instructions seg fault if data is not aligned on
3925      * 16 bytes, so this gives us a nice check to ensure no mistakes.
3926         MOV     voff+0*8[RBP],EDI
3927         MOV     voff+1*8[RBP],ESI
3928         MOV     voff+2*8[RBP],RDX
3929         MOV     voff+3*8[RBP],RCX
3930         MOV     voff+4*8[RBP],R8
3931         MOV     voff+5*8[RBP],R9
3932         MOVZX   EAX,AL                      // AL = 0..8, # of XMM registers used
3933         SHL     EAX,2                       // 4 bytes for each MOVAPS
3934         LEA     R11,offset L2[RIP]
3935         SUB     R11,RAX
3936         LEA     RAX,voff+6*8+0x7F[RBP]
3937         JMP     R11d
3938         MOVAPS  -0x0F[RAX],XMM7             // only save XMM registers if actually used
3939         MOVAPS  -0x1F[RAX],XMM6
3940         MOVAPS  -0x2F[RAX],XMM5
3941         MOVAPS  -0x3F[RAX],XMM4
3942         MOVAPS  -0x4F[RAX],XMM3
3943         MOVAPS  -0x5F[RAX],XMM2
3944         MOVAPS  -0x6F[RAX],XMM1
3945         MOVAPS  -0x7F[RAX],XMM0
3946       L2:
3947         MOV     1[RAX],offset_regs          // set __va_argsave.offset_regs
3948         MOV     5[RAX],offset_fpregs        // set __va_argsave.offset_fpregs
3949         LEA     R11, Para.size+Para.offset[RBP]
3950         MOV     9[RAX],R11                  // set __va_argsave.stack_args
3951         SUB     RAX,6*8+0x7F                // point to start of __va_argsave
3952         MOV     6*8+8*16+4+4+8[RAX],RAX     // set __va_argsave.reg_args
3953     * RAX and R11 are destroyed.
3954     */
3955 
3956     /* Save registers into the voff area on the stack
3957      */
3958     targ_size_t voff = Auto.size + BPoff + sv.Soffset;  // EBP offset of start of sv
3959     const int vregnum = 6;
3960     const uint vsize = vregnum * 8 + 8 * 16;
3961 
3962     static immutable ubyte[vregnum] regs = [ DI,SI,DX,CX,R8,R9 ];
3963 
3964     if (!hasframe || enforcealign)
3965         voff += EBPtoESP;
3966 
3967     for (int i = 0; i < vregnum; i++)
3968     {
3969         uint r = regs[i];
3970         if (!(mask(r) & namedargs))  // unnamed arguments would be the ... ones
3971         {
3972             uint ea = (REX_W << 16) | modregxrm(2,r,BPRM);
3973             if (!hasframe || enforcealign)
3974                 ea = (REX_W << 16) | (modregrm(0,4,SP) << 8) | modregxrm(2,r,4);
3975             cdb.genc1(0x89,ea,FLconst,voff + i*8);
3976         }
3977     }
3978 
3979     genregs(cdb,MOVZXb,AX,AX);                 // MOVZX EAX,AL
3980     cdb.genc2(0xC1,modregrm(3,4,AX),2);                     // SHL EAX,2
3981     int raxoff = cast(int)(voff+6*8+0x7F);
3982     uint L2offset = (raxoff < -0x7F) ? 0x2D : 0x2A;
3983     if (!hasframe || enforcealign)
3984         L2offset += 1;                                      // +1 for sib byte
3985     // LEA R11,offset L2[RIP]
3986     cdb.genc1(LEA,(REX_W << 16) | modregxrm(0,R11,5),FLconst,L2offset);
3987     genregs(cdb,0x29,AX,R11);                  // SUB R11,RAX
3988     code_orrex(cdb.last(), REX_W);
3989     // LEA RAX,voff+vsize-6*8-16+0x7F[RBP]
3990     uint ea = (REX_W << 16) | modregrm(2,AX,BPRM);
3991     if (!hasframe || enforcealign)
3992         // add sib byte for [RSP] addressing
3993         ea = (REX_W << 16) | (modregrm(0,4,SP) << 8) | modregxrm(2,AX,4);
3994     cdb.genc1(LEA,ea,FLconst,raxoff);
3995     cdb.gen2(0xFF,modregrmx(3,4,R11));                      // JMP R11d
3996     for (int i = 0; i < 8; i++)
3997     {
3998         // MOVAPS -15-16*i[RAX],XMM7-i
3999         cdb.genc1(0x0F29,modregrm(0,XMM7-i,0),FLconst,-15-16*i);
4000     }
4001 
4002     /* Compute offset_regs and offset_fpregs
4003      */
4004     uint offset_regs = 0;
4005     uint offset_fpregs = vregnum * 8;
4006     for (int i = AX; i <= XMM7; i++)
4007     {
4008         regm_t m = mask(i);
4009         if (m & namedargs)
4010         {
4011             if (m & (mDI|mSI|mDX|mCX|mR8|mR9))
4012                 offset_regs += 8;
4013             else if (m & XMMREGS)
4014                 offset_fpregs += 16;
4015             namedargs &= ~m;
4016             if (!namedargs)
4017                 break;
4018         }
4019     }
4020     // MOV 1[RAX],offset_regs
4021     cdb.genc(0xC7,modregrm(2,0,AX),FLconst,1,FLconst,offset_regs);
4022 
4023     // MOV 5[RAX],offset_fpregs
4024     cdb.genc(0xC7,modregrm(2,0,AX),FLconst,5,FLconst,offset_fpregs);
4025 
4026     // LEA R11, Para.size+Para.offset[RBP]
4027     ea = modregxrm(2,R11,BPRM);
4028     if (!hasframe)
4029         ea = (modregrm(0,4,SP) << 8) | modregrm(2,DX,4);
4030     Para.offset = (Para.offset + (REGSIZE - 1)) & ~(REGSIZE - 1);
4031     cdb.genc1(LEA,(REX_W << 16) | ea,FLconst,Para.size + Para.offset);
4032 
4033     // MOV 9[RAX],R11
4034     cdb.genc1(0x89,(REX_W << 16) | modregxrm(2,R11,AX),FLconst,9);
4035 
4036     // SUB RAX,6*8+0x7F             // point to start of __va_argsave
4037     cdb.genc2(0x2D,0,6*8+0x7F);
4038     code_orrex(cdb.last(), REX_W);
4039 
4040     // MOV 6*8+8*16+4+4+8[RAX],RAX  // set __va_argsave.reg_args
4041     cdb.genc1(0x89,(REX_W << 16) | modregrm(2,AX,AX),FLconst,6*8+8*16+4+4+8);
4042 
4043     pinholeopt(cdb.peek(), null);
4044     useregs(mAX|mR11);
4045 }
4046 
/******************************
 * Varargs prolog hook for Win64.
 *
 * The body is empty, so nothing is appended to `cdb`; the comment inside
 * only describes the Microsoft scheme.
 * Params:
 *      cdb = sink for generated code (unused)
 */
void prolog_gen_win64_varargs(ref CodeBuilder cdb)
{
    /* The Microsoft scheme.
     * http://msdn.microsoft.com/en-US/library/dd2wa36c(v=vs.80)
     * Copy registers onto stack.
         mov     8[RSP],RCX
         mov     010h[RSP],RDX
         mov     018h[RSP],R8
         mov     020h[RSP],R9
     */
}
4058 
4059 /************************************
4060  * Params:
4061  *      cdb = generated code sink
4062  *      tf = what's the type of the function
4063  *      pushalloc = use PUSH to allocate on the stack rather than subtracting from SP
4064  *      namedargs = set to the registers that named parameters were passed in
4065  */
4066 void prolog_loadparams(ref CodeBuilder cdb, tym_t tyf, bool pushalloc, out regm_t namedargs)
4067 {
4068     //printf("prolog_loadparams()\n");
4069     debug
4070     for (SYMIDX si = 0; si < globsym.length; si++)
4071     {
4072         Symbol *s = globsym[si];
4073         if (debugr && (s.Sclass == SCfastpar || s.Sclass == SCshadowreg))
4074         {
4075             printf("symbol '%s' is fastpar in register [l %s, m %s]\n", s.Sident.ptr,
4076                 regm_str(mask(s.Spreg)),
4077                 (s.Spreg2 == NOREG ? "NOREG" : regm_str(mask(s.Spreg2))));
4078             if (s.Sfl == FLreg)
4079                 printf("\tassigned to register %s\n", regm_str(mask(s.Sreglsw)));
4080         }
4081     }
4082 
4083     uint pushallocreg = (tyf == TYmfunc) ? CX : AX;
4084 
4085     /* Copy SCfastpar and SCshadowreg (parameters passed in registers) that were not assigned
4086      * registers into their stack locations.
4087      */
4088     regm_t shadowregm = 0;
4089     for (SYMIDX si = 0; si < globsym.length; si++)
4090     {
4091         Symbol *s = globsym[si];
4092         uint sz = cast(uint)type_size(s.Stype);
4093 
4094         if ((s.Sclass == SCfastpar || s.Sclass == SCshadowreg) && s.Sfl != FLreg)
4095         {   // Argument is passed in a register
4096 
4097             type *t = s.Stype;
4098             type *t2 = null;
4099 
4100             tym_t tyb = tybasic(t.Tty);
4101 
4102             // This logic is same as FuncParamRegs_alloc function at src/dmd/backend/cod1.d
4103             //
4104             // Find suitable SROA based on the element type
4105             // (Don't put volatile parameters in registers)
4106             if (tyb == TYarray && !(t.Tty & mTYvolatile))
4107             {
4108                 type *targ1;
4109                 argtypes(t, targ1, t2);
4110                 if (targ1)
4111                     t = targ1;
4112             }
4113 
4114             // If struct just wraps another type
4115             if (tyb == TYstruct)
4116             {
4117                 // On windows 64 bits, structs occupy a general purpose register,
4118                 // regardless of the struct size or the number & types of its fields.
4119                 if (config.exe != EX_WIN64)
4120                 {
4121                     type *targ1 = t.Ttag.Sstruct.Sarg1type;
4122                     t2 = t.Ttag.Sstruct.Sarg2type;
4123                     if (targ1)
4124                         t = targ1;
4125                 }
4126             }
4127 
4128             if (Symbol_Sisdead(s, anyiasm))
4129             {
4130                 // Ignore it, as it is never referenced
4131             }
4132             else
4133             {
4134                 targ_size_t offset = Fast.size + BPoff;
4135                 if (s.Sclass == SCshadowreg)
4136                     offset = Para.size;
4137                 offset += s.Soffset;
4138                 if (!hasframe || (enforcealign && s.Sclass != SCshadowreg))
4139                     offset += EBPtoESP;
4140 
4141                 reg_t preg = s.Spreg;
4142                 foreach (i; 0 .. 2)     // twice, once for each possible parameter register
4143                 {
4144                     shadowregm |= mask(preg);
4145                     opcode_t op = 0x89;                  // MOV x[EBP],preg
4146                     if (isXMMreg(preg))
4147                         op = xmmstore(t.Tty);
4148                     if (!(pushalloc && preg == pushallocreg) || s.Sclass == SCshadowreg)
4149                     {
4150                         if (hasframe && (!enforcealign || s.Sclass == SCshadowreg))
4151                         {
4152                             // MOV x[EBP],preg
4153                             cdb.genc1(op,modregxrm(2,preg,BPRM),FLconst,offset);
4154                             if (isXMMreg(preg))
4155                             {
4156                                 checkSetVex(cdb.last(), t.Tty);
4157                             }
4158                             else
4159                             {
4160                                 //printf("%s Fast.size = %d, BPoff = %d, Soffset = %d, sz = %d\n",
4161                                 //         s.Sident, (int)Fast.size, (int)BPoff, (int)s.Soffset, (int)sz);
4162                                 if (I64 && sz > 4)
4163                                     code_orrex(cdb.last(), REX_W);
4164                             }
4165                         }
4166                         else
4167                         {
4168                             // MOV offset[ESP],preg
4169                             // BUG: byte size?
4170                             cdb.genc1(op,
4171                                       (modregrm(0,4,SP) << 8) |
4172                                        modregxrm(2,preg,4),FLconst,offset);
4173                             if (isXMMreg(preg))
4174                             {
4175                                 checkSetVex(cdb.last(), t.Tty);
4176                             }
4177                             else
4178                             {
4179                                 if (I64 && sz > 4)
4180                                     cdb.last().Irex |= REX_W;
4181                             }
4182                         }
4183                     }
4184                     preg = s.Spreg2;
4185                     if (preg == NOREG)
4186                         break;
4187                     if (t2)
4188                         t = t2;
4189                     offset += REGSIZE;
4190                 }
4191             }
4192         }
4193     }
4194 
4195     if (config.exe == EX_WIN64 && variadic(funcsym_p.Stype))
4196     {
4197         /* The Microsoft scheme.
4198          * http://msdn.microsoft.com/en-US/library/dd2wa36c(v=vs.80)
4199          * Copy registers onto stack.
4200              mov     8[RSP],RCX or XMM0
4201              mov     010h[RSP],RDX or XMM1
4202              mov     018h[RSP],R8 or XMM2
4203              mov     020h[RSP],R9 or XMM3
4204          */
4205         static immutable reg_t[4] vregs = [ CX,DX,R8,R9 ];
4206         for (int i = 0; i < vregs.length; ++i)
4207         {
4208             uint preg = vregs[i];
4209             uint offset = cast(uint)(Para.size + i * REGSIZE);
4210             if (!(shadowregm & (mask(preg) | mask(XMM0 + i))))
4211             {
4212                 if (hasframe)
4213                 {
4214                     // MOV x[EBP],preg
4215                     cdb.genc1(0x89,
4216                                      modregxrm(2,preg,BPRM),FLconst, offset);
4217                     code_orrex(cdb.last(), REX_W);
4218                 }
4219                 else
4220                 {
4221                     // MOV offset[ESP],preg
4222                     cdb.genc1(0x89,
4223                                      (modregrm(0,4,SP) << 8) |
4224                                      modregxrm(2,preg,4),FLconst,offset + EBPtoESP);
4225                 }
4226                 cdb.last().Irex |= REX_W;
4227             }
4228         }
4229     }
4230 
4231     /* Copy SCfastpar and SCshadowreg (parameters passed in registers) that were assigned registers
4232      * into their assigned registers.
4233      * Note that we have a big problem if Pa is passed in R1 and assigned to R2,
4234      * and Pb is passed in R2 but assigned to R1. Detect it and assert.
4235      */
4236     regm_t assignregs = 0;
4237     for (SYMIDX si = 0; si < globsym.length; si++)
4238     {
4239         Symbol *s = globsym[si];
4240         uint sz = cast(uint)type_size(s.Stype);
4241 
4242         if (s.Sclass == SCfastpar || s.Sclass == SCshadowreg)
4243             namedargs |= s.Spregm();
4244 
4245         if ((s.Sclass == SCfastpar || s.Sclass == SCshadowreg) && s.Sfl == FLreg)
4246         {   // Argument is passed in a register
4247 
4248             type *t = s.Stype;
4249             type *t2 = null;
4250             if (tybasic(t.Tty) == TYstruct && config.exe != EX_WIN64)
4251             {   type *targ1 = t.Ttag.Sstruct.Sarg1type;
4252                 t2 = t.Ttag.Sstruct.Sarg2type;
4253                 if (targ1)
4254                     t = targ1;
4255             }
4256 
4257             reg_t preg = s.Spreg;
4258             reg_t r = s.Sreglsw;
4259             for (int i = 0; i < 2; ++i)
4260             {
4261                 if (preg == NOREG)
4262                     break;
4263                 assert(!(mask(preg) & assignregs));         // not already stepped on
4264                 assignregs |= mask(r);
4265 
4266                 // MOV reg,preg
4267                 if (r == preg)
4268                 {
4269                 }
4270                 else if (mask(preg) & XMMREGS)
4271                 {
4272                     const op = xmmload(t.Tty);      // MOVSS/D xreg,preg
4273                     uint xreg = r - XMM0;
4274                     cdb.gen2(op,modregxrmx(3,xreg,preg - XMM0));
4275                 }
4276                 else
4277                 {
4278                     //printf("test1 mov %s, %s\n", regstring[r], regstring[preg]);
4279                     genmovreg(cdb,r,preg);
4280                     if (I64 && sz == 8)
4281                         code_orrex(cdb.last(), REX_W);
4282                 }
4283                 preg = s.Spreg2;
4284                 r = s.Sregmsw;
4285                 if (t2)
4286                     t = t2;
4287             }
4288         }
4289     }
4290 
4291     /* For parameters that were passed on the stack, but are enregistered,
4292      * initialize the registers with the parameter stack values.
4293      * Do not use assignaddr(), as it will replace the stack reference with
4294      * the register.
4295      */
4296     for (SYMIDX si = 0; si < globsym.length; si++)
4297     {
4298         Symbol *s = globsym[si];
4299         uint sz = cast(uint)type_size(s.Stype);
4300 
4301         if ((s.Sclass == SCregpar || s.Sclass == SCparameter) &&
4302             s.Sfl == FLreg &&
4303             (refparam
4304                 // This variable has been reference by a nested function
4305                 || MARS && s.Stype.Tty & mTYvolatile
4306                 ))
4307         {
4308             // MOV reg,param[BP]
4309             //assert(refparam);
4310             if (mask(s.Sreglsw) & XMMREGS)
4311             {
4312                 const op = xmmload(s.Stype.Tty);  // MOVSS/D xreg,mem
4313                 uint xreg = s.Sreglsw - XMM0;
4314                 cdb.genc1(op,modregxrm(2,xreg,BPRM),FLconst,Para.size + s.Soffset);
4315                 if (!hasframe)
4316                 {   // Convert to ESP relative address rather than EBP
4317                     code *c = cdb.last();
4318                     c.Irm = cast(ubyte)modregxrm(2,xreg,4);
4319                     c.Isib = modregrm(0,4,SP);
4320                     c.IEV1.Vpointer += EBPtoESP;
4321                 }
4322             }
4323             else
4324             {
4325                 cdb.genc1(sz == 1 ? 0x8A : 0x8B,
4326                     modregxrm(2,s.Sreglsw,BPRM),FLconst,Para.size + s.Soffset);
4327                 code *c = cdb.last();
4328                 if (!I16 && sz == SHORTSIZE)
4329                     c.Iflags |= CFopsize; // operand size
4330                 if (I64 && sz >= REGSIZE)
4331                     c.Irex |= REX_W;
4332                 if (I64 && sz == 1 && s.Sreglsw >= 4)
4333                     c.Irex |= REX;
4334                 if (!hasframe)
4335                 {   // Convert to ESP relative address rather than EBP
4336                     assert(!I16);
4337                     c.Irm = cast(ubyte)modregxrm(2,s.Sreglsw,4);
4338                     c.Isib = modregrm(0,4,SP);
4339                     c.IEV1.Vpointer += EBPtoESP;
4340                 }
4341                 if (sz > REGSIZE)
4342                 {
4343                     cdb.genc1(0x8B,
4344                         modregxrm(2,s.Sregmsw,BPRM),FLconst,Para.size + s.Soffset + REGSIZE);
4345                     code *cx = cdb.last();
4346                     if (I64)
4347                         cx.Irex |= REX_W;
4348                     if (!hasframe)
4349                     {   // Convert to ESP relative address rather than EBP
4350                         assert(!I16);
4351                         cx.Irm = cast(ubyte)modregxrm(2,s.Sregmsw,4);
4352                         cx.Isib = modregrm(0,4,SP);
4353                         cx.IEV1.Vpointer += EBPtoESP;
4354                     }
4355                 }
4356             }
4357         }
4358     }
4359 }
4360 
/*******************************
 * Generate the function epilog for block b and append it to b.Bcode.
 *
 * If BFLepilog is not set in b.Bflags, only the return instruction is
 * generated. If BFLepilog is set and the function is mTYnaked, nothing
 * is generated at all.
 * Params:
 *      b = block whose code the epilog is appended to
 * Output:
 *      retsize         Size of function epilog. It is reset to 0 when a full
 *                      epilog is generated; the size of the code generated
 *                      here is then added to it.
 */

void epilog(block *b)
{
    code *cpopds;                    // the POP DS generated for mTYloadds, if any
    reg_t reg;
    reg_t regx;                      // register that's not a return reg
    regm_t topop,regm;
    targ_size_t xlocalsize = localsize;

    CodeBuilder cdbx; cdbx.ctor();      // collects the epilog instructions
    tym_t tyf = funcsym_p.ty();
    tym_t tym = tybasic(tyf);
    bool farfunc = tyfarfunc(tym) != 0;
    if (!(b.Bflags & BFLepilog))       // if no epilog code
        goto Lret;                      // just generate RET
    // NOTE(review): presumably AX is free because BCret returns no value,
    // while BCretexp may return one in AX - confirm
    regx = (b.BC == BCret) ? AX : CX;

    retsize = 0;

    if (tyf & mTYnaked)                 // if no prolog/epilog
        return;

    if (tym == TYifunc)
    {
        // Zero-terminated opcode sequences, emitted one byte-sized instruction at a time.
        // ops2: POP ES, POP DS, POPA, IRET
        static immutable ubyte[5] ops2 = [ 0x07,0x1F,0x61,0xCF,0 ];
        // ops0: POP ES, POP DS, POP DI, POP SI, POP BP, POP BX, POP BX,
        //       POP DX, POP CX, POP AX, IRET
        // NOTE(review): the first POP BX presumably discards a pushed SP slot,
        // mirroring the prolog's register pushes - verify against the prolog
        static immutable ubyte[12] ops0 = [ 0x07,0x1F,0x5F,0x5E,
                                        0x5D,0x5B,0x5B,0x5A,
                                        0x59,0x58,0xCF,0 ];

        genregs(cdbx,0x8B,SP,BP);              // MOV SP,BP
        // POPA is only used when targeting the 80286 or later
        auto p = (config.target_cpu >= TARGET_80286) ? ops2.ptr : ops0.ptr;
        do
            cdbx.gen1(*p);
        while (*++p);
        goto Lopt;
    }

    // Call the trace epilog runtime function if tracing is enabled for this function
    if (config.flags & CFGtrace &&
        (!(config.flags4 & CFG4allcomdat) ||
         funcsym_p.Sclass == SCcomdat ||
         funcsym_p.Sclass == SCglobal ||
         (config.flags2 & CFG2comdat && SymInline(funcsym_p))
        )
       )
    {
        Symbol *s = getRtlsym(farfunc ? RTLSYM_TRACE_EPI_F : RTLSYM_TRACE_EPI_N);
        makeitextern(s);
        cdbx.gencs(I16 ? 0x9A : CALL,0,FLfunc,s);      // CALLF _trace
        if (!I16)
            code_orflag(cdbx.last(),CFoff | CFselfrel);
        // Every register the trace function does not save counts as used
        useregs((ALLREGS | mBP | mES) & ~s.Sregsaved);
    }

    if (usednteh & (NTEH_try | NTEH_except | NTEHcpp | EHcleanup | EHtry | NTEHpassthru) && (config.exe == EX_WIN32 || MARS))
    {
        nteh_epilog(cdbx);
    }

    cpopds = null;
    if (tyf & mTYloadds)
    {
        cdbx.gen1(0x1F);             // POP DS
        cpopds = cdbx.last();        // remembered so it can be NOP'd out below
    }

    /* Pop all the general purpose registers saved on the stack
     * by the prolog code. Remember to do them in the reverse
     * order they were pushed.
     */
    topop = fregsaved & ~mfuncreg;
    epilog_restoreregs(cdbx, topop);

    version (MARS)
    {
        if (usednteh & NTEHjmonitor)
        {
            regm_t retregs = 0;
            if (b.BC == BCretexp)
                retregs = regmask(b.Belem.Ety, tym);
            nteh_monitor_epilog(cdbx,retregs);
            // NOTE(review): presumably accounts for 8 bytes of stack used by
            // the monitor prolog - confirm
            xlocalsize += 8;
        }
    }

    if (config.wflags & WFwindows && farfunc)
    {
        int wflags = config.wflags;
        if (wflags & WFreduced && !(tyf & mTYexport))
        {   // reduced prolog/epilog for non-exported functions
            wflags &= ~(WFdgroup | WFds | WFss);
            if (!(wflags & WFsaveds))
                goto L4;                // use the ordinary frame teardown below
        }

        if (localsize)
        {
            cdbx.genc1(LEA,modregrm(1,SP,6),FLconst,cast(targ_uns)-2); /* LEA SP,-2[BP] */
        }
        if (wflags & (WFsaveds | WFds | WFss | WFdgroup))
        {
            if (cpopds)
                cpopds.Iop = NOP;              // don't need previous one
            cdbx.gen1(0x1F);                    // POP DS
        }
        cdbx.gen1(0x58 + BP);                   // POP BP
        if (config.wflags & WFincbp)
            cdbx.gen1(0x48 + BP);               // DEC BP
        assert(hasframe);
    }
    else
    {
        if (needframe || (xlocalsize && hasframe))
        {
        L4:
            // Tear down the BP-based stack frame
            assert(hasframe);
            if (xlocalsize || enforcealign)
            {
                if (config.flags2 & CFG2stomp)
                {   /*   MOV  ECX,0xBEAF
                     * L1:
                     *   MOV  [ESP],ECX
                     *   ADD  ESP,4
                     *   CMP  EBP,ESP
                     *   JNE  L1
                     *   POP  EBP
                     */
                    /* Value should be:
                     * 1. != 0 (code checks for null pointers)
                     * 2. be odd (to mess up alignment)
                     * 3. fall in first 64K (likely marked as inaccessible)
                     * 4. be a value that stands out in the debugger
                     */
                    assert(I32 || I64);
                    targ_size_t value = 0x0000BEAF;
                    reg_t regcx = CX;
                    mfuncreg &= ~mask(regcx);           // regcx is modified by this function
                    uint grex = I64 ? REX_W << 16 : 0;  // REX.W in the position expected by the rm argument
                    cdbx.genc2(0xC7,grex | modregrmx(3,0,regcx),value);   // MOV regcx,value
                    cdbx.gen2sib(0x89,grex | modregrm(0,regcx,4),modregrm(0,4,SP)); // MOV [ESP],regcx
                    code *c1 = cdbx.last();             // loop head, target of the JNE below
                    cdbx.genc2(0x81,grex | modregrm(3,0,SP),REGSIZE);     // ADD ESP,REGSIZE
                    genregs(cdbx,0x39,SP,BP);                             // CMP EBP,ESP
                    if (I64)
                        code_orrex(cdbx.last(),REX_W);
                    genjmp(cdbx,JNE,FLcode,cast(block *)c1);                  // JNE L1
                    // explicitly mark as short jump, needed for correct retsize calculation (Bugzilla 15779)
                    cdbx.last().Iflags &= ~CFjmp16;
                    cdbx.gen1(0x58 + BP);                                 // POP BP
                }
                else if (config.exe == EX_WIN64)
                {   // See http://msdn.microsoft.com/en-us/library/tawsa7cb(v=vs.80).aspx
                    // LEA RSP,0[RBP]
                    cdbx.genc1(LEA,(REX_W<<16)|modregrm(2,SP,BPRM),FLconst,0);
                    cdbx.gen1(0x58 + BP);      // POP RBP
                }
                else if (config.target_cpu >= TARGET_80286 &&
                    !(config.target_cpu >= TARGET_80386 && config.flags4 & CFG4speed)
                   )
                    // LEAVE is skipped on the 80386 and later when optimizing for speed
                    cdbx.gen1(LEAVE);          // LEAVE
                else if (0 && xlocalsize == REGSIZE && Alloca.size == 0 && I32)
                {   // This doesn't work - I should figure out why
                    mfuncreg &= ~mask(regx);
                    cdbx.gen1(0x58 + regx);    // POP regx
                    cdbx.gen1(0x58 + BP);      // POP BP
                }
                else
                {
                    genregs(cdbx,0x8B,SP,BP);  // MOV SP,BP
                    if (I64)
                        code_orrex(cdbx.last(), REX_W);   // MOV RSP,RBP
                    cdbx.gen1(0x58 + BP);      // POP BP
                }
            }
            else
                cdbx.gen1(0x58 + BP);          // POP BP
            if (config.wflags & WFincbp && farfunc)
                cdbx.gen1(0x48 + BP);              // DEC BP
        }
        else if (xlocalsize == REGSIZE && (!I16 || b.BC == BCret))
        {
            // A single POP into a scratch register releases one register-sized local
            mfuncreg &= ~mask(regx);
            cdbx.gen1(0x58 + regx);                    // POP regx
        }
        else if (xlocalsize)
            cod3_stackadj(cdbx, cast(int)-xlocalsize);
    }
    if (b.BC == BCret || b.BC == BCretexp)
    {
Lret:
        // 0xCA / 0xC2 are far / near RET imm16; op+1 gives the plain far / near RET
        opcode_t op = tyfarfunc(tym) ? 0xCA : 0xC2;
        if (tym == TYhfunc)
        {
            cdbx.genc2(0xC2,0,4);                       // RET 4
        }
        else if (!typfunc(tym) ||                       // if caller cleans the stack
                 config.exe == EX_WIN64 ||
                 Para.offset == 0)                      // or nothing pushed on the stack anyway
        {
            op++;                                       // to a regular RET
            cdbx.gen1(op);
        }
        else
        {   // Stack is always aligned on register size boundary
            Para.offset = (Para.offset + (REGSIZE - 1)) & ~(REGSIZE - 1);
            if (Para.offset >= 0x10000)
            {
                // Too large for the 16 bit immediate of RET imm16
                /*
                    POP REG
                    ADD ESP, Para.offset
                    JMP REG
                */
                cdbx.gen1(0x58+regx);
                cdbx.genc2(0x81, modregrm(3,0,SP), Para.offset);
                if (I64)
                    code_orrex(cdbx.last(), REX_W);
                cdbx.genc2(0xFF, modregrm(3,4,regx), 0);
                if (I64)
                    code_orrex(cdbx.last(), REX_W);
            }
            else
                cdbx.genc2(op,0,Para.offset);          // RET Para.offset
        }
    }

Lopt:
    // If the last instruction of the block's existing code (cr) is ADD SP,imm,
    // and the first instruction of the epilog (c) sets SP, we can dump the ADD.
    CodeBuilder cdb; cdb.ctor();
    cdb.append(b.Bcode);
    code *cr = cdb.last();              // last instruction of the block's existing code
    code *c = cdbx.peek();              // first instruction of the epilog
    if (cr && c && !I64)
    {
        if (cr.Iop == 0x81 && cr.Irm == modregrm(3,0,SP))     // if ADD SP,imm
        {
            if (
                c.Iop == LEAVE ||                                // LEAVE
                (c.Iop == 0x8B && c.Irm == modregrm(3,SP,BP)) || // MOV SP,BP
                (c.Iop == LEA && c.Irm == modregrm(1,SP,6))     // LEA SP,-imm[BP]
               )
                cr.Iop = NOP;
            else if (c.Iop == 0x58 + BP)                       // if POP BP
            {
                // Replace the ADD SP,imm with MOV SP,BP
                cr.Iop = 0x8B;
                cr.Irm = modregrm(3,SP,BP);                    // MOV SP,BP
            }
        }
        else
        {
static if (0)
{
        // These optimizations don't work if the called function
        // cleans off the stack.
        if (c.Iop == 0xC3 && cr.Iop == CALL)     // CALL near
        {
            cr.Iop = 0xE9;                             // JMP near
            c.Iop = NOP;
        }
        else if (c.Iop == 0xCB && cr.Iop == 0x9A)     // CALL far
        {
            cr.Iop = 0xEA;                             // JMP far
            c.Iop = NOP;
        }
}
        }
    }

    pinholeopt(c, null);
    retsize += calcblksize(c);          // compute size of function epilog
    cdb.append(cdbx);                   // block code followed by the epilog
    b.Bcode = cdb.finish();
}
4638 
4639 /*******************************
4640  * Return offset of SP from BP.
4641  */
4642 
targ_size_t cod3_spoff()
{
    // The result is the sum of the current spoff and the function's localsize.
    const targ_size_t offsetFromBP = spoff + localsize;
    //printf("spoff = x%x, localsize = x%x\n", (int)spoff, (int)localsize);
    return offsetFromBP;
}
4648 
/* Generate the move between the register(s) assigned to symbol s and the
 * operand that getlvalue() resolves for s.
 * Params:
 *      cdb =   code sink the instructions are appended to
 *      s =     symbol whose Sreglsw (and Sregmsw, for values wider than
 *              REGSIZE) are transferred
 *      toreg = true to load the register(s), false to store them
 */
void gen_spill_reg(ref CodeBuilder cdb, Symbol* s, bool toreg)
{
    code cs;
    const regm_t keepmsk = toreg ? RMload : RMstore;

    // A temporary variable elem lets getlvalue() build the operand for us
    elem* evar = el_var(s);

    const bool inXmm = (mask(s.Sreglsw) & XMMREGS) != 0;
    if (inXmm)
    {
        // MOVSS/D xreg,mem when loading; MOVSS/D mem,xreg when storing
        cs.Iop = toreg ? xmmload(s.Stype.Tty) : xmmstore(s.Stype.Tty);
        getlvalue(cdb,&cs,evar,keepmsk);
        cs.orReg(s.Sreglsw - XMM0);
        cdb.gen(&cs);
    }
    else
    {
        const int nbytes = cast(int)type_size(s.Stype);
        const bool isByte = (nbytes == 1);

        // MOV reg,mem is 0x8B (0x8A for bytes); MOV mem,reg is 0x89 (0x88 for bytes)
        if (toreg)
            cs.Iop = isByte ? 0x8A : 0x8B;
        else
            cs.Iop = isByte ? 0x88 : 0x89;

        getlvalue(cdb,&cs,evar,keepmsk);
        cs.orReg(s.Sreglsw);
        if (I64 && isByte && s.Sreglsw >= 4)
            cs.Irex |= REX;

        // A register-direct operand naming the same register on both sides
        // (rm/reg fields equal, REX_R and REX_B equal) would be MOV reg,reg
        const bool lswIsSelfMove =
            (cs.Irm & 0xC0) == 0xC0 &&
            (((cs.Irm >> 3) ^ cs.Irm) & 7) == 0 &&
            (((cs.Irex >> 2) ^ cs.Irex) & 1) == 0;
        if (!lswIsSelfMove)
            cdb.gen(&cs);

        if (nbytes > REGSIZE)
        {
            // Repeat for the most significant half
            cs.setReg(s.Sregmsw);
            getlvalue_msw(&cs);

            const bool mswIsSelfMove =
                (cs.Irm & 0xC0) == 0xC0 &&
                (((cs.Irm >> 3) ^ cs.Irm) & 7) == 0 &&
                (((cs.Irex >> 2) ^ cs.Irex) & 1) == 0;
            if (!mswIsSelfMove)
                cdb.gen(&cs);
        }
    }

    el_free(evar);
}
4696 
/****************************
 * Generate code for, and output a thunk.
 * The thunk adjusts the 'this' pointer by d, then jumps either directly to
 * sfunc or through the vtbl[]. The code is written to sthunk's segment and
 * sthunk's Soffset/Ssize are set to describe it.
 * Params:
 *      sthunk =  Symbol of thunk
 *      sfunc =   Symbol of thunk's target function
 *      p =       ESP parameter offset to this pointer
 *      thisty =  Type of this pointer
 *      d =       offset to add to 'this' pointer
 *      i =       offset into vtbl[]; if the low 16 bits are 0xFFFF a direct
 *                jump to sfunc is generated instead of a virtual call
 *      d2 =      offset from 'this' to vptr
 */

void cod3_thunk(Symbol *sthunk,Symbol *sfunc,uint p,tym_t thisty,
        uint d,int i,uint d2)
{
    targ_size_t thunkoffset;

    int seg = sthunk.Sseg;
    cod3_align(seg);

    // Skip over return address
    tym_t thunkty = tybasic(sthunk.ty());
    if (tyfarfunc(thunkty))
        p += I32 ? 8 : tysize(TYfptr);          // far function
    else
        p += tysize(TYnptr);

    CodeBuilder cdb; cdb.ctor();
    if (!I16)
    {
        /*
           Generate:
            ADD p[ESP],d
           For direct call:
            JMP sfunc
           For virtual call:
            MOV EAX, p[ESP]                     EAX = this
            MOV EAX, d2[EAX]                    EAX = this.vptr
            JMP i[EAX]                          jump to virtual function
         */
        // reg is the opcode extension for opcode 0x81: /0 is ADD, /5 is SUB
        reg_t reg = 0;
        if (cast(int)d < 0)
        {
            d = -d;
            reg = 5;                            // switch from ADD to SUB
        }
        if (thunkty == TYmfunc)
        {                                       // ADD ECX,d
            if (d)
                cdb.genc2(0x81,modregrm(3,reg,CX),d);
        }
        else if (thunkty == TYjfunc || (I64 && thunkty == TYnfunc))
        {                                       // ADD EAX,d
            // The register adjusted depends on the target: CX for Win64,
            // DI for other 64 bit targets, AX otherwise
            int rm = AX;
            if (config.exe == EX_WIN64)
                rm = CX;
            else if (I64)
                rm = DI;
            if (d)
                cdb.genc2(0x81,modregrm(3,reg,rm),d);
        }
        else
        {
            cdb.genc(0x81,modregrm(2,reg,4),
                FLconst,p,                      // to this
                FLconst,d);                     // ADD p[ESP],d
            cdb.last().Isib = modregrm(0,4,SP);
        }
        // No instruction was generated above if d is 0 in the register cases
        if (I64 && cdb.peek())
            cdb.last().Irex |= REX_W;
    }
    else
    {
        /*
           Generate:
            MOV BX,SP
            ADD [SS:] p[BX],d
           For direct call:
            JMP sfunc
           For virtual call:
            MOV BX, p[BX]                       BX = this
            MOV BX, d2[BX]                      BX = this.vptr
            JMP i[BX]                           jump to virtual function
         */

        genregs(cdb,0x89,SP,BX);           // MOV BX,SP
        cdb.genc(0x81,modregrm(2,0,7),
            FLconst,p,                                  // to this
            FLconst,d);                                 // ADD p[BX],d
        if (config.wflags & WFssneds ||
            // If DS needs reloading from SS,
            // then assume SS != DS on thunk entry
            (LARGEDATA && config.wflags & WFss))
            cdb.last().Iflags |= CFss;                 // SS:
    }

    if ((i & 0xFFFF) != 0xFFFF)                 // if virtual call
    {
        // 'this' (and therefore the vptr) is far if it is wider than a register
        const bool FARTHIS = (tysize(thisty) > REGSIZE);
        const bool FARVPTR = FARTHIS;

        assert(thisty != TYvptr);               // can't handle this case

        if (!I16)
        {
            assert(!FARTHIS && !LARGECODE);
            if (thunkty == TYmfunc)     // if 'this' is in ECX
            {
                // MOV EAX,d2[ECX]
                cdb.genc1(0x8B,modregrm(2,AX,CX),FLconst,d2);
            }
            else if (thunkty == TYjfunc)        // if 'this' is in EAX
            {
                // MOV EAX,d2[EAX]
                cdb.genc1(0x8B,modregrm(2,AX,AX),FLconst,d2);
            }
            else
            {
                // MOV EAX,p[ESP]
                cdb.genc1(0x8B,(modregrm(0,4,SP) << 8) | modregrm(2,AX,4),FLconst,cast(targ_uns) p);
                if (I64)
                    cdb.last().Irex |= REX_W;

                // MOV EAX,d2[EAX]
                cdb.genc1(0x8B,modregrm(2,AX,AX),FLconst,d2);
            }
            // Make the load of the vptr 64 bits wide
            if (I64)
                code_orrex(cdb.last(), REX_W);
                                                        // JMP i[EAX]
            cdb.genc1(0xFF,modregrm(2,4,0),FLconst,cast(targ_uns) i);
        }
        else
        {
            // MOV/LES BX,[SS:] p[BX]
            cdb.genc1((FARTHIS ? 0xC4 : 0x8B),modregrm(2,BX,7),FLconst,cast(targ_uns) p);
            if (config.wflags & WFssneds ||
                // If DS needs reloading from SS,
                // then assume SS != DS on thunk entry
                (LARGEDATA && config.wflags & WFss))
                cdb.last().Iflags |= CFss;             // SS:

            // MOV/LES BX,[ES:]d2[BX]
            cdb.genc1((FARVPTR ? 0xC4 : 0x8B),modregrm(2,BX,7),FLconst,d2);
            if (FARTHIS)
                cdb.last().Iflags |= CFes;             // ES:

                                                        // JMP i[BX]
            // Opcode extension /5 is a far indirect JMP, /4 a near one
            cdb.genc1(0xFF,modregrm(2,(LARGECODE ? 5 : 4),7),FLconst,cast(targ_uns) i);
            if (FARVPTR)
                cdb.last().Iflags |= CFes;             // ES:
        }
    }
    else
    {
static if (0)
{
        localgot = null;                // no local variables
        code *c1 = load_localgot();
        if (c1)
        {
            assignaddrc(c1);
            cdb.append(c1);
        }
}
        cdb.gencs((LARGECODE ? 0xEA : 0xE9),0,FLfunc,sfunc); // JMP sfunc
        cdb.last().Iflags |= LARGECODE ? (CFseg | CFoff) : (CFselfrel | CFoff);
    }

    // Write the thunk's code to the segment, remembering where it starts
    thunkoffset = Offset(seg);
    code *c = cdb.finish();
    pinholeopt(c,null);
    codout(seg,c);
    code_free(c);

    sthunk.Soffset = thunkoffset;
    sthunk.Ssize = Offset(seg) - thunkoffset; // size of thunk
    sthunk.Sseg = seg;
    static if (TARGET_LINUX || TARGET_OSX || TARGET_FREEBSD || TARGET_OPENBSD || TARGET_DRAGONFLYBSD || TARGET_SOLARIS)
    {
        objmod.pubdef(seg,sthunk,sthunk.Soffset);
    }
    static if (TARGET_WINDOS)
    {
        if (config.objfmt == OBJ_MSCOFF)
            objmod.pubdef(seg,sthunk,sthunk.Soffset);
    }
    searchfixlist(sthunk);              // resolve forward refs
}
4885 
4886 /*****************************
4887  * Assume symbol s is extern.
4888  */
4889 
void makeitextern(Symbol *s)
{
    // Nothing to do when Sxtrnnum is already nonzero
    // (presumably the symbol already has an external record)
    if (s.Sxtrnnum != 0)
        return;

    /*printf("makeitextern(x%x)\n",s);*/
    s.Sclass = SCextern;        // mark the symbol as external
    objmod.external(s);         // and tell the object module about it
}
4899 
4900 
/*******************************
 * Replace JMPs in the block's code (bl.Bcode) with JMP SHORTs wherever
 * possible. Jumps to the next instruction are turned into NOPs.
 * This routine depends on FLcode jumps to only be forward
 * referenced.
 * BFLjmpoptdone is set to true if nothing more can be done
 * with this block.
 * Params:
 *      bl =    block whose code is scanned
 *      flag =  !=0 means don't have correct Boffsets yet
 * Returns:
 *      number of bytes saved
 */

int branch(block *bl,int flag)
{
    int bytesaved;
    code* c,cn,ct;              // current instruction, its successor, branch target
    targ_size_t offset,disp;    // offset of c, displacement of the branch
    targ_size_t csize;          // size in bytes of c

    if (!flag)
        bl.Bflags |= BFLjmpoptdone;      // assume this will be all
    c = bl.Bcode;
    if (!c)
        return 0;
    bytesaved = 0;
    offset = bl.Boffset;                 /* offset of start of block     */
    while (1)
    {
        ubyte op;

        csize = calccodsize(c);
        cn = code_next(c);
        op = cast(ubyte)c.Iop;
        // Candidates: a conditional jump (0x70..0x7F) flagged CFjmp16,
        // or a JMP that is not flagged CFjmp5
        if ((op & ~0x0F) == 0x70 && c.Iflags & CFjmp16 ||
            (op == JMP && !(c.Iflags & CFjmp5)))
        {
          L1:
            // Compute disp, the distance to the target
            switch (c.IFL2)
            {
                case FLblock:
                    if (flag)           // no offsets yet, don't optimize
                        goto L3;
                    disp = c.IEV2.Vblock.Boffset - offset - csize;

                    /* If this is a forward branch, and there is an aligned
                     * block intervening, it is possible that shrinking
                     * the jump instruction will cause it to be out of
                     * range of the target. This happens if the alignment
                     * prevents the target block from moving correspondingly
                     * closer.
                     */
                    if (disp >= 0x7F-4 && c.IEV2.Vblock.Boffset > offset)
                    {   /* Look for intervening alignment
                         */
                        for (block *b = bl.Bnext; b; b = b.Bnext)
                        {
                            if (b.Balign)
                            {
                                bl.Bflags &= ~BFLjmpoptdone;   // some JMPs left
                                goto L3;
                            }
                            if (b == c.IEV2.Vblock)
                                break;
                        }
                    }

                    break;

                case FLcode:
                {
                    code *cr;

                    disp = 0;

                    ct = c.IEV2.Vcode;         /* target of branch     */
                    assert(ct.Iflags & (CFtarg | CFtarg2));
                    // Sum the sizes of the instructions between the branch and its target
                    for (cr = cn; cr; cr = code_next(cr))
                    {
                        if (cr == ct)
                            break;
                        disp += calccodsize(cr);
                    }

                    if (!cr)
                    {   // Didn't find it in forward search. Try backwards jump
                        int s = 0;              // set once the target has been passed
                        disp = 0;
                        for (cr = bl.Bcode; cr != cn; cr = code_next(cr))
                        {
                            assert(cr != null); // must have found it
                            if (cr == ct)
                                s = 1;
                            if (s)
                                disp += calccodsize(cr);
                        }
                    }

                    if (config.flags4 & CFG4optimized && !flag)
                    {
                        /* Propagate branch forward past junk   */
                        while (1)
                        {
                            if (ct.Iop == NOP ||
                                ct.Iop == (ESCAPE | ESClinnum))
                            {
                                ct = code_next(ct);
                                if (!ct)
                                    goto L2;
                            }
                            else
                            {
                                c.IEV2.Vcode = ct;
                                ct.Iflags |= CFtarg;
                                break;
                            }
                        }

                        /* And eliminate jmps to jmps   */
                        if ((op == ct.Iop || ct.Iop == JMP) &&
                            (op == JMP || c.Iflags & CFjmp16))
                        {
                            // Retarget c at the target's own destination and start over
                            c.IFL2 = ct.IFL2;
                            c.IEV2.Vcode = ct.IEV2.Vcode;
                            /*printf("eliminating branch\n");*/
                            goto L1;
                        }
                     L2:
                        { }
                    }
                }
                    break;

                default:
                    goto L3;            // other target kinds are left alone
            }

            if (disp == 0)                      // bra to next instruction
            {
                bytesaved += csize;
                c.Iop = NOP;                   // del branch instruction
                c.IEV2.Vcode = null;
                c = cn;
                if (!c)
                    break;
                continue;
            }
            // Both disp and disp - 2 must fit in a signed byte
            else if (cast(targ_size_t)cast(targ_schar)(disp - 2) == (disp - 2) &&
                     cast(targ_size_t)cast(targ_schar)disp == disp)
            {
                if (op == JMP)
                {
                    c.Iop = JMPS;              // JMP SHORT
                    bytesaved += I16 ? 1 : 3;
                }
                else                            // else Jcond
                {
                    c.Iflags &= ~CFjmp16;      // a branch is ok
                    bytesaved += I16 ? 3 : 4;

                    // Replace a cond jump around a call to a function that
                    // never returns with a cond jump to that function.
                    if (config.flags4 & CFG4optimized &&
                        config.target_cpu >= TARGET_80386 &&
                        disp == (I16 ? 3 : 5) &&
                        cn &&
                        cn.Iop == CALL &&
                        cn.IFL2 == FLfunc &&
                        cn.IEV2.Vsym.Sflags & SFLexit &&
                        !(cn.Iflags & (CFtarg | CFtarg2))
                       )
                    {
                        // Turn the CALL into the two byte opcode 0x0F 0x8x conditional
                        // jump whose condition is the inverse of c's (low bit flipped)
                        cn.Iop = 0x0F00 | ((c.Iop & 0x0F) ^ 0x81);
                        c.Iop = NOP;
                        c.IEV2.Vcode = null;
                        bytesaved++;

                        // If nobody else points to ct, we can remove the CFtarg
                        if (flag && ct)
                        {
                            code *cx;
                            for (cx = bl.Bcode; 1; cx = code_next(cx))
                            {
                                if (!cx)
                                {
                                    // Reached the end without finding another reference
                                    ct.Iflags &= ~CFtarg;
                                    break;
                                }
                                if (cx.IEV2.Vcode == ct)
                                    break;
                            }
                        }
                    }
                }
                csize = calccodsize(c);         // c changed, so recompute its size
            }
            else
                bl.Bflags &= ~BFLjmpoptdone;   // some JMPs left
        }
L3:
        // Advance to the next instruction, or stop at the end of the block's code
        if (cn)
        {
            offset += csize;
            c = cn;
        }
        else
            break;
    }
    //printf("bytesaved = x%x\n",bytesaved);
    return bytesaved;
}
5111 
5112 
5113 /************************************************
5114  * Adjust all Soffset's of stack variables so they
5115  * are all relative to the frame pointer.
5116  */
5117 
5118 version (MARS)
5119 {
void cod3_adjSymOffsets()
{
    //printf("cod3_adjSymOffsets()\n");
    for (SYMIDX idx = 0; idx < globsym.length; ++idx)
    {
        Symbol* sym = globsym[idx];
        const sclass = sym.Sclass;

        if (sclass == SCparameter || sclass == SCregpar || sclass == SCshadowreg)
        {
            // Parameters are offset by the size of the Para section
            sym.Soffset += Para.size;
        }
        else if (sclass == SCfastpar)
        {
            sym.Soffset += Fast.size + BPoff;
        }
        else if (sclass == SCauto || sclass == SCregister)
        {
            // Locals live in the Fast section when Sfl is FLfast,
            // otherwise in the Auto section
            sym.Soffset += ((sym.Sfl == FLfast) ? Fast.size : Auto.size) + BPoff;
        }
        // SCbprel and every other storage class: Soffset is left untouched
    }
}
5174 
5175 }
5176 
5177 /*******************************
5178  * Take symbol info in union ev and replace it with a real address
5179  * in Vpointer.
5180  */
5181 
5182 void assignaddr(block *bl)
5183 {
5184     int EBPtoESPsave = EBPtoESP;
5185     int hasframesave = hasframe;
5186 
5187     if (bl.Bflags & BFLoutsideprolog)
5188     {
5189         EBPtoESP = -REGSIZE;
5190         hasframe = 0;
5191     }
5192     assignaddrc(bl.Bcode);
5193     hasframe = hasframesave;
5194     EBPtoESP = EBPtoESPsave;
5195 }
5196 
/*******************************
 * Walk the code list starting at c and resolve the symbolic operands of
 * each instruction into concrete values.
 *
 * For the first operand (IFL1/IEV1) and the second operand (IFL2/IEV2)
 * the FL kind selects how Vpointer is adjusted; operands that end up as
 * plain numbers have their FL rewritten to FLconst, data references become
 * FLdatseg or FLextern.
 *
 * The ESCAPE pseudo-ops ESCadjesp, ESCfixesp and ESCframeptr are expanded
 * here as well. ESCadjesp updates the global EBPtoESP as a side effect, and
 * ESCfixesp may insert one extra instruction into the list when
 * enforcealign is set.
 *
 * Instructions that reference a dead symbol or a CSE slot that was never
 * loaded are turned into NOPs.
 * Params:
 *      c = head of the code list to process (may be null)
 */
void assignaddrc(code *c)
{
    int sn;
    Symbol *s;
    ubyte ins,rm;
    targ_size_t soff;
    targ_size_t base;

    // Value of EBPtoESP on entry; FLstack operands are adjusted by how far
    // EBPtoESP has moved away from this since.
    base = EBPtoESP;
    for (; c; c = code_next(c))
    {
        debug
        {
        if (0)
        {       printf("assignaddrc()\n");
                code_print(c);
        }
        // Catch a two-element cycle in the code list
        if (code_next(c) && code_next(code_next(c)) == c)
            assert(0);
        }

        // Look up the instruction attribute bits for this opcode
        if (c.Iflags & CFvex && c.Ivex.pfx == 0xC4)
            ins = vex_inssize(c);
        else if ((c.Iop & 0xFFFD00) == 0x0F3800)
            ins = inssize2[(c.Iop >> 8) & 0xFF];
        else if ((c.Iop & 0xFF00) == 0x0F00)
            ins = inssize2[c.Iop & 0xFF];
        else if ((c.Iop & 0xFF) == ESCAPE)
        {
            if (c.Iop == (ESCAPE | ESCadjesp))
            {
                //printf("adjusting EBPtoESP (%d) by %ld\n",EBPtoESP,(long)c.IEV1.Vint);
                EBPtoESP += c.IEV1.Vint;
                c.Iop = NOP;
            }
            else if (c.Iop == (ESCAPE | ESCfixesp))
            {
                //printf("fix ESP\n");
                if (hasframe)
                {
                    // LEA ESP,-EBPtoESP[EBP]
                    c.Iop = LEA;
                    if (c.Irm & 8)
                        c.Irex |= REX_R;
                    c.Irm = modregrm(2,SP,BP);
                    c.Iflags = CFoff;
                    c.IFL1 = FLconst;
                    c.IEV1.Vuns = -EBPtoESP;
                    if (enforcealign)
                    {
                        // AND ESP, -STACKALIGN
                        code *cn = code_calloc();
                        cn.Iop = 0x81;
                        cn.Irm = modregrm(3, 4, SP);
                        cn.Iflags = CFoff;
                        cn.IFL2 = FLconst;
                        cn.IEV2.Vsize_t = -STACKALIGN;
                        // The AND is a separate instruction and needs its
                        // own REX_W to operate on the full 64-bit stack
                        // pointer; c itself receives REX_W at the end of
                        // this ESCAPE branch.
                        if (I64)
                            cn.Irex |= REX_W;
                        cn.next = c.next;
                        c.next = cn;
                    }
                }
            }
            else if (c.Iop == (ESCAPE | ESCframeptr))
            {   // Convert to load of frame pointer
                // c.Irm is the register to use
                if (hasframe && !enforcealign)
                {   // MOV reg,EBP
                    c.Iop = 0x89;
                    if (c.Irm & 8)
                        c.Irex |= REX_B;
                    c.Irm = modregrm(3,BP,c.Irm & 7);
                }
                else
                {   // LEA reg,EBPtoESP[ESP]
                    c.Iop = LEA;
                    if (c.Irm & 8)
                        c.Irex |= REX_R;
                    c.Irm = modregrm(2,c.Irm & 7,4);
                    c.Isib = modregrm(0,4,SP);
                    c.Iflags = CFoff;
                    c.IFL1 = FLconst;
                    c.IEV1.Vuns = EBPtoESP;
                }
            }
            if (I64)
                c.Irex |= REX_W;
            continue;
        }
        else
            ins = inssize[c.Iop & 0xFF];
        if (!(ins & M) ||
            ((rm = c.Irm) & 0xC0) == 0xC0)
            goto do2;           /* if no first operand          */
        if (is32bitaddr(I32,c.Iflags))
        {

            if (
                ((rm & 0xC0) == 0 && !((rm & 7) == 4 && (c.Isib & 7) == 5 || (rm & 7) == 5))
               )
                goto do2;       /* if no first operand  */
        }
        else
        {
            if (
                ((rm & 0xC0) == 0 && !((rm & 7) == 6))
               )
                goto do2;       /* if no first operand  */
        }

        /* ---- first operand: IFL1 / IEV1 ---- */
        s = c.IEV1.Vsym;
        switch (c.IFL1)
        {
            case FLdata:
                if (config.objfmt == OBJ_OMF && s.Sclass != SCcomdat && s.Sclass != SCextern)
                {
                    version (MARS)
                    {
                        c.IEV1.Vseg = s.Sseg;
                    }
                    else
                    {
                        c.IEV1.Vseg = DATA;
                    }
                    c.IEV1.Vpointer += s.Soffset;
                    c.IFL1 = FLdatseg;
                }
                else
                    c.IFL1 = FLextern;
                goto do2;

            case FLudata:
                if (config.objfmt == OBJ_OMF)
                {
                    version (MARS)
                    {
                        c.IEV1.Vseg = s.Sseg;
                    }
                    else
                    {
                        c.IEV1.Vseg = UDATA;
                    }
                    c.IEV1.Vpointer += s.Soffset;
                    c.IFL1 = FLdatseg;
                }
                else
                    c.IFL1 = FLextern;
                goto do2;

            case FLtlsdata:
                if (config.objfmt == OBJ_ELF || config.objfmt == OBJ_MACH)
                    c.IFL1 = FLextern;
                goto do2;

            case FLdatseg:
                //c.IEV1.Vseg = DATA;
                goto do2;

            case FLfardata:
            case FLcsdata:
            case FLpseudo:
                goto do2;

            case FLstack:
                //printf("Soffset = %d, EBPtoESP = %d, base = %d, pointer = %d\n",
                //s.Soffset,EBPtoESP,base,c.IEV1.Vpointer);
                c.IEV1.Vpointer += s.Soffset + EBPtoESP - base - EEStack.offset;
                break;

            case FLfast:
                soff = Fast.size;
                goto L1;

            case FLreg:
            case FLauto:
                soff = Auto.size;
            L1:
                // soff is the size of the section the symbol lives in
                if (Symbol_Sisdead(s, anyiasm))
                {
                    c.Iop = NOP;               // remove references to it
                    continue;
                }
                if (s.Sfl == FLreg && c.IEV1.Vpointer < 2)
                {
                    // Symbol lives in a register: turn the memory operand
                    // into a register operand (mod == 3)
                    reg_t reg = s.Sreglsw;

                    assert(!(s.Sregm & ~mask(reg)));
                    if (c.IEV1.Vpointer == 1)
                    {
                        assert(reg < 4);    /* must be a BYTEREGS   */
                        reg |= 4;           /* convert to high byte reg */
                    }
                    if (reg & 8)
                    {
                        assert(I64);
                        c.Irex |= REX_B;
                        reg &= 7;
                    }
                    c.Irm = (c.Irm & modregrm(0,7,0))
                            | modregrm(3,0,reg);
                    assert(c.Iop != LES && c.Iop != LEA);
                    goto do2;
                }
                else
                {   c.IEV1.Vpointer += s.Soffset + soff + BPoff;
                    if (s.Sflags & SFLunambig)
                        c.Iflags |= CFunambig;
            L2:
                    if (!hasframe || (enforcealign && c.IFL1 != FLpara))
                    {   /* Convert to ESP relative address instead of EBP */
                        assert(!I16);
                        c.IEV1.Vpointer += EBPtoESP;
                        ubyte crm = c.Irm;
                        if ((crm & 7) == 4)              // if SIB byte
                        {
                            assert((c.Isib & 7) == BP);
                            assert((crm & 0xC0) != 0);
                            c.Isib = (c.Isib & ~7) | modregrm(0,0,SP);
                        }
                        else
                        {
                            assert((crm & 7) == 5);
                            c.Irm = (crm & modregrm(0,7,0))
                                    | modregrm(2,0,4);
                            c.Isib = modregrm(0,4,SP);
                        }
                    }
                }
                break;

            case FLpara:
//printf("s = %s, Soffset = %d, Para.size = %d, BPoff = %d, EBPtoESP = %d\n", s.Sident.ptr, s.Soffset, Para.size, BPoff, EBPtoESP);
                soff = Para.size - BPoff;    // cancel out add of BPoff
                goto L1;

            case FLfltreg:
                c.IEV1.Vpointer += Foff + BPoff;
                c.Iflags |= CFunambig;
                goto L2;

            case FLallocatmp:
                c.IEV1.Vpointer += Alloca.offset + BPoff;
                goto L2;

            case FLfuncarg:
                c.IEV1.Vpointer += cgstate.funcarg.offset + BPoff;
                goto L2;

            case FLbprel:
                c.IEV1.Vpointer += s.Soffset;
                break;

            case FLcs:
                sn = c.IEV1.Vuns;
                if (!CSE.loaded(sn))            // if never loaded
                {
                    c.Iop = NOP;
                    continue;
                }
                c.IEV1.Vpointer = CSE.offset(sn) + CSoff + BPoff;
                c.Iflags |= CFunambig;
                goto L2;

            case FLregsave:
                sn = c.IEV1.Vuns;
                c.IEV1.Vpointer = sn + regsave.off + BPoff;
                c.Iflags |= CFunambig;
                goto L2;

            case FLndp:
                version (MARS)
                {
                    assert(c.IEV1.Vuns < global87.save.length);
                }
                c.IEV1.Vpointer = c.IEV1.Vuns * tysize(TYldouble) + NDPoff + BPoff;
                c.Iflags |= CFunambig;
                goto L2;

            case FLoffset:
                break;

            case FLlocalsize:
                c.IEV1.Vpointer += localsize;
                break;

            case FLconst:
            default:
                goto do2;
        }
        // Every case that leaves the switch via break now holds a plain number
        c.IFL1 = FLconst;
    do2:
        /* ---- second operand: IFL2 / IEV2 ---- */
        /* Ignore TEST (F6 and F7) opcodes      */
        if (!(ins & T)) goto done;              /* if no second operand */
        s = c.IEV2.Vsym;
        switch (c.IFL2)
        {
            case FLdata:
                if (config.objfmt == OBJ_ELF || config.objfmt == OBJ_MACH)
                {
                    c.IFL2 = FLextern;
                    goto do2;
                }
                else
                {
                    if (s.Sclass == SCcomdat)
                    {   c.IFL2 = FLextern;
                        goto do2;
                    }
                    c.IEV2.Vseg = MARS ? s.Sseg : DATA;
                    c.IEV2.Vpointer += s.Soffset;
                    c.IFL2 = FLdatseg;
                    goto done;
                }

            case FLudata:
                if (config.objfmt == OBJ_ELF || config.objfmt == OBJ_MACH)
                {
                    c.IFL2 = FLextern;
                    goto do2;
                }
                else
                {
                    c.IEV2.Vseg = MARS ? s.Sseg : UDATA;
                    c.IEV2.Vpointer += s.Soffset;
                    c.IFL2 = FLdatseg;
                    goto done;
                }

            case FLtlsdata:
                if (config.objfmt == OBJ_ELF || config.objfmt == OBJ_MACH)
                {
                    c.IFL2 = FLextern;
                    goto do2;
                }
                goto done;

            case FLdatseg:
                //c.IEV2.Vseg = DATA;
                goto done;

            case FLcsdata:
            case FLfardata:
                goto done;

            case FLreg:
            case FLpseudo:
                assert(0);
                /* NOTREACHED */

            case FLfast:
                c.IEV2.Vpointer += s.Soffset + Fast.size + BPoff;
                break;

            case FLauto:
                c.IEV2.Vpointer += s.Soffset + Auto.size + BPoff;
            L3:
                if (!hasframe || (enforcealign && c.IFL2 != FLpara))
                    /* Convert to ESP relative address instead of EBP */
                    c.IEV2.Vpointer += EBPtoESP;
                break;

            case FLpara:
                c.IEV2.Vpointer += s.Soffset + Para.size;
                goto L3;

            case FLfltreg:
                c.IEV2.Vpointer += Foff + BPoff;
                goto L3;

            case FLallocatmp:
                c.IEV2.Vpointer += Alloca.offset + BPoff;
                goto L3;

            case FLfuncarg:
                c.IEV2.Vpointer += cgstate.funcarg.offset + BPoff;
                goto L3;

            case FLbprel:
                c.IEV2.Vpointer += s.Soffset;
                break;

            case FLstack:
                c.IEV2.Vpointer += s.Soffset + EBPtoESP - base;
                break;

            case FLcs:
            case FLndp:
            case FLregsave:
                assert(0);

            case FLconst:
                break;

            case FLlocalsize:
                c.IEV2.Vpointer += localsize;
                break;

            default:
                goto done;
        }
        // Every case that leaves the switch via break now holds a plain number
        c.IFL2 = FLconst;
  done:
        { }
    }
}
5602 
5603 /*******************************
5604  * Return offset from BP of symbol s.
5605  */
5606 
5607 targ_size_t cod3_bpoffset(Symbol *s)
5608 {
5609     targ_size_t offset;
5610 
5611     symbol_debug(s);
5612     offset = s.Soffset;
5613     switch (s.Sfl)
5614     {
5615         case FLpara:
5616             offset += Para.size;
5617             break;
5618 
5619         case FLfast:
5620             offset += Fast.size + BPoff;
5621             break;
5622 
5623         case FLauto:
5624             offset += Auto.size + BPoff;
5625             break;
5626 
5627         default:
5628             WRFL(cast(FL)s.Sfl);
5629             symbol_print(s);
5630             assert(0);
5631     }
5632     assert(hasframe);
5633     return offset;
5634 }
5635 
5636 
5637 /*******************************
5638  * Find shorter versions of the same instructions.
5639  * Does these optimizations:
5640  *      replaces jmps to the next instruction with NOPs
5641  *      sign extension of modregrm displacement
5642  *      sign extension of immediate data (can't do it for OR, AND, XOR
5643  *              as the opcodes are not defined)
5644  *      short versions for AX EA
5645  *      short versions for reg EA
5646  * Code is neither removed nor added.
5647  * Params:
5648  *      b = block for code (or null)
5649  *      c = code list to optimize
5650  */
5651 
5652 void pinholeopt(code *c,block *b)
5653 {
5654     targ_size_t a;
5655     uint mod;
5656     ubyte ins;
5657     int usespace;
5658     int useopsize;
5659     int space;
5660     block *bn;
5661 
5662     debug
5663     {
5664         __gshared int tested; if (!tested) { tested++; pinholeopt_unittest(); }
5665     }
5666 
5667     debug
5668     {
5669         code *cstart = c;
5670         if (debugc)
5671         {
5672             printf("+pinholeopt(%p)\n",c);
5673         }
5674     }
5675 
5676     if (b)
5677     {
5678         bn = b.Bnext;
5679         usespace = (config.flags4 & CFG4space && b.BC != BCasm);
5680         useopsize = (I16 || (config.flags4 & CFG4space && b.BC != BCasm));
5681     }
5682     else
5683     {
5684         bn = null;
5685         usespace = (config.flags4 & CFG4space);
5686         useopsize = (I16 || config.flags4 & CFG4space);
5687     }
5688     for (; c; c = code_next(c))
5689     {
5690     L1:
5691         opcode_t op = c.Iop;
5692         if (c.Iflags & CFvex && c.Ivex.pfx == 0xC4)
5693             ins = vex_inssize(c);
5694         else if ((op & 0xFFFD00) == 0x0F3800)
5695             ins = inssize2[(op >> 8) & 0xFF];
5696         else if ((op & 0xFF00) == 0x0F00)
5697             ins = inssize2[op & 0xFF];
5698         else
5699             ins = inssize[op & 0xFF];
5700         if (ins & M)            // if modregrm byte
5701         {
5702             int shortop = (c.Iflags & CFopsize) ? !I16 : I16;
5703             int local_BPRM = BPRM;
5704 
5705             if (c.Iflags & CFaddrsize)
5706                 local_BPRM ^= 5 ^ 6;    // toggle between 5 and 6
5707 
5708             uint rm = c.Irm;
5709             reg_t reg = rm & modregrm(0,7,0);          // isolate reg field
5710             reg_t ereg = rm & 7;
5711             //printf("c = %p, op = %02x rm = %02x\n", c, op, rm);
5712 
5713             /* If immediate second operand      */
5714             if ((ins & T ||
5715                  ((op == 0xF6 || op == 0xF7) && (reg < modregrm(0,2,0) || reg > modregrm(0,3,0)))
5716                 ) &&
5717                 c.IFL2 == FLconst)
5718             {
5719                 int flags = c.Iflags & CFpsw;      /* if want result in flags */
5720                 targ_long u = c.IEV2.Vuns;
5721                 if (ins & E)
5722                     u = cast(byte) u;
5723                 else if (shortop)
5724                     u = cast(short) u;
5725 
5726                 // Replace CMP reg,0 with TEST reg,reg
5727                 if ((op & 0xFE) == 0x80 &&              // 80 is CMP R8,imm8; 81 is CMP reg,imm
5728                     rm >= modregrm(3,7,AX) &&
5729                     u == 0)
5730                 {
5731                     c.Iop = (op & 1) | 0x84;
5732                     c.Irm = modregrm(3,ereg,ereg);
5733                     if (c.Irex & REX_B)
5734                         c.Irex |= REX_R;
5735                     goto L1;
5736                 }
5737 
5738                 /* Optimize ANDs with an immediate constant             */
5739                 if ((op == 0x81 || op == 0x80) && reg == modregrm(0,4,0))
5740                 {
5741                     if (rm >= modregrm(3,4,AX))         // AND reg,imm
5742                     {
5743                         if (u == 0)
5744                         {
5745                             /* Replace with XOR reg,reg     */
5746                             c.Iop = 0x30 | (op & 1);
5747                             c.Irm = modregrm(3,ereg,ereg);
5748                             if (c.Irex & REX_B)
5749                                 c.Irex |= REX_R;
5750                             goto L1;
5751                         }
5752                         if (u == 0xFFFFFFFF && !flags)
5753                         {
5754                             c.Iop = NOP;
5755                             goto L1;
5756                         }
5757                     }
5758                     if (op == 0x81 && !flags)
5759                     {   // If we can do the operation in one byte
5760 
5761                         // If EA is not SI or DI
5762                         if ((rm < modregrm(3,4,SP) || I64) &&
5763                             (config.flags4 & CFG4space ||
5764                              config.target_cpu < TARGET_PentiumPro)
5765                            )
5766                         {
5767                             if ((u & 0xFFFFFF00) == 0xFFFFFF00)
5768                                 goto L2;
5769                             else if (rm < modregrm(3,0,0) || (!c.Irex && ereg < 4))
5770                             {
5771                                 if (!shortop)
5772                                 {
5773                                     if ((u & 0xFFFF00FF) == 0xFFFF00FF)
5774                                         goto L3;
5775                                 }
5776                                 else
5777                                 {
5778                                     if ((u & 0xFF) == 0xFF)
5779                                         goto L3;
5780                                 }
5781                             }
5782                         }
5783                         if (!shortop && useopsize)
5784                         {
5785                             if ((u & 0xFFFF0000) == 0xFFFF0000)
5786                             {
5787                                 c.Iflags ^= CFopsize;
5788                                 goto L1;
5789                             }
5790                             if ((u & 0xFFFF) == 0xFFFF && rm < modregrm(3,4,AX))
5791                             {
5792                                 c.IEV1.Voffset += 2; /* address MSW      */
5793                                 c.IEV2.Vuns >>= 16;
5794                                 c.Iflags ^= CFopsize;
5795                                 goto L1;
5796                             }
5797                             if (rm >= modregrm(3,4,AX))
5798                             {
5799                                 if (u == 0xFF && (rm <= modregrm(3,4,BX) || I64))
5800                                 {
5801                                     c.Iop = MOVZXb;     // MOVZX
5802                                     c.Irm = modregrm(3,ereg,ereg);
5803                                     if (c.Irex & REX_B)
5804                                         c.Irex |= REX_R;
5805                                     goto L1;
5806                                 }
5807                                 if (u == 0xFFFF)
5808                                 {
5809                                     c.Iop = MOVZXw;     // MOVZX
5810                                     c.Irm = modregrm(3,ereg,ereg);
5811                                     if (c.Irex & REX_B)
5812                                         c.Irex |= REX_R;
5813                                     goto L1;
5814                                 }
5815                             }
5816                         }
5817                     }
5818                 }
5819 
5820                 /* Look for ADD,OR,SUB,XOR with u that we can eliminate */
5821                 if (!flags &&
5822                     (op == 0x81 || op == 0x80) &&
5823                     (reg == modregrm(0,0,0) || reg == modregrm(0,1,0) ||  // ADD,OR
5824                      reg == modregrm(0,5,0) || reg == modregrm(0,6,0))    // SUB, XOR
5825                    )
5826                 {
5827                     if (u == 0)
5828                     {
5829                         c.Iop = NOP;
5830                         goto L1;
5831                     }
5832                     if (u == ~0 && reg == modregrm(0,6,0))  /* XOR  */
5833                     {
5834                         c.Iop = 0xF6 | (op & 1);       /* NOT  */
5835                         c.Irm ^= modregrm(0,6^2,0);
5836                         goto L1;
5837                     }
5838                     if (!shortop &&
5839                         useopsize &&
5840                         op == 0x81 &&
5841                         (u & 0xFFFF0000) == 0 &&
5842                         (reg == modregrm(0,6,0) || reg == modregrm(0,1,0)))
5843                     {
5844                         c.Iflags ^= CFopsize;
5845                         goto L1;
5846                     }
5847                 }
5848 
5849                 /* Look for TEST or OR or XOR with an immediate constant */
5850                 /* that we can replace with a byte operation            */
5851                 if (op == 0xF7 && reg == modregrm(0,0,0) ||
5852                     op == 0x81 && reg == modregrm(0,6,0) && !flags ||
5853                     op == 0x81 && reg == modregrm(0,1,0))
5854                 {
5855                     // See if we can replace a dword with a word
5856                     // (avoid for 32 bit instructions, because CFopsize
5857                     //  is too slow)
5858                     if (!shortop && useopsize)
5859                     {
5860                         if ((u & 0xFFFF0000) == 0)
5861                         {
5862                             c.Iflags ^= CFopsize;
5863                             goto L1;
5864                         }
5865                         /* If memory (not register) addressing mode     */
5866                         if ((u & 0xFFFF) == 0 && rm < modregrm(3,0,AX))
5867                         {
5868                             c.IEV1.Voffset += 2; /* address MSW  */
5869                             c.IEV2.Vuns >>= 16;
5870                             c.Iflags ^= CFopsize;
5871                             goto L1;
5872                         }
5873                     }
5874 
5875                     // If EA is not SI or DI
5876                     if (rm < (modregrm(3,0,SP) | reg) &&
5877                         (usespace ||
5878                          config.target_cpu < TARGET_PentiumPro)
5879                        )
5880                     {
5881                         if ((u & 0xFFFFFF00) == 0)
5882                         {
5883                         L2: c.Iop--;           /* to byte instruction  */
5884                             c.Iflags &= ~CFopsize;
5885                             goto L1;
5886                         }
5887                         if (((u & 0xFFFF00FF) == 0 ||
5888                              (shortop && (u & 0xFF) == 0)) &&
5889                             (rm < modregrm(3,0,0) || (!c.Irex && ereg < 4)))
5890                         {
5891                         L3:
5892                             c.IEV2.Vuns >>= 8;
5893                             if (rm >= (modregrm(3,0,AX) | reg))
5894                                 c.Irm |= 4;    /* AX.AH, BX.BH, etc. */
5895                             else
5896                                 c.IEV1.Voffset += 1;
5897                             goto L2;
5898                         }
5899                     }
5900 
5901                     // BUG: which is right?
5902                     //else if ((u & 0xFFFF0000) == 0)
5903 
5904                     else if (0 && op == 0xF7 &&
5905                              rm >= modregrm(3,0,SP) &&
5906                              (u & 0xFFFF0000) == 0)
5907 
5908                         c.Iflags &= ~CFopsize;
5909                 }
5910 
5911                 // Try to replace TEST reg,-1 with TEST reg,reg
5912                 if (op == 0xF6 && rm >= modregrm(3,0,AX) && rm <= modregrm(3,0,7)) // TEST regL,immed8
5913                 {
5914                     if ((u & 0xFF) == 0xFF)
5915                     {
5916                       L4:
5917                         c.Iop = 0x84;          // TEST regL,regL
5918                         c.Irm = modregrm(3,ereg,ereg);
5919                         if (c.Irex & REX_B)
5920                             c.Irex |= REX_R;
5921                         c.Iflags &= ~CFopsize;
5922                         goto L1;
5923                     }
5924                 }
5925                 if (op == 0xF7 && rm >= modregrm(3,0,AX) && rm <= modregrm(3,0,7) && (I64 || ereg < 4))
5926                 {
5927                     if (u == 0xFF)
5928                     {
5929                         if (ereg & 4) // SIL,DIL,BPL,SPL need REX prefix
5930                             c.Irex |= REX;
5931                         goto L4;
5932                     }
5933                     if ((u & 0xFFFF) == 0xFF00 && shortop && !c.Irex && ereg < 4)
5934                     {
5935                         ereg |= 4;                /* to regH      */
5936                         goto L4;
5937                     }
5938                 }
5939 
5940                 /* Look for sign extended immediate data */
5941                 if (cast(byte) u == u)
5942                 {
5943                     if (op == 0x81)
5944                     {
5945                         if (reg != 0x08 && reg != 0x20 && reg != 0x30)
5946                             c.Iop = op = 0x83;         /* 8 bit sgn ext */
5947                     }
5948                     else if (op == 0x69)                /* IMUL rw,ew,dw */
5949                         c.Iop = op = 0x6B;             /* IMUL rw,ew,db */
5950                 }
5951 
5952                 // Look for SHIFT EA,imm8 we can replace with short form
5953                 if (u == 1 && ((op & 0xFE) == 0xC0))
5954                     c.Iop |= 0xD0;
5955 
5956             } /* if immediate second operand */
5957 
5958             /* Look for AX short form */
5959             if (ins & A)
5960             {
5961                 if (rm == modregrm(0,AX,local_BPRM) &&
5962                     !(c.Irex & REX_R) &&               // and it's AX, not R8
5963                     (op & ~3) == 0x88 &&
5964                     !I64)
5965                 {
5966                     op = ((op & 3) + 0xA0) ^ 2;
5967                     /* 8A. A0 */
5968                     /* 8B. A1 */
5969                     /* 88. A2 */
5970                     /* 89. A3 */
5971                     c.Iop = op;
5972                     c.IFL2 = c.IFL1;
5973                     c.IEV2 = c.IEV1;
5974                 }
5975 
5976                 /* Replace MOV REG1,REG2 with MOV EREG1,EREG2   */
5977                 else if (!I16 &&
5978                          (op == 0x89 || op == 0x8B) &&
5979                          (rm & 0xC0) == 0xC0 &&
5980                          (!b || b.BC != BCasm)
5981                         )
5982                     c.Iflags &= ~CFopsize;
5983 
5984                 // If rm is AX
5985                 else if ((rm & modregrm(3,0,7)) == modregrm(3,0,AX) && !(c.Irex & (REX_R | REX_B)))
5986                 {
5987                     switch (op)
5988                     {
5989                         case 0x80:  op = reg | 4; break;
5990                         case 0x81:  op = reg | 5; break;
5991                         case 0x87:  op = 0x90 + (reg>>3); break;    // XCHG
5992 
5993                         case 0xF6:
5994                             if (reg == 0)
5995                                 op = 0xA8;  /* TEST AL,immed8       */
5996                             break;
5997 
5998                         case 0xF7:
5999                             if (reg == 0)
6000                                 op = 0xA9;  /* TEST AX,immed16      */
6001                             break;
6002 
6003                         default:
6004                             break;
6005                     }
6006                     c.Iop = op;
6007                 }
6008             }
6009 
6010             /* Look for reg short form */
6011             if ((ins & R) && (rm & 0xC0) == 0xC0)
6012             {
6013                 switch (op)
6014                 {
6015                     case 0xC6:  op = 0xB0 + ereg; break;
6016                     case 0xC7: // if no sign extension
6017                         if (!(c.Irex & REX_W && c.IEV2.Vint < 0))
6018                         {
6019                             c.Irm = 0;
6020                             c.Irex &= ~REX_W;
6021                             op = 0xB8 + ereg;
6022                         }
6023                         break;
6024 
6025                     case 0xFF:
6026                         switch (reg)
6027                         {   case 6<<3: op = 0x50+ereg; break;/* PUSH*/
6028                             case 0<<3: if (!I64) op = 0x40+ereg; break; /* INC*/
6029                             case 1<<3: if (!I64) op = 0x48+ereg; break; /* DEC*/
6030                             default: break;
6031                         }
6032                         break;
6033 
6034                     case 0x8F:  op = 0x58 + ereg; break;
6035                     case 0x87:
6036                         if (reg == 0 && !(c.Irex & (REX_R | REX_B))) // Issue 12968: Needed to ensure it's referencing RAX, not R8
6037                             op = 0x90 + ereg;
6038                         break;
6039 
6040                     default:
6041                         break;
6042                 }
6043                 c.Iop = op;
6044             }
6045 
6046             // Look to remove redundant REX prefix on XOR
6047             if (c.Irex == REX_W // ignore ops involving R8..R15
6048                 && (op == 0x31 || op == 0x33) // XOR
6049                 && ((rm & 0xC0) == 0xC0) // register direct
6050                 && ((reg >> 3) == ereg)) // register with itself
6051             {
6052                 c.Irex = 0;
6053             }
6054 
6055             // Look to replace SHL reg,1 with ADD reg,reg
6056             if ((op & ~1) == 0xD0 &&
6057                      (rm & modregrm(3,7,0)) == modregrm(3,4,0) &&
6058                      config.target_cpu >= TARGET_80486)
6059             {
6060                 c.Iop &= 1;
6061                 c.Irm = cast(ubyte)((rm & modregrm(3,0,7)) | (ereg << 3));
6062                 if (c.Irex & REX_B)
6063                     c.Irex |= REX_R;
6064                 if (!(c.Iflags & CFpsw) && !I16)
6065                     c.Iflags &= ~CFopsize;
6066                 goto L1;
6067             }
6068 
6069             /* Look for sign extended modregrm displacement, or 0
6070              * displacement.
6071              */
6072 
6073             if (((rm & 0xC0) == 0x80) && // it's a 16/32 bit disp
6074                 c.IFL1 == FLconst)      // and it's a constant
6075             {
6076                 a = c.IEV1.Vpointer;
6077                 if (a == 0 && (rm & 7) != local_BPRM &&         // if 0[disp]
6078                     !(local_BPRM == 5 && (rm & 7) == 4 && (c.Isib & 7) == BP)
6079                    )
6080                     c.Irm &= 0x3F;
6081                 else if (!I16)
6082                 {
6083                     if (cast(targ_size_t)cast(targ_schar)a == a)
6084                         c.Irm ^= 0xC0;                 /* do 8 sx      */
6085                 }
6086                 else if ((cast(targ_size_t)cast(targ_schar)a & 0xFFFF) == (a & 0xFFFF))
6087                     c.Irm ^= 0xC0;                     /* do 8 sx      */
6088             }
6089 
6090             /* Look for LEA reg,[ireg], replace with MOV reg,ireg       */
6091             if (op == LEA)
6092             {
6093                 rm = c.Irm & 7;
6094                 mod = c.Irm & modregrm(3,0,0);
6095                 if (mod == 0)
6096                 {
6097                     if (!I16)
6098                     {
6099                         switch (rm)
6100                         {
6101                             case 4:
6102                             case 5:
6103                                 break;
6104 
6105                             default:
6106                                 c.Irm |= modregrm(3,0,0);
6107                                 c.Iop = 0x8B;
6108                                 break;
6109                         }
6110                     }
6111                     else
6112                     {
6113                         switch (rm)
6114                         {
6115                             case 4:     rm = modregrm(3,0,SI);  goto L6;
6116                             case 5:     rm = modregrm(3,0,DI);  goto L6;
6117                             case 7:     rm = modregrm(3,0,BX);  goto L6;
6118                             L6:     c.Irm = cast(ubyte)(rm + reg);
6119                                     c.Iop = 0x8B;
6120                                     break;
6121 
6122                             default:
6123                                     break;
6124                         }
6125                     }
6126                 }
6127 
6128                 /* replace LEA reg,0[BP] with MOV reg,BP        */
6129                 else if (mod == modregrm(1,0,0) && rm == local_BPRM &&
6130                         c.IFL1 == FLconst && c.IEV1.Vpointer == 0)
6131                 {
6132                     c.Iop = 0x8B;          /* MOV reg,BP   */
6133                     c.Irm = cast(ubyte)(modregrm(3,0,BP) + reg);
6134                 }
6135             }
6136 
6137             // Replace [R13] with 0[R13]
6138             if (c.Irex & REX_B && ((c.Irm & modregrm(3,0,7)) == modregrm(0,0,BP) ||
6139                                     issib(c.Irm) && (c.Irm & modregrm(3,0,0)) == 0 && (c.Isib & 7) == BP))
6140             {
6141                 c.Irm |= modregrm(1,0,0);
6142                 c.IFL1 = FLconst;
6143                 c.IEV1.Vpointer = 0;
6144             }
6145         }
6146         else if (!(c.Iflags & CFvex))
6147         {
6148             switch (op)
6149             {
6150                 default:
6151                     // Look for MOV r64, immediate
6152                     if ((c.Irex & REX_W) && (op & ~7) == 0xB8)
6153                     {
6154                         /* Look for zero extended immediate data */
6155                         if (c.IEV2.Vsize_t == c.IEV2.Vuns)
6156                         {
6157                             c.Irex &= ~REX_W;
6158                         }
6159                         /* Look for sign extended immediate data */
6160                         else if (c.IEV2.Vsize_t == c.IEV2.Vint)
6161                         {
6162                             c.Irm = modregrm(3,0,op & 7);
6163                             c.Iop = op = 0xC7;
6164                             c.IEV2.Vsize_t = c.IEV2.Vuns;
6165                         }
6166                     }
6167                     if ((op & ~0x0F) != 0x70)
6168                         break;
6169                     goto case JMP;
6170 
6171                 case JMP:
6172                     switch (c.IFL2)
6173                     {
6174                         case FLcode:
6175                             if (c.IEV2.Vcode == code_next(c))
6176                             {
6177                                 c.Iop = NOP;
6178                                 continue;
6179                             }
6180                             break;
6181 
6182                         case FLblock:
6183                             if (!code_next(c) && c.IEV2.Vblock == bn)
6184                             {
6185                                 c.Iop = NOP;
6186                                 continue;
6187                             }
6188                             break;
6189 
6190                         case FLconst:
6191                         case FLfunc:
6192                         case FLextern:
6193                             break;
6194 
6195                         default:
6196                             WRFL(cast(FL)c.IFL2);
6197                             assert(0);
6198                     }
6199                     break;
6200 
6201                 case 0x68:                      // PUSH immed16
6202                     if (c.IFL2 == FLconst)
6203                     {
6204                         targ_long u = c.IEV2.Vuns;
6205                         if (I64 ||
6206                             ((c.Iflags & CFopsize) ? I16 : I32))
6207                         {   // PUSH 32/64 bit operand
6208                             if (u == cast(byte) u)
6209                                 c.Iop = 0x6A;          // PUSH immed8
6210                         }
6211                         else // PUSH 16 bit operand
6212                         {
6213                             if (cast(short)u == cast(byte) u)
6214                                 c.Iop = 0x6A;          // PUSH immed8
6215                         }
6216                     }
6217                     break;
6218             }
6219         }
6220     }
6221 
6222     debug
6223     if (debugc)
6224     {
6225         printf("-pinholeopt(%p)\n",cstart);
6226         for (c = cstart; c; c = code_next(c))
6227             code_print(c);
6228     }
6229 }
6230 
6231 
debug
{
/**************************
 * Table driven self test for pinholeopt().
 *
 * Every row of the table is a pair: the instruction handed to pinholeopt()
 * and the instruction expected back. A non-zero `model` in the input half
 * restricts the row to the matching memory model (16, 32 or 64); rows with
 * model 0 run for every model. Only the input half's model is consulted.
 */
private void pinholeopt_unittest()
{
    //printf("pinholeopt_unittest()\n");
    static struct CS
    {
        uint model,op,ea;
        targ_size_t ev1,ev2;
        uint flags;
    }
    __gshared CS[2][22] tests =
    [
        // XOR reg,immed                            NOT regL
        [ { 16,0x81,modregrm(3,6,BX),0,0xFF,0 },    { 0,0xF6,modregrm(3,2,BX),0,0xFF } ],

        // MOV 0[BX],3                               MOV [BX],3
        [ { 16,0xC7,modregrm(2,0,7),0,3 },          { 0,0xC7,modregrm(0,0,7),0,3 } ],

/+      // only if config.flags4 & CFG4space
        // TEST regL,immed8
        [ { 0,0xF6,modregrm(3,0,BX),0,0xFF,0 },    { 0,0x84,modregrm(3,BX,BX),0,0xFF }],
        [ { 0,0xF7,modregrm(3,0,BX),0,0xFF,0 },    { 0,0x84,modregrm(3,BX,BX),0,0xFF }],
        [ { 64,0xF6,modregrmx(3,0,R8),0,0xFF,0 },  { 0,0x84,modregxrmx(3,R8,R8),0,0xFF }],
        [ { 64,0xF7,modregrmx(3,0,R8),0,0xFF,0 },  { 0,0x84,modregxrmx(3,R8,R8),0,0xFF }],
+/

        // PUSH immed => PUSH immed8
        [ { 0,0x68,0,0,0 },    { 0,0x6A,0,0,0 }],
        [ { 0,0x68,0,0,0x7F }, { 0,0x6A,0,0,0x7F }],
        [ { 0,0x68,0,0,0x80 }, { 0,0x68,0,0,0x80 }],
        [ { 16,0x68,0,0,0,CFopsize },    { 0,0x6A,0,0,0,CFopsize }],
        [ { 16,0x68,0,0,0x7F,CFopsize }, { 0,0x6A,0,0,0x7F,CFopsize }],
        [ { 16,0x68,0,0,0x80,CFopsize }, { 0,0x68,0,0,0x80,CFopsize }],
        [ { 16,0x68,0,0,0x10000,0 },     { 0,0x6A,0,0,0x10000,0 }],
        [ { 16,0x68,0,0,0x10000,CFopsize }, { 0,0x68,0,0,0x10000,CFopsize }],
        [ { 32,0x68,0,0,0,CFopsize },    { 0,0x6A,0,0,0,CFopsize }],
        [ { 32,0x68,0,0,0x7F,CFopsize }, { 0,0x6A,0,0,0x7F,CFopsize }],
        [ { 32,0x68,0,0,0x80,CFopsize }, { 0,0x68,0,0,0x80,CFopsize }],
        [ { 32,0x68,0,0,0x10000,CFopsize },    { 0,0x6A,0,0,0x10000,CFopsize }],
        [ { 32,0x68,0,0,0x8000,CFopsize }, { 0,0x68,0,0,0x8000,CFopsize }],

        // clear r64, for r64 != R8..R15
        [ { 64,0x31,0x800C0,0,0,0 }, { 0,0x31,0xC0,0,0,0}],
        [ { 64,0x33,0x800C0,0,0,0 }, { 0,0x33,0xC0,0,0,0}],

        // MOV r64, immed
        [ { 64,0xC7,0x800C0,0,0xFFFFFFFF,0 }, { 0,0xC7,0x800C0,0,0xFFFFFFFF,0}],
        [ { 64,0xC7,0x800C0,0,0x7FFFFFFF,0 }, { 0,0xB8,0,0,0x7FFFFFFF,0}],
        [ { 64,0xB8,0x80000,0,0xFFFFFFFF,0 }, { 0,0xB8,0,0,0xFFFFFFFF,0 }],
        [ { 64,0xB8,0x80000,0,cast(targ_size_t)0x1FFFFFFFF,0 }, { 0,0xB8,0x80000,0,cast(targ_size_t)0x1FFFFFFFF,0 }],
        [ { 64,0xB8,0x80000,0,cast(targ_size_t)0xFFFFFFFFFFFFFFFF,0 }, { 0,0xC7,0x800C0,0,cast(targ_size_t)0xFFFFFFFF,0}],
    ];

    //config.flags4 |= CFG4space;
    foreach (i; 0 .. cast(int)tests.length)
    {
        CS* input    = &tests[i][0];
        CS* expected = &tests[i][1];

        // Skip rows written for a memory model other than the current one
        const m = input.model;
        if (m && ((I16 && m != 16) || (I32 && m != 32) || (I64 && m != 64)))
            continue;
        //printf("[%d]\n", i);

        // Build a zeroed instruction carrying the input half of the row
        code cs = void;
        memset(&cs, 0, cs.sizeof);
        cs.Iop = input.op;
        cs.Iea = input.ea;
        cs.IFL1 = FLconst;
        cs.IFL2 = FLconst;
        cs.IEV1.Vsize_t = input.ev1;
        cs.IEV2.Vsize_t = input.ev2;
        cs.Iflags = input.flags;

        pinholeopt(&cs, null);

        // Compare every tracked field against the expected half
        if (cs.Iop != expected.op)
        {
            printf("[%d] Iop = x%02x, pout = x%02x\n", i, cs.Iop, expected.op);
            assert(0);
        }
        assert(cs.Iea == expected.ea);
        assert(cs.IEV1.Vsize_t == expected.ev1);
        assert(cs.IEV2.Vsize_t == expected.ev2);
        assert(cs.Iflags == expected.flags);
    }
}
}
6321 
/**************************
 * If the instruction is an immediate-operand form (opcode 0x80 or 0x81)
 * with a constant second operand, and reghasvalue() finds a register that
 * already holds that constant, rewrite it in place as the register form
 * of the same operation.
 * Nothing is done unless CFG4optimized is set, nor for 16 bit code that
 * carries an operand size prefix.
 * Params:
 *      c = instruction to examine; modified in place when the rewrite applies
 */
void simplify_code(code* c)
{
    if (!(config.flags4 & CFG4optimized))
        return;
    if (c.Iop != 0x81 && c.Iop != 0x80)
        return;
    if (c.IFL2 != FLconst)
        return;

    // 0x80 is the byte form, so only byte registers qualify for it
    reg_t r;
    if (!reghasvalue((c.Iop == 0x80) ? BYTEREGS : ALLREGS,I64 ? c.IEV2.Vsize_t : c.IEV2.Vlong,&r))
        return;
    if (I16 && c.Iflags & CFopsize)
        return;

    // Base opcode of the register form, indexed by the reg field of the
    // modregrm byte
    static immutable ubyte[8] regop =
            [ 0x00,0x08,0x10,0x18,0x20,0x28,0x30,0x38 ];

    //printf("replacing 0x%02x, val = x%lx\n",c.Iop,c.IEV2.Vlong);
    const group = (c.Irm & modregrm(0,7,0)) >> 3;
    c.Iop = regop[group] | (c.Iop & 1);     // keep the byte/word bit
    code_newreg(c, r);

    // Byte operation on a register numbered 4..7: set a REX prefix in 64 bit code
    if (I64 && !(c.Iop & 1) && (r & 4))
        c.Irex |= REX;
}
6343 
6344 /**************************
6345  * Compute jump addresses for FLcode.
6346  * Note: only works for forward referenced code.
6347  *       only direct jumps and branches are detected.
6348  *       LOOP instructions only work for backward refs.
6349  */
6350 
6351 void jmpaddr(code *c)
6352 {
6353     code* ci,cn,ctarg,cstart;
6354     targ_size_t ad;
6355 
6356     //printf("jmpaddr()\n");
6357     cstart = c;                           /* remember start of code       */
6358     while (c)
6359     {
6360         const op = c.Iop;
6361         if (op <= 0xEB &&
6362             inssize[op] & T &&   // if second operand
6363             c.IFL2 == FLcode &&
6364             ((op & ~0x0F) == 0x70 || op == JMP || op == JMPS || op == JCXZ || op == CALL))
6365         {
6366             ci = code_next(c);
6367             ctarg = c.IEV2.Vcode;  /* target code                  */
6368             ad = 0;                 /* IP displacement              */
6369             while (ci && ci != ctarg)
6370             {
6371                 ad += calccodsize(ci);
6372                 ci = code_next(ci);
6373             }
6374             if (!ci)
6375                 goto Lbackjmp;      // couldn't find it
6376             if (!I16 || op == JMP || op == JMPS || op == JCXZ || op == CALL)
6377                 c.IEV2.Vpointer = ad;
6378             else                    /* else conditional             */
6379             {
6380                 if (!(c.Iflags & CFjmp16))     /* if branch    */
6381                     c.IEV2.Vpointer = ad;
6382                 else            /* branch around a long jump    */
6383                 {
6384                     cn = code_next(c);
6385                     c.next = code_calloc();
6386                     code_next(c).next = cn;
6387                     c.Iop = op ^ 1;        /* converse jmp */
6388                     c.Iflags &= ~CFjmp16;
6389                     c.IEV2.Vpointer = I16 ? 3 : 5;
6390                     cn = code_next(c);
6391                     cn.Iop = JMP;          /* long jump    */
6392                     cn.IFL2 = FLconst;
6393                     cn.IEV2.Vpointer = ad;
6394                 }
6395             }
6396             c.IFL2 = FLconst;
6397         }
6398         if (op == LOOP && c.IFL2 == FLcode)    /* backwards refs       */
6399         {
6400           Lbackjmp:
6401             ctarg = c.IEV2.Vcode;
6402             for (ci = cstart; ci != ctarg; ci = code_next(ci))
6403                 if (!ci || ci == c)
6404                     assert(0);
6405             ad = 2;                 /* - IP displacement            */
6406             while (ci != c)
6407             {
6408                 assert(ci);
6409                 ad += calccodsize(ci);
6410                 ci = code_next(ci);
6411             }
6412             c.IEV2.Vpointer = (-ad) & 0xFF;
6413             c.IFL2 = FLconst;
6414         }
6415         c = code_next(c);
6416     }
6417 }
6418 
6419 /*******************************
6420  * Calculate bl.Bsize.
6421  */
6422 
6423 uint calcblksize(code *c)
6424 {
6425     uint size;
6426     for (size = 0; c; c = code_next(c))
6427     {
6428         uint sz = calccodsize(c);
6429         //printf("off=%02x, sz = %d, code %p: op=%02x\n", size, sz, c, c.Iop);
6430         size += sz;
6431     }
6432     //printf("calcblksize(c = x%x) = %d\n", c, size);
6433     return size;
6434 }
6435 
6436 /*****************************
6437  * Calculate and return code size of a code.
6438  * Note that NOPs are sometimes used as markers, but are
6439  * never output. LINNUMs are never output.
6440  * Note: This routine must be fast. Profiling shows it is significant.
6441  */
6442 
6443 uint calccodsize(code *c)
6444 {
6445     uint size;
6446     ubyte rm,mod,ins;
6447     uint iflags;
6448     uint i32 = I32 || I64;
6449     uint a32 = i32;
6450 
6451     debug
6452     assert((a32 & ~1) == 0);
6453 
6454     iflags = c.Iflags;
6455     opcode_t op = c.Iop;
6456     //printf("calccodsize(x%08x), Iflags = x%x\n", op, iflags);
6457     if (iflags & CFvex && c.Ivex.pfx == 0xC4)
6458     {
6459         ins = vex_inssize(c);
6460         size = ins & 7;
6461         goto Lmodrm;
6462     }
6463     else if ((op & 0xFF00) == 0x0F00 || (op & 0xFFFD00) == 0x0F3800)
6464         op = 0x0F;
6465     else
6466         op &= 0xFF;
6467     switch (op)
6468     {
6469         case 0x0F:
6470             if ((c.Iop & 0xFFFD00) == 0x0F3800)
6471             {   // 3 byte op ( 0F38-- or 0F3A-- )
6472                 ins = inssize2[(c.Iop >> 8) & 0xFF];
6473                 size = ins & 7;
6474                 if (c.Iop & 0xFF000000)
6475                   size++;
6476             }
6477             else
6478             {   // 2 byte op ( 0F-- )
6479                 ins = inssize2[c.Iop & 0xFF];
6480                 size = ins & 7;
6481                 if (c.Iop & 0xFF0000)
6482                   size++;
6483             }
6484             break;
6485 
6486         case 0x90:
6487             size = (c.Iop == PAUSE) ? 2 : 1;
6488             goto Lret2;
6489 
6490         case NOP:
6491         case ESCAPE:
6492             size = 0;                   // since these won't be output
6493             goto Lret2;
6494 
6495         case ASM:
6496             if (c.Iflags == CFaddrsize)        // kludge for DA inline asm
6497                 size = _tysize[TYnptr];
6498             else
6499                 size = cast(uint)c.IEV1.len;
6500             goto Lret2;
6501 
6502         case 0xA1:
6503         case 0xA3:
6504             if (c.Irex)
6505             {
6506                 size = 9;               // 64 bit immediate value for MOV to/from RAX
6507                 goto Lret;
6508             }
6509             goto Ldefault;
6510 
6511         case 0xF6:                      /* TEST mem8,immed8             */
6512             ins = inssize[op];
6513             size = ins & 7;
6514             if (i32)
6515                 size = inssize32[op];
6516             if ((c.Irm & (7<<3)) == 0)
6517                 size++;                 /* size of immed8               */
6518             break;
6519 
6520         case 0xF7:
6521             ins = inssize[op];
6522             size = ins & 7;
6523             if (i32)
6524                 size = inssize32[op];
6525             if ((c.Irm & (7<<3)) == 0)
6526                 size += (i32 ^ ((iflags & CFopsize) !=0)) ? 4 : 2;
6527             break;
6528 
6529         default:
6530         Ldefault:
6531             ins = inssize[op];
6532             size = ins & 7;
6533             if (i32)
6534                 size = inssize32[op];
6535     }
6536 
6537     if (iflags & (CFwait | CFopsize | CFaddrsize | CFSEG))
6538     {
6539         if (iflags & CFwait)    // if add FWAIT prefix
6540             size++;
6541         if (iflags & CFSEG)     // if segment override
6542             size++;
6543 
6544         // If the instruction has a second operand that is not an 8 bit,
6545         // and the operand size prefix is present, then fix the size computation
6546         // because the operand size will be different.
6547         // Walter, I had problems with this bit at the end.  There can still be
6548         // an ADDRSIZE prefix for these and it does indeed change the operand size.
6549 
6550         if (iflags & (CFopsize | CFaddrsize))
6551         {
6552             if ((ins & (T|E)) == T)
6553             {
6554                 if ((op & 0xAC) == 0xA0)
6555                 {
6556                     if (iflags & CFaddrsize && !I64)
6557                     {   if (I32)
6558                             size -= 2;
6559                         else
6560                             size += 2;
6561                     }
6562                 }
6563                 else if (iflags & CFopsize)
6564                 {   if (I16)
6565                         size += 2;
6566                     else
6567                         size -= 2;
6568                 }
6569             }
6570             if (iflags & CFaddrsize)
6571             {   if (!I64)
6572                     a32 ^= 1;
6573                 size++;
6574             }
6575             if (iflags & CFopsize)
6576                 size++;                         /* +1 for OPSIZE prefix         */
6577         }
6578     }
6579 
6580 Lmodrm:
6581     if ((op & ~0x0F) == 0x70)
6582     {
6583         if (iflags & CFjmp16)           // if long branch
6584             size += I16 ? 3 : 4;        // + 3(4) bytes for JMP
6585     }
6586     else if (ins & M)                   // if modregrm byte
6587     {
6588         rm = c.Irm;
6589         mod = rm & 0xC0;
6590         if (a32 || I64)
6591         {   // 32 bit addressing
6592             if (issib(rm))
6593                 size++;
6594             switch (mod)
6595             {   case 0:
6596                     if (issib(rm) && (c.Isib & 7) == 5 ||
6597                         (rm & 7) == 5)
6598                         size += 4;      /* disp32                       */
6599                     if (c.Irex & REX_B && (rm & 7) == 5)
6600                         /* Instead of selecting R13, this mode is an [RIP] relative
6601                          * address. Although valid, it's redundant, and should not
6602                          * be generated. Instead, generate 0[R13] instead of [R13].
6603                          */
6604                         assert(0);
6605                     break;
6606 
6607                 case 0x40:
6608                     size++;             /* disp8                        */
6609                     break;
6610 
6611                 case 0x80:
6612                     size += 4;          /* disp32                       */
6613                     break;
6614 
6615                 default:
6616                     break;
6617             }
6618         }
6619         else
6620         {   // 16 bit addressing
6621             if (mod == 0x40)            /* 01: 8 bit displacement       */
6622                 size++;
6623             else if (mod == 0x80 || (mod == 0 && (rm & 7) == 6))
6624                 size += 2;
6625         }
6626     }
6627 
6628 Lret:
6629     if (!(iflags & CFvex) && c.Irex)
6630     {
6631         size++;
6632         if (c.Irex & REX_W && (op & ~7) == 0xB8)
6633             size += 4;
6634     }
6635 Lret2:
6636     //printf("op = x%02x, size = %d\n",op,size);
6637     return size;
6638 }
6639 
6640 /********************************
6641  * Return !=0 if codes match.
6642  */
6643 
6644 static if (0)
6645 {
6646 
6647 int code_match(code *c1,code *c2)
6648 {
6649     code cs1,cs2;
6650     ubyte ins;
6651 
6652     if (c1 == c2)
6653         goto match;
6654     cs1 = *c1;
6655     cs2 = *c2;
6656     if (cs1.Iop != cs2.Iop)
6657         goto nomatch;
6658     switch (cs1.Iop)
6659     {
6660         case ESCAPE | ESCctor:
6661         case ESCAPE | ESCdtor:
6662             goto nomatch;
6663 
6664         case NOP:
6665             goto match;
6666 
6667         case ASM:
6668             if (cs1.IEV1.len == cs2.IEV1.len &&
6669                 memcmp(cs1.IEV1.bytes,cs2.IEV1.bytes,cs1.EV1.len) == 0)
6670                 goto match;
6671             else
6672                 goto nomatch;
6673 
6674         default:
6675             if ((cs1.Iop & 0xFF) == ESCAPE)
6676                 goto match;
6677             break;
6678     }
6679     if (cs1.Iflags != cs2.Iflags)
6680         goto nomatch;
6681 
6682     ins = inssize[cs1.Iop & 0xFF];
6683     if ((cs1.Iop & 0xFFFD00) == 0x0F3800)
6684     {
6685         ins = inssize2[(cs1.Iop >> 8) & 0xFF];
6686     }
6687     else if ((cs1.Iop & 0xFF00) == 0x0F00)
6688     {
6689         ins = inssize2[cs1.Iop & 0xFF];
6690     }
6691 
6692     if (ins & M)                // if modregrm byte
6693     {
6694         if (cs1.Irm != cs2.Irm)
6695             goto nomatch;
6696         if ((cs1.Irm & 0xC0) == 0xC0)
6697             goto do2;
6698         if (is32bitaddr(I32,cs1.Iflags))
6699         {
6700             if (issib(cs1.Irm) && cs1.Isib != cs2.Isib)
6701                 goto nomatch;
6702             if (
6703                 ((rm & 0xC0) == 0 && !((rm & 7) == 4 && (c.Isib & 7) == 5 || (rm & 7) == 5))
6704                )
6705                 goto do2;       /* if no first operand  */
6706         }
6707         else
6708         {
6709             if (
6710                 ((rm & 0xC0) == 0 && !((rm & 7) == 6))
6711                )
6712                 goto do2;       /* if no first operand  */
6713         }
6714         if (cs1.IFL1 != cs2.IFL1)
6715             goto nomatch;
6716         if (flinsymtab[cs1.IFL1] && cs1.IEV1.Vsym != cs2.IEV1.Vsym)
6717             goto nomatch;
6718         if (cs1.IEV1.Voffset != cs2.IEV1.Voffset)
6719             goto nomatch;
6720     }
6721 
6722 do2:
6723     if (!(ins & T))                     // if no second operand
6724         goto match;
6725     if (cs1.IFL2 != cs2.IFL2)
6726         goto nomatch;
6727     if (flinsymtab[cs1.IFL2] && cs1.IEV2.Vsym != cs2.IEV2.Vsym)
6728         goto nomatch;
6729     if (cs1.IEV2.Voffset != cs2.IEV2.Voffset)
6730         goto nomatch;
6731 
6732 match:
6733     return 1;
6734 
6735 nomatch:
6736     return 0;
6737 }
6738 
6739 }
6740 
6741 /**************************
6742  * Write code to intermediate file.
6743  * Code starts at offset.
6744  * Returns:
6745  *      addr of end of code
6746  */
6747 
/**************************
 * Small fixed-size staging buffer for bytes destined for a segment.
 * Bytes are collected with gen()/genp() and handed to objmod.bytes()
 * by flush()/flushx().
 * gen() and genp() do no bounds checking; callers are expected to
 * consult available() first.
 */
private struct MiniCodeBuf
{
nothrow:
    size_t index;       // number of bytes currently held in bytes[]
    size_t offset;      // offset within seg at which the next flush writes
    int seg;            // segment the bytes are written to
    char[100] bytes; // = void;

    /// Start an empty buffer positioned at the current offset of segment seg
    this(int seg)
    {
        index = 0;
        this.offset = cast(size_t)Offset(seg);
        this.seg = seg;
    }

    /// Write out the held bytes unconditionally and empty the buffer
    void flushx()
    {
        // Emit accumulated bytes to code segment.
        // A completely full buffer (index == bytes.length) is a legitimate
        // state: gen()/genp() may fill the last slot, at which point
        // available() reports 0. Only index > bytes.length means an overrun.
        debug assert(index <= bytes.length);
        // NOTE(review): objmod.bytes() presumably returns the number of
        // bytes written - confirm
        offset += objmod.bytes(seg, offset, cast(uint)index, bytes.ptr);
        index = 0;
    }

    /// Append one byte
    void gen(char c) { bytes[index++] = c; }

    /// Append n bytes copied from p
    void genp(size_t n, void *p) { memcpy(&bytes[index], p, n); index += n; }

    /// Write out the held bytes, if there are any
    void flush() { if (index) flushx(); }

    /// Offset in the segment of the next byte to be appended
    uint getOffset() { return cast(uint)(offset + index); }

    /// Number of bytes that can still be appended before the buffer is full
    uint available() { return cast(uint)(bytes.sizeof - index); }
}
6781 
// Forward declarations; the bodies are not in this part of the file.
// NOTE(review): judging by the names, each one writes an operand of the
// given width (described by the FL kind and the evc value) into pbuf -
// confirm against the definitions.
private void do8bit(MiniCodeBuf *pbuf, FL, evc *);
private void do16bit(MiniCodeBuf *pbuf, FL, evc *,int);
private void do32bit(MiniCodeBuf *pbuf, FL, evc *,int,int = 0);
private void do64bit(MiniCodeBuf *pbuf, FL, evc *,int);
6786 
/*************************************************
 * Encode the instruction list `c` and write it to object segment `seg`.
 *
 * Instructions are assembled into a small local MiniCodeBuf which is written
 * out through objmod.bytes() whenever it runs low on room or a fixup has to
 * be emitted. Operands are produced by do8bit/do16bit/do32bit/do64bit.
 * ESCAPE pseudo-ops and plain NOPs generate no bytes; ASM entries are copied
 * through as raw bytes.
 *
 * Note that the list can be modified while it is written: a long conditional
 * jump (CFjmp16) is rewritten in place, and in 16 bit mode an extra JMP
 * instruction is linked in after it.
 *
 * Params:
 *      seg = segment the code is written to
 *      c = first instruction of the list (may be null)
 * Returns:
 *      the segment offset following the last byte written; the same value is
 *      stored into Offset(seg)
 */
uint codout(int seg, code *c)
{
    ubyte rm,mod;
    ubyte ins;
    code *cn;
    uint flags;
    Symbol *s;

    debug
    if (debugc) printf("codout(%p), Coffset = x%llx\n",c,cast(ulong)Offset(seg));

    // Staging buffer; void-initialized, so its bookkeeping fields are set by hand
    MiniCodeBuf ggen = void;
    ggen.index = 0;
    ggen.offset = cast(size_t)Offset(seg);
    ggen.seg = seg;

    for (; c; c = code_next(c))
    {
        debug
        {
        if (debugc) { printf("off=%02x, sz=%d, ",cast(int)ggen.getOffset(),cast(int)calccodsize(c)); code_print(c); }
        uint startoffset = ggen.getOffset();
        }

        opcode_t op = c.Iop;
        // ins holds the operand-layout bits (M, T, E) tested further down;
        // it is replaced below for VEX and 0F-prefixed opcodes
        ins = inssize[op & 0xFF];
        switch (op & 0xFF)
        {
            case ESCAPE:
                /* Check for SSE4 opcode v/pmaxuw xmm1,xmm2/m128 */
                if(op == 0x660F383E || c.Iflags & CFvex) break;

                // Genuine ESCAPE pseudo-op: act on it, emit no bytes
                switch (op & 0xFFFF00)
                {   case ESClinnum:
                        /* put out line number stuff    */
                        objmod.linnum(c.IEV1.Vsrcpos,seg,ggen.getOffset());
                        break;
version (SCPP)
{
static if (1)
{
                    case ESCctor:
                    case ESCdtor:
                    case ESCoffset:
                        if (config.exe != EX_WIN32)
                            except_pair_setoffset(c,ggen.getOffset() - funcoffset);
                        break;

                    case ESCmark:
                    case ESCrelease:
                    case ESCmark2:
                    case ESCrelease2:
                        break;
}
else
{
                    case ESCctor:
                        except_push(ggen.getOffset() - funcoffset,c.IEV1.Vtor,null);
                        break;

                    case ESCdtor:
                        except_pop(ggen.getOffset() - funcoffset,c.IEV1.Vtor,null);
                        break;

                    case ESCmark:
                        except_mark();
                        break;

                    case ESCrelease:
                        except_release();
                        break;
}
}
                    case ESCadjesp:
                        //printf("adjust ESP %ld\n", (long)c.IEV1.Vint);
                        break;

                    default:
                        break;
                }

                debug
                assert(calccodsize(c) == 0);

                continue;

            case NOP:                   /* don't send them out          */
                // only an exact NOP is dropped; longer opcodes ending in the same byte are real
                if (op != NOP)
                    break;
                debug
                assert(calccodsize(c) == 0);

                continue;

            case ASM:
                if (op != ASM)
                    break;
                // buffer must be empty because the bytes below go straight to objmod
                ggen.flush();
                if (c.Iflags == CFaddrsize)    // kludge for DA inline asm
                {
                    do32bit(&ggen, FLblockoff,&c.IEV1,0);
                }
                else
                {
                    ggen.offset += objmod.bytes(seg,ggen.offset,cast(uint)c.IEV1.len,c.IEV1.bytes);
                }
                debug
                assert(calccodsize(c) == c.IEV1.len);

                continue;

            default:
                break;
        }
        flags = c.Iflags;

        // See if we need to flush (don't have room for largest code sequence)
        if (ggen.available() < (1+4+4+8+8))
            ggen.flush();

        // see if we need to put out prefix bytes
        if (flags & (CFwait | CFPREFIX | CFjmp16))
        {
            int override_;

            if (flags & CFwait)
                ggen.gen(0x9B);                      // FWAIT
                                                /* ? SEGES : SEGSS      */
            switch (flags & CFSEG)
            {   case CFes:      override_ = SEGES;       goto segover;
                case CFss:      override_ = SEGSS;       goto segover;
                case CFcs:      override_ = SEGCS;       goto segover;
                case CFds:      override_ = SEGDS;       goto segover;
                case CFfs:      override_ = SEGFS;       goto segover;
                case CFgs:      override_ = SEGGS;       goto segover;
                segover:        ggen.gen(cast(ubyte)override_);
                                break;

                default:        break;
            }

            if (flags & CFaddrsize)
                ggen.gen(0x67);

            // Do this last because of instructions like ADDPD
            if (flags & CFopsize)
                ggen.gen(0x66);                      /* operand size         */

            if ((op & ~0x0F) == 0x70 && flags & CFjmp16) /* long condit jmp */
            {
                if (!I16)
                {   // Put out 16 bit conditional jump
                    // i.e. switch from the one byte 7x opcode to the two byte 0F 8x form
                    c.Iop = op = 0x0F00 | (0x80 | (op & 0x0F));
                }
                else
                {
                    // Rewrite as an inverted short jump around a newly
                    // inserted JMP that goes to the original target
                    cn = code_calloc();
                    /*cxcalloc++;*/
                    cn.next = code_next(c);
                    c.next= cn;          // link into code
                    cn.Iop = JMP;              // JMP block
                    cn.IFL2 = c.IFL2;
                    cn.IEV2.Vblock = c.IEV2.Vblock;
                    c.Iop = op ^= 1;           // toggle condition
                    c.IFL2 = FLconst;
                    // NOTE(review): this branch is only reached when I16 is set, so the value is always 3
                    c.IEV2.Vpointer = I16 ? 3 : 5; // skip over JMP block
                    c.Iflags &= ~CFjmp16;
                }
            }
        }

        if (flags & CFvex)
        {
            // VEX encoded: emit the 3 byte (C4) or 2 byte (C5) prefix followed by
            // the opcode byte, then go straight to the operand bytes
            if (flags & CFvex3)
            {
                ggen.gen(0xC4);
                ggen.gen(cast(ubyte)VEX3_B1(c.Ivex));
                ggen.gen(cast(ubyte)VEX3_B2(c.Ivex));
                ggen.gen(c.Ivex.op);
            }
            else
            {
                ggen.gen(0xC5);
                ggen.gen(cast(ubyte)VEX2_B1(c.Ivex));
                ggen.gen(c.Ivex.op);
            }
            ins = vex_inssize(c);
            goto Lmodrm;
        }

        if (op > 0xFF)
        {
            // Multi-byte opcode. Pick the operand layout from the 0F table,
            // then emit the opcode bytes most significant first.
            if ((op & 0xFFFD00) == 0x0F3800)
                ins = inssize2[(op >> 8) & 0xFF];
            else if ((op & 0xFF00) == 0x0F00)
                ins = inssize2[op & 0xFF];

            // A leading F2/F3/66 byte is emitted before the REX byte;
            // any other leading byte is emitted after it.
            if (op & 0xFF000000)
            {
                ubyte op1 = op >> 24;
                if (op1 == 0xF2 || op1 == 0xF3 || op1 == 0x66)
                {
                    ggen.gen(op1);
                    if (c.Irex)
                        ggen.gen(c.Irex | REX);
                }
                else
                {
                    if (c.Irex)
                        ggen.gen(c.Irex | REX);
                    ggen.gen(op1);
                }
                ggen.gen((op >> 16) & 0xFF);
                ggen.gen((op >> 8) & 0xFF);
                ggen.gen(op & 0xFF);
            }
            else if (op & 0xFF0000)
            {
                ubyte op1 = cast(ubyte)(op >> 16);
                if (op1 == 0xF2 || op1 == 0xF3 || op1 == 0x66)
                {
                    ggen.gen(op1);
                    if (c.Irex)
                        ggen.gen(c.Irex | REX);
                }
                else
                {
                    if (c.Irex)
                        ggen.gen(c.Irex | REX);
                    ggen.gen(op1);
                }
                ggen.gen((op >> 8) & 0xFF);
                ggen.gen(op & 0xFF);
            }
            else
            {
                if (c.Irex)
                    ggen.gen(c.Irex | REX);
                ggen.gen((op >> 8) & 0xFF);
                ggen.gen(op & 0xFF);
            }
        }
        else
        {
            // Single byte opcode, preceded by REX when one is set
            if (c.Irex)
                ggen.gen(c.Irex | REX);
            ggen.gen(cast(ubyte)op);
        }
  Lmodrm:
        if (ins & M)            /* if modregrm byte             */
        {
            rm = c.Irm;
            ggen.gen(rm);

            // Look for an address size override when working with the
            // MOD R/M and SIB bytes

            if (is32bitaddr( I32, flags))
            {
                if (issib(rm))
                    ggen.gen(c.Isib);
                switch (rm & 0xC0)
                {
                    case 0x40:
                        do8bit(&ggen, cast(FL) c.IFL1,&c.IEV1);     // 8 bit
                        break;

                    case 0:
                        // mod == 0 carries a displacement only for SIB base 5 or r/m 5
                        if (!(issib(rm) && (c.Isib & 7) == 5 ||
                              (rm & 7) == 5))
                            break;
                        goto case 0x80;

                    case 0x80:
                    {
                        int cfflags = CFoff;
                        targ_size_t val = 0;
                        if (I64)
                        {
                            if ((rm & modregrm(3,0,7)) == modregrm(0,0,5))      // if disp32[RIP]
                            {
                                // val is the addend handed to do32bit. It starts at -4 and is
                                // lowered further when an immediate operand follows the
                                // displacement (presumably by that immediate's size: 1, 2 or 4 bytes)
                                cfflags |= CFpc32;
                                val = -4;
                                reg_t reg = rm & modregrm(0,7,0);
                                if (ins & T ||
                                    ((op == 0xF6 || op == 0xF7) && (reg == modregrm(0,0,0) || reg == modregrm(0,1,0))))
                                {   if (ins & E || op == 0xF6)
                                        val = -5;
                                    else if (c.Iflags & CFopsize)
                                        val = -6;
                                    else
                                        val = -8;
                                }
static if (TARGET_OSX || TARGET_WINDOS)
{
                                /* Mach-O and Win64 fixups already take the 4 byte size
                                 * into account, so bias by 4
                                 */
                                val += 4;
}
                            }
                        }
                        do32bit(&ggen, cast(FL)c.IFL1,&c.IEV1,cfflags,cast(int)val);
                        break;
                    }

                    default:
                        break;
                }
            }
            else
            {
                // 16 bit addressing: no SIB byte, displacements are 8 or 16 bits
                switch (rm & 0xC0)
                {   case 0x40:
                        do8bit(&ggen, cast(FL) c.IFL1,&c.IEV1);     // 8 bit
                        break;

                    case 0:
                        // mod == 0 carries a displacement only for r/m 6
                        if ((rm & 7) != 6)
                            break;
                        goto case 0x80;

                    case 0x80:
                        do16bit(&ggen, cast(FL)c.IFL1,&c.IEV1,CFoff);
                        break;

                    default:
                        break;
                }
            }
        }
        else
        {
            // ENTER has no modregrm byte but still takes a 16 bit first operand
            if (op == ENTER)
                do16bit(&ggen, cast(FL)c.IFL1,&c.IEV1,0);
        }
        // Only these flags are passed on to the operand emitters below
        flags &= CFseg | CFoff | CFselfrel;
        if (ins & T)                    /* if second operand            */
        {
            if (ins & E)            /* if data-8                    */
                do8bit(&ggen, cast(FL) c.IFL2,&c.IEV2);
            else if (!I16)
            {
                // 32/64 bit mode. The do16/do32/do64 labels in this switch are
                // also jumped to from the 16 bit mode switch further down.
                switch (op)
                {
                    case 0xC2:              /* RETN imm16           */
                    case 0xCA:              /* RETF imm16           */
                    do16:
                        do16bit(&ggen, cast(FL)c.IFL2,&c.IEV2,flags);
                        break;

                    case 0xA1:
                    case 0xA3:
                        if (I64 && c.Irex)
                        {
                    do64:
                            do64bit(&ggen, cast(FL)c.IFL2,&c.IEV2,flags);
                            break;
                        }
                        goto case 0xA0;

                    case 0xA0:              /* MOV AL,byte ptr []   */
                    case 0xA2:
                        if (c.Iflags & CFaddrsize && !I64)
                            goto do16;
                        else
                    do32:
                            do32bit(&ggen, cast(FL)c.IFL2,&c.IEV2,flags);
                        break;

                    case 0x9A:
                    case 0xEA:
                        if (c.Iflags & CFopsize)
                            goto ptr1616;
                        else
                            goto ptr1632;

                    case 0x68:              // PUSH immed32
                        if (cast(FL)c.IFL2 == FLblock)
                        {
                            c.IFL2 = FLblockoff;
                            goto do32;
                        }
                        else
                            goto case_default;

                    case CALL:              // CALL rel
                    case JMP:               // JMP  rel
                        flags |= CFselfrel;
                        goto case_default;

                    default:
                        if ((op|0xF) == 0x0F8F) // Jcc rel16 rel32
                            flags |= CFselfrel;
                        if (I64 && (op & ~7) == 0xB8 && c.Irex & REX_W)
                            goto do64;
                    case_default:
                        if (c.Iflags & CFopsize)
                            goto do16;
                        else
                            goto do32;
                }
            }
            else
            {
                // 16 bit mode: the operand/address size prefixes select the
                // 32 bit forms instead of the 16 bit ones
                switch (op)
                {
                    case 0xC2:
                    case 0xCA:
                        goto do16;

                    case 0xA0:
                    case 0xA1:
                    case 0xA2:
                    case 0xA3:
                        if (c.Iflags & CFaddrsize)
                            goto do32;
                        else
                            goto do16;

                    case 0x9A:
                    case 0xEA:
                        if (c.Iflags & CFopsize)
                            goto ptr1632;
                        else
                            goto ptr1616;

                    // NOTE(review): both far pointer sizes share this code; the FLdatseg
                    // path always advances by 4 bytes - confirm that is intended for ptr1632
                    ptr1616:
                    ptr1632:
                        //assert(c.IFL2 == FLfunc);
                        ggen.flush();
                        if (c.IFL2 == FLdatseg)
                        {
                            objmod.reftodatseg(seg,ggen.offset,c.IEV2.Vpointer,
                                    c.IEV2.Vseg,flags);
                            ggen.offset += 4;
                        }
                        else
                        {
                            s = c.IEV2.Vsym;
                            ggen.offset += objmod.reftoident(seg,ggen.offset,s,0,flags);
                        }
                        break;

                    case 0x68:              // PUSH immed16
                        if (cast(FL)c.IFL2 == FLblock)
                        {   c.IFL2 = FLblockoff;
                            goto do16;
                        }
                        else
                            goto case_default16;

                    case CALL:
                    case JMP:
                        flags |= CFselfrel;
                        goto default;

                    default:
                    case_default16:
                        if (c.Iflags & CFopsize)
                            goto do32;
                        else
                            goto do16;
                }
            }
        }
        else if (op == 0xF6)            /* TEST mem8,immed8             */
        {
            // only reg field 0 of the F6 group takes an immediate here
            if ((rm & (7<<3)) == 0)
                do8bit(&ggen, cast(FL)c.IFL2,&c.IEV2);
        }
        else if (op == 0xF7)
        {
            if ((rm & (7<<3)) == 0)     /* TEST mem16/32,immed16/32     */
            {
                // the operand size prefix flips the default immediate size for the mode
                if ((I32 || I64) ^ ((c.Iflags & CFopsize) != 0))
                    do32bit(&ggen, cast(FL)c.IFL2,&c.IEV2,flags);
                else
                    do16bit(&ggen, cast(FL)c.IFL2,&c.IEV2,flags);
            }
        }

        // Debug builds verify that the bytes emitted match the size calccodsize() predicts
        debug
        if (ggen.getOffset() - startoffset != calccodsize(c))
        {
            printf("actual: %d, calc: %d\n", cast(int)(ggen.getOffset() - startoffset), cast(int)calccodsize(c));
            code_print(c);
            assert(0);
        }
    }
    // Write out whatever is still buffered and record the new end of the segment
    ggen.flush();
    Offset(seg) = ggen.offset;
    //printf("-codout(), Coffset = x%x\n", Offset(seg));
    return cast(uint)ggen.offset;                      /* ending address               */
}
7282 
7283 
/*************************************************
 * Emit a 64 bit operand for an instruction. Only valid when generating
 * 64 bit code (asserts I64).
 *
 * Values that are known now are appended to the buffer (8 bytes) and the
 * function returns. Everything else flushes the buffer, emits a reference
 * through objmod, and then advances pbuf.offset by 8.
 *
 * Params:
 *      pbuf = code buffer being filled
 *      fl = kind of operand (FLxxxx); unhandled kinds print via WRFL and assert
 *      uev = operand value
 *      flags = CFxxxx flags, OR'ed with CFoffset64 for symbol and data segment references
 */
private void do64bit(MiniCodeBuf *pbuf, FL fl, evc *uev,int flags)
{
    Symbol *s;
    targ_size_t ad;

    assert(I64);
    switch (fl)
    {
        case FLconst:
            // the constant is read as a targ_size_t from the start of *uev
            ad = *cast(targ_size_t *) uev;
        L1:
            // value is known now: append its 8 bytes to the buffer, no fixup needed
            pbuf.genp(8,&ad);
            return;

        case FLdatseg:
            pbuf.flush();
            objmod.reftodatseg(pbuf.seg,pbuf.offset,uev.Vpointer,uev.Vseg,CFoffset64 | flags);
            break;

        case FLframehandler:
            // remember where the operand lives and emit a zero placeholder
            framehandleroffset = pbuf.getOffset();
            ad = 0;
            goto L1;

        case FLswitch:
            pbuf.flush();
            ad = uev.Vswitch.Btableoffset;
            // NOTE(review): unlike the other references here, CFoffset64 is not passed
            // although 8 bytes are accounted for below - confirm
            if (config.flags & CFGromable)
                    objmod.reftocodeseg(pbuf.seg,pbuf.offset,ad);
            else
                    objmod.reftodatseg(pbuf.seg,pbuf.offset,ad,objmod.jmpTableSegment(funcsym_p),CFoff);
            break;

        case FLcsdata:
        case FLfardata:
            //symbol_print(uev.Vsym);
            // NOTE: In ELFOBJ all symbol refs have been tagged FLextern
            // strings and statics are treated like offsets from a
            // un-named external with is the start of .rodata or .data
        case FLextern:                      /* external data symbol         */
        case FLtlsdata:
static if (TARGET_LINUX || TARGET_FREEBSD || TARGET_OPENBSD || TARGET_DRAGONFLYBSD || TARGET_SOLARIS)
{
        case FLgot:
        case FLgotoff:
}
            pbuf.flush();
            s = uev.Vsym;               /* symbol pointer               */
            objmod.reftoident(pbuf.seg,pbuf.offset,s,uev.Voffset,CFoffset64 | flags);
            break;

static if (TARGET_OSX)
{
        case FLgot:
            // record the operand location in the function symbol and emit a zero placeholder
            funcsym_p.Slocalgotoffset = pbuf.getOffset();
            ad = 0;
            goto L1;
}

        case FLfunc:                        /* function call                */
            s = uev.Vsym;               /* symbol pointer               */
            assert(TARGET_SEGMENTED || !tyfarfunc(s.ty()));
            pbuf.flush();
            objmod.reftoident(pbuf.seg,pbuf.offset,s,0,CFoffset64 | flags);
            break;

        case FLblock:                       /* displacement to another block */
            // NOTE(review): the bias is 4 even though 8 bytes are emitted - confirm
            ad = uev.Vblock.Boffset - pbuf.getOffset() - 4;
            //printf("FLblock: funcoffset = %x, pbuf.getOffset = %x, Boffset = %x, ad = %x\n", funcoffset, pbuf.getOffset(), uev.Vblock.Boffset, ad);
            goto L1;

        case FLblockoff:
            pbuf.flush();
            assert(uev.Vblock);
            //printf("FLblockoff: offset = %x, Boffset = %x, funcoffset = %x\n", pbuf.offset, uev.Vblock.Boffset, funcoffset);
            objmod.reftocodeseg(pbuf.seg,pbuf.offset,uev.Vblock.Boffset);
            break;

        default:
            WRFL(fl);
            assert(0);
    }
    // a reference was emitted through objmod: account for its 8 bytes
    pbuf.offset += 8;
}
7369 
7370 
/*************************************************
 * Emit a 32 bit operand (immediate, displacement or address) for an instruction.
 *
 * Values that are known now are appended to the buffer (4 bytes) and the
 * function returns. Everything else flushes the buffer, emits a reference
 * through objmod, and then advances pbuf.offset by 4.
 *
 * Params:
 *      pbuf = code buffer being filled
 *      fl = kind of operand (FLxxxx); unhandled kinds print via WRFL and assert
 *      uev = operand value
 *      flags = CFxxxx flags handed on to the objmod.reftoXXX functions
 *      val = addend: added to the symbol offset for data symbol references and
 *            passed as the offset for function references that need a fixup.
 *            For Win64 CFpc32 references it is encoded into the CFREL bits of
 *            flags instead.
 */
private void do32bit(MiniCodeBuf *pbuf, FL fl, evc *uev,int flags, int val)
{
    Symbol *s;
    targ_size_t ad;

    //printf("do32bit(flags = x%x)\n", flags);
    switch (fl)
    {
        case FLconst:
            assert(targ_size_t.sizeof == 4 || targ_size_t.sizeof == 8);
            // the constant is read as a targ_size_t from the start of *uev
            ad = * cast(targ_size_t *) uev;
        L1:
            // value is known now: append the first 4 bytes of ad as stored in memory
            pbuf.genp(4,&ad);
            return;

        case FLdatseg:
            pbuf.flush();
            objmod.reftodatseg(pbuf.seg,pbuf.offset,uev.Vpointer,uev.Vseg,flags);
            break;

        case FLframehandler:
            // remember where the operand lives and emit a zero placeholder
            framehandleroffset = pbuf.getOffset();
            ad = 0;
            goto L1;

        case FLswitch:
            pbuf.flush();
            ad = uev.Vswitch.Btableoffset;
            if (config.flags & CFGromable)
            {
                static if (TARGET_OSX)
                {
                    // These are magic values based on the exact code generated for the switch jump
                    if (I64)
                        uev.Vswitch.Btablebase = pbuf.getOffset() + 4;
                    else
                        uev.Vswitch.Btablebase = pbuf.getOffset() + 4 - 8;
                    // emit the table offset relative to Btablebase directly, no fixup
                    ad -= uev.Vswitch.Btablebase;
                    goto L1;
                }
                else static if (TARGET_WINDOS)
                {
                    if (I64)
                    {
                        // emit the table offset relative to Btablebase directly, no fixup
                        uev.Vswitch.Btablebase = pbuf.getOffset() + 4;
                        ad -= uev.Vswitch.Btablebase;
                        goto L1;
                    }
                    else
                        objmod.reftocodeseg(pbuf.seg,pbuf.offset,ad);
                }
                else
                {
                    objmod.reftocodeseg(pbuf.seg,pbuf.offset,ad);
                }
            }
            else
                    objmod.reftodatseg(pbuf.seg,pbuf.offset,ad,objmod.jmpTableSegment(funcsym_p),CFoff);
            break;

        case FLcode:
            //assert(JMPJMPTABLE);            // the only use case
            pbuf.flush();
            // the stored value is relative to the current position in the code segment
            ad = *cast(targ_size_t *) uev + pbuf.getOffset();
            objmod.reftocodeseg(pbuf.seg,pbuf.offset,ad);
            break;

        case FLcsdata:
        case FLfardata:
            //symbol_print(uev.Vsym);

            // NOTE: In ELFOBJ all symbol refs have been tagged FLextern
            // strings and statics are treated like offsets from a
            // un-named external with is the start of .rodata or .data
        case FLextern:                      /* external data symbol         */
        case FLtlsdata:
    static if (TARGET_LINUX || TARGET_FREEBSD || TARGET_OPENBSD || TARGET_DRAGONFLYBSD || TARGET_SOLARIS)
    {
        case FLgot:
        case FLgotoff:
    }
            pbuf.flush();
            s = uev.Vsym;               /* symbol pointer               */
            if (TARGET_WINDOS && I64 && (flags & CFpc32))
            {
                /* This is for those funky fixups where the location to be fixed up
                 * is a 'val' amount back from the current RIP, biased by adding 4.
                 */
                assert(val >= -5 && val <= 0);
                flags |= (-val & 7) << 24;          // set CFREL value
                assert(CFREL == (7 << 24));
                objmod.reftoident(pbuf.seg,pbuf.offset,s,uev.Voffset,flags);
            }
            else
                objmod.reftoident(pbuf.seg,pbuf.offset,s,uev.Voffset + val,flags);
            break;

    static if (TARGET_OSX)
    {
        case FLgot:
            // record the operand location in the function symbol and emit a zero placeholder
            funcsym_p.Slocalgotoffset = pbuf.getOffset();
            ad = 0;
            goto L1;
    }

        case FLfunc:                        /* function call                */
            s = uev.Vsym;               /* symbol pointer               */
            if (tyfarfunc(s.ty()))
            {   /* Large code references are always absolute    */
                pbuf.flush();
                // use the size reftoident reports; the 4 subtracted here is added back after the switch
                pbuf.offset += objmod.reftoident(pbuf.seg,pbuf.offset,s,0,flags) - 4;
            }
            else if (s.Sseg == pbuf.seg &&
                     (s.Sclass == SCstatic || s.Sclass == SCglobal) &&
                     s.Sxtrnnum == 0 && flags & CFselfrel)
            {   /* if we know it's relative address     */
                // displacement is measured from the end of the 4 byte operand
                ad = s.Soffset - pbuf.getOffset() - 4;
                goto L1;
            }
            else
            {
                assert(TARGET_SEGMENTED || !tyfarfunc(s.ty()));
                pbuf.flush();
                objmod.reftoident(pbuf.seg,pbuf.offset,s,val,flags);
            }
            break;

        case FLblock:                       /* displacement to another block */
            // displacement is measured from the end of the 4 byte operand
            ad = uev.Vblock.Boffset - pbuf.getOffset() - 4;
            //printf("FLblock: funcoffset = %x, pbuf.getOffset = %x, Boffset = %x, ad = %x\n", funcoffset, pbuf.getOffset(), uev.Vblock.Boffset, ad);
            goto L1;

        case FLblockoff:
            pbuf.flush();
            assert(uev.Vblock);
            //printf("FLblockoff: offset = %x, Boffset = %x, funcoffset = %x\n", pbuf.offset, uev.Vblock.Boffset, funcoffset);
            objmod.reftocodeseg(pbuf.seg,pbuf.offset,uev.Vblock.Boffset);
            break;

        default:
            WRFL(fl);
            assert(0);
    }
    // a reference was emitted through objmod: account for its 4 bytes
    pbuf.offset += 4;
}
7517 
7518 
/*************************************************
 * Emit a 16 bit operand (immediate, displacement or address) for an instruction.
 *
 * Values that are known now are appended to the buffer (2 bytes) and the
 * function returns. Everything else flushes the buffer, emits a reference
 * through objmod, and then advances pbuf.offset by 2.
 *
 * Params:
 *      pbuf = code buffer being filled
 *      fl = kind of operand (FLxxxx); unhandled kinds print via WRFL and assert
 *      uev = operand value
 *      flags = CFxxxx flags handed on to the objmod.reftoXXX functions
 */
private void do16bit(MiniCodeBuf *pbuf, FL fl, evc *uev,int flags)
{
    Symbol *s;
    targ_size_t ad;

    switch (fl)
    {
        case FLconst:
            // copy the first 2 bytes of the operand value as stored in memory
            pbuf.genp(2,cast(char *) uev);
            return;

        case FLdatseg:
            pbuf.flush();
            objmod.reftodatseg(pbuf.seg,pbuf.offset,uev.Vpointer,uev.Vseg,flags);
            break;

        case FLswitch:
            pbuf.flush();
            ad = uev.Vswitch.Btableoffset;
            if (config.flags & CFGromable)
                objmod.reftocodeseg(pbuf.seg,pbuf.offset,ad);
            else
                objmod.reftodatseg(pbuf.seg,pbuf.offset,ad,objmod.jmpTableSegment(funcsym_p),CFoff);
            break;

        case FLcsdata:
        case FLfardata:
        case FLextern:                      /* external data symbol         */
        case FLtlsdata:
            //assert(SIXTEENBIT || TARGET_SEGMENTED);
            pbuf.flush();
            s = uev.Vsym;               /* symbol pointer               */
            objmod.reftoident(pbuf.seg,pbuf.offset,s,uev.Voffset,flags);
            break;

        case FLfunc:                        /* function call                */
            //assert(SIXTEENBIT || TARGET_SEGMENTED);
            s = uev.Vsym;               /* symbol pointer               */
            if (tyfarfunc(s.ty()))
            {   /* Large code references are always absolute    */
                pbuf.flush();
                // use the size reftoident reports; the 2 subtracted here is added back after the switch
                pbuf.offset += objmod.reftoident(pbuf.seg,pbuf.offset,s,0,flags) - 2;
            }
            else if (s.Sseg == pbuf.seg &&
                     (s.Sclass == SCstatic || s.Sclass == SCglobal) &&
                     s.Sxtrnnum == 0 && flags & CFselfrel)
            {   /* if we know it's relative address     */
                // displacement is measured from the end of the 2 byte operand
                ad = s.Soffset - pbuf.getOffset() - 2;
                goto L1;
            }
            else
            {
                pbuf.flush();
                objmod.reftoident(pbuf.seg,pbuf.offset,s,0,flags);
            }
            break;

        case FLblock:                       /* displacement to another block */
            // displacement is measured from the end of the 2 byte operand
            ad = uev.Vblock.Boffset - pbuf.getOffset() - 2;
            // the range check exists in debug builds only; otherwise the value is truncated to 16 bits
            debug
            {
                targ_ptrdiff_t delta = uev.Vblock.Boffset - pbuf.getOffset() - 2;
                assert(cast(short)delta == delta);
            }
        L1:
            pbuf.genp(2,&ad);                    // displacement
            return;

        case FLblockoff:
            pbuf.flush();
            objmod.reftocodeseg(pbuf.seg,pbuf.offset,uev.Vblock.Boffset);
            break;

        default:
            WRFL(fl);
            assert(0);
    }
    // a reference was emitted through objmod: account for its 2 bytes
    pbuf.offset += 2;
}
7599 
7600 
/*************************************************
 * Emit an 8 bit operand for an instruction.
 *
 * Only constants (FLconst) and block displacements (FLblock) are supported;
 * any other kind asserts. A block displacement that does not fit in a signed
 * byte is reported with printf and err_exit() is called.
 *
 * Params:
 *      pbuf = code buffer being filled
 *      fl = kind of operand (FLxxxx)
 *      uev = operand value
 */
private void do8bit(MiniCodeBuf *pbuf, FL fl, evc *uev)
{
    char value;

    if (fl == FLconst)
    {
        value = cast(char)uev.Vuns;
    }
    else if (fl == FLblock)
    {
        // displacement is measured from the end of the 1 byte operand
        const targ_ptrdiff_t disp = uev.Vblock.Boffset - pbuf.getOffset() - 1;
        const bool fitsInByte = cast(byte)disp == disp;
        if (!fitsInByte)
        {
            version (MARS)
            {
                if (uev.Vblock.Bsrcpos.Slinnum)
                    printf("%s(%d): ", uev.Vblock.Bsrcpos.Sfilename, uev.Vblock.Bsrcpos.Slinnum);
            }
            printf("block displacement of %lld exceeds the maximum offset of -128 to 127.\n", cast(long)disp);
            err_exit();
        }
        value = cast(char)disp;
        debug assert(uev.Vblock.Boffset > pbuf.getOffset() || value != 0x7F);
    }
    else
    {
        debug printf("fl = %d\n",fl);
        assert(0);
    }

    pbuf.gen(value);
}
7634 
7635 
7636 /**********************************
7637  */
7638 
7639 version (SCPP)
7640 {
7641 static if (HYDRATE)
7642 {
/**
 * Hydrate every node of a code list, in place.
 *
 * Each link is first passed through `ph_hydrate`, and the node it yields is
 * then inspected so that any pointers held in its operands (symbols, blocks,
 * code, elems, asm bytes, source positions) are hydrated as well.
 *
 * Params:
 *      pc = address of the link to the first node; must not be null
 */
void code_hydrate(code **pc)
{
    assert(pc);

    code *cur;
    for (; *pc; pc = &cur.next)
    {
        // The link must be hydrated before the node it refers to is read
        cur = cast(code *) ph_hydrate(cast(void**)pc);
        const op = cur.Iop;

        // Select the attribute byte for this opcode (tested against M and T below)
        ubyte attrs;
        if ((cur.Iflags & CFvex) && cur.Ivex.pfx == 0xC4)
            attrs = vex_inssize(cur);
        else if ((op & 0xFFFD00) == 0x0F3800)
            attrs = inssize2[(op >> 8) & 0xFF];
        else if ((op & 0xFF00) == 0x0F00)
            attrs = inssize2[op & 0xFF];
        else
            attrs = inssize[op & 0xFF];

        // Pseudo-ops keep their payload in IEV1 and have no ordinary operands
        switch (op)
        {
            case ESCAPE | ESClinnum:
                srcpos_hydrate(&cur.IEV1.Vsrcpos);
                continue;

            case ESCAPE | ESCctor:
            case ESCAPE | ESCdtor:
                el_hydrate(&cur.IEV1.Vtor);
                continue;

            case ASM:
                ph_hydrate(cast(void**)&cur.IEV1.bytes);
                continue;

            default:
                break;
        }

        // Decide whether the instruction carries a first operand (IEV1)
        ubyte modrm;
        bool hasOperand1 = (attrs & M) != 0;
        if (hasOperand1)
        {
            modrm = cur.Irm;
            hasOperand1 = (modrm & 0xC0) != 0xC0;
        }
        if (hasOperand1)
        {
            // With mod == 0 only a few r/m encodings still carry an operand
            const bool modIsZero = (modrm & 0xC0) == 0;
            if (is32bitaddr(I32,cur.Iflags))
            {
                const bool sibBase5 = (modrm & 7) == 4 && (cur.Isib & 7) == 5;
                if (modIsZero && !(sibBase5 || (modrm & 7) == 5))
                    hasOperand1 = false;
            }
            else
            {
                if (modIsZero && (modrm & 7) != 6)
                    hasOperand1 = false;
            }
        }

        if (hasOperand1)
        {
            const FL kind1 = cast(FL) cur.IFL1;
            switch (kind1)
            {
                // Operand refers to a symbol
                case FLudata, FLdata, FLreg, FLauto, FLfast, FLbprel, FLpara,
                     FLcsdata, FLfardata, FLtlsdata, FLfunc, FLpseudo, FLextern:
                    assert(flinsymtab[kind1]);
                    symbol_hydrate(&cur.IEV1.Vsym);
                    symbol_debug(cur.IEV1.Vsym);
                    break;

                // Operand kinds for which nothing is hydrated
                case FLdatseg, FLfltreg, FLallocatmp, FLcs, FLndp,
                     FLoffset, FLlocalsize, FLconst, FLframehandler:
                    assert(!flinsymtab[kind1]);
                    break;

                case FLcode:
                    ph_hydrate(cast(void**)&cur.IEV1.Vcode);
                    break;

                case FLblock, FLblockoff:
                    ph_hydrate(cast(void**)&cur.IEV1.Vblock);
                    break;
version (SCPP)
{
                case FLctor, FLdtor:
                    el_hydrate(cast(elem**)&cur.IEV1.Vtor);
                    break;
}
                case FLasm:
                    ph_hydrate(cast(void**)&cur.IEV1.bytes);
                    break;

                default:
                    WRFL(kind1);
                    assert(0);
            }
        }

        /* Ignore TEST (F6 and F7) opcodes      */
        if (!(attrs & T))
            continue;           /* if no second operand */

        const FL kind2 = cast(FL) cur.IFL2;
        switch (kind2)
        {
            // Operand refers to a symbol
            case FLudata, FLdata, FLreg, FLauto, FLfast, FLbprel, FLpara,
                 FLcsdata, FLfardata, FLtlsdata, FLfunc, FLpseudo, FLextern:
                assert(flinsymtab[kind2]);
                symbol_hydrate(&cur.IEV2.Vsym);
                symbol_debug(cur.IEV2.Vsym);
                break;

            // Operand kinds for which nothing is hydrated
            case FLdatseg, FLfltreg, FLallocatmp, FLcs, FLndp,
                 FLoffset, FLlocalsize, FLconst, FLframehandler:
                assert(!flinsymtab[kind2]);
                break;

            case FLcode:
                ph_hydrate(cast(void**)&cur.IEV2.Vcode);
                break;

            case FLblock, FLblockoff:
                ph_hydrate(cast(void**)&cur.IEV2.Vblock);
                break;

            default:
                WRFL(kind2);
                assert(0);
        }
    }
}
7810 }
7811 
7812 /**********************************
7813  */
7814 
7815 static if (DEHYDRATE)
7816 {
/**
 * Dehydrate every node of a code list, in place.
 *
 * Mirror image of code_hydrate(): each link is passed through `ph_dehydrate`,
 * and any pointers held in the node's operands (symbols, blocks, code, elems,
 * asm bytes, source positions) are dehydrated with the matching helper.
 *
 * Params:
 *      pc = address of the link to the first node
 */
void code_dehydrate(code **pc)
{
    code *c;
    ubyte ins,rm;
    FL fl;

    // c is fetched from the link before the link itself is dehydrated,
    // so the node can still be reached through c afterwards.
    while ((c = *pc) != null)
    {
        ph_dehydrate(pc);

        // Select the attribute byte for this opcode (tested against M and T below)
        if (c.Iflags & CFvex && c.Ivex.pfx == 0xC4)
            ins = vex_inssize(c);
        else if ((c.Iop & 0xFFFD00) == 0x0F3800)
            ins = inssize2[(c.Iop >> 8) & 0xFF];
        else if ((c.Iop & 0xFF00) == 0x0F00)
            ins = inssize2[c.Iop & 0xFF];
        else
            ins = inssize[c.Iop & 0xFF];

        // Pseudo-ops keep their payload in IEV1 and have no ordinary operands
        switch (c.Iop)
        {
            default:
                break;

            case ESCAPE | ESClinnum:
                srcpos_dehydrate(&c.IEV1.Vsrcpos);
                goto done;

            case ESCAPE | ESCctor:
            case ESCAPE | ESCdtor:
                el_dehydrate(&c.IEV1.Vtor);
                goto done;

            case ASM:
                ph_dehydrate(&c.IEV1.bytes);
                goto done;
        }

        if (!(ins & M) ||
            ((rm = c.Irm) & 0xC0) == 0xC0)
            goto do2;           /* if no first operand          */
        if (is32bitaddr(I32,c.Iflags))
        {
            // With mod == 0 only r/m == 5, or r/m == 4 with SIB base 5, carries an operand
            if (
                ((rm & 0xC0) == 0 && !((rm & 7) == 4 && (c.Isib & 7) == 5 || (rm & 7) == 5))
               )
                goto do2;       /* if no first operand  */
        }
        else
        {
            // With mod == 0 only r/m == 6 carries an operand
            if (
                ((rm & 0xC0) == 0 && !((rm & 7) == 6))
               )
                goto do2;       /* if no first operand  */
        }
        fl = cast(FL) c.IFL1;
        switch (fl)
        {
            // Operand refers to a symbol
            case FLudata:
            case FLdata:
            case FLreg:
            case FLauto:
            case FLfast:
            case FLbprel:
            case FLpara:
            case FLcsdata:
            case FLfardata:
            case FLtlsdata:
            case FLfunc:
            case FLpseudo:
            case FLextern:
                assert(flinsymtab[fl]);
                symbol_dehydrate(&c.IEV1.Vsym);
                break;

            // Operand kinds for which nothing is dehydrated
            case FLdatseg:
            case FLfltreg:
            case FLallocatmp:
            case FLcs:
            case FLndp:
            case FLoffset:
            case FLlocalsize:
            case FLconst:
            case FLframehandler:
                assert(!flinsymtab[fl]);
                break;

            case FLcode:
                ph_dehydrate(&c.IEV1.Vcode);
                break;

            case FLblock:
            case FLblockoff:
                ph_dehydrate(&c.IEV1.Vblock);
                break;
version (SCPP)
{
            case FLctor:
            case FLdtor:
                el_dehydrate(&c.IEV1.Vtor);
                break;
}
            case FLasm:
                ph_dehydrate(&c.IEV1.bytes);
                break;

            default:
                // Unknown operand kind: report it and stop
                WRFL(fl);
                assert(0);
        }
    do2:
        /* Ignore TEST (F6 and F7) opcodes      */
        if (!(ins & T))
            goto done;          /* if no second operand */

        fl = cast(FL) c.IFL2;
        switch (fl)
        {
            // Operand refers to a symbol
            case FLudata:
            case FLdata:
            case FLreg:
            case FLauto:
            case FLfast:
            case FLbprel:
            case FLpara:
            case FLcsdata:
            case FLfardata:
            case FLtlsdata:
            case FLfunc:
            case FLpseudo:
            case FLextern:
                assert(flinsymtab[fl]);
                symbol_dehydrate(&c.IEV2.Vsym);
                break;

            // Operand kinds for which nothing is dehydrated
            case FLdatseg:
            case FLfltreg:
            case FLallocatmp:
            case FLcs:
            case FLndp:
            case FLoffset:
            case FLlocalsize:
            case FLconst:
            case FLframehandler:
                assert(!flinsymtab[fl]);
                break;

            case FLcode:
                ph_dehydrate(&c.IEV2.Vcode);
                break;

            case FLblock:
            case FLblockoff:
                ph_dehydrate(&c.IEV2.Vblock);
                break;

            default:
                // Unknown operand kind: report it and stop
                WRFL(fl);
                assert(0);
        }
  done:
        // Advance to the link stored inside this node.  The field is addressed
        // directly (as code_hydrate does) because the address of the link itself
        // is needed, not a copy of its value.
        pc = &c.next;
    }
}
7983 }
7984 }
7985 
7986 /***************************
7987  * Debug code to dump code structure.
7988  */
7989 
/**
 * Print every node of a code list, one node per code_print() call.
 *
 * Params:
 *      c = first node of the list; null prints nothing
 */
void WRcodlst(code *c)
{
    while (c)
    {
        code_print(c);
        c = code_next(c);
    }
}
7995 
/**
 * Print a one line description of a single code node to stdout.
 *
 * The line shows the node and next-node addresses, any VEX/REX prefix
 * information, the opcode, the flags (when non-zero), the mod/rm and SIB
 * bytes when the opcode has them, and a summary of each operand.
 *
 * Params:
 *      c = node to print; null prints "code 0"
 */
extern (C) void code_print(code* c)
{
    ubyte ins;
    ubyte rexb;

    if (c == null)
    {
        printf("code 0\n");
        return;
    }

    // Select the attribute byte for this opcode (tested against M and T below)
    const op = c.Iop;
    if (c.Iflags & CFvex && c.Ivex.pfx == 0xC4)
        ins = vex_inssize(c);
    else if ((op & 0xFFFD00) == 0x0F3800)
        ins = inssize2[(op >> 8) & 0xFF];
    else if ((op & 0xFF00) == 0x0F00)
        ins = inssize2[op & 0xFF];
    else
        ins = inssize[op & 0xFF];

    printf("code %p: nxt=%p ",c,code_next(c));

    // Work out the REX-style bits: taken from the VEX fields for VEX
    // instructions (r, x and b are tested inverted), otherwise from Irex.
    if (c.Iflags & CFvex)
    {
        if (c.Iflags & CFvex3)
        {
            printf("vex=0xC4");
            printf(" 0x%02X", VEX3_B1(c.Ivex));
            printf(" 0x%02X", VEX3_B2(c.Ivex));
            rexb =
                ( c.Ivex.w ? REX_W : 0) |
                (!c.Ivex.r ? REX_R : 0) |
                (!c.Ivex.x ? REX_X : 0) |
                (!c.Ivex.b ? REX_B : 0);
        }
        else
        {
            printf("vex=0xC5");
            printf(" 0x%02X", VEX2_B1(c.Ivex));
            rexb = !c.Ivex.r ? REX_R : 0;
        }
        printf(" ");
    }
    else
        rexb = c.Irex;

    if (rexb)
    {
        // Print the same value that the W/R/X/B letters are decoded from,
        // so the hex and the letters always agree.
        printf("rex=0x%02X ", rexb);
        if (rexb & REX_W)
            printf("W");
        if (rexb & REX_R)
            printf("R");
        if (rexb & REX_X)
            printf("X");
        if (rexb & REX_B)
            printf("B");
        printf(" ");
    }
    printf("op=0x%02X",op);

    if ((op & 0xFF) == ESCAPE)
    {
        if ((op & 0xFF00) == ESClinnum)
        {
            // Line number pseudo-op: nothing else is printed for it
            printf(" linnum = %d\n",c.IEV1.Vsrcpos.Slinnum);
            return;
        }
        printf(" ESCAPE %d",op >> 8);
    }
    if (c.Iflags)
        printf(" flg=%x",c.Iflags);
    if (ins & M)
    {
        // Show the mod/rm byte split into its mod, reg and r/m fields
        uint rm = c.Irm;
        printf(" rm=0x%02X=%d,%d,%d",rm,(rm>>6)&3,(rm>>3)&7,rm&7);
        if (!I16 && issib(rm))
        {
            ubyte sib = c.Isib;
            printf(" sib=%02x=%d,%d,%d",sib,(sib>>6)&3,(sib>>3)&7,sib&7);
        }
        // The first operand is only described for these mod/rm forms
        if ((rm & 0xC7) == BPRM || (rm & 0xC0) == 0x80 || (rm & 0xC0) == 0x40)
        {
            switch (c.IFL1)
            {
                case FLconst:
                case FLoffset:
                    printf(" int = %4d",c.IEV1.Vuns);
                    break;

                case FLblock:
                    printf(" block = %p",c.IEV1.Vblock);
                    break;

                case FLswitch:
                case FLblockoff:
                case FLlocalsize:
                case FLframehandler:
                case 0:
                    break;

                case FLdatseg:
                    printf(" FLdatseg %d.%llx",c.IEV1.Vseg,cast(ulong)c.IEV1.Vpointer);
                    break;

                case FLauto:
                case FLfast:
                case FLreg:
                case FLdata:
                case FLudata:
                case FLpara:
                case FLbprel:
                case FLtlsdata:
                case FLextern:
                    printf(" ");
                    WRFL(cast(FL)c.IFL1);
                    printf(" sym='%s'",c.IEV1.Vsym.Sident.ptr);
                    if (c.IEV1.Voffset)
                        printf(".%d", cast(int)c.IEV1.Voffset);
                    break;

                default:
                    // Separate the kind from the preceding field, as the symbol case does
                    printf(" ");
                    WRFL(cast(FL)c.IFL1);
                    break;
            }
        }
    }
    if (ins & T)
    {
        // The operand kind is always printed first, then any detail for it
        printf(" ");
        WRFL(cast(FL)c.IFL2);
        switch (c.IFL2)
        {
            case FLconst:
                printf(" int = %4d",c.IEV2.Vuns);
                break;

            case FLblock:
                printf(" block = %p",c.IEV2.Vblock);
                break;

            case FLswitch:
            case FLblockoff:
            case 0:
            case FLlocalsize:
            case FLframehandler:
                break;

            case FLdatseg:
                printf(" %d.%llx",c.IEV2.Vseg,cast(ulong)c.IEV2.Vpointer);
                break;

            case FLauto:
            case FLfast:
            case FLreg:
            case FLpara:
            case FLbprel:
            case FLfunc:
            case FLdata:
            case FLudata:
            case FLtlsdata:
                printf(" sym='%s'",c.IEV2.Vsym.Sident.ptr);
                break;

            case FLcode:
                printf(" code = %p",c.IEV2.Vcode);
                break;

            default:
                // The kind was already printed above; nothing more to add
                break;
        }
    }
    printf("\n");
}
8172 
8173 }