1 /**
2  * Compiler implementation of the
3  * $(LINK2 http://www.dlang.org, D programming language).
4  *
5  * Copyright:   Copyright (C) 2011-2021 by The D Language Foundation, All Rights Reserved
6  * Authors:     $(LINK2 http://www.digitalmars.com, Walter Bright)
7  * License:     $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
8  * Source:      $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/backend/cgxmm.d, backend/cgxmm.d)
9  */
10 
11 module dmd.backend.cgxmm;
12 
// Define COMPILE when either the SCPP or the MARS version identifier is set.
// The declarations inside the `version (COMPILE)` block that follows are
// only compiled in that case.
version (SCPP)
    version = COMPILE;
version (MARS)
    version = COMPILE;
17 
18 version (COMPILE)
19 {
20 
21 import core.stdc.stdio;
22 import core.stdc.stdlib;
23 import core.stdc.string;
24 
25 import dmd.backend.cc;
26 import dmd.backend.cdef;
27 import dmd.backend.code;
28 import dmd.backend.code_x86;
29 import dmd.backend.codebuilder;
30 import dmd.backend.mem;
31 import dmd.backend.el;
32 import dmd.backend.global;
33 import dmd.backend.oper;
34 import dmd.backend.ty;
35 import dmd.backend.xmm;
36 
37 version (SCPP)
38     import dmd.backend.exh;
39 version (MARS)
40     import dmd.backend.errors;
41 
42 
extern (C++):   // C++ linkage for every declaration that follows

nothrow:        // ... and every function that follows is nothrow

// Prototypes only: neither function has a body in this part of the file.
int REGSIZE();

// Called below with a register number (e.g. `mask(xreg) & XMMREGS`);
// presumably yields that register's bit in a regm_t mask -- confirm at the definition.
uint mask(uint m);
50 
/*******************************************
 * Classify an opcode as an XMM store.
 * Params:
 *      op = opcode to test
 * Returns:
 *      true if `op` is one of the STOxx opcodes listed in the switch,
 *      false for every other opcode
 */

bool isXMMstore(opcode_t op)
{
    bool isStore;
    switch (op)
    {
        case STOSS, STOAPS, STOUPS:             // MOVSS / MOVAPS / MOVUPS
        case STOSD, STOAPD, STOUPD:             // MOVSD / MOVAPD / MOVUPD
        case STOD, STOQ, STODQA, STODQU:        // MOVD / MOVQ / MOVDQA / MOVDQU
        case STOHPD, STOHPS, STOLPD, STOLPS:
            isStore = true;
            break;

        default:
            isStore = false;
            break;
    }
    return isStore;
}
66 
/*******************************************
 * Load the constant `value` into XMM register `xreg`.
 * Params:
 *      cdb = code builder the generated instructions are appended to
 *      xreg = destination register; asserted to be one of XMMREGS
 *      sz = size of the constant in bytes; asserted to be 4 or 8
 *      value = bit pattern to load
 *      flags = not referenced by this function
 */

private void movxmmconst(ref CodeBuilder cdb, reg_t xreg, uint sz, targ_size_t value, regm_t flags)
{
    assert(mask(xreg) & XMMREGS);
    assert(sz == 4 || sz == 8);

    const bool viaFloatreg = I32 && sz == 8;
    if (!viaFloatreg)
    {
        /* The constant goes through a single general purpose register:
         *    MOV reg,value
         *    MOV xreg,reg
         * Not so efficient. We should at least do a PXOR for 0.
         */
        reg_t gpreg;
        regwithvalue(cdb, ALLREGS, value, &gpreg, (sz == 8) ? 64 : 0);
        cdb.gen2(LODD, modregxrmx(3, xreg - XMM0, gpreg));  // MOVD xreg,gpreg
        if (sz == 8)
            code_orrex(cdb.last(), REX_W);
        checkSetVex(cdb.last(), TYulong);
        return;
    }

    /* 8 byte constant with I32: write the two 4 byte halves to floatreg
     * through a scratch register, then load all 8 bytes into xreg.
     */
    regm_t scratchregs = ALLREGS;
    reg_t scratch;
    allocreg(cdb, &scratchregs, &scratch, TYint);   // allocate scratch register

    static union Halves { targ_size_t whole; targ_long[2] half; }
    Halves bits = void;
    bits.half[1] = 0;       // defined upper half before `whole` is written
    bits.whole = value;

    movregconst(cdb, scratch, bits.half[0], 0);
    cdb.genfltreg(STO, scratch, 0);                 // MOV floatreg,scratch
    movregconst(cdb, scratch, bits.half[1], 0);
    cdb.genfltreg(STO, scratch, 4);                 // MOV floatreg+4,scratch

    cdb.genxmmreg(xmmload(TYdouble, true), xreg, 0, TYdouble);  // MOVSD xreg,floatreg
}
108 
/***********************************************
 * Do simple orthogonal operators for XMM registers.
 *
 * Two cases are handled:
 *  - OPadd/OPmin where one operand is real and the other imaginary: no
 *    arithmetic instruction combines the operands. Each operand is evaluated
 *    into its own XMM register and the pair of registers is the result.
 *    For OPmin the register holding e2 additionally has its sign bit flipped.
 *  - everything else: both operands are evaluated into XMM registers and the
 *    instruction chosen by xmmoperator() is applied to them.
 *
 * Params:
 *      cdb = code builder the generated instructions are appended to
 *      e = binary operator node; its operands are e.EV.E1 and e.EV.E2
 *      pretregs = register mask requested for the result. Only its XMMREGS
 *                 part is used to pick registers; fixresult() is called when
 *                 the registers actually used differ from it.
 */

void orthxmm(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
{
    //printf("orthxmm(e = %p, *pretregs = %s)\n", e, regm_str(*pretregs));
    elem *e1 = e.EV.E1;
    elem *e2 = e.EV.E2;

    // float + ifloat is not actually addition
    if ((e.Eoper == OPadd || e.Eoper == OPmin) &&
        ((tyreal(e1.Ety) && tyimaginary(e2.Ety)) ||
         (tyreal(e2.Ety) && tyimaginary(e1.Ety))))
    {
        // Fall back to any XMM register if the caller asked for none
        regm_t retregs = *pretregs & XMMREGS;
        if (!retregs)
            retregs = XMMREGS;

        // Choose two distinct registers out of retregs:
        // retregs ends up as the one for e1, rretregs as the one for e2.
        regm_t rretregs;
        reg_t rreg;
        if (tyreal(e1.Ety))
        {
            // real e1 gets the first register found, e2 the next one
            const reg = findreg(retregs);
            rreg = findreg(retregs & ~mask(reg));
            retregs = mask(reg);
            rretregs = mask(rreg);
        }
        else
        {
            // Pick the second register, not the first
            rreg = findreg(retregs);
            rretregs = mask(rreg);
            const reg = findreg(retregs & ~rretregs);
            retregs = mask(reg);
        }
        assert(retregs && rretregs);

        codelem(cdb,e1,&retregs,false); // eval left leaf
        scodelem(cdb, e2, &rretregs, retregs, true);  // eval right leaf

        // From here on retregs names both registers of the pair
        retregs |= rretregs;
        if (e.Eoper == OPmin)
        {
            // Subtraction: XOR the sign bit into rreg, the register that holds e2
            regm_t nretregs = XMMREGS & ~retregs;
            reg_t sreg; // hold sign bit
            const uint sz = tysize(e1.Ety);
            allocreg(cdb,&nretregs,&sreg,e2.Ety);
            targ_size_t signbit = 0x80000000;
            if (sz == 8)
                signbit = cast(targ_size_t)0x8000000000000000L;
            movxmmconst(cdb,sreg, sz, signbit, 0);
            getregs(cdb,nretregs);
            const opcode_t xop = (sz == 8) ? XORPD : XORPS;       // XORPD/S rreg,sreg
            cdb.gen2(xop,modregxrmx(3,rreg-XMM0,sreg-XMM0));
        }
        if (retregs != *pretregs)
            fixresult(cdb,e,retregs,pretregs);
        return;
    }

    // General case: e1 into `reg`, e2 into a different XMM register `rreg`
    regm_t retregs = *pretregs & XMMREGS;
    if (!retregs)
        retregs = XMMREGS;
    const constflag = OTrel(e.Eoper);   // passed to codelem() as its last argument
    codelem(cdb,e1,&retregs,constflag); // eval left leaf
    const reg = findreg(retregs);
    regm_t rretregs = XMMREGS & ~retregs;
    scodelem(cdb, e2, &rretregs, retregs, true);  // eval right leaf

    const rreg = findreg(rretregs);
    const op = xmmoperator(e1.Ety, e.Eoper);

    /* We should take advantage of mem addressing modes for OP XMM,MEM
     * but we do not at the moment.
     */
    if (OTrel(e.Eoper))
    {
        // Relational operator: the operands are encoded in the opposite order
        // (rreg,reg) from the arithmetic case below, getregs() is not called
        // and the function returns without calling fixresult().
        cdb.gen2(op,modregxrmx(3,rreg-XMM0,reg-XMM0));
        checkSetVex(cdb.last(), e1.Ety);
        return;
    }

    // Arithmetic operator: OP reg,rreg
    getregs(cdb,retregs);
    cdb.gen2(op,modregxrmx(3,reg-XMM0,rreg-XMM0));
    checkSetVex(cdb.last(), e1.Ety);
    if (retregs != *pretregs)
        fixresult(cdb,e,retregs,pretregs);
}
198 
199 
/************************
 * Generate code for an assignment using XMM registers.
 * Params:
 *      cdb = code builder the generated instructions are appended to
 *      e = the assignment node; handed to fixresult()
 *      op = store opcode to use, CMP means select one with xmmstore()
 *      e1 = lvalue (destination of the store); freed before returning
 *      e2 = rvalue (value to store)
 *      pretregs = register mask requested for the result. mPSW is cleared
 *                 from it when it is set and e1 is a leaf.
 */

void xmmeq(ref CodeBuilder cdb, elem *e, opcode_t op, elem *e1, elem *e2,regm_t *pretregs)
{
    code cs;
    elem *e11;
    bool regvar;                  /* true means evaluate into register variable */
    regm_t varregm;
    targ_int postinc;

    //printf("xmmeq(e1 = %p, e2 = %p, *pretregs = %s)\n", e1, e2, regm_str(*pretregs));
    tym_t tyml = tybasic(e1.Ety);              /* type of lvalue               */
    regm_t retregs = *pretregs;

    if (!(retregs & XMMREGS))
        retregs = XMMREGS;              // pick any XMM reg

    bool aligned = xmmIsAligned(e1);
    // If default, select store opcode
    cs.Iop = (op == CMP) ? xmmstore(tyml, aligned) : op;
    regvar = false;
    varregm = 0;
    if (config.flags4 & CFG4optimized)
    {
        // Be careful of cases like (x = x+x+x). We cannot evaluate in
        // x if x is in a register.
        reg_t varreg;
        if (isregvar(e1,&varregm,&varreg) &&    // if lvalue is register variable
            doinreg(e1.EV.Vsym,e2) &&           // and we can compute directly into it
            varregm & XMMREGS
           )
        {   regvar = true;
            retregs = varregm;    // evaluate directly in target register
        }
    }
    if (*pretregs & mPSW && OTleaf(e1.Eoper))     // if evaluating e1 couldn't change flags
    {   // Be careful that this lines up with jmpopcode()
        retregs |= mPSW;
        *pretregs &= ~mPSW;
    }
    scodelem(cdb,e2,&retregs,0,true);    // get rvalue

    // Look for special case of (*p++ = ...), where p is a register variable
    if (e1.Eoper == OPind &&
        ((e11 = e1.EV.E1).Eoper == OPpostinc || e11.Eoper == OPpostdec) &&
        e11.EV.E1.Eoper == OPvar &&
        e11.EV.E1.EV.Vsym.Sfl == FLreg
       )
    {
        // Remember the signed step; the pointer itself is adjusted after the store
        postinc = e11.EV.E2.EV.Vint;
        if (e11.Eoper == OPpostdec)
            postinc = -postinc;
        getlvalue(cdb,&cs,e11,RMstore | retregs);
        freenode(e11.EV.E2);
    }
    else
    {   postinc = 0;
        getlvalue(cdb,&cs,e1,RMstore | retregs);       // get lvalue (cl == CNIL if regvar)
    }

    getregs_imm(cdb,regvar ? varregm : 0);

    // Encode the source XMM register into the reg field of the store
    const reg = findreg(retregs & XMMREGS);
    cs.Irm |= modregrm(0,(reg - XMM0) & 7,0);
    if ((reg - XMM0) & 8)
        cs.Irex |= REX_R;

    // Do not generate mov from register onto itself
    if (!(regvar && reg == XMM0 + ((cs.Irm & 7) | (cs.Irex & REX_B ? 8 : 0))))
    {
        cdb.gen(&cs);         // MOV EA+offset,reg
        checkSetVex(cdb.last(), tyml);
    }

    if (e1.Ecount ||                     // if lvalue is a CSE or
        regvar)                           // rvalue can't be a CSE
    {
        getregs_imm(cdb,retregs);        // necessary if both lvalue and
                                        //  rvalue are CSEs (since a reg
                                        //  can hold only one e at a time)
        cssave(e1,retregs,!OTleaf(e1.Eoper));     // if lvalue is a CSE
    }

    fixresult(cdb,e,retregs,pretregs);
    if (postinc)
    {
        // Apply the deferred *p++ / *p-- adjustment to the pointer register
        const increg = findreg(idxregm(&cs));  // the register to increment
        if (*pretregs & mPSW)
        {   // Use LEA to avoid touching the flags
            uint rm = cs.Irm & 7;
            if (cs.Irex & REX_B)
                rm |= 8;
            cdb.genc1(LEA,buildModregrm(2,increg,rm),FLconst,postinc);
            if (tysize(e11.EV.E1.Ety) == 8)
                code_orrex(cdb.last(), REX_W);
        }
        else if (I64)
        {
            cdb.genc2(0x81,modregrmx(3,0,increg),postinc);
            if (tysize(e11.EV.E1.Ety) == 8)
                code_orrex(cdb.last(), REX_W);
        }
        else
        {
            if (postinc == 1)
                cdb.gen1(0x40 + increg);       // INC increg
            else if (postinc == -cast(targ_int)1)
                cdb.gen1(0x48 + increg);       // DEC increg
            else
            {
                cdb.genc2(0x81,modregrm(3,0,increg),postinc);
            }
        }
    }
    freenode(e1);
}
322 
/********************************
 * Generate code for conversion using SSE2 instructions.
 *
 *      OPs32_d
 *      OPs64_d (64-bit only)
 *      OPu32_d (64-bit only)
 *      OPd_f
 *      OPf_d
 *      OPd_s32
 *      OPd_u32
 *      OPd_s64 (64-bit only)
 *
 * Params:
 *      cdb = code builder the generated instructions are appended to
 *      e = conversion node; its operand is e.EV.E1
 *      pretregs = register mask requested for the result
 */

void xmmcnvt(ref CodeBuilder cdb,elem *e,regm_t *pretregs)
{
    //printf("xmmconvt: %p, %s\n", e, regm_str(*pretregs));
    opcode_t op = NoOpcode;     // conversion instruction, chosen by the switch below
    regm_t regs;                // registers the operand is evaluated into
    tym_t ty;                   // type of the result
    ubyte rex = 0;              // REX_W when the integer side is 64 bits wide
    bool zx = false; // zero extend uint

    /* There are no ops for integer <-> float/real conversions
     * but there are instructions for them. In order to use these
     * try to fuse chained conversions. Be careful not to lose
     * precision for real to long.
     */
    elem *e1 = e.EV.E1;
    switch (e.Eoper)
    {
    case OPd_f:
        // If the operand is itself an integer -> double conversion, the pair
        // may be fused into a single CVTSI2SS (see below).
        if (e1.Eoper == OPs32_d)
        { }
        else if (I64 && e1.Eoper == OPs64_d)
            rex = REX_W;
        else if (I64 && e1.Eoper == OPu32_d)
        {   rex = REX_W;
            zx = true;
        }
        else
        {   regs = XMMREGS;
            op = CVTSD2SS;
            ty = TYfloat;
            break;
        }
        if (e1.Ecount)
        {
            // Operand has a nonzero Ecount: do not fuse, convert the double
            regs = XMMREGS;
            op = CVTSD2SS;
            ty = TYfloat;
            break;
        }
        // directly use si2ss
        regs = ALLREGS;
        e1 = e1.EV.E1;  // fused operation
        op = CVTSI2SS;
        ty = TYfloat;
        break;

    case OPs32_d:              goto Litod;
    case OPs64_d: rex = REX_W; goto Litod;
    case OPu32_d: rex = REX_W; zx = true; goto Litod;
    Litod:
        // integer -> double: operand in a general purpose register
        regs = ALLREGS;
        op = CVTSI2SD;
        ty = TYdouble;
        break;

    case OPd_s32: ty = TYint;  goto Ldtoi;
    case OPd_u32: ty = TYlong; if (I64) rex = REX_W; goto Ldtoi;
    case OPd_s64: ty = TYlong; rex = REX_W; goto Ldtoi;
    Ldtoi:
        // double -> integer: operand in an XMM register
        regs = XMMREGS;
        switch (e1.Eoper)
        {
        case OPf_d:
            if (e1.Ecount)
            {
                op = CVTTSD2SI;
                break;
            }
            e1 = e1.EV.E1;      // fused operation
            op = CVTTSS2SI;
            break;
        case OPld_d:
            if (e.Eoper == OPd_s64)
            {
                // hand the whole conversion to cnvt87() instead
                cnvt87(cdb,e,pretregs); // precision
                return;
            }
            goto default;

        default:
            op = CVTTSD2SI;
            break;
        }
        break;

    case OPf_d:
        regs = XMMREGS;
        op = CVTSS2SD;
        ty = TYdouble;
        break;

    default:
        assert(0);
    }
    assert(op != NoOpcode);

    // Evaluate the (possibly fused) operand
    codelem(cdb,e1, &regs, false);
    reg_t reg = findreg(regs);
    if (isXMMreg(reg))
        reg -= XMM0;            // make it relative to XMM0 for modregxrmx()
    else if (zx)
    {   assert(I64);
        getregs(cdb,regs);
        genregs(cdb,0x8B,reg,reg); // MOV reg,reg to zero upper 32-bit
                                   // Don't use x89 because that will get optimized away
        code_orflag(cdb.last(),CFvolatile);
    }

    // Select the class of the result register from the result type
    regm_t retregs = *pretregs;
    if (tyxmmreg(ty)) // target is XMM
    {   if (!(*pretregs & XMMREGS))
            retregs = XMMREGS;
    }
    else              // source is XMM
    {   assert(regs & XMMREGS);
        if (!(retregs & ALLREGS))
            retregs = ALLREGS;
    }

    reg_t rreg;
    allocreg(cdb,&retregs,&rreg,ty);
    if (isXMMreg(rreg))
        rreg -= XMM0;

    // Emit the conversion: op rreg,reg
    cdb.gen2(op, modregxrmx(3,rreg,reg));
    assert(I64 || !rex);        // REX_W may only have been requested when I64
    if (rex)
        code_orrex(cdb.last(), rex);

    if (*pretregs != retregs)
        fixresult(cdb,e,retregs,pretregs);
}
468 
/********************************
 * Generate code for op=
 *
 * The right operand is evaluated into an XMM register first. The left
 * operand is then either used in place (register variable, optimized builds
 * only) or loaded from memory, combined with the instruction chosen by
 * xmmoperator() and stored back.
 *
 * Params:
 *      cdb = code builder the generated instructions are appended to
 *      e = op= node; lvalue is e.EV.E1, right operand is e.EV.E2
 *      pretregs = register mask requested for the result
 */

void xmmopass(ref CodeBuilder cdb,elem *e,regm_t *pretregs)
{   elem *e1 = e.EV.E1;
    elem *e2 = e.EV.E2;
    tym_t ty1 = tybasic(e1.Ety);

    // Prefer a register for e2 that is not one of the requested result registers
    regm_t rretregs = XMMREGS & ~*pretregs;
    if (!rretregs)
        rretregs = XMMREGS;

    codelem(cdb,e2,&rretregs,false); // eval right leaf
    reg_t rreg = findreg(rretregs);

    code cs;
    regm_t retregs;
    reg_t reg;
    bool regvar = false;
    if (config.flags4 & CFG4optimized)
    {
        // Be careful of cases like (x = x+x+x). We cannot evaluate in
        // x if x is in a register.
        reg_t varreg;
        regm_t varregm;
        if (isregvar(e1,&varregm,&varreg) &&    // if lvalue is register variable
            doinreg(e1.EV.Vsym,e2)          // and we can compute directly into it
           )
        {   regvar = true;
            retregs = varregm;
            reg = varreg;                       // evaluate directly in target register
            getregs(cdb,retregs);       // destroy these regs
        }
    }

    if (!regvar)
    {
        // Load the current value of the lvalue into a register other than rreg
        getlvalue(cdb,&cs,e1,rretregs);         // get EA
        retregs = *pretregs & XMMREGS & ~rretregs;
        if (!retregs)
            retregs = XMMREGS & ~rretregs;
        allocreg(cdb,&retregs,&reg,ty1);
        cs.Iop = xmmload(ty1, true);            // MOVSD xmm,xmm_m64
        code_newreg(&cs,reg - XMM0);
        cdb.gen(&cs);
        checkSetVex(cdb.last(), ty1);
    }

    const op = xmmoperator(e1.Ety, e.Eoper);
    cdb.gen2(op,modregxrmx(3,reg-XMM0,rreg-XMM0));     // OP reg,rreg
    checkSetVex(cdb.last(), e1.Ety);

    if (!regvar)
    {
        // Write the result back through the same effective address
        cs.Iop = xmmstore(ty1,true);      // reverse operand order of MOVS[SD]
        cdb.gen(&cs);
        checkSetVex(cdb.last(), ty1);
    }

    if (e1.Ecount ||                     // if lvalue is a CSE or
        regvar)                           // rvalue can't be a CSE
    {
        getregs_imm(cdb,retregs);        // necessary if both lvalue and
                                        //  rvalue are CSEs (since a reg
                                        //  can hold only one e at a time)
        cssave(e1,retregs,!OTleaf(e1.Eoper));     // if lvalue is a CSE
    }

    fixresult(cdb,e,retregs,pretregs);
    freenode(e1);
}
541 
/********************************
 * Generate code for post increment and post decrement.
 *
 * The current value of the lvalue is copied to a separate result register
 * before the lvalue is updated, and that copy is what is handed to
 * fixresult().
 *
 * Params:
 *      cdb = code builder the generated instructions are appended to
 *      e = OPpostinc/OPpostdec node; lvalue is e.EV.E1, step is e.EV.E2
 *      pretregs = register mask requested for the result
 */

void xmmpost(ref CodeBuilder cdb,elem *e,regm_t *pretregs)
{
    elem *lvalue = e.EV.E1;
    elem *step = e.EV.E2;
    const tym_t ty = tybasic(lvalue.Ety);

    regm_t curregs;             // mask of the register holding the lvalue's value
    reg_t curreg;               // that register
    bool inRegVar = false;      // true if the lvalue is updated in its own register
    if (config.flags4 & CFG4optimized)
    {
        // Be careful of cases like (x = x+x+x). We cannot evaluate in
        // x if x is in a register.
        reg_t varreg;
        regm_t varregm;
        if (isregvar(lvalue,&varregm,&varreg) && doinreg(lvalue.EV.Vsym,step))
        {
            inRegVar = true;
            curregs = varregm;
            curreg = varreg;            // work directly in the variable's register
            getregs(cdb,curregs);       // destroy these regs
        }
    }

    code ea;
    if (!inRegVar)
    {
        // Load the lvalue from memory, preferring a register outside *pretregs
        getlvalue(cdb,&ea,lvalue,0);    // get EA
        curregs = XMMREGS & ~*pretregs;
        if (!curregs)
            curregs = XMMREGS;
        allocreg(cdb,&curregs,&curreg,ty);
        ea.Iop = xmmload(ty, true);     // MOVSD xmm,xmm_m64
        code_newreg(&ea,curreg - XMM0);
        cdb.gen(&ea);
        checkSetVex(cdb.last(), ty);
    }

    // Register that keeps the value from before the update
    regm_t oldregs = XMMREGS & *pretregs & ~curregs;
    if (!oldregs)
        oldregs = XMMREGS & ~curregs;
    reg_t oldreg;
    allocreg(cdb,&oldregs, &oldreg, ty);

    cdb.gen2(xmmload(ty,true),modregxrmx(3,oldreg-XMM0,curreg-XMM0));   // MOVSS/D oldreg,curreg
    checkSetVex(cdb.last(), ty);

    // Evaluate the step into a third register
    regm_t stepregs = XMMREGS & ~(*pretregs | curregs | oldregs);
    if (!stepregs)
        stepregs = XMMREGS & ~(curregs | oldregs);
    codelem(cdb,step,&stepregs,false);
    const stepreg = findreg(stepregs);

    const opcode = xmmoperator(lvalue.Ety, e.Eoper);
    cdb.gen2(opcode,modregxrmx(3,curreg-XMM0,stepreg-XMM0));    // ADD curreg,stepreg
    checkSetVex(cdb.last(), lvalue.Ety);

    if (!inRegVar)
    {
        // Store the updated value back through the same effective address
        ea.Iop = xmmstore(ty,true);     // reverse operand order of MOVS[SD]
        cdb.gen(&ea);
        checkSetVex(cdb.last(), ty);
    }

    if (lvalue.Ecount || inRegVar)      // lvalue is a CSE, or rvalue can't be one
    {
        // necessary if both lvalue and rvalue are CSEs
        // (since a reg can hold only one e at a time)
        getregs_imm(cdb,curregs);
        cssave(lvalue,curregs,!OTleaf(lvalue.Eoper));
    }

    fixresult(cdb,e,oldregs,pretregs);
    freenode(lvalue);
}
625 
/******************
 * Negate operator
 *
 * Generates:
 *    MOV valreg,e1
 *    MOV signreg,signbit
 *    XOR valreg,signreg
 *
 * Params:
 *      cdb = code builder the generated instructions are appended to
 *      e = negation node; its operand is e.EV.E1
 *      pretregs = register mask requested for the result, must not be 0
 */

void xmmneg(ref CodeBuilder cdb,elem *e,regm_t *pretregs)
{
    //printf("xmmneg()\n");
    //elem_print(e);
    assert(*pretregs);
    elem *operand = e.EV.E1;
    const tym_t ty = tybasic(operand.Ety);
    const int size = _tysize[ty];
    const bool is8bytes = size == 8;

    // Operand goes into a requested XMM register, or any XMM register if none was requested
    regm_t valregs = *pretregs & XMMREGS;
    if (!valregs)
        valregs = XMMREGS;
    codelem(cdb,operand,&valregs,false);
    getregs(cdb,valregs);
    const valreg = findreg(valregs);

    // A second XMM register receives the sign bit pattern
    regm_t signregs = XMMREGS & ~valregs;
    reg_t signreg;
    allocreg(cdb,&signregs,&signreg,ty);
    const targ_size_t signbit = is8bytes
        ? cast(targ_size_t)0x8000000000000000L
        : cast(targ_size_t)0x80000000;
    movxmmconst(cdb,signreg, size, signbit, 0);

    getregs(cdb,valregs);
    cdb.gen2(is8bytes ? XORPD : XORPS,
             modregxrmx(3,valreg-XMM0,signreg-XMM0));      // XORPD/S valreg,signreg
    fixresult(cdb,e,valregs,pretregs);
}
663 
/******************
 * Absolute value operator OPabs
 *
 * Generates:
 *    MOV valreg,e1
 *    MOV maskreg,mask
 *    AND valreg,maskreg
 *
 * Params:
 *      cdb = code builder the generated instructions are appended to
 *      e = OPabs node; its operand is e.EV.E1
 *      pretregs = register mask requested for the result, must not be 0
 */

void xmmabs(ref CodeBuilder cdb,elem *e,regm_t *pretregs)
{
    //printf("xmmabs()\n");
    //elem_print(e);
    assert(*pretregs);
    elem *operand = e.EV.E1;
    const tym_t ty = tybasic(operand.Ety);
    const int size = _tysize[ty];
    const bool is8bytes = size == 8;

    // Operand goes into a requested XMM register, or any XMM register if none was requested
    regm_t valregs = *pretregs & XMMREGS;
    if (!valregs)
        valregs = XMMREGS;
    codelem(cdb,operand,&valregs,false);
    getregs(cdb,valregs);
    const valreg = findreg(valregs);

    // A second XMM register receives the pattern with every bit but the top one set
    regm_t maskregs = XMMREGS & ~valregs;
    reg_t maskreg;
    allocreg(cdb,&maskregs,&maskreg,ty);
    const targ_size_t keepbits = is8bytes
        ? cast(targ_size_t)0x7FFF_FFFF_FFFF_FFFFL
        : cast(targ_size_t)0x7FFF_FFFF;
    movxmmconst(cdb,maskreg, size, keepbits, 0);

    getregs(cdb,valregs);
    cdb.gen2(is8bytes ? ANDPD : ANDPS,
             modregxrmx(3,valreg-XMM0,maskreg-XMM0));      // ANDPD/S valreg,maskreg
    fixresult(cdb,e,valregs,pretregs);
}
701 
/*****************************
 * Select the load opcode that matches a type.
 * It is important to use the right one even if the number of bits moved is the same,
 * as there are performance consequences for using the wrong one.
 * Params:
 *      tym = type of data to load
 *      aligned = for vectors, true if aligned to 16 bytes;
 *                ignored (treated as false) when tysize(tym) is 32
 * Returns:
 *      the LODxx opcode for `tym`; an unhandled type is printed and asserts
 */

opcode_t xmmload(tym_t tym, bool aligned)
{
    const bool is32bytes = tysize(tym) == 32;
    const bool useAligned = aligned && !is32bytes;

    switch (tybasic(tym))
    {
        case TYuint, TYint, TYlong, TYulong:
            return LODD;                                // MOVD

        case TYfloat, TYcfloat, TYifloat:
            return LODSS;                               // MOVSS

        case TYllong, TYullong:
            return LODQ;                                // MOVQ

        case TYdouble, TYcdouble, TYidouble:
            return LODSD;                               // MOVSD

        case TYfloat8, TYfloat4:
            return useAligned ? LODAPS : LODUPS;        // MOVAPS / MOVUPS

        case TYdouble4, TYdouble2:
            return useAligned ? LODAPD : LODUPD;        // MOVAPD / MOVUPD

        case TYschar16, TYuchar16, TYshort8, TYushort8:
        case TYlong4, TYulong4, TYllong2, TYullong2:
        case TYschar32, TYuchar32, TYshort16, TYushort16:
        case TYlong8, TYulong8, TYllong4, TYullong4:
            return useAligned ? LODDQA : LODDQU;        // MOVDQA / MOVDQU

        default:
            printf("tym = x%x\n", tym);
            assert(0);
    }
}
758 
/*****************************
 * Select the store opcode that matches a type.
 * Params:
 *      tym = type of data to store
 *      aligned = for vectors, selects the aligned (true) or unaligned (false)
 *                form of the move
 * Returns:
 *      the STOxx opcode for `tym`; an unhandled type is printed and asserts
 */

opcode_t xmmstore(tym_t tym, bool aligned)
{
    switch (tybasic(tym))
    {
        case TYuint, TYint, TYlong, TYulong:
            return STOD;                                // MOVD

        case TYfloat, TYifloat:
            return STOSS;                               // MOVSS

        case TYllong, TYullong:
            return STOQ;                                // MOVQ

        // note: TYcfloat is stored with MOVSD, like the double types
        case TYdouble, TYidouble, TYcdouble, TYcfloat:
            return STOSD;                               // MOVSD

        case TYfloat8, TYfloat4:
            return aligned ? STOAPS : STOUPS;           // MOVAPS / MOVUPS

        case TYdouble4, TYdouble2:
            return aligned ? STOAPD : STOUPD;           // MOVAPD / MOVUPD

        case TYschar16, TYuchar16, TYshort8, TYushort8:
        case TYlong4, TYulong4, TYllong2, TYullong2:
        case TYschar32, TYuchar32, TYshort16, TYushort16:
        case TYlong8, TYulong8, TYllong4, TYullong4:
            return aligned ? STODQA : STODQU;           // MOVDQA / MOVDQU

        default:
            printf("tym = 0x%x\n", tym);
            assert(0);
    }
}
808 
809 
810 /************************************
811  * Get correct XMM operator based on type and operator.
812  */
813 
814 private opcode_t xmmoperator(tym_t tym, OPER oper)
815 {
816     tym = tybasic(tym);
817     opcode_t op;
818     switch (oper)
819     {
820         case OPadd:
821         case OPaddass:
822         case OPpostinc:
823             switch (tym)
824             {
825                 case TYfloat:
826                 case TYifloat:  op = ADDSS;  break;
827                 case TYdouble:
828                 case TYidouble: op = ADDSD;  break;
829 
830                 // SIMD vector types
831                 case TYfloat8:
832                 case TYfloat4:  op = ADDPS;  break;
833                 case TYdouble4:
834                 case TYdouble2: op = ADDPD;  break;
835                 case TYschar32:
836                 case TYuchar32:
837                 case TYschar16:
838                 case TYuchar16: op = PADDB;  break;
839                 case TYshort16:
840                 case TYushort16:
841                 case TYshort8:
842                 case TYushort8: op = PADDW;  break;
843                 case TYlong8:
844                 case TYulong8:
845                 case TYlong4:
846                 case TYulong4:  op = PADDD;  break;
847                 case TYllong4:
848                 case TYullong4:
849                 case TYllong2:
850                 case TYullong2: op = PADDQ;  break;
851 
852                 default:
853                     printf("tym = x%x\n", tym);
854                     assert(0);
855             }
856             break;
857 
858         case OPmin:
859         case OPminass:
860         case OPpostdec:
861             switch (tym)
862             {
863                 case TYfloat:
864                 case TYifloat:  op = SUBSS;  break;
865                 case TYdouble:
866                 case TYidouble: op = SUBSD;  break;
867 
868                 // SIMD vector types
869                 case TYfloat8:
870                 case TYfloat4:  op = SUBPS;  break;
871                 case TYdouble4:
872                 case TYdouble2: op = SUBPD;  break;
873                 case TYschar32:
874                 case TYuchar32:
875                 case TYschar16:
876                 case TYuchar16: op = PSUBB;  break;
877                 case TYshort16:
878                 case TYushort16:
879                 case TYshort8:
880                 case TYushort8: op = PSUBW;  break;
881                 case TYlong8:
882                 case TYulong8:
883                 case TYlong4:
884                 case TYulong4:  op = PSUBD;  break;
885                 case TYllong4:
886                 case TYullong4:
887                 case TYllong2:
888                 case TYullong2: op = PSUBQ;  break;
889 
890                 default:        assert(0);
891             }
892             break;
893 
894         case OPmul:
895         case OPmulass:
896             switch (tym)
897             {
898                 case TYfloat:
899                 case TYifloat:  op = MULSS;  break;
900                 case TYdouble:
901                 case TYidouble: op = MULSD;  break;
902 
903                 // SIMD vector types
904                 case TYfloat8:
905                 case TYfloat4:  op = MULPS;  break;
906                 case TYdouble4:
907                 case TYdouble2: op = MULPD;  break;
908                 case TYshort16:
909                 case TYushort16:
910                 case TYshort8:
911                 case TYushort8: op = PMULLW; break;
912                 case TYlong8:
913                 case TYulong8:
914                 case TYlong4:
915                 case TYulong4:  op = PMULLD; break;
916 
917                 default:        assert(0);
918             }
919             break;
920 
921         case OPdiv:
922         case OPdivass:
923             switch (tym)
924             {
925                 case TYfloat:
926                 case TYifloat:  op = DIVSS;  break;
927                 case TYdouble:
928                 case TYidouble: op = DIVSD;  break;
929 
930                 // SIMD vector types
931                 case TYfloat8:
932                 case TYfloat4:  op = DIVPS;  break;
933                 case TYdouble4:
934                 case TYdouble2: op = DIVPD;  break;
935 
936                 default:        assert(0);
937             }
938             break;
939 
940         case OPor:
941         case OPorass:
942             switch (tym)
943             {
944                 // SIMD vector types
945                 case TYschar16:
946                 case TYuchar16:
947                 case TYshort8:
948                 case TYushort8:
949                 case TYlong4:
950                 case TYulong4:
951                 case TYllong2:
952                 case TYullong2:
953                 case TYschar32:
954                 case TYuchar32:
955                 case TYshort16:
956                 case TYushort16:
957                 case TYlong8:
958                 case TYulong8:
959                 case TYllong4:
960                 case TYullong4: op = POR; break;
961 
962                 default:        assert(0);
963             }
964             break;
965 
966         case OPand:
967         case OPandass:
968             switch (tym)
969             {
970                 // SIMD vector types
971                 case TYschar16:
972                 case TYuchar16:
973                 case TYshort8:
974                 case TYushort8:
975                 case TYlong4:
976                 case TYulong4:
977                 case TYllong2:
978                 case TYullong2:
979                 case TYschar32:
980                 case TYuchar32:
981                 case TYshort16:
982                 case TYushort16:
983                 case TYlong8:
984                 case TYulong8:
985                 case TYllong4:
986                 case TYullong4: op = PAND; break;
987 
988                 default:        assert(0);
989             }
990             break;
991 
992         case OPxor:
993         case OPxorass:
994             switch (tym)
995             {
996                 // SIMD vector types
997                 case TYschar16:
998                 case TYuchar16:
999                 case TYshort8:
1000                 case TYushort8:
1001                 case TYlong4:
1002                 case TYulong4:
1003                 case TYllong2:
1004                 case TYullong2:
1005                 case TYschar32:
1006                 case TYuchar32:
1007                 case TYshort16:
1008                 case TYushort16:
1009                 case TYlong8:
1010                 case TYulong8:
1011                 case TYllong4:
1012                 case TYullong4: op = PXOR; break;
1013 
1014                 default:        assert(0);
1015             }
1016             break;
1017 
1018         case OPlt:
1019         case OPle:
1020         case OPgt:
1021         case OPge:
1022         case OPne:
1023         case OPeqeq:
1024         case OPunord:        /* !<>=         */
1025         case OPlg:           /* <>           */
1026         case OPleg:          /* <>=          */
1027         case OPule:          /* !>           */
1028         case OPul:           /* !>=          */
1029         case OPuge:          /* !<           */
1030         case OPug:           /* !<=          */
1031         case OPue:           /* !<>          */
1032         case OPngt:
1033         case OPnge:
1034         case OPnlt:
1035         case OPnle:
1036         case OPord:
1037         case OPnlg:
1038         case OPnleg:
1039         case OPnule:
1040         case OPnul:
1041         case OPnuge:
1042         case OPnug:
1043         case OPnue:
1044             switch (tym)
1045             {
1046                 case TYfloat:
1047                 case TYifloat:  op = UCOMISS;  break;
1048                 case TYdouble:
1049                 case TYidouble: op = UCOMISD;  break;
1050 
1051                 default:        assert(0);
1052             }
1053             break;
1054 
1055         default:
1056             assert(0);
1057     }
1058     return op;
1059 }
1060 
/***************
 * Generate code for a vector instruction elem (the tree built for `__simd()`).
 * The operand list hanging off e.EV.E1 is flattened into an array:
 *      params[0] = constant holding the instruction opcode
 *      params[1] = op1
 *      params[2] = op2  (present when there are 3 or 4 parameters)
 *      params[3] = imm8 (present when there are 4 parameters)
 * Params:
 *      cdb = code sink
 *      e = the vector elem
 *      pretregs = requested result registers; 0 means evaluate the
 *                 parameters for their side effects only
 */
void cdvector(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
{
    /* e should look like one of:
     *    vector
     *      |
     *    param
     *    /   \
     *  param op2
     *  /   \
     * op   op1
     */

    if (!config.fpxmmregs)
    {   printf("SIMD operations not supported on this platform\n");
        exit(1);
    }

    // Flatten the parameter tree into a temporary array
    const n = el_nparams(e.EV.E1);
    elem **params = cast(elem **)malloc(n * (elem *).sizeof);
    assert(params);
    elem **tmp = params;
    el_paramArray(&tmp, e.EV.E1);

static if (0)
{
    printf("cdvector()\n");
    for (int i = 0; i < n; i++)
    {
        printf("[%d]: ", i);
        elem_print(params[i]);
    }
}

    if (*pretregs == 0)
    {   /* Evaluate for side effects only
         */
        foreach (i; 0 .. n)
        {
            codelem(cdb,params[i], pretregs, false);
            *pretregs = 0;      // in case they got set
        }
        free(params);           // the array is not needed past this point; do not leak it
        return;
    }

    assert(n >= 2 && n <= 4);

    elem *eop = params[0];
    elem *op1 = params[1];
    elem *op2 = null;
    tym_t ty2 = 0;
    if (n >= 3)
    {   op2 = params[2];
        ty2 = tybasic(op2.Ety);
    }

    auto op = cast(opcode_t)el_tolong(eop);
    debug assert(!isXMMstore(op));      // stores are handled by cdvecsto()
    tym_t ty1 = tybasic(op1.Ety);

    regm_t retregs;
    if (n == 3 && ty2 == TYuchar && op2.Eoper == OPconst)
    {   // Handle: op xmm,imm8

        retregs = *pretregs & XMMREGS;
        if (!retregs)
            retregs = XMMREGS;
        codelem(cdb,op1,&retregs,false); // eval left leaf
        const reg = findreg(retregs);
        // The immediate forms share an opcode; r selects the operation
        // through the reg field of the modregrm byte
        int r;
        switch (op)
        {
            case PSLLD:  r = 6; op = 0x660F72;  break;
            case PSLLQ:  r = 6; op = 0x660F73;  break;
            case PSLLW:  r = 6; op = 0x660F71;  break;
            case PSRAD:  r = 4; op = 0x660F72;  break;
            case PSRAW:  r = 4; op = 0x660F71;  break;
            case PSRLD:  r = 2; op = 0x660F72;  break;
            case PSRLQ:  r = 2; op = 0x660F73;  break;
            case PSRLW:  r = 2; op = 0x660F71;  break;
            case PSRLDQ: r = 3; op = 0x660F73;  break;
            case PSLLDQ: r = 7; op = 0x660F73;  break;

            default:
                printf("op = x%x\n", op);
                assert(0);
        }
        getregs(cdb,retregs);
        cdb.genc2(op,modregrmx(3,r,reg-XMM0), cast(uint)el_tolong(op2));
    }
    else if (n == 2)
    {   /* Handle: op xmm,mem
         * where xmm is written only, not read
         */
        code cs;

        if ((op1.Eoper == OPind && !op1.Ecount) || op1.Eoper == OPvar)
        {
            getlvalue(cdb,&cs, op1, RMload);     // get addressing mode
        }
        else
        {
            // Operand is not directly addressable: evaluate it into an
            // XMM register and use the register form of the instruction
            regm_t rretregs = XMMREGS;
            codelem(cdb,op1, &rretregs, false);
            const rreg = findreg(rretregs) - XMM0;
            cs.Irm = modregrm(3,0,rreg & 7);
            cs.Iflags = 0;
            cs.Irex = 0;
            if (rreg & 8)
                cs.Irex |= REX_B;
        }

        retregs = *pretregs & XMMREGS;
        if (!retregs)
            retregs = XMMREGS;
        reg_t reg;
        allocreg(cdb,&retregs, &reg, e.Ety);
        code_newreg(&cs, reg - XMM0);
        cs.Iop = op;
        cdb.gen(&cs);
    }
    else if (n == 3 || n == 4)
    {   /* Handle:
         *      op xmm,mem        // n = 3
         *      op xmm,mem,imm8   // n = 4
         * Both xmm and mem are operands, evaluate xmm first.
         */

        code cs;

        retregs = *pretregs & XMMREGS;
        if (!retregs)
            retregs = XMMREGS;
        codelem(cdb,op1,&retregs,false); // eval left leaf
        const reg = findreg(retregs);

        if ((op2.Eoper == OPind && !op2.Ecount) || op2.Eoper == OPvar)
        {
            getlvalue(cdb,&cs, op2, RMload | retregs);     // get addressing mode
        }
        else
        {
            // Evaluate op2 into a different XMM register while keeping op1's register intact
            regm_t rretregs = XMMREGS & ~retregs;
            scodelem(cdb, op2, &rretregs, retregs, true);
            const rreg = findreg(rretregs) - XMM0;
            cs.Irm = modregrm(3,0,rreg & 7);
            cs.Iflags = 0;
            cs.Irex = 0;
            if (rreg & 8)
                cs.Irex |= REX_B;
        }

        getregs(cdb,retregs);

        // These instructions require an imm8; diagnose its absence and
        // substitute 0 so that code generation can continue
        switch (op)
        {
            case CMPPD:   case CMPSS:   case CMPSD:   case CMPPS:
            case PSHUFD:  case PSHUFHW: case PSHUFLW:
            case BLENDPD: case BLENDPS: case DPPD:    case DPPS:
            case MPSADBW: case PBLENDW:
            case ROUNDPD: case ROUNDPS: case ROUNDSD: case ROUNDSS:
            case SHUFPD:  case SHUFPS:
                if (n == 3)
                {
                    version (MARS)
                        if (pass == PASSfinal)
                            error(e.Esrcpos.Sfilename, e.Esrcpos.Slinnum, e.Esrcpos.Scharnum, "missing 4th parameter to `__simd()`");
                    cs.IFL2 = FLconst;
                    cs.IEV2.Vsize_t = 0;
                }
                break;
            default:
                break;
        }

        if (n == 4)
        {
            elem *imm8 = params[3];
            cs.IFL2 = FLconst;
version (MARS)
{
            if (imm8.Eoper != OPconst)
            {
                error(imm8.Esrcpos.Sfilename, imm8.Esrcpos.Slinnum, imm8.Esrcpos.Scharnum, "last parameter to `__simd()` must be a constant");
                cs.IEV2.Vsize_t = 0;
            }
            else
                cs.IEV2.Vsize_t = cast(targ_size_t)el_tolong(imm8);
}
else
{
            cs.IEV2.Vsize_t = cast(targ_size_t)el_tolong(imm8);
}
        }
        code_newreg(&cs, reg - XMM0);
        cs.Iop = op;
        cdb.gen(&cs);
    }
    else
        assert(0);
    fixresult(cdb,e,retregs,pretregs);
    free(params);
    freenode(e);
}
1264 
1265 /***************
1266  * Generate code for vector "store" operations.
1267  * The tree e must look like:
1268  *  (op1 OPvecsto (op OPparam op2))
1269  * where op is the store instruction STOxxxx.
1270  */
1271 void cdvecsto(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
1272 {
1273     //printf("cdvecsto()\n");
1274     //elem_print(e);
1275     elem *op1 = e.EV.E1;
1276     elem *op2 = e.EV.E2.EV.E2;
1277     elem *eop = e.EV.E2.EV.E1;
1278     const op = cast(opcode_t)el_tolong(eop);
1279     debug assert(isXMMstore(op));
1280     xmmeq(cdb, e, op, op1, op2, pretregs);
1281 }
1282 
1283 /***************
1284  * Generate code for OPvecfill (broadcast).
1285  * OPvecfill takes the single value in e1 and
1286  * fills the vector type with it.
1287  */
1288 void cdvecfill(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
1289 {
1290     //printf("cdvecfill(e = %p, *pretregs = %s)\n",e,regm_str(*pretregs));
1291 
1292     regm_t retregs = *pretregs & XMMREGS;
1293     if (!retregs)
1294         retregs = XMMREGS;
1295 
1296     code *c;
1297     code cs;
1298 
1299     elem *e1 = e.EV.E1;
1300 static if (0)
1301 {
1302     if ((e1.Eoper == OPind && !e1.Ecount) || e1.Eoper == OPvar)
1303     {
1304         cr = getlvalue(&cs, e1, RMload | retregs);     // get addressing mode
1305     }
1306     else
1307     {
1308         regm_t rretregs = XMMREGS & ~retregs;
1309         cr = scodelem(op2, &rretregs, retregs, true);
1310         const rreg = findreg(rretregs) - XMM0;
1311         cs.Irm = modregrm(3,0,rreg & 7);
1312         cs.Iflags = 0;
1313         cs.Irex = 0;
1314         if (rreg & 8)
1315             cs.Irex |= REX_B;
1316     }
1317 }
1318 
1319     /* e.Ety only gives us the size of the result vector, not its type.
1320      * We must combine it with the vector element type, e1.Ety, to
1321      * form the resulting vector type, ty.
1322      * The reason is someone may have painted the result of the OPvecfill to
1323      * a different vector type.
1324      */
1325     const sz = tysize(e.Ety);
1326     const ty1 = tybasic(e1.Ety);
1327     assert(sz == 16 || sz == 32);
1328     const bool x16 = (sz == 16);
1329 
1330     tym_t ty;
1331     switch (ty1)
1332     {
1333         case TYfloat:   ty = x16 ? TYfloat4  : TYfloat8;   break;
1334         case TYdouble:  ty = x16 ? TYdouble2 : TYdouble4;  break;
1335         case TYschar:   ty = x16 ? TYschar16 : TYschar32;  break;
1336         case TYuchar:   ty = x16 ? TYuchar16 : TYuchar32;  break;
1337         case TYshort:   ty = x16 ? TYshort8  : TYshort16;  break;
1338         case TYushort:  ty = x16 ? TYushort8 : TYushort16; break;
1339         case TYint:
1340         case TYlong:    ty = x16 ? TYlong4   : TYlong8;    break;
1341         case TYuint:
1342         case TYulong:   ty = x16 ? TYulong4  : TYulong8;   break;
1343         case TYllong:   ty = x16 ? TYllong2  : TYllong4;   break;
1344         case TYullong:  ty = x16 ? TYullong2 : TYullong4;  break;
1345 
1346         default:
1347             assert(0);
1348     }
1349 
1350     switch (ty)
1351     {
1352         case TYfloat4:
1353         case TYfloat8:
1354             if (config.avx && e1.Eoper == OPind && !e1.Ecount)
1355             {
1356                 // VBROADCASTSS X/YMM,MEM
1357                 getlvalue(cdb,&cs, e1, 0);         // get addressing mode
1358                 assert((cs.Irm & 0xC0) != 0xC0);   // AVX1 doesn't have register source operands
1359                 reg_t reg;
1360                 allocreg(cdb,&retregs,&reg,ty);
1361                 cs.Iop = VBROADCASTSS;
1362                 cs.Irex &= ~REX_W;
1363                 code_newreg(&cs,reg - XMM0);
1364                 checkSetVex(&cs,ty);
1365                 cdb.gen(&cs);
1366             }
1367             else
1368             {
1369                 codelem(cdb,e1,&retregs,false); // eval left leaf
1370                 const reg = cast(reg_t)(findreg(retregs) - XMM0);
1371                 getregs(cdb,retregs);
1372                 if (config.avx >= 2)
1373                 {
1374                     // VBROADCASTSS X/YMM,XMM
1375                     cdb.gen2(VBROADCASTSS, modregxrmx(3,reg,reg));
1376                     checkSetVex(cdb.last(), ty);
1377                 }
1378                 else
1379                 {
1380                     // (V)SHUFPS XMM,XMM,0
1381                     cdb.genc2(SHUFPS, modregxrmx(3,reg,reg), 0);
1382                     checkSetVex(cdb.last(), ty);
1383                     if (tysize(ty) == 32)
1384                     {
1385                         // VINSERTF128 YMM,YMM,XMM,1
1386                         cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1);
1387                         checkSetVex(cdb.last(), ty);
1388                     }
1389                 }
1390             }
1391             break;
1392 
1393         case TYdouble2:
1394         case TYdouble4:
1395             if (config.avx && tysize(ty) == 32 && e1.Eoper == OPind && !e1.Ecount)
1396             {
1397                 // VBROADCASTSD YMM,MEM
1398                 getlvalue(cdb,&cs, e1, 0);         // get addressing mode
1399                 assert((cs.Irm & 0xC0) != 0xC0);   // AVX1 doesn't have register source operands
1400                 reg_t reg;
1401                 allocreg(cdb,&retregs,&reg,ty);
1402                 cs.Iop = VBROADCASTSD;
1403                 cs.Irex &= ~REX_W;
1404                 code_newreg(&cs,reg - XMM0);
1405                 checkSetVex(&cs,ty);
1406                 cdb.gen(&cs);
1407             }
1408             else
1409             {
1410                 codelem(cdb,e1,&retregs,false); // eval left leaf
1411                 const reg = cast(reg_t)(findreg(retregs) - XMM0);
1412                 getregs(cdb,retregs);
1413                 if (config.avx >= 2 && tysize(ty) == 32)
1414                 {
1415                     // VBROADCASTSD YMM,XMM
1416                     cdb.gen2(VBROADCASTSD, modregxrmx(3,reg,reg));
1417                     checkSetVex(cdb.last(), ty);
1418                 }
1419                 else
1420                 {
1421                     // (V)UNPCKLPD XMM,XMM
1422                     cdb.gen2(UNPCKLPD, modregxrmx(3,reg,reg));
1423                     checkSetVex(cdb.last(), TYdouble2); // AVX-128
1424                     if (tysize(ty) == 32)
1425                     {
1426                         // VINSERTF128 YMM,YMM,XMM,1
1427                         cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1);
1428                         checkSetVex(cdb.last(), ty);
1429                     }
1430                 }
1431             }
1432             break;
1433 
1434         case TYschar16:
1435         case TYuchar16:
1436         case TYschar32:
1437         case TYuchar32:
1438             if (config.avx >= 2 && e1.Eoper == OPind && !e1.Ecount)
1439             {
1440                 // VPBROADCASTB X/YMM,MEM
1441                 getlvalue(cdb,&cs, e1, 0);         // get addressing mode
1442                 assert((cs.Irm & 0xC0) != 0xC0);   // AVX1 doesn't have register source operands
1443                 reg_t reg;
1444                 allocreg(cdb,&retregs,&reg,ty);
1445                 cs.Iop = VPBROADCASTB;
1446                 cs.Irex &= ~REX_W;
1447                 code_newreg(&cs,reg - XMM0);
1448                 checkSetVex(&cs,ty);
1449                 cdb.gen(&cs);
1450             }
1451             else
1452             {
1453                 regm_t regm = ALLREGS;
1454                 codelem(cdb,e1,&regm,true); // eval left leaf
1455                 const r = findreg(regm);
1456 
1457                 reg_t reg;
1458                 allocreg(cdb,&retregs,&reg, e.Ety);
1459                 reg -= XMM0;
1460                 // (V)MOVD reg,r
1461                 cdb.gen2(LODD,modregxrmx(3,reg,r));
1462                 checkSetVex(cdb.last(), TYushort8);
1463                 if (config.avx >= 2)
1464                 {
1465                     // VPBROADCASTB X/YMM,XMM
1466                     cdb.gen2(VPBROADCASTB, modregxrmx(3,reg,reg));
1467                     checkSetVex(cdb.last(), ty);
1468                 }
1469                 else
1470                 {
1471                     if (config.avx)
1472                     {
1473                         reg_t zeroreg;
1474                         regm = XMMREGS & ~retregs;
1475                         // VPXOR XMM1,XMM1,XMM1
1476                         allocreg(cdb,&regm,&zeroreg, ty);
1477                         zeroreg -= XMM0;
1478                         cdb.gen2(PXOR, modregxrmx(3,zeroreg,zeroreg));
1479                         checkSetVex(cdb.last(), TYuchar16); // AVX-128
1480                         // VPSHUFB XMM,XMM,XMM1
1481                         cdb.gen2(PSHUFB, modregxrmx(3,reg,zeroreg));
1482                         checkSetVex(cdb.last(), TYuchar16); // AVX-128
1483                     }
1484                     else
1485                     {
1486                         // PUNPCKLBW XMM,XMM
1487                         cdb.gen2(PUNPCKLBW, modregxrmx(3,reg,reg));
1488                         // PUNPCKLWD XMM,XMM
1489                         cdb.gen2(PUNPCKLWD, modregxrmx(3,reg,reg));
1490                         // PSHUFD XMM,XMM,0
1491                         cdb.genc2(PSHUFD, modregxrmx(3,reg,reg), 0);
1492                     }
1493                     if (tysize(ty) == 32)
1494                     {
1495                         // VINSERTF128 YMM,YMM,XMM,1
1496                         cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1);
1497                         checkSetVex(cdb.last(), ty);
1498                     }
1499                 }
1500             }
1501             break;
1502 
1503         case TYshort8:
1504         case TYushort8:
1505         case TYshort16:
1506         case TYushort16:
1507             if (config.avx >= 2 && e1.Eoper == OPind && !e1.Ecount)
1508             {
1509                 // VPBROADCASTW X/YMM,MEM
1510                 getlvalue(cdb,&cs, e1, 0);         // get addressing mode
1511                 assert((cs.Irm & 0xC0) != 0xC0);   // AVX1 doesn't have register source operands
1512                 reg_t reg;
1513                 allocreg(cdb,&retregs,&reg,ty);
1514                 cs.Iop = VPBROADCASTW;
1515                 cs.Irex &= ~REX_W;
1516                 cs.Iflags &= ~CFopsize;
1517                 code_newreg(&cs,reg - XMM0);
1518                 checkSetVex(&cs,ty);
1519                 cdb.gen(&cs);
1520             }
1521             else
1522             {
1523                 regm_t regm = ALLREGS;
1524                 codelem(cdb,e1,&regm,true); // eval left leaf
1525                 reg_t r = findreg(regm);
1526 
1527                 reg_t reg;
1528                 allocreg(cdb,&retregs,&reg, e.Ety);
1529                 reg -= XMM0;
1530                 // (V)MOVD reg,r
1531                 cdb.gen2(LODD,modregxrmx(3,reg,r));
1532                 checkSetVex(cdb.last(), TYushort8);
1533                 if (config.avx >= 2)
1534                 {
1535                     // VPBROADCASTW X/YMM,XMM
1536                     cdb.gen2(VPBROADCASTW, modregxrmx(3,reg,reg));
1537                     checkSetVex(cdb.last(), ty);
1538                 }
1539                 else
1540                 {
1541                     // (V)PUNPCKLWD XMM,XMM
1542                     cdb.gen2(PUNPCKLWD, modregxrmx(3,reg,reg));
1543                     checkSetVex(cdb.last(), TYushort8); // AVX-128
1544                     // (V)PSHUFD XMM,XMM,0
1545                     cdb.genc2(PSHUFD, modregxrmx(3,reg,reg), 0);
1546                     checkSetVex(cdb.last(), TYushort8); // AVX-128
1547                     if (tysize(ty) == 32)
1548                     {
1549                         // VINSERTF128 YMM,YMM,XMM,1
1550                         cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1);
1551                         checkSetVex(cdb.last(), ty);
1552                     }
1553                 }
1554             }
1555             break;
1556 
1557         case TYlong8:
1558         case TYulong8:
1559         case TYlong4:
1560         case TYulong4:
1561             if (config.avx && e1.Eoper == OPind && !e1.Ecount)
1562             {
1563                 // VPBROADCASTD/VBROADCASTSS X/YMM,MEM
1564                 getlvalue(cdb,&cs, e1, 0);         // get addressing mode
1565                 assert((cs.Irm & 0xC0) != 0xC0);   // AVX1 doesn't have register source operands
1566                 reg_t reg;
1567                 allocreg(cdb,&retregs,&reg,ty);
1568                 cs.Iop = config.avx >= 2 ? VPBROADCASTD : VBROADCASTSS;
1569                 cs.Irex &= ~REX_W;
1570                 code_newreg(&cs,reg - XMM0);
1571                 checkSetVex(&cs,ty);
1572                 cdb.gen(&cs);
1573             }
1574             else
1575             {
1576                 codelem(cdb,e1,&retregs,true); // eval left leaf
1577                 const reg = cast(reg_t)(findreg(retregs) - XMM0);
1578                 getregs(cdb,retregs);
1579                 if (config.avx >= 2)
1580                 {
1581                     // VPBROADCASTD X/YMM,XMM
1582                     cdb.gen2(VPBROADCASTD, modregxrmx(3,reg,reg));
1583                     checkSetVex(cdb.last(), ty);
1584                 }
1585                 else
1586                 {
1587                     // (V)PSHUFD XMM,XMM,0
1588                     cdb.genc2(PSHUFD, modregxrmx(3,reg,reg), 0);
1589                     checkSetVex(cdb.last(), TYulong4); // AVX-128
1590                     if (tysize(ty) == 32)
1591                     {
1592                         // VINSERTF128 YMM,YMM,XMM,1
1593                         cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1);
1594                         checkSetVex(cdb.last(), ty);
1595                     }
1596                 }
1597             }
1598             break;
1599 
1600         case TYllong2:
1601         case TYullong2:
1602         case TYllong4:
1603         case TYullong4:
1604             if (e1.Eoper == OPind && !e1.Ecount)
1605             {
1606                 // VPBROADCASTQ/VBROADCASTSD/(V)PUNPCKLQDQ X/YMM,MEM
1607                 getlvalue(cdb,&cs, e1, 0);         // get addressing mode
1608                 assert((cs.Irm & 0xC0) != 0xC0);   // AVX1 doesn't have register source operands
1609                 reg_t reg;
1610                 allocreg(cdb,&retregs,&reg,ty);
1611                 cs.Iop = config.avx >= 2 ? VPBROADCASTQ : tysize(ty) == 32 ? VBROADCASTSD : PUNPCKLQDQ;
1612                 cs.Irex &= ~REX_W;
1613                 code_newreg(&cs,reg - XMM0);
1614                 checkSetVex(&cs,ty);
1615                 cdb.gen(&cs);
1616             }
1617             else
1618             {
1619                 codelem(cdb,e1,&retregs,true); // eval left leaf
1620                 const reg = cast(reg_t)(findreg(retregs) - XMM0);
1621                 getregs(cdb,retregs);
1622                 if (config.avx >= 2)
1623                 {
1624                     // VPBROADCASTQ X/YMM,XMM
1625                     cdb.gen2(VPBROADCASTQ, modregxrmx(3,reg,reg));
1626                     checkSetVex(cdb.last(), ty);
1627                 }
1628                 else
1629                 {
1630                     // (V)PUNPCKLQDQ XMM,XMM
1631                     cdb.genc2(PUNPCKLQDQ, modregxrmx(3,reg,reg), 0);
1632                     checkSetVex(cdb.last(), TYullong2); // AVX-128
1633                     if (tysize(ty) == 32)
1634                     {
1635                         // VINSERTF128 YMM,YMM,XMM,1
1636                         cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1);
1637                         checkSetVex(cdb.last(), ty);
1638                     }
1639                 }
1640             }
1641             break;
1642 
1643         default:
1644             assert(0);
1645     }
1646 
1647     fixresult(cdb,e,retregs,pretregs);
1648 }
1649 
1650 /*******************************************
1651  * Determine if lvalue e is a vector aligned on a 16/32 byte boundary.
1652  * Assume it to be aligned unless can prove it is not.
1653  * Params:
1654  *      e = lvalue
1655  * Returns:
1656  *      false if definitely not aligned
1657  */
1658 
1659 bool xmmIsAligned(elem *e)
1660 {
1661     if (tyvector(e.Ety) && e.Eoper == OPvar)
1662     {
1663         Symbol *s = e.EV.Vsym;
1664         const alignsz = tyalignsize(e.Ety);
1665         if (Symbol_Salignsize(s) < alignsz ||
1666             e.EV.Voffset & (alignsz - 1) ||
1667             alignsz > STACKALIGN
1668            )
1669             return false;       // definitely not aligned
1670     }
1671     return true;        // assume aligned
1672 }
1673 
1674 /**************************************
1675  * VEX prefixes can be 2 or 3 bytes.
1676  * If it must be 3 bytes, set the CFvex3 flag.
1677  */
1678 
1679 void checkSetVex3(code *c)
1680 {
1681     // See Intel Vol. 2A 2.3.5.6
1682     if (c.Ivex.w || !c.Ivex.x || !c.Ivex.b || c.Ivex.mmmm > 0x1 ||
1683         !I64 && (c.Ivex.r || !(c.Ivex.vvvv & 8))
1684        )
1685     {
1686         c.Iflags |= CFvex3;
1687     }
1688 }
1689 
1690 /*************************************
1691  * Determine if operation should be rewritten as a VEX
1692  * operation; and do so.
1693  * Params:
1694  *      c = code
1695  *      ty = type of operand
1696  */
1697 
1698 void checkSetVex(code *c, tym_t ty)
1699 {
1700     //printf("checkSetVex() %d %x\n", tysize(ty), c.Iop);
1701     if (config.avx || tysize(ty) == 32)
1702     {
1703         uint vreg = (c.Irm >> 3) & 7;
1704         if (c.Irex & REX_R)
1705             vreg |= 8;
1706 
1707         // TODO: This is too simplistic, depending on the instruction, vex.vvvv
1708         // encodes NDS, NDD, DDS, or no operand (NOO). The code below assumes
1709         // NDS (non-destructive source), except for the incomplete list of 2
1710         // operand instructions (NOO) handled by the switch.
1711         switch (c.Iop)
1712         {
1713             case LODSS:
1714             case LODSD:
1715             case STOSS:
1716             case STOSD:
1717                 if ((c.Irm & 0xC0) == 0xC0)
1718                     break;
1719                 goto case LODAPS;
1720 
1721             case LODAPS:
1722             case LODUPS:
1723             case LODAPD:
1724             case LODUPD:
1725             case LODDQA:
1726             case LODDQU:
1727             case LODD:
1728             case LODQ:
1729             case STOAPS:
1730             case STOUPS:
1731             case STOAPD:
1732             case STOUPD:
1733             case STODQA:
1734             case STODQU:
1735             case STOD:
1736             case STOQ:
1737             case COMISS:
1738             case COMISD:
1739             case UCOMISS:
1740             case UCOMISD:
1741             case MOVDDUP:
1742             case MOVSHDUP:
1743             case MOVSLDUP:
1744             case VBROADCASTSS:
1745             case PSHUFD:
1746             case PSHUFHW:
1747             case PSHUFLW:
1748             case VPBROADCASTB:
1749             case VPBROADCASTW:
1750             case VPBROADCASTD:
1751             case VPBROADCASTQ:
1752                 vreg = 0;       // for 2 operand vex instructions
1753                 break;
1754 
1755             case VBROADCASTSD:
1756             case VBROADCASTF128:
1757             case VBROADCASTI128:
1758                 assert(tysize(ty) == 32); // AVX-256 only instructions
1759                 vreg = 0;       // for 2 operand vex instructions
1760                 break;
1761 
1762             case NOP:
1763                 return;         // ignore
1764 
1765             default:
1766                 break;
1767         }
1768 
1769         opcode_t op = 0xC4000000 | (c.Iop & 0xFF);
1770         switch (c.Iop & 0xFFFFFF00)
1771         {
1772             static uint MM_PP(uint mm, uint pp) { return (mm << 16) | (pp << 8); }
1773             case 0x00000F00: op |= MM_PP(1,0); break;
1774             case 0x00660F00: op |= MM_PP(1,1); break;
1775             case 0x00F30F00: op |= MM_PP(1,2); break;
1776             case 0x00F20F00: op |= MM_PP(1,3); break;
1777             case 0x660F3800: op |= MM_PP(2,1); break;
1778             case 0x660F3A00: op |= MM_PP(3,1); break;
1779             default:
1780                 printf("Iop = %x\n", c.Iop);
1781                 assert(0);
1782         }
1783         c.Iop = op;
1784         c.Ivex.pfx = 0xC4;
1785         c.Ivex.r = !(c.Irex & REX_R);
1786         c.Ivex.x = !(c.Irex & REX_X);
1787         c.Ivex.b = !(c.Irex & REX_B);
1788         c.Ivex.w = (c.Irex & REX_W) != 0;
1789         c.Ivex.l = tysize(ty) == 32;
1790 
1791         c.Ivex.vvvv = cast(ushort)~vreg;
1792 
1793         c.Iflags |= CFvex;
1794         checkSetVex3(c);
1795     }
1796 }
1797 
1798 /**************************************
1799  * Load complex operand into XMM registers or flags or both.
1800  */
1801 
1802 void cloadxmm(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
1803 {
1804     //printf("e = %p, *pretregs = %s)\n", e, regm_str(*pretregs));
1805     //elem_print(e);
1806     assert(*pretregs & (XMMREGS | mPSW));
1807     if (*pretregs == (mXMM0 | mXMM1) &&
1808         e.Eoper != OPconst)
1809     {
1810         code cs = void;
1811         tym_t tym = tybasic(e.Ety);
1812         tym_t ty = tym == TYcdouble ? TYdouble : TYfloat;
1813         opcode_t opmv = xmmload(tym, xmmIsAligned(e));
1814 
1815         regm_t retregs0 = mXMM0;
1816         reg_t reg0;
1817         allocreg(cdb, &retregs0, &reg0, ty);
1818         loadea(cdb, e, &cs, opmv, reg0, 0, RMload, 0);  // MOVSS/MOVSD XMM0,data
1819         checkSetVex(cdb.last(), ty);
1820 
1821         regm_t retregs1 = mXMM1;
1822         reg_t reg1;
1823         allocreg(cdb, &retregs1, &reg1, ty);
1824         loadea(cdb, e, &cs, opmv, reg1, tysize(ty), RMload, mXMM0); // MOVSS/MOVSD XMM1,data+offset
1825         checkSetVex(cdb.last(), ty);
1826 
1827         return;
1828     }
1829 
1830     // See test/complex.d for cases winding up here
1831     cload87(cdb, e, pretregs);
1832 }
1833 
1834 }