1 /**
2  * Compiler implementation of the
3  * $(LINK2 http://www.dlang.org, D programming language).
4  *
5  * Copyright:   Copyright (C) 2011-2020 by The D Language Foundation, All Rights Reserved
6  * Authors:     $(LINK2 http://www.digitalmars.com, Walter Bright)
7  * License:     $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
8  * Source:      $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/backend/cgxmm.d, backend/cgxmm.d)
9  */
10 
11 module dmd.backend.cgxmm;
12 
13 version (SCPP)
14     version = COMPILE;
15 version (MARS)
16     version = COMPILE;
17 
18 version (COMPILE)
19 {
20 
21 import core.stdc.stdio;
22 import core.stdc.stdlib;
23 import core.stdc..string;
24 
25 import dmd.backend.cc;
26 import dmd.backend.cdef;
27 import dmd.backend.code;
28 import dmd.backend.code_x86;
29 import dmd.backend.codebuilder;
30 import dmd.backend.mem;
31 import dmd.backend.el;
32 import dmd.backend.global;
33 import dmd.backend.oper;
34 import dmd.backend.ty;
35 import dmd.backend.xmm;
36 
37 version (SCPP)
38     import dmd.backend.exh;
39 version (MARS)
40     import dmd.backend.errors;
41 
42 
43 extern (C++):
44 
45 nothrow:
46 
// Forward declarations; the definitions are not in the visible part of this file.
// NOTE(review): REGSIZE is not referenced in the visible part of this file - confirm it is still needed.
int REGSIZE();

// Used below as mask(reg) to turn a register number into a register bit mask.
uint mask(uint m);
50 
/*******************************************
 * Test whether an opcode is one of the XMM store opcodes.
 * Params:
 *      op = opcode to classify
 * Returns:
 *      true if op is one of the STOxxx opcodes listed in the body,
 *      false for every other opcode
 */

bool isXMMstore(opcode_t op)
{
    switch (op)
    {
        case STOSS,  STOAPS, STOUPS,
             STOSD,  STOAPD, STOUPD,
             STOD,   STOQ,   STODQA, STODQU,
             STOHPD, STOHPS, STOLPD, STOLPS:
            return true;

        default:
            return false;
    }
}
66 
/*******************************************
 * Move constant value into xmm register xreg.
 * Params:
 *      cdb = code builder that receives the generated instructions
 *      xreg = destination register; asserted to be one of XMMREGS
 *      sz = size of the constant in bytes; asserted to be 4 or 8
 *      value = bit pattern to load into xreg
 *      flags = not referenced by this function
 */

private void movxmmconst(ref CodeBuilder cdb, reg_t xreg, uint sz, targ_size_t value, regm_t flags)
{
    /* Generate:
     *    MOV reg,value
     *    MOV xreg,reg
     * Not so efficient. We should at least do a PXOR for 0.
     */
    assert(mask(xreg) & XMMREGS);
    assert(sz == 4 || sz == 8);
    if (I32 && sz == 8)
    {
        // The 8 byte constant is stored to floatreg in two 4 byte pieces
        // through a scratch register and then loaded into xreg as a double.
        reg_t r;
        regm_t rm = ALLREGS;
        allocreg(cdb,&rm,&r,TYint);         // allocate scratch register
        static union U { targ_size_t s; targ_long[2] l; }
        U u = void;
        u.l[1] = 0;                         // presumably covers a targ_size_t narrower than 8 bytes - TODO confirm
        u.s = value;
        targ_long *p = &u.l[0];             // p[0] and p[1] are the two pieces of value
        movregconst(cdb,r,p[0],0);
        cdb.genfltreg(STO,r,0);                     // MOV floatreg,r
        movregconst(cdb,r,p[1],0);
        cdb.genfltreg(STO,r,4);                     // MOV floatreg+4,r

        const op = xmmload(TYdouble, true);
        cdb.genxmmreg(op,xreg,0,TYdouble);          // MOVSD XMMreg,floatreg
    }
    else
    {
        // Get the value into a register picked from ALLREGS, then move that
        // register into xreg.
        reg_t reg;
        regwithvalue(cdb,ALLREGS,value,&reg,(sz == 8) ? 64 : 0);
        cdb.gen2(LODD,modregxrmx(3,xreg-XMM0,reg));     // MOVD xreg,reg
        if (sz == 8)
            code_orrex(cdb.last(), REX_W);          // REX_W is added only for the 8 byte case
        checkSetVex(cdb.last(), TYulong);
    }
}
108 
/***********************************************
 * Do simple orthogonal operators for XMM registers.
 *
 * Both operands are evaluated into XMM registers and a single
 * register-register instruction chosen by xmmoperator() is emitted.
 * OPadd/OPmin with one real and one imaginary operand is handled separately:
 * no arithmetic instruction is emitted for it, the two operands are returned
 * as a register pair.
 * Params:
 *      cdb = code builder that receives the generated instructions
 *      e = binary operator elem; its operands are e.EV.E1 and e.EV.E2
 *      pretregs = on entry the registers the result is wanted in; passed to
 *                 fixresult() when the registers actually used differ
 */

void orthxmm(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
{
    //printf("orthxmm(e = %p, *pretregs = %s)\n", e, regm_str(*pretregs));
    elem *e1 = e.EV.E1;
    elem *e2 = e.EV.E2;

    // float + ifloat is not actually addition
    if ((e.Eoper == OPadd || e.Eoper == OPmin) &&
        ((tyreal(e1.Ety) && tyimaginary(e2.Ety)) ||
         (tyreal(e2.Ety) && tyimaginary(e1.Ety))))
    {
        regm_t retregs = *pretregs & XMMREGS;
        if (!retregs)
            retregs = XMMREGS;

        // Choose two distinct registers out of retregs: retregs ends up as
        // the single register for e1, rretregs as the single register for e2.
        regm_t rretregs;
        reg_t rreg;
        if (tyreal(e1.Ety))
        {
            const reg = findreg(retregs);
            rreg = findreg(retregs & ~mask(reg));
            retregs = mask(reg);
            rretregs = mask(rreg);
        }
        else
        {
            // Pick the second register, not the first
            rreg = findreg(retregs);
            rretregs = mask(rreg);
            const reg = findreg(retregs & ~rretregs);
            retregs = mask(reg);
        }
        assert(retregs && rretregs);

        codelem(cdb,e1,&retregs,false); // eval left leaf
        scodelem(cdb, e2, &rretregs, retregs, true);  // eval right leaf

        retregs |= rretregs;            // the result occupies both registers
        if (e.Eoper == OPmin)
        {
            // Subtraction: flip the sign of the right operand by XORing it
            // with a sign bit mask loaded into a third XMM register.
            regm_t nretregs = XMMREGS & ~retregs;
            reg_t sreg; // hold sign bit
            const uint sz = tysize(e1.Ety);
            allocreg(cdb,&nretregs,&sreg,e2.Ety);
            targ_size_t signbit = 0x80000000;
            if (sz == 8)
                signbit = cast(targ_size_t)0x8000000000000000L;
            movxmmconst(cdb,sreg, sz, signbit, 0);
            getregs(cdb,nretregs);
            const opcode_t xop = (sz == 8) ? XORPD : XORPS;       // XORPD/S rreg,sreg
            cdb.gen2(xop,modregxrmx(3,rreg-XMM0,sreg-XMM0));
        }
        if (retregs != *pretregs)
            fixresult(cdb,e,retregs,pretregs);
        return;
    }

    // General case: left operand in reg, right operand in a different XMM
    // register rreg.
    regm_t retregs = *pretregs & XMMREGS;
    if (!retregs)
        retregs = XMMREGS;
    const constflag = OTrel(e.Eoper);
    codelem(cdb,e1,&retregs,constflag); // eval left leaf
    const reg = findreg(retregs);
    regm_t rretregs = XMMREGS & ~retregs;
    scodelem(cdb, e2, &rretregs, retregs, true);  // eval right leaf

    const rreg = findreg(rretregs);
    const op = xmmoperator(e1.Ety, e.Eoper);

    /* We should take advantage of mem addressing modes for OP XMM,MEM
     * but we do not at the moment.
     */
    if (OTrel(e.Eoper))
    {
        // Relational operator: the operands are encoded as (rreg,reg), the
        // reverse of the arithmetic case below, and no result register is
        // produced (fixresult() is not called on this path).
        cdb.gen2(op,modregxrmx(3,rreg-XMM0,reg-XMM0));
        checkSetVex(cdb.last(), e1.Ety);
        return;
    }

    getregs(cdb,retregs);               // reg is about to be overwritten with the result
    cdb.gen2(op,modregxrmx(3,reg-XMM0,rreg-XMM0));
    checkSetVex(cdb.last(), e1.Ety);
    if (retregs != *pretregs)
        fixresult(cdb,e,retregs,pretregs);
}
198 
199 
/************************
 * Generate code for an assignment using XMM registers.
 *
 * The rvalue e2 is evaluated into an XMM register, which is then stored to
 * the lvalue e1. The form (*p++ = ...) / (*p-- = ...) with p a register
 * variable is recognized and the pointer adjustment is emitted after the store.
 * Params:
 *      cdb = code builder that receives the generated instructions
 *      e = the assignment elem; passed to fixresult() for the result
 *      op = store opcode to use, CMP means generate one from the type of e1
 *      e1 = lvalue; freed with freenode() before returning
 *      e2 = rvalue
 *      pretregs = registers the result is wanted in; mPSW may be cleared from
 *                 it when e1 is a leaf
 */

void xmmeq(ref CodeBuilder cdb, elem *e, opcode_t op, elem *e1, elem *e2,regm_t *pretregs)
{
    code cs;
    elem *e11;
    bool regvar;                  /* true means evaluate into register variable */
    regm_t varregm;
    targ_int postinc;

    //printf("xmmeq(e1 = %p, e2 = %p, *pretregs = %s)\n", e1, e2, regm_str(*pretregs));
    tym_t tyml = tybasic(e1.Ety);              /* type of lvalue               */
    regm_t retregs = *pretregs;

    if (!(retregs & XMMREGS))
        retregs = XMMREGS;              // pick any XMM reg

    bool aligned = xmmIsAligned(e1);
    // If default, select store opcode
    cs.Iop = (op == CMP) ? xmmstore(tyml, aligned) : op;
    regvar = false;
    varregm = 0;
    if (config.flags4 & CFG4optimized)
    {
        // Be careful of cases like (x = x+x+x). We cannot evaluate in
        // x if x is in a register.
        reg_t varreg;
        if (isregvar(e1,&varregm,&varreg) &&    // if lvalue is register variable
            doinreg(e1.EV.Vsym,e2) &&           // and we can compute directly into it
            varregm & XMMREGS
           )
        {   regvar = true;
            retregs = varregm;    // evaluate directly in target register
        }
    }
    if (*pretregs & mPSW && OTleaf(e1.Eoper))     // if evaluating e1 couldn't change flags
    {   // Be careful that this lines up with jmpopcode()
        retregs |= mPSW;
        *pretregs &= ~mPSW;
    }
    scodelem(cdb,e2,&retregs,0,true);    // get rvalue

    // Look for special case of (*p++ = ...), where p is a register variable
    if (e1.Eoper == OPind &&
        ((e11 = e1.EV.E1).Eoper == OPpostinc || e11.Eoper == OPpostdec) &&
        e11.EV.E1.Eoper == OPvar &&
        e11.EV.E1.EV.Vsym.Sfl == FLreg
       )
    {
        // Remember the signed step; the pointer is adjusted after the store
        postinc = e11.EV.E2.EV.Vint;
        if (e11.Eoper == OPpostdec)
            postinc = -postinc;
        getlvalue(cdb,&cs,e11,RMstore | retregs);
        freenode(e11.EV.E2);
    }
    else
    {   postinc = 0;
        getlvalue(cdb,&cs,e1,RMstore | retregs);       // get lvalue (cl == CNIL if regvar)
    }

    getregs_imm(cdb,regvar ? varregm : 0);

    // Encode the source XMM register in the reg field of the store
    // instruction; bit 3 of the register number goes into REX.R.
    const reg = findreg(retregs & XMMREGS);
    cs.Irm |= modregrm(0,(reg - XMM0) & 7,0);
    if ((reg - XMM0) & 8)
        cs.Irex |= REX_R;

    // Do not generate mov from register onto itself
    if (!(regvar && reg == XMM0 + ((cs.Irm & 7) | (cs.Irex & REX_B ? 8 : 0))))
    {
        cdb.gen(&cs);         // MOV EA+offset,reg
        // NOTE(review): op is an opcode_t but is compared with the operator
        // OPeq here, while "generate the default store" is signalled with CMP
        // above - confirm whether (op == CMP) was intended.
        if (op == OPeq)
            checkSetVex(cdb.last(), tyml);
    }

    if (e1.Ecount ||                     // if lvalue is a CSE or
        regvar)                           // rvalue can't be a CSE
    {
        getregs_imm(cdb,retregs);        // necessary if both lvalue and
                                        //  rvalue are CSEs (since a reg
                                        //  can hold only one e at a time)
        cssave(e1,retregs,!OTleaf(e1.Eoper));     // if lvalue is a CSE
    }

    fixresult(cdb,e,retregs,pretregs);
    if (postinc)
    {
        const increg = findreg(idxregm(&cs));  // the register to increment
        if (*pretregs & mPSW)
        {   // Use LEA to avoid touching the flags
            uint rm = cs.Irm & 7;
            if (cs.Irex & REX_B)
                rm |= 8;
            cdb.genc1(LEA,buildModregrm(2,increg,rm),FLconst,postinc);
            if (tysize(e11.EV.E1.Ety) == 8)
                code_orrex(cdb.last(), REX_W);
        }
        else if (I64)
        {
            cdb.genc2(0x81,modregrmx(3,0,increg),postinc);
            if (tysize(e11.EV.E1.Ety) == 8)
                code_orrex(cdb.last(), REX_W);
        }
        else
        {
            if (postinc == 1)
                cdb.gen1(0x40 + increg);       // INC increg
            else if (postinc == -cast(targ_int)1)
                cdb.gen1(0x48 + increg);       // DEC increg
            else
            {
                cdb.genc2(0x81,modregrm(3,0,increg),postinc);
            }
        }
    }
    freenode(e1);
}
323 
/********************************
 * Generate code for conversion using SSE2 instructions.
 *
 *      OPs32_d
 *      OPs64_d (64-bit only)
 *      OPu32_d (64-bit only)
 *      OPd_f
 *      OPf_d
 *      OPd_s32
 *      OPd_u32
 *      OPd_s64 (64-bit only)
 *
 * Params:
 *      cdb = code builder that receives the generated instructions
 *      e = conversion elem; any other operator than those above asserts
 *      pretregs = registers the result is wanted in
 */

void xmmcnvt(ref CodeBuilder cdb,elem *e,regm_t *pretregs)
{
    //printf("xmmconvt: %p, %s\n", e, regm_str(*pretregs));
    opcode_t op = NoOpcode;
    regm_t regs;            // registers the operand is evaluated into
    tym_t ty;               // type of the result
    ubyte rex = 0;          // REX bits to OR into the conversion instruction
    bool zx = false; // zero extend uint

    /* There are no ops for integer <-> float/real conversions
     * but there are instructions for them. In order to use these
     * try to fuse chained conversions. Be careful not to lose
     * precision for real to long.
     */
    elem *e1 = e.EV.E1;
    switch (e.Eoper)
    {
    case OPd_f:
        // integer -> double -> float can be done with one CVTSI2SS,
        // but not when the intermediate double is a common subexpression
        if (e1.Eoper == OPs32_d)
        { }
        else if (I64 && e1.Eoper == OPs64_d)
            rex = REX_W;
        else if (I64 && e1.Eoper == OPu32_d)
        {   rex = REX_W;
            zx = true;
        }
        else
        {   regs = XMMREGS;
            op = CVTSD2SS;
            ty = TYfloat;
            break;
        }
        if (e1.Ecount)
        {
            regs = XMMREGS;
            op = CVTSD2SS;
            ty = TYfloat;
            break;
        }
        // directly use si2ss
        regs = ALLREGS;
        e1 = e1.EV.E1;  // fused operation
        op = CVTSI2SS;
        ty = TYfloat;
        break;

    case OPs32_d:              goto Litod;
    case OPs64_d: rex = REX_W; goto Litod;
    case OPu32_d: rex = REX_W; zx = true; goto Litod;
    Litod:
        regs = ALLREGS;
        op = CVTSI2SD;
        ty = TYdouble;
        break;

    case OPd_s32: ty = TYint;  goto Ldtoi;
    case OPd_u32: ty = TYlong; if (I64) rex = REX_W; goto Ldtoi;
    case OPd_s64: ty = TYlong; rex = REX_W; goto Ldtoi;
    Ldtoi:
        regs = XMMREGS;
        switch (e1.Eoper)
        {
        case OPf_d:
            // float -> double -> integer can be done with one CVTTSS2SI,
            // but not when the intermediate double is a common subexpression
            if (e1.Ecount)
            {
                op = CVTTSD2SI;
                break;
            }
            e1 = e1.EV.E1;      // fused operation
            op = CVTTSS2SI;
            break;
        case OPld_d:
            if (e.Eoper == OPd_s64)
            {
                cnvt87(cdb,e,pretregs); // precision
                return;
            }
            goto default;

        default:
            op = CVTTSD2SI;
            break;
        }
        break;

    case OPf_d:
        regs = XMMREGS;
        op = CVTSS2SD;
        ty = TYdouble;
        break;

    default:
        assert(0);
    }
    assert(op != NoOpcode);

    codelem(cdb,e1, &regs, false);
    reg_t reg = findreg(regs);
    if (isXMMreg(reg))
        reg -= XMM0;            // make the register number relative to XMM0
    else if (zx)
    {   assert(I64);
        getregs(cdb,regs);
        genregs(cdb,0x8B,reg,reg); // MOV reg,reg to zero upper 32-bit
                                   // Don't use x89 because that will get optimized away
        code_orflag(cdb.last(),CFvolatile);
    }

    regm_t retregs = *pretregs;
    if (tyxmmreg(ty)) // target is XMM
    {   if (!(*pretregs & XMMREGS))
            retregs = XMMREGS;
    }
    else              // source is XMM
    {   assert(regs & XMMREGS);
        if (!(retregs & ALLREGS))
            retregs = ALLREGS;
    }

    reg_t rreg;
    allocreg(cdb,&retregs,&rreg,ty);
    if (isXMMreg(rreg))
        rreg -= XMM0;           // make the register number relative to XMM0

    cdb.gen2(op, modregxrmx(3,rreg,reg));
    assert(I64 || !rex);        // rex is only ever expected to be set when I64
    if (rex)
        code_orrex(cdb.last(), rex);

    if (*pretregs != retregs)
        fixresult(cdb,e,retregs,pretregs);
}
469 
/********************************
 * Generate code for op=
 *
 * The right operand is evaluated into an XMM register first. The left
 * operand is then either operated on in place (when it is a register
 * variable and optimization is on) or loaded from memory, operated on and
 * stored back.
 * Params:
 *      cdb = code builder that receives the generated instructions
 *      e = op= elem; operands are e.EV.E1 (lvalue) and e.EV.E2 (rvalue)
 *      pretregs = registers the result is wanted in
 */

void xmmopass(ref CodeBuilder cdb,elem *e,regm_t *pretregs)
{   elem *e1 = e.EV.E1;
    elem *e2 = e.EV.E2;
    tym_t ty1 = tybasic(e1.Ety);
    // Prefer a register for the right operand that is not wanted for the result
    regm_t rretregs = XMMREGS & ~*pretregs;
    if (!rretregs)
        rretregs = XMMREGS;

    codelem(cdb,e2,&rretregs,false); // eval right leaf
    reg_t rreg = findreg(rretregs);

    code cs;
    regm_t retregs;
    reg_t reg;
    bool regvar = false;
    if (config.flags4 & CFG4optimized)
    {
        // Be careful of cases like (x = x+x+x). We cannot evaluate in
        // x if x is in a register.
        reg_t varreg;
        regm_t varregm;
        if (isregvar(e1,&varregm,&varreg) &&    // if lvalue is register variable
            doinreg(e1.EV.Vsym,e2)          // and we can compute directly into it
           )
        {   regvar = true;
            retregs = varregm;
            reg = varreg;                       // evaluate directly in target register
            getregs(cdb,retregs);       // destroy these regs
        }
    }

    if (!regvar)
    {
        // Load the current value of the lvalue into a register other than rreg
        getlvalue(cdb,&cs,e1,rretregs);         // get EA
        retregs = *pretregs & XMMREGS & ~rretregs;
        if (!retregs)
            retregs = XMMREGS & ~rretregs;
        allocreg(cdb,&retregs,&reg,ty1);
        cs.Iop = xmmload(ty1, true);            // MOVSD xmm,xmm_m64
        code_newreg(&cs,reg - XMM0);
        cdb.gen(&cs);
        checkSetVex(cdb.last(), ty1);
    }

    const op = xmmoperator(e1.Ety, e.Eoper);
    cdb.gen2(op,modregxrmx(3,reg-XMM0,rreg-XMM0));      // OP reg,rreg
    checkSetVex(cdb.last(), e1.Ety);

    if (!regvar)
    {
        // Store the result back through the same effective address
        cs.Iop = xmmstore(ty1,true);      // reverse operand order of MOVS[SD]
        cdb.gen(&cs);
        checkSetVex(cdb.last(), ty1);
    }

    if (e1.Ecount ||                     // if lvalue is a CSE or
        regvar)                           // rvalue can't be a CSE
    {
        getregs_imm(cdb,retregs);        // necessary if both lvalue and
                                        //  rvalue are CSEs (since a reg
                                        //  can hold only one e at a time)
        cssave(e1,retregs,!OTleaf(e1.Eoper));     // if lvalue is a CSE
    }

    fixresult(cdb,e,retregs,pretregs);
    freenode(e1);
}
542 
/********************************
 * Generate code for post increment and post decrement.
 *
 * The current value of the lvalue is copied to a separate result register
 * before the operation is applied, so the result of the expression is the
 * value from before the update.
 * Params:
 *      cdb = code builder that receives the generated instructions
 *      e = post increment/decrement elem; e.EV.E1 is the lvalue, e.EV.E2 the amount
 *      pretregs = registers the result is wanted in
 */

void xmmpost(ref CodeBuilder cdb,elem *e,regm_t *pretregs)
{
    elem *e1 = e.EV.E1;
    elem *e2 = e.EV.E2;
    tym_t ty1 = tybasic(e1.Ety);

    regm_t retregs;
    reg_t reg;
    bool regvar = false;
    if (config.flags4 & CFG4optimized)
    {
        // Be careful of cases like (x = x+x+x). We cannot evaluate in
        // x if x is in a register.
        reg_t varreg;
        regm_t varregm;
        if (isregvar(e1,&varregm,&varreg) &&    // if lvalue is register variable
            doinreg(e1.EV.Vsym,e2)          // and we can compute directly into it
           )
        {
            regvar = true;
            retregs = varregm;
            reg = varreg;                       // evaluate directly in target register
            getregs(cdb,retregs);       // destroy these regs
        }
    }

    code cs;
    if (!regvar)
    {
        // Load the lvalue into a work register, preferring one that is not
        // wanted for the result
        getlvalue(cdb,&cs,e1,0);                // get EA
        retregs = XMMREGS & ~*pretregs;
        if (!retregs)
            retregs = XMMREGS;
        allocreg(cdb,&retregs,&reg,ty1);
        cs.Iop = xmmload(ty1, true);            // MOVSD xmm,xmm_m64
        code_newreg(&cs,reg - XMM0);
        cdb.gen(&cs);
        checkSetVex(cdb.last(), ty1);
    }

    // Result register
    regm_t resultregs = XMMREGS & *pretregs & ~retregs;
    if (!resultregs)
        resultregs = XMMREGS & ~retregs;
    reg_t resultreg;
    allocreg(cdb,&resultregs, &resultreg, ty1);

    // Save the value from before the update
    cdb.gen2(xmmload(ty1,true),modregxrmx(3,resultreg-XMM0,reg-XMM0));   // MOVSS/D resultreg,reg
    checkSetVex(cdb.last(), ty1);

    // Evaluate the amount into a register distinct from both reg and resultreg
    regm_t rretregs = XMMREGS & ~(*pretregs | retregs | resultregs);
    if (!rretregs)
        rretregs = XMMREGS & ~(retregs | resultregs);
    codelem(cdb,e2,&rretregs,false); // eval right leaf
    const rreg = findreg(rretregs);

    const op = xmmoperator(e1.Ety, e.Eoper);
    cdb.gen2(op,modregxrmx(3,reg-XMM0,rreg-XMM0));  // ADD reg,rreg
    checkSetVex(cdb.last(), e1.Ety);

    if (!regvar)
    {
        // Store the updated value back through the same effective address
        cs.Iop = xmmstore(ty1,true);      // reverse operand order of MOVS[SD]
        cdb.gen(&cs);
        checkSetVex(cdb.last(), ty1);
    }

    if (e1.Ecount ||                     // if lvalue is a CSE or
        regvar)                           // rvalue can't be a CSE
    {
        getregs_imm(cdb,retregs); // necessary if both lvalue and
                                        //  rvalue are CSEs (since a reg
                                        //  can hold only one e at a time)
        cssave(e1,retregs,!OTleaf(e1.Eoper));     // if lvalue is a CSE
    }

    // The expression result is the saved old value, not the updated one
    fixresult(cdb,e,resultregs,pretregs);
    freenode(e1);
}
626 
/******************
 * Negate operator
 *
 * The operand is negated by XORing it with a constant that has only the
 * sign bit set.
 * Params:
 *      cdb = code builder that receives the generated instructions
 *      e = negate elem; the operand is e.EV.E1
 *      pretregs = registers the result is wanted in; asserted to be non-zero
 */

void xmmneg(ref CodeBuilder cdb,elem *e,regm_t *pretregs)
{
    //printf("xmmneg()\n");
    //elem_print(e);
    assert(*pretregs);
    tym_t tyml = tybasic(e.EV.E1.Ety);
    int sz = _tysize[tyml];

    regm_t retregs = *pretregs & XMMREGS;
    if (!retregs)
        retregs = XMMREGS;

    /* Generate:
     *    MOV reg,e1
     *    MOV rreg,signbit
     *    XOR reg,rreg
     */
    codelem(cdb,e.EV.E1,&retregs,false);
    getregs(cdb,retregs);
    const reg = findreg(retregs);
    regm_t rretregs = XMMREGS & ~retregs;       // the mask goes in a different register
    reg_t rreg;
    allocreg(cdb,&rretregs,&rreg,tyml);
    // Sign bit is bit 31 for a 4 byte operand, bit 63 for an 8 byte operand
    targ_size_t signbit = 0x80000000;
    if (sz == 8)
        signbit = cast(targ_size_t)0x8000000000000000L;
    movxmmconst(cdb,rreg, sz, signbit, 0);

    getregs(cdb,retregs);
    const op = (sz == 8) ? XORPD : XORPS;       // XORPD/S reg,rreg
    cdb.gen2(op,modregxrmx(3,reg-XMM0,rreg-XMM0));
    fixresult(cdb,e,retregs,pretregs);
}
664 
/*****************************
 * Select the load opcode that matches a type.
 * Picking the opcode that fits the data matters even when the number of
 * bits moved is the same, because the wrong one costs performance.
 * Params:
 *      tym = type of data to load
 *      aligned = for vectors, true if aligned to 16 bytes;
 *                ignored (treated as false) when the type is 32 bytes in size
 * Returns:
 *      the load opcode; an unsupported type prints its value and asserts
 */

opcode_t xmmload(tym_t tym, bool aligned)
{
    // Types that are 32 bytes in size always get the unaligned form
    const bool is32bytes = tysize(tym) == 32;
    const bool useAligned = aligned && !is32bytes;

    switch (tybasic(tym))
    {
        case TYuint, TYint, TYlong, TYulong:
            return LODD;                                // MOVD

        case TYfloat, TYcfloat, TYifloat:
            return LODSS;                               // MOVSS

        case TYllong, TYullong:
            return LODQ;                                // MOVQ

        case TYdouble, TYcdouble, TYidouble:
            return LODSD;                               // MOVSD

        case TYfloat8, TYfloat4:
            return useAligned ? LODAPS : LODUPS;        // MOVAPS / MOVUPS

        case TYdouble4, TYdouble2:
            return useAligned ? LODAPD : LODUPD;        // MOVAPD / MOVUPD

        case TYschar16, TYuchar16, TYshort8,  TYushort8,
             TYlong4,   TYulong4,  TYllong2,  TYullong2,
             TYschar32, TYuchar32, TYshort16, TYushort16,
             TYlong8,   TYulong8,  TYllong4,  TYullong4:
            return useAligned ? LODDQA : LODDQU;        // MOVDQA / MOVDQU

        default:
            printf("tym = x%x\n", tym);
            assert(0);
    }
}
721 
/*****************************
 * Select the store opcode that matches a type.
 * Params:
 *      tym = type of data to store
 *      aligned = for vectors, selects the aligned form of the opcode
 *                when true and the unaligned form when false
 * Returns:
 *      the store opcode; an unsupported type prints its value and asserts
 */

opcode_t xmmstore(tym_t tym, bool aligned)
{
    switch (tybasic(tym))
    {
        case TYuint, TYint, TYlong, TYulong:
            return STOD;                                // MOVD

        case TYfloat, TYifloat:
            return STOSS;                               // MOVSS

        case TYllong, TYullong:
            return STOQ;                                // MOVQ

        case TYdouble, TYidouble, TYcdouble, TYcfloat:
            return STOSD;                               // MOVSD

        case TYfloat8, TYfloat4:
            return aligned ? STOAPS : STOUPS;           // MOVAPS / MOVUPS

        case TYdouble4, TYdouble2:
            return aligned ? STOAPD : STOUPD;           // MOVAPD / MOVUPD

        case TYschar16, TYuchar16, TYshort8,  TYushort8,
             TYlong4,   TYulong4,  TYllong2,  TYullong2,
             TYschar32, TYuchar32, TYshort16, TYushort16,
             TYlong8,   TYulong8,  TYllong4,  TYullong4:
            return aligned ? STODQA : STODQU;           // MOVDQA / MOVDQU

        default:
            printf("tym = 0x%x\n", tym);
            assert(0);
    }
}
771 
772 
/************************************
 * Select the XMM opcode for a combination of operand type and operator.
 * Params:
 *      tym = operand type; only its basic type is looked at
 *      oper = operator: add, subtract, multiply, divide, or, and, xor
 *             (plain, op= and, for add/subtract, post inc/dec forms)
 *             or one of the relational operators
 * Returns:
 *      the opcode; any combination not listed below asserts
 */

private opcode_t xmmoperator(tym_t tym, OPER oper)
{
    const tym_t ty = tybasic(tym);

    switch (oper)
    {
        case OPadd, OPaddass, OPpostinc:
            switch (ty)
            {
                case TYfloat,  TYifloat:    return ADDSS;
                case TYdouble, TYidouble:   return ADDSD;

                // SIMD vector types
                case TYfloat8,  TYfloat4:   return ADDPS;
                case TYdouble4, TYdouble2:  return ADDPD;
                case TYschar32, TYuchar32,
                     TYschar16, TYuchar16:  return PADDB;
                case TYshort16, TYushort16,
                     TYshort8,  TYushort8:  return PADDW;
                case TYlong8,   TYulong8,
                     TYlong4,   TYulong4:   return PADDD;
                case TYllong4,  TYullong4,
                     TYllong2,  TYullong2:  return PADDQ;

                default:
                    printf("tym = x%x\n", ty);
                    assert(0);
            }

        case OPmin, OPminass, OPpostdec:
            switch (ty)
            {
                case TYfloat,  TYifloat:    return SUBSS;
                case TYdouble, TYidouble:   return SUBSD;

                // SIMD vector types
                case TYfloat8,  TYfloat4:   return SUBPS;
                case TYdouble4, TYdouble2:  return SUBPD;
                case TYschar32, TYuchar32,
                     TYschar16, TYuchar16:  return PSUBB;
                case TYshort16, TYushort16,
                     TYshort8,  TYushort8:  return PSUBW;
                case TYlong8,   TYulong8,
                     TYlong4,   TYulong4:   return PSUBD;
                case TYllong4,  TYullong4,
                     TYllong2,  TYullong2:  return PSUBQ;

                default:
                    assert(0);
            }

        case OPmul, OPmulass:
            switch (ty)
            {
                case TYfloat,  TYifloat:    return MULSS;
                case TYdouble, TYidouble:   return MULSD;

                // SIMD vector types
                case TYfloat8,  TYfloat4:   return MULPS;
                case TYdouble4, TYdouble2:  return MULPD;
                case TYshort16, TYushort16,
                     TYshort8,  TYushort8:  return PMULLW;
                case TYlong8,   TYulong8,
                     TYlong4,   TYulong4:   return PMULLD;

                default:
                    assert(0);
            }

        case OPdiv, OPdivass:
            switch (ty)
            {
                case TYfloat,  TYifloat:    return DIVSS;
                case TYdouble, TYidouble:   return DIVSD;

                // SIMD vector types
                case TYfloat8,  TYfloat4:   return DIVPS;
                case TYdouble4, TYdouble2:  return DIVPD;

                default:
                    assert(0);
            }

        case OPor, OPorass:
            switch (ty)
            {
                // SIMD vector types
                case TYschar16, TYuchar16, TYshort8,  TYushort8,
                     TYlong4,   TYulong4,  TYllong2,  TYullong2,
                     TYschar32, TYuchar32, TYshort16, TYushort16,
                     TYlong8,   TYulong8,  TYllong4,  TYullong4:
                    return POR;

                default:
                    assert(0);
            }

        case OPand, OPandass:
            switch (ty)
            {
                // SIMD vector types
                case TYschar16, TYuchar16, TYshort8,  TYushort8,
                     TYlong4,   TYulong4,  TYllong2,  TYullong2,
                     TYschar32, TYuchar32, TYshort16, TYushort16,
                     TYlong8,   TYulong8,  TYllong4,  TYullong4:
                    return PAND;

                default:
                    assert(0);
            }

        case OPxor, OPxorass:
            switch (ty)
            {
                // SIMD vector types
                case TYschar16, TYuchar16, TYshort8,  TYushort8,
                     TYlong4,   TYulong4,  TYllong2,  TYullong2,
                     TYschar32, TYuchar32, TYshort16, TYushort16,
                     TYlong8,   TYulong8,  TYllong4,  TYullong4:
                    return PXOR;

                default:
                    assert(0);
            }

        // Every relational operator maps to the same compare instruction
        case OPlt,   OPle,   OPgt,   OPge,   OPne,   OPeqeq,
             OPunord,        /* !<>=         */
             OPlg,           /* <>           */
             OPleg,          /* <>=          */
             OPule,          /* !>           */
             OPul,           /* !>=          */
             OPuge,          /* !<           */
             OPug,           /* !<=          */
             OPue,           /* !<>          */
             OPngt,  OPnge,  OPnlt,  OPnle,  OPord,
             OPnlg,  OPnleg, OPnule, OPnul,  OPnuge, OPnug,  OPnue:
            switch (ty)
            {
                case TYfloat,  TYifloat:    return UCOMISS;
                case TYdouble, TYidouble:   return UCOMISD;

                default:
                    assert(0);
            }

        default:
            assert(0);
    }
}
1023 
/***************
 * Generate code for a "vector" elem, i.e. an explicitly specified
 * XMM instruction with its operands.
 * The operands hang off e.EV.E1 as an OPparam list which is flattened
 * into an array: params[0] is the opcode (a constant), params[1] is the
 * first operand, and params[2] / params[3] are the optional second
 * operand and imm8.
 * Params:
 *      cdb = code sink
 *      e = the vector elem
 *      pretregs = registers the result is wanted in; 0 means the operands
 *                 are evaluated for their side effects only
 */
void cdvector(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
{
    /* e should look like one of:
     *    vector
     *      |
     *    param
     *    /   \
     *  param op2
     *  /   \
     * op   op1
     */

    if (!config.fpxmmregs)
    {   printf("SIMD operations not supported on this platform\n");
        exit(1);
    }

    // Flatten the OPparam tree into a heap allocated array of elem pointers.
    // el_paramArray() advances the pointer it is given, hence the copy in tmp.
    const n = el_nparams(e.EV.E1);
    elem **params = cast(elem **)malloc(n * (elem *).sizeof);
    assert(params);
    elem **tmp = params;
    el_paramArray(&tmp, e.EV.E1);

static if (0)
{
    printf("cdvector()\n");
    for (int i = 0; i < n; i++)
    {
        printf("[%d]: ", i);
        elem_print(params[i]);
    }
}

    if (*pretregs == 0)
    {   /* Evaluate for side effects only
         */
        foreach (i; 0 .. n)
        {
            codelem(cdb,params[i], pretregs, false);
            *pretregs = 0;      // in case they got set
        }
        free(params);           // don't leak the parameter array on this early exit
        return;
    }

    assert(n >= 2 && n <= 4);

    elem *eop = params[0];
    elem *op1 = params[1];
    elem *op2 = null;
    tym_t ty2 = 0;
    if (n >= 3)
    {   op2 = params[2];
        ty2 = tybasic(op2.Ety);
    }

    auto op = cast(opcode_t)el_tolong(eop);
    debug assert(!isXMMstore(op));      // stores are handled by cdvecsto()
    tym_t ty1 = tybasic(op1.Ety);

    regm_t retregs;
    if (n == 3 && ty2 == TYuchar && op2.Eoper == OPconst)
    {   // Handle: op xmm,imm8

        retregs = *pretregs & XMMREGS;
        if (!retregs)
            retregs = XMMREGS;
        codelem(cdb,op1,&retregs,false); // eval left leaf
        const reg = findreg(retregs);

        // The immediate forms of the shifts share an opcode and are
        // distinguished by the value r placed in the reg field of the modregrm byte.
        int r;
        switch (op)
        {
            case PSLLD:  r = 6; op = 0x660F72;  break;
            case PSLLQ:  r = 6; op = 0x660F73;  break;
            case PSLLW:  r = 6; op = 0x660F71;  break;
            case PSRAD:  r = 4; op = 0x660F72;  break;
            case PSRAW:  r = 4; op = 0x660F71;  break;
            case PSRLD:  r = 2; op = 0x660F72;  break;
            case PSRLQ:  r = 2; op = 0x660F73;  break;
            case PSRLW:  r = 2; op = 0x660F71;  break;
            case PSRLDQ: r = 3; op = 0x660F73;  break;
            case PSLLDQ: r = 7; op = 0x660F73;  break;

            default:
                printf("op = x%x\n", op);
                assert(0);
        }
        getregs(cdb,retregs);
        cdb.genc2(op,modregrmx(3,r,reg-XMM0), cast(uint)el_tolong(op2));
    }
    else if (n == 2)
    {   /* Handle: op xmm,mem
         * where xmm is written only, not read
         */
        code cs;

        if ((op1.Eoper == OPind && !op1.Ecount) || op1.Eoper == OPvar)
        {
            getlvalue(cdb,&cs, op1, RMload);     // get addressing mode
        }
        else
        {
            // Operand is not directly addressable: evaluate it into an
            // XMM register and use a register-direct (mod == 3) operand.
            regm_t rretregs = XMMREGS;
            codelem(cdb,op1, &rretregs, false);
            const rreg = findreg(rretregs) - XMM0;
            cs.Irm = modregrm(3,0,rreg & 7);
            cs.Iflags = 0;
            cs.Irex = 0;
            if (rreg & 8)
                cs.Irex |= REX_B;
        }

        retregs = *pretregs & XMMREGS;
        if (!retregs)
            retregs = XMMREGS;
        reg_t reg;
        allocreg(cdb,&retregs, &reg, e.Ety);
        code_newreg(&cs, reg - XMM0);
        cs.Iop = op;
        cdb.gen(&cs);
    }
    else if (n == 3 || n == 4)
    {   /* Handle:
         *      op xmm,mem        // n = 3
         *      op xmm,mem,imm8   // n = 4
         * Both xmm and mem are operands, evaluate xmm first.
         */

        code cs;

        retregs = *pretregs & XMMREGS;
        if (!retregs)
            retregs = XMMREGS;
        codelem(cdb,op1,&retregs,false); // eval left leaf
        const reg = findreg(retregs);

        if ((op2.Eoper == OPind && !op2.Ecount) || op2.Eoper == OPvar)
        {
            getlvalue(cdb,&cs, op2, RMload | retregs);     // get addressing mode
        }
        else
        {
            // Evaluate op2 into a different XMM register than op1's,
            // telling scodelem() to keep retregs intact.
            regm_t rretregs = XMMREGS & ~retregs;
            scodelem(cdb, op2, &rretregs, retregs, true);
            const rreg = findreg(rretregs) - XMM0;
            cs.Irm = modregrm(3,0,rreg & 7);
            cs.Iflags = 0;
            cs.Irex = 0;
            if (rreg & 8)
                cs.Irex |= REX_B;
        }

        getregs(cdb,retregs);
        if (n == 4)
        {
            // Only these opcodes are accepted with a trailing imm8
            switch (op)
            {
                case CMPPD:   case CMPSS:   case CMPSD:   case CMPPS:
                case PSHUFD:  case PSHUFHW: case PSHUFLW:
                case BLENDPD: case BLENDPS: case DPPD:    case DPPS:
                case MPSADBW: case PBLENDW:
                case ROUNDPD: case ROUNDPS: case ROUNDSD: case ROUNDSS:
                case SHUFPD:  case SHUFPS:
                    break;
                default:
                    printf("op = x%x\n", op);
                    assert(0);
            }
            elem *imm8 = params[3];
            cs.IFL2 = FLconst;
version (MARS)
{
            if (imm8.Eoper != OPconst)
            {
                error(imm8.Esrcpos.Sfilename, imm8.Esrcpos.Slinnum, imm8.Esrcpos.Scharnum, "last parameter to `__simd()` must be a constant");
                cs.IEV2.Vsize_t = 0;    // keep generating code with a dummy immediate
            }
            else
                cs.IEV2.Vsize_t = cast(targ_size_t)el_tolong(imm8);
}
else
{
            cs.IEV2.Vsize_t = cast(targ_size_t)el_tolong(imm8);
}
        }
        code_newreg(&cs, reg - XMM0);
        cs.Iop = op;
        cdb.gen(&cs);
    }
    else
        assert(0);
    fixresult(cdb,e,retregs,pretregs);
    free(params);
    freenode(e);
}
1218 
/***************
 * Generate code for a vector "store" operation.
 * The expected shape of e is:
 *  (op1 OPvecsto (op OPparam op2))
 * with op being the constant holding the STOxxxx store opcode.
 * The actual code generation is delegated to xmmeq().
 */
void cdvecsto(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
{
    //printf("cdvecsto()\n");
    //elem_print(e);
    elem *eparam = e.EV.E2;             // the (op OPparam op2) node
    elem *lhs = e.EV.E1;
    elem *eopcode = eparam.EV.E1;
    elem *rhs = eparam.EV.E2;

    const storeOp = cast(opcode_t)el_tolong(eopcode);
    debug assert(isXMMstore(storeOp));

    xmmeq(cdb, e, storeOp, lhs, rhs, pretregs);
}
1236 
/***************
 * Generate code for OPvecfill (broadcast).
 * OPvecfill takes the single value in e1 and
 * fills the vector type with it.
 *
 * For each element type there are two strategies:
 *  1. if e1 is an unshared OPind and a broadcast-from-memory instruction
 *     is available at the configured AVX level, use it directly on the
 *     addressing mode of e1;
 *  2. otherwise get the scalar into the low element of an XMM register
 *     and replicate it with broadcast/shuffle/unpack instructions,
 *     followed by VINSERTF128 to fill the upper half of 32 byte vectors.
 * Params:
 *      cdb = code sink
 *      e = the OPvecfill elem; e.Ety selects the vector type
 *      pretregs = registers the result is wanted in
 */
void cdvecfill(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
{
    //printf("cdvecfill(e = %p, *pretregs = %s)\n",e,regm_str(*pretregs));

    regm_t retregs = *pretregs & XMMREGS;
    if (!retregs)
        retregs = XMMREGS;

    code cs;

    elem *e1 = e.EV.E1;

    const ty = tybasic(e.Ety);
    switch (ty)
    {
        case TYfloat4:
        case TYfloat8:
            if (config.avx && e1.Eoper == OPind && !e1.Ecount)
            {
                // VBROADCASTSS X/YMM,MEM
                getlvalue(cdb,&cs, e1, 0);         // get addressing mode
                assert((cs.Irm & 0xC0) != 0xC0);   // AVX1 doesn't have register source operands
                reg_t reg;
                allocreg(cdb,&retregs,&reg,ty);
                cs.Iop = VBROADCASTSS;
                cs.Irex &= ~REX_W;
                code_newreg(&cs,reg - XMM0);
                checkSetVex(&cs,ty);
                cdb.gen(&cs);
            }
            else
            {
                codelem(cdb,e1,&retregs,false); // eval left leaf
                const reg = cast(reg_t)(findreg(retregs) - XMM0);
                getregs(cdb,retregs);
                if (config.avx >= 2)
                {
                    // VBROADCASTSS X/YMM,XMM
                    cdb.gen2(VBROADCASTSS, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), ty);
                }
                else
                {
                    // (V)SHUFPS XMM,XMM,0
                    cdb.genc2(SHUFPS, modregxrmx(3,reg,reg), 0);
                    checkSetVex(cdb.last(), ty);
                    if (tysize(ty) == 32)
                    {
                        // VINSERTF128 YMM,YMM,XMM,1
                        cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1);
                        checkSetVex(cdb.last(), ty);
                    }
                }
            }
            break;

        case TYdouble2:
        case TYdouble4:
            // VBROADCASTSD only exists in 256 bit form, see the assert in checkSetVex()
            if (config.avx && tysize(ty) == 32 && e1.Eoper == OPind && !e1.Ecount)
            {
                // VBROADCASTSD YMM,MEM
                getlvalue(cdb,&cs, e1, 0);         // get addressing mode
                assert((cs.Irm & 0xC0) != 0xC0);   // AVX1 doesn't have register source operands
                reg_t reg;
                allocreg(cdb,&retregs,&reg,ty);
                cs.Iop = VBROADCASTSD;
                cs.Irex &= ~REX_W;
                code_newreg(&cs,reg - XMM0);
                checkSetVex(&cs,ty);
                cdb.gen(&cs);
            }
            else
            {
                codelem(cdb,e1,&retregs,false); // eval left leaf
                const reg = cast(reg_t)(findreg(retregs) - XMM0);
                getregs(cdb,retregs);
                if (config.avx >= 2 && tysize(ty) == 32)
                {
                    // VBROADCASTSD YMM,XMM
                    cdb.gen2(VBROADCASTSD, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), ty);
                }
                else
                {
                    // (V)UNPCKLPD XMM,XMM
                    cdb.gen2(UNPCKLPD, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), TYdouble2); // AVX-128
                    if (tysize(ty) == 32)
                    {
                        // VINSERTF128 YMM,YMM,XMM,1
                        cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1);
                        checkSetVex(cdb.last(), ty);
                    }
                }
            }
            break;

        case TYschar16:
        case TYuchar16:
        case TYschar32:
        case TYuchar32:
            if (config.avx >= 2 && e1.Eoper == OPind && !e1.Ecount)
            {
                // VPBROADCASTB X/YMM,MEM
                getlvalue(cdb,&cs, e1, 0);         // get addressing mode
                assert((cs.Irm & 0xC0) != 0xC0);   // AVX1 doesn't have register source operands
                reg_t reg;
                allocreg(cdb,&retregs,&reg,ty);
                cs.Iop = VPBROADCASTB;
                cs.Irex &= ~REX_W;
                code_newreg(&cs,reg - XMM0);
                checkSetVex(&cs,ty);
                cdb.gen(&cs);
            }
            else
            {
                // Evaluate the byte into a general purpose register first,
                // then move it into the XMM result register.
                regm_t regm = ALLREGS;
                codelem(cdb,e1,&regm,true); // eval left leaf
                const r = findreg(regm);

                reg_t reg;
                allocreg(cdb,&retregs,&reg, e.Ety);
                reg -= XMM0;
                // (V)MOVD reg,r
                cdb.gen2(LODD,modregxrmx(3,reg,r));
                checkSetVex(cdb.last(), TYushort8);
                if (config.avx >= 2)
                {
                    // VPBROADCASTB X/YMM,XMM
                    cdb.gen2(VPBROADCASTB, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), ty);
                }
                else
                {
                    if (config.avx)
                    {
                        // Shuffle with an all-zero control register so that
                        // every destination byte selects source byte 0.
                        reg_t zeroreg;
                        regm = XMMREGS & ~retregs;
                        // VPXOR XMM1,XMM1,XMM1
                        allocreg(cdb,&regm,&zeroreg, ty);
                        zeroreg -= XMM0;
                        cdb.gen2(PXOR, modregxrmx(3,zeroreg,zeroreg));
                        checkSetVex(cdb.last(), TYuchar16); // AVX-128
                        // VPSHUFB XMM,XMM,XMM1
                        cdb.gen2(PSHUFB, modregxrmx(3,reg,zeroreg));
                        checkSetVex(cdb.last(), TYuchar16); // AVX-128
                    }
                    else
                    {
                        // PUNPCKLBW XMM,XMM
                        cdb.gen2(PUNPCKLBW, modregxrmx(3,reg,reg));
                        // PUNPCKLWD XMM,XMM
                        cdb.gen2(PUNPCKLWD, modregxrmx(3,reg,reg));
                        // PSHUFD XMM,XMM,0
                        cdb.genc2(PSHUFD, modregxrmx(3,reg,reg), 0);
                    }
                    if (tysize(ty) == 32)
                    {
                        // VINSERTF128 YMM,YMM,XMM,1
                        cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1);
                        checkSetVex(cdb.last(), ty);
                    }
                }
            }
            break;

        case TYshort8:
        case TYushort8:
        case TYshort16:
        case TYushort16:
            if (config.avx >= 2 && e1.Eoper == OPind && !e1.Ecount)
            {
                // VPBROADCASTW X/YMM,MEM
                getlvalue(cdb,&cs, e1, 0);         // get addressing mode
                assert((cs.Irm & 0xC0) != 0xC0);   // AVX1 doesn't have register source operands
                reg_t reg;
                allocreg(cdb,&retregs,&reg,ty);
                cs.Iop = VPBROADCASTW;
                cs.Irex &= ~REX_W;
                cs.Iflags &= ~CFopsize;            // drop any operand size flag left by getlvalue()
                code_newreg(&cs,reg - XMM0);
                checkSetVex(&cs,ty);
                cdb.gen(&cs);
            }
            else
            {
                // Evaluate the word into a general purpose register first,
                // then move it into the XMM result register.
                regm_t regm = ALLREGS;
                codelem(cdb,e1,&regm,true); // eval left leaf
                reg_t r = findreg(regm);

                reg_t reg;
                allocreg(cdb,&retregs,&reg, e.Ety);
                reg -= XMM0;
                // (V)MOVD reg,r
                cdb.gen2(LODD,modregxrmx(3,reg,r));
                checkSetVex(cdb.last(), TYushort8);
                if (config.avx >= 2)
                {
                    // VPBROADCASTW X/YMM,XMM
                    cdb.gen2(VPBROADCASTW, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), ty);
                }
                else
                {
                    // (V)PUNPCKLWD XMM,XMM
                    cdb.gen2(PUNPCKLWD, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), TYushort8); // AVX-128
                    // (V)PSHUFD XMM,XMM,0
                    cdb.genc2(PSHUFD, modregxrmx(3,reg,reg), 0);
                    checkSetVex(cdb.last(), TYushort8); // AVX-128
                    if (tysize(ty) == 32)
                    {
                        // VINSERTF128 YMM,YMM,XMM,1
                        cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1);
                        checkSetVex(cdb.last(), ty);
                    }
                }
            }
            break;

        case TYlong8:
        case TYulong8:
        case TYlong4:
        case TYulong4:
            if (config.avx && e1.Eoper == OPind && !e1.Ecount)
            {
                // VPBROADCASTD/VBROADCASTSS X/YMM,MEM
                getlvalue(cdb,&cs, e1, 0);         // get addressing mode
                assert((cs.Irm & 0xC0) != 0xC0);   // AVX1 doesn't have register source operands
                reg_t reg;
                allocreg(cdb,&retregs,&reg,ty);
                cs.Iop = config.avx >= 2 ? VPBROADCASTD : VBROADCASTSS;
                cs.Irex &= ~REX_W;
                code_newreg(&cs,reg - XMM0);
                checkSetVex(&cs,ty);
                cdb.gen(&cs);
            }
            else
            {
                codelem(cdb,e1,&retregs,true); // eval left leaf
                const reg = cast(reg_t)(findreg(retregs) - XMM0);
                getregs(cdb,retregs);
                if (config.avx >= 2)
                {
                    // VPBROADCASTD X/YMM,XMM
                    cdb.gen2(VPBROADCASTD, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), ty);
                }
                else
                {
                    // (V)PSHUFD XMM,XMM,0
                    cdb.genc2(PSHUFD, modregxrmx(3,reg,reg), 0);
                    checkSetVex(cdb.last(), TYulong4); // AVX-128
                    if (tysize(ty) == 32)
                    {
                        // VINSERTF128 YMM,YMM,XMM,1
                        cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1);
                        checkSetVex(cdb.last(), ty);
                    }
                }
            }
            break;

        case TYllong2:
        case TYullong2:
        case TYllong4:
        case TYullong4:
            /* Broadcast straight from memory only when a real broadcast
             * instruction applies: VPBROADCASTQ (AVX2) or VBROADCASTSD
             * (256 bit only). (V)PUNPCKLQDQ reg,mem is not a substitute:
             * it keeps the low quadword of the destination, which is
             * undefined for a freshly allocated register, and its memory
             * operand is 128 bits wide while e1 is a 64 bit scalar.
             * The 16 byte case without AVX2 therefore loads the scalar
             * into the register first and unpacks register to register.
             */
            if ((config.avx >= 2 || tysize(ty) == 32) && e1.Eoper == OPind && !e1.Ecount)
            {
                // VPBROADCASTQ/VBROADCASTSD X/YMM,MEM
                getlvalue(cdb,&cs, e1, 0);         // get addressing mode
                assert((cs.Irm & 0xC0) != 0xC0);   // AVX1 doesn't have register source operands
                reg_t reg;
                allocreg(cdb,&retregs,&reg,ty);
                cs.Iop = config.avx >= 2 ? VPBROADCASTQ : VBROADCASTSD;
                cs.Irex &= ~REX_W;
                code_newreg(&cs,reg - XMM0);
                checkSetVex(&cs,ty);
                cdb.gen(&cs);
            }
            else
            {
                codelem(cdb,e1,&retregs,true); // eval left leaf
                const reg = cast(reg_t)(findreg(retregs) - XMM0);
                getregs(cdb,retregs);
                if (config.avx >= 2)
                {
                    // VPBROADCASTQ X/YMM,XMM
                    cdb.gen2(VPBROADCASTQ, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), ty);
                }
                else
                {
                    // (V)PUNPCKLQDQ XMM,XMM
                    cdb.genc2(PUNPCKLQDQ, modregxrmx(3,reg,reg), 0);
                    checkSetVex(cdb.last(), TYullong2); // AVX-128
                    if (tysize(ty) == 32)
                    {
                        // VINSERTF128 YMM,YMM,XMM,1
                        cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1);
                        checkSetVex(cdb.last(), ty);
                    }
                }
            }
            break;

        default:
            assert(0);
    }

    fixresult(cdb,e,retregs,pretregs);
}
1573 
/*******************************************
 * Check whether lvalue e can be shown to be a misaligned vector.
 * Only vector typed OPvar elems are examined; anything else is
 * optimistically reported as aligned.
 * Params:
 *      e = lvalue
 * Returns:
 *      false if definitely not aligned, true otherwise
 */

bool xmmIsAligned(elem *e)
{
    // Nothing can be proven unless this is a vector variable
    if (!tyvector(e.Ety) || e.Eoper != OPvar)
        return true;        // assume aligned

    Symbol *sym = e.EV.Vsym;
    const required = tyalignsize(e.Ety);

    // The symbol itself is aligned less strictly than the vector type needs
    if (Symbol_Salignsize(sym) < required)
        return false;

    // The offset into the symbol is not a multiple of the alignment
    if (e.EV.Voffset & (required - 1))
        return false;

    // The type wants more alignment than STACKALIGN
    if (required > STACKALIGN)
        return false;

    return true;            // assume aligned
}
1597 
/**************************************
 * A VEX prefix is encoded in either 2 or 3 bytes.
 * Flag c with CFvex3 when its VEX fields require the 3 byte form;
 * otherwise c is left unchanged.
 * Params:
 *      c = instruction whose Ivex fields have already been filled in
 */

void checkSetVex3(code *c)
{
    // See Intel Vol. 2A 2.3.5.6
    bool threeBytes = c.Ivex.w ||
                      !c.Ivex.x ||
                      !c.Ivex.b ||
                      c.Ivex.mmmm > 0x1;

    // Extra conditions that apply only when not generating 64 bit code
    if (!threeBytes && !I64)
        threeBytes = c.Ivex.r || !(c.Ivex.vvvv & 8);

    if (threeBytes)
        c.Iflags |= CFvex3;
}
1613 
/*************************************
 * Convert instruction c to its VEX encoded form when AVX code
 * generation is enabled or when the operand is 32 bytes in size.
 * In all other cases, and for NOP, c is left untouched.
 * Params:
 *      c = code, rewritten in place
 *      ty = type of operand; a 32 byte type sets VEX.L
 */

void checkSetVex(code *c, tym_t ty)
{
    if (!(config.avx || tysize(ty) == 32))
        return;

    // Start out with the register from the reg field of the modregrm
    // byte (extended by REX.R) as the vex.vvvv operand.
    uint vvvvReg = ((c.Irm >> 3) & 7) | ((c.Irex & REX_R) ? 8 : 0);

    // TODO: This is too simplistic, depending on the instruction, vex.vvvv
    // encodes NDS, NDD, DDS, or no operand (NOO). The code below assumes
    // NDS (non-destructive source), except for the incomplete list of 2
    // operand instructions (NOO) handled by the switch.
    switch (c.Iop)
    {
        case LODSS:
        case LODSD:
        case STOSS:
        case STOSD:
            // Only the forms with a memory operand are 2 operand instructions
            if ((c.Irm & 0xC0) != 0xC0)
                vvvvReg = 0;
            break;

        case LODAPS:
        case LODUPS:
        case LODAPD:
        case LODUPD:
        case LODDQA:
        case LODDQU:
        case LODD:
        case LODQ:
        case STOAPS:
        case STOUPS:
        case STOAPD:
        case STOUPD:
        case STODQA:
        case STODQU:
        case STOD:
        case STOQ:
        case COMISS:
        case COMISD:
        case UCOMISS:
        case UCOMISD:
        case MOVDDUP:
        case MOVSHDUP:
        case MOVSLDUP:
        case VBROADCASTSS:
        case PSHUFD:
        case PSHUFHW:
        case PSHUFLW:
        case VPBROADCASTB:
        case VPBROADCASTW:
        case VPBROADCASTD:
        case VPBROADCASTQ:
            vvvvReg = 0;    // for 2 operand vex instructions
            break;

        case VBROADCASTSD:
        case VBROADCASTF128:
        case VBROADCASTI128:
            assert(tysize(ty) == 32); // AVX-256 only instructions
            vvvvReg = 0;    // for 2 operand vex instructions
            break;

        case NOP:
            return;         // ignore

        default:
            break;
    }

    // Translate the legacy prefix/escape bytes into the VEX mm and pp fields
    uint mm;
    uint pp;
    switch (c.Iop & 0xFFFFFF00)
    {
        case 0x00000F00: mm = 1; pp = 0; break;
        case 0x00660F00: mm = 1; pp = 1; break;
        case 0x00F30F00: mm = 1; pp = 2; break;
        case 0x00F20F00: mm = 1; pp = 3; break;
        case 0x660F3800: mm = 2; pp = 1; break;
        case 0x660F3A00: mm = 3; pp = 1; break;
        default:
            printf("Iop = %x\n", c.Iop);
            assert(0);
    }

    // Assemble the new opcode: 0xC4, mm, pp and the original opcode byte
    opcode_t vexOp = 0xC4000000 | (c.Iop & 0xFF);
    vexOp |= (mm << 16) | (pp << 8);
    c.Iop = vexOp;

    // The r, x and b fields hold the complement of the REX bits
    c.Ivex.pfx = 0xC4;
    c.Ivex.r = !(c.Irex & REX_R);
    c.Ivex.x = !(c.Irex & REX_X);
    c.Ivex.b = !(c.Irex & REX_B);
    c.Ivex.w = (c.Irex & REX_W) != 0;
    c.Ivex.l = tysize(ty) == 32;

    c.Ivex.vvvv = cast(ushort)~vvvvReg;

    c.Iflags |= CFvex;
    checkSetVex3(c);
}
1720 
1721 }