1 /**
2  * Compiler implementation of the
3  * $(LINK2 http://www.dlang.org, D programming language).
4  *
5  * Copyright:   Copyright (C) 2011-2020 by The D Language Foundation, All Rights Reserved
6  * Authors:     $(LINK2 http://www.digitalmars.com, Walter Bright)
7  * License:     $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
8  * Source:      $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/backend/cgxmm.d, backend/cgxmm.d)
9  */
10 
11 module dmd.backend.cgxmm;
12 
13 version (SCPP)
14     version = COMPILE;
15 version (MARS)
16     version = COMPILE;
17 
18 version (COMPILE)
19 {
20 
21 import core.stdc.stdio;
22 import core.stdc.stdlib;
23 import core.stdc.string;
24 
25 import dmd.backend.cc;
26 import dmd.backend.cdef;
27 import dmd.backend.code;
28 import dmd.backend.code_x86;
29 import dmd.backend.codebuilder;
30 import dmd.backend.mem;
31 import dmd.backend.el;
32 import dmd.backend.global;
33 import dmd.backend.oper;
34 import dmd.backend.ty;
35 import dmd.backend.xmm;
36 
37 version (SCPP)
38     import dmd.backend.exh;
39 version (MARS)
40     import dmd.backend.errors;
41 
42 
43 extern (C++):
44 
45 nothrow:
46 
// Declared here without a body; the definition is supplied outside this section.
int REGSIZE();

// Declared here without a body. Code in this file passes a register number and
// ANDs the result with register sets such as XMMREGS.
// NOTE(review): presumably returns the single-bit register mask for m - confirm.
uint mask(uint m);
50 
/*******************************************
 * Tell whether an opcode is one of the XMM store opcodes.
 * Params:
 *      op = opcode to classify
 * Returns:
 *      true if op is one of the STOxxx opcodes listed below, false otherwise
 */

bool isXMMstore(opcode_t op)
{
    switch (op)
    {
        // scalar and packed single precision
        case STOSS:
        case STOAPS:
        case STOUPS:
        // scalar and packed double precision
        case STOSD:
        case STOAPD:
        case STOUPD:
        // integer forms
        case STOD:
        case STOQ:
        case STODQA:
        case STODQU:
        // high/low half forms
        case STOHPD:
        case STOHPS:
        case STOLPD:
        case STOLPS:
            return true;

        default:
            return false;
    }
}
66 
/*******************************************
 * Load a 4 or 8 byte constant bit pattern into an XMM register.
 * Params:
 *      cdb = code sink the generated instructions are appended to
 *      xreg = destination register, must be a member of XMMREGS
 *      sz = size of the constant in bytes, must be 4 or 8
 *      value = the bits to load
 *      flags = not referenced by this function
 */

private void movxmmconst(ref CodeBuilder cdb, reg_t xreg, uint sz, targ_size_t value, regm_t flags)
{
    /* The constant is first materialized in a general purpose register
     * and then transferred to the XMM register:
     *    MOV reg,value
     *    MOV xreg,reg
     * Not so efficient. We should at least do a PXOR for 0.
     */
    assert(mask(xreg) & XMMREGS);
    assert(sz == 4 || sz == 8);

    if (!(I32 && sz == 8))
    {
        // The whole constant fits in one general purpose register
        reg_t gpr;
        regwithvalue(cdb,ALLREGS,value,&gpr,(sz == 8) ? 64 : 0);
        cdb.gen2(LODD,modregxrmx(3,xreg-XMM0,gpr));     // MOVD xreg,gpr
        if (sz == 8)
            code_orrex(cdb.last(), REX_W);
        checkSetVex(cdb.last(), TYulong);
        return;
    }

    /* 8 byte constant in 32 bit code: write the two halves to floatreg
     * through a scratch register, then load all 8 bytes from there.
     */
    reg_t scratch;
    regm_t scratchm = ALLREGS;
    allocreg(cdb,&scratchm,&scratch,TYint);     // allocate scratch register

    static union Halves { targ_size_t whole; targ_long[2] half; }
    Halves bits = void;
    bits.half[1] = 0;
    bits.whole = value;

    movregconst(cdb,scratch,bits.half[0],0);
    cdb.genfltreg(STO,scratch,0);                   // MOV floatreg,scratch
    movregconst(cdb,scratch,bits.half[1],0);
    cdb.genfltreg(STO,scratch,4);                   // MOV floatreg+4,scratch

    const loadop = xmmload(TYdouble, true);
    cdb.genxmmreg(loadop,xreg,0,TYdouble);          // MOVSD XMMreg,floatreg
}
108 
/***********************************************
 * Do simple orthogonal operators for XMM registers.
 *
 * Both operands of the binary elem are evaluated into XMM registers and a
 * register-to-register instruction chosen by xmmoperator() is emitted.
 * Params:
 *      cdb = code sink the generated instructions are appended to
 *      e = binary operator elem; its operands are e.EV.E1 and e.EV.E2
 *      pretregs = requested result registers; handed to fixresult() whenever
 *                 the registers actually used differ from it
 */

void orthxmm(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
{
    //printf("orthxmm(e = %p, *pretregs = %s)\n", e, regm_str(*pretregs));
    elem *e1 = e.EV.E1;
    elem *e2 = e.EV.E2;

    // float + ifloat is not actually addition
    // No arithmetic instruction is emitted for this case: each operand is
    // evaluated into its own register and the result is the register pair.
    if ((e.Eoper == OPadd || e.Eoper == OPmin) &&
        ((tyreal(e1.Ety) && tyimaginary(e2.Ety)) ||
         (tyreal(e2.Ety) && tyimaginary(e1.Ety))))
    {
        regm_t retregs = *pretregs & XMMREGS;
        if (!retregs)
            retregs = XMMREGS;

        // Choose two distinct registers: retregs receives e1, rretregs receives e2
        regm_t rretregs;
        reg_t rreg;
        if (tyreal(e1.Ety))
        {
            const reg = findreg(retregs);
            rreg = findreg(retregs & ~mask(reg));
            retregs = mask(reg);
            rretregs = mask(rreg);
        }
        else
        {
            // Pick the second register, not the first
            rreg = findreg(retregs);
            rretregs = mask(rreg);
            const reg = findreg(retregs & ~rretregs);
            retregs = mask(reg);
        }
        assert(retregs && rretregs);

        codelem(cdb,e1,&retregs,false); // eval left leaf
        scodelem(cdb, e2, &rretregs, retregs, true);  // eval right leaf

        retregs |= rretregs;            // result occupies both registers
        if (e.Eoper == OPmin)
        {
            // Subtraction: flip the sign bit of the right operand's register
            regm_t nretregs = XMMREGS & ~retregs;
            reg_t sreg; // hold sign bit
            const uint sz = tysize(e1.Ety);
            allocreg(cdb,&nretregs,&sreg,e2.Ety);
            targ_size_t signbit = 0x80000000;
            if (sz == 8)
                signbit = cast(targ_size_t)0x8000000000000000L;
            movxmmconst(cdb,sreg, sz, signbit, 0);
            getregs(cdb,nretregs);
            const opcode_t xop = (sz == 8) ? XORPD : XORPS;       // XORPD/S rreg,sreg
            cdb.gen2(xop,modregxrmx(3,rreg-XMM0,sreg-XMM0));
        }
        if (retregs != *pretregs)
            fixresult(cdb,e,retregs,pretregs);
        return;
    }

    // General case: left operand into reg, right operand into a different register rreg
    regm_t retregs = *pretregs & XMMREGS;
    if (!retregs)
        retregs = XMMREGS;
    const constflag = OTrel(e.Eoper);
    codelem(cdb,e1,&retregs,constflag); // eval left leaf
    const reg = findreg(retregs);
    regm_t rretregs = XMMREGS & ~retregs;
    scodelem(cdb, e2, &rretregs, retregs, true);  // eval right leaf

    const rreg = findreg(rretregs);
    const op = xmmoperator(e1.Ety, e.Eoper);

    /* We should take advantage of mem addressing modes for OP XMM,MEM
     * but we do not at the moment.
     */
    if (OTrel(e.Eoper))
    {
        // Relational operator: operands are encoded in the opposite order
        // (rreg,reg) and the function returns without calling getregs()
        // or fixresult().
        cdb.gen2(op,modregxrmx(3,rreg-XMM0,reg-XMM0));
        checkSetVex(cdb.last(), e1.Ety);
        return;
    }

    getregs(cdb,retregs);               // reg is about to be overwritten with the result
    cdb.gen2(op,modregxrmx(3,reg-XMM0,rreg-XMM0));
    checkSetVex(cdb.last(), e1.Ety);
    if (retregs != *pretregs)
        fixresult(cdb,e,retregs,pretregs);
}
198 
199 
/************************
 * Generate code for an assignment using XMM registers.
 *
 * The rvalue is evaluated first, then the lvalue's effective address is
 * formed and a store is emitted. The special form (*p++ = ...) / (*p-- = ...)
 * with p in a register is recognized, and the pointer adjustment is emitted
 * after the store.
 * Params:
 *      cdb = code sink the generated instructions are appended to
 *      e = the assignment elem, passed to fixresult()
 *      op = store opcode to use, CMP means select one with xmmstore()
 *      e1 = lvalue being assigned to; freed before returning
 *      e2 = rvalue, evaluated into an XMM register
 *      pretregs = requested result registers; its mPSW bit is cleared when
 *                 the flags are requested and e1 is a leaf
 */

void xmmeq(ref CodeBuilder cdb, elem *e, opcode_t op, elem *e1, elem *e2,regm_t *pretregs)
{
    code cs;
    elem *e11;
    bool regvar;                  /* true means evaluate into register variable */
    regm_t varregm;
    targ_int postinc;

    //printf("xmmeq(e1 = %p, e2 = %p, *pretregs = %s)\n", e1, e2, regm_str(*pretregs));
    tym_t tyml = tybasic(e1.Ety);              /* type of lvalue               */
    regm_t retregs = *pretregs;

    if (!(retregs & XMMREGS))
        retregs = XMMREGS;              // pick any XMM reg

    bool aligned = xmmIsAligned(e1);
    // If default, select store opcode
    cs.Iop = (op == CMP) ? xmmstore(tyml, aligned) : op;
    regvar = false;
    varregm = 0;
    if (config.flags4 & CFG4optimized)
    {
        // Be careful of cases like (x = x+x+x). We cannot evaluate in
        // x if x is in a register.
        reg_t varreg;
        if (isregvar(e1,&varregm,&varreg) &&    // if lvalue is register variable
            doinreg(e1.EV.Vsym,e2) &&           // and we can compute directly into it
            varregm & XMMREGS
           )
        {   regvar = true;
            retregs = varregm;    // evaluate directly in target register
        }
    }
    if (*pretregs & mPSW && OTleaf(e1.Eoper))     // if evaluating e1 couldn't change flags
    {   // Be careful that this lines up with jmpopcode()
        retregs |= mPSW;
        *pretregs &= ~mPSW;
    }
    scodelem(cdb,e2,&retregs,0,true);    // get rvalue

    // Look for special case of (*p++ = ...), where p is a register variable
    if (e1.Eoper == OPind &&
        ((e11 = e1.EV.E1).Eoper == OPpostinc || e11.Eoper == OPpostdec) &&
        e11.EV.E1.Eoper == OPvar &&
        e11.EV.E1.EV.Vsym.Sfl == FLreg
       )
    {
        postinc = e11.EV.E2.EV.Vint;
        if (e11.Eoper == OPpostdec)
            postinc = -postinc;
        getlvalue(cdb,&cs,e11,RMstore | retregs);
        freenode(e11.EV.E2);
    }
    else
    {   postinc = 0;
        getlvalue(cdb,&cs,e1,RMstore | retregs);       // get lvalue (cl == CNIL if regvar)
    }

    getregs_imm(cdb,regvar ? varregm : 0);

    // Encode the source XMM register into the reg field (plus REX.R for XMM8 and up)
    const reg = findreg(retregs & XMMREGS);
    cs.Irm |= modregrm(0,(reg - XMM0) & 7,0);
    if ((reg - XMM0) & 8)
        cs.Irex |= REX_R;

    // Do not generate mov from register onto itself
    if (!(regvar && reg == XMM0 + ((cs.Irm & 7) | (cs.Irex & REX_B ? 8 : 0))))
    {
        cdb.gen(&cs);         // MOV EA+offset,reg
        checkSetVex(cdb.last(), tyml);
    }

    if (e1.Ecount ||                     // if lvalue is a CSE or
        regvar)                           // rvalue can't be a CSE
    {
        getregs_imm(cdb,retregs);        // necessary if both lvalue and
                                        //  rvalue are CSEs (since a reg
                                        //  can hold only one e at a time)
        cssave(e1,retregs,!OTleaf(e1.Eoper));     // if lvalue is a CSE
    }

    fixresult(cdb,e,retregs,pretregs);
    if (postinc)
    {
        // postinc is only nonzero when the (*p++ = ...) form matched above,
        // so e11 refers to the OPpostinc/OPpostdec node here.
        const increg = findreg(idxregm(&cs));  // the register to increment
        if (*pretregs & mPSW)
        {   // Use LEA to avoid touching the flags
            uint rm = cs.Irm & 7;
            if (cs.Irex & REX_B)
                rm |= 8;
            cdb.genc1(LEA,buildModregrm(2,increg,rm),FLconst,postinc);
            if (tysize(e11.EV.E1.Ety) == 8)
                code_orrex(cdb.last(), REX_W);
        }
        else if (I64)
        {
            cdb.genc2(0x81,modregrmx(3,0,increg),postinc);
            if (tysize(e11.EV.E1.Ety) == 8)
                code_orrex(cdb.last(), REX_W);
        }
        else
        {
            if (postinc == 1)
                cdb.gen1(0x40 + increg);       // INC increg
            else if (postinc == -cast(targ_int)1)
                cdb.gen1(0x48 + increg);       // DEC increg
            else
            {
                cdb.genc2(0x81,modregrm(3,0,increg),postinc);
            }
        }
    }
    freenode(e1);
}
322 
/********************************
 * Generate code for conversion using SSE2 instructions.
 *
 *      OPs32_d
 *      OPs64_d (64-bit only)
 *      OPu32_d (64-bit only)
 *      OPd_f
 *      OPf_d
 *      OPd_s32
 *      OPd_u32
 *      OPd_s64 (64-bit only)
 *
 * Params:
 *      cdb = code sink the generated instructions are appended to
 *      e = conversion elem; e.Eoper must be one of the operators handled by
 *          the switch below
 *      pretregs = requested result registers; handed to fixresult() when the
 *                 register actually used differs from it
 */

void xmmcnvt(ref CodeBuilder cdb,elem *e,regm_t *pretregs)
{
    //printf("xmmconvt: %p, %s\n", e, regm_str(*pretregs));
    opcode_t op = NoOpcode;
    regm_t regs;            // register class the source operand is evaluated into
    tym_t ty;               // type of the conversion result
    ubyte rex = 0;          // REX prefix bits to OR into the conversion instruction
    bool zx = false; // zero extend uint

    /* There are no ops for integer <-> float/real conversions
     * but there are instructions for them. In order to use these
     * try to fuse chained conversions. Be careful not to lose
     * precision for real to long.
     */
    elem *e1 = e.EV.E1;
    switch (e.Eoper)
    {
    case OPd_f:
        if (e1.Eoper == OPs32_d)
        { }
        else if (I64 && e1.Eoper == OPs64_d)
            rex = REX_W;
        else if (I64 && e1.Eoper == OPu32_d)
        {   rex = REX_W;
            zx = true;
        }
        else
        {   regs = XMMREGS;
            op = CVTSD2SS;
            ty = TYfloat;
            break;
        }
        // The operand is an integer-to-double conversion; fuse only if it is not shared
        if (e1.Ecount)
        {
            regs = XMMREGS;
            op = CVTSD2SS;
            ty = TYfloat;
            break;
        }
        // directly use si2ss
        regs = ALLREGS;
        e1 = e1.EV.E1;  // fused operation
        op = CVTSI2SS;
        ty = TYfloat;
        break;

    case OPs32_d:              goto Litod;
    case OPs64_d: rex = REX_W; goto Litod;
    case OPu32_d: rex = REX_W; zx = true; goto Litod;
    Litod:
        regs = ALLREGS;
        op = CVTSI2SD;
        ty = TYdouble;
        break;

    case OPd_s32: ty = TYint;  goto Ldtoi;
    case OPd_u32: ty = TYlong; if (I64) rex = REX_W; goto Ldtoi;
    case OPd_s64: ty = TYlong; rex = REX_W; goto Ldtoi;
    Ldtoi:
        regs = XMMREGS;
        switch (e1.Eoper)
        {
        case OPf_d:
            if (e1.Ecount)
            {
                op = CVTTSD2SI;
                break;
            }
            e1 = e1.EV.E1;      // fused operation
            op = CVTTSS2SI;
            break;
        case OPld_d:
            if (e.Eoper == OPd_s64)
            {
                // hand the whole conversion to cnvt87() instead
                cnvt87(cdb,e,pretregs); // precision
                return;
            }
            goto default;

        default:
            op = CVTTSD2SI;
            break;
        }
        break;

    case OPf_d:
        regs = XMMREGS;
        op = CVTSS2SD;
        ty = TYdouble;
        break;

    default:
        assert(0);
    }
    assert(op != NoOpcode);

    // Evaluate the (possibly fused) source operand
    codelem(cdb,e1, &regs, false);
    reg_t reg = findreg(regs);
    if (isXMMreg(reg))
        reg -= XMM0;        // convert to the register's encoding number
    else if (zx)
    {   assert(I64);
        getregs(cdb,regs);
        genregs(cdb,0x8B,reg,reg); // MOV reg,reg to zero upper 32-bit
                                   // Don't use x89 because that will get optimized away
        code_orflag(cdb.last(),CFvolatile);
    }

    // Pick the register class for the result
    regm_t retregs = *pretregs;
    if (tyxmmreg(ty)) // target is XMM
    {   if (!(*pretregs & XMMREGS))
            retregs = XMMREGS;
    }
    else              // source is XMM
    {   assert(regs & XMMREGS);
        if (!(retregs & ALLREGS))
            retregs = ALLREGS;
    }

    reg_t rreg;
    allocreg(cdb,&retregs,&rreg,ty);
    if (isXMMreg(rreg))
        rreg -= XMM0;       // convert to the register's encoding number

    cdb.gen2(op, modregxrmx(3,rreg,reg));
    assert(I64 || !rex);    // a REX prefix may only be requested in 64 bit code
    if (rex)
        code_orrex(cdb.last(), rex);

    if (*pretregs != retregs)
        fixresult(cdb,e,retregs,pretregs);
}
468 
/********************************
 * Generate code for op=
 *
 * The right operand is evaluated first. The left operand is then either
 * used directly in its register (register variable, optimized builds only)
 * or loaded from memory, combined with the right operand, and stored back.
 * Params:
 *      cdb = code sink the generated instructions are appended to
 *      e = the op= elem; e.EV.E1 is the lvalue, e.EV.E2 the right operand
 *      pretregs = requested result registers, handed to fixresult()
 */

void xmmopass(ref CodeBuilder cdb,elem *e,regm_t *pretregs)
{   elem *e1 = e.EV.E1;
    elem *e2 = e.EV.E2;
    tym_t ty1 = tybasic(e1.Ety);

    // Prefer a register for the right operand that is not a requested result register
    regm_t rretregs = XMMREGS & ~*pretregs;
    if (!rretregs)
        rretregs = XMMREGS;

    codelem(cdb,e2,&rretregs,false); // eval right leaf
    reg_t rreg = findreg(rretregs);

    code cs;
    regm_t retregs;
    reg_t reg;
    bool regvar = false;
    if (config.flags4 & CFG4optimized)
    {
        // Be careful of cases like (x = x+x+x). We cannot evaluate in
        // x if x is in a register.
        // NOTE(review): unlike xmmeq(), varregm is not checked against
        // XMMREGS here - confirm callers guarantee an XMM register variable.
        reg_t varreg;
        regm_t varregm;
        if (isregvar(e1,&varregm,&varreg) &&    // if lvalue is register variable
            doinreg(e1.EV.Vsym,e2)          // and we can compute directly into it
           )
        {   regvar = true;
            retregs = varregm;
            reg = varreg;                       // evaluate directly in target register
            getregs(cdb,retregs);       // destroy these regs
        }
    }

    if (!regvar)
    {
        // Load the current value of the lvalue into a register distinct from rreg
        getlvalue(cdb,&cs,e1,rretregs);         // get EA
        retregs = *pretregs & XMMREGS & ~rretregs;
        if (!retregs)
            retregs = XMMREGS & ~rretregs;
        allocreg(cdb,&retregs,&reg,ty1);
        cs.Iop = xmmload(ty1, true);            // MOVSD xmm,xmm_m64
        code_newreg(&cs,reg - XMM0);
        cdb.gen(&cs);
        checkSetVex(cdb.last(), ty1);
    }

    const op = xmmoperator(e1.Ety, e.Eoper);
    cdb.gen2(op,modregxrmx(3,reg-XMM0,rreg-XMM0));      // OP reg,rreg
    checkSetVex(cdb.last(), e1.Ety);

    if (!regvar)
    {
        // Write the result back through the same effective address
        cs.Iop = xmmstore(ty1,true);      // reverse operand order of MOVS[SD]
        cdb.gen(&cs);
        checkSetVex(cdb.last(), ty1);
    }

    if (e1.Ecount ||                     // if lvalue is a CSE or
        regvar)                           // rvalue can't be a CSE
    {
        getregs_imm(cdb,retregs);        // necessary if both lvalue and
                                        //  rvalue are CSEs (since a reg
                                        //  can hold only one e at a time)
        cssave(e1,retregs,!OTleaf(e1.Eoper));     // if lvalue is a CSE
    }

    fixresult(cdb,e,retregs,pretregs);
    freenode(e1);
}
541 
/********************************
 * Generate code for post increment and post decrement.
 *
 * The lvalue's value is copied into a separate result register before the
 * operation; that copy is what is handed to fixresult(). The updated value
 * stays in the lvalue's register (register variable) or is stored back to
 * memory.
 * Params:
 *      cdb = code sink the generated instructions are appended to
 *      e = the post increment/decrement elem; e.EV.E1 is the lvalue,
 *          e.EV.E2 the amount
 *      pretregs = requested result registers, handed to fixresult()
 */

void xmmpost(ref CodeBuilder cdb,elem *e,regm_t *pretregs)
{
    elem *e1 = e.EV.E1;
    elem *e2 = e.EV.E2;
    tym_t ty1 = tybasic(e1.Ety);

    regm_t retregs;         // register holding the lvalue's value
    reg_t reg;
    bool regvar = false;
    if (config.flags4 & CFG4optimized)
    {
        // Be careful of cases like (x = x+x+x). We cannot evaluate in
        // x if x is in a register.
        reg_t varreg;
        regm_t varregm;
        if (isregvar(e1,&varregm,&varreg) &&    // if lvalue is register variable
            doinreg(e1.EV.Vsym,e2)          // and we can compute directly into it
           )
        {
            regvar = true;
            retregs = varregm;
            reg = varreg;                       // evaluate directly in target register
            getregs(cdb,retregs);       // destroy these regs
        }
    }

    code cs;
    if (!regvar)
    {
        // Load the lvalue from memory, preferring a register that is not
        // one of the requested result registers
        getlvalue(cdb,&cs,e1,0);                // get EA
        retregs = XMMREGS & ~*pretregs;
        if (!retregs)
            retregs = XMMREGS;
        allocreg(cdb,&retregs,&reg,ty1);
        cs.Iop = xmmload(ty1, true);            // MOVSD xmm,xmm_m64
        code_newreg(&cs,reg - XMM0);
        cdb.gen(&cs);
        checkSetVex(cdb.last(), ty1);
    }

    // Result register
    regm_t resultregs = XMMREGS & *pretregs & ~retregs;
    if (!resultregs)
        resultregs = XMMREGS & ~retregs;
    reg_t resultreg;
    allocreg(cdb,&resultregs, &resultreg, ty1);

    // Save the value from before the update
    cdb.gen2(xmmload(ty1,true),modregxrmx(3,resultreg-XMM0,reg-XMM0));   // MOVSS/D resultreg,reg
    checkSetVex(cdb.last(), ty1);

    // Evaluate the amount into a register distinct from both reg and resultreg
    regm_t rretregs = XMMREGS & ~(*pretregs | retregs | resultregs);
    if (!rretregs)
        rretregs = XMMREGS & ~(retregs | resultregs);
    codelem(cdb,e2,&rretregs,false); // eval right leaf
    const rreg = findreg(rretregs);

    const op = xmmoperator(e1.Ety, e.Eoper);
    cdb.gen2(op,modregxrmx(3,reg-XMM0,rreg-XMM0));  // ADD reg,rreg
    checkSetVex(cdb.last(), e1.Ety);

    if (!regvar)
    {
        // Write the updated value back through the same effective address
        cs.Iop = xmmstore(ty1,true);      // reverse operand order of MOVS[SD]
        cdb.gen(&cs);
        checkSetVex(cdb.last(), ty1);
    }

    if (e1.Ecount ||                     // if lvalue is a CSE or
        regvar)                           // rvalue can't be a CSE
    {
        getregs_imm(cdb,retregs); // necessary if both lvalue and
                                        //  rvalue are CSEs (since a reg
                                        //  can hold only one e at a time)
        cssave(e1,retregs,!OTleaf(e1.Eoper));     // if lvalue is a CSE
    }

    fixresult(cdb,e,resultregs,pretregs);   // the result is the saved pre-update value
    freenode(e1);
}
625 
/******************
 * Negate operator
 *
 * The operand is evaluated into an XMM register and XORed with a constant
 * that has only the sign bit set.
 * Params:
 *      cdb = code sink the generated instructions are appended to
 *      e = negate elem; its operand is e.EV.E1
 *      pretregs = requested result registers, must not be empty
 */

void xmmneg(ref CodeBuilder cdb,elem *e,regm_t *pretregs)
{
    //printf("xmmneg()\n");
    //elem_print(e);
    assert(*pretregs);

    elem *operand = e.EV.E1;
    const tym_t tyml = tybasic(operand.Ety);
    const int sz = _tysize[tyml];

    regm_t retregs = *pretregs & XMMREGS;
    if (!retregs)
        retregs = XMMREGS;

    /* Generate:
     *    MOV reg,e1
     *    MOV rreg,signbit
     *    XOR reg,rreg
     */
    codelem(cdb,operand,&retregs,false);
    getregs(cdb,retregs);
    const reg = findreg(retregs);

    // Scratch register for the sign bit constant, distinct from reg
    reg_t rreg;
    regm_t rretregs = XMMREGS & ~retregs;
    allocreg(cdb,&rretregs,&rreg,tyml);

    targ_size_t signbit;
    if (sz == 8)
        signbit = cast(targ_size_t)0x8000000000000000L;
    else
        signbit = 0x80000000;
    movxmmconst(cdb,rreg, sz, signbit, 0);

    getregs(cdb,retregs);
    const xorop = (sz == 8) ? XORPD : XORPS;        // XORPD/S reg,rreg
    cdb.gen2(xorop,modregxrmx(3,reg-XMM0,rreg-XMM0));
    fixresult(cdb,e,retregs,pretregs);
}
663 
/******************
 * Absolute value operator OPabs
 *
 * The operand is evaluated into an XMM register and ANDed with a constant
 * that has every bit set except the sign bit.
 * Params:
 *      cdb = code sink the generated instructions are appended to
 *      e = abs elem; its operand is e.EV.E1
 *      pretregs = requested result registers, must not be empty
 */

void xmmabs(ref CodeBuilder cdb,elem *e,regm_t *pretregs)
{
    //printf("xmmabs()\n");
    //elem_print(e);
    assert(*pretregs);

    elem *operand = e.EV.E1;
    const tym_t tyml = tybasic(operand.Ety);
    const int sz = _tysize[tyml];

    regm_t retregs = *pretregs & XMMREGS;
    if (!retregs)
        retregs = XMMREGS;

    /* Generate:
     *    MOV reg,e1
     *    MOV rreg,mask
     *    AND reg,rreg
     */
    codelem(cdb,operand,&retregs,false);
    getregs(cdb,retregs);
    const reg = findreg(retregs);

    // Scratch register for the bit mask constant, distinct from reg
    reg_t rreg;
    regm_t rretregs = XMMREGS & ~retregs;
    allocreg(cdb,&rretregs,&rreg,tyml);

    targ_size_t absmask;
    if (sz == 8)
        absmask = cast(targ_size_t)0x7FFF_FFFF_FFFF_FFFFL;
    else
        absmask = 0x7FFF_FFFF;
    movxmmconst(cdb,rreg, sz, absmask, 0);

    getregs(cdb,retregs);
    const andop = (sz == 8) ? ANDPD : ANDPS;        // ANDPD/S reg,rreg
    cdb.gen2(andop,modregxrmx(3,reg-XMM0,rreg-XMM0));
    fixresult(cdb,e,retregs,pretregs);
}
701 
/*****************************
 * Select the load opcode that matches a type.
 * It is important to use the right one even if the number of bits moved is the same,
 * as there are performance consequences for using the wrong one.
 * Params:
 *      tym = type of data to load
 *      aligned = for vectors, true if aligned to 16 bytes; ignored when
 *                tysize(tym) is 32
 * Returns:
 *      the LODxxx opcode for tym; an unsupported type prints tym and asserts
 */

opcode_t xmmload(tym_t tym, bool aligned)
{
    // 32 byte operands always get the unaligned form
    const bool useAligned = tysize(tym) != 32 && aligned;

    switch (tybasic(tym))
    {
        case TYuint:
        case TYint:
        case TYlong:
        case TYulong:
            return LODD;                                // MOVD

        case TYfloat:
        case TYcfloat:
        case TYifloat:
            return LODSS;                               // MOVSS

        case TYllong:
        case TYullong:
            return LODQ;                                // MOVQ

        case TYdouble:
        case TYcdouble:
        case TYidouble:
            return LODSD;                               // MOVSD

        case TYfloat8:
        case TYfloat4:
            return useAligned ? LODAPS : LODUPS;        // MOVAPS / MOVUPS

        case TYdouble4:
        case TYdouble2:
            return useAligned ? LODAPD : LODUPD;        // MOVAPD / MOVUPD

        // 16 byte integer vectors
        case TYschar16:
        case TYuchar16:
        case TYshort8:
        case TYushort8:
        case TYlong4:
        case TYulong4:
        case TYllong2:
        case TYullong2:
        // 32 byte integer vectors
        case TYschar32:
        case TYuchar32:
        case TYshort16:
        case TYushort16:
        case TYlong8:
        case TYulong8:
        case TYllong4:
        case TYullong4:
            return useAligned ? LODDQA : LODDQU;        // MOVDQA / MOVDQU

        default:
            printf("tym = x%x\n", tym);
            assert(0);
    }
}
758 
/*****************************
 * Select the store opcode that matches a type.
 * Params:
 *      tym = type of data to store
 *      aligned = for vectors, selects the aligned rather than the
 *                unaligned form of the opcode
 * Returns:
 *      the STOxxx opcode for tym; an unsupported type prints tym and asserts
 */

opcode_t xmmstore(tym_t tym, bool aligned)
{
    switch (tybasic(tym))
    {
        case TYuint:
        case TYint:
        case TYlong:
        case TYulong:
            return STOD;                                // MOVD

        case TYfloat:
        case TYifloat:
            return STOSS;                               // MOVSS

        case TYllong:
        case TYullong:
            return STOQ;                                // MOVQ

        case TYdouble:
        case TYidouble:
        case TYcdouble:
        case TYcfloat:
            return STOSD;                               // MOVSD

        case TYfloat8:
        case TYfloat4:
            return aligned ? STOAPS : STOUPS;           // MOVAPS / MOVUPS

        case TYdouble4:
        case TYdouble2:
            return aligned ? STOAPD : STOUPD;           // MOVAPD / MOVUPD

        // 16 byte integer vectors
        case TYschar16:
        case TYuchar16:
        case TYshort8:
        case TYushort8:
        case TYlong4:
        case TYulong4:
        case TYllong2:
        case TYullong2:
        // 32 byte integer vectors
        case TYschar32:
        case TYuchar32:
        case TYshort16:
        case TYushort16:
        case TYlong8:
        case TYulong8:
        case TYllong4:
        case TYullong4:
            return aligned ? STODQA : STODQU;           // MOVDQA / MOVDQU

        default:
            printf("tym = 0x%x\n", tym);
            assert(0);
    }
}
808 
809 
810 /************************************
811  * Get correct XMM operator based on type and operator.
812  */
813 
814 private opcode_t xmmoperator(tym_t tym, OPER oper)
815 {
816     tym = tybasic(tym);
817     opcode_t op;
818     switch (oper)
819     {
820         case OPadd:
821         case OPaddass:
822         case OPpostinc:
823             switch (tym)
824             {
825                 case TYfloat:
826                 case TYifloat:  op = ADDSS;  break;
827                 case TYdouble:
828                 case TYidouble: op = ADDSD;  break;
829 
830                 // SIMD vector types
831                 case TYfloat8:
832                 case TYfloat4:  op = ADDPS;  break;
833                 case TYdouble4:
834                 case TYdouble2: op = ADDPD;  break;
835                 case TYschar32:
836                 case TYuchar32:
837                 case TYschar16:
838                 case TYuchar16: op = PADDB;  break;
839                 case TYshort16:
840                 case TYushort16:
841                 case TYshort8:
842                 case TYushort8: op = PADDW;  break;
843                 case TYlong8:
844                 case TYulong8:
845                 case TYlong4:
846                 case TYulong4:  op = PADDD;  break;
847                 case TYllong4:
848                 case TYullong4:
849                 case TYllong2:
850                 case TYullong2: op = PADDQ;  break;
851 
852                 default:
853                     printf("tym = x%x\n", tym);
854                     assert(0);
855             }
856             break;
857 
858         case OPmin:
859         case OPminass:
860         case OPpostdec:
861             switch (tym)
862             {
863                 case TYfloat:
864                 case TYifloat:  op = SUBSS;  break;
865                 case TYdouble:
866                 case TYidouble: op = SUBSD;  break;
867 
868                 // SIMD vector types
869                 case TYfloat8:
870                 case TYfloat4:  op = SUBPS;  break;
871                 case TYdouble4:
872                 case TYdouble2: op = SUBPD;  break;
873                 case TYschar32:
874                 case TYuchar32:
875                 case TYschar16:
876                 case TYuchar16: op = PSUBB;  break;
877                 case TYshort16:
878                 case TYushort16:
879                 case TYshort8:
880                 case TYushort8: op = PSUBW;  break;
881                 case TYlong8:
882                 case TYulong8:
883                 case TYlong4:
884                 case TYulong4:  op = PSUBD;  break;
885                 case TYllong4:
886                 case TYullong4:
887                 case TYllong2:
888                 case TYullong2: op = PSUBQ;  break;
889 
890                 default:        assert(0);
891             }
892             break;
893 
894         case OPmul:
895         case OPmulass:
896             switch (tym)
897             {
898                 case TYfloat:
899                 case TYifloat:  op = MULSS;  break;
900                 case TYdouble:
901                 case TYidouble: op = MULSD;  break;
902 
903                 // SIMD vector types
904                 case TYfloat8:
905                 case TYfloat4:  op = MULPS;  break;
906                 case TYdouble4:
907                 case TYdouble2: op = MULPD;  break;
908                 case TYshort16:
909                 case TYushort16:
910                 case TYshort8:
911                 case TYushort8: op = PMULLW; break;
912                 case TYlong8:
913                 case TYulong8:
914                 case TYlong4:
915                 case TYulong4:  op = PMULLD; break;
916 
917                 default:        assert(0);
918             }
919             break;
920 
921         case OPdiv:
922         case OPdivass:
923             switch (tym)
924             {
925                 case TYfloat:
926                 case TYifloat:  op = DIVSS;  break;
927                 case TYdouble:
928                 case TYidouble: op = DIVSD;  break;
929 
930                 // SIMD vector types
931                 case TYfloat8:
932                 case TYfloat4:  op = DIVPS;  break;
933                 case TYdouble4:
934                 case TYdouble2: op = DIVPD;  break;
935 
936                 default:        assert(0);
937             }
938             break;
939 
940         case OPor:
941         case OPorass:
942             switch (tym)
943             {
944                 // SIMD vector types
945                 case TYschar16:
946                 case TYuchar16:
947                 case TYshort8:
948                 case TYushort8:
949                 case TYlong4:
950                 case TYulong4:
951                 case TYllong2:
952                 case TYullong2:
953                 case TYschar32:
954                 case TYuchar32:
955                 case TYshort16:
956                 case TYushort16:
957                 case TYlong8:
958                 case TYulong8:
959                 case TYllong4:
960                 case TYullong4: op = POR; break;
961 
962                 default:        assert(0);
963             }
964             break;
965 
966         case OPand:
967         case OPandass:
968             switch (tym)
969             {
970                 // SIMD vector types
971                 case TYschar16:
972                 case TYuchar16:
973                 case TYshort8:
974                 case TYushort8:
975                 case TYlong4:
976                 case TYulong4:
977                 case TYllong2:
978                 case TYullong2:
979                 case TYschar32:
980                 case TYuchar32:
981                 case TYshort16:
982                 case TYushort16:
983                 case TYlong8:
984                 case TYulong8:
985                 case TYllong4:
986                 case TYullong4: op = PAND; break;
987 
988                 default:        assert(0);
989             }
990             break;
991 
992         case OPxor:
993         case OPxorass:
994             switch (tym)
995             {
996                 // SIMD vector types
997                 case TYschar16:
998                 case TYuchar16:
999                 case TYshort8:
1000                 case TYushort8:
1001                 case TYlong4:
1002                 case TYulong4:
1003                 case TYllong2:
1004                 case TYullong2:
1005                 case TYschar32:
1006                 case TYuchar32:
1007                 case TYshort16:
1008                 case TYushort16:
1009                 case TYlong8:
1010                 case TYulong8:
1011                 case TYllong4:
1012                 case TYullong4: op = PXOR; break;
1013 
1014                 default:        assert(0);
1015             }
1016             break;
1017 
1018         case OPlt:
1019         case OPle:
1020         case OPgt:
1021         case OPge:
1022         case OPne:
1023         case OPeqeq:
1024         case OPunord:        /* !<>=         */
1025         case OPlg:           /* <>           */
1026         case OPleg:          /* <>=          */
1027         case OPule:          /* !>           */
1028         case OPul:           /* !>=          */
1029         case OPuge:          /* !<           */
1030         case OPug:           /* !<=          */
1031         case OPue:           /* !<>          */
1032         case OPngt:
1033         case OPnge:
1034         case OPnlt:
1035         case OPnle:
1036         case OPord:
1037         case OPnlg:
1038         case OPnleg:
1039         case OPnule:
1040         case OPnul:
1041         case OPnuge:
1042         case OPnug:
1043         case OPnue:
1044             switch (tym)
1045             {
1046                 case TYfloat:
1047                 case TYifloat:  op = UCOMISS;  break;
1048                 case TYdouble:
1049                 case TYidouble: op = UCOMISD;  break;
1050 
1051                 default:        assert(0);
1052             }
1053             break;
1054 
1055         default:
1056             assert(0);
1057     }
1058     return op;
1059 }
1060 
/***************
 * Generate code for OPvector, i.e. a `__simd()` style intrinsic.
 *
 * The operand tree under `e` is flattened into an array
 * `[opcode, op1, op2 (optional), imm8 (optional)]` and one of three
 * instruction shapes is emitted:
 *      op xmm,imm8         (3 params, op2 is a TYuchar constant)
 *      op xmm,mem          (2 params, xmm is write-only)
 *      op xmm,mem[,imm8]   (3 or 4 params, xmm is read and written)
 *
 * Params:
 *      cdb = code sink
 *      e = the OPvector elem
 *      pretregs = in: acceptable result registers, 0 means evaluate the
 *                 operands for side effects only
 */
void cdvector(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
{
    /* e should look like one of:
     *    vector
     *      |
     *    param
     *    /   \
     *  param op2
     *  /   \
     * op   op1
     */

    if (!config.fpxmmregs)
    {   printf("SIMD operations not supported on this platform\n");
        exit(1);
    }

    // Flatten the OPparam tree into a C-heap array; it must be free()d on
    // every path out of this function.
    const n = el_nparams(e.EV.E1);
    elem **params = cast(elem **)malloc(n * (elem *).sizeof);
    assert(params);
    elem **tmp = params;
    el_paramArray(&tmp, e.EV.E1);

static if (0)
{
    printf("cdvector()\n");
    for (int i = 0; i < n; i++)
    {
        printf("[%d]: ", i);
        elem_print(params[i]);
    }
}

    if (*pretregs == 0)
    {   /* Evaluate for side effects only
         */
        foreach (i; 0 .. n)
        {
            codelem(cdb,params[i], pretregs, false);
            *pretregs = 0;      // in case they got set
        }
        free(params);           // was leaked on this early return
        return;
    }

    assert(n >= 2 && n <= 4);

    elem *eop = params[0];
    elem *op1 = params[1];
    elem *op2 = null;
    tym_t ty2 = 0;
    if (n >= 3)
    {   op2 = params[2];
        ty2 = tybasic(op2.Ety);
    }

    auto op = cast(opcode_t)el_tolong(eop);
    debug assert(!isXMMstore(op));      // stores are handled by cdvecsto()
    tym_t ty1 = tybasic(op1.Ety);

    regm_t retregs;
    if (n == 3 && ty2 == TYuchar && op2.Eoper == OPconst)
    {   // Handle: op xmm,imm8

        retregs = *pretregs & XMMREGS;
        if (!retregs)
            retregs = XMMREGS;
        codelem(cdb,op1,&retregs,false); // eval left leaf
        const reg = findreg(retregs);

        // The immediate shift forms share three opcodes; the operation is
        // selected by the value `r` placed in the reg field of the modregrm byte.
        int r;
        switch (op)
        {
            case PSLLD:  r = 6; op = 0x660F72;  break;
            case PSLLQ:  r = 6; op = 0x660F73;  break;
            case PSLLW:  r = 6; op = 0x660F71;  break;
            case PSRAD:  r = 4; op = 0x660F72;  break;
            case PSRAW:  r = 4; op = 0x660F71;  break;
            case PSRLD:  r = 2; op = 0x660F72;  break;
            case PSRLQ:  r = 2; op = 0x660F73;  break;
            case PSRLW:  r = 2; op = 0x660F71;  break;
            case PSRLDQ: r = 3; op = 0x660F73;  break;
            case PSLLDQ: r = 7; op = 0x660F73;  break;

            default:
                printf("op = x%x\n", op);
                assert(0);
        }
        getregs(cdb,retregs);
        cdb.genc2(op,modregrmx(3,r,reg-XMM0), cast(uint)el_tolong(op2));
    }
    else if (n == 2)
    {   /* Handle: op xmm,mem
         * where xmm is written only, not read
         */
        code cs;

        if ((op1.Eoper == OPind && !op1.Ecount) || op1.Eoper == OPvar)
        {
            getlvalue(cdb,&cs, op1, RMload);     // get addressing mode
        }
        else
        {
            // Source is not directly addressable: evaluate it into an XMM
            // register and encode that register as the r/m operand.
            regm_t rretregs = XMMREGS;
            codelem(cdb,op1, &rretregs, false);
            const rreg = findreg(rretregs) - XMM0;
            cs.Irm = modregrm(3,0,rreg & 7);
            cs.Iflags = 0;
            cs.Irex = 0;
            if (rreg & 8)
                cs.Irex |= REX_B;
        }

        retregs = *pretregs & XMMREGS;
        if (!retregs)
            retregs = XMMREGS;
        reg_t reg;
        allocreg(cdb,&retregs, &reg, e.Ety);
        code_newreg(&cs, reg - XMM0);
        cs.Iop = op;
        cdb.gen(&cs);
    }
    else if (n == 3 || n == 4)
    {   /* Handle:
         *      op xmm,mem        // n = 3
         *      op xmm,mem,imm8   // n = 4
         * Both xmm and mem are operands, evaluate xmm first.
         */

        code cs;

        retregs = *pretregs & XMMREGS;
        if (!retregs)
            retregs = XMMREGS;
        codelem(cdb,op1,&retregs,false); // eval left leaf
        const reg = findreg(retregs);

        if ((op2.Eoper == OPind && !op2.Ecount) || op2.Eoper == OPvar)
        {
            getlvalue(cdb,&cs, op2, RMload | retregs);     // get addressing mode
        }
        else
        {
            // Evaluate op2 into a different XMM register while keeping
            // the register holding op1 intact.
            regm_t rretregs = XMMREGS & ~retregs;
            scodelem(cdb, op2, &rretregs, retregs, true);
            const rreg = findreg(rretregs) - XMM0;
            cs.Irm = modregrm(3,0,rreg & 7);
            cs.Iflags = 0;
            cs.Irex = 0;
            if (rreg & 8)
                cs.Irex |= REX_B;
        }

        getregs(cdb,retregs);
        if (n == 4)
        {
            // Only these opcodes are accepted with a trailing imm8
            switch (op)
            {
                case CMPPD:   case CMPSS:   case CMPSD:   case CMPPS:
                case PSHUFD:  case PSHUFHW: case PSHUFLW:
                case BLENDPD: case BLENDPS: case DPPD:    case DPPS:
                case MPSADBW: case PBLENDW:
                case ROUNDPD: case ROUNDPS: case ROUNDSD: case ROUNDSS:
                case SHUFPD:  case SHUFPS:
                    break;
                default:
                    printf("op = x%x\n", op);
                    assert(0);
            }
            elem *imm8 = params[3];
            cs.IFL2 = FLconst;
version (MARS)
{
            if (imm8.Eoper != OPconst)
            {
                error(imm8.Esrcpos.Sfilename, imm8.Esrcpos.Slinnum, imm8.Esrcpos.Scharnum, "last parameter to `__simd()` must be a constant");
                cs.IEV2.Vsize_t = 0;    // keep generating code after the error
            }
            else
                cs.IEV2.Vsize_t = cast(targ_size_t)el_tolong(imm8);
}
else
{
            cs.IEV2.Vsize_t = cast(targ_size_t)el_tolong(imm8);
}
        }
        code_newreg(&cs, reg - XMM0);
        cs.Iop = op;
        cdb.gen(&cs);
    }
    else
        assert(0);
    fixresult(cdb,e,retregs,pretregs);
    free(params);
    freenode(e);
}
1255 
/***************
 * Generate code for a vector "store" operation (OPvecsto).
 *
 * Expected tree shape:
 *  (op1 OPvecsto (op OPparam op2))
 * `op` is a constant holding the store instruction (one of the STOxxxx
 * opcodes), `op1` and `op2` are handed to xmmeq() as its two operands.
 *
 * Params:
 *      cdb = code sink
 *      e = the OPvecsto elem
 *      pretregs = passed through to xmmeq()
 */
void cdvecsto(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
{
    //printf("cdvecsto()\n");
    //elem_print(e);
    elem *param = e.EV.E2;      // the OPparam node: (op OPparam op2)

    const storeOp = cast(opcode_t)el_tolong(param.EV.E1);
    debug assert(isXMMstore(storeOp));

    xmmeq(cdb, e, storeOp, e.EV.E1, param.EV.E2, pretregs);
}
1273 
/***************
 * Helper for cdvecfill(): emit a broadcast whose source is the memory
 * operand addressed by e1.
 * Params:
 *      cdb = code sink
 *      e1 = source elem, must resolve to a memory addressing mode
 *      pretregs = in: candidate XMM registers, out: the register allocated
 *      ty = basic vector type of the result
 *      op = broadcast opcode to emit
 *      clearOpsize = also strip CFopsize from the instruction flags
 */
private void vecfillFromMem(ref CodeBuilder cdb, elem *e1, regm_t *pretregs, tym_t ty, opcode_t op, bool clearOpsize)
{
    code cs;
    getlvalue(cdb,&cs, e1, 0);         // get addressing mode
    assert((cs.Irm & 0xC0) != 0xC0);   // AVX1 doesn't have register source operands
    reg_t reg;
    allocreg(cdb,pretregs,&reg,ty);
    cs.Iop = op;
    cs.Irex &= ~REX_W;
    if (clearOpsize)
        cs.Iflags &= ~CFopsize;
    code_newreg(&cs,reg - XMM0);
    checkSetVex(&cs,ty);
    cdb.gen(&cs);
}

/***************
 * Helper for cdvecfill(): for 32 byte vector types, copy the already
 * filled low 128 bits of `reg` into its upper 128 bits.
 * Does nothing for 16 byte types.
 * Params:
 *      cdb = code sink
 *      reg = XMM register number (0 based, i.e. XMM0 already subtracted)
 *      ty = basic vector type of the result
 */
private void vecfillInsertHigh(ref CodeBuilder cdb, reg_t reg, tym_t ty)
{
    if (tysize(ty) == 32)
    {
        // VINSERTF128 YMM,YMM,XMM,1
        cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1);
        checkSetVex(cdb.last(), ty);
    }
}

/***************
 * Generate code for OPvecfill (broadcast).
 * OPvecfill takes the single value in e1 and
 * fills the vector type with it.
 *
 * For each element type the strategy is chosen from config.avx:
 * a broadcast straight from memory when e1 is an unshared OPind and a
 * suitable instruction is available, otherwise the scalar is evaluated
 * into a register and replicated with a broadcast, shuffle or unpack
 * sequence, followed by VINSERTF128 for 32 byte vectors.
 *
 * Params:
 *      cdb = code sink
 *      e = the OPvecfill elem
 *      pretregs = in: acceptable result registers, out: set by fixresult()
 */
void cdvecfill(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
{
    //printf("cdvecfill(e = %p, *pretregs = %s)\n",e,regm_str(*pretregs));

    regm_t retregs = *pretregs & XMMREGS;
    if (!retregs)
        retregs = XMMREGS;

    elem *e1 = e.EV.E1;

    // true if e1 can be used directly as a memory operand
    const bool memSrc = e1.Eoper == OPind && !e1.Ecount;

    const ty = tybasic(e.Ety);
    switch (ty)
    {
        case TYfloat4:
        case TYfloat8:
            if (config.avx && memSrc)
            {
                // VBROADCASTSS X/YMM,MEM
                vecfillFromMem(cdb, e1, &retregs, ty, VBROADCASTSS, false);
            }
            else
            {
                codelem(cdb,e1,&retregs,false); // eval left leaf
                const reg = cast(reg_t)(findreg(retregs) - XMM0);
                getregs(cdb,retregs);
                if (config.avx >= 2)
                {
                    // VBROADCASTSS X/YMM,XMM
                    cdb.gen2(VBROADCASTSS, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), ty);
                }
                else
                {
                    // (V)SHUFPS XMM,XMM,0
                    cdb.genc2(SHUFPS, modregxrmx(3,reg,reg), 0);
                    checkSetVex(cdb.last(), ty);
                    vecfillInsertHigh(cdb, reg, ty);
                }
            }
            break;

        case TYdouble2:
        case TYdouble4:
            if (config.avx && tysize(ty) == 32 && memSrc)
            {
                // VBROADCASTSD YMM,MEM
                vecfillFromMem(cdb, e1, &retregs, ty, VBROADCASTSD, false);
            }
            else
            {
                codelem(cdb,e1,&retregs,false); // eval left leaf
                const reg = cast(reg_t)(findreg(retregs) - XMM0);
                getregs(cdb,retregs);
                if (config.avx >= 2 && tysize(ty) == 32)
                {
                    // VBROADCASTSD YMM,XMM
                    cdb.gen2(VBROADCASTSD, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), ty);
                }
                else
                {
                    // (V)UNPCKLPD XMM,XMM
                    cdb.gen2(UNPCKLPD, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), TYdouble2); // AVX-128
                    vecfillInsertHigh(cdb, reg, ty);
                }
            }
            break;

        case TYschar16:
        case TYuchar16:
        case TYschar32:
        case TYuchar32:
            if (config.avx >= 2 && memSrc)
            {
                // VPBROADCASTB X/YMM,MEM
                vecfillFromMem(cdb, e1, &retregs, ty, VPBROADCASTB, false);
            }
            else
            {
                // Evaluate the scalar into a general purpose register first
                regm_t regm = ALLREGS;
                codelem(cdb,e1,&regm,true); // eval left leaf
                const r = findreg(regm);

                reg_t reg;
                allocreg(cdb,&retregs,&reg, e.Ety);
                reg -= XMM0;
                // (V)MOVD reg,r
                cdb.gen2(LODD,modregxrmx(3,reg,r));
                checkSetVex(cdb.last(), TYushort8);
                if (config.avx >= 2)
                {
                    // VPBROADCASTB X/YMM,XMM
                    cdb.gen2(VPBROADCASTB, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), ty);
                }
                else
                {
                    if (config.avx)
                    {
                        // Shuffle with an all-zero control register so that
                        // every byte selects byte 0
                        reg_t zeroreg;
                        regm = XMMREGS & ~retregs;
                        // VPXOR XMM1,XMM1,XMM1
                        allocreg(cdb,&regm,&zeroreg, ty);
                        zeroreg -= XMM0;
                        cdb.gen2(PXOR, modregxrmx(3,zeroreg,zeroreg));
                        checkSetVex(cdb.last(), TYuchar16); // AVX-128
                        // VPSHUFB XMM,XMM,XMM1
                        cdb.gen2(PSHUFB, modregxrmx(3,reg,zeroreg));
                        checkSetVex(cdb.last(), TYuchar16); // AVX-128
                    }
                    else
                    {
                        // PUNPCKLBW XMM,XMM
                        cdb.gen2(PUNPCKLBW, modregxrmx(3,reg,reg));
                        // PUNPCKLWD XMM,XMM
                        cdb.gen2(PUNPCKLWD, modregxrmx(3,reg,reg));
                        // PSHUFD XMM,XMM,0
                        cdb.genc2(PSHUFD, modregxrmx(3,reg,reg), 0);
                    }
                    vecfillInsertHigh(cdb, reg, ty);
                }
            }
            break;

        case TYshort8:
        case TYushort8:
        case TYshort16:
        case TYushort16:
            if (config.avx >= 2 && memSrc)
            {
                // VPBROADCASTW X/YMM,MEM
                // CFopsize is stripped here; presumably getlvalue() sets it
                // for 16 bit operands - TODO confirm
                vecfillFromMem(cdb, e1, &retregs, ty, VPBROADCASTW, true);
            }
            else
            {
                // Evaluate the scalar into a general purpose register first
                regm_t regm = ALLREGS;
                codelem(cdb,e1,&regm,true); // eval left leaf
                reg_t r = findreg(regm);

                reg_t reg;
                allocreg(cdb,&retregs,&reg, e.Ety);
                reg -= XMM0;
                // (V)MOVD reg,r
                cdb.gen2(LODD,modregxrmx(3,reg,r));
                checkSetVex(cdb.last(), TYushort8);
                if (config.avx >= 2)
                {
                    // VPBROADCASTW X/YMM,XMM
                    cdb.gen2(VPBROADCASTW, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), ty);
                }
                else
                {
                    // (V)PUNPCKLWD XMM,XMM
                    cdb.gen2(PUNPCKLWD, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), TYushort8); // AVX-128
                    // (V)PSHUFD XMM,XMM,0
                    cdb.genc2(PSHUFD, modregxrmx(3,reg,reg), 0);
                    checkSetVex(cdb.last(), TYushort8); // AVX-128
                    vecfillInsertHigh(cdb, reg, ty);
                }
            }
            break;

        case TYlong8:
        case TYulong8:
        case TYlong4:
        case TYulong4:
            if (config.avx && memSrc)
            {
                // VPBROADCASTD/VBROADCASTSS X/YMM,MEM
                vecfillFromMem(cdb, e1, &retregs, ty,
                               config.avx >= 2 ? VPBROADCASTD : VBROADCASTSS, false);
            }
            else
            {
                codelem(cdb,e1,&retregs,true); // eval left leaf
                const reg = cast(reg_t)(findreg(retregs) - XMM0);
                getregs(cdb,retregs);
                if (config.avx >= 2)
                {
                    // VPBROADCASTD X/YMM,XMM
                    cdb.gen2(VPBROADCASTD, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), ty);
                }
                else
                {
                    // (V)PSHUFD XMM,XMM,0
                    cdb.genc2(PSHUFD, modregxrmx(3,reg,reg), 0);
                    checkSetVex(cdb.last(), TYulong4); // AVX-128
                    vecfillInsertHigh(cdb, reg, ty);
                }
            }
            break;

        case TYllong2:
        case TYullong2:
        case TYllong4:
        case TYullong4:
            if (memSrc)
            {
                // VPBROADCASTQ/VBROADCASTSD/(V)PUNPCKLQDQ X/YMM,MEM
                // NOTE(review): when PUNPCKLQDQ is selected the destination
                // register is freshly allocated and never loaded, yet the
                // instruction looks like it also reads its destination -
                // confirm the low quadword of the result is correct.
                vecfillFromMem(cdb, e1, &retregs, ty,
                               config.avx >= 2 ? VPBROADCASTQ : tysize(ty) == 32 ? VBROADCASTSD : PUNPCKLQDQ,
                               false);
            }
            else
            {
                codelem(cdb,e1,&retregs,true); // eval left leaf
                const reg = cast(reg_t)(findreg(retregs) - XMM0);
                getregs(cdb,retregs);
                if (config.avx >= 2)
                {
                    // VPBROADCASTQ X/YMM,XMM
                    cdb.gen2(VPBROADCASTQ, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), ty);
                }
                else
                {
                    // (V)PUNPCKLQDQ XMM,XMM
                    cdb.genc2(PUNPCKLQDQ, modregxrmx(3,reg,reg), 0);
                    checkSetVex(cdb.last(), TYullong2); // AVX-128
                    vecfillInsertHigh(cdb, reg, ty);
                }
            }
            break;

        default:
            assert(0);
    }

    fixresult(cdb,e,retregs,pretregs);
}
1610 
/*******************************************
 * Conservative alignment test for a vector lvalue.
 *
 * Only a vector-typed OPvar can be proven misaligned here; every other
 * elem is assumed to be aligned.
 * Params:
 *      e = lvalue
 * Returns:
 *      false if e is definitely not aligned to its type's alignment,
 *      true otherwise
 */

bool xmmIsAligned(elem *e)
{
    // Nothing can be proven unless this is a direct reference to a vector variable
    if (!tyvector(e.Ety) || e.Eoper != OPvar)
        return true;        // assume aligned

    Symbol *sym = e.EV.Vsym;
    const alignsz = tyalignsize(e.Ety);

    const bool misaligned =
        Symbol_Salignsize(sym) < alignsz ||             // symbol is aligned less strictly than the type
        (e.EV.Voffset & (alignsz - 1)) != 0 ||          // offset into the symbol is not a multiple
        alignsz > STACKALIGN;                           // type alignment exceeds STACKALIGN

    return !misaligned;
}
1634 
/**************************************
 * A VEX prefix is encoded in either 2 or 3 bytes.
 * Set CFvex3 on the instruction when its VEX fields
 * require the 3 byte form.
 * Params:
 *      c = instruction whose Ivex fields have already been filled in
 */

void checkSetVex3(code *c)
{
    // See Intel Vol. 2A 2.3.5.6

    // W, X, B and a map selector other than 0/1 are tested for all targets
    const bool fieldsNeed3 = c.Ivex.w || !c.Ivex.x || !c.Ivex.b || c.Ivex.mmmm > 0x1;

    // Additional test applied only when not generating 64 bit code
    const bool mode32Needs3 = !I64 && (c.Ivex.r || !(c.Ivex.vvvv & 8));

    if (fieldsNeed3 || mode32Needs3)
        c.Iflags |= CFvex3;
}
1650 
/*************************************
 * Convert an SSE-encoded instruction to its VEX-encoded form, in place,
 * when AVX code generation is enabled or the operand is 32 bytes wide.
 * Otherwise the instruction is left untouched.
 * Params:
 *      c = code
 *      ty = type of operand; a 32 byte type sets VEX.L
 */

void checkSetVex(code *c, tym_t ty)
{
    // Pack the VEX map select (mm) and SIMD prefix (pp) fields into Iop layout
    static uint mapAndPrefix(uint mm, uint pp) { return (mm << 16) | (pp << 8); }

    if (!(config.avx || tysize(ty) == 32))
        return;

    // Default: vex.vvvv names the same register as the modregrm reg field
    uint vvvvReg = (c.Irm >> 3) & 7;
    if (c.Irex & REX_R)
        vvvvReg |= 8;

    // TODO: This is too simplistic, depending on the instruction, vex.vvvv
    // encodes NDS, NDD, DDS, or no operand (NOO). The code below assumes
    // NDS (non-destructive source), except for the incomplete list of 2
    // operand instructions (NOO) handled by the switch.
    switch (c.Iop)
    {
        case LODSS:
        case LODSD:
        case STOSS:
        case STOSD:
            // Only the memory forms are 2 operand; register-register keeps vvvv
            if ((c.Irm & 0xC0) != 0xC0)
                vvvvReg = 0;
            break;

        case LODAPS:    case LODUPS:
        case LODAPD:    case LODUPD:
        case LODDQA:    case LODDQU:
        case LODD:      case LODQ:
        case STOAPS:    case STOUPS:
        case STOAPD:    case STOUPD:
        case STODQA:    case STODQU:
        case STOD:      case STOQ:
        case COMISS:    case COMISD:
        case UCOMISS:   case UCOMISD:
        case MOVDDUP:   case MOVSHDUP:  case MOVSLDUP:
        case VBROADCASTSS:
        case PSHUFD:    case PSHUFHW:   case PSHUFLW:
        case VPBROADCASTB:
        case VPBROADCASTW:
        case VPBROADCASTD:
        case VPBROADCASTQ:
            vvvvReg = 0;        // for 2 operand vex instructions
            break;

        case VBROADCASTSD:
        case VBROADCASTF128:
        case VBROADCASTI128:
            assert(tysize(ty) == 32); // AVX-256 only instructions
            vvvvReg = 0;        // for 2 operand vex instructions
            break;

        case NOP:
            return;             // ignore

        default:
            break;
    }

    // Rebuild the opcode: 0xC4 marker, map/prefix fields, original low opcode byte
    opcode_t vexOp = 0xC4000000 | (c.Iop & 0xFF);
    switch (c.Iop & 0xFFFFFF00)
    {
        case 0x00000F00: vexOp |= mapAndPrefix(1,0); break;
        case 0x00660F00: vexOp |= mapAndPrefix(1,1); break;
        case 0x00F30F00: vexOp |= mapAndPrefix(1,2); break;
        case 0x00F20F00: vexOp |= mapAndPrefix(1,3); break;
        case 0x660F3800: vexOp |= mapAndPrefix(2,1); break;
        case 0x660F3A00: vexOp |= mapAndPrefix(3,1); break;
        default:
            printf("Iop = %x\n", c.Iop);
            assert(0);
    }
    c.Iop = vexOp;

    // R, X, B and vvvv are stored inverted relative to the REX bits / register number
    c.Ivex.pfx = 0xC4;
    c.Ivex.r = !(c.Irex & REX_R);
    c.Ivex.x = !(c.Irex & REX_X);
    c.Ivex.b = !(c.Irex & REX_B);
    c.Ivex.w = (c.Irex & REX_W) != 0;
    c.Ivex.l = tysize(ty) == 32;

    c.Ivex.vvvv = cast(ushort)~vvvvReg;

    c.Iflags |= CFvex;
    checkSetVex3(c);
}
1757 
/**************************************
 * Load complex operand into XMM registers or flags or both.
 *
 * When the result is wanted in the XMM0/XMM1 pair and e is not a
 * constant, the two halves are loaded with separate scalar moves:
 * the first at offset 0 into XMM0, the second at offset tysize(half type)
 * into XMM1. Everything else is forwarded to cload87().
 *
 * Params:
 *      cdb = code sink
 *      e = complex operand
 *      pretregs = desired result registers, must include XMM registers or mPSW
 */

void cloadxmm(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
{
    //printf("e = %p, *pretregs = %s)\n", e, regm_str(*pretregs));
    //elem_print(e);
    assert(*pretregs & (XMMREGS | mPSW));

    const bool wantPair = *pretregs == (mXMM0 | mXMM1);
    if (!wantPair || e.Eoper == OPconst)
    {
        cload87(cdb, e, pretregs);
        return;
    }

    code cs = void;     // filled in by loadea()
    tym_t tym = tybasic(e.Ety);
    tym_t halfTy = tym == TYcdouble ? TYdouble : TYfloat;
    opcode_t opmv = xmmload(tym, xmmIsAligned(e));

    foreach (half; 0 .. 2)
    {
        const bool second = half != 0;

        regm_t destMask = second ? mXMM1 : mXMM0;
        const targ_size_t offset = second ? tysize(halfTy) : 0;
        const regm_t desmsk = second ? mXMM0 : 0;

        reg_t destReg;
        allocreg(cdb, &destMask, &destReg, halfTy);
        // MOVSS/MOVSD XMM0,data  then  MOVSS/MOVSD XMM1,data+offset
        loadea(cdb, e, &cs, opmv, destReg, offset, RMload, desmsk);
        checkSetVex(cdb.last(), halfTy);
    }
}
1791 
1792 }