/**
 * Compiler implementation of the
 * $(LINK2 http://www.dlang.org, D programming language).
 *
 * Copyright:   Copyright (C) 2011-2020 by The D Language Foundation, All Rights Reserved
 * Authors:     $(LINK2 http://www.digitalmars.com, Walter Bright)
 * License:     $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 * Source:      $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/backend/cgxmm.d, backend/cgxmm.d)
 */

module dmd.backend.cgxmm;

version (SCPP)
    version = COMPILE;
version (MARS)
    version = COMPILE;

version (COMPILE)
{

import core.stdc.stdio;
import core.stdc.stdlib;
import core.stdc.string;

import dmd.backend.cc;
import dmd.backend.cdef;
import dmd.backend.code;
import dmd.backend.code_x86;
import dmd.backend.codebuilder;
import dmd.backend.mem;
import dmd.backend.el;
import dmd.backend.global;
import dmd.backend.oper;
import dmd.backend.ty;
import dmd.backend.xmm;

version (SCPP)
    import dmd.backend.exh;
version (MARS)
    import dmd.backend.errors;


extern (C++):

nothrow:

int REGSIZE();

uint mask(uint m);

/*******************************************
 * Is operator a store operator?
 */

bool isXMMstore(opcode_t op)
{
    switch (op)
    {
    case STOSS: case STOAPS: case STOUPS:
    case STOSD: case STOAPD: case STOUPD:
    case STOD: case STOQ: case STODQA: case STODQU:
    case STOHPD: case STOHPS: case STOLPD: case STOLPS: return true;
    default: return false;
    }
}

/*******************************************
 * Move constant value into xmm register xreg.
 * Params:
 *      cdb = code sink
 *      xreg = destination XMM register
 *      sz = 4 or 8, width of the constant in bytes
 *      value = constant to load
 *      flags = unused at present
 */

private void movxmmconst(ref CodeBuilder cdb, reg_t xreg, uint sz, targ_size_t value, regm_t flags)
{
    /* Generate:
     *    MOV reg,value
     *    MOV xreg,reg
     * Not so efficient. We should at least do a PXOR for 0.
     */
    assert(mask(xreg) & XMMREGS);
    assert(sz == 4 || sz == 8);
    if (I32 && sz == 8)
    {
        // No 64-bit GP registers in 32-bit mode: stage the constant
        // in two 32-bit halves through the scratch float area.
        reg_t r;
        regm_t rm = ALLREGS;
        allocreg(cdb,&rm,&r,TYint);                     // allocate scratch register
        static union U { targ_size_t s; targ_long[2] l; }
        U u = void;
        u.l[1] = 0;
        u.s = value;
        targ_long *p = &u.l[0];
        movregconst(cdb,r,p[0],0);
        cdb.genfltreg(STO,r,0);                         // MOV floatreg,r
        movregconst(cdb,r,p[1],0);
        cdb.genfltreg(STO,r,4);                         // MOV floatreg+4,r

        const op = xmmload(TYdouble, true);
        cdb.genxmmreg(op,xreg,0,TYdouble);              // MOVSD XMMreg,floatreg
    }
    else
    {
        reg_t reg;
        // BUGFIX: '&reg' had been corrupted to the '®' character by
        // HTML-entity mangling ("&reg" is the registered-sign entity).
        regwithvalue(cdb,ALLREGS,value,&reg,(sz == 8) ? 64 : 0);
        cdb.gen2(LODD,modregxrmx(3,xreg-XMM0,reg));     // MOVD xreg,reg
        if (sz == 8)
            code_orrex(cdb.last(), REX_W);
        checkSetVex(cdb.last(), TYulong);
    }
}

/***********************************************
 * Do simple orthogonal operators for XMM registers.
 */

void orthxmm(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
{
    //printf("orthxmm(e = %p, *pretregs = %s)\n", e, regm_str(*pretregs));
    elem *e1 = e.EV.E1;
    elem *e2 = e.EV.E2;

    // float + ifloat is not actually addition
    if ((e.Eoper == OPadd || e.Eoper == OPmin) &&
        ((tyreal(e1.Ety) && tyimaginary(e2.Ety)) ||
         (tyreal(e2.Ety) && tyimaginary(e1.Ety))))
    {
        regm_t retregs = *pretregs & XMMREGS;
        if (!retregs)
            retregs = XMMREGS;

        regm_t rretregs;
        reg_t rreg;
        if (tyreal(e1.Ety))
        {
            const reg = findreg(retregs);
            rreg = findreg(retregs & ~mask(reg));
            retregs = mask(reg);
            rretregs = mask(rreg);
        }
        else
        {
            // Pick the second register, not the first
            rreg = findreg(retregs);
            rretregs = mask(rreg);
            const reg = findreg(retregs & ~rretregs);
            retregs = mask(reg);
        }
        assert(retregs && rretregs);

        codelem(cdb,e1,&retregs,false);                 // eval left leaf
        scodelem(cdb, e2, &rretregs, retregs, true);    // eval right leaf

        retregs |= rretregs;
        if (e.Eoper == OPmin)
        {
            // Subtraction of the imaginary part: flip its sign bit with XORPS/D
            regm_t nretregs = XMMREGS & ~retregs;
            reg_t sreg;                                 // hold sign bit
            const uint sz = tysize(e1.Ety);
            allocreg(cdb,&nretregs,&sreg,e2.Ety);
            targ_size_t signbit = 0x80000000;
            if (sz == 8)
                signbit = cast(targ_size_t)0x8000000000000000L;
            movxmmconst(cdb,sreg, sz, signbit, 0);
            getregs(cdb,nretregs);
            const opcode_t xop = (sz == 8) ? XORPD : XORPS;     // XORPD/S rreg,sreg
            cdb.gen2(xop,modregxrmx(3,rreg-XMM0,sreg-XMM0));
        }
        if (retregs != *pretregs)
            fixresult(cdb,e,retregs,pretregs);
        return;
    }

    regm_t retregs = *pretregs & XMMREGS;
    if (!retregs)
        retregs = XMMREGS;
    const constflag = OTrel(e.Eoper);
    codelem(cdb,e1,&retregs,constflag);                 // eval left leaf
    const reg = findreg(retregs);
    regm_t rretregs = XMMREGS & ~retregs;
    scodelem(cdb, e2, &rretregs, retregs, true);        // eval right leaf

    const rreg = findreg(rretregs);
    const op = xmmoperator(e1.Ety, e.Eoper);

    /* We should take advantage of mem addressing modes for OP XMM,MEM
     * but we do not at the moment.
     */
    if (OTrel(e.Eoper))
    {
        // Comparison: UCOMISS/UCOMISD sets flags only, no result register
        cdb.gen2(op,modregxrmx(3,rreg-XMM0,reg-XMM0));
        checkSetVex(cdb.last(), e1.Ety);
        return;
    }

    getregs(cdb,retregs);
    cdb.gen2(op,modregxrmx(3,reg-XMM0,rreg-XMM0));
    checkSetVex(cdb.last(), e1.Ety);
    if (retregs != *pretregs)
        fixresult(cdb,e,retregs,pretregs);
}


/************************
 * Generate code for an assignment using XMM registers.
202 * Params: 203 * opcode = store opcode to use, CMP means generate one 204 */ 205 206 void xmmeq(ref CodeBuilder cdb, elem *e, opcode_t op, elem *e1, elem *e2,regm_t *pretregs) 207 { 208 tym_t tymll; 209 int i; 210 code cs; 211 elem *e11; 212 bool regvar; /* true means evaluate into register variable */ 213 regm_t varregm; 214 targ_int postinc; 215 216 //printf("xmmeq(e1 = %p, e2 = %p, *pretregs = %s)\n", e1, e2, regm_str(*pretregs)); 217 tym_t tyml = tybasic(e1.Ety); /* type of lvalue */ 218 regm_t retregs = *pretregs; 219 220 if (!(retregs & XMMREGS)) 221 retregs = XMMREGS; // pick any XMM reg 222 223 bool aligned = xmmIsAligned(e1); 224 // If default, select store opcode 225 cs.Iop = (op == CMP) ? xmmstore(tyml, aligned) : op; 226 regvar = false; 227 varregm = 0; 228 if (config.flags4 & CFG4optimized) 229 { 230 // Be careful of cases like (x = x+x+x). We cannot evaluate in 231 // x if x is in a register. 232 reg_t varreg; 233 if (isregvar(e1,&varregm,&varreg) && // if lvalue is register variable 234 doinreg(e1.EV.Vsym,e2) && // and we can compute directly into it 235 varregm & XMMREGS 236 ) 237 { regvar = true; 238 retregs = varregm; // evaluate directly in target register 239 } 240 } 241 if (*pretregs & mPSW && OTleaf(e1.Eoper)) // if evaluating e1 couldn't change flags 242 { // Be careful that this lines up with jmpopcode() 243 retregs |= mPSW; 244 *pretregs &= ~mPSW; 245 } 246 scodelem(cdb,e2,&retregs,0,true); // get rvalue 247 248 // Look for special case of (*p++ = ...), where p is a register variable 249 if (e1.Eoper == OPind && 250 ((e11 = e1.EV.E1).Eoper == OPpostinc || e11.Eoper == OPpostdec) && 251 e11.EV.E1.Eoper == OPvar && 252 e11.EV.E1.EV.Vsym.Sfl == FLreg 253 ) 254 { 255 postinc = e11.EV.E2.EV.Vint; 256 if (e11.Eoper == OPpostdec) 257 postinc = -postinc; 258 getlvalue(cdb,&cs,e11,RMstore | retregs); 259 freenode(e11.EV.E2); 260 } 261 else 262 { postinc = 0; 263 getlvalue(cdb,&cs,e1,RMstore | retregs); // get lvalue (cl == CNIL if regvar) 264 } 
265 266 getregs_imm(cdb,regvar ? varregm : 0); 267 268 const reg = findreg(retregs & XMMREGS); 269 cs.Irm |= modregrm(0,(reg - XMM0) & 7,0); 270 if ((reg - XMM0) & 8) 271 cs.Irex |= REX_R; 272 273 // Do not generate mov from register onto itself 274 if (!(regvar && reg == XMM0 + ((cs.Irm & 7) | (cs.Irex & REX_B ? 8 : 0)))) 275 { 276 cdb.gen(&cs); // MOV EA+offset,reg 277 if (op == OPeq) 278 checkSetVex(cdb.last(), tyml); 279 } 280 281 if (e1.Ecount || // if lvalue is a CSE or 282 regvar) // rvalue can't be a CSE 283 { 284 getregs_imm(cdb,retregs); // necessary if both lvalue and 285 // rvalue are CSEs (since a reg 286 // can hold only one e at a time) 287 cssave(e1,retregs,!OTleaf(e1.Eoper)); // if lvalue is a CSE 288 } 289 290 fixresult(cdb,e,retregs,pretregs); 291 if (postinc) 292 { 293 const increg = findreg(idxregm(&cs)); // the register to increment 294 if (*pretregs & mPSW) 295 { // Use LEA to avoid touching the flags 296 uint rm = cs.Irm & 7; 297 if (cs.Irex & REX_B) 298 rm |= 8; 299 cdb.genc1(LEA,buildModregrm(2,increg,rm),FLconst,postinc); 300 if (tysize(e11.EV.E1.Ety) == 8) 301 code_orrex(cdb.last(), REX_W); 302 } 303 else if (I64) 304 { 305 cdb.genc2(0x81,modregrmx(3,0,increg),postinc); 306 if (tysize(e11.EV.E1.Ety) == 8) 307 code_orrex(cdb.last(), REX_W); 308 } 309 else 310 { 311 if (postinc == 1) 312 cdb.gen1(0x40 + increg); // INC increg 313 else if (postinc == -cast(targ_int)1) 314 cdb.gen1(0x48 + increg); // DEC increg 315 else 316 { 317 cdb.genc2(0x81,modregrm(3,0,increg),postinc); 318 } 319 } 320 } 321 freenode(e1); 322 } 323 324 /******************************** 325 * Generate code for conversion using SSE2 instructions. 
326 * 327 * OPs32_d 328 * OPs64_d (64-bit only) 329 * OPu32_d (64-bit only) 330 * OPd_f 331 * OPf_d 332 * OPd_s32 333 * OPd_s64 (64-bit only) 334 * 335 */ 336 337 void xmmcnvt(ref CodeBuilder cdb,elem *e,regm_t *pretregs) 338 { 339 //printf("xmmconvt: %p, %s\n", e, regm_str(*pretregs)); 340 opcode_t op = NoOpcode; 341 regm_t regs; 342 tym_t ty; 343 ubyte rex = 0; 344 bool zx = false; // zero extend uint 345 346 /* There are no ops for integer <. float/real conversions 347 * but there are instructions for them. In order to use these 348 * try to fuse chained conversions. Be careful not to loose 349 * precision for real to long. 350 */ 351 elem *e1 = e.EV.E1; 352 switch (e.Eoper) 353 { 354 case OPd_f: 355 if (e1.Eoper == OPs32_d) 356 { } 357 else if (I64 && e1.Eoper == OPs64_d) 358 rex = REX_W; 359 else if (I64 && e1.Eoper == OPu32_d) 360 { rex = REX_W; 361 zx = true; 362 } 363 else 364 { regs = XMMREGS; 365 op = CVTSD2SS; 366 ty = TYfloat; 367 break; 368 } 369 if (e1.Ecount) 370 { 371 regs = XMMREGS; 372 op = CVTSD2SS; 373 ty = TYfloat; 374 break; 375 } 376 // directly use si2ss 377 regs = ALLREGS; 378 e1 = e1.EV.E1; // fused operation 379 op = CVTSI2SS; 380 ty = TYfloat; 381 break; 382 383 case OPs32_d: goto Litod; 384 case OPs64_d: rex = REX_W; goto Litod; 385 case OPu32_d: rex = REX_W; zx = true; goto Litod; 386 Litod: 387 regs = ALLREGS; 388 op = CVTSI2SD; 389 ty = TYdouble; 390 break; 391 392 case OPd_s32: ty = TYint; goto Ldtoi; 393 case OPd_u32: ty = TYlong; if (I64) rex = REX_W; goto Ldtoi; 394 case OPd_s64: ty = TYlong; rex = REX_W; goto Ldtoi; 395 Ldtoi: 396 regs = XMMREGS; 397 switch (e1.Eoper) 398 { 399 case OPf_d: 400 if (e1.Ecount) 401 { 402 op = CVTTSD2SI; 403 break; 404 } 405 e1 = e1.EV.E1; // fused operation 406 op = CVTTSS2SI; 407 break; 408 case OPld_d: 409 if (e.Eoper == OPd_s64) 410 { 411 cnvt87(cdb,e,pretregs); // precision 412 return; 413 } 414 goto default; 415 416 default: 417 op = CVTTSD2SI; 418 break; 419 } 420 break; 421 422 case OPf_d: 
423 regs = XMMREGS; 424 op = CVTSS2SD; 425 ty = TYdouble; 426 break; 427 428 default: 429 assert(0); 430 } 431 assert(op != NoOpcode); 432 433 codelem(cdb,e1, ®s, false); 434 reg_t reg = findreg(regs); 435 if (isXMMreg(reg)) 436 reg -= XMM0; 437 else if (zx) 438 { assert(I64); 439 getregs(cdb,regs); 440 genregs(cdb,0x8B,reg,reg); // MOV reg,reg to zero upper 32-bit 441 // Don't use x89 because that will get optimized away 442 code_orflag(cdb.last(),CFvolatile); 443 } 444 445 regm_t retregs = *pretregs; 446 if (tyxmmreg(ty)) // target is XMM 447 { if (!(*pretregs & XMMREGS)) 448 retregs = XMMREGS; 449 } 450 else // source is XMM 451 { assert(regs & XMMREGS); 452 if (!(retregs & ALLREGS)) 453 retregs = ALLREGS; 454 } 455 456 reg_t rreg; 457 allocreg(cdb,&retregs,&rreg,ty); 458 if (isXMMreg(rreg)) 459 rreg -= XMM0; 460 461 cdb.gen2(op, modregxrmx(3,rreg,reg)); 462 assert(I64 || !rex); 463 if (rex) 464 code_orrex(cdb.last(), rex); 465 466 if (*pretregs != retregs) 467 fixresult(cdb,e,retregs,pretregs); 468 } 469 470 /******************************** 471 * Generate code for op= 472 */ 473 474 void xmmopass(ref CodeBuilder cdb,elem *e,regm_t *pretregs) 475 { elem *e1 = e.EV.E1; 476 elem *e2 = e.EV.E2; 477 tym_t ty1 = tybasic(e1.Ety); 478 const sz1 = _tysize[ty1]; 479 regm_t rretregs = XMMREGS & ~*pretregs; 480 if (!rretregs) 481 rretregs = XMMREGS; 482 483 codelem(cdb,e2,&rretregs,false); // eval right leaf 484 reg_t rreg = findreg(rretregs); 485 486 code cs; 487 regm_t retregs; 488 reg_t reg; 489 bool regvar = false; 490 if (config.flags4 & CFG4optimized) 491 { 492 // Be careful of cases like (x = x+x+x). We cannot evaluate in 493 // x if x is in a register. 
494 reg_t varreg; 495 regm_t varregm; 496 if (isregvar(e1,&varregm,&varreg) && // if lvalue is register variable 497 doinreg(e1.EV.Vsym,e2) // and we can compute directly into it 498 ) 499 { regvar = true; 500 retregs = varregm; 501 reg = varreg; // evaluate directly in target register 502 getregs(cdb,retregs); // destroy these regs 503 } 504 } 505 506 if (!regvar) 507 { 508 getlvalue(cdb,&cs,e1,rretregs); // get EA 509 retregs = *pretregs & XMMREGS & ~rretregs; 510 if (!retregs) 511 retregs = XMMREGS & ~rretregs; 512 allocreg(cdb,&retregs,®,ty1); 513 cs.Iop = xmmload(ty1, true); // MOVSD xmm,xmm_m64 514 code_newreg(&cs,reg - XMM0); 515 cdb.gen(&cs); 516 checkSetVex(cdb.last(), ty1); 517 } 518 519 const op = xmmoperator(e1.Ety, e.Eoper); 520 cdb.gen2(op,modregxrmx(3,reg-XMM0,rreg-XMM0)); 521 checkSetVex(cdb.last(), e1.Ety); 522 523 if (!regvar) 524 { 525 cs.Iop = xmmstore(ty1,true); // reverse operand order of MOVS[SD] 526 cdb.gen(&cs); 527 checkSetVex(cdb.last(), ty1); 528 } 529 530 if (e1.Ecount || // if lvalue is a CSE or 531 regvar) // rvalue can't be a CSE 532 { 533 getregs_imm(cdb,retregs); // necessary if both lvalue and 534 // rvalue are CSEs (since a reg 535 // can hold only one e at a time) 536 cssave(e1,retregs,!OTleaf(e1.Eoper)); // if lvalue is a CSE 537 } 538 539 fixresult(cdb,e,retregs,pretregs); 540 freenode(e1); 541 } 542 543 /******************************** 544 * Generate code for post increment and post decrement. 545 */ 546 547 void xmmpost(ref CodeBuilder cdb,elem *e,regm_t *pretregs) 548 { 549 elem *e1 = e.EV.E1; 550 elem *e2 = e.EV.E2; 551 tym_t ty1 = tybasic(e1.Ety); 552 553 regm_t retregs; 554 reg_t reg; 555 bool regvar = false; 556 if (config.flags4 & CFG4optimized) 557 { 558 // Be careful of cases like (x = x+x+x). We cannot evaluate in 559 // x if x is in a register. 
560 reg_t varreg; 561 regm_t varregm; 562 if (isregvar(e1,&varregm,&varreg) && // if lvalue is register variable 563 doinreg(e1.EV.Vsym,e2) // and we can compute directly into it 564 ) 565 { 566 regvar = true; 567 retregs = varregm; 568 reg = varreg; // evaluate directly in target register 569 getregs(cdb,retregs); // destroy these regs 570 } 571 } 572 573 code cs; 574 if (!regvar) 575 { 576 getlvalue(cdb,&cs,e1,0); // get EA 577 retregs = XMMREGS & ~*pretregs; 578 if (!retregs) 579 retregs = XMMREGS; 580 allocreg(cdb,&retregs,®,ty1); 581 cs.Iop = xmmload(ty1, true); // MOVSD xmm,xmm_m64 582 code_newreg(&cs,reg - XMM0); 583 cdb.gen(&cs); 584 checkSetVex(cdb.last(), ty1); 585 } 586 587 // Result register 588 regm_t resultregs = XMMREGS & *pretregs & ~retregs; 589 if (!resultregs) 590 resultregs = XMMREGS & ~retregs; 591 reg_t resultreg; 592 allocreg(cdb,&resultregs, &resultreg, ty1); 593 594 cdb.gen2(xmmload(ty1,true),modregxrmx(3,resultreg-XMM0,reg-XMM0)); // MOVSS/D resultreg,reg 595 checkSetVex(cdb.last(), ty1); 596 597 regm_t rretregs = XMMREGS & ~(*pretregs | retregs | resultregs); 598 if (!rretregs) 599 rretregs = XMMREGS & ~(retregs | resultregs); 600 codelem(cdb,e2,&rretregs,false); // eval right leaf 601 const rreg = findreg(rretregs); 602 603 const op = xmmoperator(e1.Ety, e.Eoper); 604 cdb.gen2(op,modregxrmx(3,reg-XMM0,rreg-XMM0)); // ADD reg,rreg 605 checkSetVex(cdb.last(), e1.Ety); 606 607 if (!regvar) 608 { 609 cs.Iop = xmmstore(ty1,true); // reverse operand order of MOVS[SD] 610 cdb.gen(&cs); 611 checkSetVex(cdb.last(), ty1); 612 } 613 614 if (e1.Ecount || // if lvalue is a CSE or 615 regvar) // rvalue can't be a CSE 616 { 617 getregs_imm(cdb,retregs); // necessary if both lvalue and 618 // rvalue are CSEs (since a reg 619 // can hold only one e at a time) 620 cssave(e1,retregs,!OTleaf(e1.Eoper)); // if lvalue is a CSE 621 } 622 623 fixresult(cdb,e,resultregs,pretregs); 624 freenode(e1); 625 } 626 627 /****************** 628 * Negate operator 629 */ 

void xmmneg(ref CodeBuilder cdb,elem *e,regm_t *pretregs)
{
    //printf("xmmneg()\n");
    //elem_print(e);
    assert(*pretregs);
    tym_t tyml = tybasic(e.EV.E1.Ety);
    int sz = _tysize[tyml];

    regm_t retregs = *pretregs & XMMREGS;
    if (!retregs)
        retregs = XMMREGS;

    /* Generate:
     *    MOV reg,e1
     *    MOV rreg,signbit
     *    XOR reg,rreg
     */
    codelem(cdb,e.EV.E1,&retregs,false);
    getregs(cdb,retregs);
    const reg = findreg(retregs);
    regm_t rretregs = XMMREGS & ~retregs;
    reg_t rreg;
    allocreg(cdb,&rretregs,&rreg,tyml);
    targ_size_t signbit = 0x80000000;
    if (sz == 8)
        signbit = cast(targ_size_t)0x8000000000000000L;
    movxmmconst(cdb,rreg, sz, signbit, 0);

    getregs(cdb,retregs);
    const op = (sz == 8) ? XORPD : XORPS;       // XORPD/S reg,rreg
    cdb.gen2(op,modregxrmx(3,reg-XMM0,rreg-XMM0));
    fixresult(cdb,e,retregs,pretregs);
}

/*****************************
 * Get correct load operator based on type.
 * It is important to use the right one even if the number of bits moved is the same,
 * as there are performance consequences for using the wrong one.
 * Params:
 *      tym = type of data to load
 *      aligned = for vectors, true if aligned to 16 bytes
 */

opcode_t xmmload(tym_t tym, bool aligned)
{
    opcode_t op;
    if (tysize(tym) == 32)
        aligned = false;
    switch (tybasic(tym))
    {
        case TYuint:
        case TYint:
        case TYlong:
        case TYulong:   op = LODD;  break;      // MOVD
        case TYfloat:
        case TYcfloat:
        case TYifloat:  op = LODSS; break;      // MOVSS
        case TYllong:
        case TYullong:  op = LODQ;  break;      // MOVQ
        case TYdouble:
        case TYcdouble:
        case TYidouble: op = LODSD; break;      // MOVSD

        case TYfloat8:
        case TYfloat4:  op = aligned ? LODAPS : LODUPS; break;  // MOVAPS / MOVUPS
        case TYdouble4:
        case TYdouble2: op = aligned ? LODAPD : LODUPD; break;  // MOVAPD / MOVUPD
        case TYschar16:
        case TYuchar16:
        case TYshort8:
        case TYushort8:
        case TYlong4:
        case TYulong4:
        case TYllong2:
        case TYullong2:
        case TYschar32:
        case TYuchar32:
        case TYshort16:
        case TYushort16:
        case TYlong8:
        case TYulong8:
        case TYllong4:
        case TYullong4: op = aligned ? LODDQA : LODDQU; break;  // MOVDQA / MOVDQU

        default:
            printf("tym = x%x\n", tym);
            assert(0);
    }
    return op;
}

/*****************************
 * Get correct store operator based on type.
 */

opcode_t xmmstore(tym_t tym, bool aligned)
{
    opcode_t op;
    switch (tybasic(tym))
    {
        case TYuint:
        case TYint:
        case TYlong:
        case TYulong:   op = STOD;  break;      // MOVD
        case TYfloat:
        case TYifloat:  op = STOSS; break;      // MOVSS
        case TYllong:
        case TYullong:  op = STOQ;  break;      // MOVQ
        case TYdouble:
        case TYidouble:
        case TYcdouble:
        case TYcfloat:  op = STOSD; break;      // MOVSD

        case TYfloat8:
        case TYfloat4:  op = aligned ? STOAPS : STOUPS; break;  // MOVAPS / MOVUPS
        case TYdouble4:
        case TYdouble2: op = aligned ? STOAPD : STOUPD; break;  // MOVAPD / MOVUPD
        case TYschar16:
        case TYuchar16:
        case TYshort8:
        case TYushort8:
        case TYlong4:
        case TYulong4:
        case TYllong2:
        case TYullong2:
        case TYschar32:
        case TYuchar32:
        case TYshort16:
        case TYushort16:
        case TYlong8:
        case TYulong8:
        case TYllong4:
        case TYullong4: op = aligned ? STODQA : STODQU; break;  // MOVDQA / MOVDQU

        default:
            printf("tym = 0x%x\n", tym);
            assert(0);
    }
    return op;
}


/************************************
 * Get correct XMM operator based on type and operator.
775 */ 776 777 private opcode_t xmmoperator(tym_t tym, OPER oper) 778 { 779 tym = tybasic(tym); 780 opcode_t op; 781 switch (oper) 782 { 783 case OPadd: 784 case OPaddass: 785 case OPpostinc: 786 switch (tym) 787 { 788 case TYfloat: 789 case TYifloat: op = ADDSS; break; 790 case TYdouble: 791 case TYidouble: op = ADDSD; break; 792 793 // SIMD vector types 794 case TYfloat8: 795 case TYfloat4: op = ADDPS; break; 796 case TYdouble4: 797 case TYdouble2: op = ADDPD; break; 798 case TYschar32: 799 case TYuchar32: 800 case TYschar16: 801 case TYuchar16: op = PADDB; break; 802 case TYshort16: 803 case TYushort16: 804 case TYshort8: 805 case TYushort8: op = PADDW; break; 806 case TYlong8: 807 case TYulong8: 808 case TYlong4: 809 case TYulong4: op = PADDD; break; 810 case TYllong4: 811 case TYullong4: 812 case TYllong2: 813 case TYullong2: op = PADDQ; break; 814 815 default: 816 printf("tym = x%x\n", tym); 817 assert(0); 818 } 819 break; 820 821 case OPmin: 822 case OPminass: 823 case OPpostdec: 824 switch (tym) 825 { 826 case TYfloat: 827 case TYifloat: op = SUBSS; break; 828 case TYdouble: 829 case TYidouble: op = SUBSD; break; 830 831 // SIMD vector types 832 case TYfloat8: 833 case TYfloat4: op = SUBPS; break; 834 case TYdouble4: 835 case TYdouble2: op = SUBPD; break; 836 case TYschar32: 837 case TYuchar32: 838 case TYschar16: 839 case TYuchar16: op = PSUBB; break; 840 case TYshort16: 841 case TYushort16: 842 case TYshort8: 843 case TYushort8: op = PSUBW; break; 844 case TYlong8: 845 case TYulong8: 846 case TYlong4: 847 case TYulong4: op = PSUBD; break; 848 case TYllong4: 849 case TYullong4: 850 case TYllong2: 851 case TYullong2: op = PSUBQ; break; 852 853 default: assert(0); 854 } 855 break; 856 857 case OPmul: 858 case OPmulass: 859 switch (tym) 860 { 861 case TYfloat: 862 case TYifloat: op = MULSS; break; 863 case TYdouble: 864 case TYidouble: op = MULSD; break; 865 866 // SIMD vector types 867 case TYfloat8: 868 case TYfloat4: op = MULPS; break; 869 case TYdouble4: 
870 case TYdouble2: op = MULPD; break; 871 case TYshort16: 872 case TYushort16: 873 case TYshort8: 874 case TYushort8: op = PMULLW; break; 875 case TYlong8: 876 case TYulong8: 877 case TYlong4: 878 case TYulong4: op = PMULLD; break; 879 880 default: assert(0); 881 } 882 break; 883 884 case OPdiv: 885 case OPdivass: 886 switch (tym) 887 { 888 case TYfloat: 889 case TYifloat: op = DIVSS; break; 890 case TYdouble: 891 case TYidouble: op = DIVSD; break; 892 893 // SIMD vector types 894 case TYfloat8: 895 case TYfloat4: op = DIVPS; break; 896 case TYdouble4: 897 case TYdouble2: op = DIVPD; break; 898 899 default: assert(0); 900 } 901 break; 902 903 case OPor: 904 case OPorass: 905 switch (tym) 906 { 907 // SIMD vector types 908 case TYschar16: 909 case TYuchar16: 910 case TYshort8: 911 case TYushort8: 912 case TYlong4: 913 case TYulong4: 914 case TYllong2: 915 case TYullong2: 916 case TYschar32: 917 case TYuchar32: 918 case TYshort16: 919 case TYushort16: 920 case TYlong8: 921 case TYulong8: 922 case TYllong4: 923 case TYullong4: op = POR; break; 924 925 default: assert(0); 926 } 927 break; 928 929 case OPand: 930 case OPandass: 931 switch (tym) 932 { 933 // SIMD vector types 934 case TYschar16: 935 case TYuchar16: 936 case TYshort8: 937 case TYushort8: 938 case TYlong4: 939 case TYulong4: 940 case TYllong2: 941 case TYullong2: 942 case TYschar32: 943 case TYuchar32: 944 case TYshort16: 945 case TYushort16: 946 case TYlong8: 947 case TYulong8: 948 case TYllong4: 949 case TYullong4: op = PAND; break; 950 951 default: assert(0); 952 } 953 break; 954 955 case OPxor: 956 case OPxorass: 957 switch (tym) 958 { 959 // SIMD vector types 960 case TYschar16: 961 case TYuchar16: 962 case TYshort8: 963 case TYushort8: 964 case TYlong4: 965 case TYulong4: 966 case TYllong2: 967 case TYullong2: 968 case TYschar32: 969 case TYuchar32: 970 case TYshort16: 971 case TYushort16: 972 case TYlong8: 973 case TYulong8: 974 case TYllong4: 975 case TYullong4: op = PXOR; break; 976 977 default: 
assert(0); 978 } 979 break; 980 981 case OPlt: 982 case OPle: 983 case OPgt: 984 case OPge: 985 case OPne: 986 case OPeqeq: 987 case OPunord: /* !<>= */ 988 case OPlg: /* <> */ 989 case OPleg: /* <>= */ 990 case OPule: /* !> */ 991 case OPul: /* !>= */ 992 case OPuge: /* !< */ 993 case OPug: /* !<= */ 994 case OPue: /* !<> */ 995 case OPngt: 996 case OPnge: 997 case OPnlt: 998 case OPnle: 999 case OPord: 1000 case OPnlg: 1001 case OPnleg: 1002 case OPnule: 1003 case OPnul: 1004 case OPnuge: 1005 case OPnug: 1006 case OPnue: 1007 switch (tym) 1008 { 1009 case TYfloat: 1010 case TYifloat: op = UCOMISS; break; 1011 case TYdouble: 1012 case TYidouble: op = UCOMISD; break; 1013 1014 default: assert(0); 1015 } 1016 break; 1017 1018 default: 1019 assert(0); 1020 } 1021 return op; 1022 } 1023 1024 void cdvector(ref CodeBuilder cdb, elem *e, regm_t *pretregs) 1025 { 1026 /* e should look like one of: 1027 * vector 1028 * | 1029 * param 1030 * / \ 1031 * param op2 1032 * / \ 1033 * op op1 1034 */ 1035 1036 if (!config.fpxmmregs) 1037 { printf("SIMD operations not supported on this platform\n"); 1038 exit(1); 1039 } 1040 1041 const n = el_nparams(e.EV.E1); 1042 elem **params = cast(elem **)malloc(n * (elem *).sizeof); 1043 assert(params); 1044 elem **tmp = params; 1045 el_paramArray(&tmp, e.EV.E1); 1046 1047 static if (0) 1048 { 1049 printf("cdvector()\n"); 1050 for (int i = 0; i < n; i++) 1051 { 1052 printf("[%d]: ", i); 1053 elem_print(params[i]); 1054 } 1055 } 1056 1057 if (*pretregs == 0) 1058 { /* Evaluate for side effects only 1059 */ 1060 foreach (i; 0 .. 
n) 1061 { 1062 codelem(cdb,params[i], pretregs, false); 1063 *pretregs = 0; // in case they got set 1064 } 1065 return; 1066 } 1067 1068 assert(n >= 2 && n <= 4); 1069 1070 elem *eop = params[0]; 1071 elem *op1 = params[1]; 1072 elem *op2 = null; 1073 tym_t ty2 = 0; 1074 if (n >= 3) 1075 { op2 = params[2]; 1076 ty2 = tybasic(op2.Ety); 1077 } 1078 1079 auto op = cast(opcode_t)el_tolong(eop); 1080 debug assert(!isXMMstore(op)); 1081 tym_t ty1 = tybasic(op1.Ety); 1082 1083 regm_t retregs; 1084 if (n == 3 && ty2 == TYuchar && op2.Eoper == OPconst) 1085 { // Handle: op xmm,imm8 1086 1087 retregs = *pretregs & XMMREGS; 1088 if (!retregs) 1089 retregs = XMMREGS; 1090 codelem(cdb,op1,&retregs,false); // eval left leaf 1091 const reg = findreg(retregs); 1092 int r; 1093 switch (op) 1094 { 1095 case PSLLD: r = 6; op = 0x660F72; break; 1096 case PSLLQ: r = 6; op = 0x660F73; break; 1097 case PSLLW: r = 6; op = 0x660F71; break; 1098 case PSRAD: r = 4; op = 0x660F72; break; 1099 case PSRAW: r = 4; op = 0x660F71; break; 1100 case PSRLD: r = 2; op = 0x660F72; break; 1101 case PSRLQ: r = 2; op = 0x660F73; break; 1102 case PSRLW: r = 2; op = 0x660F71; break; 1103 case PSRLDQ: r = 3; op = 0x660F73; break; 1104 case PSLLDQ: r = 7; op = 0x660F73; break; 1105 1106 default: 1107 printf("op = x%x\n", op); 1108 assert(0); 1109 } 1110 getregs(cdb,retregs); 1111 cdb.genc2(op,modregrmx(3,r,reg-XMM0), cast(uint)el_tolong(op2)); 1112 } 1113 else if (n == 2) 1114 { /* Handle: op xmm,mem 1115 * where xmm is written only, not read 1116 */ 1117 code cs; 1118 1119 if ((op1.Eoper == OPind && !op1.Ecount) || op1.Eoper == OPvar) 1120 { 1121 getlvalue(cdb,&cs, op1, RMload); // get addressing mode 1122 } 1123 else 1124 { 1125 regm_t rretregs = XMMREGS; 1126 codelem(cdb,op1, &rretregs, false); 1127 const rreg = findreg(rretregs) - XMM0; 1128 cs.Irm = modregrm(3,0,rreg & 7); 1129 cs.Iflags = 0; 1130 cs.Irex = 0; 1131 if (rreg & 8) 1132 cs.Irex |= REX_B; 1133 } 1134 1135 retregs = *pretregs & XMMREGS; 1136 
if (!retregs) 1137 retregs = XMMREGS; 1138 reg_t reg; 1139 allocreg(cdb,&retregs, ®, e.Ety); 1140 code_newreg(&cs, reg - XMM0); 1141 cs.Iop = op; 1142 cdb.gen(&cs); 1143 } 1144 else if (n == 3 || n == 4) 1145 { /* Handle: 1146 * op xmm,mem // n = 3 1147 * op xmm,mem,imm8 // n = 4 1148 * Both xmm and mem are operands, evaluate xmm first. 1149 */ 1150 1151 code cs; 1152 1153 retregs = *pretregs & XMMREGS; 1154 if (!retregs) 1155 retregs = XMMREGS; 1156 codelem(cdb,op1,&retregs,false); // eval left leaf 1157 const reg = findreg(retregs); 1158 1159 if ((op2.Eoper == OPind && !op2.Ecount) || op2.Eoper == OPvar) 1160 { 1161 getlvalue(cdb,&cs, op2, RMload | retregs); // get addressing mode 1162 } 1163 else 1164 { 1165 regm_t rretregs = XMMREGS & ~retregs; 1166 scodelem(cdb, op2, &rretregs, retregs, true); 1167 const rreg = findreg(rretregs) - XMM0; 1168 cs.Irm = modregrm(3,0,rreg & 7); 1169 cs.Iflags = 0; 1170 cs.Irex = 0; 1171 if (rreg & 8) 1172 cs.Irex |= REX_B; 1173 } 1174 1175 getregs(cdb,retregs); 1176 if (n == 4) 1177 { 1178 switch (op) 1179 { 1180 case CMPPD: case CMPSS: case CMPSD: case CMPPS: 1181 case PSHUFD: case PSHUFHW: case PSHUFLW: 1182 case BLENDPD: case BLENDPS: case DPPD: case DPPS: 1183 case MPSADBW: case PBLENDW: 1184 case ROUNDPD: case ROUNDPS: case ROUNDSD: case ROUNDSS: 1185 case SHUFPD: case SHUFPS: 1186 break; 1187 default: 1188 printf("op = x%x\n", op); 1189 assert(0); 1190 } 1191 elem *imm8 = params[3]; 1192 cs.IFL2 = FLconst; 1193 version (MARS) 1194 { 1195 if (imm8.Eoper != OPconst) 1196 { 1197 error(imm8.Esrcpos.Sfilename, imm8.Esrcpos.Slinnum, imm8.Esrcpos.Scharnum, "last parameter to `__simd()` must be a constant"); 1198 cs.IEV2.Vsize_t = 0; 1199 } 1200 else 1201 cs.IEV2.Vsize_t = cast(targ_size_t)el_tolong(imm8); 1202 } 1203 else 1204 { 1205 cs.IEV2.Vsize_t = cast(targ_size_t)el_tolong(imm8); 1206 } 1207 } 1208 code_newreg(&cs, reg - XMM0); 1209 cs.Iop = op; 1210 cdb.gen(&cs); 1211 } 1212 else 1213 assert(0); 1214 
fixresult(cdb,e,retregs,pretregs); 1215 free(params); 1216 freenode(e); 1217 } 1218 1219 /*************** 1220 * Generate code for vector "store" operations. 1221 * The tree e must look like: 1222 * (op1 OPvecsto (op OPparam op2)) 1223 * where op is the store instruction STOxxxx. 1224 */ 1225 void cdvecsto(ref CodeBuilder cdb, elem *e, regm_t *pretregs) 1226 { 1227 //printf("cdvecsto()\n"); 1228 //elem_print(e); 1229 elem *op1 = e.EV.E1; 1230 elem *op2 = e.EV.E2.EV.E2; 1231 elem *eop = e.EV.E2.EV.E1; 1232 const op = cast(opcode_t)el_tolong(eop); 1233 debug assert(isXMMstore(op)); 1234 xmmeq(cdb, e, op, op1, op2, pretregs); 1235 } 1236 1237 /*************** 1238 * Generate code for OPvecfill (broadcast). 1239 * OPvecfill takes the single value in e1 and 1240 * fills the vector type with it. 1241 */ 1242 void cdvecfill(ref CodeBuilder cdb, elem *e, regm_t *pretregs) 1243 { 1244 //printf("cdvecfill(e = %p, *pretregs = %s)\n",e,regm_str(*pretregs)); 1245 1246 regm_t retregs = *pretregs & XMMREGS; 1247 if (!retregs) 1248 retregs = XMMREGS; 1249 1250 code *c; 1251 code cs; 1252 1253 elem *e1 = e.EV.E1; 1254 static if (0) 1255 { 1256 if ((e1.Eoper == OPind && !e1.Ecount) || e1.Eoper == OPvar) 1257 { 1258 cr = getlvalue(&cs, e1, RMload | retregs); // get addressing mode 1259 } 1260 else 1261 { 1262 regm_t rretregs = XMMREGS & ~retregs; 1263 cr = scodelem(op2, &rretregs, retregs, true); 1264 const rreg = findreg(rretregs) - XMM0; 1265 cs.Irm = modregrm(3,0,rreg & 7); 1266 cs.Iflags = 0; 1267 cs.Irex = 0; 1268 if (rreg & 8) 1269 cs.Irex |= REX_B; 1270 } 1271 } 1272 1273 const ty = tybasic(e.Ety); 1274 switch (ty) 1275 { 1276 case TYfloat4: 1277 case TYfloat8: 1278 if (config.avx && e1.Eoper == OPind && !e1.Ecount) 1279 { 1280 // VBROADCASTSS X/YMM,MEM 1281 getlvalue(cdb,&cs, e1, 0); // get addressing mode 1282 assert((cs.Irm & 0xC0) != 0xC0); // AVX1 doesn't have register source operands 1283 reg_t reg; 1284 allocreg(cdb,&retregs,®,ty); 1285 cs.Iop = VBROADCASTSS; 1286 
cs.Irex &= ~REX_W; 1287 code_newreg(&cs,reg - XMM0); 1288 checkSetVex(&cs,ty); 1289 cdb.gen(&cs); 1290 } 1291 else 1292 { 1293 codelem(cdb,e1,&retregs,false); // eval left leaf 1294 const reg = cast(reg_t)(findreg(retregs) - XMM0); 1295 getregs(cdb,retregs); 1296 if (config.avx >= 2) 1297 { 1298 // VBROADCASTSS X/YMM,XMM 1299 cdb.gen2(VBROADCASTSS, modregxrmx(3,reg,reg)); 1300 checkSetVex(cdb.last(), ty); 1301 } 1302 else 1303 { 1304 // (V)SHUFPS XMM,XMM,0 1305 cdb.genc2(SHUFPS, modregxrmx(3,reg,reg), 0); 1306 checkSetVex(cdb.last(), ty); 1307 if (tysize(ty) == 32) 1308 { 1309 // VINSERTF128 YMM,YMM,XMM,1 1310 cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1); 1311 checkSetVex(cdb.last(), ty); 1312 } 1313 } 1314 } 1315 break; 1316 1317 case TYdouble2: 1318 case TYdouble4: 1319 if (config.avx && tysize(ty) == 32 && e1.Eoper == OPind && !e1.Ecount) 1320 { 1321 // VBROADCASTSD YMM,MEM 1322 getlvalue(cdb,&cs, e1, 0); // get addressing mode 1323 assert((cs.Irm & 0xC0) != 0xC0); // AVX1 doesn't have register source operands 1324 reg_t reg; 1325 allocreg(cdb,&retregs,®,ty); 1326 cs.Iop = VBROADCASTSD; 1327 cs.Irex &= ~REX_W; 1328 code_newreg(&cs,reg - XMM0); 1329 checkSetVex(&cs,ty); 1330 cdb.gen(&cs); 1331 } 1332 else 1333 { 1334 codelem(cdb,e1,&retregs,false); // eval left leaf 1335 const reg = cast(reg_t)(findreg(retregs) - XMM0); 1336 getregs(cdb,retregs); 1337 if (config.avx >= 2 && tysize(ty) == 32) 1338 { 1339 // VBROADCASTSD YMM,XMM 1340 cdb.gen2(VBROADCASTSD, modregxrmx(3,reg,reg)); 1341 checkSetVex(cdb.last(), ty); 1342 } 1343 else 1344 { 1345 // (V)UNPCKLPD XMM,XMM 1346 cdb.gen2(UNPCKLPD, modregxrmx(3,reg,reg)); 1347 checkSetVex(cdb.last(), TYdouble2); // AVX-128 1348 if (tysize(ty) == 32) 1349 { 1350 // VINSERTF128 YMM,YMM,XMM,1 1351 cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1); 1352 checkSetVex(cdb.last(), ty); 1353 } 1354 } 1355 } 1356 break; 1357 1358 case TYschar16: 1359 case TYuchar16: 1360 case TYschar32: 1361 case TYuchar32: 1362 if (config.avx >= 2 
&& e1.Eoper == OPind && !e1.Ecount) 1363 { 1364 // VPBROADCASTB X/YMM,MEM 1365 getlvalue(cdb,&cs, e1, 0); // get addressing mode 1366 assert((cs.Irm & 0xC0) != 0xC0); // AVX1 doesn't have register source operands 1367 reg_t reg; 1368 allocreg(cdb,&retregs,®,ty); 1369 cs.Iop = VPBROADCASTB; 1370 cs.Irex &= ~REX_W; 1371 code_newreg(&cs,reg - XMM0); 1372 checkSetVex(&cs,ty); 1373 cdb.gen(&cs); 1374 } 1375 else 1376 { 1377 regm_t regm = ALLREGS; 1378 codelem(cdb,e1,®m,true); // eval left leaf 1379 const r = findreg(regm); 1380 1381 reg_t reg; 1382 allocreg(cdb,&retregs,®, e.Ety); 1383 reg -= XMM0; 1384 // (V)MOVD reg,r 1385 cdb.gen2(LODD,modregxrmx(3,reg,r)); 1386 checkSetVex(cdb.last(), TYushort8); 1387 if (config.avx >= 2) 1388 { 1389 // VPBROADCASTB X/YMM,XMM 1390 cdb.gen2(VPBROADCASTB, modregxrmx(3,reg,reg)); 1391 checkSetVex(cdb.last(), ty); 1392 } 1393 else 1394 { 1395 if (config.avx) 1396 { 1397 reg_t zeroreg; 1398 regm = XMMREGS & ~retregs; 1399 // VPXOR XMM1,XMM1,XMM1 1400 allocreg(cdb,®m,&zeroreg, ty); 1401 zeroreg -= XMM0; 1402 cdb.gen2(PXOR, modregxrmx(3,zeroreg,zeroreg)); 1403 checkSetVex(cdb.last(), TYuchar16); // AVX-128 1404 // VPSHUFB XMM,XMM,XMM1 1405 cdb.gen2(PSHUFB, modregxrmx(3,reg,zeroreg)); 1406 checkSetVex(cdb.last(), TYuchar16); // AVX-128 1407 } 1408 else 1409 { 1410 // PUNPCKLBW XMM,XMM 1411 cdb.gen2(PUNPCKLBW, modregxrmx(3,reg,reg)); 1412 // PUNPCKLWD XMM,XMM 1413 cdb.gen2(PUNPCKLWD, modregxrmx(3,reg,reg)); 1414 // PSHUFD XMM,XMM,0 1415 cdb.genc2(PSHUFD, modregxrmx(3,reg,reg), 0); 1416 } 1417 if (tysize(ty) == 32) 1418 { 1419 // VINSERTF128 YMM,YMM,XMM,1 1420 cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1); 1421 checkSetVex(cdb.last(), ty); 1422 } 1423 } 1424 } 1425 break; 1426 1427 case TYshort8: 1428 case TYushort8: 1429 case TYshort16: 1430 case TYushort16: 1431 if (config.avx >= 2 && e1.Eoper == OPind && !e1.Ecount) 1432 { 1433 // VPBROADCASTW X/YMM,MEM 1434 getlvalue(cdb,&cs, e1, 0); // get addressing mode 1435 assert((cs.Irm & 0xC0) 
!= 0xC0); // AVX1 doesn't have register source operands 1436 reg_t reg; 1437 allocreg(cdb,&retregs,®,ty); 1438 cs.Iop = VPBROADCASTW; 1439 cs.Irex &= ~REX_W; 1440 cs.Iflags &= ~CFopsize; 1441 code_newreg(&cs,reg - XMM0); 1442 checkSetVex(&cs,ty); 1443 cdb.gen(&cs); 1444 } 1445 else 1446 { 1447 regm_t regm = ALLREGS; 1448 codelem(cdb,e1,®m,true); // eval left leaf 1449 reg_t r = findreg(regm); 1450 1451 reg_t reg; 1452 allocreg(cdb,&retregs,®, e.Ety); 1453 reg -= XMM0; 1454 // (V)MOVD reg,r 1455 cdb.gen2(LODD,modregxrmx(3,reg,r)); 1456 checkSetVex(cdb.last(), TYushort8); 1457 if (config.avx >= 2) 1458 { 1459 // VPBROADCASTW X/YMM,XMM 1460 cdb.gen2(VPBROADCASTW, modregxrmx(3,reg,reg)); 1461 checkSetVex(cdb.last(), ty); 1462 } 1463 else 1464 { 1465 // (V)PUNPCKLWD XMM,XMM 1466 cdb.gen2(PUNPCKLWD, modregxrmx(3,reg,reg)); 1467 checkSetVex(cdb.last(), TYushort8); // AVX-128 1468 // (V)PSHUFD XMM,XMM,0 1469 cdb.genc2(PSHUFD, modregxrmx(3,reg,reg), 0); 1470 checkSetVex(cdb.last(), TYushort8); // AVX-128 1471 if (tysize(ty) == 32) 1472 { 1473 // VINSERTF128 YMM,YMM,XMM,1 1474 cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1); 1475 checkSetVex(cdb.last(), ty); 1476 } 1477 } 1478 } 1479 break; 1480 1481 case TYlong8: 1482 case TYulong8: 1483 case TYlong4: 1484 case TYulong4: 1485 if (config.avx && e1.Eoper == OPind && !e1.Ecount) 1486 { 1487 // VPBROADCASTD/VBROADCASTSS X/YMM,MEM 1488 getlvalue(cdb,&cs, e1, 0); // get addressing mode 1489 assert((cs.Irm & 0xC0) != 0xC0); // AVX1 doesn't have register source operands 1490 reg_t reg; 1491 allocreg(cdb,&retregs,®,ty); 1492 cs.Iop = config.avx >= 2 ? 
VPBROADCASTD : VBROADCASTSS; 1493 cs.Irex &= ~REX_W; 1494 code_newreg(&cs,reg - XMM0); 1495 checkSetVex(&cs,ty); 1496 cdb.gen(&cs); 1497 } 1498 else 1499 { 1500 codelem(cdb,e1,&retregs,true); // eval left leaf 1501 const reg = cast(reg_t)(findreg(retregs) - XMM0); 1502 getregs(cdb,retregs); 1503 if (config.avx >= 2) 1504 { 1505 // VPBROADCASTD X/YMM,XMM 1506 cdb.gen2(VPBROADCASTD, modregxrmx(3,reg,reg)); 1507 checkSetVex(cdb.last(), ty); 1508 } 1509 else 1510 { 1511 // (V)PSHUFD XMM,XMM,0 1512 cdb.genc2(PSHUFD, modregxrmx(3,reg,reg), 0); 1513 checkSetVex(cdb.last(), TYulong4); // AVX-128 1514 if (tysize(ty) == 32) 1515 { 1516 // VINSERTF128 YMM,YMM,XMM,1 1517 cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1); 1518 checkSetVex(cdb.last(), ty); 1519 } 1520 } 1521 } 1522 break; 1523 1524 case TYllong2: 1525 case TYullong2: 1526 case TYllong4: 1527 case TYullong4: 1528 if (e1.Eoper == OPind && !e1.Ecount) 1529 { 1530 // VPBROADCASTQ/VBROADCASTSD/(V)PUNPCKLQDQ X/YMM,MEM 1531 getlvalue(cdb,&cs, e1, 0); // get addressing mode 1532 assert((cs.Irm & 0xC0) != 0xC0); // AVX1 doesn't have register source operands 1533 reg_t reg; 1534 allocreg(cdb,&retregs,®,ty); 1535 cs.Iop = config.avx >= 2 ? VPBROADCASTQ : tysize(ty) == 32 ? 
                                                                 VBROADCASTSD : PUNPCKLQDQ;
                cs.Irex &= ~REX_W;
                code_newreg(&cs,reg - XMM0);
                checkSetVex(&cs,ty);
                cdb.gen(&cs);
            }
            else
            {
                codelem(cdb,e1,&retregs,true);     // eval left leaf
                const reg = cast(reg_t)(findreg(retregs) - XMM0);
                getregs(cdb,retregs);
                if (config.avx >= 2)
                {
                    // VPBROADCASTQ X/YMM,XMM
                    cdb.gen2(VPBROADCASTQ, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), ty);
                }
                else
                {
                    // (V)PUNPCKLQDQ XMM,XMM
                    cdb.genc2(PUNPCKLQDQ, modregxrmx(3,reg,reg), 0);
                    checkSetVex(cdb.last(), TYullong2);  // AVX-128
                    if (tysize(ty) == 32)
                    {
                        // VINSERTF128 YMM,YMM,XMM,1
                        cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1);
                        checkSetVex(cdb.last(), ty);
                    }
                }
            }
            break;

        default:
            assert(0);
    }

    fixresult(cdb,e,retregs,pretregs);
}

/*******************************************
 * Determine if lvalue e is a vector aligned on a 16/32 byte boundary.
 * Assume it to be aligned unless can prove it is not.
 * Only an OPvar leaf of vector type can be disproved; any other
 * expression is optimistically reported as aligned.
 * Params:
 *      e = lvalue
 * Returns:
 *      false if definitely not aligned
 */

bool xmmIsAligned(elem *e)
{
    if (tyvector(e.Ety) && e.Eoper == OPvar)
    {
        Symbol *s = e.EV.Vsym;
        const alignsz = tyalignsize(e.Ety);     // required alignment for this vector type
        if (Symbol_Salignsize(s) < alignsz ||   // symbol itself not aligned enough
            e.EV.Voffset & (alignsz - 1) ||     // offset within symbol breaks alignment
            alignsz > STACKALIGN                // stack cannot guarantee this alignment
           )
            return false;       // definitely not aligned
    }
    return true;        // assume aligned
}

/**************************************
 * VEX prefixes can be 2 or 3 bytes.
 * If it must be 3 bytes, set the CFvex3 flag.
 * Params:
 *      c = code instruction whose VEX fields are already filled in
 */

void checkSetVex3(code *c)
{
    // The 2-byte form cannot encode VEX.W, VEX.X, VEX.B, or an
    // mmmm escape other than 0F; see Intel Vol. 2A 2.3.5.6
    if (c.Ivex.w || !c.Ivex.x || !c.Ivex.b || c.Ivex.mmmm > 0x1 ||
        !I64 && (c.Ivex.r || !(c.Ivex.vvvv & 8))
       )
    {
        c.Iflags |= CFvex3;
    }
}

/*************************************
 * Determine if operation should be rewritten as a VEX
 * operation; and do so.
 * Rewrites c.Iop from its legacy SSE encoding into the internal
 * 0xC4xxxxxx VEX form, fills in the Ivex fields from the REX bits,
 * and sets CFvex (and possibly CFvex3 via checkSetVex3).
 * Params:
 *      c = code
 *      ty = type of operand
 */

void checkSetVex(code *c, tym_t ty)
{
    // Only rewrite when AVX codegen is enabled, or when forced by a
    // 256-bit operand (which has no non-VEX encoding)
    if (config.avx || tysize(ty) == 32)
    {
        // The "reg" field of the modrm byte (plus REX.R) is assumed to be
        // the second source operand, to be placed in VEX.vvvv
        uint vreg = (c.Irm >> 3) & 7;
        if (c.Irex & REX_R)
            vreg |= 8;

        // TODO: This is too simplistic, depending on the instruction, vex.vvvv
        // encodes NDS, NDD, DDS, or no operand (NOO). The code below assumes
        // NDS (non-destructive source), except for the incomplete list of 2
        // operand instructions (NOO) handled by the switch.
        switch (c.Iop)
        {
            case LODSS:
            case LODSD:
            case STOSS:
            case STOSD:
                // Register-to-register moves keep the NDS form; only the
                // memory forms are 2-operand
                if ((c.Irm & 0xC0) == 0xC0)
                    break;
                goto case LODAPS;

            case LODAPS:
            case LODUPS:
            case LODAPD:
            case LODUPD:
            case LODDQA:
            case LODDQU:
            case LODD:
            case LODQ:
            case STOAPS:
            case STOUPS:
            case STOAPD:
            case STOUPD:
            case STODQA:
            case STODQU:
            case STOD:
            case STOQ:
            case COMISS:
            case COMISD:
            case UCOMISS:
            case UCOMISD:
            case MOVDDUP:
            case MOVSHDUP:
            case MOVSLDUP:
            case VBROADCASTSS:
            case PSHUFD:
            case PSHUFHW:
            case PSHUFLW:
            case VPBROADCASTB:
            case VPBROADCASTW:
            case VPBROADCASTD:
            case VPBROADCASTQ:
                vreg = 0;       // for 2 operand vex instructions
                break;

            case VBROADCASTSD:
            case VBROADCASTF128:
            case VBROADCASTI128:
                assert(tysize(ty) == 32); // AVX-256 only instructions
                vreg = 0;       // for 2 operand vex instructions
                break;

            case NOP:
                return;         // ignore

            default:
                break;
        }

        // Build the internal VEX opcode: 0xC4 marker byte, the mm-mmmm
        // escape and pp prefix fields derived from the legacy prefix
        // bytes, and the low opcode byte
        opcode_t op = 0xC4000000 | (c.Iop & 0xFF);
        switch (c.Iop & 0xFFFFFF00)
        {
            static uint MM_PP(uint mm, uint pp) { return (mm << 16) | (pp << 8); }
            case 0x00000F00: op |= MM_PP(1,0); break;   // 0F escape, no prefix
            case 0x00660F00: op |= MM_PP(1,1); break;   // 66 0F
            case 0x00F30F00: op |= MM_PP(1,2); break;   // F3 0F
            case 0x00F20F00: op |= MM_PP(1,3); break;   // F2 0F
            case 0x660F3800: op |= MM_PP(2,1); break;   // 66 0F 38
            case 0x660F3A00: op |= MM_PP(3,1); break;   // 66 0F 3A
            default:
                printf("Iop = %x\n", c.Iop);
                assert(0);      // unhandled legacy prefix combination
        }
        c.Iop = op;
        c.Ivex.pfx = 0xC4;
        // VEX stores R/X/B inverted relative to REX
        c.Ivex.r = !(c.Irex & REX_R);
        c.Ivex.x = !(c.Irex & REX_X);
        c.Ivex.b = !(c.Irex & REX_B);
        c.Ivex.w = (c.Irex & REX_W) != 0;
        c.Ivex.l = tysize(ty) == 32;    // VEX.L selects 256-bit operation

        // VEX.vvvv holds the register number in one's complement form
        c.Ivex.vvvv = cast(ushort)~vreg;

        c.Iflags |= CFvex;
        checkSetVex3(c);        // decide 2-byte vs 3-byte prefix
    }
}

} // version (COMPILE)