/**
 * Compiler implementation of the
 * $(LINK2 http://www.dlang.org, D programming language).
 *
 * Copyright:   Copyright (C) 2011-2021 by The D Language Foundation, All Rights Reserved
 * Authors:     $(LINK2 http://www.digitalmars.com, Walter Bright)
 * License:     $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 * Source:      $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/backend/cgxmm.d, backend/cgxmm.d)
 */

module dmd.backend.cgxmm;

version (SCPP)
    version = COMPILE;
version (MARS)
    version = COMPILE;

version (COMPILE)
{

import core.stdc.stdio;
import core.stdc.stdlib;
import core.stdc.string;

import dmd.backend.cc;
import dmd.backend.cdef;
import dmd.backend.code;
import dmd.backend.code_x86;
import dmd.backend.codebuilder;
import dmd.backend.mem;
import dmd.backend.el;
import dmd.backend.global;
import dmd.backend.oper;
import dmd.backend.ty;
import dmd.backend.xmm;

version (SCPP)
    import dmd.backend.exh;
version (MARS)
    import dmd.backend.errors;


extern (C++):

nothrow:

int REGSIZE();

uint mask(uint m);

/*******************************************
 * Is operator a store operator?
 * Used (via debug asserts) by cdvector/cdvecsto to validate opcodes.
 * Params:
 *      op = opcode to test
 * Returns:
 *      true if op is one of the STOxxxx XMM store opcodes
 */

bool isXMMstore(opcode_t op)
{
    switch (op)
    {
    case STOSS: case STOAPS: case STOUPS:
    case STOSD: case STOAPD: case STOUPD:
    case STOD: case STOQ: case STODQA: case STODQU:
    case STOHPD: case STOHPS: case STOLPD: case STOLPS: return true;
    default: return false;
    }
}

/*******************************************
 * Move constant value into xmm register xreg.
 * Params:
 *      cdb = code sink
 *      xreg = destination XMM register (must be in XMMREGS)
 *      sz = size of the constant, 4 or 8 bytes
 *      value = constant bit pattern to load
 *      flags = NOTE(review): accepted but never read in this function body
 */

private void movxmmconst(ref CodeBuilder cdb, reg_t xreg, uint sz, targ_size_t value, regm_t flags)
{
    /* Generate:
     *    MOV reg,value
     *    MOV xreg,reg
     * Not so efficient. We should at least do a PXOR for 0.
     */
    assert(mask(xreg) & XMMREGS);
    assert(sz == 4 || sz == 8);
    if (I32 && sz == 8)
    {
        // 32-bit mode cannot hold an 8 byte constant in a GP register;
        // spill it through the float scratch area in two 4 byte halves.
        reg_t r;
        regm_t rm = ALLREGS;
        allocreg(cdb,&rm,&r,TYint);         // allocate scratch register
        static union U { targ_size_t s; targ_long[2] l; }
        U u = void;
        u.l[1] = 0;
        u.s = value;
        targ_long *p = &u.l[0];
        movregconst(cdb,r,p[0],0);
        cdb.genfltreg(STO,r,0);             // MOV floatreg,r
        movregconst(cdb,r,p[1],0);
        cdb.genfltreg(STO,r,4);             // MOV floatreg+4,r

        const op = xmmload(TYdouble, true);
        cdb.genxmmreg(op,xreg,0,TYdouble);  // MOVSD XMMreg,floatreg
    }
    else
    {
        reg_t reg;
        regwithvalue(cdb,ALLREGS,value,&reg,(sz == 8) ? 64 : 0);
        cdb.gen2(LODD,modregxrmx(3,xreg-XMM0,reg));     // MOVD xreg,reg
        if (sz == 8)
            code_orrex(cdb.last(), REX_W);              // promote to MOVQ
        checkSetVex(cdb.last(), TYulong);
    }
}

/***********************************************
 * Do simple orthogonal operators for XMM registers.
 * Params:
 *      cdb = code sink
 *      e = binary expression node to generate code for
 *      pretregs = in: desired result registers; out: where result landed
 */

void orthxmm(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
{
    //printf("orthxmm(e = %p, *pretregs = %s)\n", e, regm_str(*pretregs));
    elem *e1 = e.EV.E1;
    elem *e2 = e.EV.E2;

    // float + ifloat is not actually addition
    if ((e.Eoper == OPadd || e.Eoper == OPmin) &&
        ((tyreal(e1.Ety) && tyimaginary(e2.Ety)) ||
         (tyreal(e2.Ety) && tyimaginary(e1.Ety))))
    {
        regm_t retregs = *pretregs & XMMREGS;
        if (!retregs)
            retregs = XMMREGS;

        regm_t rretregs;
        reg_t rreg;
        if (tyreal(e1.Ety))
        {
            const reg = findreg(retregs);
            rreg = findreg(retregs & ~mask(reg));
            retregs = mask(reg);
            rretregs = mask(rreg);
        }
        else
        {
            // Pick the second register, not the first
            rreg = findreg(retregs);
            rretregs = mask(rreg);
            const reg = findreg(retregs & ~rretregs);
            retregs = mask(reg);
        }
        assert(retregs && rretregs);

        codelem(cdb,e1,&retregs,false);              // eval left leaf
        scodelem(cdb, e2, &rretregs, retregs, true); // eval right leaf

        retregs |= rretregs;
        if (e.Eoper == OPmin)
        {
            // real - imaginary: negate the imaginary part by XORing in the sign bit
            regm_t nretregs = XMMREGS & ~retregs;
            reg_t sreg;                              // hold sign bit
            const uint sz = tysize(e1.Ety);
            allocreg(cdb,&nretregs,&sreg,e2.Ety);
            targ_size_t signbit = 0x80000000;
            if (sz == 8)
                signbit = cast(targ_size_t)0x8000000000000000L;
            movxmmconst(cdb,sreg, sz, signbit, 0);
            getregs(cdb,nretregs);
            const opcode_t xop = (sz == 8) ? XORPD : XORPS;   // XORPD/S rreg,sreg
            cdb.gen2(xop,modregxrmx(3,rreg-XMM0,sreg-XMM0));
        }
        if (retregs != *pretregs)
            fixresult(cdb,e,retregs,pretregs);
        return;
    }

    regm_t retregs = *pretregs & XMMREGS;
    if (!retregs)
        retregs = XMMREGS;
    const constflag = OTrel(e.Eoper);
    codelem(cdb,e1,&retregs,constflag);          // eval left leaf
    const reg = findreg(retregs);
    regm_t rretregs = XMMREGS & ~retregs;
    scodelem(cdb, e2, &rretregs, retregs, true); // eval right leaf

    const rreg = findreg(rretregs);
    const op = xmmoperator(e1.Ety, e.Eoper);

    /* We should take advantage of mem addressing modes for OP XMM,MEM
     * but we do not at the moment.
     */
    if (OTrel(e.Eoper))
    {
        // comparison: UCOMISx sets flags only, no result register to fix up
        cdb.gen2(op,modregxrmx(3,rreg-XMM0,reg-XMM0));
        checkSetVex(cdb.last(), e1.Ety);
        return;
    }

    getregs(cdb,retregs);
    cdb.gen2(op,modregxrmx(3,reg-XMM0,rreg-XMM0));
    checkSetVex(cdb.last(), e1.Ety);
    if (retregs != *pretregs)
        fixresult(cdb,e,retregs,pretregs);
}


/************************
 * Generate code for an assignment using XMM registers.
 * Params:
 *      cdb = code sink
 *      e = assignment expression node
 *      op = store opcode to use, CMP means generate one from e1's type
 *      e1 = lvalue subtree
 *      e2 = rvalue subtree
 *      pretregs = in: desired result registers; out: where result landed
 */

void xmmeq(ref CodeBuilder cdb, elem *e, opcode_t op, elem *e1, elem *e2,regm_t *pretregs)
{
    tym_t tymll;            // NOTE(review): unused in this function body
    int i;                  // NOTE(review): unused in this function body
    code cs;
    elem *e11;
    bool regvar;            /* true means evaluate into register variable */
    regm_t varregm;
    targ_int postinc;

    //printf("xmmeq(e1 = %p, e2 = %p, *pretregs = %s)\n", e1, e2, regm_str(*pretregs));
    tym_t tyml = tybasic(e1.Ety);               /* type of lvalue */
    regm_t retregs = *pretregs;

    if (!(retregs & XMMREGS))
        retregs = XMMREGS;                      // pick any XMM reg

    bool aligned = xmmIsAligned(e1);
    // If default, select store opcode
    cs.Iop = (op == CMP) ? xmmstore(tyml, aligned) : op;
    regvar = false;
    varregm = 0;
    if (config.flags4 & CFG4optimized)
    {
        // Be careful of cases like (x = x+x+x). We cannot evaluate in
        // x if x is in a register.
        reg_t varreg;
        if (isregvar(e1,&varregm,&varreg) &&    // if lvalue is register variable
            doinreg(e1.EV.Vsym,e2) &&           // and we can compute directly into it
            varregm & XMMREGS
           )
        {   regvar = true;
            retregs = varregm;                  // evaluate directly in target register
        }
    }
    if (*pretregs & mPSW && OTleaf(e1.Eoper))   // if evaluating e1 couldn't change flags
    {   // Be careful that this lines up with jmpopcode()
        retregs |= mPSW;
        *pretregs &= ~mPSW;
    }
    scodelem(cdb,e2,&retregs,0,true);           // get rvalue

    // Look for special case of (*p++ = ...), where p is a register variable
    if (e1.Eoper == OPind &&
        ((e11 = e1.EV.E1).Eoper == OPpostinc || e11.Eoper == OPpostdec) &&
        e11.EV.E1.Eoper == OPvar &&
        e11.EV.E1.EV.Vsym.Sfl == FLreg
       )
    {
        postinc = e11.EV.E2.EV.Vint;
        if (e11.Eoper == OPpostdec)
            postinc = -postinc;
        getlvalue(cdb,&cs,e11,RMstore | retregs);
        freenode(e11.EV.E2);
    }
    else
    {   postinc = 0;
        getlvalue(cdb,&cs,e1,RMstore | retregs); // get lvalue (cl == CNIL if regvar)
    }

    getregs_imm(cdb,regvar ? varregm : 0);

    const reg = findreg(retregs & XMMREGS);
    cs.Irm |= modregrm(0,(reg - XMM0) & 7,0);
    if ((reg - XMM0) & 8)
        cs.Irex |= REX_R;

    // Do not generate mov from register onto itself
    if (!(regvar && reg == XMM0 + ((cs.Irm & 7) | (cs.Irex & REX_B ? 8 : 0))))
    {
        cdb.gen(&cs);                   // MOV EA+offset,reg
        checkSetVex(cdb.last(), tyml);
    }

    if (e1.Ecount ||                    // if lvalue is a CSE or
        regvar)                         // rvalue can't be a CSE
    {
        getregs_imm(cdb,retregs);       // necessary if both lvalue and
                                        //  rvalue are CSEs (since a reg
                                        //  can hold only one e at a time)
        cssave(e1,retregs,!OTleaf(e1.Eoper));     // if lvalue is a CSE
    }

    fixresult(cdb,e,retregs,pretregs);
    if (postinc)
    {
        const increg = findreg(idxregm(&cs));  // the register to increment
        if (*pretregs & mPSW)
        {   // Use LEA to avoid touching the flags
            uint rm = cs.Irm & 7;
            if (cs.Irex & REX_B)
                rm |= 8;
            cdb.genc1(LEA,buildModregrm(2,increg,rm),FLconst,postinc);
            if (tysize(e11.EV.E1.Ety) == 8)
                code_orrex(cdb.last(), REX_W);
        }
        else if (I64)
        {
            cdb.genc2(0x81,modregrmx(3,0,increg),postinc);   // ADD increg,postinc
            if (tysize(e11.EV.E1.Ety) == 8)
                code_orrex(cdb.last(), REX_W);
        }
        else
        {
            if (postinc == 1)
                cdb.gen1(0x40 + increg);        // INC increg
            else if (postinc == -cast(targ_int)1)
                cdb.gen1(0x48 + increg);        // DEC increg
            else
            {
                cdb.genc2(0x81,modregrm(3,0,increg),postinc);
            }
        }
    }
    freenode(e1);
}

/********************************
 * Generate code for conversion using SSE2 instructions.
 *
 *      OPs32_d
 *      OPs64_d (64-bit only)
 *      OPu32_d (64-bit only)
 *      OPd_f
 *      OPf_d
 *      OPd_s32
 *      OPd_u32
 *      OPd_s64 (64-bit only)
 *
 */

void xmmcnvt(ref CodeBuilder cdb,elem *e,regm_t *pretregs)
{
    //printf("xmmconvt: %p, %s\n", e, regm_str(*pretregs));
    opcode_t op = NoOpcode;
    regm_t regs;
    tym_t ty;
    ubyte rex = 0;
    bool zx = false; // zero extend uint

    /* There are no ops for integer <-> float/real conversions
     * but there are instructions for them. In order to use these
     * try to fuse chained conversions. Be careful not to loose
     * precision for real to long.
     */
    elem *e1 = e.EV.E1;
    switch (e.Eoper)
    {
    case OPd_f:
        if (e1.Eoper == OPs32_d)
        { }
        else if (I64 && e1.Eoper == OPs64_d)
            rex = REX_W;
        else if (I64 && e1.Eoper == OPu32_d)
        {   rex = REX_W;
            zx = true;
        }
        else
        {   regs = XMMREGS;
            op = CVTSD2SS;
            ty = TYfloat;
            break;
        }
        if (e1.Ecount)
        {
            // e1 is a CSE; cannot fuse, fall back to double -> float
            regs = XMMREGS;
            op = CVTSD2SS;
            ty = TYfloat;
            break;
        }
        // directly use si2ss
        regs = ALLREGS;
        e1 = e1.EV.E1;          // fused operation
        op = CVTSI2SS;
        ty = TYfloat;
        break;

    case OPs32_d:              goto Litod;
    case OPs64_d: rex = REX_W; goto Litod;
    case OPu32_d: rex = REX_W; zx = true; goto Litod;
    Litod:
        regs = ALLREGS;
        op = CVTSI2SD;
        ty = TYdouble;
        break;

    case OPd_s32: ty = TYint;  goto Ldtoi;
    case OPd_u32: ty = TYlong; if (I64) rex = REX_W; goto Ldtoi;
    case OPd_s64: ty = TYlong; rex = REX_W; goto Ldtoi;
    Ldtoi:
        regs = XMMREGS;
        switch (e1.Eoper)
        {
        case OPf_d:
            if (e1.Ecount)
            {
                op = CVTTSD2SI;
                break;
            }
            e1 = e1.EV.E1;      // fused operation
            op = CVTTSS2SI;
            break;
        case OPld_d:
            if (e.Eoper == OPd_s64)
            {
                cnvt87(cdb,e,pretregs); // precision
                return;
            }
            goto default;

        default:
            op = CVTTSD2SI;
            break;
        }
        break;

    case OPf_d:
        regs = XMMREGS;
        op = CVTSS2SD;
        ty = TYdouble;
        break;

    default:
        assert(0);
    }
    assert(op != NoOpcode);

    codelem(cdb,e1, &regs, false);
    reg_t reg = findreg(regs);
    if (isXMMreg(reg))
        reg -= XMM0;
    else if (zx)
    {   assert(I64);
        getregs(cdb,regs);
        genregs(cdb,0x8B,reg,reg);  // MOV reg,reg to zero upper 32-bit
                                    // Don't use x89 because that will get optimized away
        code_orflag(cdb.last(),CFvolatile);
    }

    regm_t retregs = *pretregs;
    if (tyxmmreg(ty)) // target is XMM
    {   if (!(*pretregs & XMMREGS))
            retregs = XMMREGS;
    }
    else              // source is XMM
    {   assert(regs & XMMREGS);
        if (!(retregs & ALLREGS))
            retregs = ALLREGS;
    }

    reg_t rreg;
    allocreg(cdb,&retregs,&rreg,ty);
    if (isXMMreg(rreg))
        rreg -= XMM0;

    cdb.gen2(op, modregxrmx(3,rreg,reg));
    assert(I64 || !rex);
    if (rex)
        code_orrex(cdb.last(), rex);

    if (*pretregs != retregs)
        fixresult(cdb,e,retregs,pretregs);
}

/********************************
 * Generate code for op=
 */

void xmmopass(ref CodeBuilder cdb,elem *e,regm_t *pretregs)
{   elem *e1 = e.EV.E1;
    elem *e2 = e.EV.E2;
    tym_t ty1 = tybasic(e1.Ety);
    const sz1 = _tysize[ty1];   // NOTE(review): unused in this function body
    regm_t rretregs = XMMREGS & ~*pretregs;
    if (!rretregs)
        rretregs = XMMREGS;

    codelem(cdb,e2,&rretregs,false);    // eval right leaf
    reg_t rreg = findreg(rretregs);

    code cs;
    regm_t retregs;
    reg_t reg;
    bool regvar = false;
    if (config.flags4 & CFG4optimized)
    {
        // Be careful of cases like (x = x+x+x). We cannot evaluate in
        // x if x is in a register.
        reg_t varreg;
        regm_t varregm;
        if (isregvar(e1,&varregm,&varreg) &&    // if lvalue is register variable
            doinreg(e1.EV.Vsym,e2)              // and we can compute directly into it
           )
        {   regvar = true;
            retregs = varregm;
            reg = varreg;               // evaluate directly in target register
            getregs(cdb,retregs);       // destroy these regs
        }
    }

    if (!regvar)
    {
        // load the lvalue into a freshly allocated XMM register
        getlvalue(cdb,&cs,e1,rretregs);         // get EA
        retregs = *pretregs & XMMREGS & ~rretregs;
        if (!retregs)
            retregs = XMMREGS & ~rretregs;
        allocreg(cdb,&retregs,&reg,ty1);
        cs.Iop = xmmload(ty1, true);            // MOVSD xmm,xmm_m64
        code_newreg(&cs,reg - XMM0);
        cdb.gen(&cs);
        checkSetVex(cdb.last(), ty1);
    }

    const op = xmmoperator(e1.Ety, e.Eoper);
    cdb.gen2(op,modregxrmx(3,reg-XMM0,rreg-XMM0));
    checkSetVex(cdb.last(), e1.Ety);

    if (!regvar)
    {
        // store the result back to the lvalue's EA
        cs.Iop = xmmstore(ty1,true);    // reverse operand order of MOVS[SD]
        cdb.gen(&cs);
        checkSetVex(cdb.last(), ty1);
    }

    if (e1.Ecount ||                    // if lvalue is a CSE or
        regvar)                         // rvalue can't be a CSE
    {
        getregs_imm(cdb,retregs);       // necessary if both lvalue and
                                        //  rvalue are CSEs (since a reg
                                        //  can hold only one e at a time)
        cssave(e1,retregs,!OTleaf(e1.Eoper));     // if lvalue is a CSE
    }

    fixresult(cdb,e,retregs,pretregs);
    freenode(e1);
}

/********************************
 * Generate code for post increment and post decrement.
 */

void xmmpost(ref CodeBuilder cdb,elem *e,regm_t *pretregs)
{
    elem *e1 = e.EV.E1;
    elem *e2 = e.EV.E2;
    tym_t ty1 = tybasic(e1.Ety);

    regm_t retregs;
    reg_t reg;
    bool regvar = false;
    if (config.flags4 & CFG4optimized)
    {
        // Be careful of cases like (x = x+x+x). We cannot evaluate in
        // x if x is in a register.
        reg_t varreg;
        regm_t varregm;
        if (isregvar(e1,&varregm,&varreg) &&    // if lvalue is register variable
            doinreg(e1.EV.Vsym,e2)              // and we can compute directly into it
           )
        {
            regvar = true;
            retregs = varregm;
            reg = varreg;               // evaluate directly in target register
            getregs(cdb,retregs);       // destroy these regs
        }
    }

    code cs;
    if (!regvar)
    {
        // load the lvalue into a freshly allocated XMM register
        getlvalue(cdb,&cs,e1,0);                // get EA
        retregs = XMMREGS & ~*pretregs;
        if (!retregs)
            retregs = XMMREGS;
        allocreg(cdb,&retregs,&reg,ty1);
        cs.Iop = xmmload(ty1, true);            // MOVSD xmm,xmm_m64
        code_newreg(&cs,reg - XMM0);
        cdb.gen(&cs);
        checkSetVex(cdb.last(), ty1);
    }

    // Result register holds the pre-modification value (post-inc/dec semantics)
    regm_t resultregs = XMMREGS & *pretregs & ~retregs;
    if (!resultregs)
        resultregs = XMMREGS & ~retregs;
    reg_t resultreg;
    allocreg(cdb,&resultregs, &resultreg, ty1);

    cdb.gen2(xmmload(ty1,true),modregxrmx(3,resultreg-XMM0,reg-XMM0));   // MOVSS/D resultreg,reg
    checkSetVex(cdb.last(), ty1);

    regm_t rretregs = XMMREGS & ~(*pretregs | retregs | resultregs);
    if (!rretregs)
        rretregs = XMMREGS & ~(retregs | resultregs);
    codelem(cdb,e2,&rretregs,false);    // eval right leaf
    const rreg = findreg(rretregs);

    const op = xmmoperator(e1.Ety, e.Eoper);
    cdb.gen2(op,modregxrmx(3,reg-XMM0,rreg-XMM0));  // ADD reg,rreg
    checkSetVex(cdb.last(), e1.Ety);

    if (!regvar)
    {
        // store the incremented value back to the lvalue's EA
        cs.Iop = xmmstore(ty1,true);    // reverse operand order of MOVS[SD]
        cdb.gen(&cs);
        checkSetVex(cdb.last(), ty1);
    }

    if (e1.Ecount ||                    // if lvalue is a CSE or
        regvar)                         // rvalue can't be a CSE
    {
        getregs_imm(cdb,retregs);       // necessary if both lvalue and
                                        //  rvalue are CSEs (since a reg
                                        //  can hold only one e at a time)
        cssave(e1,retregs,!OTleaf(e1.Eoper));     // if lvalue is a CSE
    }

    fixresult(cdb,e,resultregs,pretregs);
    freenode(e1);
}

/******************
 * Negate operator
 */

void xmmneg(ref CodeBuilder cdb,elem *e,regm_t *pretregs)
{
    //printf("xmmneg()\n");
    //elem_print(e);
    assert(*pretregs);
    tym_t tyml = tybasic(e.EV.E1.Ety);
    int sz = _tysize[tyml];

    regm_t retregs = *pretregs & XMMREGS;
    if (!retregs)
        retregs = XMMREGS;

    /* Generate:
     *    MOV reg,e1
     *    MOV rreg,signbit
     *    XOR reg,rreg
     */
    codelem(cdb,e.EV.E1,&retregs,false);
    getregs(cdb,retregs);
    const reg = findreg(retregs);
    regm_t rretregs = XMMREGS & ~retregs;
    reg_t rreg;
    allocreg(cdb,&rretregs,&rreg,tyml);
    targ_size_t signbit = 0x80000000;
    if (sz == 8)
        signbit = cast(targ_size_t)0x8000000000000000L;
    movxmmconst(cdb,rreg, sz, signbit, 0);

    getregs(cdb,retregs);
    const op = (sz == 8) ? XORPD : XORPS;       // XORPD/S reg,rreg
    cdb.gen2(op,modregxrmx(3,reg-XMM0,rreg-XMM0));
    fixresult(cdb,e,retregs,pretregs);
}

/******************
 * Absolute value operator OPabs
 */

void xmmabs(ref CodeBuilder cdb,elem *e,regm_t *pretregs)
{
    //printf("xmmabs()\n");
    //elem_print(e);
    assert(*pretregs);
    tym_t tyml = tybasic(e.EV.E1.Ety);
    int sz = _tysize[tyml];

    regm_t retregs = *pretregs & XMMREGS;
    if (!retregs)
        retregs = XMMREGS;

    /* Generate:
     *    MOV reg,e1
     *    MOV rreg,mask
     *    AND reg,rreg
     */
    codelem(cdb,e.EV.E1,&retregs,false);
    getregs(cdb,retregs);
    const reg = findreg(retregs);
    regm_t rretregs = XMMREGS & ~retregs;
    reg_t rreg;
    allocreg(cdb,&rretregs,&rreg,tyml);
    targ_size_t mask = 0x7FFF_FFFF;             // clear only the sign bit
    if (sz == 8)
        mask = cast(targ_size_t)0x7FFF_FFFF_FFFF_FFFFL;
    movxmmconst(cdb,rreg, sz, mask, 0);

    getregs(cdb,retregs);
    const op = (sz == 8) ? ANDPD : ANDPS;       // ANDPD/S reg,rreg
    cdb.gen2(op,modregxrmx(3,reg-XMM0,rreg-XMM0));
    fixresult(cdb,e,retregs,pretregs);
}

/*****************************
 * Get correct load operator based on type.
 * It is important to use the right one even if the number of bits moved is the same,
 * as there are performance consequences for using the wrong one.
 * Params:
 *      tym = type of data to load
 *      aligned = for vectors, true if aligned to 16 bytes
 * Returns:
 *      the LODxxxx opcode matching tym
 */

opcode_t xmmload(tym_t tym, bool aligned)
{
    opcode_t op;
    if (tysize(tym) == 32)
        aligned = false;        // 32 byte (YMM) operands use unaligned forms
    switch (tybasic(tym))
    {
        case TYuint:
        case TYint:
        case TYlong:
        case TYulong:   op = LODD;  break;      // MOVD
        case TYfloat:
        case TYcfloat:
        case TYifloat:  op = LODSS; break;      // MOVSS
        case TYllong:
        case TYullong:  op = LODQ;  break;      // MOVQ
        case TYdouble:
        case TYcdouble:
        case TYidouble: op = LODSD; break;      // MOVSD

        case TYfloat8:
        case TYfloat4:  op = aligned ? LODAPS : LODUPS; break;  // MOVAPS / MOVUPS
        case TYdouble4:
        case TYdouble2: op = aligned ? LODAPD : LODUPD; break;  // MOVAPD / MOVUPD
        case TYschar16:
        case TYuchar16:
        case TYshort8:
        case TYushort8:
        case TYlong4:
        case TYulong4:
        case TYllong2:
        case TYullong2:
        case TYschar32:
        case TYuchar32:
        case TYshort16:
        case TYushort16:
        case TYlong8:
        case TYulong8:
        case TYllong4:
        case TYullong4: op = aligned ? LODDQA : LODDQU; break;  // MOVDQA / MOVDQU

        default:
            printf("tym = x%x\n", tym);
            assert(0);
    }
    return op;
}

/*****************************
 * Get correct store operator based on type.
 * Params:
 *      tym = type of data to store
 *      aligned = for vectors, true if aligned to 16 bytes
 * Returns:
 *      the STOxxxx opcode matching tym
 */

opcode_t xmmstore(tym_t tym, bool aligned)
{
    opcode_t op;
    switch (tybasic(tym))
    {
        case TYuint:
        case TYint:
        case TYlong:
        case TYulong:   op = STOD;  break;      // MOVD
        case TYfloat:
        case TYifloat:  op = STOSS; break;      // MOVSS
        case TYllong:
        case TYullong:  op = STOQ;  break;      // MOVQ
        case TYdouble:
        case TYidouble:
        case TYcdouble:
        case TYcfloat:  op = STOSD; break;      // MOVSD

        case TYfloat8:
        case TYfloat4:  op = aligned ? STOAPS : STOUPS; break;  // MOVAPS / MOVUPS
        case TYdouble4:
        case TYdouble2: op = aligned ? STOAPD : STOUPD; break;  // MOVAPD / MOVUPD
        case TYschar16:
        case TYuchar16:
        case TYshort8:
        case TYushort8:
        case TYlong4:
        case TYulong4:
        case TYllong2:
        case TYullong2:
        case TYschar32:
        case TYuchar32:
        case TYshort16:
        case TYushort16:
        case TYlong8:
        case TYulong8:
        case TYllong4:
        case TYullong4: op = aligned ? STODQA : STODQU; break;  // MOVDQA / MOVDQU

        default:
            printf("tym = 0x%x\n", tym);
            assert(0);
    }
    return op;
}


/************************************
 * Get correct XMM operator based on type and operator.
 */

private opcode_t xmmoperator(tym_t tym, OPER oper)
{
    tym = tybasic(tym);
    opcode_t op;
    switch (oper)
    {
        case OPadd:
        case OPaddass:
        case OPpostinc:
            switch (tym)
            {
                case TYfloat:
                case TYifloat:  op = ADDSS;  break;
                case TYdouble:
                case TYidouble: op = ADDSD;  break;

                // SIMD vector types
                case TYfloat8:
                case TYfloat4:  op = ADDPS;  break;
                case TYdouble4:
                case TYdouble2: op = ADDPD;  break;
                case TYschar32:
                case TYuchar32:
                case TYschar16:
                case TYuchar16: op = PADDB;  break;
                case TYshort16:
                case TYushort16:
                case TYshort8:
                case TYushort8: op = PADDW;  break;
                case TYlong8:
                case TYulong8:
                case TYlong4:
                case TYulong4:  op = PADDD;  break;
                case TYllong4:
                case TYullong4:
                case TYllong2:
                case TYullong2: op = PADDQ;  break;

                default:
                    printf("tym = x%x\n", tym);
                    assert(0);
            }
            break;

        case OPmin:
        case OPminass:
        case OPpostdec:
            switch (tym)
            {
                case TYfloat:
                case TYifloat:  op = SUBSS;  break;
                case TYdouble:
                case TYidouble: op = SUBSD;  break;

                // SIMD vector types
                case TYfloat8:
                case TYfloat4:  op = SUBPS;  break;
                case TYdouble4:
                case TYdouble2: op = SUBPD;  break;
                case TYschar32:
                case TYuchar32:
                case TYschar16:
                case TYuchar16: op = PSUBB;  break;
                case TYshort16:
                case TYushort16:
                case TYshort8:
                case TYushort8: op = PSUBW;  break;
                case TYlong8:
                case TYulong8:
                case TYlong4:
                case TYulong4:  op = PSUBD;  break;
                case TYllong4:
                case TYullong4:
                case TYllong2:
                case TYullong2: op = PSUBQ;  break;

                default: assert(0);
            }
            break;

        case OPmul:
        case OPmulass:
            switch (tym)
            {
                case TYfloat:
                case TYifloat:  op = MULSS;  break;
                case TYdouble:
                case TYidouble: op = MULSD;  break;

                // SIMD vector types
                case TYfloat8:
                case TYfloat4:  op = MULPS;  break;
                case TYdouble4:
                case TYdouble2: op = MULPD;  break;
                case TYshort16:
                case TYushort16:
                case TYshort8:
                case TYushort8: op = PMULLW; break;
                case TYlong8:
                case TYulong8:
                case TYlong4:
                case TYulong4:  op = PMULLD; break;

                default: assert(0);
            }
            break;

        case OPdiv:
        case OPdivass:
            switch (tym)
            {
                case TYfloat:
                case TYifloat:  op = DIVSS;  break;
                case TYdouble:
                case TYidouble: op = DIVSD;  break;

                // SIMD vector types
                case TYfloat8:
                case TYfloat4:  op = DIVPS;  break;
                case TYdouble4:
                case TYdouble2: op = DIVPD;  break;

                default: assert(0);
            }
            break;

        case OPor:
        case OPorass:
            switch (tym)
            {
                // SIMD vector types
                case TYschar16:
                case TYuchar16:
                case TYshort8:
                case TYushort8:
                case TYlong4:
                case TYulong4:
                case TYllong2:
                case TYullong2:
                case TYschar32:
                case TYuchar32:
                case TYshort16:
                case TYushort16:
                case TYlong8:
                case TYulong8:
                case TYllong4:
                case TYullong4: op = POR; break;

                default: assert(0);
            }
            break;

        case OPand:
        case OPandass:
            switch (tym)
            {
                // SIMD vector types
                case TYschar16:
                case TYuchar16:
                case TYshort8:
                case TYushort8:
                case TYlong4:
                case TYulong4:
                case TYllong2:
                case TYullong2:
                case TYschar32:
                case TYuchar32:
                case TYshort16:
                case TYushort16:
                case TYlong8:
                case TYulong8:
                case TYllong4:
                case TYullong4: op = PAND; break;

                default: assert(0);
            }
            break;

        case OPxor:
        case OPxorass:
            switch (tym)
            {
                // SIMD vector types
                case TYschar16:
                case TYuchar16:
                case TYshort8:
                case TYushort8:
                case TYlong4:
                case TYulong4:
                case TYllong2:
                case TYullong2:
                case TYschar32:
                case TYuchar32:
                case TYshort16:
                case TYushort16:
                case TYlong8:
                case TYulong8:
                case TYllong4:
                case TYullong4: op = PXOR; break;

                default: assert(0);
            }
            break;

        // All comparison operators map to an unordered compare (flags only)
        case OPlt:
        case OPle:
        case OPgt:
        case OPge:
        case OPne:
        case OPeqeq:
        case OPunord:        /* !<>=     */
        case OPlg:           /* <>       */
        case OPleg:          /* <>=      */
        case OPule:          /* !>       */
        case OPul:           /* !>=      */
        case OPuge:          /* !<       */
        case OPug:           /* !<=      */
        case OPue:           /* !<>      */
        case OPngt:
        case OPnge:
        case OPnlt:
        case OPnle:
        case OPord:
        case OPnlg:
        case OPnleg:
        case OPnule:
        case OPnul:
        case OPnuge:
        case OPnug:
        case OPnue:
            switch (tym)
            {
                case TYfloat:
                case TYifloat:  op = UCOMISS;  break;
                case TYdouble:
                case TYidouble: op = UCOMISD;  break;

                default: assert(0);
            }
            break;

        default:
            assert(0);
    }
    return op;
}

void cdvector(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
{
    /* e should look like one of:
     *    vector
     *      |
     *    param
     *    /   \
     *  param op2
     *  /   \
     * op   op1
     */

    if (!config.fpxmmregs)
    {   printf("SIMD operations not supported on this platform\n");
        exit(1);
    }

    const n = el_nparams(e.EV.E1);
    elem **params = cast(elem **)malloc(n * (elem *).sizeof);
    assert(params);
    elem **tmp = params;
    el_paramArray(&tmp, e.EV.E1);
    // params[] is freed at the end of this function via free(params)

    static if (0)
    {
        printf("cdvector()\n");
        for (int i = 0; i < n; i++)
        {
            printf("[%d]: ", i);
            elem_print(params[i]);
        }
    }

    if (*pretregs == 0)
    {   /* Evaluate for side effects only
         */
        foreach (i; 0 .. n)
        {
            codelem(cdb,params[i], pretregs, false);
            *pretregs = 0;      // in case they got set
        }
        return;
    }

    assert(n >= 2 && n <= 4);

    elem *eop = params[0];
    elem *op1 = params[1];
    elem *op2 = null;
    tym_t ty2 = 0;
    if (n >= 3)
    {   op2 = params[2];
        ty2 = tybasic(op2.Ety);
    }

    auto op = cast(opcode_t)el_tolong(eop);
    debug assert(!isXMMstore(op));
    tym_t ty1 = tybasic(op1.Ety);

    regm_t retregs;
    if (n == 3 && ty2 == TYuchar && op2.Eoper == OPconst)
    {   // Handle: op xmm,imm8

        retregs = *pretregs & XMMREGS;
        if (!retregs)
            retregs = XMMREGS;
        codelem(cdb,op1,&retregs,false);    // eval left leaf
        const reg = findreg(retregs);
        int r;
        // Map shift pseudo-ops to the immediate-form encoding (reg field selects the op)
        switch (op)
        {
            case PSLLD:  r = 6; op = 0x660F72;  break;
            case PSLLQ:  r = 6; op = 0x660F73;  break;
            case PSLLW:  r = 6; op = 0x660F71;  break;
            case PSRAD:  r = 4; op = 0x660F72;  break;
            case PSRAW:  r = 4; op = 0x660F71;  break;
            case PSRLD:  r = 2; op = 0x660F72;  break;
            case PSRLQ:  r = 2; op = 0x660F73;  break;
            case PSRLW:  r = 2; op = 0x660F71;  break;
            case PSRLDQ: r = 3; op = 0x660F73;  break;
            case PSLLDQ: r = 7; op = 0x660F73;  break;

            default:
                printf("op = x%x\n", op);
                assert(0);
        }
        getregs(cdb,retregs);
        cdb.genc2(op,modregrmx(3,r,reg-XMM0), cast(uint)el_tolong(op2));
    }
    else if (n == 2)
    {   /* Handle: op xmm,mem
         * where xmm is written only, not read
         */
        code cs;

        if ((op1.Eoper == OPind && !op1.Ecount) || op1.Eoper == OPvar)
        {
            getlvalue(cdb,&cs, op1, RMload);    // get addressing mode
        }
        else
        {
            regm_t rretregs = XMMREGS;
            codelem(cdb,op1, &rretregs, false);
            const rreg = findreg(rretregs) - XMM0;
            cs.Irm = modregrm(3,0,rreg & 7);
            cs.Iflags = 0;
            cs.Irex = 0;
            if (rreg & 8)
                cs.Irex |= REX_B;
        }

        retregs = *pretregs & XMMREGS;
        if (!retregs)
            retregs = XMMREGS;
        reg_t reg;
        allocreg(cdb,&retregs, &reg, e.Ety);
        code_newreg(&cs, reg - XMM0);
        cs.Iop = op;
        cdb.gen(&cs);
    }
    else if (n == 3 || n == 4)
    {   /* Handle:
         *      op xmm,mem        // n = 3
         *      op xmm,mem,imm8   // n = 4
         * Both xmm and mem are operands, evaluate xmm first.
         */

        code cs;

        retregs = *pretregs & XMMREGS;
        if (!retregs)
            retregs = XMMREGS;
        codelem(cdb,op1,&retregs,false);    // eval left leaf
        const reg = findreg(retregs);

        if ((op2.Eoper == OPind && !op2.Ecount) || op2.Eoper == OPvar)
        {
            getlvalue(cdb,&cs, op2, RMload | retregs);    // get addressing mode
        }
        else
        {
            regm_t rretregs = XMMREGS & ~retregs;
            scodelem(cdb, op2, &rretregs, retregs, true);
            const rreg = findreg(rretregs) - XMM0;
            cs.Irm = modregrm(3,0,rreg & 7);
            cs.Iflags = 0;
            cs.Irex = 0;
            if (rreg & 8)
                cs.Irex |= REX_B;
        }

        getregs(cdb,retregs);

        // These opcodes require an imm8; if the caller omitted it (n == 3),
        // report an error and substitute 0 so code generation can continue.
        switch (op)
        {
            case CMPPD:   case CMPSS:   case CMPSD:   case CMPPS:
            case PSHUFD:  case PSHUFHW: case PSHUFLW:
            case BLENDPD: case BLENDPS: case DPPD:    case DPPS:
            case MPSADBW: case PBLENDW:
            case ROUNDPD: case ROUNDPS: case ROUNDSD: case ROUNDSS:
            case SHUFPD:  case SHUFPS:
                if (n == 3)
                {
                    version (MARS)
                        if (pass == PASSfinal)
                            error(e.Esrcpos.Sfilename, e.Esrcpos.Slinnum, e.Esrcpos.Scharnum, "missing 4th parameter to `__simd()`");
                    cs.IFL2 = FLconst;
                    cs.IEV2.Vsize_t = 0;
                }
                break;
            default:
                break;
        }

        if (n == 4)
        {
            elem *imm8 = params[3];
            cs.IFL2 = FLconst;
            version (MARS)
            {
                if (imm8.Eoper != OPconst)
                {
                    error(imm8.Esrcpos.Sfilename, imm8.Esrcpos.Slinnum, imm8.Esrcpos.Scharnum, "last parameter to `__simd()` must be a constant");
                    cs.IEV2.Vsize_t = 0;
                }
                else
                    cs.IEV2.Vsize_t = cast(targ_size_t)el_tolong(imm8);
            }
            else
            {
                cs.IEV2.Vsize_t = cast(targ_size_t)el_tolong(imm8);
            }
        }
        code_newreg(&cs, reg - XMM0);
        cs.Iop = op;
        cdb.gen(&cs);
    }
    else
        assert(0);
    fixresult(cdb,e,retregs,pretregs);
    free(params);       // matches the malloc above
    freenode(e);
}

/***************
 * Generate code for vector "store" operations.
 * The tree e must look like:
 *      (op1 OPvecsto (op OPparam op2))
 * where op is the store instruction STOxxxx.
 */
void cdvecsto(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
{
    //printf("cdvecsto()\n");
    //elem_print(e);
    elem *op1 = e.EV.E1;
    elem *op2 = e.EV.E2.EV.E2;
    elem *eop = e.EV.E2.EV.E1;
    const op = cast(opcode_t)el_tolong(eop);
    debug assert(isXMMstore(op));
    xmmeq(cdb, e, op, op1, op2, pretregs);      // delegate to the assignment generator
}

/***************
 * Generate code for OPvecfill (broadcast).
 * OPvecfill takes the single value in e1 and
 * fills the vector type with it.
 * Params:
 *      cdb = code sink
 *      e = OPvecfill tree; e.EV.E1 is the scalar to broadcast
 *      pretregs = mask of desired result registers
 */
void cdvecfill(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
{
    //printf("cdvecfill(e = %p, *pretregs = %s)\n",e,regm_str(*pretregs));

    regm_t retregs = *pretregs & XMMREGS;
    if (!retregs)
        retregs = XMMREGS;

    code *c;
    code cs;

    elem *e1 = e.EV.E1;
    static if (0)       // disabled prototype for a memory-operand fast path
    {
        if ((e1.Eoper == OPind && !e1.Ecount) || e1.Eoper == OPvar)
        {
            cr = getlvalue(&cs, e1, RMload | retregs);     // get addressing mode
        }
        else
        {
            regm_t rretregs = XMMREGS & ~retregs;
            cr = scodelem(op2, &rretregs, retregs, true);
            const rreg = findreg(rretregs) - XMM0;
            cs.Irm = modregrm(3,0,rreg & 7);
            cs.Iflags = 0;
            cs.Irex = 0;
            if (rreg & 8)
                cs.Irex |= REX_B;
        }
    }

    /* e.Ety only gives us the size of the result vector, not its type.
     * We must combine it with the vector element type, e1.Ety, to
     * form the resulting vector type, ty.
     * The reason is someone may have painted the result of the OPvecfill to
     * a different vector type.
     */
    const sz = tysize(e.Ety);
    const ty1 = tybasic(e1.Ety);
    assert(sz == 16 || sz == 32);       // XMM (128-bit) or YMM (256-bit) result only
    const bool x16 = (sz == 16);

    // Map element type + result size to the concrete vector type
    tym_t ty;
    switch (ty1)
    {
        case TYfloat:   ty = x16 ? TYfloat4   : TYfloat8;   break;
        case TYdouble:  ty = x16 ? TYdouble2  : TYdouble4;  break;
        case TYschar:   ty = x16 ? TYschar16  : TYschar32;  break;
        case TYuchar:   ty = x16 ? TYuchar16  : TYuchar32;  break;
        case TYshort:   ty = x16 ? TYshort8   : TYshort16;  break;
        case TYushort:  ty = x16 ? TYushort8  : TYushort16; break;
        case TYint:
        case TYlong:    ty = x16 ? TYlong4    : TYlong8;    break;
        case TYuint:
        case TYulong:   ty = x16 ? TYulong4   : TYulong8;   break;
        case TYllong:   ty = x16 ? TYllong2   : TYllong4;   break;
        case TYullong:  ty = x16 ? TYullong2  : TYullong4;  break;

        default:
            assert(0);                  // not a broadcastable element type
    }

    switch (ty)
    {
        case TYfloat4:
        case TYfloat8:
            if (config.avx && e1.Eoper == OPind && !e1.Ecount)
            {
                // VBROADCASTSS X/YMM,MEM -- broadcast straight from memory
                getlvalue(cdb,&cs, e1, 0);         // get addressing mode
                assert((cs.Irm & 0xC0) != 0xC0);   // AVX1 doesn't have register source operands
                reg_t reg;
                allocreg(cdb,&retregs,&reg,ty);
                cs.Iop = VBROADCASTSS;
                cs.Irex &= ~REX_W;
                code_newreg(&cs,reg - XMM0);
                checkSetVex(&cs,ty);
                cdb.gen(&cs);
            }
            else
            {
                codelem(cdb,e1,&retregs,false); // eval left leaf
                const reg = cast(reg_t)(findreg(retregs) - XMM0);
                getregs(cdb,retregs);
                if (config.avx >= 2)
                {
                    // VBROADCASTSS X/YMM,XMM -- AVX2 has the register form
                    cdb.gen2(VBROADCASTSS, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), ty);
                }
                else
                {
                    // (V)SHUFPS XMM,XMM,0 -- replicate element 0 into all lanes
                    cdb.genc2(SHUFPS, modregxrmx(3,reg,reg), 0);
                    checkSetVex(cdb.last(), ty);
                    if (tysize(ty) == 32)
                    {
                        // VINSERTF128 YMM,YMM,XMM,1 -- copy low 128 bits to high half
                        cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1);
                        checkSetVex(cdb.last(), ty);
                    }
                }
            }
            break;

        case TYdouble2:
        case TYdouble4:
            if (config.avx && tysize(ty) == 32 && e1.Eoper == OPind && !e1.Ecount)
            {
                // VBROADCASTSD YMM,MEM (AVX1 VBROADCASTSD has no 128-bit form)
                getlvalue(cdb,&cs, e1, 0);         // get addressing mode
                assert((cs.Irm & 0xC0) != 0xC0);   // AVX1 doesn't have register source operands
                reg_t reg;
                allocreg(cdb,&retregs,&reg,ty);
                cs.Iop = VBROADCASTSD;
                cs.Irex &= ~REX_W;
                code_newreg(&cs,reg - XMM0);
                checkSetVex(&cs,ty);
                cdb.gen(&cs);
            }
            else
            {
                codelem(cdb,e1,&retregs,false); // eval left leaf
                const reg = cast(reg_t)(findreg(retregs) - XMM0);
                getregs(cdb,retregs);
                if (config.avx >= 2 && tysize(ty) == 32)
                {
                    // VBROADCASTSD YMM,XMM
                    cdb.gen2(VBROADCASTSD, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), ty);
                }
                else
                {
                    // (V)UNPCKLPD XMM,XMM -- duplicate low double into both lanes
                    cdb.gen2(UNPCKLPD, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), TYdouble2); // AVX-128
                    if (tysize(ty) == 32)
                    {
                        // VINSERTF128 YMM,YMM,XMM,1
                        cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1);
                        checkSetVex(cdb.last(), ty);
                    }
                }
            }
            break;

        case TYschar16:
        case TYuchar16:
        case TYschar32:
        case TYuchar32:
            if (config.avx >= 2 && e1.Eoper == OPind && !e1.Ecount)
            {
                // VPBROADCASTB X/YMM,MEM
                getlvalue(cdb,&cs, e1, 0);         // get addressing mode
                assert((cs.Irm & 0xC0) != 0xC0);   // AVX1 doesn't have register source operands
                reg_t reg;
                allocreg(cdb,&retregs,&reg,ty);
                cs.Iop = VPBROADCASTB;
                cs.Irex &= ~REX_W;
                code_newreg(&cs,reg - XMM0);
                checkSetVex(&cs,ty);
                cdb.gen(&cs);
            }
            else
            {
                // Byte value arrives in a GP register, then is moved to XMM
                regm_t regm = ALLREGS;
                codelem(cdb,e1,&regm,true); // eval left leaf
                const r = findreg(regm);

                reg_t reg;
                allocreg(cdb,&retregs,&reg, e.Ety);
                reg -= XMM0;
                // (V)MOVD reg,r
                cdb.gen2(LODD,modregxrmx(3,reg,r));
                checkSetVex(cdb.last(), TYushort8);
                if (config.avx >= 2)
                {
                    // VPBROADCASTB X/YMM,XMM
                    cdb.gen2(VPBROADCASTB, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), ty);
                }
                else
                {
                    if (config.avx)
                    {
                        // AVX1: shuffle with an all-zero control vector,
                        // i.e. PSHUFB selects byte 0 for every lane.
                        reg_t zeroreg;
                        regm = XMMREGS & ~retregs;
                        // VPXOR XMM1,XMM1,XMM1
                        allocreg(cdb,&regm,&zeroreg, ty);
                        zeroreg -= XMM0;
                        cdb.gen2(PXOR, modregxrmx(3,zeroreg,zeroreg));
                        checkSetVex(cdb.last(), TYuchar16); // AVX-128
                        // VPSHUFB XMM,XMM,XMM1
                        cdb.gen2(PSHUFB, modregxrmx(3,reg,zeroreg));
                        checkSetVex(cdb.last(), TYuchar16); // AVX-128
                    }
                    else
                    {
                        // SSE: widen byte -> word -> dword, then shuffle dwords
                        // PUNPCKLBW XMM,XMM
                        cdb.gen2(PUNPCKLBW, modregxrmx(3,reg,reg));
                        // PUNPCKLWD XMM,XMM
                        cdb.gen2(PUNPCKLWD, modregxrmx(3,reg,reg));
                        // PSHUFD XMM,XMM,0
                        cdb.genc2(PSHUFD, modregxrmx(3,reg,reg), 0);
                    }
                    if (tysize(ty) == 32)
                    {
                        // VINSERTF128 YMM,YMM,XMM,1
                        cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1);
                        checkSetVex(cdb.last(), ty);
                    }
                }
            }
            break;

        case TYshort8:
        case TYushort8:
        case TYshort16:
        case TYushort16:
            if (config.avx >= 2 && e1.Eoper == OPind && !e1.Ecount)
            {
                // VPBROADCASTW X/YMM,MEM
                getlvalue(cdb,&cs, e1, 0);         // get addressing mode
                assert((cs.Irm & 0xC0) != 0xC0);   // AVX1 doesn't have register source operands
                reg_t reg;
                allocreg(cdb,&retregs,&reg,ty);
                cs.Iop = VPBROADCASTW;
                cs.Irex &= ~REX_W;
                cs.Iflags &= ~CFopsize;     // no 66h operand-size prefix; VEX.pp carries it
                code_newreg(&cs,reg - XMM0);
                checkSetVex(&cs,ty);
                cdb.gen(&cs);
            }
            else
            {
                regm_t regm = ALLREGS;
                codelem(cdb,e1,&regm,true); // eval left leaf
                reg_t r = findreg(regm);

                reg_t reg;
                allocreg(cdb,&retregs,&reg, e.Ety);
                reg -= XMM0;
                // (V)MOVD reg,r
                cdb.gen2(LODD,modregxrmx(3,reg,r));
                checkSetVex(cdb.last(), TYushort8);
                if (config.avx >= 2)
                {
                    // VPBROADCASTW X/YMM,XMM
                    cdb.gen2(VPBROADCASTW, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), ty);
                }
                else
                {
                    // Widen word -> dword, then replicate dwords
                    // (V)PUNPCKLWD XMM,XMM
                    cdb.gen2(PUNPCKLWD, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), TYushort8); // AVX-128
                    // (V)PSHUFD XMM,XMM,0
                    cdb.genc2(PSHUFD, modregxrmx(3,reg,reg), 0);
                    checkSetVex(cdb.last(), TYushort8); // AVX-128
                    if (tysize(ty) == 32)
                    {
                        // VINSERTF128 YMM,YMM,XMM,1
                        cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1);
                        checkSetVex(cdb.last(), ty);
                    }
                }
            }
            break;

        case TYlong8:
        case TYulong8:
        case TYlong4:
        case TYulong4:
            if (config.avx && e1.Eoper == OPind && !e1.Ecount)
            {
                // VPBROADCASTD/VBROADCASTSS X/YMM,MEM
                getlvalue(cdb,&cs, e1, 0);         // get addressing mode
                assert((cs.Irm & 0xC0) != 0xC0);   // AVX1 doesn't have register source operands
                reg_t reg;
                allocreg(cdb,&retregs,&reg,ty);
                // AVX1 lacks VPBROADCASTD; VBROADCASTSS moves the same 32 bits
                cs.Iop = config.avx >= 2 ? VPBROADCASTD : VBROADCASTSS;
                cs.Irex &= ~REX_W;
                code_newreg(&cs,reg - XMM0);
                checkSetVex(&cs,ty);
                cdb.gen(&cs);
            }
            else
            {
                codelem(cdb,e1,&retregs,true); // eval left leaf
                const reg = cast(reg_t)(findreg(retregs) - XMM0);
                getregs(cdb,retregs);
                if (config.avx >= 2)
                {
                    // VPBROADCASTD X/YMM,XMM
                    cdb.gen2(VPBROADCASTD, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), ty);
                }
                else
                {
                    // (V)PSHUFD XMM,XMM,0
                    cdb.genc2(PSHUFD, modregxrmx(3,reg,reg), 0);
                    checkSetVex(cdb.last(), TYulong4); // AVX-128
                    if (tysize(ty) == 32)
                    {
                        // VINSERTF128 YMM,YMM,XMM,1
                        cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1);
                        checkSetVex(cdb.last(), ty);
                    }
                }
            }
            break;

        case TYllong2:
        case TYullong2:
        case TYllong4:
        case TYullong4:
            if (e1.Eoper == OPind && !e1.Ecount)
            {
                // VPBROADCASTQ/VBROADCASTSD/(V)PUNPCKLQDQ X/YMM,MEM
                getlvalue(cdb,&cs, e1, 0);         // get addressing mode
                assert((cs.Irm & 0xC0) != 0xC0);   // AVX1 doesn't have register source operands
                reg_t reg;
                allocreg(cdb,&retregs,&reg,ty);
                // Best available 64-bit broadcast for the target ISA level
                cs.Iop = config.avx >= 2 ? VPBROADCASTQ : tysize(ty) == 32 ? VBROADCASTSD : PUNPCKLQDQ;
                cs.Irex &= ~REX_W;
                code_newreg(&cs,reg - XMM0);
                checkSetVex(&cs,ty);
                cdb.gen(&cs);
            }
            else
            {
                codelem(cdb,e1,&retregs,true); // eval left leaf
                const reg = cast(reg_t)(findreg(retregs) - XMM0);
                getregs(cdb,retregs);
                if (config.avx >= 2)
                {
                    // VPBROADCASTQ X/YMM,XMM
                    cdb.gen2(VPBROADCASTQ, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), ty);
                }
                else
                {
                    // (V)PUNPCKLQDQ XMM,XMM -- duplicate low qword
                    cdb.genc2(PUNPCKLQDQ, modregxrmx(3,reg,reg), 0);
                    checkSetVex(cdb.last(), TYullong2); // AVX-128
                    if (tysize(ty) == 32)
                    {
                        // VINSERTF128 YMM,YMM,XMM,1
                        cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1);
                        checkSetVex(cdb.last(), ty);
                    }
                }
            }
            break;

        default:
            assert(0);
    }

    fixresult(cdb,e,retregs,pretregs);
}

/*******************************************
 * Determine if lvalue e is a vector aligned on a 16/32 byte boundary.
 * Assume it to be aligned unless can prove it is not.
 * Params:
 *      e = lvalue
 * Returns:
 *      false if definitely not aligned
 */

bool xmmIsAligned(elem *e)
{
    if (tyvector(e.Ety) && e.Eoper == OPvar)
    {
        Symbol *s = e.EV.Vsym;
        const alignsz = tyalignsize(e.Ety);
        if (Symbol_Salignsize(s) < alignsz ||
            e.EV.Voffset & (alignsz - 1) ||
            alignsz > STACKALIGN        // stack slots can't exceed STACKALIGN
           )
            return false;       // definitely not aligned
    }
    return true;        // assume aligned
}

/**************************************
 * VEX prefixes can be 2 or 3 bytes.
 * If it must be 3 bytes, set the CFvex3 flag.
 * Params:
 *      c = instruction to examine; sets CFvex3 in c.Iflags if needed
 */

void checkSetVex3(code *c)
{
    // See Intel Vol.
    // 2A 2.3.5.6
    // The 2-byte (C5) VEX form implies W=0, X=1, B=1, mmmm=0F; and in
    // 32-bit mode R and vvvv's top bit are further constrained. Any other
    // combination requires the 3-byte (C4) form.
    if (c.Ivex.w || !c.Ivex.x || !c.Ivex.b || c.Ivex.mmmm > 0x1 ||
        !I64 && (c.Ivex.r || !(c.Ivex.vvvv & 8))
       )
    {
        c.Iflags |= CFvex3;
    }
}

/*************************************
 * Determine if operation should be rewritten as a VEX
 * operation; and do so.
 * Params:
 *      c = code
 *      ty = type of operand
 */

void checkSetVex(code *c, tym_t ty)
{
    //printf("checkSetVex() %d %x\n", tysize(ty), c.Iop);
    if (config.avx || tysize(ty) == 32)
    {
        // vvvv candidate: the reg field of the ModR/M byte (plus REX.R)
        uint vreg = (c.Irm >> 3) & 7;
        if (c.Irex & REX_R)
            vreg |= 8;

        // TODO: This is too simplistic, depending on the instruction, vex.vvvv
        // encodes NDS, NDD, DDS, or no operand (NOO). The code below assumes
        // NDS (non-destructive source), except for the incomplete list of 2
        // operand instructions (NOO) handled by the switch.
        switch (c.Iop)
        {
            case LODSS:
            case LODSD:
            case STOSS:
            case STOSD:
                // Register-register moves keep the NDS form; memory
                // forms are treated as 2-operand (no vvvv operand).
                if ((c.Irm & 0xC0) == 0xC0)
                    break;
                goto case LODAPS;

            case LODAPS:
            case LODUPS:
            case LODAPD:
            case LODUPD:
            case LODDQA:
            case LODDQU:
            case LODD:
            case LODQ:
            case STOAPS:
            case STOUPS:
            case STOAPD:
            case STOUPD:
            case STODQA:
            case STODQU:
            case STOD:
            case STOQ:
            case COMISS:
            case COMISD:
            case UCOMISS:
            case UCOMISD:
            case MOVDDUP:
            case MOVSHDUP:
            case MOVSLDUP:
            case VBROADCASTSS:
            case PSHUFD:
            case PSHUFHW:
            case PSHUFLW:
            case VPBROADCASTB:
            case VPBROADCASTW:
            case VPBROADCASTD:
            case VPBROADCASTQ:
                vreg = 0;       // for 2 operand vex instructions
                break;

            case VBROADCASTSD:
            case VBROADCASTF128:
            case VBROADCASTI128:
                assert(tysize(ty) == 32);       // AVX-256 only instructions
                vreg = 0;       // for 2 operand vex instructions
                break;

            case NOP:
                return;         // ignore

            default:
                break;
        }

        // Repack the legacy opcode: low byte is the opcode proper; the
        // escape/prefix bytes map to VEX.mmmm and VEX.pp fields.
        opcode_t op = 0xC4000000 | (c.Iop & 0xFF);
        switch (c.Iop & 0xFFFFFF00)
        {
            static uint MM_PP(uint mm, uint pp) { return (mm << 16) | (pp << 8); }
            case 0x00000F00: op |= MM_PP(1,0); break;   // 0F escape, no prefix
            case 0x00660F00: op |= MM_PP(1,1); break;   // 66 0F
            case 0x00F30F00: op |= MM_PP(1,2); break;   // F3 0F
            case 0x00F20F00: op |= MM_PP(1,3); break;   // F2 0F
            case 0x660F3800: op |= MM_PP(2,1); break;   // 66 0F 38
            case 0x660F3A00: op |= MM_PP(3,1); break;   // 66 0F 3A
            default:
                printf("Iop = %x\n", c.Iop);
                assert(0);      // opcode has no VEX mapping here
        }
        c.Iop = op;
        c.Ivex.pfx = 0xC4;
        // VEX stores REX bits inverted
        c.Ivex.r = !(c.Irex & REX_R);
        c.Ivex.x = !(c.Irex & REX_X);
        c.Ivex.b = !(c.Irex & REX_B);
        c.Ivex.w = (c.Irex & REX_W) != 0;
        c.Ivex.l = tysize(ty) == 32;    // VEX.L selects 256-bit operation

        c.Ivex.vvvv = cast(ushort)~vreg;        // vvvv is also stored inverted

        c.Iflags |= CFvex;
        checkSetVex3(c);        // decide between 2- and 3-byte encoding
    }
}

/**************************************
 * Load complex operand into XMM registers or flags or both.
 * Params:
 *      cdb = code sink
 *      e = complex-typed operand
 *      pretregs = mask of desired result registers (XMM regs and/or mPSW)
 */

void cloadxmm(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
{
    //printf("e = %p, *pretregs = %s)\n", e, regm_str(*pretregs));
    //elem_print(e);
    assert(*pretregs & (XMMREGS | mPSW));
    // Fast path: caller wants exactly the XMM0/XMM1 pair (re/im halves)
    if (*pretregs == (mXMM0 | mXMM1) &&
        e.Eoper != OPconst)
    {
        code cs = void;
        tym_t tym = tybasic(e.Ety);
        tym_t ty = tym == TYcdouble ? TYdouble : TYfloat;   // per-part type
        opcode_t opmv = xmmload(tym, xmmIsAligned(e));

        // Real part into XMM0
        regm_t retregs0 = mXMM0;
        reg_t reg0;
        allocreg(cdb, &retregs0, &reg0, ty);
        loadea(cdb, e, &cs, opmv, reg0, 0, RMload, 0);        // MOVSS/MOVSD XMM0,data
        checkSetVex(cdb.last(), ty);

        // Imaginary part into XMM1, offset by one part size;
        // keep XMM0 live while loading it
        regm_t retregs1 = mXMM1;
        reg_t reg1;
        allocreg(cdb, &retregs1, &reg1, ty);
        loadea(cdb, e, &cs, opmv, reg1, tysize(ty), RMload, mXMM0); // MOVSS/MOVSD XMM1,data+offset
        checkSetVex(cdb.last(), ty);

        return;
    }

    // See test/complex.d for cases winding up here
    cload87(cdb, e, pretregs);
}

}