/**
 * Compiler implementation of the
 * $(LINK2 http://www.dlang.org, D programming language).
 *
 * Copyright:   Copyright (C) 2011-2020 by The D Language Foundation, All Rights Reserved
 * Authors:     $(LINK2 http://www.digitalmars.com, Walter Bright)
 * License:     $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 * Source:      $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/backend/cgxmm.d, backend/cgxmm.d)
 */

module dmd.backend.cgxmm;

// Compile this module for either the SCPP (C/C++) or MARS (D) front end.
version (SCPP)
    version = COMPILE;
version (MARS)
    version = COMPILE;

version (COMPILE)
{

import core.stdc.stdio;
import core.stdc.stdlib;
import core.stdc.string;

import dmd.backend.cc;
import dmd.backend.cdef;
import dmd.backend.code;
import dmd.backend.code_x86;
import dmd.backend.codebuilder;
import dmd.backend.mem;
import dmd.backend.el;
import dmd.backend.global;
import dmd.backend.oper;
import dmd.backend.ty;
import dmd.backend.xmm;

version (SCPP)
    import dmd.backend.exh;
version (MARS)
    import dmd.backend.errors;


extern (C++):

nothrow:

int REGSIZE();

uint mask(uint m);

/*******************************************
 * Is operator a store operator?
 * Params:
 *      op = opcode to examine
 * Returns:
 *      true if op is one of the XMM store (MOVxxx to memory) opcodes
 */

bool isXMMstore(opcode_t op)
{
    switch (op)
    {
    case STOSS: case STOAPS: case STOUPS:
    case STOSD: case STOAPD: case STOUPD:
    case STOD: case STOQ: case STODQA: case STODQU:
    case STOHPD: case STOHPS: case STOLPD: case STOLPS: return true;
    default: return false;
    }
}

/*******************************************
 * Move constant value into xmm register xreg.
 * Params:
 *      cdb = code sink
 *      xreg = destination XMM register
 *      sz = size of the constant, 4 or 8 bytes
 *      value = the constant bit pattern to load
 *      flags = NOTE(review): not referenced in the visible body — confirm before use
 */

private void movxmmconst(ref CodeBuilder cdb, reg_t xreg, uint sz, targ_size_t value, regm_t flags)
{
    /* Generate:
     *    MOV reg,value
     *    MOV xreg,reg
     * Not so efficient. We should at least do a PXOR for 0.
77 */ 78 assert(mask(xreg) & XMMREGS); 79 assert(sz == 4 || sz == 8); 80 if (I32 && sz == 8) 81 { 82 reg_t r; 83 regm_t rm = ALLREGS; 84 allocreg(cdb,&rm,&r,TYint); // allocate scratch register 85 static union U { targ_size_t s; targ_long[2] l; } 86 U u = void; 87 u.l[1] = 0; 88 u.s = value; 89 targ_long *p = &u.l[0]; 90 movregconst(cdb,r,p[0],0); 91 cdb.genfltreg(STO,r,0); // MOV floatreg,r 92 movregconst(cdb,r,p[1],0); 93 cdb.genfltreg(STO,r,4); // MOV floatreg+4,r 94 95 const op = xmmload(TYdouble, true); 96 cdb.genxmmreg(op,xreg,0,TYdouble); // MOVSD XMMreg,floatreg 97 } 98 else 99 { 100 reg_t reg; 101 regwithvalue(cdb,ALLREGS,value,®,(sz == 8) ? 64 : 0); 102 cdb.gen2(LODD,modregxrmx(3,xreg-XMM0,reg)); // MOVD xreg,reg 103 if (sz == 8) 104 code_orrex(cdb.last(), REX_W); 105 checkSetVex(cdb.last(), TYulong); 106 } 107 } 108 109 /*********************************************** 110 * Do simple orthogonal operators for XMM registers. 111 */ 112 113 void orthxmm(ref CodeBuilder cdb, elem *e, regm_t *pretregs) 114 { 115 //printf("orthxmm(e = %p, *pretregs = %s)\n", e, regm_str(*pretregs)); 116 elem *e1 = e.EV.E1; 117 elem *e2 = e.EV.E2; 118 119 // float + ifloat is not actually addition 120 if ((e.Eoper == OPadd || e.Eoper == OPmin) && 121 ((tyreal(e1.Ety) && tyimaginary(e2.Ety)) || 122 (tyreal(e2.Ety) && tyimaginary(e1.Ety)))) 123 { 124 regm_t retregs = *pretregs & XMMREGS; 125 if (!retregs) 126 retregs = XMMREGS; 127 128 regm_t rretregs; 129 reg_t rreg; 130 if (tyreal(e1.Ety)) 131 { 132 const reg = findreg(retregs); 133 rreg = findreg(retregs & ~mask(reg)); 134 retregs = mask(reg); 135 rretregs = mask(rreg); 136 } 137 else 138 { 139 // Pick the second register, not the first 140 rreg = findreg(retregs); 141 rretregs = mask(rreg); 142 const reg = findreg(retregs & ~rretregs); 143 retregs = mask(reg); 144 } 145 assert(retregs && rretregs); 146 147 codelem(cdb,e1,&retregs,false); // eval left leaf 148 scodelem(cdb, e2, &rretregs, retregs, true); // eval right leaf 149 
150 retregs |= rretregs; 151 if (e.Eoper == OPmin) 152 { 153 regm_t nretregs = XMMREGS & ~retregs; 154 reg_t sreg; // hold sign bit 155 const uint sz = tysize(e1.Ety); 156 allocreg(cdb,&nretregs,&sreg,e2.Ety); 157 targ_size_t signbit = 0x80000000; 158 if (sz == 8) 159 signbit = cast(targ_size_t)0x8000000000000000L; 160 movxmmconst(cdb,sreg, sz, signbit, 0); 161 getregs(cdb,nretregs); 162 const opcode_t xop = (sz == 8) ? XORPD : XORPS; // XORPD/S rreg,sreg 163 cdb.gen2(xop,modregxrmx(3,rreg-XMM0,sreg-XMM0)); 164 } 165 if (retregs != *pretregs) 166 fixresult(cdb,e,retregs,pretregs); 167 return; 168 } 169 170 regm_t retregs = *pretregs & XMMREGS; 171 if (!retregs) 172 retregs = XMMREGS; 173 const constflag = OTrel(e.Eoper); 174 codelem(cdb,e1,&retregs,constflag); // eval left leaf 175 const reg = findreg(retregs); 176 regm_t rretregs = XMMREGS & ~retregs; 177 scodelem(cdb, e2, &rretregs, retregs, true); // eval right leaf 178 179 const rreg = findreg(rretregs); 180 const op = xmmoperator(e1.Ety, e.Eoper); 181 182 /* We should take advantage of mem addressing modes for OP XMM,MEM 183 * but we do not at the moment. 184 */ 185 if (OTrel(e.Eoper)) 186 { 187 cdb.gen2(op,modregxrmx(3,rreg-XMM0,reg-XMM0)); 188 checkSetVex(cdb.last(), e1.Ety); 189 return; 190 } 191 192 getregs(cdb,retregs); 193 cdb.gen2(op,modregxrmx(3,reg-XMM0,rreg-XMM0)); 194 checkSetVex(cdb.last(), e1.Ety); 195 if (retregs != *pretregs) 196 fixresult(cdb,e,retregs,pretregs); 197 } 198 199 200 /************************ 201 * Generate code for an assignment using XMM registers. 
 * Params:
 *      op = store opcode to use, CMP means generate one
 */

void xmmeq(ref CodeBuilder cdb, elem *e, opcode_t op, elem *e1, elem *e2,regm_t *pretregs)
{
    tym_t tymll;
    int i;
    code cs;
    elem *e11;
    bool regvar;                  /* true means evaluate into register variable */
    regm_t varregm;
    targ_int postinc;

    //printf("xmmeq(e1 = %p, e2 = %p, *pretregs = %s)\n", e1, e2, regm_str(*pretregs));
    tym_t tyml = tybasic(e1.Ety);              /* type of lvalue               */
    regm_t retregs = *pretregs;

    if (!(retregs & XMMREGS))
        retregs = XMMREGS;              // pick any XMM reg

    bool aligned = xmmIsAligned(e1);
    // If default, select store opcode
    cs.Iop = (op == CMP) ? xmmstore(tyml, aligned) : op;
    regvar = false;
    varregm = 0;
    if (config.flags4 & CFG4optimized)
    {
        // Be careful of cases like (x = x+x+x). We cannot evaluate in
        // x if x is in a register.
        reg_t varreg;
        if (isregvar(e1,&varregm,&varreg) &&    // if lvalue is register variable
            doinreg(e1.EV.Vsym,e2) &&           // and we can compute directly into it
            varregm & XMMREGS
           )
        {   regvar = true;
            retregs = varregm;          // evaluate directly in target register
        }
    }
    if (*pretregs & mPSW && OTleaf(e1.Eoper))     // if evaluating e1 couldn't change flags
    {   // Be careful that this lines up with jmpopcode()
        retregs |= mPSW;
        *pretregs &= ~mPSW;
    }
    scodelem(cdb,e2,&retregs,0,true);    // get rvalue

    // Look for special case of (*p++ = ...), where p is a register variable
    if (e1.Eoper == OPind &&
        ((e11 = e1.EV.E1).Eoper == OPpostinc || e11.Eoper == OPpostdec) &&
        e11.EV.E1.Eoper == OPvar &&
        e11.EV.E1.EV.Vsym.Sfl == FLreg
       )
    {
        postinc = e11.EV.E2.EV.Vint;
        if (e11.Eoper == OPpostdec)
            postinc = -postinc;         // negative increment for postdec
        getlvalue(cdb,&cs,e11,RMstore | retregs);
        freenode(e11.EV.E2);
    }
    else
    {   postinc = 0;
        getlvalue(cdb,&cs,e1,RMstore | retregs); // get lvalue (cl == CNIL if regvar)
    }

    getregs_imm(cdb,regvar ? varregm : 0);

    // Fold the source XMM register into the reg field of the ModRM byte.
    const reg = findreg(retregs & XMMREGS);
    cs.Irm |= modregrm(0,(reg - XMM0) & 7,0);
    if ((reg - XMM0) & 8)
        cs.Irex |= REX_R;

    // Do not generate mov from register onto itself
    if (!(regvar && reg == XMM0 + ((cs.Irm & 7) | (cs.Irex & REX_B ? 8 : 0))))
    {
        cdb.gen(&cs);         // MOV EA+offset,reg
        checkSetVex(cdb.last(), tyml);
    }

    if (e1.Ecount ||                     // if lvalue is a CSE or
        regvar)                          // rvalue can't be a CSE
    {
        getregs_imm(cdb,retregs);        // necessary if both lvalue and
                                         //  rvalue are CSEs (since a reg
                                         //  can hold only one e at a time)
        cssave(e1,retregs,!OTleaf(e1.Eoper));     // if lvalue is a CSE
    }

    fixresult(cdb,e,retregs,pretregs);
    if (postinc)
    {
        // Apply the deferred pointer increment from the (*p++ = ...) case.
        const increg = findreg(idxregm(&cs));  // the register to increment
        if (*pretregs & mPSW)
        {   // Use LEA to avoid touching the flags
            uint rm = cs.Irm & 7;
            if (cs.Irex & REX_B)
                rm |= 8;
            cdb.genc1(LEA,buildModregrm(2,increg,rm),FLconst,postinc);
            if (tysize(e11.EV.E1.Ety) == 8)
                code_orrex(cdb.last(), REX_W);
        }
        else if (I64)
        {
            cdb.genc2(0x81,modregrmx(3,0,increg),postinc);   // ADD increg,postinc
            if (tysize(e11.EV.E1.Ety) == 8)
                code_orrex(cdb.last(), REX_W);
        }
        else
        {
            if (postinc == 1)
                cdb.gen1(0x40 + increg);        // INC increg
            else if (postinc == -cast(targ_int)1)
                cdb.gen1(0x48 + increg);        // DEC increg
            else
            {
                cdb.genc2(0x81,modregrm(3,0,increg),postinc);   // ADD increg,postinc
            }
        }
    }
    freenode(e1);
}

/********************************
 * Generate code for conversion using SSE2 instructions.
325 * 326 * OPs32_d 327 * OPs64_d (64-bit only) 328 * OPu32_d (64-bit only) 329 * OPd_f 330 * OPf_d 331 * OPd_s32 332 * OPd_s64 (64-bit only) 333 * 334 */ 335 336 void xmmcnvt(ref CodeBuilder cdb,elem *e,regm_t *pretregs) 337 { 338 //printf("xmmconvt: %p, %s\n", e, regm_str(*pretregs)); 339 opcode_t op = NoOpcode; 340 regm_t regs; 341 tym_t ty; 342 ubyte rex = 0; 343 bool zx = false; // zero extend uint 344 345 /* There are no ops for integer <. float/real conversions 346 * but there are instructions for them. In order to use these 347 * try to fuse chained conversions. Be careful not to loose 348 * precision for real to long. 349 */ 350 elem *e1 = e.EV.E1; 351 switch (e.Eoper) 352 { 353 case OPd_f: 354 if (e1.Eoper == OPs32_d) 355 { } 356 else if (I64 && e1.Eoper == OPs64_d) 357 rex = REX_W; 358 else if (I64 && e1.Eoper == OPu32_d) 359 { rex = REX_W; 360 zx = true; 361 } 362 else 363 { regs = XMMREGS; 364 op = CVTSD2SS; 365 ty = TYfloat; 366 break; 367 } 368 if (e1.Ecount) 369 { 370 regs = XMMREGS; 371 op = CVTSD2SS; 372 ty = TYfloat; 373 break; 374 } 375 // directly use si2ss 376 regs = ALLREGS; 377 e1 = e1.EV.E1; // fused operation 378 op = CVTSI2SS; 379 ty = TYfloat; 380 break; 381 382 case OPs32_d: goto Litod; 383 case OPs64_d: rex = REX_W; goto Litod; 384 case OPu32_d: rex = REX_W; zx = true; goto Litod; 385 Litod: 386 regs = ALLREGS; 387 op = CVTSI2SD; 388 ty = TYdouble; 389 break; 390 391 case OPd_s32: ty = TYint; goto Ldtoi; 392 case OPd_u32: ty = TYlong; if (I64) rex = REX_W; goto Ldtoi; 393 case OPd_s64: ty = TYlong; rex = REX_W; goto Ldtoi; 394 Ldtoi: 395 regs = XMMREGS; 396 switch (e1.Eoper) 397 { 398 case OPf_d: 399 if (e1.Ecount) 400 { 401 op = CVTTSD2SI; 402 break; 403 } 404 e1 = e1.EV.E1; // fused operation 405 op = CVTTSS2SI; 406 break; 407 case OPld_d: 408 if (e.Eoper == OPd_s64) 409 { 410 cnvt87(cdb,e,pretregs); // precision 411 return; 412 } 413 goto default; 414 415 default: 416 op = CVTTSD2SI; 417 break; 418 } 419 break; 420 421 case OPf_d: 
422 regs = XMMREGS; 423 op = CVTSS2SD; 424 ty = TYdouble; 425 break; 426 427 default: 428 assert(0); 429 } 430 assert(op != NoOpcode); 431 432 codelem(cdb,e1, ®s, false); 433 reg_t reg = findreg(regs); 434 if (isXMMreg(reg)) 435 reg -= XMM0; 436 else if (zx) 437 { assert(I64); 438 getregs(cdb,regs); 439 genregs(cdb,0x8B,reg,reg); // MOV reg,reg to zero upper 32-bit 440 // Don't use x89 because that will get optimized away 441 code_orflag(cdb.last(),CFvolatile); 442 } 443 444 regm_t retregs = *pretregs; 445 if (tyxmmreg(ty)) // target is XMM 446 { if (!(*pretregs & XMMREGS)) 447 retregs = XMMREGS; 448 } 449 else // source is XMM 450 { assert(regs & XMMREGS); 451 if (!(retregs & ALLREGS)) 452 retregs = ALLREGS; 453 } 454 455 reg_t rreg; 456 allocreg(cdb,&retregs,&rreg,ty); 457 if (isXMMreg(rreg)) 458 rreg -= XMM0; 459 460 cdb.gen2(op, modregxrmx(3,rreg,reg)); 461 assert(I64 || !rex); 462 if (rex) 463 code_orrex(cdb.last(), rex); 464 465 if (*pretregs != retregs) 466 fixresult(cdb,e,retregs,pretregs); 467 } 468 469 /******************************** 470 * Generate code for op= 471 */ 472 473 void xmmopass(ref CodeBuilder cdb,elem *e,regm_t *pretregs) 474 { elem *e1 = e.EV.E1; 475 elem *e2 = e.EV.E2; 476 tym_t ty1 = tybasic(e1.Ety); 477 const sz1 = _tysize[ty1]; 478 regm_t rretregs = XMMREGS & ~*pretregs; 479 if (!rretregs) 480 rretregs = XMMREGS; 481 482 codelem(cdb,e2,&rretregs,false); // eval right leaf 483 reg_t rreg = findreg(rretregs); 484 485 code cs; 486 regm_t retregs; 487 reg_t reg; 488 bool regvar = false; 489 if (config.flags4 & CFG4optimized) 490 { 491 // Be careful of cases like (x = x+x+x). We cannot evaluate in 492 // x if x is in a register. 
493 reg_t varreg; 494 regm_t varregm; 495 if (isregvar(e1,&varregm,&varreg) && // if lvalue is register variable 496 doinreg(e1.EV.Vsym,e2) // and we can compute directly into it 497 ) 498 { regvar = true; 499 retregs = varregm; 500 reg = varreg; // evaluate directly in target register 501 getregs(cdb,retregs); // destroy these regs 502 } 503 } 504 505 if (!regvar) 506 { 507 getlvalue(cdb,&cs,e1,rretregs); // get EA 508 retregs = *pretregs & XMMREGS & ~rretregs; 509 if (!retregs) 510 retregs = XMMREGS & ~rretregs; 511 allocreg(cdb,&retregs,®,ty1); 512 cs.Iop = xmmload(ty1, true); // MOVSD xmm,xmm_m64 513 code_newreg(&cs,reg - XMM0); 514 cdb.gen(&cs); 515 checkSetVex(cdb.last(), ty1); 516 } 517 518 const op = xmmoperator(e1.Ety, e.Eoper); 519 cdb.gen2(op,modregxrmx(3,reg-XMM0,rreg-XMM0)); 520 checkSetVex(cdb.last(), e1.Ety); 521 522 if (!regvar) 523 { 524 cs.Iop = xmmstore(ty1,true); // reverse operand order of MOVS[SD] 525 cdb.gen(&cs); 526 checkSetVex(cdb.last(), ty1); 527 } 528 529 if (e1.Ecount || // if lvalue is a CSE or 530 regvar) // rvalue can't be a CSE 531 { 532 getregs_imm(cdb,retregs); // necessary if both lvalue and 533 // rvalue are CSEs (since a reg 534 // can hold only one e at a time) 535 cssave(e1,retregs,!OTleaf(e1.Eoper)); // if lvalue is a CSE 536 } 537 538 fixresult(cdb,e,retregs,pretregs); 539 freenode(e1); 540 } 541 542 /******************************** 543 * Generate code for post increment and post decrement. 544 */ 545 546 void xmmpost(ref CodeBuilder cdb,elem *e,regm_t *pretregs) 547 { 548 elem *e1 = e.EV.E1; 549 elem *e2 = e.EV.E2; 550 tym_t ty1 = tybasic(e1.Ety); 551 552 regm_t retregs; 553 reg_t reg; 554 bool regvar = false; 555 if (config.flags4 & CFG4optimized) 556 { 557 // Be careful of cases like (x = x+x+x). We cannot evaluate in 558 // x if x is in a register. 
559 reg_t varreg; 560 regm_t varregm; 561 if (isregvar(e1,&varregm,&varreg) && // if lvalue is register variable 562 doinreg(e1.EV.Vsym,e2) // and we can compute directly into it 563 ) 564 { 565 regvar = true; 566 retregs = varregm; 567 reg = varreg; // evaluate directly in target register 568 getregs(cdb,retregs); // destroy these regs 569 } 570 } 571 572 code cs; 573 if (!regvar) 574 { 575 getlvalue(cdb,&cs,e1,0); // get EA 576 retregs = XMMREGS & ~*pretregs; 577 if (!retregs) 578 retregs = XMMREGS; 579 allocreg(cdb,&retregs,®,ty1); 580 cs.Iop = xmmload(ty1, true); // MOVSD xmm,xmm_m64 581 code_newreg(&cs,reg - XMM0); 582 cdb.gen(&cs); 583 checkSetVex(cdb.last(), ty1); 584 } 585 586 // Result register 587 regm_t resultregs = XMMREGS & *pretregs & ~retregs; 588 if (!resultregs) 589 resultregs = XMMREGS & ~retregs; 590 reg_t resultreg; 591 allocreg(cdb,&resultregs, &resultreg, ty1); 592 593 cdb.gen2(xmmload(ty1,true),modregxrmx(3,resultreg-XMM0,reg-XMM0)); // MOVSS/D resultreg,reg 594 checkSetVex(cdb.last(), ty1); 595 596 regm_t rretregs = XMMREGS & ~(*pretregs | retregs | resultregs); 597 if (!rretregs) 598 rretregs = XMMREGS & ~(retregs | resultregs); 599 codelem(cdb,e2,&rretregs,false); // eval right leaf 600 const rreg = findreg(rretregs); 601 602 const op = xmmoperator(e1.Ety, e.Eoper); 603 cdb.gen2(op,modregxrmx(3,reg-XMM0,rreg-XMM0)); // ADD reg,rreg 604 checkSetVex(cdb.last(), e1.Ety); 605 606 if (!regvar) 607 { 608 cs.Iop = xmmstore(ty1,true); // reverse operand order of MOVS[SD] 609 cdb.gen(&cs); 610 checkSetVex(cdb.last(), ty1); 611 } 612 613 if (e1.Ecount || // if lvalue is a CSE or 614 regvar) // rvalue can't be a CSE 615 { 616 getregs_imm(cdb,retregs); // necessary if both lvalue and 617 // rvalue are CSEs (since a reg 618 // can hold only one e at a time) 619 cssave(e1,retregs,!OTleaf(e1.Eoper)); // if lvalue is a CSE 620 } 621 622 fixresult(cdb,e,resultregs,pretregs); 623 freenode(e1); 624 } 625 626 /****************** 627 * Negate operator 628 */ 

void xmmneg(ref CodeBuilder cdb,elem *e,regm_t *pretregs)
{
    //printf("xmmneg()\n");
    //elem_print(e);
    assert(*pretregs);
    tym_t tyml = tybasic(e.EV.E1.Ety);
    int sz = _tysize[tyml];

    regm_t retregs = *pretregs & XMMREGS;
    if (!retregs)
        retregs = XMMREGS;

    /* Generate:
     *    MOV reg,e1
     *    MOV rreg,signbit
     *    XOR reg,rreg
     */
    codelem(cdb,e.EV.E1,&retregs,false);
    getregs(cdb,retregs);
    const reg = findreg(retregs);
    regm_t rretregs = XMMREGS & ~retregs;
    reg_t rreg;
    allocreg(cdb,&rretregs,&rreg,tyml);
    // IEEE sign-bit mask for float (4 bytes) or double (8 bytes)
    targ_size_t signbit = 0x80000000;
    if (sz == 8)
        signbit = cast(targ_size_t)0x8000000000000000L;
    movxmmconst(cdb,rreg, sz, signbit, 0);

    getregs(cdb,retregs);
    const op = (sz == 8) ? XORPD : XORPS;       // XORPD/S reg,rreg
    cdb.gen2(op,modregxrmx(3,reg-XMM0,rreg-XMM0));
    fixresult(cdb,e,retregs,pretregs);
}

/******************
 * Absolute value operator OPabs
 * Same pattern as xmmneg, but ANDs away the sign bit instead of flipping it.
 */

void xmmabs(ref CodeBuilder cdb,elem *e,regm_t *pretregs)
{
    //printf("xmmabs()\n");
    //elem_print(e);
    assert(*pretregs);
    tym_t tyml = tybasic(e.EV.E1.Ety);
    int sz = _tysize[tyml];

    regm_t retregs = *pretregs & XMMREGS;
    if (!retregs)
        retregs = XMMREGS;

    /* Generate:
     *    MOV reg,e1
     *    MOV rreg,mask
     *    AND reg,rreg
     */
    codelem(cdb,e.EV.E1,&retregs,false);
    getregs(cdb,retregs);
    const reg = findreg(retregs);
    regm_t rretregs = XMMREGS & ~retregs;
    reg_t rreg;
    allocreg(cdb,&rretregs,&rreg,tyml);
    // All bits except the IEEE sign bit
    targ_size_t mask = 0x7FFF_FFFF;
    if (sz == 8)
        mask = cast(targ_size_t)0x7FFF_FFFF_FFFF_FFFFL;
    movxmmconst(cdb,rreg, sz, mask, 0);

    getregs(cdb,retregs);
    const op = (sz == 8) ? ANDPD : ANDPS;       // ANDPD/S reg,rreg
    cdb.gen2(op,modregxrmx(3,reg-XMM0,rreg-XMM0));
    fixresult(cdb,e,retregs,pretregs);
}

/*****************************
 * Get correct load operator based on type.
 * It is important to use the right one even if the number of bits moved is the same,
 * as there are performance consequences for using the wrong one.
 * Params:
 *      tym = type of data to load
 *      aligned = for vectors, true if aligned to 16 bytes
 * Returns:
 *      the load opcode (asserts on types with no XMM load form)
 */

opcode_t xmmload(tym_t tym, bool aligned)
{
    opcode_t op;
    if (tysize(tym) == 32)
        aligned = false;        // 32-byte (YMM) loads always use the unaligned form
    switch (tybasic(tym))
    {
        case TYuint:
        case TYint:
        case TYlong:
        case TYulong:   op = LODD;  break;       // MOVD
        case TYfloat:
        case TYcfloat:
        case TYifloat:  op = LODSS; break;       // MOVSS
        case TYllong:
        case TYullong:  op = LODQ;  break;       // MOVQ
        case TYdouble:
        case TYcdouble:
        case TYidouble: op = LODSD; break;       // MOVSD

        case TYfloat8:
        case TYfloat4:  op = aligned ? LODAPS : LODUPS; break;      // MOVAPS / MOVUPS
        case TYdouble4:
        case TYdouble2: op = aligned ? LODAPD : LODUPD; break;      // MOVAPD / MOVUPD
        case TYschar16:
        case TYuchar16:
        case TYshort8:
        case TYushort8:
        case TYlong4:
        case TYulong4:
        case TYllong2:
        case TYullong2:
        case TYschar32:
        case TYuchar32:
        case TYshort16:
        case TYushort16:
        case TYlong8:
        case TYulong8:
        case TYllong4:
        case TYullong4: op = aligned ? LODDQA : LODDQU; break;      // MOVDQA / MOVDQU

        default:
            printf("tym = x%x\n", tym);
            assert(0);
    }
    return op;
}

/*****************************
 * Get correct store operator based on type.
 */

opcode_t xmmstore(tym_t tym, bool aligned)
{
    opcode_t op;
    switch (tybasic(tym))
    {
        case TYuint:
        case TYint:
        case TYlong:
        case TYulong:   op = STOD;  break;       // MOVD
        case TYfloat:
        case TYifloat:  op = STOSS; break;       // MOVSS
        case TYllong:
        case TYullong:  op = STOQ;  break;       // MOVQ
        case TYdouble:
        case TYidouble:
        case TYcdouble:
        case TYcfloat:  op = STOSD; break;       // MOVSD

        case TYfloat8:
        case TYfloat4:  op = aligned ?
                             STOAPS : STOUPS; break;      // MOVAPS / MOVUPS
        case TYdouble4:
        case TYdouble2: op = aligned ? STOAPD : STOUPD; break;      // MOVAPD / MOVUPD
        case TYschar16:
        case TYuchar16:
        case TYshort8:
        case TYushort8:
        case TYlong4:
        case TYulong4:
        case TYllong2:
        case TYullong2:
        case TYschar32:
        case TYuchar32:
        case TYshort16:
        case TYushort16:
        case TYlong8:
        case TYulong8:
        case TYllong4:
        case TYullong4: op = aligned ? STODQA : STODQU; break;      // MOVDQA / MOVDQU

        default:
            printf("tym = 0x%x\n", tym);
            assert(0);
    }
    return op;
}


/************************************
 * Get correct XMM operator based on type and operator.
 * Params:
 *      tym = operand type (scalar float/double or SIMD vector type)
 *      oper = expression operator (OPadd, OPmin, OPmul, ..., comparison ops)
 * Returns:
 *      the matching SSE opcode (asserts on unsupported combinations)
 */

private opcode_t xmmoperator(tym_t tym, OPER oper)
{
    tym = tybasic(tym);
    opcode_t op;
    switch (oper)
    {
        case OPadd:
        case OPaddass:
        case OPpostinc:
            switch (tym)
            {
                case TYfloat:
                case TYifloat:  op = ADDSS;  break;
                case TYdouble:
                case TYidouble: op = ADDSD;  break;

                // SIMD vector types
                case TYfloat8:
                case TYfloat4:  op = ADDPS;  break;
                case TYdouble4:
                case TYdouble2: op = ADDPD;  break;
                case TYschar32:
                case TYuchar32:
                case TYschar16:
                case TYuchar16: op = PADDB;  break;
                case TYshort16:
                case TYushort16:
                case TYshort8:
                case TYushort8: op = PADDW;  break;
                case TYlong8:
                case TYulong8:
                case TYlong4:
                case TYulong4:  op = PADDD;  break;
                case TYllong4:
                case TYullong4:
                case TYllong2:
                case TYullong2: op = PADDQ;  break;

                default:
                    printf("tym = x%x\n", tym);
                    assert(0);
            }
            break;

        case OPmin:
        case OPminass:
        case OPpostdec:
            switch (tym)
            {
                case TYfloat:
                case TYifloat:  op = SUBSS;  break;
                case TYdouble:
                case TYidouble: op = SUBSD;  break;

                // SIMD vector types
                case TYfloat8:
                case TYfloat4:  op = SUBPS;  break;
                case TYdouble4:
                case TYdouble2: op = SUBPD;  break;
                case TYschar32:
                case TYuchar32:
                case TYschar16:
                case TYuchar16: op = PSUBB;  break;
                case TYshort16:
                case TYushort16:
                case TYshort8:
                case TYushort8: op = PSUBW;  break;
                case TYlong8:
                case TYulong8:
                case TYlong4:
                case TYulong4:  op = PSUBD;  break;
                case TYllong4:
                case TYullong4:
                case TYllong2:
                case TYullong2: op = PSUBQ;  break;

                default: assert(0);
            }
            break;

        case OPmul:
        case OPmulass:
            switch (tym)
            {
                case TYfloat:
                case TYifloat:  op = MULSS;  break;
                case TYdouble:
                case TYidouble: op = MULSD;  break;

                // SIMD vector types
                // Note: no byte or 64-bit integer multiply cases here;
                // those fall through to the assert.
                case TYfloat8:
                case TYfloat4:  op = MULPS;  break;
                case TYdouble4:
                case TYdouble2: op = MULPD;  break;
                case TYshort16:
                case TYushort16:
                case TYshort8:
                case TYushort8: op = PMULLW; break;
                case TYlong8:
                case TYulong8:
                case TYlong4:
                case TYulong4:  op = PMULLD; break;

                default: assert(0);
            }
            break;

        case OPdiv:
        case OPdivass:
            switch (tym)
            {
                case TYfloat:
                case TYifloat:  op = DIVSS;  break;
                case TYdouble:
                case TYidouble: op = DIVSD;  break;

                // SIMD vector types (no integer SIMD divide instructions)
                case TYfloat8:
                case TYfloat4:  op = DIVPS;  break;
                case TYdouble4:
                case TYdouble2: op = DIVPD;  break;

                default: assert(0);
            }
            break;

        case OPor:
        case OPorass:
            switch (tym)
            {
                // SIMD vector types
                case TYschar16:
                case TYuchar16:
                case TYshort8:
                case TYushort8:
                case TYlong4:
                case TYulong4:
                case TYllong2:
                case TYullong2:
                case TYschar32:
                case TYuchar32:
                case TYshort16:
                case TYushort16:
                case TYlong8:
                case TYulong8:
                case TYllong4:
                case TYullong4: op = POR; break;

                default: assert(0);
            }
            break;

        case OPand:
        case OPandass:
            switch (tym)
            {
                // SIMD vector types
                case TYschar16:
                case TYuchar16:
                case TYshort8:
                case TYushort8:
                case TYlong4:
                case TYulong4:
                case TYllong2:
                case TYullong2:
                case TYschar32:
                case TYuchar32:
                case TYshort16:
                case TYushort16:
                case TYlong8:
                case TYulong8:
                case TYllong4:
                case TYullong4: op = PAND; break;

                default: assert(0);
            }
            break;

        case OPxor:
        case OPxorass:
            switch (tym)
            {
                // SIMD vector types
                case TYschar16:
                case TYuchar16:
                case TYshort8:
                case TYushort8:
                case TYlong4:
                case TYulong4:
                case TYllong2:
                case TYullong2:
                case TYschar32:
                case TYuchar32:
                case TYshort16:
                case TYushort16:
                case TYlong8:
                case TYulong8:
                case TYllong4:
                case TYullong4: op = PXOR; break;

                default: assert(0);
            }
            break;

        case OPlt:
        case OPle:
        case OPgt:
        case OPge:
        case OPne:
        case OPeqeq:
        case OPunord:        /* !<>=     */
        case OPlg:           /* <>       */
        case OPleg:          /* <>=      */
        case OPule:          /* !>       */
        case OPul:           /* !>=      */
        case OPuge:          /* !<       */
        case OPug:           /* !<=      */
        case OPue:           /* !<>      */
        case OPngt:
        case OPnge:
        case OPnlt:
        case OPnle:
        case OPord:
        case OPnlg:
        case OPnleg:
        case OPnule:
        case OPnul:
        case OPnuge:
        case OPnug:
        case OPnue:
            // All comparisons map to an unordered compare that sets EFLAGS.
            switch (tym)
            {
                case TYfloat:
                case TYifloat:  op = UCOMISS;  break;
                case TYdouble:
                case TYidouble: op = UCOMISD;  break;

                default: assert(0);
            }
            break;

        default:
            assert(0);
    }
    return op;
}

void cdvector(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
{
    /* e should look like one of:
     *    vector
     *      |
     *    param
     *    /   \
     *  param op2
     *  /   \
     * op   op1
     */

    if (!config.fpxmmregs)
    {   printf("SIMD operations not supported on this platform\n");
        exit(1);
    }

    // Flatten the OPparam tree into an array of operand elems.
    const n = el_nparams(e.EV.E1);
    elem **params = cast(elem **)malloc(n * (elem *).sizeof);
    assert(params);
1081 elem **tmp = params; 1082 el_paramArray(&tmp, e.EV.E1); 1083 1084 static if (0) 1085 { 1086 printf("cdvector()\n"); 1087 for (int i = 0; i < n; i++) 1088 { 1089 printf("[%d]: ", i); 1090 elem_print(params[i]); 1091 } 1092 } 1093 1094 if (*pretregs == 0) 1095 { /* Evaluate for side effects only 1096 */ 1097 foreach (i; 0 .. n) 1098 { 1099 codelem(cdb,params[i], pretregs, false); 1100 *pretregs = 0; // in case they got set 1101 } 1102 return; 1103 } 1104 1105 assert(n >= 2 && n <= 4); 1106 1107 elem *eop = params[0]; 1108 elem *op1 = params[1]; 1109 elem *op2 = null; 1110 tym_t ty2 = 0; 1111 if (n >= 3) 1112 { op2 = params[2]; 1113 ty2 = tybasic(op2.Ety); 1114 } 1115 1116 auto op = cast(opcode_t)el_tolong(eop); 1117 debug assert(!isXMMstore(op)); 1118 tym_t ty1 = tybasic(op1.Ety); 1119 1120 regm_t retregs; 1121 if (n == 3 && ty2 == TYuchar && op2.Eoper == OPconst) 1122 { // Handle: op xmm,imm8 1123 1124 retregs = *pretregs & XMMREGS; 1125 if (!retregs) 1126 retregs = XMMREGS; 1127 codelem(cdb,op1,&retregs,false); // eval left leaf 1128 const reg = findreg(retregs); 1129 int r; 1130 switch (op) 1131 { 1132 case PSLLD: r = 6; op = 0x660F72; break; 1133 case PSLLQ: r = 6; op = 0x660F73; break; 1134 case PSLLW: r = 6; op = 0x660F71; break; 1135 case PSRAD: r = 4; op = 0x660F72; break; 1136 case PSRAW: r = 4; op = 0x660F71; break; 1137 case PSRLD: r = 2; op = 0x660F72; break; 1138 case PSRLQ: r = 2; op = 0x660F73; break; 1139 case PSRLW: r = 2; op = 0x660F71; break; 1140 case PSRLDQ: r = 3; op = 0x660F73; break; 1141 case PSLLDQ: r = 7; op = 0x660F73; break; 1142 1143 default: 1144 printf("op = x%x\n", op); 1145 assert(0); 1146 } 1147 getregs(cdb,retregs); 1148 cdb.genc2(op,modregrmx(3,r,reg-XMM0), cast(uint)el_tolong(op2)); 1149 } 1150 else if (n == 2) 1151 { /* Handle: op xmm,mem 1152 * where xmm is written only, not read 1153 */ 1154 code cs; 1155 1156 if ((op1.Eoper == OPind && !op1.Ecount) || op1.Eoper == OPvar) 1157 { 1158 getlvalue(cdb,&cs, op1, RMload); // 
get addressing mode 1159 } 1160 else 1161 { 1162 regm_t rretregs = XMMREGS; 1163 codelem(cdb,op1, &rretregs, false); 1164 const rreg = findreg(rretregs) - XMM0; 1165 cs.Irm = modregrm(3,0,rreg & 7); 1166 cs.Iflags = 0; 1167 cs.Irex = 0; 1168 if (rreg & 8) 1169 cs.Irex |= REX_B; 1170 } 1171 1172 retregs = *pretregs & XMMREGS; 1173 if (!retregs) 1174 retregs = XMMREGS; 1175 reg_t reg; 1176 allocreg(cdb,&retregs, ®, e.Ety); 1177 code_newreg(&cs, reg - XMM0); 1178 cs.Iop = op; 1179 cdb.gen(&cs); 1180 } 1181 else if (n == 3 || n == 4) 1182 { /* Handle: 1183 * op xmm,mem // n = 3 1184 * op xmm,mem,imm8 // n = 4 1185 * Both xmm and mem are operands, evaluate xmm first. 1186 */ 1187 1188 code cs; 1189 1190 retregs = *pretregs & XMMREGS; 1191 if (!retregs) 1192 retregs = XMMREGS; 1193 codelem(cdb,op1,&retregs,false); // eval left leaf 1194 const reg = findreg(retregs); 1195 1196 if ((op2.Eoper == OPind && !op2.Ecount) || op2.Eoper == OPvar) 1197 { 1198 getlvalue(cdb,&cs, op2, RMload | retregs); // get addressing mode 1199 } 1200 else 1201 { 1202 regm_t rretregs = XMMREGS & ~retregs; 1203 scodelem(cdb, op2, &rretregs, retregs, true); 1204 const rreg = findreg(rretregs) - XMM0; 1205 cs.Irm = modregrm(3,0,rreg & 7); 1206 cs.Iflags = 0; 1207 cs.Irex = 0; 1208 if (rreg & 8) 1209 cs.Irex |= REX_B; 1210 } 1211 1212 getregs(cdb,retregs); 1213 if (n == 4) 1214 { 1215 switch (op) 1216 { 1217 case CMPPD: case CMPSS: case CMPSD: case CMPPS: 1218 case PSHUFD: case PSHUFHW: case PSHUFLW: 1219 case BLENDPD: case BLENDPS: case DPPD: case DPPS: 1220 case MPSADBW: case PBLENDW: 1221 case ROUNDPD: case ROUNDPS: case ROUNDSD: case ROUNDSS: 1222 case SHUFPD: case SHUFPS: 1223 break; 1224 default: 1225 printf("op = x%x\n", op); 1226 assert(0); 1227 } 1228 elem *imm8 = params[3]; 1229 cs.IFL2 = FLconst; 1230 version (MARS) 1231 { 1232 if (imm8.Eoper != OPconst) 1233 { 1234 error(imm8.Esrcpos.Sfilename, imm8.Esrcpos.Slinnum, imm8.Esrcpos.Scharnum, "last parameter to `__simd()` must be a 
constant"); 1235 cs.IEV2.Vsize_t = 0; 1236 } 1237 else 1238 cs.IEV2.Vsize_t = cast(targ_size_t)el_tolong(imm8); 1239 } 1240 else 1241 { 1242 cs.IEV2.Vsize_t = cast(targ_size_t)el_tolong(imm8); 1243 } 1244 } 1245 code_newreg(&cs, reg - XMM0); 1246 cs.Iop = op; 1247 cdb.gen(&cs); 1248 } 1249 else 1250 assert(0); 1251 fixresult(cdb,e,retregs,pretregs); 1252 free(params); 1253 freenode(e); 1254 } 1255 1256 /*************** 1257 * Generate code for vector "store" operations. 1258 * The tree e must look like: 1259 * (op1 OPvecsto (op OPparam op2)) 1260 * where op is the store instruction STOxxxx. 1261 */ 1262 void cdvecsto(ref CodeBuilder cdb, elem *e, regm_t *pretregs) 1263 { 1264 //printf("cdvecsto()\n"); 1265 //elem_print(e); 1266 elem *op1 = e.EV.E1; 1267 elem *op2 = e.EV.E2.EV.E2; 1268 elem *eop = e.EV.E2.EV.E1; 1269 const op = cast(opcode_t)el_tolong(eop); 1270 debug assert(isXMMstore(op)); 1271 xmmeq(cdb, e, op, op1, op2, pretregs); 1272 } 1273 1274 /*************** 1275 * Generate code for OPvecfill (broadcast). 1276 * OPvecfill takes the single value in e1 and 1277 * fills the vector type with it. 
 */
void cdvecfill(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
{
    //printf("cdvecfill(e = %p, *pretregs = %s)\n",e,regm_str(*pretregs));

    regm_t retregs = *pretregs & XMMREGS;
    if (!retregs)
        retregs = XMMREGS;

    code *c;            // NOTE(review): unused; appears to be left over from the disabled code below
    code cs;

    elem *e1 = e.EV.E1;
    static if (0)
    {
        // Disabled experimental path; `cr` and `op2` are not defined in this scope,
        // so this would not compile if enabled.
        if ((e1.Eoper == OPind && !e1.Ecount) || e1.Eoper == OPvar)
        {
            cr = getlvalue(&cs, e1, RMload | retregs);     // get addressing mode
        }
        else
        {
            regm_t rretregs = XMMREGS & ~retregs;
            cr = scodelem(op2, &rretregs, retregs, true);
            const rreg = findreg(rretregs) - XMM0;
            cs.Irm = modregrm(3,0,rreg & 7);
            cs.Iflags = 0;
            cs.Irex = 0;
            if (rreg & 8)
                cs.Irex |= REX_B;
        }
    }

    /* Dispatch on the vector element type. Each case emits either:
     *  - an AVX broadcast directly from memory (when e1 is a simple load), or
     *  - a load into an XMM register followed by a shuffle/unpack sequence
     *    (with VINSERTF128 to widen to 256 bits when needed).
     */
    const ty = tybasic(e.Ety);
    switch (ty)
    {
        case TYfloat4:
        case TYfloat8:
            if (config.avx && e1.Eoper == OPind && !e1.Ecount)
            {
                // VBROADCASTSS X/YMM,MEM
                getlvalue(cdb,&cs, e1, 0);         // get addressing mode
                assert((cs.Irm & 0xC0) != 0xC0);   // AVX1 doesn't have register source operands
                reg_t reg;
                allocreg(cdb,&retregs,&reg,ty);
                cs.Iop = VBROADCASTSS;
                cs.Irex &= ~REX_W;
                code_newreg(&cs,reg - XMM0);
                checkSetVex(&cs,ty);
                cdb.gen(&cs);
            }
            else
            {
                codelem(cdb,e1,&retregs,false); // eval left leaf
                const reg = cast(reg_t)(findreg(retregs) - XMM0);
                getregs(cdb,retregs);
                if (config.avx >= 2)
                {
                    // VBROADCASTSS X/YMM,XMM
                    cdb.gen2(VBROADCASTSS, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), ty);
                }
                else
                {
                    // (V)SHUFPS XMM,XMM,0  -- replicate element 0 into all lanes
                    cdb.genc2(SHUFPS, modregxrmx(3,reg,reg), 0);
                    checkSetVex(cdb.last(), ty);
                    if (tysize(ty) == 32)
                    {
                        // VINSERTF128 YMM,YMM,XMM,1  -- copy low 128 bits to high half
                        cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1);
                        checkSetVex(cdb.last(), ty);
                    }
                }
            }
            break;

        case TYdouble2:
        case TYdouble4:
            // VBROADCASTSD only has a YMM form on AVX1, hence the tysize check
            if (config.avx && tysize(ty) == 32 && e1.Eoper == OPind && !e1.Ecount)
            {
                // VBROADCASTSD YMM,MEM
                getlvalue(cdb,&cs, e1, 0);         // get addressing mode
                assert((cs.Irm & 0xC0) != 0xC0);   // AVX1 doesn't have register source operands
                reg_t reg;
                allocreg(cdb,&retregs,&reg,ty);
                cs.Iop = VBROADCASTSD;
                cs.Irex &= ~REX_W;
                code_newreg(&cs,reg - XMM0);
                checkSetVex(&cs,ty);
                cdb.gen(&cs);
            }
            else
            {
                codelem(cdb,e1,&retregs,false); // eval left leaf
                const reg = cast(reg_t)(findreg(retregs) - XMM0);
                getregs(cdb,retregs);
                if (config.avx >= 2 && tysize(ty) == 32)
                {
                    // VBROADCASTSD YMM,XMM
                    cdb.gen2(VBROADCASTSD, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), ty);
                }
                else
                {
                    // (V)UNPCKLPD XMM,XMM  -- duplicate low double into both lanes
                    cdb.gen2(UNPCKLPD, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), TYdouble2); // AVX-128
                    if (tysize(ty) == 32)
                    {
                        // VINSERTF128 YMM,YMM,XMM,1
                        cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1);
                        checkSetVex(cdb.last(), ty);
                    }
                }
            }
            break;

        case TYschar16:
        case TYuchar16:
        case TYschar32:
        case TYuchar32:
            if (config.avx >= 2 && e1.Eoper == OPind && !e1.Ecount)
            {
                // VPBROADCASTB X/YMM,MEM
                getlvalue(cdb,&cs, e1, 0);         // get addressing mode
                assert((cs.Irm & 0xC0) != 0xC0);   // AVX1 doesn't have register source operands
                reg_t reg;
                allocreg(cdb,&retregs,&reg,ty);
                cs.Iop = VPBROADCASTB;
                cs.Irex &= ~REX_W;
                code_newreg(&cs,reg - XMM0);
                checkSetVex(&cs,ty);
                cdb.gen(&cs);
            }
            else
            {
                // Byte value arrives in a GP register, then gets moved to XMM
                regm_t regm = ALLREGS;
                codelem(cdb,e1,&regm,true); // eval left leaf
                const r = findreg(regm);

                reg_t reg;
                allocreg(cdb,&retregs,&reg, e.Ety);
                reg -= XMM0;
                // (V)MOVD reg,r
                cdb.gen2(LODD,modregxrmx(3,reg,r));
                checkSetVex(cdb.last(), TYushort8);
                if (config.avx >= 2)
                {
                    // VPBROADCASTB X/YMM,XMM
                    cdb.gen2(VPBROADCASTB, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), ty);
                }
                else
                {
                    if (config.avx)
                    {
                        // AVX1: PSHUFB with an all-zero shuffle mask broadcasts byte 0
                        reg_t zeroreg;
                        regm = XMMREGS & ~retregs;
                        // VPXOR XMM1,XMM1,XMM1
                        allocreg(cdb,&regm,&zeroreg, ty);
                        zeroreg -= XMM0;
                        cdb.gen2(PXOR, modregxrmx(3,zeroreg,zeroreg));
                        checkSetVex(cdb.last(), TYuchar16); // AVX-128
                        // VPSHUFB XMM,XMM,XMM1
                        cdb.gen2(PSHUFB, modregxrmx(3,reg,zeroreg));
                        checkSetVex(cdb.last(), TYuchar16); // AVX-128
                    }
                    else
                    {
                        // SSE: widen byte -> word -> dword, then broadcast dwords
                        // PUNPCKLBW XMM,XMM
                        cdb.gen2(PUNPCKLBW, modregxrmx(3,reg,reg));
                        // PUNPCKLWD XMM,XMM
                        cdb.gen2(PUNPCKLWD, modregxrmx(3,reg,reg));
                        // PSHUFD XMM,XMM,0
                        cdb.genc2(PSHUFD, modregxrmx(3,reg,reg), 0);
                    }
                    if (tysize(ty) == 32)
                    {
                        // VINSERTF128 YMM,YMM,XMM,1
                        cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1);
                        checkSetVex(cdb.last(), ty);
                    }
                }
            }
            break;

        case TYshort8:
        case TYushort8:
        case TYshort16:
        case TYushort16:
            if (config.avx >= 2 && e1.Eoper == OPind && !e1.Ecount)
            {
                // VPBROADCASTW X/YMM,MEM
                getlvalue(cdb,&cs, e1, 0);         // get addressing mode
                assert((cs.Irm & 0xC0) != 0xC0);   // AVX1 doesn't have register source operands
                reg_t reg;
                allocreg(cdb,&retregs,&reg,ty);
                cs.Iop = VPBROADCASTW;
                cs.Irex &= ~REX_W;
                cs.Iflags &= ~CFopsize;     // no 66h operand-size prefix on the Vex form
                code_newreg(&cs,reg - XMM0);
                checkSetVex(&cs,ty);
                cdb.gen(&cs);
            }
            else
            {
                regm_t regm = ALLREGS;
                codelem(cdb,e1,&regm,true); // eval left leaf
                reg_t r = findreg(regm);

                reg_t reg;
                allocreg(cdb,&retregs,&reg, e.Ety);
                reg -= XMM0;
                // (V)MOVD reg,r
                cdb.gen2(LODD,modregxrmx(3,reg,r));
                checkSetVex(cdb.last(), TYushort8);
                if (config.avx >= 2)
                {
                    // VPBROADCASTW X/YMM,XMM
                    cdb.gen2(VPBROADCASTW, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), ty);
                }
                else
                {
                    // (V)PUNPCKLWD XMM,XMM  -- word 0 -> dword 0
                    cdb.gen2(PUNPCKLWD, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), TYushort8); // AVX-128
                    // (V)PSHUFD XMM,XMM,0   -- broadcast dword 0
                    cdb.genc2(PSHUFD, modregxrmx(3,reg,reg), 0);
                    checkSetVex(cdb.last(), TYushort8); // AVX-128
                    if (tysize(ty) == 32)
                    {
                        // VINSERTF128 YMM,YMM,XMM,1
                        cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1);
                        checkSetVex(cdb.last(), ty);
                    }
                }
            }
            break;

        case TYlong8:
        case TYulong8:
        case TYlong4:
        case TYulong4:
            if (config.avx && e1.Eoper == OPind && !e1.Ecount)
            {
                // VPBROADCASTD/VBROADCASTSS X/YMM,MEM
                getlvalue(cdb,&cs, e1, 0);         // get addressing mode
                assert((cs.Irm & 0xC0) != 0xC0);   // AVX1 doesn't have register source operands
                reg_t reg;
                allocreg(cdb,&retregs,&reg,ty);
                cs.Iop = config.avx >= 2 ? VPBROADCASTD : VBROADCASTSS;
                cs.Irex &= ~REX_W;
                code_newreg(&cs,reg - XMM0);
                checkSetVex(&cs,ty);
                cdb.gen(&cs);
            }
            else
            {
                codelem(cdb,e1,&retregs,true); // eval left leaf
                const reg = cast(reg_t)(findreg(retregs) - XMM0);
                getregs(cdb,retregs);
                if (config.avx >= 2)
                {
                    // VPBROADCASTD X/YMM,XMM
                    cdb.gen2(VPBROADCASTD, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), ty);
                }
                else
                {
                    // (V)PSHUFD XMM,XMM,0
                    cdb.genc2(PSHUFD, modregxrmx(3,reg,reg), 0);
                    checkSetVex(cdb.last(), TYulong4); // AVX-128
                    if (tysize(ty) == 32)
                    {
                        // VINSERTF128 YMM,YMM,XMM,1
                        cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1);
                        checkSetVex(cdb.last(), ty);
                    }
                }
            }
            break;

        case TYllong2:
        case TYullong2:
        case TYllong4:
        case TYullong4:
            if (e1.Eoper == OPind && !e1.Ecount)
            {
                // VPBROADCASTQ/VBROADCASTSD/(V)PUNPCKLQDQ X/YMM,MEM
                getlvalue(cdb,&cs, e1, 0);         // get addressing mode
                assert((cs.Irm & 0xC0) != 0xC0);   // AVX1 doesn't have register source operands
                reg_t reg;
                allocreg(cdb,&retregs,&reg,ty);
                cs.Iop = config.avx >= 2 ? VPBROADCASTQ : tysize(ty) == 32 ? VBROADCASTSD : PUNPCKLQDQ;
                cs.Irex &= ~REX_W;
                code_newreg(&cs,reg - XMM0);
                checkSetVex(&cs,ty);
                cdb.gen(&cs);
            }
            else
            {
                codelem(cdb,e1,&retregs,true); // eval left leaf
                const reg = cast(reg_t)(findreg(retregs) - XMM0);
                getregs(cdb,retregs);
                if (config.avx >= 2)
                {
                    // VPBROADCASTQ X/YMM,XMM
                    cdb.gen2(VPBROADCASTQ, modregxrmx(3,reg,reg));
                    checkSetVex(cdb.last(), ty);
                }
                else
                {
                    // (V)PUNPCKLQDQ XMM,XMM
                    // NOTE(review): PUNPCKLQDQ takes no imm8, yet genc2 appends a 0
                    // here while the sibling unpacks use gen2 — verify intentional.
                    cdb.genc2(PUNPCKLQDQ, modregxrmx(3,reg,reg), 0);
                    checkSetVex(cdb.last(), TYullong2); // AVX-128
                    if (tysize(ty) == 32)
                    {
                        // VINSERTF128 YMM,YMM,XMM,1
                        cdb.genc2(VINSERTF128, modregxrmx(3,reg,reg), 1);
                        checkSetVex(cdb.last(), ty);
                    }
                }
            }
            break;

        default:
            assert(0);
    }

    fixresult(cdb,e,retregs,pretregs);
}

/*******************************************
 * Determine if lvalue e is a vector aligned on a 16/32 byte boundary.
 * Assume it to be aligned unless can prove it is not.
 * Params:
 *      e = lvalue
 * Returns:
 *      false if definitely not aligned
 */

bool xmmIsAligned(elem *e)
{
    if (tyvector(e.Ety) && e.Eoper == OPvar)
    {
        Symbol *s = e.EV.Vsym;
        const alignsz = tyalignsize(e.Ety);
        // Misaligned if the symbol's alignment is too small, the access offset
        // is not a multiple of the vector alignment, or the required alignment
        // exceeds what the stack guarantees.
        if (Symbol_Salignsize(s) < alignsz ||
            e.EV.Voffset & (alignsz - 1) ||
            alignsz > STACKALIGN
           )
            return false;       // definitely not aligned
    }
    return true;        // assume aligned
}

/**************************************
 * VEX prefixes can be 2 or 3 bytes.
 * If it must be 3 bytes, set the CFvex3 flag.
 * Params:
 *      c = instruction whose Ivex fields are already filled in
 */

void checkSetVex3(code *c)
{
    // See Intel Vol. 2A 2.3.5.6
    // The 2-byte (C5) form can only express W=0, X=1, B=1, mmmm=1;
    // in 32-bit mode it additionally cannot encode R=0 or a vvvv high bit of 0.
    if (c.Ivex.w || !c.Ivex.x || !c.Ivex.b || c.Ivex.mmmm > 0x1 ||
        !I64 && (c.Ivex.r || !(c.Ivex.vvvv & 8))
       )
    {
        c.Iflags |= CFvex3;
    }
}

/*************************
 * Determine if operation should be rewritten as a VEX
 * operation; and do so.
 * Params:
 *      c = code
 *      ty = type of operand
 */

void checkSetVex(code *c, tym_t ty)
{
    if (config.avx || tysize(ty) == 32)
    {
        // Register currently in the ModRM reg field; becomes vex.vvvv for
        // 3-operand (NDS) instructions.
        uint vreg = (c.Irm >> 3) & 7;
        if (c.Irex & REX_R)
            vreg |= 8;

        // TODO: This is too simplistic, depending on the instruction, vex.vvvv
        // encodes NDS, NDD, DDS, or no operand (NOO). The code below assumes
        // NDS (non-destructive source), except for the incomplete list of 2
        // operand instructions (NOO) handled by the switch.
        switch (c.Iop)
        {
            case LODSS:
            case LODSD:
            case STOSS:
            case STOSD:
                // Register-to-register moves keep the NDS form
                if ((c.Irm & 0xC0) == 0xC0)
                    break;
                goto case LODAPS;

            case LODAPS:
            case LODUPS:
            case LODAPD:
            case LODUPD:
            case LODDQA:
            case LODDQU:
            case LODD:
            case LODQ:
            case STOAPS:
            case STOUPS:
            case STOAPD:
            case STOUPD:
            case STODQA:
            case STODQU:
            case STOD:
            case STOQ:
            case COMISS:
            case COMISD:
            case UCOMISS:
            case UCOMISD:
            case MOVDDUP:
            case MOVSHDUP:
            case MOVSLDUP:
            case VBROADCASTSS:
            case PSHUFD:
            case PSHUFHW:
            case PSHUFLW:
            case VPBROADCASTB:
            case VPBROADCASTW:
            case VPBROADCASTD:
            case VPBROADCASTQ:
                vreg = 0;       // for 2 operand vex instructions
                break;

            case VBROADCASTSD:
            case VBROADCASTF128:
            case VBROADCASTI128:
                assert(tysize(ty) == 32); // AVX-256 only instructions
                vreg = 0;       // for 2 operand vex instructions
                break;

            case NOP:
                return;         // ignore

            default:
                break;
        }

        // Fold the legacy mandatory prefix / 0F map bytes into the VEX
        // mm-mmmm and pp fields.
        opcode_t op = 0xC4000000 | (c.Iop & 0xFF);
        switch (c.Iop & 0xFFFFFF00)
        {
            static uint MM_PP(uint mm, uint pp) { return (mm << 16) | (pp << 8); }
            case 0x00000F00: op |= MM_PP(1,0); break;
            case 0x00660F00: op |= MM_PP(1,1); break;
            case 0x00F30F00: op |= MM_PP(1,2); break;
            case 0x00F20F00: op |= MM_PP(1,3); break;
            case 0x660F3800: op |= MM_PP(2,1); break;
            case 0x660F3A00: op |= MM_PP(3,1); break;
            default:
                printf("Iop = %x\n", c.Iop);
                assert(0);
        }
        c.Iop = op;
        c.Ivex.pfx = 0xC4;
        // VEX stores R/X/B inverted relative to REX
        c.Ivex.r = !(c.Irex & REX_R);
        c.Ivex.x = !(c.Irex & REX_X);
        c.Ivex.b = !(c.Irex & REX_B);
        c.Ivex.w = (c.Irex & REX_W) != 0;
        c.Ivex.l = tysize(ty) == 32;    // VEX.L selects 256-bit operation

        c.Ivex.vvvv = cast(ushort)~vreg;    // vvvv is also stored inverted

        c.Iflags |= CFvex;
        checkSetVex3(c);        // decide between 2- and 3-byte encodings
    }
}

/**************************
 * Load complex operand into XMM registers or flags or both.
 * Params:
 *      cdb = sink for generated code
 *      e = complex-typed expression
 *      pretregs = desired result location (XMM pair and/or flags)
 */

void cloadxmm(ref CodeBuilder cdb, elem *e, regm_t *pretregs)
{
    //printf("e = %p, *pretregs = %s)\n", e, regm_str(*pretregs));
    //elem_print(e);
    assert(*pretregs & (XMMREGS | mPSW));
    // Fast path: caller wants the real part in XMM0 and the imaginary part
    // in XMM1, and the operand is not a constant.
    if (*pretregs == (mXMM0 | mXMM1) &&
        e.Eoper != OPconst)
    {
        code cs = void;
        tym_t tym = tybasic(e.Ety);
        tym_t ty = tym == TYcdouble ? TYdouble : TYfloat;   // component type
        opcode_t opmv = xmmload(tym, xmmIsAligned(e));

        regm_t retregs0 = mXMM0;
        reg_t reg0;
        allocreg(cdb, &retregs0, &reg0, ty);
        loadea(cdb, e, &cs, opmv, reg0, 0, RMload, 0);      // MOVSS/MOVSD XMM0,data
        checkSetVex(cdb.last(), ty);

        regm_t retregs1 = mXMM1;
        reg_t reg1;
        allocreg(cdb, &retregs1, &reg1, ty);
        // Second component lives at offset tysize(ty); keep XMM0 live
        loadea(cdb, e, &cs, opmv, reg1, tysize(ty), RMload, mXMM0); // MOVSS/MOVSD XMM1,data+offset
        checkSetVex(cdb.last(), ty);

        return;
    }
    // Everything else goes through the x87 complex loader
    cload87(cdb, e, pretregs);
}

}