/**
 * Implements the lexical analyzer, which converts source code into lexical tokens.
 *
 * Specification: $(LINK2 https://dlang.org/spec/lex.html, Lexical)
 *
 * Copyright:   Copyright (C) 1999-2021 by The D Language Foundation, All Rights Reserved
 * Authors:     $(LINK2 http://www.digitalmars.com, Walter Bright)
 * License:     $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 * Source:      $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/lexer.d, _lexer.d)
 * Documentation:  https://dlang.org/phobos/dmd_lexer.html
 * Coverage:    https://codecov.io/gh/dlang/dmd/src/master/src/dmd/lexer.d
 */

module dmd.lexer;

import core.stdc.ctype;
import core.stdc.errno;
import core.stdc.stdarg;
import core.stdc.stdio;
import core.stdc.stdlib : getenv;
import core.stdc.string;
import core.stdc.time;

import dmd.diagnostic : DiagnosticHandler, Severity, DefaultDiagnosticHandler, DefaultDiagnosticReporter;
import dmd.entity;
import dmd.errors;
import dmd.globals;
import dmd.id;
import dmd.identifier;
import dmd.root.ctfloat;
import dmd.root.outbuffer;
import dmd.root.port;
import dmd.root.rmem;
import dmd.root.string;
import dmd.tokens;
import dmd.utf;
import dmd.utils;

nothrow:

private enum LS = 0x2028;       // UTF line separator
private enum PS = 0x2029;       // UTF paragraph separator

/********************************************
 * Do our own char maps.
 *
 * `cmtable` is a 256-entry table, built at compile time, that holds one
 * bit set of `CM*` flags per byte value. The predicates below (`isoctal`,
 * `ishex`, ...) are plain table lookups.
 */
private static immutable cmtable = () {
    ubyte[256] table;
    foreach (const c; 0 .. table.length)
    {
        if ('0' <= c && c <= '7')
            table[c] |= CMoctal;
        if (c_isxdigit(c))
            table[c] |= CMhex;
        if (c_isalnum(c) || c == '_')
            table[c] |= CMidchar;

        // Characters that keep a numeric literal going when they appear
        // right after a leading '0' (CMzerosecond) or a leading '1'..'9'
        // (CMdigitsecond).
        switch (c)
        {
            case 'x': case 'X':
            case 'b': case 'B':
                table[c] |= CMzerosecond;
                break;

            case '0': .. case '9':
            case 'e': case 'E':
            case 'f': case 'F':
            case 'l': case 'L':
            case 'p': case 'P':
            case 'u': case 'U':
            case 'i':
            case '.':
            case '_':
                table[c] |= CMzerosecond | CMdigitsecond;
                break;

            default:
                break;
        }

        // Every 7-bit character except backslash, line ends, the two end
        // of file markers and the single quote is flagged CMsinglechar.
        switch (c)
        {
            case '\\':
            case '\n':
            case '\r':
            case 0:
            case 0x1A:
            case '\'':
                break;
            default:
                if (!(c & 0x80))
                    table[c] |= CMsinglechar;
                break;
        }
    }
    return table;
}();

// Bit flags stored in `cmtable`.
private
{
    enum CMoctal  = 0x1;        // '0' .. '7'
    enum CMhex    = 0x2;        // accepted by c_isxdigit
    enum CMidchar = 0x4;        // ASCII letter, digit or '_'
    enum CMzerosecond = 0x8;    // may follow a leading '0' in a number
    enum CMdigitsecond = 0x10;  // may follow a leading '1'..'9' in a number
    enum CMsinglechar = 0x20;   // see the second switch in `cmtable`
}

/// Returns: `true` if `c` has the `CMoctal` flag ('0' .. '7').
private bool isoctal(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMoctal) != 0;
}

/// Returns: `true` if `c` has the `CMhex` flag (an ASCII hex digit).
private bool ishex(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMhex) != 0;
}

/// Returns: `true` if `c` has the `CMidchar` flag (ASCII letter, digit or '_').
private bool isidchar(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMidchar) != 0;
}

/// Returns: `true` if `c` has the `CMzerosecond` flag.
private bool isZeroSecond(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMzerosecond) != 0;
}

/// Returns: `true` if `c` has the `CMdigitsecond` flag.
private bool isDigitSecond(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMdigitsecond) != 0;
}

/// Returns: `true` if `c` has the `CMsinglechar` flag.
private bool issinglechar(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMsinglechar) != 0;
}

/// ASCII-only hex digit test, usable while `cmtable` is being built.
private bool c_isxdigit(const int c) pure @nogc @safe
{
    return (( c >= '0' && c <= '9') ||
            ( c >= 'a' && c <= 'f') ||
            ( c >= 'A' && c <= 'F'));
}

/// ASCII-only letter-or-digit test, usable while `cmtable` is being built.
private bool c_isalnum(const int c) pure @nogc @safe
{
    return (( c >= '0' && c <= '9') ||
            ( c >= 'a' && c <= 'z') ||
            ( c >= 'A' && c <= 'Z'));
}

unittest
{
    //printf("lexer.unittest\n");
    /* Not much here, just trying things out.
*/
    string text = "int"; // We rely on the implicit null-terminator
    DefaultDiagnosticHandler diagnosticHandler;
    scope Lexer lex1 = new Lexer(null, text.ptr, 0, text.length, 0, 0, diagnosticHandler.diagnosticHandler);
    TOK tok;
    tok = lex1.nextToken();
    diagnosticHandler.report();
    //printf("tok == %s, %d, %d\n", Token::toChars(tok), tok, TOK.int32);
    assert(tok == TOK.int32);
    // Once the end of the input is reached, every further call must keep
    // returning TOK.endOfFile.
    tok = lex1.nextToken();
    diagnosticHandler.report();
    assert(tok == TOK.endOfFile);
    tok = lex1.nextToken();
    diagnosticHandler.report();
    assert(tok == TOK.endOfFile);
    tok = lex1.nextToken();
    diagnosticHandler.report();
    assert(tok == TOK.endOfFile);
}

unittest
{
    // We don't want to see Lexer error output during these tests.
    uint errors = global.startGagging();
    scope(exit) global.endGagging(errors);

    // Test malformed input: even malformed input should end in a TOK.endOfFile.
    static immutable char[][] testcases =
    [   // Testcase must end with 0 or 0x1A.
[0], // not malformed, but pathological
        ['\'', 0],
        ['\'', 0x1A],
        ['{', '{', 'q', '{', 0],
        [0xFF, 0],
        [0xFF, 0x80, 0],
        [0xFF, 0xFF, 0],
        // NOTE(review): identical to the previous entry; possibly a
        // different byte sequence was intended here - confirm.
        [0xFF, 0xFF, 0],
        ['x', '"', 0x1A],
    ];

    foreach (testcase; testcases)
    {
        DefaultDiagnosticHandler diagnosticHandler;
        // endoffset is the index of the terminating 0 / 0x1A byte.
        scope Lexer lex2 = new Lexer(null, testcase.ptr, 0, testcase.length-1, 0, 0, diagnosticHandler.diagnosticHandler);
        TOK tok = lex2.nextToken();
        diagnosticHandler.report();
        // Bound the loop by the input length so that a lexer that never
        // reaches the end of file fails the assert instead of hanging.
        size_t iterations = 1;
        while ((tok != TOK.endOfFile) && (iterations++ < testcase.length))
        {
            tok = lex2.nextToken();
        }
        assert(tok == TOK.endOfFile);
        tok = lex2.nextToken();
        assert(tok == TOK.endOfFile);
    }
}

version (DMDLIB)
{
    version = LocOffset;
}

/***********************************************************
 * Hand-written lexer: turns the characters between `base` and `end` into
 * `Token`s, one per call to `nextToken`, with look-ahead through `peek`.
 */
class Lexer
{
    // Scratch buffer in which the string literal lexers build their result.
    private __gshared OutBuffer stringbuffer;

    Loc scanloc;                // for error messages
    Loc prevloc;                // location of token before current

    const(char)* p;             // current character

    Token token;                // current token; `token.next` chains peeked tokens

    private
    {
        const(char)* base;      // pointer to start of buffer
        const(char)* end;       // pointer to last element of buffer
        const(char)* line;      // start of current line

        bool doDocComment;      // collect doc comment information
        bool anyToken;          // seen at least one token
        bool commentToken;      // comments are TOK.comment's
        int inTokenStringConstant; // can be larger than 1 when in nested q{} strings
        int lastDocLine;        // last line of previous doc comment

        Token* tokenFreelist;   // released tokens, reused by allocateToken
        DiagnosticHandler handleDiagnostic;
        DefaultDiagnosticReporter diagnosticReporter;
    }

  nothrow:

    /*********************
     * Creates a Lexer for the source code base[begoffset..endoffset+1].
     * The last character, base[endoffset], must be null (0) or EOF (0x1A).
*
     * If the text starts with `#!`, that whole first line is skipped
     * before any token is produced.
     *
     * Params:
     *  filename = used for error messages
     *  base = source code, must be terminated by a null (0) or EOF (0x1A) character
     *  begoffset = starting offset into base[]
     *  endoffset = the last offset to read into base[]
     *  doDocComment = handle documentation comments
     *  commentToken = comments become TOK.comment's
     *  handleDiagnostic = diagnostic handler
     */
    this(const(char)* filename, const(char)* base, size_t begoffset,
        size_t endoffset, bool doDocComment, bool commentToken,
        DiagnosticHandler handleDiagnostic) pure
    {
        //printf("Lexer::Lexer(%p,%d)\n",base,length);
        //printf("lexer.filename = %s\n", filename);
        token = Token.init;
        scanloc = Loc(filename, 1, 1);

        // Buffer bounds and the scanning position.
        this.base = base;
        this.end = base + endoffset;
        p = base + begoffset;
        line = p;

        // Lexing options and bookkeeping.
        this.doDocComment = doDocComment;
        this.commentToken = commentToken;
        this.handleDiagnostic = handleDiagnostic;
        this.inTokenStringConstant = 0;
        this.lastDocLine = 0;

        //initKeywords();
        // Nothing more to do unless the first line starts with '#!'.
        if (p is null || p[0] != '#' || p[1] != '!')
            return;

        // Skip the rest of the '#!' line. A newline is consumed, whereas an
        // end of file marker is left in place for `scan` to report.
        p += 2;
        while (*p != 0 && *p != 0x1A && *p != '\n')
            ++p;
        if (*p == '\n')
            ++p;
        endOfLine();
    }

    /// Returns: a `Token` taken from the freelist when one is available,
    /// otherwise a newly allocated one.
    Token* allocateToken() pure nothrow @safe
    {
        if (tokenFreelist is null)
            return new Token();

        Token* recycled = tokenFreelist;
        tokenFreelist = recycled.next;
        recycled.next = null;
        return recycled;
    }

    /// Frees the given token by returning it to the freelist.
private void releaseToken(Token* token) pure nothrow @nogc @safe
    {
        // Reset the token first when the GC is enabled; the token is then
        // pushed onto the freelist either way.
        if (mem.isGCEnabled)
            *token = Token.init;
        token.next = tokenFreelist;
        tokenFreelist = token;
    }

    /***********************
     * Advance to the next token and make it the current `token`.
     *
     * A token already produced by `peek` is copied into `token` and its
     * storage released; otherwise the input is scanned.
     *
     * Returns: the `value` of the new current token
     */
    TOK nextToken()
    {
        prevloc = token.loc;
        if (token.next)
        {
            Token* t = token.next;
            memcpy(&token, t, Token.sizeof);
            releaseToken(t);
        }
        else
        {
            scan(&token);
        }
        //printf(token.toChars());
        return token.value;
    }

    /***********************
     * Look ahead at next token's value.
     */
    final TOK peekNext()
    {
        return peek(&token).value;
    }

    /***********************
     * Look 2 tokens ahead at value.
     */
    final TOK peekNext2()
    {
        Token* t = peek(&token);
        return peek(t).value;
    }

    /****************************
     * Turn next token in buffer into a token.
     *
     * White space and (unless `commentToken` is set) comments are skipped.
     * At a 0 or 0x1A byte `TOK.endOfFile` is produced without advancing `p`.
     *
     * Params:
     *  t = the token to fill in
     */
    final void scan(Token* t)
    {
        const lastLine = scanloc.linnum;
        Loc startLoc;
        t.blockComment = null;
        t.lineComment = null;

        while (1)
        {
            t.ptr = p;
            //printf("p = %p, *p = '%c'\n",p,*p);
            t.loc = loc();
            switch (*p)
            {
            case 0:
            case 0x1A:
                t.value = TOK.endOfFile; // end of file
                // Intentionally not advancing `p`, such that subsequent calls keep returning TOK.endOfFile.
                return;
            case ' ':
            case '\t':
            case '\v':
            case '\f':
                p++;
                continue; // skip white space
            case '\r':
                p++;
                if (*p != '\n') // if CR stands by itself
                {
                    endOfLine();
                    goto skipFourSpaces;
                }
                continue; // skip white space
            case '\n':
                p++;
                endOfLine();
            skipFourSpaces:
                // Skip leading indentation four spaces at a time.
                // NOTE(review): this reads 4 bytes through a uint* cast;
                // presumably the buffer is padded past its terminator and
                // unaligned reads are acceptable - confirm.
                while (*(cast(uint*)p) == 0x20202020) //' ' == 0x20
                {
                    p+=4;
                }
                continue; // skip white space
            case '0':
                if (!isZeroSecond(p[1])) // if numeric literal does not continue
                {
                    ++p;
                    t.unsvalue = 0;
                    t.value = TOK.int32Literal;
                    return;
                }
                goto Lnumber;

            case '1': .. case '9':
                if (!isDigitSecond(p[1])) // if numeric literal does not continue
                {
                    t.unsvalue = *p - '0';
                    ++p;
                    t.value = TOK.int32Literal;
                    return;
                }
            Lnumber:
                t.value = number(t);
                return;

            case '\'':
                if (issinglechar(p[1]) && p[2] == '\'')
                {
                    t.unsvalue = p[1]; // simple one character literal
                    t.value = TOK.charLiteral;
                    p += 3;
                }
                else
                    t.value = charConstant(t);
                return;
            case 'r':
                // r"..." is a wysiwyg string, anything else an identifier
                if (p[1] != '"')
                    goto case_ident;
                p++;
                goto case '`';
            case '`':
                wysiwygStringConstant(t);
                return;
            case 'x':
                // x"..." is a hex string, anything else an identifier
                if (p[1] != '"')
                    goto case_ident;
                p++;
                auto start = p;
                auto hexString = new OutBuffer();
                t.value = hexStringConstant(t);
                // The literal is still lexed, but always reported as an
                // error that quotes its source text.
                hexString.write(start[0 .. p - start]);
                error("Built-in hex string literals are obsolete, use `std.conv.hexString!%s` instead.", hexString.extractChars());
                return;
            case 'q':
                if (p[1] == '"')
                {
                    p++;
                    delimitedStringConstant(t);
                    return;
                }
                else if (p[1] == '{')
                {
                    p++;
                    tokenStringConstant(t);
                    return;
                }
                else
                    goto case_ident;
            case '"':
                escapeStringConstant(t);
                return;
            case 'a':
            case 'b':
            case 'c':
            case 'd':
            case 'e':
            case 'f':
            case 'g':
            case 'h':
            case 'i':
            case 'j':
            case 'k':
            case 'l':
            case 'm':
            case 'n':
            case 'o':
            case 'p':
                /*case 'q': case 'r':*/
            case 's':
            case 't':
            case 'u':
            case 'v':
            case 'w':
                /*case 'x':*/
            case 'y':
            case 'z':
            case 'A':
            case 'B':
            case 'C':
            case 'D':
            case 'E':
            case 'F':
            case 'G':
            case 'H':
            case 'I':
            case 'J':
            case 'K':
            case 'L':
            case 'M':
            case 'N':
            case 'O':
            case 'P':
            case 'Q':
            case 'R':
            case 'S':
            case 'T':
            case 'U':
            case 'V':
            case 'W':
            case 'X':
            case 'Y':
            case 'Z':
            case '_':
            case_ident:
                {
                    // Consume the rest of the identifier.
                    while (1)
                    {
                        const c = *++p;
                        if (isidchar(c))
                            continue;
                        else if (c & 0x80)
                        {
                            const s = p;
                            const u = decodeUTF();
                            if (isUniAlpha(u))
                                continue;
                            error("char 0x%04x not allowed in identifier", u);
                            // back up so the offending character is not
                            // part of the identifier
                            p = s;
                        }
                        break;
                    }
                    Identifier id = Identifier.idPool(cast(char*)t.ptr, cast(uint)(p - t.ptr));
                    t.ident = id;
                    t.value = cast(TOK)id.getValue();
                    anyToken = 1;
                    if (*t.ptr == '_') // if special identifier token
                    {
                        // Lazy initialization
                        TimeStampInfo.initialize(t.loc);

                        if (id == Id.DATE)
                        {
                            t.ustring = TimeStampInfo.date.ptr;
                            goto Lstr;
                        }
                        else if (id == Id.TIME)
                        {
                            t.ustring = TimeStampInfo.time.ptr;
                            goto Lstr;
                        }
                        else if (id == Id.VENDOR)
                        {
                            t.ustring = global.vendor.xarraydup.ptr;
                            goto Lstr;
                        }
                        else if (id == Id.TIMESTAMP)
                        {
                            t.ustring = TimeStampInfo.timestamp.ptr;
                        Lstr:
                            // the special identifier becomes a string literal
                            t.value = TOK.string_;
                            t.postfix = 0;
                            t.len = cast(uint)strlen(t.ustring);
                        }
                        else if (id == Id.VERSIONX)
                        {
                            t.value = TOK.int64Literal;
                            t.unsvalue = global.versionNumber();
                        }
                        else if (id == Id.EOFX)
                        {
                            t.value = TOK.endOfFile;
                            // Advance scanner to end of file
                            while (!(*p == 0 || *p == 0x1A))
                                p++;
                        }
                    }
                    //printf("t.value = %d\n",t.value);
                    return;
                }
            case '/':
                p++;
                switch (*p)
                {
                case '=':
                    p++;
                    t.value = TOK.divAssign;
                    return;
                case '*':
                    // block comment
                    p++;
                    startLoc = loc();
                    while (1)
                    {
                        // advance to the next '/'
                        while (1)
                        {
                            const c = *p;
                            switch (c)
                            {
                            case '/':
                                break;
                            case '\n':
                                endOfLine();
                                p++;
                                continue;
                            case '\r':
                                p++;
                                if (*p != '\n')
                                    endOfLine();
                                continue;
                            case 0:
                            case 0x1A:
                                error("unterminated /* */ comment");
                                p = end;
                                t.loc = loc();
                                t.value = TOK.endOfFile;
                                return;
                            default:
                                if (c & 0x80)
                                {
                                    const u = decodeUTF();
                                    if (u == PS || u == LS)
                                        endOfLine();
                                }
                                p++;
                                continue;
                            }
                            break;
                        }
                        p++;
                        // The '/' ends the comment only when preceded by a
                        // '*' other than the one that opened the comment.
                        if (p[-2] == '*' && p - 3 != t.ptr)
                            break;
                    }
                    if (commentToken)
                    {
                        t.loc = startLoc;
                        t.value = TOK.comment;
                        return;
                    }
                    else if (doDocComment && t.ptr[2] == '*' && p - 4 != t.ptr)
                    {
                        // if /** but not /**/
                        getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                        lastDocLine = scanloc.linnum;
                    }
                    continue;
                case '/': // do // style comments
                    startLoc = loc();
                    while (1)
                    {
                        const c = *++p;
                        switch (c)
                        {
                        case '\n':
                            break;
                        case '\r':
                            if (p[1] == '\n')
                                p++;
                            break;
                        case 0:
                        case 0x1A:
                            // the comment runs up to the end of the file
                            if (commentToken)
                            {
                                p = end;
                                t.loc = startLoc;
                                t.value = TOK.comment;
                                return;
                            }
                            if (doDocComment && t.ptr[2] == '/')
                            {
                                getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                                lastDocLine = scanloc.linnum;
                            }
                            p = end;
                            t.loc = loc();
                            t.value = TOK.endOfFile;
                            return;
                        default:
                            if (c & 0x80)
                            {
                                const u = decodeUTF();
                                if (u == PS || u == LS)
                                    break;
                            }
                            continue;
                        }
                        break;
                    }
                    if (commentToken)
                    {
                        p++;
                        endOfLine();
                        t.loc = startLoc;
                        t.value = TOK.comment;
                        return;
                    }
                    if (doDocComment && t.ptr[2] == '/')
                    {
                        getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                        lastDocLine = scanloc.linnum;
                    }
                    p++;
                    endOfLine();
                    continue;
                case '+':
                    {
                        // nesting comment; `nest` counts the open levels
                        int nest;
                        startLoc = loc();
                        p++;
                        nest = 1;
                        while (1)
                        {
                            char c = *p;
                            switch (c)
                            {
                            case '/':
                                p++;
                                if (*p == '+')
                                {
                                    p++;
                                    nest++;
                                }
                                continue;
                            case '+':
                                p++;
                                if (*p == '/')
                                {
                                    p++;
                                    if (--nest == 0)
                                        break;
                                }
                                continue;
                            case '\r':
                                p++;
                                if (*p != '\n')
                                    endOfLine();
                                continue;
                            case '\n':
                                endOfLine();
                                p++;
                                continue;
                            case 0:
                            case 0x1A:
                                error("unterminated /+ +/ comment");
                                p = end;
                                t.loc = loc();
                                t.value = TOK.endOfFile;
                                return;
                            default:
                                if (c & 0x80)
                                {
                                    uint u = decodeUTF();
                                    if (u == PS || u == LS)
                                        endOfLine();
                                }
                                p++;
                                continue;
                            }
                            break;
                        }
                        if (commentToken)
                        {
                            t.loc = startLoc;
                            t.value = TOK.comment;
                            return;
                        }
                        if (doDocComment && t.ptr[2] == '+' && p - 4 != t.ptr)
                        {
                            // if /++ but not /++/
                            getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                            lastDocLine = scanloc.linnum;
                        }
                        continue;
                    }
                default:
                    break;
                }
                t.value = TOK.div;
                return;
            case '.':
                p++;
                if (isdigit(*p))
                {
                    /* Note that we don't allow ._1 and ._ as being
                     * valid floating point numbers.
                     */
                    p--;
                    t.value = inreal(t);
                }
                else if (p[0] == '.')
                {
                    if (p[1] == '.')
                    {
                        p += 2;
                        t.value = TOK.dotDotDot;
                    }
                    else
                    {
                        p++;
                        t.value = TOK.slice;
                    }
                }
                else
                    t.value = TOK.dot;
                return;
            case '&':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.andAssign;
                }
                else if (*p == '&')
                {
                    p++;
                    t.value = TOK.andAnd;
                }
                else
                    t.value = TOK.and;
                return;
            case '|':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.orAssign;
                }
                else if (*p == '|')
                {
                    p++;
                    t.value = TOK.orOr;
                }
                else
                    t.value = TOK.or;
                return;
            case '-':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.minAssign;
                }
                else if (*p == '-')
                {
                    p++;
                    t.value = TOK.minusMinus;
                }
                else
                    t.value = TOK.min;
                return;
            case '+':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.addAssign;
                }
                else if (*p == '+')
                {
                    p++;
                    t.value = TOK.plusPlus;
                }
                else
                    t.value = TOK.add;
                return;
            case '<':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.lessOrEqual; // <=
                }
                else if (*p == '<')
                {
                    p++;
                    if (*p == '=')
                    {
                        p++;
                        t.value = TOK.leftShiftAssign; // <<=
                    }
                    else
                        t.value = TOK.leftShift; // <<
                }
                else
                    t.value = TOK.lessThan; // <
                return;
            case '>':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.greaterOrEqual; // >=
                }
                else if (*p == '>')
                {
                    p++;
                    if (*p == '=')
                    {
                        p++;
                        t.value = TOK.rightShiftAssign; // >>=
                    }
                    else if (*p == '>')
                    {
                        p++;
                        if (*p == '=')
                        {
                            p++;
                            t.value = TOK.unsignedRightShiftAssign; // >>>=
                        }
                        else
                            t.value = TOK.unsignedRightShift; // >>>
                    }
                    else
                        t.value = TOK.rightShift; // >>
                }
                else
                    t.value = TOK.greaterThan; // >
                return;
            case '!':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.notEqual; // !=
                }
                else
                    t.value = TOK.not; // !
                return;
            case '=':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.equal; // ==
                }
                else if (*p == '>')
                {
                    p++;
                    t.value = TOK.goesTo; // =>
                }
                else
                    t.value = TOK.assign; // =
                return;
            case '~':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.concatenateAssign; // ~=
                }
                else
                    t.value = TOK.tilde; // ~
                return;
            case '^':
                p++;
                if (*p == '^')
                {
                    p++;
                    if (*p == '=')
                    {
                        p++;
                        t.value = TOK.powAssign; // ^^=
                    }
                    else
                        t.value = TOK.pow; // ^^
                }
                else if (*p == '=')
                {
                    p++;
                    t.value = TOK.xorAssign; // ^=
                }
                else
                    t.value = TOK.xor; // ^
                return;
            case '(':
                p++;
                t.value = TOK.leftParentheses;
                return;
            case ')':
                p++;
                t.value = TOK.rightParentheses;
                return;
            case '[':
                p++;
                t.value = TOK.leftBracket;
                return;
            case ']':
                p++;
                t.value = TOK.rightBracket;
                return;
            case '{':
                p++;
                t.value = TOK.leftCurly;
                return;
            case '}':
                p++;
                t.value = TOK.rightCurly;
                return;
            case '?':
                p++;
                t.value = TOK.question;
                return;
            case ',':
                p++;
                t.value = TOK.comma;
                return;
            case ';':
                p++;
                t.value = TOK.semicolon;
                return;
            case ':':
                p++;
                t.value = TOK.colon;
                return;
            case '$':
                p++;
                t.value = TOK.dollar;
                return;
            case '@':
                p++;
                t.value = TOK.at;
                return;
            case '*':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.mulAssign;
                }
                else
                    t.value = TOK.mul;
                return;
            case '%':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.modAssign;
                }
                else
                    t.value = TOK.mod;
                return;
            case '#':
                {
                    // Scan the token after '#' to tell `#line` apart from
                    // other directives.
                    p++;
                    Token n;
                    scan(&n);
                    if (n.value == TOK.identifier)
                    {
                        if (n.ident == Id.line)
                        {
                            poundLine();
                            continue;
                        }
                        else
                        {
                            const locx = loc();
                            warning(locx, "C preprocessor directive `#%s` is not supported", n.ident.toChars());
                        }
                    }
                    else if (n.value == TOK.if_)
                    {
                        error("C preprocessor directive `#if` is not supported, use `version` or `static if`");
                    }
                    t.value = TOK.pound;
                    return;
                }
            default:
                {
                    dchar c = *p;
                    if (c & 0x80)
                    {
                        c = decodeUTF();
                        // Check for start of unicode identifier
                        if (isUniAlpha(c))
                            goto case_ident;
                        if (c == PS || c == LS)
                        {
                            endOfLine();
                            p++;
                            continue;
                        }
                    }
                    // Report the stray character, then skip it.
                    if (c < 0x80 && isprint(c))
                        error("character '%c' is not a valid token", c);
                    else
                        error("character 0x%02x is not a valid token", c);
                    p++;
                    continue;
                }
            }
        }
    }

    /***********************
     * Look ahead at the token following `ct`.
     *
     * The token is scanned on first use and linked in as `ct.next`, so
     * repeated calls return the same token.
     *
     * Params:
     *  ct = the token to look past
     * Returns: the token after `ct`
     */
    final Token* peek(Token* ct)
    {
        Token* t;
        if (ct.next)
            t = ct.next;
        else
        {
            t = allocateToken();
            scan(t);
            ct.next = t;
        }
        return t;
    }

    /*********************************
     * tk is on the opening (.
* Look ahead and return token that is past the closing ).
     *
     * The search also stops, returning the token it stopped on, at the end
     * of file, at a `}` that has no matching `{` inside the parentheses,
     * or at a `;` outside any curly braces.
     */
    final Token* peekPastParen(Token* tk)
    {
        //printf("peekPastParen()\n");
        int parens = 1;
        int curlynest = 0;
        while (1)
        {
            tk = peek(tk);
            //tk.print();
            switch (tk.value)
            {
            case TOK.leftParentheses:
                parens++;
                continue;
            case TOK.rightParentheses:
                --parens;
                if (parens)
                    continue;
                // matching ')' found: step past it
                tk = peek(tk);
                break;
            case TOK.leftCurly:
                curlynest++;
                continue;
            case TOK.rightCurly:
                if (--curlynest >= 0)
                    continue;
                break;
            case TOK.semicolon:
                if (curlynest)
                    continue;
                break;
            case TOK.endOfFile:
                break;
            default:
                continue;
            }
            return tk;
        }
    }

    /*******************************************
     * Parse escape sequence.
     *
     * Forwards to the static overload with the current token's location
     * and the scanning pointer `p`, which is advanced past the sequence.
     */
    private uint escapeSequence()
    {
        return Lexer.escapeSequence(token.loc, p);
    }

    /**
    Parse the given string literal escape sequence into a single character.
    Params:
        loc = the location of the current token
        sequence = pointer to string with escape sequence to parse.
this is a reference
        variable that is also used to return the position after the sequence
    Returns:
        the escaped sequence as a single character
    */
    private static dchar escapeSequence(const ref Loc loc, ref const(char)* sequence)
    {
        const(char)* p = sequence; // cache sequence reference on stack
        scope(exit) sequence = p;

        uint c = *p;
        int ndigits;
        switch (c)
        {
        case '\'':
        case '"':
        case '?':
        case '\\':
        Lconsume:
            p++;
            break;
        case 'a':
            c = 7;
            goto Lconsume;
        case 'b':
            c = 8;
            goto Lconsume;
        case 'f':
            c = 12;
            goto Lconsume;
        case 'n':
            c = 10;
            goto Lconsume;
        case 'r':
            c = 13;
            goto Lconsume;
        case 't':
            c = 9;
            goto Lconsume;
        case 'v':
            c = 11;
            goto Lconsume;
        case 'u':
            ndigits = 4;
            goto Lhex;
        case 'U':
            ndigits = 8;
            goto Lhex;
        case 'x':
            ndigits = 2;
        Lhex:
            // exactly `ndigits` hex digits are expected
            p++;
            c = *p;
            if (ishex(cast(char)c))
            {
                uint v = 0;
                int n = 0;
                while (1)
                {
                    if (isdigit(cast(char)c))
                        c -= '0';
                    else if (islower(c))
                        c -= 'a' - 10;
                    else
                        c -= 'A' - 10;
                    v = v * 16 + c;
                    c = *++p;
                    if (++n == ndigits)
                        break;
                    if (!ishex(cast(char)c))
                    {
                        .error(loc, "escape hex sequence has %d hex digits instead of %d", n, ndigits);
                        break;
                    }
                }
                // only \u and \U are checked for being a valid code point
                if (ndigits != 2 && !utf_isValidDchar(v))
                {
                    .error(loc, "invalid UTF character \\U%08x", v);
                    v = '?'; // recover with valid UTF character
                }
                c = v;
            }
            else
            {
                // `sequence` still refers to the start of the escape here
                .error(loc, "undefined escape hex sequence \\%c%c", sequence[0], c);
                p++;
            }
            break;
        case '&':
            // named character entity
            for (const idstart = ++p; 1; p++)
            {
                switch (*p)
                {
                case ';':
                    c = HtmlNamedEntity(idstart, p - idstart);
                    if (c == ~0)
                    {
                        .error(loc, "unnamed character entity &%.*s;", cast(int)(p - idstart), idstart);
                        c = '?';
                    }
                    p++;
                    break;
                default:
                    // a name is made of letters, plus digits after the
                    // first character
                    if (isalpha(*p) || (p != idstart && isdigit(*p)))
                        continue;
                    .error(loc, "unterminated named entity &%.*s;", cast(int)(p - idstart + 1), idstart);
                    c = '?';
                    break;
                }
                break;
            }
            break;
        case 0:
        case 0x1A:
            // end of file
            c = '\\';
            break;
        default:
            if (isoctal(cast(char)c))
            {
                // up to three octal digits
                uint v = 0;
                int n = 0;
                do
                {
                    v = v * 8 + (c - '0');
                    c = *++p;
                }
                while (++n < 3 && isoctal(cast(char)c));
                c = v;
                if (c > 0xFF)
                    .error(loc, "escape octal sequence \\%03o is larger than \\377", c);
            }
            else
            {
                .error(loc, "undefined escape sequence \\%c", c);
                p++;
            }
            break;
        }
        return c;
    }

    /**
    Lex a wysiwyg string. `p` must be pointing to the first character before the
    contents of the string literal. The character pointed to by `p` will be used as
    the terminating character (i.e. backtick or double-quote).
Params:
        result = pointer to the token that accepts the result
    */
    private void wysiwygStringConstant(Token* result)
    {
        result.value = TOK.string_;
        Loc start = loc();
        auto terminator = p[0];
        p++;
        stringbuffer.setsize(0);
        while (1)
        {
            dchar c = p[0];
            p++;
            switch (c)
            {
            case '\n':
                endOfLine();
                break;
            case '\r':
                if (p[0] == '\n')
                    continue; // ignore
                c = '\n'; // treat EndOfLine as \n character
                endOfLine();
                break;
            case 0:
            case 0x1A:
                error("unterminated string constant starting at %s", start.toChars());
                result.setString();
                // rewind `p` so it points to the EOF character
                p--;
                return;
            default:
                if (c == terminator)
                {
                    result.setString(stringbuffer);
                    stringPostfix(result);
                    return;
                }
                else if (c & 0x80)
                {
                    // non-ASCII: decode the whole sequence and copy it
                    p--;
                    const u = decodeUTF();
                    p++;
                    if (u == PS || u == LS)
                        endOfLine();
                    stringbuffer.writeUTF8(u);
                    continue;
                }
                break;
            }
            stringbuffer.writeByte(c);
        }
    }

    /**************************************
     * Lex hex strings:
     *      x"0A ae 34FE BD"
     *
     * `p` must point to the opening double-quote. White space and line
     * ends between the hex digits are skipped; each pair of digits
     * becomes one byte of the result.
     *
     * Params:
     *  t = the token that receives the string
     * Returns: `TOK.hexadecimalString`
     */
    private TOK hexStringConstant(Token* t)
    {
        Loc start = loc();
        uint n = 0;     // number of hex digits seen so far
        uint v = ~0;    // dead assignment, needed to suppress warning
        p++;
        stringbuffer.setsize(0);
        while (1)
        {
            dchar c = *p++;
            switch (c)
            {
            case ' ':
            case '\t':
            case '\v':
            case '\f':
                continue; // skip white space
            case '\r':
                if (*p == '\n')
                    continue; // ignore '\r' if followed by '\n'
                // Treat isolated '\r' as if it were a '\n'
                goto case '\n';
            case '\n':
                endOfLine();
                continue;
            case 0:
            case 0x1A:
                error("unterminated string constant starting at %s", start.toChars());
                t.setString();
                // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
                p--;
                return TOK.hexadecimalString;
            case '"':
                if (n & 1)
                {
                    error("odd number (%d) of hex characters in hex string", n);
                    stringbuffer.writeByte(v);
                }
                t.setString(stringbuffer);
                stringPostfix(t);
                return TOK.hexadecimalString;
            default:
                if (c >= '0' && c <= '9')
                    c -= '0';
                else if (c >= 'a' && c <= 'f')
                    c -= 'a' - 10;
                else if (c >= 'A' && c <= 'F')
                    c -= 'A' - 10;
                else if (c & 0x80)
                {
                    p--;
                    const u = decodeUTF();
                    p++;
                    if (u == PS || u == LS)
                    {
                        // A Unicode line/paragraph separator is a line
                        // end, just like '\n' above: it must not be
                        // counted as a hex digit.
                        endOfLine();
                        continue;
                    }
                    else
                        error("non-hex character \\u%04x in hex string", u);
                }
                else
                    error("non-hex character '%c' in hex string", c);
                if (n & 1)
                {
                    // second digit of a pair: emit the byte
                    v = (v << 4) | c;
                    stringbuffer.writeByte(v);
                }
                else
                    v = c;
                n++;
                break;
            }
        }
        assert(0); // see bug 15731
    }

    /**
    Lex a delimited string. Some examples of delimited strings are:
    ---
    q"(foo(xxx))"      // "foo(xxx)"
    q"[foo$(LPAREN)]"  // "foo$(LPAREN)"
    q"/foo]/"          // "foo]"
    q"HERE
    foo
    HERE"              // "foo\n"
    ---
    It is assumed that `p` points to the opening double-quote '"'.
Params:
        result = pointer to the token that accepts the result
    */
    private void delimitedStringConstant(Token* result)
    {
        result.value = TOK.string_;
        Loc start = loc();
        dchar delimleft = 0;            // opening delimiter, 0 until it has been read
        dchar delimright = 0;           // closing delimiter
        uint nest = 1;                  // 1 when the delimiters are a nesting bracket pair
        uint nestcount = ~0;            // dead assignment, needed to suppress warning
        Identifier hereid = null;       // heredoc identifier, if any
        uint blankrol = 0;              // rest of the heredoc opening line must be blank
        uint startline = 0;             // at the start of a line
        p++;
        stringbuffer.setsize(0);
        while (1)
        {
            dchar c = *p++;
            //printf("c = '%c'\n", c);
            switch (c)
            {
            case '\n':
            Lnextline:
                endOfLine();
                startline = 1;
                if (blankrol)
                {
                    // line end right after the heredoc identifier: not
                    // part of the string
                    blankrol = 0;
                    continue;
                }
                if (hereid)
                {
                    stringbuffer.writeUTF8(c);
                    continue;
                }
                break;
            case '\r':
                if (*p == '\n')
                    continue; // ignore
                c = '\n'; // treat EndOfLine as \n character
                goto Lnextline;
            case 0:
            case 0x1A:
                error("unterminated delimited string constant starting at %s", start.toChars());
                result.setString();
                // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
                p--;
                return;
            default:
                if (c & 0x80)
                {
                    p--;
                    c = decodeUTF();
                    p++;
                    if (c == PS || c == LS)
                        goto Lnextline;
                }
                break;
            }
            if (delimleft == 0)
            {
                // First character after the quote: it selects the delimiter.
                delimleft = c;
                nest = 1;
                nestcount = 1;
                if (c == '(')
                    delimright = ')';
                else if (c == '{')
                    delimright = '}';
                else if (c == '[')
                    delimright = ']';
                else if (c == '<')
                    delimright = '>';
                else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c)))
                {
                    // Start of identifier; must be a heredoc
                    Token tok;
                    p--;
                    scan(&tok); // read in heredoc identifier
                    if (tok.value != TOK.identifier)
                    {
                        error("identifier expected for heredoc, not %s", tok.toChars());
                        delimright = c;
                    }
                    else
                    {
                        hereid = tok.ident;
                        //printf("hereid = '%s'\n", hereid.toChars());
                        blankrol = 1;
                    }
                    nest = 0;
                }
                else
                {
                    // any other character delimits itself, without nesting
                    delimright = c;
                    nest = 0;
                    if (isspace(c))
                        error("delimiter cannot be whitespace");
                }
            }
            else
            {
                if (blankrol)
                {
                    error("heredoc rest of line should be blank");
                    blankrol = 0;
                    continue;
                }
                if (nest == 1)
                {
                    if (c == delimleft)
                        nestcount++;
                    else if (c == delimright)
                    {
                        nestcount--;
                        if (nestcount == 0)
                            goto Ldone;
                    }
                }
                else if (c == delimright)
                    goto Ldone;
                if (startline && (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c))) && hereid)
                {
                    // An identifier at the start of a line may be the
                    // closing heredoc identifier; if not, rewind.
                    Token tok;
                    auto psave = p;
                    p--;
                    scan(&tok); // read in possible heredoc identifier
                    //printf("endid = '%s'\n", tok.ident.toChars());
                    if (tok.value == TOK.identifier && tok.ident is hereid)
                    {
                        /* should check that rest of line is blank
                         */
                        goto Ldone;
                    }
                    p = psave;
                }
                stringbuffer.writeUTF8(c);
                startline = 0;
            }
        }
    Ldone:
        // the closing delimiter must be followed by the closing quote
        if (*p == '"')
            p++;
        else if (hereid)
            error("delimited string must end in %s\"", hereid.toChars());
        else
            error("delimited string must end in %c\"", delimright);
        result.setString(stringbuffer);
        stringPostfix(result);
    }

    /**
    Lex a token string. Some examples of token strings are:
    ---
    q{ foo(xxx) }    // " foo(xxx) "
    q{foo$(LPAREN)}  // "foo$(LPAREN)"
    q{{foo}"}"}      // "{foo}"}""
    ---
    It is assumed that `p` points to the opening curly-brace '{'.
    Params:
        result = pointer to the token that accepts the result
    */
    private void tokenStringConstant(Token* result)
    {
        result.value = TOK.string_;

        uint nest = 1;
        const start = loc();
        const pstart = ++p;
        inTokenStringConstant++;
        scope(exit) inTokenStringConstant--;
        // Scan whole tokens, so that braces inside nested literals or
        // comments are not counted.
        while (1)
        {
            Token tok;
            scan(&tok);
            switch (tok.value)
            {
            case TOK.leftCurly:
                nest++;
                continue;
            case TOK.rightCurly:
                if (--nest == 0)
                {
                    // the string is the source text between the braces
                    result.setString(pstart, p - 1 - pstart);
                    stringPostfix(result);
                    return;
                }
                continue;
            case TOK.endOfFile:
                error("unterminated token string constant starting at %s", start.toChars());
                result.setString();
                return;
            default:
                continue;
            }
        }
    }

    /**
    Scan a double-quoted string while building the processed string value by
    handling escape sequences. The result is returned in the given `t` token.
    This function assumes that `p` currently points to the opening double-quote
    of the string.
    Params:
        t = the token to set the resulting string to
    */
    private void escapeStringConstant(Token* t)
    {
        t.value = TOK.string_;

        const start = loc();
        p++;
        stringbuffer.setsize(0);
        while (1)
        {
            dchar c = *p++;
            switch (c)
            {
            case '\\':
                switch (*p)
                {
                case 'u':
                case 'U':
                case '&':
                    // these escapes are written UTF-8 encoded
                    c = escapeSequence();
                    stringbuffer.writeUTF8(c);
                    continue;
                default:
                    // all other escapes are written as a single byte below
                    c = escapeSequence();
                    break;
                }
                break;
            case '\n':
                endOfLine();
                break;
            case '\r':
                if (*p == '\n')
                    continue; // ignore
                c = '\n'; // treat EndOfLine as \n character
                endOfLine();
                break;
            case '"':
                // closing quote: finish the token and pick up an optional postfix
                t.setString(stringbuffer);
                stringPostfix(t);
                return;
            case 0:
            case 0x1A:
                // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
                p--;
                error("unterminated string constant starting at %s", start.toChars());
                t.setString();
                return;
            default:
                if (c & 0x80)
                {
                    // non-ASCII: step back to the lead byte and decode the whole sequence
                    p--;
                    c = decodeUTF();
                    if (c == LS || c == PS)
                    {
                        // Unicode line/paragraph separators are stored as '\n'
                        c = '\n';
                        endOfLine();
                    }
                    p++;
                    stringbuffer.writeUTF8(c);
                    continue;
                }
                break;
            }
            stringbuffer.writeByte(c);
        }
    }

    /**************************************
     * Lex a character literal.
     * `p` is advanced once before the first character is read, so it is
     * expected to point at the opening single quote on entry.
     * On success the character value is stored in `t.unsvalue`; after any
     * error `t.unsvalue` is set to '?'.
     * Params:
     *      t = the token that receives the character value
     * Returns:
     *      TOK.charLiteral, TOK.wcharLiteral or TOK.dcharLiteral
     */
    private TOK charConstant(Token* t)
    {
        TOK tk = TOK.charLiteral;
        //printf("Lexer::charConstant\n");
        p++;
        dchar c = *p++;
        switch (c)
        {
        case '\\':
            // the kind of escape decides the literal type
            switch (*p)
            {
            case 'u':
                t.unsvalue = escapeSequence();
                tk = TOK.wcharLiteral;
                break;
            case 'U':
            case '&':
                t.unsvalue = escapeSequence();
                tk = TOK.dcharLiteral;
                break;
            default:
                t.unsvalue = escapeSequence();
                break;
            }
            break;
        case '\n':
        L1:
            endOfLine();
            goto case;
        case '\r':
            goto case '\'';
        case 0:
        case 0x1A:
            // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
            p--;
            goto case;
        case '\'':
            // reached for an end of line, end of file, or an immediate closing quote
            error("unterminated character constant");
            t.unsvalue = '?';
            return tk;
        default:
            if (c & 0x80)
            {
                p--;
                c = decodeUTF();
                p++;
                if (c == LS || c == PS)
                    goto L1;
                // the decoded value selects wchar or dchar literal
                if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE))
                    tk = TOK.wcharLiteral;
                else
                    tk = TOK.dcharLiteral;
            }
            t.unsvalue = c;
            break;
        }
        if (*p != '\'')
        {
            // No closing quote right after the character: skip ahead to a
            // closing quote, an end of line/file, or one of ; ) ] } so that a
            // single error is reported for the whole malformed literal.
            while (*p != '\'' && *p != 0x1A && *p != 0 && *p != '\n' &&
                    *p != '\r' && *p != ';' && *p != ')' && *p != ']' && *p != '}')
            {
                if (*p & 0x80)
                {
                    const s = p;
                    c = decodeUTF();
                    if (c == LS || c == PS)
                    {
                        // leave the separator unconsumed
                        p = s;
                        break;
                    }
                }
                p++;
            }

            if (*p == '\'')
            {
                error("character constant has multiple characters");
                p++;
            }
            else
                error("unterminated character constant");
            t.unsvalue = '?';
            return tk;
        }
        p++;
        return tk;
    }

    /***************************************
     * Get postfix of string literal.
     * If `p` points at 'c', 'w' or 'd', that character is stored in
     * `t.postfix` and consumed; otherwise `t.postfix` is set to 0 and `p`
     * is left unchanged.
     */
    private void stringPostfix(Token* t) pure @nogc
    {
        switch (*p)
        {
        case 'c':
        case 'w':
        case 'd':
            t.postfix = *p;
            p++;
            break;
        default:
            t.postfix = 0;
            break;
        }
    }

    /**************************************
     * Read in a number.
     * If it's an integer, store it in tok.TKutok.Vlong.
     *      integers can be decimal, octal or hex
     *      Handle the suffixes U, UL, LU, L, etc.
     * If it's double, store it in tok.TKutok.Vdouble.
     * Returns:
     *      TKnum
     *      TKdouble,...
1859 */ 1860 private TOK number(Token* t) 1861 { 1862 int base = 10; 1863 const start = p; 1864 uinteger_t n = 0; // unsigned >=64 bit integer type 1865 int d; 1866 bool err = false; 1867 bool overflow = false; 1868 bool anyBinaryDigitsNoSingleUS = false; 1869 bool anyHexDigitsNoSingleUS = false; 1870 dchar c = *p; 1871 if (c == '0') 1872 { 1873 ++p; 1874 c = *p; 1875 switch (c) 1876 { 1877 case '0': 1878 case '1': 1879 case '2': 1880 case '3': 1881 case '4': 1882 case '5': 1883 case '6': 1884 case '7': 1885 case '8': 1886 case '9': 1887 base = 8; 1888 break; 1889 case 'x': 1890 case 'X': 1891 ++p; 1892 base = 16; 1893 break; 1894 case 'b': 1895 case 'B': 1896 ++p; 1897 base = 2; 1898 break; 1899 case '.': 1900 if (p[1] == '.') 1901 goto Ldone; // if ".." 1902 if (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80) 1903 goto Ldone; // if ".identifier" or ".unicode" 1904 goto Lreal; // '.' is part of current token 1905 case 'i': 1906 case 'f': 1907 case 'F': 1908 goto Lreal; 1909 case '_': 1910 ++p; 1911 base = 8; 1912 break; 1913 case 'L': 1914 if (p[1] == 'i') 1915 goto Lreal; 1916 break; 1917 default: 1918 break; 1919 } 1920 } 1921 while (1) 1922 { 1923 c = *p; 1924 switch (c) 1925 { 1926 case '0': 1927 case '1': 1928 case '2': 1929 case '3': 1930 case '4': 1931 case '5': 1932 case '6': 1933 case '7': 1934 case '8': 1935 case '9': 1936 ++p; 1937 d = c - '0'; 1938 break; 1939 case 'a': 1940 case 'b': 1941 case 'c': 1942 case 'd': 1943 case 'e': 1944 case 'f': 1945 case 'A': 1946 case 'B': 1947 case 'C': 1948 case 'D': 1949 case 'E': 1950 case 'F': 1951 ++p; 1952 if (base != 16) 1953 { 1954 if (c == 'e' || c == 'E' || c == 'f' || c == 'F') 1955 goto Lreal; 1956 } 1957 if (c >= 'a') 1958 d = c + 10 - 'a'; 1959 else 1960 d = c + 10 - 'A'; 1961 break; 1962 case 'L': 1963 if (p[1] == 'i') 1964 goto Lreal; 1965 goto Ldone; 1966 case '.': 1967 if (p[1] == '.') 1968 goto Ldone; // if ".." 
1969 if (base == 10 && (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80)) 1970 goto Ldone; // if ".identifier" or ".unicode" 1971 if (base == 16 && (!ishex(p[1]) || p[1] == '_' || p[1] & 0x80)) 1972 goto Ldone; // if ".identifier" or ".unicode" 1973 if (base == 2) 1974 goto Ldone; // if ".identifier" or ".unicode" 1975 goto Lreal; // otherwise as part of a floating point literal 1976 case 'p': 1977 case 'P': 1978 case 'i': 1979 Lreal: 1980 p = start; 1981 return inreal(t); 1982 case '_': 1983 ++p; 1984 continue; 1985 default: 1986 goto Ldone; 1987 } 1988 // got a digit here, set any necessary flags, check for errors 1989 anyHexDigitsNoSingleUS = true; 1990 anyBinaryDigitsNoSingleUS = true; 1991 if (!err && d >= base) 1992 { 1993 error("%s digit expected, not `%c`", base == 2 ? "binary".ptr : 1994 base == 8 ? "octal".ptr : 1995 "decimal".ptr, c); 1996 err = true; 1997 } 1998 // Avoid expensive overflow check if we aren't at risk of overflow 1999 if (n <= 0x0FFF_FFFF_FFFF_FFFFUL) 2000 n = n * base + d; 2001 else 2002 { 2003 import core.checkedint : mulu, addu; 2004 2005 n = mulu(n, base, overflow); 2006 n = addu(n, d, overflow); 2007 } 2008 } 2009 Ldone: 2010 if (overflow && !err) 2011 { 2012 error("integer overflow"); 2013 err = true; 2014 } 2015 if ((base == 2 && !anyBinaryDigitsNoSingleUS) || 2016 (base == 16 && !anyHexDigitsNoSingleUS)) 2017 error("`%.*s` isn't a valid integer literal, use `%.*s0` instead", cast(int)(p - start), start, 2, start); 2018 enum FLAGS : int 2019 { 2020 none = 0, 2021 decimal = 1, // decimal 2022 unsigned = 2, // u or U suffix 2023 long_ = 4, // L suffix 2024 } 2025 2026 FLAGS flags = (base == 10) ? FLAGS.decimal : FLAGS.none; 2027 // Parse trailing 'u', 'U', 'l' or 'L' in any combination 2028 const psuffix = p; 2029 while (1) 2030 { 2031 FLAGS f; 2032 switch (*p) 2033 { 2034 case 'U': 2035 case 'u': 2036 f = FLAGS.unsigned; 2037 goto L1; 2038 case 'l': 2039 f = FLAGS.long_; 2040 error("lower case integer suffix 'l' is not allowed. 
Please use 'L' instead"); 2041 goto L1; 2042 case 'L': 2043 f = FLAGS.long_; 2044 L1: 2045 p++; 2046 if ((flags & f) && !err) 2047 { 2048 error("unrecognized token"); 2049 err = true; 2050 } 2051 flags = cast(FLAGS)(flags | f); 2052 continue; 2053 default: 2054 break; 2055 } 2056 break; 2057 } 2058 if (base == 8 && n >= 8) 2059 { 2060 if (err) 2061 // can't translate invalid octal value, just show a generic message 2062 error("octal literals larger than 7 are no longer supported"); 2063 else 2064 error("octal literals `0%llo%.*s` are no longer supported, use `std.conv.octal!%llo%.*s` instead", 2065 n, cast(int)(p - psuffix), psuffix, n, cast(int)(p - psuffix), psuffix); 2066 } 2067 TOK result; 2068 switch (flags) 2069 { 2070 case FLAGS.none: 2071 /* Octal or Hexadecimal constant. 2072 * First that fits: int, uint, long, ulong 2073 */ 2074 if (n & 0x8000000000000000L) 2075 result = TOK.uns64Literal; 2076 else if (n & 0xFFFFFFFF00000000L) 2077 result = TOK.int64Literal; 2078 else if (n & 0x80000000) 2079 result = TOK.uns32Literal; 2080 else 2081 result = TOK.int32Literal; 2082 break; 2083 case FLAGS.decimal: 2084 /* First that fits: int, long, long long 2085 */ 2086 if (n & 0x8000000000000000L) 2087 { 2088 result = TOK.uns64Literal; 2089 } 2090 else if (n & 0xFFFFFFFF80000000L) 2091 result = TOK.int64Literal; 2092 else 2093 result = TOK.int32Literal; 2094 break; 2095 case FLAGS.unsigned: 2096 case FLAGS.decimal | FLAGS.unsigned: 2097 /* First that fits: uint, ulong 2098 */ 2099 if (n & 0xFFFFFFFF00000000L) 2100 result = TOK.uns64Literal; 2101 else 2102 result = TOK.uns32Literal; 2103 break; 2104 case FLAGS.decimal | FLAGS.long_: 2105 if (n & 0x8000000000000000L) 2106 { 2107 if (!err) 2108 { 2109 error("signed integer overflow"); 2110 err = true; 2111 } 2112 result = TOK.uns64Literal; 2113 } 2114 else 2115 result = TOK.int64Literal; 2116 break; 2117 case FLAGS.long_: 2118 if (n & 0x8000000000000000L) 2119 result = TOK.uns64Literal; 2120 else 2121 result = 
TOK.int64Literal; 2122 break; 2123 case FLAGS.unsigned | FLAGS.long_: 2124 case FLAGS.decimal | FLAGS.unsigned | FLAGS.long_: 2125 result = TOK.uns64Literal; 2126 break; 2127 default: 2128 debug 2129 { 2130 printf("%x\n", flags); 2131 } 2132 assert(0); 2133 } 2134 t.unsvalue = n; 2135 return result; 2136 } 2137 2138 /************************************** 2139 * Read in characters, converting them to real. 2140 * Bugs: 2141 * Exponent overflow not detected. 2142 * Too much requested precision is not detected. 2143 */ 2144 private TOK inreal(Token* t) 2145 { 2146 //printf("Lexer::inreal()\n"); 2147 debug 2148 { 2149 assert(*p == '.' || isdigit(*p)); 2150 } 2151 bool isWellformedString = true; 2152 stringbuffer.setsize(0); 2153 auto pstart = p; 2154 bool hex = false; 2155 dchar c = *p++; 2156 // Leading '0x' 2157 if (c == '0') 2158 { 2159 c = *p++; 2160 if (c == 'x' || c == 'X') 2161 { 2162 hex = true; 2163 c = *p++; 2164 } 2165 } 2166 // Digits to left of '.' 2167 while (1) 2168 { 2169 if (c == '.') 2170 { 2171 c = *p++; 2172 break; 2173 } 2174 if (isdigit(c) || (hex && isxdigit(c)) || c == '_') 2175 { 2176 c = *p++; 2177 continue; 2178 } 2179 break; 2180 } 2181 // Digits to right of '.' 
2182 while (1) 2183 { 2184 if (isdigit(c) || (hex && isxdigit(c)) || c == '_') 2185 { 2186 c = *p++; 2187 continue; 2188 } 2189 break; 2190 } 2191 if (c == 'e' || c == 'E' || (hex && (c == 'p' || c == 'P'))) 2192 { 2193 c = *p++; 2194 if (c == '-' || c == '+') 2195 { 2196 c = *p++; 2197 } 2198 bool anyexp = false; 2199 while (1) 2200 { 2201 if (isdigit(c)) 2202 { 2203 anyexp = true; 2204 c = *p++; 2205 continue; 2206 } 2207 if (c == '_') 2208 { 2209 c = *p++; 2210 continue; 2211 } 2212 if (!anyexp) 2213 { 2214 error("missing exponent"); 2215 isWellformedString = false; 2216 } 2217 break; 2218 } 2219 } 2220 else if (hex) 2221 { 2222 error("exponent required for hex float"); 2223 isWellformedString = false; 2224 } 2225 --p; 2226 while (pstart < p) 2227 { 2228 if (*pstart != '_') 2229 stringbuffer.writeByte(*pstart); 2230 ++pstart; 2231 } 2232 stringbuffer.writeByte(0); 2233 auto sbufptr = cast(const(char)*)stringbuffer[].ptr; 2234 TOK result; 2235 bool isOutOfRange = false; 2236 t.floatvalue = (isWellformedString ? 
CTFloat.parse(sbufptr, &isOutOfRange) : CTFloat.zero); 2237 switch (*p) 2238 { 2239 case 'F': 2240 case 'f': 2241 if (isWellformedString && !isOutOfRange) 2242 isOutOfRange = Port.isFloat32LiteralOutOfRange(sbufptr); 2243 result = TOK.float32Literal; 2244 p++; 2245 break; 2246 default: 2247 if (isWellformedString && !isOutOfRange) 2248 isOutOfRange = Port.isFloat64LiteralOutOfRange(sbufptr); 2249 result = TOK.float64Literal; 2250 break; 2251 case 'l': 2252 error("use 'L' suffix instead of 'l'"); 2253 goto case 'L'; 2254 case 'L': 2255 result = TOK.float80Literal; 2256 p++; 2257 break; 2258 } 2259 if (*p == 'i' || *p == 'I') 2260 { 2261 if (*p == 'I') 2262 error("use 'i' suffix instead of 'I'"); 2263 p++; 2264 switch (result) 2265 { 2266 case TOK.float32Literal: 2267 result = TOK.imaginary32Literal; 2268 break; 2269 case TOK.float64Literal: 2270 result = TOK.imaginary64Literal; 2271 break; 2272 case TOK.float80Literal: 2273 result = TOK.imaginary80Literal; 2274 break; 2275 default: 2276 break; 2277 } 2278 } 2279 const isLong = (result == TOK.float80Literal || result == TOK.imaginary80Literal); 2280 if (isOutOfRange && !isLong) 2281 { 2282 const char* suffix = (result == TOK.float32Literal || result == TOK.imaginary32Literal) ? "f" : ""; 2283 error(scanloc, "number `%s%s` is not representable", sbufptr, suffix); 2284 } 2285 debug 2286 { 2287 switch (result) 2288 { 2289 case TOK.float32Literal: 2290 case TOK.float64Literal: 2291 case TOK.float80Literal: 2292 case TOK.imaginary32Literal: 2293 case TOK.imaginary64Literal: 2294 case TOK.imaginary80Literal: 2295 break; 2296 default: 2297 assert(0); 2298 } 2299 } 2300 return result; 2301 } 2302 2303 final Loc loc() pure @nogc 2304 { 2305 scanloc.charnum = cast(uint)(1 + p - line); 2306 version (LocOffset) 2307 scanloc.fileOffset = cast(uint)(p - base); 2308 return scanloc; 2309 } 2310 2311 final void error(const(char)* format, ...) 
2312 { 2313 va_list args; 2314 va_start(args, format); 2315 handleDiagnostic(token.loc, Severity.error, format, args); 2316 va_end(args); 2317 } 2318 2319 final void error(const ref Loc loc, const(char)* format, ...) 2320 { 2321 va_list args; 2322 va_start(args, format); 2323 handleDiagnostic(loc, Severity.error, format, args); 2324 va_end(args); 2325 } 2326 2327 final void errorSupplemental(const ref Loc loc, const(char)* format, ...) 2328 { 2329 va_list args; 2330 va_start(args, format); 2331 handleDiagnostic(loc, Severity.error, format, args, true); 2332 va_end(args); 2333 } 2334 2335 final void warning(const ref Loc loc, const(char)* format, ...) 2336 { 2337 va_list args; 2338 va_start(args, format); 2339 handleDiagnostic(loc, Severity.warning, format, args); 2340 va_end(args); 2341 } 2342 2343 final void warningSupplemental(const ref Loc loc, const(char)* format, ...) 2344 { 2345 va_list args; 2346 va_start(args, format); 2347 handleDiagnostic(loc, Severity.warning, format, args, true); 2348 va_end(args); 2349 } 2350 2351 final void deprecation(const(char)* format, ...) 2352 { 2353 va_list args; 2354 va_start(args, format); 2355 handleDiagnostic(token.loc, Severity.deprecation, format, args); 2356 va_end(args); 2357 } 2358 2359 final void deprecationSupplemental(const(char)* format, ...) 
2360 { 2361 va_list args; 2362 va_start(args, format); 2363 handleDiagnostic(token.loc, Severity.deprecation, format, args, true); 2364 va_end(args); 2365 } 2366 2367 /********************************************* 2368 * parse: 2369 * #line linnum [filespec] 2370 * also allow __LINE__ for linnum, and __FILE__ for filespec 2371 */ 2372 private void poundLine() 2373 { 2374 auto linnum = this.scanloc.linnum; 2375 const(char)* filespec = null; 2376 const loc = this.loc(); 2377 Token tok; 2378 scan(&tok); 2379 if (tok.value == TOK.int32Literal || tok.value == TOK.int64Literal) 2380 { 2381 const lin = cast(int)(tok.unsvalue - 1); 2382 if (lin != tok.unsvalue - 1) 2383 error("line number `%lld` out of range", cast(ulong)tok.unsvalue); 2384 else 2385 linnum = lin; 2386 } 2387 else if (tok.value == TOK.line) 2388 { 2389 } 2390 else 2391 goto Lerr; 2392 while (1) 2393 { 2394 switch (*p) 2395 { 2396 case 0: 2397 case 0x1A: 2398 case '\n': 2399 Lnewline: 2400 if (!inTokenStringConstant) 2401 { 2402 this.scanloc.linnum = linnum; 2403 if (filespec) 2404 this.scanloc.filename = filespec; 2405 } 2406 return; 2407 case '\r': 2408 p++; 2409 if (*p != '\n') 2410 { 2411 p--; 2412 goto Lnewline; 2413 } 2414 continue; 2415 case ' ': 2416 case '\t': 2417 case '\v': 2418 case '\f': 2419 p++; 2420 continue; // skip white space 2421 case '_': 2422 if (memcmp(p, "__FILE__".ptr, 8) == 0) 2423 { 2424 p += 8; 2425 filespec = mem.xstrdup(scanloc.filename); 2426 continue; 2427 } 2428 goto Lerr; 2429 case '"': 2430 if (filespec) 2431 goto Lerr; 2432 stringbuffer.setsize(0); 2433 p++; 2434 while (1) 2435 { 2436 uint c; 2437 c = *p; 2438 switch (c) 2439 { 2440 case '\n': 2441 case '\r': 2442 case 0: 2443 case 0x1A: 2444 goto Lerr; 2445 case '"': 2446 stringbuffer.writeByte(0); 2447 filespec = mem.xstrdup(cast(const(char)*)stringbuffer[].ptr); 2448 p++; 2449 break; 2450 default: 2451 if (c & 0x80) 2452 { 2453 uint u = decodeUTF(); 2454 if (u == PS || u == LS) 2455 goto Lerr; 2456 } 2457 
stringbuffer.writeByte(c); 2458 p++; 2459 continue; 2460 } 2461 break; 2462 } 2463 continue; 2464 default: 2465 if (*p & 0x80) 2466 { 2467 uint u = decodeUTF(); 2468 if (u == PS || u == LS) 2469 goto Lnewline; 2470 } 2471 goto Lerr; 2472 } 2473 } 2474 Lerr: 2475 error(loc, "#line integer [\"filespec\"]\\n expected"); 2476 } 2477 2478 /******************************************** 2479 * Decode UTF character. 2480 * Issue error messages for invalid sequences. 2481 * Return decoded character, advance p to last character in UTF sequence. 2482 */ 2483 private uint decodeUTF() 2484 { 2485 const s = p; 2486 assert(*s & 0x80); 2487 // Check length of remaining string up to 4 UTF-8 characters 2488 size_t len; 2489 for (len = 1; len < 4 && s[len]; len++) 2490 { 2491 } 2492 size_t idx = 0; 2493 dchar u; 2494 const msg = utf_decodeChar(s[0 .. len], idx, u); 2495 p += idx - 1; 2496 if (msg) 2497 { 2498 error("%.*s", cast(int)msg.length, msg.ptr); 2499 } 2500 return u; 2501 } 2502 2503 /*************************************************** 2504 * Parse doc comment embedded between t.ptr and p. 2505 * Remove trailing blanks and tabs from lines. 2506 * Replace all newlines with \n. 2507 * Remove leading comment character from each line. 2508 * Decide if it's a lineComment or a blockComment. 2509 * Append to previous one for this token. 2510 * 2511 * If newParagraph is true, an extra newline will be 2512 * added between adjoining doc comments. 
     */
    private void getDocComment(Token* t, uint lineComment, bool newParagraph) pure
    {
        /* ct tells us which kind of comment it is: '/', '*', or '+'
         */
        const ct = t.ptr[2];
        /* Start of comment text skips over / * *, / + +, or / / /
         */
        const(char)* q = t.ptr + 3; // start of comment text
        const(char)* qend = p;
        // block comments end with a two-character terminator that is not part of the text
        if (ct == '*' || ct == '+')
            qend -= 2;
        /* Scan over initial row of ****'s or ++++'s or ////'s
         */
        for (; q < qend; q++)
        {
            if (*q != ct)
                break;
        }
        /* Remove leading spaces until start of the comment
         */
        int linestart = 0; // non-zero while only whitespace has been seen on the current line
        if (ct == '/')
        {
            while (q < qend && (*q == ' ' || *q == '\t'))
                ++q;
        }
        else if (q < qend)
        {
            // for block comments, skip a newline that directly follows the opening row
            if (*q == '\r')
            {
                ++q;
                if (q < qend && *q == '\n')
                    ++q;
                linestart = 1;
            }
            else if (*q == '\n')
            {
                ++q;
                linestart = 1;
            }
        }
        /* Remove trailing row of ****'s or ++++'s
         */
        if (ct != '/')
        {
            for (; q < qend; qend--)
            {
                if (qend[-1] != ct)
                    break;
            }
        }
        /* Comment is now [q .. qend].
         * Canonicalize it into buf[].
         */
        OutBuffer buf;

        // Drop any blanks and tabs at the end of what has been written to buf so far
        void trimTrailingWhitespace()
        {
            const s = buf[];
            auto len = s.length;
            while (len && (s[len - 1] == ' ' || s[len - 1] == '\t'))
                --len;
            buf.setsize(len);
        }

        for (; q < qend; q++)
        {
            char c = *q;
            switch (c)
            {
            case '*':
            case '+':
                // the first comment character on a line is decoration, not text
                if (linestart && c == ct)
                {
                    linestart = 0;
                    /* Trim preceding whitespace up to preceding \n
                     */
                    trimTrailingWhitespace();
                    continue;
                }
                break;
            case ' ':
            case '\t':
                break;
            case '\r':
                if (q[1] == '\n')
                    continue; // skip the \r
                goto Lnewline;
            default:
                if (c == 226)
                {
                    // If LS or PS
                    // (bytes 226 128 168 / 226 128 169 are their UTF-8 encodings)
                    if (q[1] == 128 && (q[2] == 168 || q[2] == 169))
                    {
                        q += 2;
                        goto Lnewline;
                    }
                }
                linestart = 0;
                break;
            Lnewline:
                c = '\n'; // replace all newlines with \n
                goto case;
            case '\n':
                linestart = 1;
                /* Trim trailing whitespace
                 */
                trimTrailingWhitespace();
                break;
            }
            buf.writeByte(c);
        }
        /* Trim trailing whitespace (if the last line does not have newline)
         */
        trimTrailingWhitespace();

        // Always end with a newline
        const s = buf[];
        if (s.length == 0 || s[$ - 1] != '\n')
            buf.writeByte('\n');

        // It's a line comment if the start of the doc comment comes
        // after other non-whitespace on the same line.
        auto dc = (lineComment && anyToken) ? &t.lineComment : &t.blockComment;
        // Combine with previous doc comment, if any
        if (*dc)
            *dc = combineComments(*dc, buf[], newParagraph).toDString();
        else
            *dc = buf.extractSlice(true);
    }

    /********************************************
     * Combine two document comments into one,
     * separated by an extra newline if newParagraph is true.
2648 */ 2649 static const(char)* combineComments(const(char)[] c1, const(char)[] c2, bool newParagraph) pure 2650 { 2651 //printf("Lexer::combineComments('%s', '%s', '%i')\n", c1, c2, newParagraph); 2652 const(int) newParagraphSize = newParagraph ? 1 : 0; // Size of the combining '\n' 2653 if (!c1) 2654 return c2.ptr; 2655 if (!c2) 2656 return c1.ptr; 2657 2658 int insertNewLine = 0; 2659 if (c1.length && c1[$ - 1] != '\n') 2660 insertNewLine = 1; 2661 const retSize = c1.length + insertNewLine + newParagraphSize + c2.length; 2662 auto p = cast(char*)mem.xmalloc_noscan(retSize + 1); 2663 p[0 .. c1.length] = c1[]; 2664 if (insertNewLine) 2665 p[c1.length] = '\n'; 2666 if (newParagraph) 2667 p[c1.length + insertNewLine] = '\n'; 2668 p[retSize - c2.length .. retSize] = c2[]; 2669 p[retSize] = 0; 2670 return p; 2671 } 2672 2673 private: 2674 void endOfLine() pure @nogc @safe 2675 { 2676 scanloc.linnum++; 2677 line = p; 2678 } 2679 } 2680 2681 /// Support for `__DATE__`, `__TIME__`, and `__TIMESTAMP__` 2682 private struct TimeStampInfo 2683 { 2684 private __gshared bool initdone = false; 2685 2686 // Note: Those properties need to be guarded by a call to `init` 2687 // The API isn't safe, and quite brittle, but it was left this way 2688 // over performance concerns. 2689 // This is currently only called once, from the lexer. 
2690 __gshared char[11 + 1] date; 2691 __gshared char[8 + 1] time; 2692 __gshared char[24 + 1] timestamp; 2693 2694 public static void initialize(const ref Loc loc) nothrow 2695 { 2696 if (initdone) 2697 return; 2698 2699 initdone = true; 2700 time_t ct; 2701 // https://issues.dlang.org/show_bug.cgi?id=20444 2702 if (auto p = getenv("SOURCE_DATE_EPOCH")) 2703 { 2704 if (!ct.parseDigits(p.toDString())) 2705 error(loc, "Value of environment variable `SOURCE_DATE_EPOCH` should be a valid UNIX timestamp, not: `%s`", p); 2706 } 2707 else 2708 .time(&ct); 2709 const p = ctime(&ct); 2710 assert(p); 2711 sprintf(&date[0], "%.6s %.4s", p + 4, p + 20); 2712 sprintf(&time[0], "%.8s", p + 11); 2713 sprintf(×tamp[0], "%.24s", p); 2714 } 2715 } 2716 2717 unittest 2718 { 2719 import dmd.console; 2720 nothrow bool assertDiagnosticHandler(const ref Loc loc, Color headerColor, const(char)* header, 2721 const(char)* format, va_list ap, const(char)* p1, const(char)* p2) 2722 { 2723 assert(0); 2724 } 2725 diagnosticHandler = &assertDiagnosticHandler; 2726 2727 static void test(T)(string sequence, T expected) 2728 { 2729 auto p = cast(const(char)*)sequence.ptr; 2730 assert(expected == Lexer.escapeSequence(Loc.initial, p)); 2731 assert(p == sequence.ptr + sequence.length); 2732 } 2733 2734 test(`'`, '\''); 2735 test(`"`, '"'); 2736 test(`?`, '?'); 2737 test(`\`, '\\'); 2738 test(`0`, '\0'); 2739 test(`a`, '\a'); 2740 test(`b`, '\b'); 2741 test(`f`, '\f'); 2742 test(`n`, '\n'); 2743 test(`r`, '\r'); 2744 test(`t`, '\t'); 2745 test(`v`, '\v'); 2746 2747 test(`x00`, 0x00); 2748 test(`xff`, 0xff); 2749 test(`xFF`, 0xff); 2750 test(`xa7`, 0xa7); 2751 test(`x3c`, 0x3c); 2752 test(`xe2`, 0xe2); 2753 2754 test(`1`, '\1'); 2755 test(`42`, '\42'); 2756 test(`357`, '\357'); 2757 2758 test(`u1234`, '\u1234'); 2759 test(`uf0e4`, '\uf0e4'); 2760 2761 test(`U0001f603`, '\U0001f603'); 2762 2763 test(`"`, '"'); 2764 test(`<`, '<'); 2765 test(`>`, '>'); 2766 2767 diagnosticHandler = null; 2768 } 2769 
// Invalid escape sequences: each must issue exactly the expected error
// message, return the expected value and consume the expected number of
// characters.
unittest
{
    import dmd.console;
    string expected;
    bool gotError;

    nothrow bool expectDiagnosticHandler(const ref Loc loc, Color headerColor, const(char)* header,
                                         const(char)* format, va_list ap, const(char)* p1, const(char)* p2)
    {
        assert(cast(Classification)headerColor == Classification.error);

        gotError = true;
        char[100] buffer = void;
        auto actual = buffer[0 .. vsprintf(buffer.ptr, format, ap)];
        assert(expected == actual);
        return true;
    }

    diagnosticHandler = &expectDiagnosticHandler;

    void test(string sequence, string expectedError, dchar expectedReturnValue, uint expectedScanLength)
    {
        // restore the global error count afterwards so the expected errors don't leak out of the test
        uint errors = global.errors;
        gotError = false;
        expected = expectedError;
        auto p = cast(const(char)*)sequence.ptr;
        auto actualReturnValue = Lexer.escapeSequence(Loc.initial, p);
        assert(gotError);
        assert(expectedReturnValue == actualReturnValue);

        auto actualScanLength = p - sequence.ptr;
        assert(expectedScanLength == actualScanLength);
        global.errors = errors;
    }

    test("c", `undefined escape sequence \c`, 'c', 1);
    test("!", `undefined escape sequence \!`, '!', 1);

    test("x1", `escape hex sequence has 1 hex digits instead of 2`, '\x01', 2);

    test("u1"  , `escape hex sequence has 1 hex digits instead of 4`,   0x1, 2);
    test("u12" , `escape hex sequence has 2 hex digits instead of 4`,  0x12, 3);
    test("u123", `escape hex sequence has 3 hex digits instead of 4`, 0x123, 4);

    test("U0"      , `escape hex sequence has 1 hex digits instead of 8`,       0x0, 2);
    test("U00"     , `escape hex sequence has 2 hex digits instead of 8`,      0x00, 3);
    test("U000"    , `escape hex sequence has 3 hex digits instead of 8`,     0x000, 4);
    test("U0000"   , `escape hex sequence has 4 hex digits instead of 8`,    0x0000, 5);
    test("U0001f"  , `escape hex sequence has 5 hex digits instead of 8`,   0x0001f, 6);
    test("U0001f6" , `escape hex sequence has 6 hex digits instead of 8`,  0x0001f6, 7);
    test("U0001f60", `escape hex sequence has 7 hex digits instead of 8`, 0x0001f60, 8);

    test("ud800"    , `invalid UTF character \U0000d800`, '?', 5);
    test("udfff"    , `invalid UTF character \U0000dfff`, '?', 5);
    test("U00110000", `invalid UTF character \U00110000`, '?', 9);

    test("xg0"      , `undefined escape hex sequence \xg`, 'g', 2);
    test("ug000"    , `undefined escape hex sequence \ug`, 'g', 2);
    test("Ug0000000", `undefined escape hex sequence \Ug`, 'g', 2);

    test("&BAD;", `unnamed character entity &BAD;`  , '?', 5);
    // named entity without its terminating ';' (5 characters are scanned)
    test("&quot", `unterminated named entity &quot`, '?', 5);

    test("400", `escape octal sequence \400 is larger than \377`, 0x100, 3);

    diagnosticHandler = null;
}