/**
 * Implements the lexical analyzer, which converts source code into lexical tokens.
 *
 * Specification: $(LINK2 https://dlang.org/spec/lex.html, Lexical)
 *
 * Copyright:   Copyright (C) 1999-2020 by The D Language Foundation, All Rights Reserved
 * Authors:     $(LINK2 http://www.digitalmars.com, Walter Bright)
 * License:     $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 * Source:      $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/lexer.d, _lexer.d)
 * Documentation:  https://dlang.org/phobos/dmd_lexer.html
 * Coverage:    https://codecov.io/gh/dlang/dmd/src/master/src/dmd/lexer.d
 */

module dmd.lexer;

import core.stdc.ctype;
import core.stdc.errno;
import core.stdc.stdarg;
import core.stdc.stdio;
import core.stdc.stdlib : getenv;
import core.stdc.string;
import core.stdc.time;

import dmd.diagnostic : DiagnosticHandler, Severity, DefaultDiagnosticHandler, DefaultDiagnosticReporter;
import dmd.entity;
import dmd.errors;
import dmd.globals;
import dmd.id;
import dmd.identifier;
import dmd.root.ctfloat;
import dmd.root.outbuffer;
import dmd.root.port;
import dmd.root.rmem;
import dmd.root.string;
import dmd.tokens;
import dmd.utf;
import dmd.utils;

nothrow:

private enum LS = 0x2028;       // UTF line separator
private enum PS = 0x2029;       // UTF paragraph separator

/********************************************
 * Do our own char maps
 *
 * `cmtable` is a 256-entry table, built at compile time by the lambda below,
 * holding a bit set of `CM*` flags for every possible byte value. The `is*`
 * helpers further down test one flag each.
 */
private static immutable cmtable = () {
    ubyte[256] table;
    foreach (const c; 0 .. table.length)
    {
        // Basic character classes.
        if ('0' <= c && c <= '7')
            table[c] |= CMoctal;
        if (c_isxdigit(c))
            table[c] |= CMhex;
        if (c_isalnum(c) || c == '_')
            table[c] |= CMidchar;

        // Characters that, when they directly follow a leading digit, mean the
        // numeric literal continues. `scan` tests these on p[1]:
        // CMzerosecond after a leading '0', CMdigitsecond after '1'..'9'.
        switch (c)
        {
            // Only meaningful after a leading '0' (hex / binary prefixes).
            case 'x': case 'X':
            case 'b': case 'B':
                table[c] |= CMzerosecond;
                break;

            case '0': .. case '9':
            case 'e': case 'E':
            case 'f': case 'F':
            case 'l': case 'L':
            case 'p': case 'P':
            case 'u': case 'U':
            case 'i':
            case '.':
            case '_':
                table[c] |= CMzerosecond | CMdigitsecond;
                break;

            default:
                break;
        }

        // CMsinglechar: 7-bit characters that `scan` accepts as the body of a
        // simple 'c' character literal without calling charConstant().
        // Backslash, line ends, the two end-of-file markers and the quote
        // itself are excluded, as is every byte with the high bit set.
        switch (c)
        {
            case '\\':
            case '\n':
            case '\r':
            case 0:
            case 0x1A:
            case '\'':
                break;
            default:
                if (!(c & 0x80))
                    table[c] |= CMsinglechar;
                break;
        }
    }
    return table;
}();

// Bit flags stored in `cmtable`.
private
{
    enum CMoctal  = 0x1;        // '0' .. '7'
    enum CMhex    = 0x2;        // hexadecimal digit
    enum CMidchar = 0x4;        // ASCII letter, digit or '_'
    enum CMzerosecond = 0x8;    // may follow a leading '0' in a numeric literal
    enum CMdigitsecond = 0x10;  // may follow a leading '1'..'9' in a numeric literal
    enum CMsinglechar = 0x20;   // may be the body of a simple one-character literal
}

/// Returns: `true` if `c` is an octal digit ('0' .. '7').
private bool isoctal(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMoctal) != 0;
}

/// Returns: `true` if `c` is a hexadecimal digit.
private bool ishex(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMhex) != 0;
}

/// Returns: `true` if `c` is an ASCII letter, an ASCII digit or '_'.
private bool isidchar(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMidchar) != 0;
}

/// Returns: `true` if `c` carries the `CMzerosecond` flag (see `cmtable`).
private bool isZeroSecond(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMzerosecond) != 0;
}

/// Returns: `true` if `c` carries the `CMdigitsecond` flag (see `cmtable`).
private bool isDigitSecond(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMdigitsecond) != 0;
}

/// Returns: `true` if `c` carries the `CMsinglechar` flag (see `cmtable`).
private bool issinglechar(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMsinglechar) != 0;
}

/// ASCII-only test for a hexadecimal digit; used to build `cmtable`.
private bool c_isxdigit(const int c) pure @nogc @safe
{
    return (( c >= '0' && c <= '9') ||
            ( c >= 'a' && c <= 'f') ||
            ( c >= 'A' && c <= 'F'));
}

/// ASCII-only test for a letter or digit; used to build `cmtable`.
private bool c_isalnum(const int c) pure @nogc @safe
{
    return (( c >= '0' && c <= '9') ||
            ( c >= 'a' && c <= 'z') ||
            ( c >= 'A' && c <= 'Z'));
}

// Smoke test: lexing "int" yields TOK.int32 once and TOK.endOfFile on every
// call after that.
unittest
{
    //printf("lexer.unittest\n");
    /* Not much here, just trying things out.
     */
    string text = "int"; // We rely on the implicit null-terminator
    DefaultDiagnosticHandler diagnosticHandler;
    scope Lexer lex1 = new Lexer(null, text.ptr, 0, text.length, 0, 0, diagnosticHandler.diagnosticHandler);
    TOK tok;
    tok = lex1.nextToken();
    // NOTE(review): report() is declared in dmd.diagnostic, not visible here;
    // presumably it emits the diagnostics collected so far - confirm.
    diagnosticHandler.report();
    //printf("tok == %s, %d, %d\n", Token::toChars(tok), tok, TOK.int32);
    assert(tok == TOK.int32);
    tok = lex1.nextToken();
    diagnosticHandler.report();
    assert(tok == TOK.endOfFile);
    tok = lex1.nextToken();
    diagnosticHandler.report();
    assert(tok == TOK.endOfFile);
    tok = lex1.nextToken();
    diagnosticHandler.report();
    assert(tok == TOK.endOfFile);
}

unittest
{
    // We don't want to see Lexer error output during these tests.
    uint errors = global.startGagging();
    scope(exit) global.endGagging(errors);

    // Test malformed input: even malformed input should end in a TOK.endOfFile.
    static immutable char[][] testcases =
    [   // Testcase must end with 0 or 0x1A.
        [0], // not malformed, but pathological
        ['\'', 0],
        ['\'', 0x1A],
        ['{', '{', 'q', '{', 0],
        [0xFF, 0],
        [0xFF, 0x80, 0],
        [0xFF, 0xFF, 0],
        // NOTE(review): identical to the previous vector; looks like an
        // accidental duplicate - confirm before removing.
        [0xFF, 0xFF, 0],
        ['x', '"', 0x1A],
    ];

    foreach (testcase; testcases)
    {
        DefaultDiagnosticHandler diagnosticHandler;
        // endoffset is length-1 so that it indexes the terminating 0 / 0x1A.
        scope Lexer lex2 = new Lexer(null, testcase.ptr, 0, testcase.length-1, 0, 0, diagnosticHandler.diagnosticHandler);
        TOK tok = lex2.nextToken();
        diagnosticHandler.report();
        size_t iterations = 1;
        // The iteration cap keeps a lexer that never reaches EOF from
        // hanging the test; the assert below then fails instead.
        while ((tok != TOK.endOfFile) && (iterations++ < testcase.length))
        {
            tok = lex2.nextToken();
        }
        assert(tok == TOK.endOfFile);
        // EOF must be sticky.
        tok = lex2.nextToken();
        assert(tok == TOK.endOfFile);
    }
}

/***********************************************************
 * The lexer: walks a 0 / 0x1A terminated source buffer and produces `Token`s.
 */
class Lexer
{
    // Scratch buffer in which the string-literal lexers assemble their result.
    // `__gshared`: a single instance shared by all Lexer objects and threads.
    private __gshared OutBuffer stringbuffer;

    Loc scanloc;                // for error messages
    Loc prevloc;                // location of token before current

    const(char)* p;             // current character

    Token token;                // current token; `token.next` chains peeked tokens

    private
    {
        const(char)* base;      // pointer to start of buffer
        const(char)* end;       // pointer to last element of buffer
        const(char)* line;      // start of current line

        bool doDocComment;      // collect doc comment information
        bool anyToken;          // seen at least one token
        bool commentToken;      // comments are TOK.comment's
        int inTokenStringConstant; // can be larger than 1 when in nested q{} strings
        int lastDocLine;        // last line of previous doc comment

        Token* tokenFreelist;   // released tokens, reused by allocateToken()
        DiagnosticHandler handleDiagnostic; // set by the constructor
        // NOTE(review): not referenced in the part of the class visible here.
        DefaultDiagnosticReporter diagnosticReporter;
    }

  nothrow:

    /*********************
     * Creates a Lexer for the source code base[begoffset..endoffset+1].
     * The last character, base[endoffset], must be null (0) or EOF (0x1A).
249 * 250 * Params: 251 * filename = used for error messages 252 * base = source code, must be terminated by a null (0) or EOF (0x1A) character 253 * begoffset = starting offset into base[] 254 * endoffset = the last offset to read into base[] 255 * doDocComment = handle documentation comments 256 * commentToken = comments become TOK.comment's 257 * diagnosticHandler = diagnostic handler 258 */ 259 this(const(char)* filename, const(char)* base, size_t begoffset, 260 size_t endoffset, bool doDocComment, bool commentToken, 261 DiagnosticHandler handleDiagnostic) pure 262 { 263 scanloc = Loc(filename, 1, 1); 264 //printf("Lexer::Lexer(%p,%d)\n",base,length); 265 //printf("lexer.filename = %s\n", filename); 266 token = Token.init; 267 this.base = base; 268 this.end = base + endoffset; 269 p = base + begoffset; 270 line = p; 271 this.doDocComment = doDocComment; 272 this.commentToken = commentToken; 273 this.inTokenStringConstant = 0; 274 this.lastDocLine = 0; 275 this.handleDiagnostic = handleDiagnostic; 276 277 //initKeywords(); 278 /* If first line starts with '#!', ignore the line 279 */ 280 if (p && p[0] == '#' && p[1] == '!') 281 { 282 p += 2; 283 while (1) 284 { 285 char c = *p++; 286 switch (c) 287 { 288 case 0: 289 case 0x1A: 290 p--; 291 goto case; 292 case '\n': 293 break; 294 default: 295 continue; 296 } 297 break; 298 } 299 endOfLine(); 300 } 301 } 302 303 /// Returns: a newly allocated `Token`. 304 Token* allocateToken() pure nothrow @safe 305 { 306 if (tokenFreelist) 307 { 308 Token* t = tokenFreelist; 309 tokenFreelist = t.next; 310 t.next = null; 311 return t; 312 } 313 return new Token(); 314 } 315 316 /// Frees the given token by returning it to the freelist. 
317 private void releaseToken(Token* token) pure nothrow @nogc @safe 318 { 319 if (mem.isGCEnabled) 320 *token = Token.init; 321 token.next = tokenFreelist; 322 tokenFreelist = token; 323 } 324 325 TOK nextToken() 326 { 327 prevloc = token.loc; 328 if (token.next) 329 { 330 Token* t = token.next; 331 memcpy(&token, t, Token.sizeof); 332 releaseToken(t); 333 } 334 else 335 { 336 scan(&token); 337 } 338 //printf(token.toChars()); 339 return token.value; 340 } 341 342 /*********************** 343 * Look ahead at next token's value. 344 */ 345 final TOK peekNext() 346 { 347 return peek(&token).value; 348 } 349 350 /*********************** 351 * Look 2 tokens ahead at value. 352 */ 353 final TOK peekNext2() 354 { 355 Token* t = peek(&token); 356 return peek(t).value; 357 } 358 359 /**************************** 360 * Turn next token in buffer into a token. 361 */ 362 final void scan(Token* t) 363 { 364 const lastLine = scanloc.linnum; 365 Loc startLoc; 366 t.blockComment = null; 367 t.lineComment = null; 368 369 while (1) 370 { 371 t.ptr = p; 372 //printf("p = %p, *p = '%c'\n",p,*p); 373 t.loc = loc(); 374 switch (*p) 375 { 376 case 0: 377 case 0x1A: 378 t.value = TOK.endOfFile; // end of file 379 // Intentionally not advancing `p`, such that subsequent calls keep returning TOK.endOfFile. 380 return; 381 case ' ': 382 case '\t': 383 case '\v': 384 case '\f': 385 p++; 386 continue; // skip white space 387 case '\r': 388 p++; 389 if (*p != '\n') // if CR stands by itself 390 { 391 endOfLine(); 392 goto skipFourSpaces; 393 } 394 continue; // skip white space 395 case '\n': 396 p++; 397 endOfLine(); 398 skipFourSpaces: 399 while (*(cast(uint*)p) == 0x20202020) //' ' == 0x20 400 { 401 p+=4; 402 } 403 continue; // skip white space 404 case '0': 405 if (!isZeroSecond(p[1])) // if numeric literal does not continue 406 { 407 ++p; 408 t.unsvalue = 0; 409 t.value = TOK.int32Literal; 410 return; 411 } 412 goto Lnumber; 413 414 case '1': .. 
case '9': 415 if (!isDigitSecond(p[1])) // if numeric literal does not continue 416 { 417 t.unsvalue = *p - '0'; 418 ++p; 419 t.value = TOK.int32Literal; 420 return; 421 } 422 Lnumber: 423 t.value = number(t); 424 return; 425 426 case '\'': 427 if (issinglechar(p[1]) && p[2] == '\'') 428 { 429 t.unsvalue = p[1]; // simple one character literal 430 t.value = TOK.charLiteral; 431 p += 3; 432 } 433 else 434 t.value = charConstant(t); 435 return; 436 case 'r': 437 if (p[1] != '"') 438 goto case_ident; 439 p++; 440 goto case '`'; 441 case '`': 442 wysiwygStringConstant(t); 443 return; 444 case 'x': 445 if (p[1] != '"') 446 goto case_ident; 447 p++; 448 auto start = p; 449 auto hexString = new OutBuffer(); 450 t.value = hexStringConstant(t); 451 hexString.write(start[0 .. p - start]); 452 error("Built-in hex string literals are obsolete, use `std.conv.hexString!%s` instead.", hexString.extractChars()); 453 return; 454 case 'q': 455 if (p[1] == '"') 456 { 457 p++; 458 delimitedStringConstant(t); 459 return; 460 } 461 else if (p[1] == '{') 462 { 463 p++; 464 tokenStringConstant(t); 465 return; 466 } 467 else 468 goto case_ident; 469 case '"': 470 escapeStringConstant(t); 471 return; 472 case 'a': 473 case 'b': 474 case 'c': 475 case 'd': 476 case 'e': 477 case 'f': 478 case 'g': 479 case 'h': 480 case 'i': 481 case 'j': 482 case 'k': 483 case 'l': 484 case 'm': 485 case 'n': 486 case 'o': 487 case 'p': 488 /*case 'q': case 'r':*/ 489 case 's': 490 case 't': 491 case 'u': 492 case 'v': 493 case 'w': 494 /*case 'x':*/ 495 case 'y': 496 case 'z': 497 case 'A': 498 case 'B': 499 case 'C': 500 case 'D': 501 case 'E': 502 case 'F': 503 case 'G': 504 case 'H': 505 case 'I': 506 case 'J': 507 case 'K': 508 case 'L': 509 case 'M': 510 case 'N': 511 case 'O': 512 case 'P': 513 case 'Q': 514 case 'R': 515 case 'S': 516 case 'T': 517 case 'U': 518 case 'V': 519 case 'W': 520 case 'X': 521 case 'Y': 522 case 'Z': 523 case '_': 524 case_ident: 525 { 526 while (1) 527 { 528 const c = 
*++p; 529 if (isidchar(c)) 530 continue; 531 else if (c & 0x80) 532 { 533 const s = p; 534 const u = decodeUTF(); 535 if (isUniAlpha(u)) 536 continue; 537 error("char 0x%04x not allowed in identifier", u); 538 p = s; 539 } 540 break; 541 } 542 Identifier id = Identifier.idPool(cast(char*)t.ptr, cast(uint)(p - t.ptr)); 543 t.ident = id; 544 t.value = cast(TOK)id.getValue(); 545 anyToken = 1; 546 if (*t.ptr == '_') // if special identifier token 547 { 548 // Lazy initialization 549 TimeStampInfo.initialize(t.loc); 550 551 if (id == Id.DATE) 552 { 553 t.ustring = TimeStampInfo.date.ptr; 554 goto Lstr; 555 } 556 else if (id == Id.TIME) 557 { 558 t.ustring = TimeStampInfo.time.ptr; 559 goto Lstr; 560 } 561 else if (id == Id.VENDOR) 562 { 563 t.ustring = global.vendor.xarraydup.ptr; 564 goto Lstr; 565 } 566 else if (id == Id.TIMESTAMP) 567 { 568 t.ustring = TimeStampInfo.timestamp.ptr; 569 Lstr: 570 t.value = TOK.string_; 571 t.postfix = 0; 572 t.len = cast(uint)strlen(t.ustring); 573 } 574 else if (id == Id.VERSIONX) 575 { 576 t.value = TOK.int64Literal; 577 t.unsvalue = global.versionNumber(); 578 } 579 else if (id == Id.EOFX) 580 { 581 t.value = TOK.endOfFile; 582 // Advance scanner to end of file 583 while (!(*p == 0 || *p == 0x1A)) 584 p++; 585 } 586 } 587 //printf("t.value = %d\n",t.value); 588 return; 589 } 590 case '/': 591 p++; 592 switch (*p) 593 { 594 case '=': 595 p++; 596 t.value = TOK.divAssign; 597 return; 598 case '*': 599 p++; 600 startLoc = loc(); 601 while (1) 602 { 603 while (1) 604 { 605 const c = *p; 606 switch (c) 607 { 608 case '/': 609 break; 610 case '\n': 611 endOfLine(); 612 p++; 613 continue; 614 case '\r': 615 p++; 616 if (*p != '\n') 617 endOfLine(); 618 continue; 619 case 0: 620 case 0x1A: 621 error("unterminated /* */ comment"); 622 p = end; 623 t.loc = loc(); 624 t.value = TOK.endOfFile; 625 return; 626 default: 627 if (c & 0x80) 628 { 629 const u = decodeUTF(); 630 if (u == PS || u == LS) 631 endOfLine(); 632 } 633 p++; 634 continue; 
635 } 636 break; 637 } 638 p++; 639 if (p[-2] == '*' && p - 3 != t.ptr) 640 break; 641 } 642 if (commentToken) 643 { 644 t.loc = startLoc; 645 t.value = TOK.comment; 646 return; 647 } 648 else if (doDocComment && t.ptr[2] == '*' && p - 4 != t.ptr) 649 { 650 // if /** but not /**/ 651 getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1); 652 lastDocLine = scanloc.linnum; 653 } 654 continue; 655 case '/': // do // style comments 656 startLoc = loc(); 657 while (1) 658 { 659 const c = *++p; 660 switch (c) 661 { 662 case '\n': 663 break; 664 case '\r': 665 if (p[1] == '\n') 666 p++; 667 break; 668 case 0: 669 case 0x1A: 670 if (commentToken) 671 { 672 p = end; 673 t.loc = startLoc; 674 t.value = TOK.comment; 675 return; 676 } 677 if (doDocComment && t.ptr[2] == '/') 678 { 679 getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1); 680 lastDocLine = scanloc.linnum; 681 } 682 p = end; 683 t.loc = loc(); 684 t.value = TOK.endOfFile; 685 return; 686 default: 687 if (c & 0x80) 688 { 689 const u = decodeUTF(); 690 if (u == PS || u == LS) 691 break; 692 } 693 continue; 694 } 695 break; 696 } 697 if (commentToken) 698 { 699 p++; 700 endOfLine(); 701 t.loc = startLoc; 702 t.value = TOK.comment; 703 return; 704 } 705 if (doDocComment && t.ptr[2] == '/') 706 { 707 getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1); 708 lastDocLine = scanloc.linnum; 709 } 710 p++; 711 endOfLine(); 712 continue; 713 case '+': 714 { 715 int nest; 716 startLoc = loc(); 717 p++; 718 nest = 1; 719 while (1) 720 { 721 char c = *p; 722 switch (c) 723 { 724 case '/': 725 p++; 726 if (*p == '+') 727 { 728 p++; 729 nest++; 730 } 731 continue; 732 case '+': 733 p++; 734 if (*p == '/') 735 { 736 p++; 737 if (--nest == 0) 738 break; 739 } 740 continue; 741 case '\r': 742 p++; 743 if (*p != '\n') 744 endOfLine(); 745 continue; 746 case '\n': 747 endOfLine(); 748 p++; 749 continue; 750 case 0: 751 case 0x1A: 752 
error("unterminated /+ +/ comment"); 753 p = end; 754 t.loc = loc(); 755 t.value = TOK.endOfFile; 756 return; 757 default: 758 if (c & 0x80) 759 { 760 uint u = decodeUTF(); 761 if (u == PS || u == LS) 762 endOfLine(); 763 } 764 p++; 765 continue; 766 } 767 break; 768 } 769 if (commentToken) 770 { 771 t.loc = startLoc; 772 t.value = TOK.comment; 773 return; 774 } 775 if (doDocComment && t.ptr[2] == '+' && p - 4 != t.ptr) 776 { 777 // if /++ but not /++/ 778 getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1); 779 lastDocLine = scanloc.linnum; 780 } 781 continue; 782 } 783 default: 784 break; 785 } 786 t.value = TOK.div; 787 return; 788 case '.': 789 p++; 790 if (isdigit(*p)) 791 { 792 /* Note that we don't allow ._1 and ._ as being 793 * valid floating point numbers. 794 */ 795 p--; 796 t.value = inreal(t); 797 } 798 else if (p[0] == '.') 799 { 800 if (p[1] == '.') 801 { 802 p += 2; 803 t.value = TOK.dotDotDot; 804 } 805 else 806 { 807 p++; 808 t.value = TOK.slice; 809 } 810 } 811 else 812 t.value = TOK.dot; 813 return; 814 case '&': 815 p++; 816 if (*p == '=') 817 { 818 p++; 819 t.value = TOK.andAssign; 820 } 821 else if (*p == '&') 822 { 823 p++; 824 t.value = TOK.andAnd; 825 } 826 else 827 t.value = TOK.and; 828 return; 829 case '|': 830 p++; 831 if (*p == '=') 832 { 833 p++; 834 t.value = TOK.orAssign; 835 } 836 else if (*p == '|') 837 { 838 p++; 839 t.value = TOK.orOr; 840 } 841 else 842 t.value = TOK.or; 843 return; 844 case '-': 845 p++; 846 if (*p == '=') 847 { 848 p++; 849 t.value = TOK.minAssign; 850 } 851 else if (*p == '-') 852 { 853 p++; 854 t.value = TOK.minusMinus; 855 } 856 else 857 t.value = TOK.min; 858 return; 859 case '+': 860 p++; 861 if (*p == '=') 862 { 863 p++; 864 t.value = TOK.addAssign; 865 } 866 else if (*p == '+') 867 { 868 p++; 869 t.value = TOK.plusPlus; 870 } 871 else 872 t.value = TOK.add; 873 return; 874 case '<': 875 p++; 876 if (*p == '=') 877 { 878 p++; 879 t.value = TOK.lessOrEqual; // <= 880 } 881 
else if (*p == '<') 882 { 883 p++; 884 if (*p == '=') 885 { 886 p++; 887 t.value = TOK.leftShiftAssign; // <<= 888 } 889 else 890 t.value = TOK.leftShift; // << 891 } 892 else 893 t.value = TOK.lessThan; // < 894 return; 895 case '>': 896 p++; 897 if (*p == '=') 898 { 899 p++; 900 t.value = TOK.greaterOrEqual; // >= 901 } 902 else if (*p == '>') 903 { 904 p++; 905 if (*p == '=') 906 { 907 p++; 908 t.value = TOK.rightShiftAssign; // >>= 909 } 910 else if (*p == '>') 911 { 912 p++; 913 if (*p == '=') 914 { 915 p++; 916 t.value = TOK.unsignedRightShiftAssign; // >>>= 917 } 918 else 919 t.value = TOK.unsignedRightShift; // >>> 920 } 921 else 922 t.value = TOK.rightShift; // >> 923 } 924 else 925 t.value = TOK.greaterThan; // > 926 return; 927 case '!': 928 p++; 929 if (*p == '=') 930 { 931 p++; 932 t.value = TOK.notEqual; // != 933 } 934 else 935 t.value = TOK.not; // ! 936 return; 937 case '=': 938 p++; 939 if (*p == '=') 940 { 941 p++; 942 t.value = TOK.equal; // == 943 } 944 else if (*p == '>') 945 { 946 p++; 947 t.value = TOK.goesTo; // => 948 } 949 else 950 t.value = TOK.assign; // = 951 return; 952 case '~': 953 p++; 954 if (*p == '=') 955 { 956 p++; 957 t.value = TOK.concatenateAssign; // ~= 958 } 959 else 960 t.value = TOK.tilde; // ~ 961 return; 962 case '^': 963 p++; 964 if (*p == '^') 965 { 966 p++; 967 if (*p == '=') 968 { 969 p++; 970 t.value = TOK.powAssign; // ^^= 971 } 972 else 973 t.value = TOK.pow; // ^^ 974 } 975 else if (*p == '=') 976 { 977 p++; 978 t.value = TOK.xorAssign; // ^= 979 } 980 else 981 t.value = TOK.xor; // ^ 982 return; 983 case '(': 984 p++; 985 t.value = TOK.leftParentheses; 986 return; 987 case ')': 988 p++; 989 t.value = TOK.rightParentheses; 990 return; 991 case '[': 992 p++; 993 t.value = TOK.leftBracket; 994 return; 995 case ']': 996 p++; 997 t.value = TOK.rightBracket; 998 return; 999 case '{': 1000 p++; 1001 t.value = TOK.leftCurly; 1002 return; 1003 case '}': 1004 p++; 1005 t.value = TOK.rightCurly; 1006 return; 1007 case 
'?': 1008 p++; 1009 t.value = TOK.question; 1010 return; 1011 case ',': 1012 p++; 1013 t.value = TOK.comma; 1014 return; 1015 case ';': 1016 p++; 1017 t.value = TOK.semicolon; 1018 return; 1019 case ':': 1020 p++; 1021 t.value = TOK.colon; 1022 return; 1023 case '$': 1024 p++; 1025 t.value = TOK.dollar; 1026 return; 1027 case '@': 1028 p++; 1029 t.value = TOK.at; 1030 return; 1031 case '*': 1032 p++; 1033 if (*p == '=') 1034 { 1035 p++; 1036 t.value = TOK.mulAssign; 1037 } 1038 else 1039 t.value = TOK.mul; 1040 return; 1041 case '%': 1042 p++; 1043 if (*p == '=') 1044 { 1045 p++; 1046 t.value = TOK.modAssign; 1047 } 1048 else 1049 t.value = TOK.mod; 1050 return; 1051 case '#': 1052 { 1053 p++; 1054 Token n; 1055 scan(&n); 1056 if (n.value == TOK.identifier) 1057 { 1058 if (n.ident == Id.line) 1059 { 1060 poundLine(); 1061 continue; 1062 } 1063 else 1064 { 1065 const locx = loc(); 1066 warning(locx, "C preprocessor directive `#%s` is not supported", n.ident.toChars()); 1067 } 1068 } 1069 else if (n.value == TOK.if_) 1070 { 1071 error("C preprocessor directive `#if` is not supported, use `version` or `static if`"); 1072 } 1073 t.value = TOK.pound; 1074 return; 1075 } 1076 default: 1077 { 1078 dchar c = *p; 1079 if (c & 0x80) 1080 { 1081 c = decodeUTF(); 1082 // Check for start of unicode identifier 1083 if (isUniAlpha(c)) 1084 goto case_ident; 1085 if (c == PS || c == LS) 1086 { 1087 endOfLine(); 1088 p++; 1089 continue; 1090 } 1091 } 1092 if (c < 0x80 && isprint(c)) 1093 error("character '%c' is not a valid token", c); 1094 else 1095 error("character 0x%02x is not a valid token", c); 1096 p++; 1097 continue; 1098 } 1099 } 1100 } 1101 } 1102 1103 final Token* peek(Token* ct) 1104 { 1105 Token* t; 1106 if (ct.next) 1107 t = ct.next; 1108 else 1109 { 1110 t = allocateToken(); 1111 scan(t); 1112 ct.next = t; 1113 } 1114 return t; 1115 } 1116 1117 /********************************* 1118 * tk is on the opening (. 
1119 * Look ahead and return token that is past the closing ). 1120 */ 1121 final Token* peekPastParen(Token* tk) 1122 { 1123 //printf("peekPastParen()\n"); 1124 int parens = 1; 1125 int curlynest = 0; 1126 while (1) 1127 { 1128 tk = peek(tk); 1129 //tk.print(); 1130 switch (tk.value) 1131 { 1132 case TOK.leftParentheses: 1133 parens++; 1134 continue; 1135 case TOK.rightParentheses: 1136 --parens; 1137 if (parens) 1138 continue; 1139 tk = peek(tk); 1140 break; 1141 case TOK.leftCurly: 1142 curlynest++; 1143 continue; 1144 case TOK.rightCurly: 1145 if (--curlynest >= 0) 1146 continue; 1147 break; 1148 case TOK.semicolon: 1149 if (curlynest) 1150 continue; 1151 break; 1152 case TOK.endOfFile: 1153 break; 1154 default: 1155 continue; 1156 } 1157 return tk; 1158 } 1159 } 1160 1161 /******************************************* 1162 * Parse escape sequence. 1163 */ 1164 private uint escapeSequence() 1165 { 1166 return Lexer.escapeSequence(token.loc, p); 1167 } 1168 1169 /** 1170 Parse the given string literal escape sequence into a single character. 1171 Params: 1172 loc = the location of the current token 1173 sequence = pointer to string with escape sequence to parse. 
this is a reference 1174 variable that is also used to return the position after the sequence 1175 Returns: 1176 the escaped sequence as a single character 1177 */ 1178 private static dchar escapeSequence(const ref Loc loc, ref const(char)* sequence) 1179 { 1180 const(char)* p = sequence; // cache sequence reference on stack 1181 scope(exit) sequence = p; 1182 1183 uint c = *p; 1184 int ndigits; 1185 switch (c) 1186 { 1187 case '\'': 1188 case '"': 1189 case '?': 1190 case '\\': 1191 Lconsume: 1192 p++; 1193 break; 1194 case 'a': 1195 c = 7; 1196 goto Lconsume; 1197 case 'b': 1198 c = 8; 1199 goto Lconsume; 1200 case 'f': 1201 c = 12; 1202 goto Lconsume; 1203 case 'n': 1204 c = 10; 1205 goto Lconsume; 1206 case 'r': 1207 c = 13; 1208 goto Lconsume; 1209 case 't': 1210 c = 9; 1211 goto Lconsume; 1212 case 'v': 1213 c = 11; 1214 goto Lconsume; 1215 case 'u': 1216 ndigits = 4; 1217 goto Lhex; 1218 case 'U': 1219 ndigits = 8; 1220 goto Lhex; 1221 case 'x': 1222 ndigits = 2; 1223 Lhex: 1224 p++; 1225 c = *p; 1226 if (ishex(cast(char)c)) 1227 { 1228 uint v = 0; 1229 int n = 0; 1230 while (1) 1231 { 1232 if (isdigit(cast(char)c)) 1233 c -= '0'; 1234 else if (islower(c)) 1235 c -= 'a' - 10; 1236 else 1237 c -= 'A' - 10; 1238 v = v * 16 + c; 1239 c = *++p; 1240 if (++n == ndigits) 1241 break; 1242 if (!ishex(cast(char)c)) 1243 { 1244 .error(loc, "escape hex sequence has %d hex digits instead of %d", n, ndigits); 1245 break; 1246 } 1247 } 1248 if (ndigits != 2 && !utf_isValidDchar(v)) 1249 { 1250 .error(loc, "invalid UTF character \\U%08x", v); 1251 v = '?'; // recover with valid UTF character 1252 } 1253 c = v; 1254 } 1255 else 1256 { 1257 .error(loc, "undefined escape hex sequence \\%c%c", sequence[0], c); 1258 p++; 1259 } 1260 break; 1261 case '&': 1262 // named character entity 1263 for (const idstart = ++p; 1; p++) 1264 { 1265 switch (*p) 1266 { 1267 case ';': 1268 c = HtmlNamedEntity(idstart, p - idstart); 1269 if (c == ~0) 1270 { 1271 .error(loc, "unnamed character 
entity &%.*s;", cast(int)(p - idstart), idstart); 1272 c = '?'; 1273 } 1274 p++; 1275 break; 1276 default: 1277 if (isalpha(*p) || (p != idstart && isdigit(*p))) 1278 continue; 1279 .error(loc, "unterminated named entity &%.*s;", cast(int)(p - idstart + 1), idstart); 1280 c = '?'; 1281 break; 1282 } 1283 break; 1284 } 1285 break; 1286 case 0: 1287 case 0x1A: 1288 // end of file 1289 c = '\\'; 1290 break; 1291 default: 1292 if (isoctal(cast(char)c)) 1293 { 1294 uint v = 0; 1295 int n = 0; 1296 do 1297 { 1298 v = v * 8 + (c - '0'); 1299 c = *++p; 1300 } 1301 while (++n < 3 && isoctal(cast(char)c)); 1302 c = v; 1303 if (c > 0xFF) 1304 .error(loc, "escape octal sequence \\%03o is larger than \\377", c); 1305 } 1306 else 1307 { 1308 .error(loc, "undefined escape sequence \\%c", c); 1309 p++; 1310 } 1311 break; 1312 } 1313 return c; 1314 } 1315 1316 /** 1317 Lex a wysiwyg string. `p` must be pointing to the first character before the 1318 contents of the string literal. The character pointed to by `p` will be used as 1319 the terminating character (i.e. backtick or double-quote). 
1320 Params: 1321 result = pointer to the token that accepts the result 1322 */ 1323 private void wysiwygStringConstant(Token* result) 1324 { 1325 result.value = TOK.string_; 1326 Loc start = loc(); 1327 auto terminator = p[0]; 1328 p++; 1329 stringbuffer.setsize(0); 1330 while (1) 1331 { 1332 dchar c = p[0]; 1333 p++; 1334 switch (c) 1335 { 1336 case '\n': 1337 endOfLine(); 1338 break; 1339 case '\r': 1340 if (p[0] == '\n') 1341 continue; // ignore 1342 c = '\n'; // treat EndOfLine as \n character 1343 endOfLine(); 1344 break; 1345 case 0: 1346 case 0x1A: 1347 error("unterminated string constant starting at %s", start.toChars()); 1348 result.setString(); 1349 // rewind `p` so it points to the EOF character 1350 p--; 1351 return; 1352 default: 1353 if (c == terminator) 1354 { 1355 result.setString(stringbuffer); 1356 stringPostfix(result); 1357 return; 1358 } 1359 else if (c & 0x80) 1360 { 1361 p--; 1362 const u = decodeUTF(); 1363 p++; 1364 if (u == PS || u == LS) 1365 endOfLine(); 1366 stringbuffer.writeUTF8(u); 1367 continue; 1368 } 1369 break; 1370 } 1371 stringbuffer.writeByte(c); 1372 } 1373 } 1374 1375 /************************************** 1376 * Lex hex strings: 1377 * x"0A ae 34FE BD" 1378 */ 1379 private TOK hexStringConstant(Token* t) 1380 { 1381 Loc start = loc(); 1382 uint n = 0; 1383 uint v = ~0; // dead assignment, needed to suppress warning 1384 p++; 1385 stringbuffer.setsize(0); 1386 while (1) 1387 { 1388 dchar c = *p++; 1389 switch (c) 1390 { 1391 case ' ': 1392 case '\t': 1393 case '\v': 1394 case '\f': 1395 continue; // skip white space 1396 case '\r': 1397 if (*p == '\n') 1398 continue; // ignore '\r' if followed by '\n' 1399 // Treat isolated '\r' as if it were a '\n' 1400 goto case '\n'; 1401 case '\n': 1402 endOfLine(); 1403 continue; 1404 case 0: 1405 case 0x1A: 1406 error("unterminated string constant starting at %s", start.toChars()); 1407 t.setString(); 1408 // decrement `p`, because it needs to point to the next token (the 0 or 0x1A 
character is the TOK.endOfFile token). 1409 p--; 1410 return TOK.hexadecimalString; 1411 case '"': 1412 if (n & 1) 1413 { 1414 error("odd number (%d) of hex characters in hex string", n); 1415 stringbuffer.writeByte(v); 1416 } 1417 t.setString(stringbuffer); 1418 stringPostfix(t); 1419 return TOK.hexadecimalString; 1420 default: 1421 if (c >= '0' && c <= '9') 1422 c -= '0'; 1423 else if (c >= 'a' && c <= 'f') 1424 c -= 'a' - 10; 1425 else if (c >= 'A' && c <= 'F') 1426 c -= 'A' - 10; 1427 else if (c & 0x80) 1428 { 1429 p--; 1430 const u = decodeUTF(); 1431 p++; 1432 if (u == PS || u == LS) 1433 endOfLine(); 1434 else 1435 error("non-hex character \\u%04x in hex string", u); 1436 } 1437 else 1438 error("non-hex character '%c' in hex string", c); 1439 if (n & 1) 1440 { 1441 v = (v << 4) | c; 1442 stringbuffer.writeByte(v); 1443 } 1444 else 1445 v = c; 1446 n++; 1447 break; 1448 } 1449 } 1450 assert(0); // see bug 15731 1451 } 1452 1453 /** 1454 Lex a delimited string. Some examples of delimited strings are: 1455 --- 1456 q"(foo(xxx))" // "foo(xxx)" 1457 q"[foo$(LPAREN)]" // "foo$(LPAREN)" 1458 q"/foo]/" // "foo]" 1459 q"HERE 1460 foo 1461 HERE" // "foo\n" 1462 --- 1463 It is assumed that `p` points to the opening double-quote '"'. 
    Params:
        result = pointer to the token that accepts the result
    */
    private void delimitedStringConstant(Token* result)
    {
        result.value = TOK.string_;
        Loc start = loc();
        dchar delimleft = 0;        // opening delimiter; 0 until the first character is read
        dchar delimright = 0;       // character that closes the string
        uint nest = 1;              // 1 for the bracket pairs ( { [ <, which nest; otherwise 0
        uint nestcount = ~0;        // dead assignment, needed to suppress warning
        Identifier hereid = null;   // heredoc identifier, or null for a character delimiter
        uint blankrol = 0;          // 1 while the rest of the heredoc opening line must be blank
        uint startline = 0;         // 1 when the previous character ended a line
        p++;
        stringbuffer.setsize(0);
        while (1)
        {
            dchar c = *p++;
            //printf("c = '%c'\n", c);
            switch (c)
            {
            case '\n':
            Lnextline:
                endOfLine();
                startline = 1;
                if (blankrol)
                {
                    // End of the heredoc opening line; the newline itself is
                    // not part of the string.
                    blankrol = 0;
                    continue;
                }
                if (hereid)
                {
                    stringbuffer.writeUTF8(c);
                    continue;
                }
                break;
            case '\r':
                if (*p == '\n')
                    continue; // ignore
                c = '\n'; // treat EndOfLine as \n character
                goto Lnextline;
            case 0:
            case 0x1A:
                error("unterminated delimited string constant starting at %s", start.toChars());
                result.setString();
                // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
                p--;
                return;
            default:
                if (c & 0x80)
                {
                    // Multi-byte UTF-8: step back onto the lead byte, decode,
                    // then step past the sequence.
                    p--;
                    c = decodeUTF();
                    p++;
                    if (c == PS || c == LS)
                        goto Lnextline;
                }
                break;
            }
            if (delimleft == 0)
            {
                // First character after the opening quote: it selects the
                // delimiter style.
                delimleft = c;
                nest = 1;
                nestcount = 1;
                if (c == '(')
                    delimright = ')';
                else if (c == '{')
                    delimright = '}';
                else if (c == '[')
                    delimright = ']';
                else if (c == '<')
                    delimright = '>';
                else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c)))
                {
                    // Start of identifier; must be a heredoc
                    Token tok;
                    p--;
                    scan(&tok); // read in heredoc identifier
                    if (tok.value != TOK.identifier)
                    {
                        error("identifier expected for heredoc, not %s", tok.toChars());
                        delimright = c;
                    }
                    else
                    {
                        hereid = tok.ident;
                        //printf("hereid = '%s'\n", hereid.toChars());
                        blankrol = 1;
                    }
                    nest = 0;
                }
                else
                {
                    // Any other character delimits the string on both sides.
                    delimright = c;
                    nest = 0;
                    if (isspace(c))
                        error("delimiter cannot be whitespace");
                }
            }
            else
            {
                if (blankrol)
                {
                    error("heredoc rest of line should be blank");
                    blankrol = 0;
                    continue;
                }
                if (nest == 1)
                {
                    // Bracket delimiters: track nesting; the string ends at
                    // the closer that brings the count back to zero.
                    if (c == delimleft)
                        nestcount++;
                    else if (c == delimright)
                    {
                        nestcount--;
                        if (nestcount == 0)
                            goto Ldone;
                    }
                }
                else if (c == delimright)
                    goto Ldone;
                if (startline && (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c))) && hereid)
                {
                    // An identifier at the start of a line may be the heredoc
                    // terminator; scan it and rewind if it is not.
                    Token tok;
                    auto psave = p;
                    p--;
                    scan(&tok); // read in possible heredoc identifier
                    //printf("endid = '%s'\n", tok.ident.toChars());
                    if (tok.value == TOK.identifier && tok.ident is hereid)
                    {
                        /* should check that rest of line is blank
                         */
                        goto Ldone;
                    }
                    p = psave;
                }
                stringbuffer.writeUTF8(c);
                startline = 0;
            }
        }
    Ldone:
        // The closing delimiter must be followed directly by the closing quote.
        if (*p == '"')
            p++;
        else if (hereid)
            error("delimited string must end in %s\"", hereid.toChars());
        else
            error("delimited string must end in %c\"", delimright);
        result.setString(stringbuffer);
        stringPostfix(result);
    }

    /**
    Lex a token string. Some examples of token strings are:
    ---
    q{ foo(xxx) }    // " foo(xxx) "
    q{foo$(LPAREN)}  // "foo$(LPAREN)"
    q{{foo}"}"}      // "{foo}"}""
    ---
    It is assumed that `p` points to the opening curly-brace '{'.
    Params:
        result = pointer to the token that accepts the result
    */
    private void tokenStringConstant(Token* result)
    {
        result.value = TOK.string_;

        uint nest = 1;              // depth of open curly braces, counting the opening one
        const start = loc();
        const pstart = ++p;         // first character of the string contents
        inTokenStringConstant++;
        scope(exit) inTokenStringConstant--;
        while (1)
        {
            // The contents are lexed as ordinary tokens, only to find the
            // matching closing brace; the tokens themselves are discarded.
            Token tok;
            scan(&tok);
            switch (tok.value)
            {
            case TOK.leftCurly:
                nest++;
                continue;
            case TOK.rightCurly:
                if (--nest == 0)
                {
                    // `p` is just past the closing brace; the string is the
                    // raw source text between the braces.
                    result.setString(pstart, p - 1 - pstart);
                    stringPostfix(result);
                    return;
                }
                continue;
            case TOK.endOfFile:
                error("unterminated token string constant starting at %s", start.toChars());
                result.setString();
                return;
            default:
                continue;
            }
        }
    }

    /**
    Scan a double-quoted string while building the processed string value by
    handling escape sequences. The result is returned in the given `t` token.
    This function assumes that `p` currently points to the opening double-quote
    of the string.
Params:
        t = the token to set the resulting string to
    */
    private void escapeStringConstant(Token* t)
    {
        t.value = TOK.string_;

        const start = loc();
        p++;
        stringbuffer.setsize(0);
        while (1)
        {
            dchar c = *p++;
            switch (c)
            {
            case '\\':
                switch (*p)
                {
                case 'u':
                case 'U':
                case '&':
                    // These escapes are written UTF-8 encoded rather than as a single byte
                    c = escapeSequence();
                    stringbuffer.writeUTF8(c);
                    continue;
                default:
                    c = escapeSequence();
                    break;
                }
                break;
            case '\n':
                endOfLine();
                break;
            case '\r':
                if (*p == '\n')
                    continue; // ignore
                c = '\n'; // treat EndOfLine as \n character
                endOfLine();
                break;
            case '"':
                t.setString(stringbuffer);
                stringPostfix(t);
                return;
            case 0:
            case 0x1A:
                // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
                p--;
                error("unterminated string constant starting at %s", start.toChars());
                t.setString();
                return;
            default:
                if (c & 0x80)
                {
                    // Non-ASCII lead byte: decode the sequence; LS and PS count as line ends
                    p--;
                    c = decodeUTF();
                    if (c == LS || c == PS)
                    {
                        c = '\n';
                        endOfLine();
                    }
                    p++;
                    stringbuffer.writeUTF8(c);
                    continue;
                }
                break;
            }
            stringbuffer.writeByte(c);
        }
    }

    /**************************************
     * Lex a character literal. On entry `p` is advanced once before the
     * first content character is read.
     *
     * The value is stored in `t.unsvalue`. On a malformed literal an error is
     * reported and `t.unsvalue` is set to '?'.
     * Params:
     *      t = the token that receives the character value
     * Returns:
     *      TOK.charLiteral, TOK.wcharLiteral or TOK.dcharLiteral, depending on
     *      the escape form or on the decoded code point
     */
    private TOK charConstant(Token* t)
    {
        TOK tk = TOK.charLiteral;
        //printf("Lexer::charConstant\n");
        p++;
        dchar c = *p++;
        switch (c)
        {
        case '\\':
            switch (*p)
            {
            case 'u':
                t.unsvalue = escapeSequence();
                tk = TOK.wcharLiteral;
                break;
            case 'U':
            case '&':
                t.unsvalue = escapeSequence();
                tk = TOK.dcharLiteral;
                break;
            default:
                t.unsvalue = escapeSequence();
                break;
            }
            break;
        case '\n':
        L1:
            endOfLine();
            goto case;
        case '\r':
            goto case '\'';
        case 0:
        case 0x1A:
            // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
            p--;
            goto case;
        case '\'':
            error("unterminated character constant");
            t.unsvalue = '?';
            return tk;
        default:
            if (c & 0x80)
            {
                p--;
                c = decodeUTF();
                p++;
                if (c == LS || c == PS)
                    goto L1;
                // Code points outside the surrogate range and below 0xFFFE become wchar literals
                if (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE))
                    tk = TOK.wcharLiteral;
                else
                    tk = TOK.dcharLiteral;
            }
            t.unsvalue = c;
            break;
        }
        if (*p != '\'')
        {
            // Error recovery: skip ahead to a closing quote or to a character
            // that ends the search (end of line/file, `;`, `)`, `]`, `}`).
            while (*p != '\'' && *p != 0x1A && *p != 0 && *p != '\n' &&
                    *p != '\r' && *p != ';' && *p != ')' && *p != ']' && *p != '}')
            {
                if (*p & 0x80)
                {
                    const s = p;
                    c = decodeUTF();
                    if (c == LS || c == PS)
                    {
                        p = s;
                        break;
                    }
                }
                p++;
            }

            if (*p == '\'')
            {
                error("character constant has multiple characters");
                p++;
            }
            else
                error("unterminated character constant");
            t.unsvalue = '?';
            return tk;
        }
        p++;
        return tk;
    }

    /***************************************
     * Get postfix of string literal.
     * Stores 'c', 'w' or 'd' in `t.postfix` and advances `p` past it,
     * or stores 0 when no postfix is present.
     */
    private void stringPostfix(Token* t) pure @nogc
    {
        switch (*p)
        {
        case 'c':
        case 'w':
        case 'd':
            t.postfix = *p;
            p++;
            break;
        default:
            t.postfix = 0;
            break;
        }
    }

    /**************************************
     * Read in a number.
     * If it's an integer, store it in tok.TKutok.Vlong.
     *      integers can be decimal, octal or hex
     *      Handle the suffixes U, UL, LU, L, etc.
     * If it's double, store it in tok.TKutok.Vdouble.
     * Returns:
     *      TKnum
     *      TKdouble,...
*/
    private TOK number(Token* t)
    {
        int base = 10;
        const start = p;
        uinteger_t n = 0; // unsigned >=64 bit integer type
        int d;
        bool err = false;
        bool overflow = false;
        bool anyBinaryDigitsNoSingleUS = false;
        bool anyHexDigitsNoSingleUS = false;
        dchar c = *p;
        if (c == '0')
        {
            // The character after a leading '0' selects the base or hands over to inreal()
            ++p;
            c = *p;
            switch (c)
            {
            case '0': .. case '9':
                base = 8;
                break;
            case 'x':
            case 'X':
                ++p;
                base = 16;
                break;
            case 'b':
            case 'B':
                ++p;
                base = 2;
                break;
            case '.':
                if (p[1] == '.')
                    goto Ldone; // if ".."
                if (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80)
                    goto Ldone; // if ".identifier" or ".unicode"
                goto Lreal; // '.' is part of current token
            case 'i':
            case 'f':
            case 'F':
                goto Lreal;
            case '_':
                ++p;
                base = 8;
                break;
            case 'L':
                if (p[1] == 'i')
                    goto Lreal;
                break;
            default:
                break;
            }
        }
        while (1)
        {
            c = *p;
            switch (c)
            {
            case '0': .. case '9':
                ++p;
                d = c - '0';
                break;
            case 'a': .. case 'f':
            case 'A': .. case 'F':
                ++p;
                if (base != 16)
                {
                    if (c == 'e' || c == 'E' || c == 'f' || c == 'F')
                        goto Lreal;
                }
                if (c >= 'a')
                    d = c + 10 - 'a';
                else
                    d = c + 10 - 'A';
                break;
            case 'L':
                if (p[1] == 'i')
                    goto Lreal;
                goto Ldone;
            case '.':
                if (p[1] == '.')
                    goto Ldone; // if ".."
                if (base == 10 && (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80))
                    goto Ldone; // if ".identifier" or ".unicode"
                if (base == 16 && (!ishex(p[1]) || p[1] == '_' || p[1] & 0x80))
                    goto Ldone; // if ".identifier" or ".unicode"
                if (base == 2)
                    goto Ldone; // if ".identifier" or ".unicode"
                goto Lreal; // otherwise as part of a floating point literal
            case 'p':
            case 'P':
            case 'i':
            Lreal:
                // Floating point literal: rewind and let inreal() lex it from the start
                p = start;
                return inreal(t);
            case '_':
                ++p;
                continue;
            default:
                goto Ldone;
            }
            // got a digit here, set any necessary flags, check for errors
            anyHexDigitsNoSingleUS = true;
            anyBinaryDigitsNoSingleUS = true;
            if (!err && d >= base)
            {
                error("%s digit expected, not `%c`", base == 2 ? "binary".ptr :
                                                     base == 8 ? "octal".ptr :
                                                     "decimal".ptr, c);
                err = true;
            }
            // Avoid expensive overflow check if we aren't at risk of overflow
            if (n <= 0x0FFF_FFFF_FFFF_FFFFUL)
                n = n * base + d;
            else
            {
                import core.checkedint : mulu, addu;

                n = mulu(n, base, overflow);
                n = addu(n, d, overflow);
            }
        }
    Ldone:
        if (overflow && !err)
        {
            error("integer overflow");
            err = true;
        }
        // `0x` or `0b` with no digits after the prefix
        if ((base == 2 && !anyBinaryDigitsNoSingleUS) ||
            (base == 16 && !anyHexDigitsNoSingleUS))
            error("`%.*s` isn't a valid integer literal, use `%.*s0` instead", cast(int)(p - start), start, 2, start);
        enum FLAGS : int
        {
            none = 0,
            decimal = 1, // decimal
            unsigned = 2, // u or U suffix
            long_ = 4, // L suffix
        }

        FLAGS flags = (base == 10) ? FLAGS.decimal : FLAGS.none;
        // Parse trailing 'u', 'U', 'l' or 'L' in any combination
        const psuffix = p;
        while (1)
        {
            FLAGS f;
            switch (*p)
            {
            case 'U':
            case 'u':
                f = FLAGS.unsigned;
                goto L1;
            case 'l':
                f = FLAGS.long_;
                error("lower case integer suffix 'l' is not allowed. Please use 'L' instead");
                goto L1;
            case 'L':
                f = FLAGS.long_;
            L1:
                p++;
                // The same suffix kind given twice is an error
                if ((flags & f) && !err)
                {
                    error("unrecognized token");
                    err = true;
                }
                flags = cast(FLAGS)(flags | f);
                continue;
            default:
                break;
            }
            break;
        }
        if (base == 8 && n >= 8)
        {
            if (err)
                // can't translate invalid octal value, just show a generic message
                error("octal literals larger than 7 are no longer supported");
            else
                error("octal literals `0%llo%.*s` are no longer supported, use `std.conv.octal!%llo%.*s` instead",
                    n, cast(int)(p - psuffix), psuffix, n, cast(int)(p - psuffix), psuffix);
        }
        TOK result;
        switch (flags)
        {
        case FLAGS.none:
            /* Octal or Hexadecimal constant.
             * First that fits: int, uint, long, ulong
             */
            if (n & 0x8000000000000000L)
                result = TOK.uns64Literal;
            else if (n & 0xFFFFFFFF00000000L)
                result = TOK.int64Literal;
            else if (n & 0x80000000)
                result = TOK.uns32Literal;
            else
                result = TOK.int32Literal;
            break;
        case FLAGS.decimal:
            /* First that fits: int, long, long long
             */
            if (n & 0x8000000000000000L)
            {
                result = TOK.uns64Literal;
            }
            else if (n & 0xFFFFFFFF80000000L)
                result = TOK.int64Literal;
            else
                result = TOK.int32Literal;
            break;
        case FLAGS.unsigned:
        case FLAGS.decimal | FLAGS.unsigned:
            /* First that fits: uint, ulong
             */
            if (n & 0xFFFFFFFF00000000L)
                result = TOK.uns64Literal;
            else
                result = TOK.uns32Literal;
            break;
        case FLAGS.decimal | FLAGS.long_:
            if (n & 0x8000000000000000L)
            {
                if (!err)
                {
                    error("signed integer overflow");
                    err = true;
                }
                result = TOK.uns64Literal;
            }
            else
                result = TOK.int64Literal;
            break;
        case FLAGS.long_:
            if (n & 0x8000000000000000L)
                result = TOK.uns64Literal;
            else
                result = TOK.int64Literal;
            break;
        case FLAGS.unsigned | FLAGS.long_:
        case FLAGS.decimal | FLAGS.unsigned | FLAGS.long_:
            result = TOK.uns64Literal;
            break;
        default:
            debug
            {
                printf("%x\n", flags);
            }
            assert(0);
        }
        t.unsvalue = n;
        return result;
    }

    /**************************************
     * Read in characters, converting them to real.
     * Bugs:
     *      Exponent overflow not detected.
     *      Too much requested precision is not detected.
     */
    private TOK inreal(Token* t)
    {
        //printf("Lexer::inreal()\n");
        debug
        {
            assert(*p == '.' || isdigit(*p));
        }
        bool isWellformedString = true;
        stringbuffer.setsize(0);
        auto pstart = p;
        bool hex = false;
        dchar c = *p++;
        // Leading '0x'
        if (c == '0')
        {
            c = *p++;
            if (c == 'x' || c == 'X')
            {
                hex = true;
                c = *p++;
            }
        }
        // Digits to left of '.'
        while (1)
        {
            if (c == '.')
            {
                c = *p++;
                break;
            }
            if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
            {
                c = *p++;
                continue;
            }
            break;
        }
        // Digits to right of '.'
        while (1)
        {
            if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
            {
                c = *p++;
                continue;
            }
            break;
        }
        if (c == 'e' || c == 'E' || (hex && (c == 'p' || c == 'P')))
        {
            c = *p++;
            if (c == '-' || c == '+')
            {
                c = *p++;
            }
            bool anyexp = false;
            while (1)
            {
                if (isdigit(c))
                {
                    anyexp = true;
                    c = *p++;
                    continue;
                }
                if (c == '_')
                {
                    c = *p++;
                    continue;
                }
                if (!anyexp)
                {
                    error("missing exponent");
                    isWellformedString = false;
                }
                break;
            }
        }
        else if (hex)
        {
            error("exponent required for hex float");
            isWellformedString = false;
        }
        // `c` was read one character past the literal; step back onto it
        --p;
        // Copy the literal text into stringbuffer without the '_' separators
        while (pstart < p)
        {
            if (*pstart != '_')
                stringbuffer.writeByte(*pstart);
            ++pstart;
        }
        stringbuffer.writeByte(0);
        auto sbufptr = cast(const(char)*)stringbuffer[].ptr;
        TOK result;
        bool isOutOfRange = false;
        t.floatvalue = (isWellformedString ? CTFloat.parse(sbufptr, &isOutOfRange) : CTFloat.zero);
        switch (*p)
        {
        case 'F':
        case 'f':
            if (isWellformedString && !isOutOfRange)
                isOutOfRange = Port.isFloat32LiteralOutOfRange(sbufptr);
            result = TOK.float32Literal;
            p++;
            break;
        default:
            if (isWellformedString && !isOutOfRange)
                isOutOfRange = Port.isFloat64LiteralOutOfRange(sbufptr);
            result = TOK.float64Literal;
            break;
        case 'l':
            error("use 'L' suffix instead of 'l'");
            goto case 'L';
        case 'L':
            result = TOK.float80Literal;
            p++;
            break;
        }
        if (*p == 'i' || *p == 'I')
        {
            if (*p == 'I')
                error("use 'i' suffix instead of 'I'");
            p++;
            switch (result)
            {
            case TOK.float32Literal:
                result = TOK.imaginary32Literal;
                break;
            case TOK.float64Literal:
                result = TOK.imaginary64Literal;
                break;
            case TOK.float80Literal:
                result = TOK.imaginary80Literal;
                break;
            default:
                break;
            }
        }
        const isLong = (result == TOK.float80Literal || result == TOK.imaginary80Literal);
        if (isOutOfRange && !isLong)
        {
            const char* suffix = (result == TOK.float32Literal || result == TOK.imaginary32Literal) ? "f" : "";
            error(scanloc, "number `%s%s` is not representable", sbufptr, suffix);
        }
        debug
        {
            switch (result)
            {
            case TOK.float32Literal:
            case TOK.float64Literal:
            case TOK.float80Literal:
            case TOK.imaginary32Literal:
            case TOK.imaginary64Literal:
            case TOK.imaginary80Literal:
                break;
            default:
                assert(0);
            }
        }
        return result;
    }

    /// Updates `scanloc.charnum` from the distance between `p` and `line`, then returns `scanloc`.
    final Loc loc() pure @nogc
    {
        scanloc.charnum = cast(uint)(1 + p - line);
        return scanloc;
    }

    /// Reports an error at `token.loc` through `handleDiagnostic`.
    final void error(const(char)* format, ...)
    {
        va_list args;
        va_start(args, format);
        handleDiagnostic(token.loc, Severity.error, format, args);
        va_end(args);
    }

    /// Reports an error at the given `loc` through `handleDiagnostic`.
    final void error(const ref Loc loc, const(char)* format, ...)
    {
        va_list args;
        va_start(args, format);
        handleDiagnostic(loc, Severity.error, format, args);
        va_end(args);
    }

    /// Like `error(loc, ...)`, but passes `true` as the final `handleDiagnostic`
    /// argument (presumably marking the message as supplemental -- confirm against handleDiagnostic).
    final void errorSupplemental(const ref Loc loc, const(char)* format, ...)
    {
        va_list args;
        va_start(args, format);
        handleDiagnostic(loc, Severity.error, format, args, true);
        va_end(args);
    }

    /// Reports a warning at the given `loc` through `handleDiagnostic`.
    final void warning(const ref Loc loc, const(char)* format, ...)
    {
        va_list args;
        va_start(args, format);
        handleDiagnostic(loc, Severity.warning, format, args);
        va_end(args);
    }

    /// Like `warning`, but passes `true` as the final `handleDiagnostic` argument.
    final void warningSupplemental(const ref Loc loc, const(char)* format, ...)
    {
        va_list args;
        va_start(args, format);
        handleDiagnostic(loc, Severity.warning, format, args, true);
        va_end(args);
    }

    /// Reports a deprecation at `token.loc` through `handleDiagnostic`.
    final void deprecation(const(char)* format, ...)
    {
        va_list args;
        va_start(args, format);
        handleDiagnostic(token.loc, Severity.deprecation, format, args);
        va_end(args);
    }

    /// Like `deprecation`, but passes `true` as the final `handleDiagnostic` argument.
    final void deprecationSupplemental(const(char)* format, ...)
    {
        va_list args;
        va_start(args, format);
        handleDiagnostic(token.loc, Severity.deprecation, format, args, true);
        va_end(args);
    }

    /*********************************************
     * parse:
     *      #line linnum [filespec]
     * also allow __LINE__ for linnum, and __FILE__ for filespec
     */
    private void poundLine()
    {
        auto linnum = this.scanloc.linnum;
        const(char)* filespec = null;
        const loc = this.loc();
        Token tok;
        scan(&tok);
        if (tok.value == TOK.int32Literal || tok.value == TOK.int64Literal)
        {
            // Stored one less than requested; Lnewline below does not consume the line end
            const lin = cast(int)(tok.unsvalue - 1);
            if (lin != tok.unsvalue - 1)
                error("line number `%lld` out of range", cast(ulong)tok.unsvalue);
            else
                linnum = lin;
        }
        else if (tok.value == TOK.line)
        {
            // `__LINE__`: keep the current line number
        }
        else
            goto Lerr;
        while (1)
        {
            switch (*p)
            {
            case 0:
            case 0x1A:
            case '\n':
            Lnewline:
                // Inside a token string the directive is parsed but not applied
                if (!inTokenStringConstant)
                {
                    this.scanloc.linnum = linnum;
                    if (filespec)
                        this.scanloc.filename = filespec;
                }
                return;
            case '\r':
                p++;
                if (*p != '\n')
                {
                    p--;
                    goto Lnewline;
                }
                continue;
            case ' ':
            case '\t':
            case '\v':
            case '\f':
                p++;
                continue; // skip white space
            case '_':
                if (memcmp(p, "__FILE__".ptr, 8) == 0)
                {
                    p += 8;
                    filespec = mem.xstrdup(scanloc.filename);
                    continue;
                }
                goto Lerr;
            case '"':
                if (filespec)
                    goto Lerr;
                stringbuffer.setsize(0);
                p++;
                while (1)
                {
                    uint c;
                    c = *p;
                    switch (c)
                    {
                    case '\n':
                    case '\r':
                    case 0:
                    case 0x1A:
                        goto Lerr;
                    case '"':
                        stringbuffer.writeByte(0);
                        filespec = mem.xstrdup(cast(const(char)*)stringbuffer[].ptr);
                        p++;
                        break;
                    default:
                        if (c & 0x80)
                        {
                            // decodeUTF() leaves `p` on the last byte of the sequence, so the
                            // whole code point must be written here. Writing only the lead
                            // byte `c` would drop the continuation bytes from the file name.
                            const u = decodeUTF();
                            if (u == PS || u == LS)
                                goto Lerr;
                            stringbuffer.writeUTF8(u);
                        }
                        else
                            stringbuffer.writeByte(c);
                        p++;
                        continue;
                    }
                    break;
                }
                continue;
            default:
                if (*p & 0x80)
                {
                    uint u = decodeUTF();
                    if (u == PS || u == LS)
                        goto Lnewline;
                }
                goto Lerr;
            }
        }
    Lerr:
        error(loc, "#line integer [\"filespec\"]\\n expected");
    }

    /********************************************
     * Decode UTF character.
     * Issue error messages for invalid sequences.
     * Return decoded character, advance p to last character in UTF sequence.
     */
    private uint decodeUTF()
    {
        const s = p;
        assert(*s & 0x80);
        // Check length of remaining string up to 4 UTF-8 characters
        size_t len;
        for (len = 1; len < 4 && s[len]; len++)
        {
        }
        size_t idx = 0;
        dchar u;
        const msg = utf_decodeChar(s[0 .. len], idx, u);
        // `idx` is the number of bytes consumed; stop on the last one, not past it
        p += idx - 1;
        if (msg)
        {
            error("%.*s", cast(int)msg.length, msg.ptr);
        }
        return u;
    }

    /***************************************************
     * Parse doc comment embedded between t.ptr and p.
     * Remove trailing blanks and tabs from lines.
     * Replace all newlines with \n.
     * Remove leading comment character from each line.
     * Decide if it's a lineComment or a blockComment.
     * Append to previous one for this token.
     *
     * If newParagraph is true, an extra newline will be
     * added between adjoining doc comments.
*/
    private void getDocComment(Token* t, uint lineComment, bool newParagraph) pure
    {
        /* ct tells us which kind of comment it is: '/', '*', or '+'
         */
        const ct = t.ptr[2];
        /* Start of comment text skips over / * *, / + +, or / / /
         */
        const(char)* q = t.ptr + 3; // start of comment text
        const(char)* qend = p;
        // Block comments end with a two-character terminator that is not comment text
        if (ct == '*' || ct == '+')
            qend -= 2;
        /* Scan over initial row of ****'s or ++++'s or ////'s
         */
        for (; q < qend; q++)
        {
            if (*q != ct)
                break;
        }
        /* Remove leading spaces until start of the comment
         */
        int linestart = 0;
        if (ct == '/')
        {
            while (q < qend && (*q == ' ' || *q == '\t'))
                ++q;
        }
        else if (q < qend)
        {
            // Block comment: skip one line ending directly after the opening row
            if (*q == '\r')
            {
                ++q;
                if (q < qend && *q == '\n')
                    ++q;
                linestart = 1;
            }
            else if (*q == '\n')
            {
                ++q;
                linestart = 1;
            }
        }
        /* Remove trailing row of ****'s or ++++'s
         */
        if (ct != '/')
        {
            for (; q < qend; qend--)
            {
                if (qend[-1] != ct)
                    break;
            }
        }
        /* Comment is now [q .. qend].
         * Canonicalize it into buf[].
         */
        OutBuffer buf;

        // Shrinks buf so that it no longer ends in spaces or tabs
        void trimTrailingWhitespace()
        {
            const s = buf[];
            auto len = s.length;
            while (len && (s[len - 1] == ' ' || s[len - 1] == '\t'))
                --len;
            buf.setsize(len);
        }

        for (; q < qend; q++)
        {
            char c = *q;
            switch (c)
            {
            case '*':
            case '+':
                // The first comment character on a line is a leader and is dropped
                if (linestart && c == ct)
                {
                    linestart = 0;
                    /* Trim preceding whitespace up to preceding \n
                     */
                    trimTrailingWhitespace();
                    continue;
                }
                break;
            case ' ':
            case '\t':
                break;
            case '\r':
                if (q[1] == '\n')
                    continue; // skip the \r
                goto Lnewline;
            default:
                if (c == 226)
                {
                    // If LS or PS
                    if (q[1] == 128 && (q[2] == 168 || q[2] == 169))
                    {
                        q += 2;
                        goto Lnewline;
                    }
                }
                linestart = 0;
                break;
            Lnewline:
                c = '\n'; // replace all newlines with \n
                goto case;
            case '\n':
                linestart = 1;
                /* Trim trailing whitespace
                 */
                trimTrailingWhitespace();
                break;
            }
            buf.writeByte(c);
        }
        /* Trim trailing whitespace (if the last line does not have newline)
         */
        trimTrailingWhitespace();

        // Always end with a newline
        const s = buf[];
        if (s.length == 0 || s[$ - 1] != '\n')
            buf.writeByte('\n');

        // It's a line comment if the start of the doc comment comes
        // after other non-whitespace on the same line.
        auto dc = (lineComment && anyToken) ? &t.lineComment : &t.blockComment;
        // Combine with previous doc comment, if any
        if (*dc)
            *dc = combineComments(*dc, buf[], newParagraph).toDString();
        else
            *dc = buf.extractSlice(true);
    }

    /********************************************
     * Combine two document comments into one,
     * separated by an extra newline if newParagraph is true.
*/
    static const(char)* combineComments(const(char)[] c1, const(char)[] c2, bool newParagraph) pure
    {
        //printf("Lexer::combineComments('%s', '%s', '%i')\n", c1, c2, newParagraph);
        const(int) newParagraphSize = newParagraph ? 1 : 0; // Size of the combining '\n'
        // A null comment on either side: return the other one unchanged
        if (!c1)
            return c2.ptr;
        if (!c2)
            return c1.ptr;

        // Make sure c1 ends with a newline before c2 is appended
        int insertNewLine = 0;
        if (c1.length && c1[$ - 1] != '\n')
            insertNewLine = 1;
        const retSize = c1.length + insertNewLine + newParagraphSize + c2.length;
        auto p = cast(char*)mem.xmalloc_noscan(retSize + 1); // +1 for the terminating 0
        p[0 .. c1.length] = c1[];
        if (insertNewLine)
            p[c1.length] = '\n';
        if (newParagraph)
            p[c1.length + insertNewLine] = '\n';
        p[retSize - c2.length .. retSize] = c2[];
        p[retSize] = 0;
        return p;
    }

private:
    /// Advances `scanloc.linnum` and records `p` as the start of the new line.
    void endOfLine() pure @nogc @safe
    {
        scanloc.linnum++;
        line = p;
    }
}

/// Support for `__DATE__`, `__TIME__`, and `__TIMESTAMP__`
private struct TimeStampInfo
{
    private __gshared bool initdone = false;

    // Note: Those properties need to be guarded by a call to `init`
    // The API isn't safe, and quite brittle, but it was left this way
    // over performance concerns.
    // This is currently only called once, from the lexer.
    __gshared char[11 + 1] date;
    __gshared char[8 + 1] time;
    __gshared char[24 + 1] timestamp;

    /// Fills `date`, `time` and `timestamp` on the first call; later calls return immediately.
    /// The time comes from the `SOURCE_DATE_EPOCH` environment variable when it is set,
    /// otherwise from the C `time` function. `loc` is used only for error reporting.
    public static void initialize(const ref Loc loc) nothrow
    {
        if (initdone)
            return;

        initdone = true;
        time_t ct;
        // https://issues.dlang.org/show_bug.cgi?id=20444
        if (auto p = getenv("SOURCE_DATE_EPOCH"))
        {
            if (!ct.parseDigits(p.toDString()))
                error(loc, "Value of environment variable `SOURCE_DATE_EPOCH` should be a valid UNIX timestamp, not: `%s`", p);
        }
        else
            .time(&ct);
        const p = ctime(&ct);
        assert(p);
        // The offsets below pick the fields out of the fixed-width string returned by ctime
        sprintf(&date[0], "%.6s %.4s", p + 4, p + 20);
        sprintf(&time[0], "%.8s", p + 11);
        sprintf(&timestamp[0], "%.24s", p);
    }
}

unittest
{
    import dmd.console;
    // Every sequence below is valid, so any diagnostic is a test failure
    nothrow bool assertDiagnosticHandler(const ref Loc loc, Color headerColor, const(char)* header,
                                   const(char)* format, va_list ap, const(char)* p1, const(char)* p2)
    {
        assert(0);
    }
    diagnosticHandler = &assertDiagnosticHandler;

    // Checks the decoded value and that the whole sequence was consumed
    static void test(T)(string sequence, T expected)
    {
        auto p = cast(const(char)*)sequence.ptr;
        assert(expected == Lexer.escapeSequence(Loc.initial, p));
        assert(p == sequence.ptr + sequence.length);
    }

    test(`'`, '\'');
    test(`"`, '"');
    test(`?`, '?');
    test(`\`, '\\');
    test(`0`, '\0');
    test(`a`, '\a');
    test(`b`, '\b');
    test(`f`, '\f');
    test(`n`, '\n');
    test(`r`, '\r');
    test(`t`, '\t');
    test(`v`, '\v');

    test(`x00`, 0x00);
    test(`xff`, 0xff);
    test(`xFF`, 0xff);
    test(`xa7`, 0xa7);
    test(`x3c`, 0x3c);
    test(`xe2`, 0xe2);

    test(`1`, '\1');
    test(`42`, '\42');
    test(`357`, '\357');

    test(`u1234`, '\u1234');
    test(`uf0e4`, '\uf0e4');

    test(`U0001f603`, '\U0001f603');

    // Named character entities
    test(`&quot;`, '"');
    test(`&lt;`, '<');
    test(`&gt;`, '>');

    diagnosticHandler = null;
}
unittest
{
    import dmd.console;
    string expected;
    bool gotError;

    // Compares the formatted diagnostic with `expected` and records that one was issued
    nothrow bool expectDiagnosticHandler(const ref Loc loc, Color headerColor, const(char)* header,
                                         const(char)* format, va_list ap, const(char)* p1, const(char)* p2)
    {
        assert(cast(Classification)headerColor == Classification.error);

        gotError = true;
        char[100] buffer = void;
        auto actual = buffer[0 .. vsprintf(buffer.ptr, format, ap)];
        assert(expected == actual);
        return true;
    }

    diagnosticHandler = &expectDiagnosticHandler;

    // Runs escapeSequence on an invalid sequence and checks the error text, the
    // returned value and how many characters were consumed. `global.errors` is
    // restored afterwards.
    void test(string sequence, string expectedError, dchar expectedReturnValue, uint expectedScanLength)
    {
        uint errors = global.errors;
        gotError = false;
        expected = expectedError;
        auto p = cast(const(char)*)sequence.ptr;
        auto actualReturnValue = Lexer.escapeSequence(Loc.initial, p);
        assert(gotError);
        assert(expectedReturnValue == actualReturnValue);

        auto actualScanLength = p - sequence.ptr;
        assert(expectedScanLength == actualScanLength);
        global.errors = errors;
    }

    test("c", `undefined escape sequence \c`, 'c', 1);
    test("!", `undefined escape sequence \!`, '!', 1);

    test("x1", `escape hex sequence has 1 hex digits instead of 2`, '\x01', 2);

    test("u1"  , `escape hex sequence has 1 hex digits instead of 4`,   0x1, 2);
    test("u12" , `escape hex sequence has 2 hex digits instead of 4`,  0x12, 3);
    test("u123", `escape hex sequence has 3 hex digits instead of 4`, 0x123, 4);

    test("U0"      , `escape hex sequence has 1 hex digits instead of 8`,       0x0, 2);
    test("U00"     , `escape hex sequence has 2 hex digits instead of 8`,      0x00, 3);
    test("U000"    , `escape hex sequence has 3 hex digits instead of 8`,     0x000, 4);
    test("U0000"   , `escape hex sequence has 4 hex digits instead of 8`,    0x0000, 5);
    test("U0001f"  , `escape hex sequence has 5 hex digits instead of 8`,   0x0001f, 6);
    test("U0001f6" , `escape hex sequence has 6 hex digits instead of 8`,  0x0001f6, 7);
    test("U0001f60", `escape hex sequence has 7 hex digits instead of 8`, 0x0001f60, 8);

    test("ud800"    , `invalid UTF character \U0000d800`, '?', 5);
    test("udfff"    , `invalid UTF character \U0000dfff`, '?', 5);
    test("U00110000", `invalid UTF character \U00110000`, '?', 9);

    test("xg0"      , `undefined escape hex sequence \xg`, 'g', 2);
    test("ug000"    , `undefined escape hex sequence \ug`, 'g', 2);
    test("Ug0000000", `undefined escape hex sequence \Ug`, 'g', 2);

    test("&BAD;", `unnamed character entity &BAD;`  , '?', 5);
    // NOTE(review): the input and message were restored from HTML-decoded text. The scan
    // length of 5 matches `&quot`; confirm the exact message against escapeSequence.
    test("&quot", `unterminated named entity &quot;`, '?', 5);

    test("400", `escape octal sequence \400 is larger than \377`, 0x100, 3);

    diagnosticHandler = null;
}