/**
 * Implements the lexical analyzer, which converts source code into lexical tokens.
 *
 * Specification: $(LINK2 https://dlang.org/spec/lex.html, Lexical)
 *
 * Copyright: Copyright (C) 1999-2020 by The D Language Foundation, All Rights Reserved
 * Authors: $(LINK2 http://www.digitalmars.com, Walter Bright)
 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 * Source: $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/lexer.d, _lexer.d)
 * Documentation: https://dlang.org/phobos/dmd_lexer.html
 * Coverage: https://codecov.io/gh/dlang/dmd/src/master/src/dmd/lexer.d
 */

module dmd.lexer;

import core.stdc.ctype;
import core.stdc.errno;
import core.stdc.stdarg;
import core.stdc.stdio;
import core.stdc.stdlib : getenv;
import core.stdc.string;
import core.stdc.time;

import dmd.diagnostic : DiagnosticHandler, Severity, DefaultDiagnosticHandler, DefaultDiagnosticReporter;
import dmd.entity;
import dmd.errors;
import dmd.globals;
import dmd.id;
import dmd.identifier;
import dmd.root.ctfloat;
import dmd.root.outbuffer;
import dmd.root.port;
import dmd.root.rmem;
import dmd.root.string;
import dmd.tokens;
import dmd.utf;
import dmd.utils;

nothrow:

private enum LS = 0x2028;       // UTF line separator
private enum PS = 0x2029;       // UTF paragraph separator

/********************************************
 * Do our own char maps.
 *
 * `cmtable` is a 256-entry table, built once by the function literal below,
 * that maps every byte value to a set of `CM*` classification flags. The
 * small `is*` predicates further down are single table look-ups.
 */
private static immutable cmtable = () {
    ubyte[256] table;
    foreach (const c; 0 .. table.length)
    {
        // Basic digit / identifier classes.
        if ('0' <= c && c <= '7')
            table[c] |= CMoctal;
        if (c_isxdigit(c))
            table[c] |= CMhex;
        if (c_isalnum(c) || c == '_')
            table[c] |= CMidchar;

        // Characters that make a numeric literal continue when they appear
        // directly after a leading '0' (CMzerosecond) or after a leading
        // '1'..'9' (CMdigitsecond); see the '0' and '1'..'9' cases of `scan`.
        switch (c)
        {
            case 'x': case 'X':
            case 'b': case 'B':
                table[c] |= CMzerosecond;
                break;

            case '0': .. case '9':
            case 'e': case 'E':
            case 'f': case 'F':
            case 'l': case 'L':
            case 'p': case 'P':
            case 'u': case 'U':
            case 'i':
            case '.':
            case '_':
                table[c] |= CMzerosecond | CMdigitsecond;
                break;

            default:
                break;
        }

        // CMsinglechar: 7-bit characters that may be the whole content of a
        // simple 'c' character literal (no escape, line end, EOF or quote).
        switch (c)
        {
            case '\\':
            case '\n':
            case '\r':
            case 0:
            case 0x1A:
            case '\'':
                break;
            default:
                if (!(c & 0x80))
                    table[c] |= CMsinglechar;
                break;
        }
    }
    return table;
}();

// Bit flags stored in `cmtable`.
private
{
    enum CMoctal  = 0x1;        // '0'..'7'
    enum CMhex    = 0x2;        // hexadecimal digit
    enum CMidchar = 0x4;        // ASCII letter, digit or '_'
    enum CMzerosecond = 0x8;    // continues a literal that starts with '0'
    enum CMdigitsecond = 0x10;  // continues a literal that starts with '1'..'9'
    enum CMsinglechar = 0x20;   // usable as a plain one-character literal
}

/// Returns: `true` if `c` is an octal digit ('0'..'7').
private bool isoctal(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMoctal) != 0;
}

/// Returns: `true` if `c` is a hexadecimal digit.
private bool ishex(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMhex) != 0;
}

/// Returns: `true` if `c` is an ASCII letter, an ASCII digit or '_'.
private bool isidchar(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMidchar) != 0;
}

/// Returns: `true` if `c`, found right after a leading '0', continues the numeric literal.
private bool isZeroSecond(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMzerosecond) != 0;
}

/// Returns: `true` if `c`, found right after a leading '1'..'9', continues the numeric literal.
private bool isDigitSecond(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMdigitsecond) != 0;
}

/// Returns: `true` if `c` can be the sole content of a simple character literal.
private bool issinglechar(const char c) pure @nogc @safe
{
    return (cmtable[c] & CMsinglechar) != 0;
}

/// ASCII-only hexadecimal digit test; used while building `cmtable`.
private bool c_isxdigit(const int c) pure @nogc @safe
{
    return (( c >= '0' && c <= '9') ||
            ( c >= 'a' && c <= 'f') ||
            ( c >= 'A' && c <= 'F'));
}

/// ASCII-only alphanumeric test; used while building `cmtable`.
private bool c_isalnum(const int c) pure @nogc @safe
{
    return (( c >= '0' && c <= '9') ||
            ( c >= 'a' && c <= 'z') ||
            ( c >= 'A' && c <= 'Z'));
}

unittest
{
    //printf("lexer.unittest\n");
    /* Not much here, just trying things out:
     * "int" must lex as TOK.int32, followed by TOK.endOfFile on every further call.
     */
    string text = "int"; // We rely on the implicit null-terminator
    DefaultDiagnosticHandler diagnosticHandler;
    scope Lexer lex1 = new Lexer(null, text.ptr, 0, text.length, 0, 0, diagnosticHandler.diagnosticHandler);
    TOK tok;
    tok = lex1.nextToken();
    diagnosticHandler.report();
    //printf("tok == %s, %d, %d\n", Token::toChars(tok), tok, TOK.int32);
    assert(tok == TOK.int32);
    tok = lex1.nextToken();
    diagnosticHandler.report();
    assert(tok == TOK.endOfFile);
    tok = lex1.nextToken();
    diagnosticHandler.report();
    assert(tok == TOK.endOfFile);
    tok = lex1.nextToken();
    diagnosticHandler.report();
    assert(tok == TOK.endOfFile);
}

unittest
{
    // We don't want to see Lexer error output during these tests.
    uint errors = global.startGagging();
    scope(exit) global.endGagging(errors);

    // Test malformed input: even malformed input should end in a TOK.endOfFile.
    static immutable char[][] testcases =
    [   // Testcase must end with 0 or 0x1A.
        [0], // not malformed, but pathological
        ['\'', 0],
        ['\'', 0x1A],
        ['{', '{', 'q', '{', 0],
        [0xFF, 0],
        [0xFF, 0x80, 0],
        [0xFF, 0xFF, 0],
        [0xFF, 0xFF, 0],
        ['x', '"', 0x1A],
    ];

    foreach (testcase; testcases)
    {
        DefaultDiagnosticHandler diagnosticHandler;
        scope Lexer lex2 = new Lexer(null, testcase.ptr, 0, testcase.length-1, 0, 0, diagnosticHandler.diagnosticHandler);
        TOK tok = lex2.nextToken();
        diagnosticHandler.report();
        // Bound the loop by the input length so a lexer that never reaches
        // end of file fails the assert below instead of hanging the test.
        size_t iterations = 1;
        while ((tok != TOK.endOfFile) && (iterations++ < testcase.length))
        {
            tok = lex2.nextToken();
        }
        assert(tok == TOK.endOfFile);
        // Once at end of file, the lexer must stay there.
        tok = lex2.nextToken();
        assert(tok == TOK.endOfFile);
    }
}

/***********************************************************
 * The lexical analyzer: walks a 0- or 0x1A-terminated source buffer and
 * produces `Token`s from it.
 */
class Lexer
{
    private __gshared OutBuffer stringbuffer;

    Loc scanloc;                // for error messages
    Loc prevloc;                // location of token before current

    const(char)* p;             // current character

    Token token;

    private
    {
        const(char)* base;      // pointer to start of buffer
        const(char)* end;       // pointer to last element of buffer
        const(char)* line;      // start of current line

        bool doDocComment;      // collect doc comment information
        bool anyToken;          // seen at least one token
        bool commentToken;      // comments are TOK.comment's
        int inTokenStringConstant; // can be larger than 1 when in nested q{} strings
        int lastDocLine;        // last line of previous doc comment

        Token* tokenFreelist;
        DiagnosticHandler handleDiagnostic;
        DefaultDiagnosticReporter diagnosticReporter;
    }

  nothrow:

    /*********************
     * Creates a Lexer for the source code base[begoffset..endoffset+1].
     * The last character, base[endoffset], must be null (0) or EOF (0x1A).
*
     * If the text at `begoffset` starts with `#!`, that whole first line is
     * skipped before any token is produced.
     *
     * Params:
     *  filename = used for error messages
     *  base = source code, must be terminated by a null (0) or EOF (0x1A) character
     *  begoffset = starting offset into base[]
     *  endoffset = the last offset to read into base[]
     *  doDocComment = handle documentation comments
     *  commentToken = comments become TOK.comment's
     *  handleDiagnostic = diagnostic handler
     */
    this(const(char)* filename, const(char)* base, size_t begoffset,
        size_t endoffset, bool doDocComment, bool commentToken,
        DiagnosticHandler handleDiagnostic) pure
    {
        //printf("Lexer::Lexer(%p,%d)\n",base,length);
        //printf("lexer.filename = %s\n", filename);
        scanloc = Loc(filename, 1, 1);
        token = Token.init;

        // Buffer bounds and scanning position.
        this.base = base;
        this.end = base + endoffset;
        p = base + begoffset;
        line = p;

        // Lexing options and bookkeeping.
        this.doDocComment = doDocComment;
        this.commentToken = commentToken;
        this.handleDiagnostic = handleDiagnostic;
        this.inTokenStringConstant = 0;
        this.lastDocLine = 0;

        //initKeywords();
        /* If first line starts with '#!', ignore the line
         */
        if (!p || p[0] != '#' || p[1] != '!')
            return;

        p += 2;
        for (;; p++)
        {
            const ch = *p;
            if (ch == 0 || ch == 0x1A)
                break;          // leave `p` on the terminator
            if (ch == '\n')
            {
                p++;            // consume the newline
                break;
            }
        }
        endOfLine();
    }

    /// Returns: a `Token` taken from the freelist if one is available, otherwise a newly allocated one.
    Token* allocateToken() pure nothrow @safe
    {
        if (tokenFreelist is null)
            return new Token();

        // Pop the head of the freelist and detach it from the chain.
        Token* recycled = tokenFreelist;
        tokenFreelist = recycled.next;
        recycled.next = null;
        return recycled;
    }

    /// Frees the given token by returning it to the freelist.
private void releaseToken(Token* token) pure nothrow @nogc @safe
    {
        // Wipe the token before it is parked on the freelist when the GC is in use.
        if (mem.isGCEnabled)
            *token = Token.init;
        token.next = tokenFreelist;
        tokenFreelist = token;
    }

    /***********************
     * Advance to the next token.
     *
     * If a look-ahead token is already chained on `token.next` (see `peek`),
     * it is copied into `token` and its storage is returned to the freelist;
     * otherwise a fresh token is scanned from the buffer.
     *
     * Returns: the `value` of the new current token
     */
    TOK nextToken()
    {
        prevloc = token.loc;
        if (token.next)
        {
            Token* t = token.next;
            memcpy(&token, t, Token.sizeof);
            releaseToken(t);
        }
        else
        {
            scan(&token);
        }
        //printf(token.toChars());
        return token.value;
    }

    /***********************
     * Look ahead at next token's value.
     */
    final TOK peekNext()
    {
        return peek(&token).value;
    }

    /***********************
     * Look 2 tokens ahead at value.
     */
    final TOK peekNext2()
    {
        Token* t = peek(&token);
        return peek(t).value;
    }

    /****************************
     * Turn next token in buffer into a token.
     *
     * White space is skipped. Comments are skipped too unless `commentToken`
     * is set, in which case each comment is returned as a TOK.comment.
     * At the terminating 0 / 0x1A, TOK.endOfFile is produced without
     * advancing `p`.
     *
     * Params:
     *  t = the token to fill in
     */
    final void scan(Token* t)
    {
        const lastLine = scanloc.linnum;
        Loc startLoc;
        t.blockComment = null;
        t.lineComment = null;

        while (1)
        {
            t.ptr = p;
            //printf("p = %p, *p = '%c'\n",p,*p);
            t.loc = loc();
            switch (*p)
            {
            case 0:
            case 0x1A:
                t.value = TOK.endOfFile; // end of file
                // Intentionally not advancing `p`, such that subsequent calls keep returning TOK.endOfFile.
                return;
            case ' ':
            case '\t':
            case '\v':
            case '\f':
                p++;
                continue; // skip white space
            case '\r':
                p++;
                if (*p != '\n') // if CR stands by itself
                    endOfLine();
                continue; // skip white space
            case '\n':
                p++;
                endOfLine();
                continue; // skip white space
            case '0':
                // Fast path: a lone "0"
                if (!isZeroSecond(p[1])) // if numeric literal does not continue
                {
                    ++p;
                    t.unsvalue = 0;
                    t.value = TOK.int32Literal;
                    return;
                }
                goto Lnumber;

            case '1': .. case '9':
                // Fast path: a single decimal digit
                if (!isDigitSecond(p[1])) // if numeric literal does not continue
                {
                    t.unsvalue = *p - '0';
                    ++p;
                    t.value = TOK.int32Literal;
                    return;
                }
            Lnumber:
                t.value = number(t);
                return;

            case '\'':
                if (issinglechar(p[1]) && p[2] == '\'')
                {
                    t.unsvalue = p[1]; // simple one character literal
                    t.value = TOK.charLiteral;
                    p += 3;
                }
                else
                    t.value = charConstant(t);
                return;
            case 'r':
                // r"..." is a wysiwyg string; anything else starting with 'r' is an identifier
                if (p[1] != '"')
                    goto case_ident;
                p++;
                goto case '`';
            case '`':
                wysiwygStringConstant(t);
                return;
            case 'x':
                // x"..." hex string: still lexed, but always reported as obsolete
                if (p[1] != '"')
                    goto case_ident;
                p++;
                auto start = p;
                auto hexString = new OutBuffer();
                t.value = hexStringConstant(t);
                hexString.write(start[0 .. p - start]);
                error("Built-in hex string literals are obsolete, use `std.conv.hexString!%s` instead.", hexString.extractChars());
                return;
            case 'q':
                if (p[1] == '"')
                {
                    p++;
                    delimitedStringConstant(t);
                    return;
                }
                else if (p[1] == '{')
                {
                    p++;
                    tokenStringConstant(t);
                    return;
                }
                else
                    goto case_ident;
            case '"':
                escapeStringConstant(t);
                return;
            case 'a':
            case 'b':
            case 'c':
            case 'd':
            case 'e':
            case 'f':
            case 'g':
            case 'h':
            case 'i':
            case 'j':
            case 'k':
            case 'l':
            case 'm':
            case 'n':
            case 'o':
            case 'p':
                /*case 'q': case 'r':*/
            case 's':
            case 't':
            case 'u':
            case 'v':
            case 'w':
                /*case 'x':*/
            case 'y':
            case 'z':
            case 'A':
            case 'B':
            case 'C':
            case 'D':
            case 'E':
            case 'F':
            case 'G':
            case 'H':
            case 'I':
            case 'J':
            case 'K':
            case 'L':
            case 'M':
            case 'N':
            case 'O':
            case 'P':
            case 'Q':
            case 'R':
            case 'S':
            case 'T':
            case 'U':
            case 'V':
            case 'W':
            case 'X':
            case 'Y':
            case 'Z':
            case '_':
            case_ident:
                {
                    // Consume the rest of the identifier (ASCII id chars or Unicode alphas)
                    while (1)
                    {
                        const c = *++p;
                        if (isidchar(c))
                            continue;
                        else if (c & 0x80)
                        {
                            const s = p;
                            const u = decodeUTF();
                            if (isUniAlpha(u))
                                continue;
                            error("char 0x%04x not allowed in identifier", u);
                            p = s; // rewind to the start of the rejected character
                        }
                        break;
                    }
                    Identifier id = Identifier.idPool(cast(char*)t.ptr, cast(uint)(p - t.ptr));
                    t.ident = id;
                    t.value = cast(TOK)id.getValue();
                    anyToken = 1;
                    if (*t.ptr == '_') // if special identifier token
                    {
                        // Lazy initialization
                        TimeStampInfo.initialize(t.loc);

                        if (id == Id.DATE)
                        {
                            t.ustring = TimeStampInfo.date.ptr;
                            goto Lstr;
                        }
                        else if (id == Id.TIME)
                        {
                            t.ustring = TimeStampInfo.time.ptr;
                            goto Lstr;
                        }
                        else if (id == Id.VENDOR)
                        {
                            t.ustring = global.vendor.xarraydup.ptr;
                            goto Lstr;
                        }
                        else if (id == Id.TIMESTAMP)
                        {
                            t.ustring = TimeStampInfo.timestamp.ptr;
                        Lstr:
                            // Shared tail: the special identifier becomes a string literal token
                            t.value = TOK.string_;
                            t.postfix = 0;
                            t.len = cast(uint)strlen(t.ustring);
                        }
                        else if (id == Id.VERSIONX)
                        {
                            t.value = TOK.int64Literal;
                            t.unsvalue = global.versionNumber();
                        }
                        else if (id == Id.EOFX)
                        {
                            t.value = TOK.endOfFile;
                            // Advance scanner to end of file
                            while (!(*p == 0 || *p == 0x1A))
                                p++;
                        }
                    }
                    //printf("t.value = %d\n",t.value);
                    return;
                }
            case '/':
                p++;
                switch (*p)
                {
                case '=':
                    p++;
                    t.value = TOK.divAssign;
                    return;
                case '*':
                    // Block comment: /* ... */
                    p++;
                    startLoc = loc();
                    while (1)
                    {
                        // Inner loop: run up to the next '/'
                        while (1)
                        {
                            const c = *p;
                            switch (c)
                            {
                            case '/':
                                break;
                            case '\n':
                                endOfLine();
                                p++;
                                continue;
                            case '\r':
                                p++;
                                if (*p != '\n')
                                    endOfLine();
                                continue;
                            case 0:
                            case 0x1A:
                                error("unterminated /* */ comment");
                                p = end;
                                t.loc = loc();
                                t.value = TOK.endOfFile;
                                return;
                            default:
                                if (c & 0x80)
                                {
                                    const u = decodeUTF();
                                    if (u == PS || u == LS)
                                        endOfLine();
                                }
                                p++;
                                continue;
                            }
                            break;
                        }
                        p++;
                        // The '/' closes the comment only if preceded by '*',
                        // and that '*' is not the one that opened it ("/*/").
                        if (p[-2] == '*' && p - 3 != t.ptr)
                            break;
                    }
                    if (commentToken)
                    {
                        t.loc = startLoc;
                        t.value = TOK.comment;
                        return;
                    }
                    else if (doDocComment && t.ptr[2] == '*' && p - 4 != t.ptr)
                    {
                        // if /** but not /**/
                        getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                        lastDocLine = scanloc.linnum;
                    }
                    continue;
                case '/': // do // style comments
                    startLoc = loc();
                    while (1)
                    {
                        const c = *++p;
                        switch (c)
                        {
                        case '\n':
                            break;
                        case '\r':
                            if (p[1] == '\n')
                                p++;
                            break;
                        case 0:
                        case 0x1A:
                            // Comment runs to end of file
                            if (commentToken)
                            {
                                p = end;
                                t.loc = startLoc;
                                t.value = TOK.comment;
                                return;
                            }
                            if (doDocComment && t.ptr[2] == '/')
                            {
                                getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                                lastDocLine = scanloc.linnum;
                            }
                            p = end;
                            t.loc = loc();
                            t.value = TOK.endOfFile;
                            return;
                        default:
                            if (c & 0x80)
                            {
                                const u = decodeUTF();
                                if (u == PS || u == LS)
                                    break;
                            }
                            continue;
                        }
                        break;
                    }
                    if (commentToken)
                    {
                        p++;
                        endOfLine();
                        t.loc = startLoc;
                        t.value = TOK.comment;
                        return;
                    }
                    if (doDocComment && t.ptr[2] == '/')
                    {
                        getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                        lastDocLine = scanloc.linnum;
                    }
                    p++;
                    endOfLine();
                    continue;
                case '+':
                    {
                        // Nesting comment: /+ ... +/
                        int nest;
                        startLoc = loc();
                        p++;
                        nest = 1;
                        while (1)
                        {
                            char c = *p;
                            switch (c)
                            {
                            case '/':
                                p++;
                                if (*p == '+')
                                {
                                    p++;
                                    nest++;
                                }
                                continue;
                            case '+':
                                p++;
                                if (*p == '/')
                                {
                                    p++;
                                    if (--nest == 0)
                                        break;
                                }
                                continue;
                            case '\r':
                                p++;
                                if (*p != '\n')
                                    endOfLine();
                                continue;
                            case '\n':
                                endOfLine();
                                p++;
                                continue;
                            case 0:
                            case 0x1A:
                                error("unterminated /+ +/ comment");
                                p = end;
                                t.loc = loc();
                                t.value = TOK.endOfFile;
                                return;
                            default:
                                if (c & 0x80)
                                {
                                    uint u = decodeUTF();
                                    if (u == PS || u == LS)
                                        endOfLine();
                                }
                                p++;
                                continue;
                            }
                            break;
                        }
                        if (commentToken)
                        {
                            t.loc = startLoc;
                            t.value = TOK.comment;
                            return;
                        }
                        if (doDocComment && t.ptr[2] == '+' && p - 4 != t.ptr)
                        {
                            // if /++ but not /++/
                            getDocComment(t, lastLine == startLoc.linnum, startLoc.linnum - lastDocLine > 1);
                            lastDocLine = scanloc.linnum;
                        }
                        continue;
                    }
                default:
                    break;
                }
                t.value = TOK.div;
                return;
            case '.':
                p++;
                if (isdigit(*p))
                {
                    /* Note that we don't allow ._1 and ._ as being
                     * valid floating point numbers.
                     */
                    p--;
                    t.value = inreal(t);
                }
                else if (p[0] == '.')
                {
                    if (p[1] == '.')
                    {
                        p += 2;
                        t.value = TOK.dotDotDot;
                    }
                    else
                    {
                        p++;
                        t.value = TOK.slice;
                    }
                }
                else
                    t.value = TOK.dot;
                return;
            case '&':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.andAssign;
                }
                else if (*p == '&')
                {
                    p++;
                    t.value = TOK.andAnd;
                }
                else
                    t.value = TOK.and;
                return;
            case '|':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.orAssign;
                }
                else if (*p == '|')
                {
                    p++;
                    t.value = TOK.orOr;
                }
                else
                    t.value = TOK.or;
                return;
            case '-':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.minAssign;
                }
                else if (*p == '-')
                {
                    p++;
                    t.value = TOK.minusMinus;
                }
                else
                    t.value = TOK.min;
                return;
            case '+':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.addAssign;
                }
                else if (*p == '+')
                {
                    p++;
                    t.value = TOK.plusPlus;
                }
                else
                    t.value = TOK.add;
                return;
            case '<':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.lessOrEqual; // <=
                }
                else if (*p == '<')
                {
                    p++;
                    if (*p == '=')
                    {
                        p++;
                        t.value = TOK.leftShiftAssign; // <<=
                    }
                    else
                        t.value = TOK.leftShift; // <<
                }
                else
                    t.value = TOK.lessThan; // <
                return;
            case '>':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.greaterOrEqual; // >=
                }
                else if (*p == '>')
                {
                    p++;
                    if (*p == '=')
                    {
                        p++;
                        t.value = TOK.rightShiftAssign; // >>=
                    }
                    else if (*p == '>')
                    {
                        p++;
                        if (*p == '=')
                        {
                            p++;
                            t.value = TOK.unsignedRightShiftAssign; // >>>=
                        }
                        else
                            t.value = TOK.unsignedRightShift; // >>>
                    }
                    else
                        t.value = TOK.rightShift; // >>
                }
                else
                    t.value = TOK.greaterThan; // >
                return;
            case '!':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.notEqual; // !=
                }
                else
                    t.value = TOK.not; // !
                return;
            case '=':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.equal; // ==
                }
                else if (*p == '>')
                {
                    p++;
                    t.value = TOK.goesTo; // =>
                }
                else
                    t.value = TOK.assign; // =
                return;
            case '~':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.concatenateAssign; // ~=
                }
                else
                    t.value = TOK.tilde; // ~
                return;
            case '^':
                p++;
                if (*p == '^')
                {
                    p++;
                    if (*p == '=')
                    {
                        p++;
                        t.value = TOK.powAssign; // ^^=
                    }
                    else
                        t.value = TOK.pow; // ^^
                }
                else if (*p == '=')
                {
                    p++;
                    t.value = TOK.xorAssign; // ^=
                }
                else
                    t.value = TOK.xor; // ^
                return;
            case '(':
                p++;
                t.value = TOK.leftParentheses;
                return;
            case ')':
                p++;
                t.value = TOK.rightParentheses;
                return;
            case '[':
                p++;
                t.value = TOK.leftBracket;
                return;
            case ']':
                p++;
                t.value = TOK.rightBracket;
                return;
            case '{':
                p++;
                t.value = TOK.leftCurly;
                return;
            case '}':
                p++;
                t.value = TOK.rightCurly;
                return;
            case '?':
                p++;
                t.value = TOK.question;
                return;
            case ',':
                p++;
                t.value = TOK.comma;
                return;
            case ';':
                p++;
                t.value = TOK.semicolon;
                return;
            case ':':
                p++;
                t.value = TOK.colon;
                return;
            case '$':
                p++;
                t.value = TOK.dollar;
                return;
            case '@':
                p++;
                t.value = TOK.at;
                return;
            case '*':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.mulAssign;
                }
                else
                    t.value = TOK.mul;
                return;
            case '%':
                p++;
                if (*p == '=')
                {
                    p++;
                    t.value = TOK.modAssign;
                }
                else
                    t.value = TOK.mod;
                return;
            case '#':
                {
                    // Scan the token after '#' to recognise `#line` and
                    // diagnose C preprocessor directives.
                    p++;
                    Token n;
                    scan(&n);
                    if (n.value == TOK.identifier)
                    {
                        if (n.ident == Id.line)
                        {
                            poundLine();
                            continue;
                        }
                        else
                        {
                            const locx = loc();
                            warning(locx, "C preprocessor directive `#%s` is not supported", n.ident.toChars());
                        }
                    }
                    else if (n.value == TOK.if_)
                    {
                        error("C preprocessor directive `#if` is not supported, use `version` or `static if`");
                    }
                    t.value = TOK.pound;
                    return;
                }
            default:
                {
                    dchar c = *p;
                    if (c & 0x80)
                    {
                        c = decodeUTF();
                        // Check for start of unicode identifier
                        if (isUniAlpha(c))
                            goto case_ident;
                        if (c == PS || c == LS)
                        {
                            endOfLine();
                            p++;
                            continue;
                        }
                    }
                    // Anything else is not a token: report it, skip it and keep scanning
                    if (c < 0x80 && isprint(c))
                        error("character '%c' is not a valid token", c);
                    else
                        error("character 0x%02x is not a valid token", c);
                    p++;
                    continue;
                }
            }
        }
    }

    /***********************
     * Look ahead one token past `ct`.
     *
     * If `ct` has no successor yet, one is obtained from `allocateToken`,
     * scanned from the buffer and linked in as `ct.next`.
     *
     * Params:
     *  ct = the token to look past
     * Returns: the token following `ct`
     */
    final Token* peek(Token* ct)
    {
        Token* t;
        if (ct.next)
            t = ct.next;
        else
        {
            t = allocateToken();
            scan(t);
            ct.next = t;
        }
        return t;
    }

    /*********************************
     * tk is on the opening (.
     * Look ahead and return token that is past the closing ).
*/
    final Token* peekPastParen(Token* tk)
    {
        int parenDepth = 1;     // the opening ( that tk sits on
        int braceDepth = 0;
        for (;;)
        {
            tk = peek(tk);
            const kind = tk.value;

            if (kind == TOK.leftParentheses)
            {
                ++parenDepth;
            }
            else if (kind == TOK.rightParentheses)
            {
                // matching ) found: the answer is the token right after it
                if (--parenDepth == 0)
                    return peek(tk);
            }
            else if (kind == TOK.leftCurly)
            {
                ++braceDepth;
            }
            else if (kind == TOK.rightCurly)
            {
                // a } that closes more braces than were opened ends the search
                if (--braceDepth < 0)
                    return tk;
            }
            else if (kind == TOK.semicolon)
            {
                // a ; outside of any { } ends the search
                if (braceDepth == 0)
                    return tk;
            }
            else if (kind == TOK.endOfFile)
            {
                return tk;
            }
        }
    }

    /*******************************************
     * Parse the escape sequence at the current position `p`,
     * reporting errors at the location of the current token.
     */
    private uint escapeSequence()
    {
        return Lexer.escapeSequence(token.loc, p);
    }

    /**
    Parse the given string literal escape sequence into a single character.
    Params:
        loc = the location of the current token
        sequence = pointer to string with escape sequence to parse.
this is a reference
                   variable that is also used to return the position after the sequence.
                   It is never moved past a terminating 0 or 0x1A character
    Returns:
        the escaped sequence as a single character
    */
    private static dchar escapeSequence(const ref Loc loc, ref const(char)* sequence)
    {
        const(char)* p = sequence; // cache sequence reference on stack
        scope(exit) sequence = p;

        uint c = *p;
        int ndigits;
        switch (c)
        {
        case '\'':
        case '"':
        case '?':
        case '\\':
        Lconsume:
            p++;
            break;
        case 'a':
            c = 7;
            goto Lconsume;
        case 'b':
            c = 8;
            goto Lconsume;
        case 'f':
            c = 12;
            goto Lconsume;
        case 'n':
            c = 10;
            goto Lconsume;
        case 'r':
            c = 13;
            goto Lconsume;
        case 't':
            c = 9;
            goto Lconsume;
        case 'v':
            c = 11;
            goto Lconsume;
        case 'u':
            ndigits = 4;
            goto Lhex;
        case 'U':
            ndigits = 8;
            goto Lhex;
        case 'x':
            ndigits = 2;
        Lhex:
            // \xHH, \uHHHH or \UHHHHHHHH: exactly `ndigits` hex digits are expected
            p++;
            c = *p;
            if (ishex(cast(char)c))
            {
                uint v = 0;
                int n = 0;
                while (1)
                {
                    if (isdigit(cast(char)c))
                        c -= '0';
                    else if (islower(c))
                        c -= 'a' - 10;
                    else
                        c -= 'A' - 10;
                    v = v * 16 + c;
                    c = *++p;
                    if (++n == ndigits)
                        break;
                    if (!ishex(cast(char)c))
                    {
                        .error(loc, "escape hex sequence has %d hex digits instead of %d", n, ndigits);
                        break;
                    }
                }
                // \x denotes a raw byte; only \u and \U must be valid code points
                if (ndigits != 2 && !utf_isValidDchar(v))
                {
                    .error(loc, "invalid UTF character \\U%08x", v);
                    v = '?'; // recover with valid UTF character
                }
                c = v;
            }
            else
            {
                .error(loc, "undefined escape hex sequence \\%c%c", sequence[0], c);
                // Skip the offending character, but never step over the
                // terminating 0 / 0x1A: the source buffer ends there and the
                // caller must still be able to see the end of file.
                if (c != 0 && c != 0x1A)
                    p++;
            }
            break;
        case '&':
            // named character entity
            for (const idstart = ++p; 1; p++)
            {
                switch (*p)
                {
                case ';':
                    c = HtmlNamedEntity(idstart, p - idstart);
                    if (c == ~0)
                    {
                        .error(loc, "unnamed character entity &%.*s;", cast(int)(p - idstart), idstart);
                        c = '?';
                    }
                    p++;
                    break;
                default:
                    // entity names start with a letter, then letters or digits
                    if (isalpha(*p) || (p != idstart && isdigit(*p)))
                        continue;
                    .error(loc, "unterminated named entity &%.*s;", cast(int)(p - idstart + 1), idstart);
                    c = '?';
                    break;
                }
                break;
            }
            break;
        case 0:
        case 0x1A:
            // end of file
            c = '\\';
            break;
        default:
            if (isoctal(cast(char)c))
            {
                // up to three octal digits
                uint v = 0;
                int n = 0;
                do
                {
                    v = v * 8 + (c - '0');
                    c = *++p;
                }
                while (++n < 3 && isoctal(cast(char)c));
                c = v;
                if (c > 0xFF)
                    .error(loc, "escape octal sequence \\%03o is larger than \\377", c);
            }
            else
            {
                .error(loc, "undefined escape sequence \\%c", c);
                p++;
            }
            break;
        }
        return c;
    }

    /**
    Lex a wysiwyg string. `p` must be pointing to the first character before the
    contents of the string literal. The character pointed to by `p` will be used as
    the terminating character (i.e. backtick or double-quote).
Params:
        result = pointer to the token that accepts the result
    */
    private void wysiwygStringConstant(Token* result)
    {
        result.value = TOK.string_;
        Loc start = loc();
        auto terminator = p[0];
        p++;
        stringbuffer.setsize(0);
        while (1)
        {
            dchar c = p[0];
            p++;
            switch (c)
            {
            case '\n':
                endOfLine();
                break;
            case '\r':
                if (p[0] == '\n')
                    continue; // ignore
                c = '\n'; // treat EndOfLine as \n character
                endOfLine();
                break;
            case 0:
            case 0x1A:
                error("unterminated string constant starting at %s", start.toChars());
                result.setString();
                // rewind `p` so it points to the EOF character
                p--;
                return;
            default:
                if (c == terminator)
                {
                    result.setString(stringbuffer);
                    stringPostfix(result);
                    return;
                }
                else if (c & 0x80)
                {
                    // multi-byte character: decode it and store it as UTF-8
                    p--;
                    const u = decodeUTF();
                    p++;
                    if (u == PS || u == LS)
                        endOfLine();
                    stringbuffer.writeUTF8(u);
                    continue;
                }
                break;
            }
            stringbuffer.writeByte(c);
        }
    }

    /**************************************
     * Lex hex strings:
     *      x"0A ae 34FE BD"
     *
     * `p` must point to the opening double-quote. White space and line
     * terminators (including the Unicode LS and PS characters) between the
     * hex digits are skipped; every pair of hex digits yields one byte.
     *
     * Params:
     *  t = the token that receives the resulting string
     * Returns: TOK.hexadecimalString
     */
    private TOK hexStringConstant(Token* t)
    {
        Loc start = loc();
        uint n = 0;         // number of hex digits seen so far
        uint v = ~0;        // dead assignment, needed to suppress warning
        p++;
        stringbuffer.setsize(0);
        while (1)
        {
            dchar c = *p++;
            switch (c)
            {
            case ' ':
            case '\t':
            case '\v':
            case '\f':
                continue; // skip white space
            case '\r':
                if (*p == '\n')
                    continue; // ignore '\r' if followed by '\n'
                // Treat isolated '\r' as if it were a '\n'
                goto case '\n';
            case '\n':
                endOfLine();
                continue;
            case 0:
            case 0x1A:
                error("unterminated string constant starting at %s", start.toChars());
                t.setString();
                // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
                p--;
                return TOK.hexadecimalString;
            case '"':
                if (n & 1)
                {
                    error("odd number (%d) of hex characters in hex string", n);
                    stringbuffer.writeByte(v);
                }
                t.setString(stringbuffer);
                stringPostfix(t);
                return TOK.hexadecimalString;
            default:
                if (c >= '0' && c <= '9')
                    c -= '0';
                else if (c >= 'a' && c <= 'f')
                    c -= 'a' - 10;
                else if (c >= 'A' && c <= 'F')
                    c -= 'A' - 10;
                else if (c & 0x80)
                {
                    p--;
                    const u = decodeUTF();
                    p++;
                    if (u == PS || u == LS)
                    {
                        // A Unicode line terminator is white space, exactly
                        // like '\n' above: it must not be counted as a hex
                        // digit nor contribute a byte to the result.
                        endOfLine();
                        continue;
                    }
                    error("non-hex character \\u%04x in hex string", u);
                }
                else
                    error("non-hex character '%c' in hex string", c);
                // Combine two consecutive nibbles into one byte
                if (n & 1)
                {
                    v = (v << 4) | c;
                    stringbuffer.writeByte(v);
                }
                else
                    v = c;
                n++;
                break;
            }
        }
        assert(0); // see bug 15731
    }

    /**
    Lex a delimited string. Some examples of delimited strings are:
    ---
    q"(foo(xxx))"      // "foo(xxx)"
    q"[foo$(LPAREN)]"  // "foo$(LPAREN)"
    q"/foo]/"          // "foo]"
    q"HERE
    foo
    HERE"              // "foo\n"
    ---
    It is assumed that `p` points to the opening double-quote '"'.
Params:
        result = pointer to the token that accepts the result
    */
    private void delimitedStringConstant(Token* result)
    {
        result.value = TOK.string_;
        Loc start = loc();
        dchar delimleft = 0;        // opening delimiter; 0 until the first character was seen
        dchar delimright = 0;       // character that closes the string
        uint nest = 1;              // 1 if the delimiters nest: ( ) { } [ ] < >
        uint nestcount = ~0;        // dead assignment, needed to suppress warning
        Identifier hereid = null;   // heredoc identifier, if the string is a heredoc
        uint blankrol = 0;          // set while the rest of the heredoc opening line must be blank
        uint startline = 0;         // set when positioned at the start of a line
        p++;
        stringbuffer.setsize(0);
        while (1)
        {
            dchar c = *p++;
            //printf("c = '%c'\n", c);
            switch (c)
            {
            case '\n':
            Lnextline:
                endOfLine();
                startline = 1;
                if (blankrol)
                {
                    // end of the heredoc opening line: not part of the string
                    blankrol = 0;
                    continue;
                }
                if (hereid)
                {
                    stringbuffer.writeUTF8(c);
                    continue;
                }
                break;
            case '\r':
                if (*p == '\n')
                    continue; // ignore
                c = '\n'; // treat EndOfLine as \n character
                goto Lnextline;
            case 0:
            case 0x1A:
                error("unterminated delimited string constant starting at %s", start.toChars());
                result.setString();
                // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
                p--;
                return;
            default:
                if (c & 0x80)
                {
                    p--;
                    c = decodeUTF();
                    p++;
                    if (c == PS || c == LS)
                        goto Lnextline;
                }
                break;
            }
            if (delimleft == 0)
            {
                // First character after q": it selects the delimiter
                delimleft = c;
                nest = 1;
                nestcount = 1;
                if (c == '(')
                    delimright = ')';
                else if (c == '{')
                    delimright = '}';
                else if (c == '[')
                    delimright = ']';
                else if (c == '<')
                    delimright = '>';
                else if (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c)))
                {
                    // Start of identifier; must be a heredoc
                    Token tok;
                    p--;
                    scan(&tok); // read in heredoc identifier
                    if (tok.value != TOK.identifier)
                    {
                        error("identifier expected for heredoc, not %s", tok.toChars());
                        delimright = c;
                    }
                    else
                    {
                        hereid = tok.ident;
                        //printf("hereid = '%s'\n", hereid.toChars());
                        blankrol = 1;
                    }
                    nest = 0;
                }
                else
                {
                    // Any other character delimits the string by itself, without nesting
                    delimright = c;
                    nest = 0;
                    if (isspace(c))
                        error("delimiter cannot be whitespace");
                }
            }
            else
            {
                if (blankrol)
                {
                    error("heredoc rest of line should be blank");
                    blankrol = 0;
                    continue;
                }
                if (nest == 1)
                {
                    if (c == delimleft)
                        nestcount++;
                    else if (c == delimright)
                    {
                        nestcount--;
                        if (nestcount == 0)
                            goto Ldone;
                    }
                }
                else if (c == delimright)
                    goto Ldone;
                if (startline && (isalpha(c) || c == '_' || (c >= 0x80 && isUniAlpha(c))) && hereid)
                {
                    // An identifier at the start of a line may be the closing heredoc identifier
                    Token tok;
                    auto psave = p;
                    p--;
                    scan(&tok); // read in possible heredoc identifier
                    //printf("endid = '%s'\n", tok.ident.toChars());
                    if (tok.value == TOK.identifier && tok.ident is hereid)
                    {
                        /* should check that rest of line is blank
                         */
                        goto Ldone;
                    }
                    p = psave; // not the terminator: resume right after `c`
                }
                stringbuffer.writeUTF8(c);
                startline = 0;
            }
        }
    Ldone:
        // The closing delimiter must be directly followed by '"'
        if (*p == '"')
            p++;
        else if (hereid)
            error("delimited string must end in %s\"", hereid.toChars());
        else
            error("delimited string must end in %c\"", delimright);
        result.setString(stringbuffer);
        stringPostfix(result);
    }

    /**
    Lex a token string. Some examples of token strings are:
    ---
    q{ foo(xxx) }    // " foo(xxx) "
    q{foo$(LPAREN)}  // "foo$(LPAREN)"
    q{{foo}"}"}      // "{foo}"}""
    ---
    It is assumed that `p` points to the opening curly-brace '{'.
    The content is scanned token by token only to find the matching closing
    brace; the string value is the raw source text between the braces.
    Params:
        result = pointer to the token that accepts the result
    */
    private void tokenStringConstant(Token* result)
    {
        result.value = TOK.string_;

        uint nest = 1;
        const start = loc();
        const pstart = ++p;
        inTokenStringConstant++;
        scope(exit) inTokenStringConstant--;
        while (1)
        {
            Token tok;
            scan(&tok);
            switch (tok.value)
            {
            case TOK.leftCurly:
                nest++;
                continue;
            case TOK.rightCurly:
                if (--nest == 0)
                {
                    // `p` is just past the closing '}', which is excluded from the string
                    result.setString(pstart, p - 1 - pstart);
                    stringPostfix(result);
                    return;
                }
                continue;
            case TOK.endOfFile:
                error("unterminated token string constant starting at %s", start.toChars());
                result.setString();
                return;
            default:
                continue;
            }
        }
    }

    /**
    Scan a double-quoted string while building the processed string value by
    handling escape sequences. The result is returned in the given `t` token.
    This function assumes that `p` currently points to the opening double-quote
    of the string.
Params:
        t = the token to set the resulting string to
    */
    private void escapeStringConstant(Token* t)
    {
        t.value = TOK.string_;

        const start = loc();
        p++;
        stringbuffer.setsize(0);
        for (;;)
        {
            dchar c = *p++;
            switch (c)
            {
            case '\\':
            {
                // Remember which kind of escape follows before it is consumed
                const lead = *p;
                c = escapeSequence();
                if (lead == 'u' || lead == 'U' || lead == '&')
                {
                    // These escapes yield a code point that is stored as UTF-8
                    stringbuffer.writeUTF8(c);
                    continue;
                }
                break;
            }
            case '\n':
                endOfLine();
                break;
            case '\r':
                if (*p == '\n')
                    continue; // ignore
                c = '\n'; // treat EndOfLine as \n character
                endOfLine();
                break;
            case '"':
                t.setString(stringbuffer);
                stringPostfix(t);
                return;
            case 0:
            case 0x1A:
                // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
                p--;
                error("unterminated string constant starting at %s", start.toChars());
                t.setString();
                return;
            default:
                if (c & 0x80)
                {
                    p--;
                    c = decodeUTF();
                    if (c == LS || c == PS)
                    {
                        c = '\n';
                        endOfLine();
                    }
                    p++;
                    stringbuffer.writeUTF8(c);
                    continue;
                }
                break;
            }
            stringbuffer.writeByte(c);
        }
    }

    /**************************************
     * Lex a character literal. On entry `p` points to the opening single quote.
     * The character value is stored in `t.unsvalue`; on any error it is set
     * to '?'.
     * Params:
     *      t = the token that receives the character value
     * Returns:
     *      TOK.charLiteral, TOK.wcharLiteral or TOK.dcharLiteral
     */
    private TOK charConstant(Token* t)
    {
        TOK tk = TOK.charLiteral;

        // Common exit for every "no closing quote" situation
        TOK unterminated()
        {
            error("unterminated character constant");
            t.unsvalue = '?';
            return tk;
        }

        //printf("Lexer::charConstant\n");
        p++;
        dchar c = *p++;
        switch (c)
        {
        case '\\':
        {
            const lead = *p;
            t.unsvalue = escapeSequence();
            if (lead == 'u')
                tk = TOK.wcharLiteral;
            else if (lead == 'U' || lead == '&')
                tk = TOK.dcharLiteral;
            break;
        }
        case '\n':
            endOfLine();
            return unterminated();
        case '\r':
        case '\'':
            return unterminated();
        case 0:
        case 0x1A:
            // decrement `p`, because it needs to point to the next token (the 0 or 0x1A character is the TOK.endOfFile token).
            p--;
            return unterminated();
        default:
            if (c & 0x80)
            {
                p--;
                c = decodeUTF();
                p++;
                if (c == LS || c == PS)
                {
                    endOfLine();
                    return unterminated();
                }
                tk = (c < 0xD800 || (c >= 0xE000 && c < 0xFFFE))
                    ? TOK.wcharLiteral
                    : TOK.dcharLiteral;
            }
            t.unsvalue = c;
            break;
        }

        if (*p == '\'')
        {
            p++;
            return tk;
        }

        // No closing quote right after the character: skip ahead to a closing
        // quote or to a character at which the skip stops.
        for (;;)
        {
            const ch = *p;
            if (ch == '\'' || ch == 0x1A || ch == 0 || ch == '\n' ||
                ch == '\r' || ch == ';' || ch == ')' || ch == ']' || ch == '}')
                break;
            if (ch & 0x80)
            {
                const s = p;
                const u = decodeUTF();
                if (u == LS || u == PS)
                {
                    p = s;
                    break;
                }
            }
            p++;
        }

        if (*p != '\'')
            return unterminated();

        error("character constant has multiple characters");
        p++;
        t.unsvalue = '?';
        return tk;
    }

    /***************************************
     * Get postfix of string literal.
     * Stores 'c', 'w' or 'd' in `t.postfix` and steps over it when `p`
     * points at one of them; otherwise stores 0 and leaves `p` alone.
     */
    private void stringPostfix(Token* t) pure @nogc
    {
        const suffix = *p;
        if (suffix == 'c' || suffix == 'w' || suffix == 'd')
        {
            t.postfix = suffix;
            p++;
        }
        else
            t.postfix = 0;
    }

    /**************************************
     * Read in a number.
     * If it's an integer, store it in tok.TKutok.Vlong.
     *      integers can be decimal, octal or hex
     *      Handle the suffixes U, UL, LU, L, etc.
     * If it's double, store it in tok.TKutok.Vdouble.
     * Returns:
     *      TKnum
     *      TKdouble,...
*/
    private TOK number(Token* t)
    {
        int base = 10;
        const start = p;
        uinteger_t n = 0; // unsigned >=64 bit integer type
        int d;
        bool err = false;
        bool overflow = false;
        bool anyBinaryDigitsNoSingleUS = false;
        bool anyHexDigitsNoSingleUS = false;
        dchar c = *p;
        if (c == '0')
        {
            // The character after a leading '0' selects the base, or shows
            // that this is really a floating point literal.
            ++p;
            c = *p;
            switch (c)
            {
            case '0': .. case '9':
                base = 8;
                break;
            case 'x':
            case 'X':
                ++p;
                base = 16;
                break;
            case 'b':
            case 'B':
                ++p;
                base = 2;
                break;
            case '.':
                if (p[1] == '.')
                    goto Ldone; // if ".."
                if (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80)
                    goto Ldone; // if ".identifier" or ".unicode"
                goto Lreal; // '.' is part of current token
            case 'i':
            case 'f':
            case 'F':
                goto Lreal;
            case '_':
                ++p;
                base = 8;
                break;
            case 'L':
                if (p[1] == 'i')
                    goto Lreal;
                break;
            default:
                break;
            }
        }
        while (1)
        {
            c = *p;
            switch (c)
            {
            case '0': .. case '9':
                ++p;
                d = c - '0';
                break;
            case 'a': .. case 'f':
            case 'A': .. case 'F':
                ++p;
                if (base != 16)
                {
                    if (c == 'e' || c == 'E' || c == 'f' || c == 'F')
                        goto Lreal;
                }
                if (c >= 'a')
                    d = c + 10 - 'a';
                else
                    d = c + 10 - 'A';
                break;
            case 'L':
                if (p[1] == 'i')
                    goto Lreal;
                goto Ldone;
            case '.':
                if (p[1] == '.')
                    goto Ldone; // if ".."
                if (base == 10 && (isalpha(p[1]) || p[1] == '_' || p[1] & 0x80))
                    goto Ldone; // if ".identifier" or ".unicode"
                if (base == 16 && (!ishex(p[1]) || p[1] == '_' || p[1] & 0x80))
                    goto Ldone; // if ".identifier" or ".unicode"
                if (base == 2)
                    goto Ldone; // if ".identifier" or ".unicode"
                goto Lreal; // otherwise as part of a floating point literal
            case 'p':
            case 'P':
            case 'i':
            Lreal:
                // Restart from the first character and lex as floating point
                p = start;
                return inreal(t);
            case '_':
                ++p;
                continue;
            default:
                goto Ldone;
            }
            // got a digit here, set any necessary flags, check for errors
            anyHexDigitsNoSingleUS = true;
            anyBinaryDigitsNoSingleUS = true;
            if (!err && d >= base)
            {
                error("%s digit expected, not `%c`", base == 2 ? "binary".ptr :
                                                     base == 8 ? "octal".ptr :
                                                     "decimal".ptr, c);
                err = true;
            }
            // Avoid expensive overflow check if we aren't at risk of overflow
            if (n <= 0x0FFF_FFFF_FFFF_FFFFUL)
                n = n * base + d;
            else
            {
                import core.checkedint : mulu, addu;

                n = mulu(n, base, overflow);
                n = addu(n, d, overflow);
            }
        }
    Ldone:
        if (overflow && !err)
        {
            error("integer overflow");
            err = true;
        }
        if ((base == 2 && !anyBinaryDigitsNoSingleUS) ||
            (base == 16 && !anyHexDigitsNoSingleUS))
            error("`%.*s` isn't a valid integer literal, use `%.*s0` instead", cast(int)(p - start), start, 2, start);
        enum FLAGS : int
        {
            none = 0,
            decimal = 1, // decimal
            unsigned = 2, // u or U suffix
            long_ = 4, // L suffix
        }

        FLAGS flags = (base == 10) ? FLAGS.decimal : FLAGS.none;
        // Parse trailing 'u', 'U', 'l' or 'L' in any combination
        const psuffix = p;
        while (1)
        {
            FLAGS f;
            switch (*p)
            {
            case 'U':
            case 'u':
                f = FLAGS.unsigned;
                goto L1;
            case 'l':
                f = FLAGS.long_;
                error("lower case integer suffix 'l' is not allowed. Please use 'L' instead");
                goto L1;
            case 'L':
                f = FLAGS.long_;
            L1:
                p++;
                // The same suffix given twice is an error
                if ((flags & f) && !err)
                {
                    error("unrecognized token");
                    err = true;
                }
                flags = cast(FLAGS)(flags | f);
                continue;
            default:
                break;
            }
            break;
        }
        if (base == 8 && n >= 8)
        {
            if (err)
                // can't translate invalid octal value, just show a generic message
                error("octal literals larger than 7 are no longer supported");
            else
                error("octal literals `0%llo%.*s` are no longer supported, use `std.conv.octal!%llo%.*s` instead",
                    n, cast(int)(p - psuffix), psuffix, n, cast(int)(p - psuffix), psuffix);
        }
        TOK result;
        switch (flags)
        {
        case FLAGS.none:
            /* Octal or Hexadecimal constant.
             * First that fits: int, uint, long, ulong
             */
            if (n & 0x8000000000000000L)
                result = TOK.uns64Literal;
            else if (n & 0xFFFFFFFF00000000L)
                result = TOK.int64Literal;
            else if (n & 0x80000000)
                result = TOK.uns32Literal;
            else
                result = TOK.int32Literal;
            break;
        case FLAGS.decimal:
            /* First that fits: int, long, long long
             */
            if (n & 0x8000000000000000L)
            {
                result = TOK.uns64Literal;
            }
            else if (n & 0xFFFFFFFF80000000L)
                result = TOK.int64Literal;
            else
                result = TOK.int32Literal;
            break;
        case FLAGS.unsigned:
        case FLAGS.decimal | FLAGS.unsigned:
            /* First that fits: uint, ulong
             */
            if (n & 0xFFFFFFFF00000000L)
                result = TOK.uns64Literal;
            else
                result = TOK.uns32Literal;
            break;
        case FLAGS.decimal | FLAGS.long_:
            if (n & 0x8000000000000000L)
            {
                if (!err)
                {
                    error("signed integer overflow");
                    err = true;
                }
                result = TOK.uns64Literal;
            }
            else
                result = TOK.int64Literal;
            break;
        case FLAGS.long_:
            if (n & 0x8000000000000000L)
                result = TOK.uns64Literal;
            else
                result = TOK.int64Literal;
            break;
        case FLAGS.unsigned | FLAGS.long_:
        case FLAGS.decimal | FLAGS.unsigned | FLAGS.long_:
            result = TOK.uns64Literal;
            break;
        default:
            debug
            {
                printf("%x\n", flags);
            }
            assert(0);
        }
        t.unsvalue = n;
        return result;
    }

    /**************************************
     * Read in characters, converting them to real.
     * The value is stored in `t.floatvalue`.
     * Bugs:
     *      Exponent overflow not detected.
     *      Too much requested precision is not detected.
     * Params:
     *      t = the token that receives the floating point value
     * Returns:
     *      one of the float32/float64/float80 or imaginary32/64/80 literal tokens
     */
    private TOK inreal(Token* t)
    {
        //printf("Lexer::inreal()\n");
        debug
        {
            assert(*p == '.' || isdigit(*p));
        }
        bool isWellformedString = true;
        stringbuffer.setsize(0);
        auto pstart = p;
        bool hex = false;
        dchar c = *p++;
        // Leading '0x'
        if (c == '0')
        {
            c = *p++;
            if (c == 'x' || c == 'X')
            {
                hex = true;
                c = *p++;
            }
        }
        // Digits to left of '.'
        while (1)
        {
            if (c == '.')
            {
                c = *p++;
                break;
            }
            if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
            {
                c = *p++;
                continue;
            }
            break;
        }
        // Digits to right of '.'
        while (1)
        {
            if (isdigit(c) || (hex && isxdigit(c)) || c == '_')
            {
                c = *p++;
                continue;
            }
            break;
        }
        if (c == 'e' || c == 'E' || (hex && (c == 'p' || c == 'P')))
        {
            c = *p++;
            if (c == '-' || c == '+')
            {
                c = *p++;
            }
            bool anyexp = false;
            while (1)
            {
                if (isdigit(c))
                {
                    anyexp = true;
                    c = *p++;
                    continue;
                }
                if (c == '_')
                {
                    c = *p++;
                    continue;
                }
                if (!anyexp)
                {
                    error("missing exponent");
                    isWellformedString = false;
                }
                break;
            }
        }
        else if (hex)
        {
            error("exponent required for hex float");
            isWellformedString = false;
        }
        // `c` was read one past the end of the literal; step back onto it
        --p;
        // Copy the literal text without '_' separators, then NUL-terminate it
        while (pstart < p)
        {
            if (*pstart != '_')
                stringbuffer.writeByte(*pstart);
            ++pstart;
        }
        stringbuffer.writeByte(0);
        auto sbufptr = cast(const(char)*)stringbuffer[].ptr;
        TOK result;
        bool isOutOfRange = false;
        t.floatvalue = (isWellformedString ? CTFloat.parse(sbufptr, &isOutOfRange) : CTFloat.zero);
        switch (*p)
        {
        case 'F':
        case 'f':
            if (isWellformedString && !isOutOfRange)
                isOutOfRange = Port.isFloat32LiteralOutOfRange(sbufptr);
            result = TOK.float32Literal;
            p++;
            break;
        default:
            if (isWellformedString && !isOutOfRange)
                isOutOfRange = Port.isFloat64LiteralOutOfRange(sbufptr);
            result = TOK.float64Literal;
            break;
        case 'l':
            error("use 'L' suffix instead of 'l'");
            goto case 'L';
        case 'L':
            result = TOK.float80Literal;
            p++;
            break;
        }
        if (*p == 'i' || *p == 'I')
        {
            if (*p == 'I')
                error("use 'i' suffix instead of 'I'");
            p++;
            switch (result)
            {
            case TOK.float32Literal:
                result = TOK.imaginary32Literal;
                break;
            case TOK.float64Literal:
                result = TOK.imaginary64Literal;
                break;
            case TOK.float80Literal:
                result = TOK.imaginary80Literal;
                break;
            default:
                break;
            }
        }
        const isLong = (result == TOK.float80Literal || result == TOK.imaginary80Literal);
        if (isOutOfRange && !isLong)
        {
            const char* suffix = (result == TOK.float32Literal || result == TOK.imaginary32Literal) ? "f" : "";
            error(scanloc, "number `%s%s` is not representable", sbufptr, suffix);
        }
        debug
        {
            switch (result)
            {
            case TOK.float32Literal:
            case TOK.float64Literal:
            case TOK.float80Literal:
            case TOK.imaginary32Literal:
            case TOK.imaginary64Literal:
            case TOK.imaginary80Literal:
                break;
            default:
                assert(0);
            }
        }
        return result;
    }

    /// Updates `scanloc.charnum` and `scanloc.offset` from the current
    /// position `p` and returns a copy of `scanloc`.
    final Loc loc() pure @nogc
    {
        scanloc.charnum = cast(uint)(1 + p - line);
        scanloc.offset = cast(uint)(p - base);
        return scanloc;
    }

    /// Reports a printf-style error at `token.loc`.
    final void error(const(char)* format, ...)
    {
        va_list args;
        va_start(args, format);
        handleDiagnostic(token.loc, Severity.error, format, args);
        va_end(args);
    }

    /// Reports a printf-style error at `loc`.
    final void error(const ref Loc loc, const(char)* format, ...)
    {
        va_list args;
        va_start(args, format);
        handleDiagnostic(loc, Severity.error, format, args);
        va_end(args);
    }

    /// Like `error(loc, ...)`, but passes `true` as the last argument of
    /// `handleDiagnostic` (presumably marking the message as supplemental).
    final void errorSupplemental(const ref Loc loc, const(char)* format, ...)
    {
        va_list args;
        va_start(args, format);
        handleDiagnostic(loc, Severity.error, format, args, true);
        va_end(args);
    }

    /// Reports a printf-style warning at `loc`.
    final void warning(const ref Loc loc, const(char)* format, ...)
    {
        va_list args;
        va_start(args, format);
        handleDiagnostic(loc, Severity.warning, format, args);
        va_end(args);
    }

    /// Like `warning`, but passes `true` as the last argument of
    /// `handleDiagnostic` (presumably marking the message as supplemental).
    final void warningSupplemental(const ref Loc loc, const(char)* format, ...)
    {
        va_list args;
        va_start(args, format);
        handleDiagnostic(loc, Severity.warning, format, args, true);
        va_end(args);
    }

    /// Reports a printf-style deprecation at `token.loc`.
    final void deprecation(const(char)* format, ...)
    {
        va_list args;
        va_start(args, format);
        handleDiagnostic(token.loc, Severity.deprecation, format, args);
        va_end(args);
    }

    /// Like `deprecation`, but passes `true` as the last argument of
    /// `handleDiagnostic` (presumably marking the message as supplemental).
    final void deprecationSupplemental(const(char)* format, ...)
    {
        va_list args;
        va_start(args, format);
        handleDiagnostic(token.loc, Severity.deprecation, format, args, true);
        va_end(args);
    }

    /*********************************************
     * parse:
     *      #line linnum [filespec]
     * also allow __LINE__ for linnum, and __FILE__ for filespec
     *
     * The new line number and file name are only stored in `scanloc` when
     * not inside a token string constant.
     */
    private void poundLine()
    {
        auto linnum = this.scanloc.linnum;
        const(char)* filespec = null;
        const loc = this.loc();
        Token tok;
        scan(&tok);
        if (tok.value == TOK.int32Literal || tok.value == TOK.int64Literal)
        {
            const lin = cast(int)(tok.unsvalue - 1);
            if (lin != tok.unsvalue - 1)
                error("line number `%llu` out of range", cast(ulong)tok.unsvalue);
            else
                linnum = lin;
        }
        else if (tok.value == TOK.line)
        {
        }
        else
            goto Lerr;
        while (1)
        {
            switch (*p)
            {
            case 0:
            case 0x1A:
            case '\n':
            Lnewline:
                if (!inTokenStringConstant)
                {
                    this.scanloc.linnum = linnum;
                    if (filespec)
                        this.scanloc.filename = filespec;
                }
                return;
            case '\r':
                p++;
                if (*p != '\n')
                {
                    p--;
                    goto Lnewline;
                }
                continue;
            case ' ':
            case '\t':
            case '\v':
            case '\f':
                p++;
                continue; // skip white space
            case '_':
                if (memcmp(p, "__FILE__".ptr, 8) == 0)
                {
                    p += 8;
                    filespec = mem.xstrdup(scanloc.filename);
                    continue;
                }
                goto Lerr;
            case '"':
                if (filespec)
                    goto Lerr;
                stringbuffer.setsize(0);
                p++;
                while (1)
                {
                    uint c;
                    c = *p;
                    switch (c)
                    {
                    case '\n':
                    case '\r':
                    case 0:
                    case 0x1A:
                        goto Lerr;
                    case '"':
                        stringbuffer.writeByte(0);
                        filespec = mem.xstrdup(cast(const(char)*)stringbuffer[].ptr);
                        p++;
                        break;
                    default:
                        if (c & 0x80)
                        {
                            const s = p;
                            uint u = decodeUTF();
                            if (u == PS || u == LS)
                                goto Lerr;
                            // decodeUTF leaves `p` on the last byte of the
                            // sequence: copy the bytes before it here, the
                            // last one is written below.
                            for (auto q = s; q < p; ++q)
                                stringbuffer.writeByte(*q);
                            c = *p;
                        }
                        stringbuffer.writeByte(c);
                        p++;
                        continue;
                    }
                    break;
                }
                continue;
            default:
                if (*p & 0x80)
                {
                    uint u = decodeUTF();
                    if (u == PS || u == LS)
                        goto Lnewline;
                }
                goto Lerr;
            }
        }
    Lerr:
        error(loc, "#line integer [\"filespec\"]\\n expected");
    }

    /********************************************
     * Decode UTF character.
     * Issue error messages for invalid sequences.
     * Return decoded character, advance p to last character in UTF sequence.
     */
    private uint decodeUTF()
    {
        const s = p;
        assert(*s & 0x80);
        // Check length of remaining string up to 4 UTF-8 characters
        size_t len;
        for (len = 1; len < 4 && s[len]; len++)
        {
        }
        size_t idx = 0;
        dchar u;
        const msg = utf_decodeChar(s[0 .. len], idx, u);
        p += idx - 1;
        if (msg)
        {
            error("%.*s", cast(int)msg.length, msg.ptr);
        }
        return u;
    }

    /***************************************************
     * Parse doc comment embedded between t.ptr and p.
     * Remove trailing blanks and tabs from lines.
     * Replace all newlines with \n.
     * Remove leading comment character from each line.
     * Decide if it's a lineComment or a blockComment.
     * Append to previous one for this token.
     *
     * If newParagraph is true, an extra newline will be
     * added between adjoining doc comments.
*/
    private void getDocComment(Token* t, uint lineComment, bool newParagraph) pure
    {
        // Third character of the comment opener: '/', '*' or '+'
        const ct = t.ptr[2];

        // The text starts after the three opener characters; for the
        // two-character-terminated kinds the terminator is excluded as well.
        const(char)* q = t.ptr + 3;
        const(char)* qend = p;
        if (ct == '*' || ct == '+')
            qend -= 2;

        // Skip the initial row of ****'s or ++++'s or ////'s
        while (q < qend && *q == ct)
            ++q;

        // Skip leading blanks (line comment) or a single leading line break
        int linestart = 0;
        if (ct == '/')
        {
            while (q < qend && (*q == ' ' || *q == '\t'))
                ++q;
        }
        else if (q < qend)
        {
            if (*q == '\r')
            {
                ++q;
                if (q < qend && *q == '\n')
                    ++q;
                linestart = 1;
            }
            else if (*q == '\n')
            {
                ++q;
                linestart = 1;
            }
        }

        // Drop a trailing row of ****'s or ++++'s
        if (ct != '/')
        {
            while (q < qend && qend[-1] == ct)
                --qend;
        }

        // The comment is now [q .. qend]; canonicalize it into `buf`
        OutBuffer buf;

        // Cuts blanks and tabs off the end of what has been written so far
        void trimTrailingWhitespace()
        {
            const written = buf[];
            auto keep = written.length;
            for (; keep; --keep)
            {
                const last = written[keep - 1];
                if (last != ' ' && last != '\t')
                    break;
            }
            buf.setsize(keep);
        }

        for (; q < qend; q++)
        {
            char c = *q;
            bool newline = false;

            if (c == '*' || c == '+')
            {
                if (linestart && c == ct)
                {
                    // Leading comment character of a line: drop it together
                    // with the whitespace in front of it
                    linestart = 0;
                    trimTrailingWhitespace();
                    continue;
                }
            }
            else if (c == ' ' || c == '\t')
            {
                // plain whitespace is copied and does not end the line start
            }
            else if (c == '\r')
            {
                if (q[1] == '\n')
                    continue; // skip the \r
                newline = true;
            }
            else if (c == '\n')
            {
                newline = true;
            }
            else if (c == 226 && q[1] == 128 && (q[2] == 168 || q[2] == 169))
            {
                // UTF-8 encoded LS or PS
                q += 2;
                newline = true;
            }
            else
            {
                linestart = 0;
            }

            if (newline)
            {
                c = '\n'; // replace all newlines with \n
                linestart = 1;
                trimTrailingWhitespace();
            }
            buf.writeByte(c);
        }

        // The last line may lack a newline; trim it as well
        trimTrailingWhitespace();

        // Always end with a newline
        const text = buf[];
        if (!text.length || text[$ - 1] != '\n')
            buf.writeByte('\n');

        // It's a line comment if the start of the doc comment comes
        // after other non-whitespace on the same line.
        auto dc = (lineComment && anyToken) ? &t.lineComment : &t.blockComment;

        // Combine with previous doc comment, if any
        if (*dc)
            *dc = combineComments(*dc, buf[], newParagraph).toDString();
        else
            *dc = buf.extractSlice(true);
    }

    /********************************************
     * Combine two document comments into one,
     * separated by an extra newline if newParagraph is true.
*/
    static const(char)* combineComments(const(char)[] c1, const(char)[] c2, bool newParagraph) pure
    {
        //printf("Lexer::combineComments('%s', '%s', '%i')\n", c1, c2, newParagraph);
        const(int) newParagraphSize = newParagraph ? 1 : 0; // Size of the combining '\n'
        if (!c1)
            return c2.ptr;
        if (!c2)
            return c1.ptr;

        // Make sure c1 ends in a newline before c2 is appended
        int insertNewLine = 0;
        if (c1.length && c1[$ - 1] != '\n')
            insertNewLine = 1;
        const retSize = c1.length + insertNewLine + newParagraphSize + c2.length;
        auto p = cast(char*)mem.xmalloc_noscan(retSize + 1);
        p[0 .. c1.length] = c1[];
        if (insertNewLine)
            p[c1.length] = '\n';
        if (newParagraph)
            p[c1.length + insertNewLine] = '\n';
        p[retSize - c2.length .. retSize] = c2[];
        p[retSize] = 0; // result is NUL-terminated
        return p;
    }

private:
    /// Advances the line number in `scanloc` and records `p` as the start of the line.
    void endOfLine() pure @nogc @safe
    {
        scanloc.linnum++;
        line = p;
    }
}

/// Support for `__DATE__`, `__TIME__`, and `__TIMESTAMP__`
private struct TimeStampInfo
{
    private __gshared bool initdone = false;

    // Note: Those properties need to be guarded by a call to `init`
    // The API isn't safe, and quite brittle, but it was left this way
    // over performance concerns.
    // This is currently only called once, from the lexer.
    __gshared char[11 + 1] date;
    __gshared char[8 + 1] time;
    __gshared char[24 + 1] timestamp;

    /// Fills `date`, `time` and `timestamp` on the first call; later calls do
    /// nothing. The time is taken from the `SOURCE_DATE_EPOCH` environment
    /// variable when it is set, otherwise from the current time.
    public static void initialize(const ref Loc loc) nothrow
    {
        if (initdone)
            return;

        initdone = true;
        time_t ct;
        // https://issues.dlang.org/show_bug.cgi?id=20444
        if (auto p = getenv("SOURCE_DATE_EPOCH"))
        {
            if (!ct.parseDigits(p.toDString()))
                error(loc, "Value of environment variable `SOURCE_DATE_EPOCH` should be a valid UNIX timestamp, not: `%s`", p);
        }
        else
            .time(&ct);
        const p = ctime(&ct);
        assert(p);
        // Slice the fixed-width `ctime` text into its parts
        sprintf(&date[0], "%.6s %.4s", p + 4, p + 20);
        sprintf(&time[0], "%.8s", p + 11);
        sprintf(&timestamp[0], "%.24s", p);
    }
}

// Escape sequences that must decode without any diagnostic
unittest
{
    import dmd.console;
    nothrow bool assertDiagnosticHandler(const ref Loc loc, Color headerColor, const(char)* header,
                                   const(char)* format, va_list ap, const(char)* p1, const(char)* p2)
    {
        assert(0);
    }
    diagnosticHandler = &assertDiagnosticHandler;

    static void test(T)(string sequence, T expected)
    {
        auto p = cast(const(char)*)sequence.ptr;
        assert(expected == Lexer.escapeSequence(Loc.initial, p));
        assert(p == sequence.ptr + sequence.length);
    }

    test(`'`, '\'');
    test(`"`, '"');
    test(`?`, '?');
    test(`\`, '\\');
    test(`0`, '\0');
    test(`a`, '\a');
    test(`b`, '\b');
    test(`f`, '\f');
    test(`n`, '\n');
    test(`r`, '\r');
    test(`t`, '\t');
    test(`v`, '\v');

    test(`x00`, 0x00);
    test(`xff`, 0xff);
    test(`xFF`, 0xff);
    test(`xa7`, 0xa7);
    test(`x3c`, 0x3c);
    test(`xe2`, 0xe2);

    test(`1`, '\1');
    test(`42`, '\42');
    test(`357`, '\357');

    test(`u1234`, '\u1234');
    test(`uf0e4`, '\uf0e4');

    test(`U0001f603`, '\U0001f603');

    // Named character entities
    test(`&quot;`, '"');
    test(`&lt;`, '<');
    test(`&gt;`, '>');

    diagnosticHandler = null;
}

// Escape sequences that must produce exactly one specific error
unittest
{
    import dmd.console;
    string expected;
    bool gotError;

    // Checks the formatted diagnostic against `expected` and records that it fired
    nothrow bool expectDiagnosticHandler(const ref Loc loc, Color headerColor, const(char)* header,
                                         const(char)* format, va_list ap, const(char)* p1, const(char)* p2)
    {
        assert(cast(Classification)headerColor == Classification.error);

        gotError = true;
        char[100] buffer = void;
        auto actual = buffer[0 .. vsprintf(buffer.ptr, format, ap)];
        assert(expected == actual);
        return true;
    }

    diagnosticHandler = &expectDiagnosticHandler;

    void test(string sequence, string expectedError, dchar expectedReturnValue, uint expectedScanLength)
    {
        // `global.errors` is saved here and restored at the end of the check
        uint errors = global.errors;
        gotError = false;
        expected = expectedError;
        auto p = cast(const(char)*)sequence.ptr;
        auto actualReturnValue = Lexer.escapeSequence(Loc.initial, p);
        assert(gotError);
        assert(expectedReturnValue == actualReturnValue);

        auto actualScanLength = p - sequence.ptr;
        assert(expectedScanLength == actualScanLength);
        global.errors = errors;
    }

    test("c", `undefined escape sequence \c`, 'c', 1);
    test("!", `undefined escape sequence \!`, '!', 1);

    test("x1", `escape hex sequence has 1 hex digits instead of 2`, '\x01', 2);

    test("u1"  , `escape hex sequence has 1 hex digits instead of 4`,   0x1, 2);
    test("u12" , `escape hex sequence has 2 hex digits instead of 4`,  0x12, 3);
    test("u123", `escape hex sequence has 3 hex digits instead of 4`, 0x123, 4);

    test("U0"      , `escape hex sequence has 1 hex digits instead of 8`,       0x0, 2);
    test("U00"     , `escape hex sequence has 2 hex digits instead of 8`,      0x00, 3);
    test("U000"    , `escape hex sequence has 3 hex digits instead of 8`,     0x000, 4);
    test("U0000"   , `escape hex sequence has 4 hex digits instead of 8`,    0x0000, 5);
    test("U0001f"  , `escape hex sequence has 5 hex digits instead of 8`,   0x0001f, 6);
    test("U0001f6" , `escape hex sequence has 6 hex digits instead of 8`,  0x0001f6, 7);
    test("U0001f60", `escape hex sequence has 7 hex digits instead of 8`, 0x0001f60, 8);

    test("ud800"    , `invalid UTF character \U0000d800`, '?', 5);
    test("udfff"    , `invalid UTF character \U0000dfff`, '?', 5);
    test("U00110000", `invalid UTF character \U00110000`, '?', 9);

    test("xg0"      , `undefined escape hex sequence \xg`, 'g', 2);
    test("ug000"    , `undefined escape hex sequence \ug`, 'g', 2);
    test("Ug0000000", `undefined escape hex sequence \Ug`, 'g', 2);

    test("&BAD;", `unnamed character entity &BAD;`  , '?', 5);
    // Entity name without the closing ';' (5 characters are scanned)
    test("&quot", `unterminated named entity &quot;`, '?', 5);

    test("400", `escape octal sequence \400 is larger than \377`, 0x100, 3);

    diagnosticHandler = null;
}