/**
 * Functions related to UTF encoding.
 *
 * Copyright:   Copyright (C) 1999-2021 by The D Language Foundation, All Rights Reserved
 * Authors:     $(LINK2 http://www.digitalmars.com, Walter Bright)
 * License:     $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
 * Source:      $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/utf.d, _utf.d)
 * Documentation:  https://dlang.org/phobos/dmd_utf.html
 * Coverage:    https://codecov.io/gh/dlang/dmd/src/master/src/dmd/utf.d
 */

module dmd.utf;

nothrow pure @nogc:

/**
 * Tests whether `c` belongs to the Unicode code space.
 *
 * The code space is the range of code points [0x000000,0x10FFFF] with the
 * UTF-16 surrogate range [0xD800,0xDFFF] removed.
 *
 * Params:
 *      c = code point to test
 * Returns:
 *      `true` if `c` is in [0,0xD7FF] or [0xE000,0x10FFFF], `false` otherwise
 */
bool utf_isValidDchar(dchar c)
{
    // TODO: Whether non-char code points should be rejected is pending review.
    // 0xFFFE and 0xFFFF are valid for internal use, like Phobos std.utf.isValidDChar
    // See also https://issues.dlang.org/show_bug.cgi?id=1357

    // Tested first: almost all characters in a typical document fall here.
    const bool belowSurrogates = c < 0xD800;
    const bool aboveSurrogates = 0xDFFF < c && c <= 0x10FFFF;
    return belowSurrogates || aboveSurrogates;
}

/*******************************
 * Return !=0 if unicode alpha.
 * Use table from C99 Appendix D.
*/
bool isUniAlpha(dchar c)
{
    // Sorted, non-overlapping inclusive ranges [first, last] of code points
    // that are treated as alphabetic.
    static immutable wchar[2][] ALPHA_TABLE =
    [
        [0x00AA, 0x00AA], [0x00B5, 0x00B5], [0x00B7, 0x00B7], [0x00BA, 0x00BA],
        [0x00C0, 0x00D6], [0x00D8, 0x00F6], [0x00F8, 0x01F5], [0x01FA, 0x0217],
        [0x0250, 0x02A8], [0x02B0, 0x02B8], [0x02BB, 0x02BB], [0x02BD, 0x02C1],
        [0x02D0, 0x02D1], [0x02E0, 0x02E4], [0x037A, 0x037A], [0x0386, 0x0386],
        [0x0388, 0x038A], [0x038C, 0x038C], [0x038E, 0x03A1], [0x03A3, 0x03CE],
        [0x03D0, 0x03D6], [0x03DA, 0x03DA], [0x03DC, 0x03DC], [0x03DE, 0x03DE],
        [0x03E0, 0x03E0], [0x03E2, 0x03F3], [0x0401, 0x040C], [0x040E, 0x044F],
        [0x0451, 0x045C], [0x045E, 0x0481], [0x0490, 0x04C4], [0x04C7, 0x04C8],
        [0x04CB, 0x04CC], [0x04D0, 0x04EB], [0x04EE, 0x04F5], [0x04F8, 0x04F9],
        [0x0531, 0x0556], [0x0559, 0x0559], [0x0561, 0x0587], [0x05B0, 0x05B9],
        [0x05BB, 0x05BD], [0x05BF, 0x05BF], [0x05C1, 0x05C2], [0x05D0, 0x05EA],
        [0x05F0, 0x05F2], [0x0621, 0x063A], [0x0640, 0x0652], [0x0660, 0x0669],
        [0x0670, 0x06B7], [0x06BA, 0x06BE], [0x06C0, 0x06CE], [0x06D0, 0x06DC],
        [0x06E5, 0x06E8], [0x06EA, 0x06ED], [0x06F0, 0x06F9], [0x0901, 0x0903],
        [0x0905, 0x0939], [0x093D, 0x094D], [0x0950, 0x0952], [0x0958, 0x0963],
        [0x0966, 0x096F], [0x0981, 0x0983], [0x0985, 0x098C], [0x098F, 0x0990],
        [0x0993, 0x09A8], [0x09AA, 0x09B0], [0x09B2, 0x09B2], [0x09B6, 0x09B9],
        [0x09BE, 0x09C4], [0x09C7, 0x09C8], [0x09CB, 0x09CD], [0x09DC, 0x09DD],
        [0x09DF, 0x09E3], [0x09E6, 0x09F1], [0x0A02, 0x0A02], [0x0A05, 0x0A0A],
        [0x0A0F, 0x0A10], [0x0A13, 0x0A28], [0x0A2A, 0x0A30], [0x0A32, 0x0A33],
        [0x0A35, 0x0A36], [0x0A38, 0x0A39], [0x0A3E, 0x0A42], [0x0A47, 0x0A48],
        [0x0A4B, 0x0A4D], [0x0A59, 0x0A5C], [0x0A5E, 0x0A5E], [0x0A66, 0x0A6F],
        [0x0A74, 0x0A74], [0x0A81, 0x0A83], [0x0A85, 0x0A8B], [0x0A8D, 0x0A8D],
        [0x0A8F, 0x0A91], [0x0A93, 0x0AA8], [0x0AAA, 0x0AB0], [0x0AB2, 0x0AB3],
        [0x0AB5, 0x0AB9], [0x0ABD, 0x0AC5], [0x0AC7, 0x0AC9], [0x0ACB, 0x0ACD],
        [0x0AD0, 0x0AD0], [0x0AE0, 0x0AE0], [0x0AE6, 0x0AEF], [0x0B01, 0x0B03],
        [0x0B05, 0x0B0C], [0x0B0F, 0x0B10], [0x0B13, 0x0B28], [0x0B2A, 0x0B30],
        [0x0B32, 0x0B33], [0x0B36, 0x0B39], [0x0B3D, 0x0B43], [0x0B47, 0x0B48],
        [0x0B4B, 0x0B4D], [0x0B5C, 0x0B5D], [0x0B5F, 0x0B61], [0x0B66, 0x0B6F],
        [0x0B82, 0x0B83], [0x0B85, 0x0B8A], [0x0B8E, 0x0B90], [0x0B92, 0x0B95],
        [0x0B99, 0x0B9A], [0x0B9C, 0x0B9C], [0x0B9E, 0x0B9F], [0x0BA3, 0x0BA4],
        [0x0BA8, 0x0BAA], [0x0BAE, 0x0BB5], [0x0BB7, 0x0BB9], [0x0BBE, 0x0BC2],
        [0x0BC6, 0x0BC8], [0x0BCA, 0x0BCD], [0x0BE7, 0x0BEF], [0x0C01, 0x0C03],
        [0x0C05, 0x0C0C], [0x0C0E, 0x0C10], [0x0C12, 0x0C28], [0x0C2A, 0x0C33],
        [0x0C35, 0x0C39], [0x0C3E, 0x0C44], [0x0C46, 0x0C48], [0x0C4A, 0x0C4D],
        [0x0C60, 0x0C61], [0x0C66, 0x0C6F], [0x0C82, 0x0C83], [0x0C85, 0x0C8C],
        [0x0C8E, 0x0C90], [0x0C92, 0x0CA8], [0x0CAA, 0x0CB3], [0x0CB5, 0x0CB9],
        [0x0CBE, 0x0CC4], [0x0CC6, 0x0CC8], [0x0CCA, 0x0CCD], [0x0CDE, 0x0CDE],
        [0x0CE0, 0x0CE1], [0x0CE6, 0x0CEF], [0x0D02, 0x0D03], [0x0D05, 0x0D0C],
        [0x0D0E, 0x0D10], [0x0D12, 0x0D28], [0x0D2A, 0x0D39], [0x0D3E, 0x0D43],
        [0x0D46, 0x0D48], [0x0D4A, 0x0D4D], [0x0D60, 0x0D61], [0x0D66, 0x0D6F],
        [0x0E01, 0x0E3A], [0x0E40, 0x0E5B], [0x0E81, 0x0E82], [0x0E84, 0x0E84],
        [0x0E87, 0x0E88], [0x0E8A, 0x0E8A], [0x0E8D, 0x0E8D], [0x0E94, 0x0E97],
        [0x0E99, 0x0E9F], [0x0EA1, 0x0EA3], [0x0EA5, 0x0EA5], [0x0EA7, 0x0EA7],
        [0x0EAA, 0x0EAB], [0x0EAD, 0x0EAE], [0x0EB0, 0x0EB9], [0x0EBB, 0x0EBD],
        [0x0EC0, 0x0EC4], [0x0EC6, 0x0EC6], [0x0EC8, 0x0ECD], [0x0ED0, 0x0ED9],
        [0x0EDC, 0x0EDD], [0x0F00, 0x0F00], [0x0F18, 0x0F19], [0x0F20, 0x0F33],
        [0x0F35, 0x0F35], [0x0F37, 0x0F37], [0x0F39, 0x0F39], [0x0F3E, 0x0F47],
        [0x0F49, 0x0F69], [0x0F71, 0x0F84], [0x0F86, 0x0F8B], [0x0F90, 0x0F95],
        [0x0F97, 0x0F97], [0x0F99, 0x0FAD], [0x0FB1, 0x0FB7], [0x0FB9, 0x0FB9],
        [0x10A0, 0x10C5], [0x10D0, 0x10F6], [0x1E00, 0x1E9B], [0x1EA0, 0x1EF9],
        [0x1F00, 0x1F15], [0x1F18, 0x1F1D], [0x1F20, 0x1F45], [0x1F48, 0x1F4D],
        [0x1F50, 0x1F57], [0x1F59, 0x1F59], [0x1F5B, 0x1F5B], [0x1F5D, 0x1F5D],
        [0x1F5F, 0x1F7D], [0x1F80, 0x1FB4], [0x1FB6, 0x1FBC], [0x1FBE, 0x1FBE],
        [0x1FC2, 0x1FC4], [0x1FC6, 0x1FCC], [0x1FD0, 0x1FD3], [0x1FD6, 0x1FDB],
        [0x1FE0, 0x1FEC], [0x1FF2, 0x1FF4], [0x1FF6, 0x1FFC], [0x203F, 0x2040],
        [0x207F, 0x207F], [0x2102, 0x2102], [0x2107, 0x2107], [0x210A, 0x2113],
        [0x2115, 0x2115], [0x2118, 0x211D], [0x2124, 0x2124], [0x2126, 0x2126],
        [0x2128, 0x2128], [0x212A, 0x2131], [0x2133, 0x2138], [0x2160, 0x2182],
        [0x3005, 0x3007], [0x3021, 0x3029], [0x3041, 0x3093], [0x309B, 0x309C],
        [0x30A1, 0x30F6], [0x30FB, 0x30FC], [0x3105, 0x312C], [0x4E00, 0x9FA5],
        [0xAC00, 0xD7A3]
    ];

    // Quick reject: c lies outside the overall span covered by the table.
    if (c < ALPHA_TABLE[0][0] || c > ALPHA_TABLE[$ - 1][1])
        return false;

    // Binary search over the half-open index interval [lo, hi).
    size_t lo = 0;
    size_t hi = ALPHA_TABLE.length;
    while (lo < hi)
    {
        const size_t mid = lo + (hi - lo) / 2;
        if (c < ALPHA_TABLE[mid][0])
            hi = mid;               // c can only be in a range left of mid
        else if (c > ALPHA_TABLE[mid][1])
            lo = mid + 1;           // c can only be in a range right of mid
        else
            return true;            // first <= c <= last
    }
    return false;
}

/**
 * Returns the code length of c in code units.
*/
int utf_codeLengthChar(dchar c)
{
    // Values above the last code point have no UTF-8 length.
    if (c > 0x10FFFF)
        assert(false);
    return c <= 0x7F ? 1
         : c <= 0x7FF ? 2
         : c <= 0xFFFF ? 3
         : 4;
}

/**
 * Returns the number of UTF-16 code units needed for c:
 * 1 for c <= 0xFFFF, otherwise 2 (a surrogate pair).
 */
int utf_codeLengthWchar(dchar c)
{
    if (c > 0xFFFF)
        return 2;
    return 1;
}

/**
 * Returns the code length of c in code units for the encoding.
 * sz is the encoding: 1 = utf8, 2 = utf16, 4 = utf32.
 */
int utf_codeLength(int sz, dchar c)
{
    switch (sz)
    {
        case 1:
            return utf_codeLengthChar(c);
        case 2:
            return utf_codeLengthWchar(c);
        default:
            // The only other supported encoding is UTF-32: always one unit.
            assert(sz == 4);
            return 1;
    }
}

/**
 * Writes the UTF-8 encoding of c to s.
 *
 * Params:
 *      s = destination; must be non-null. Between 1 and 4 code units are
 *          written starting at s[0], depending on the magnitude of c
 *      c = code point to encode; must satisfy utf_isValidDchar
 */
void utf_encodeChar(char* s, dchar c)
{
    assert(s !is null);
    assert(utf_isValidDchar(c));

    if (c <= 0x7F)
    {
        // Single code unit: 0xxxxxxx
        s[0] = cast(char)c;
        return;
    }

    // Every multi-unit form ends with a trailer carrying the low 6 bits.
    const char last = cast(char)(0x80 | (c & 0x3F));
    if (c <= 0x07FF)
    {
        // 110xxxxx 10xxxxxx
        s[0] = cast(char)(0xC0 | (c >> 6));
        s[1] = last;
        return;
    }

    // Trailer carrying bits 6..11, shared by the 3- and 4-unit forms.
    const char middle = cast(char)(0x80 | ((c >> 6) & 0x3F));
    if (c <= 0xFFFF)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        s[0] = cast(char)(0xE0 | (c >> 12));
        s[1] = middle;
        s[2] = last;
        return;
    }

    if (c <= 0x10FFFF)
    {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        s[0] = cast(char)(0xF0 | (c >> 18));
        s[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        s[2] = middle;
        s[3] = last;
        return;
    }

    assert(0);
}

/**
 * Writes the UTF-16 encoding of c to s.
 *
 * Params:
 *      s = destination; must be non-null. One code unit is written for
 *          c <= 0xFFFF, otherwise two (high surrogate, then low surrogate)
 *      c = code point to encode; must satisfy utf_isValidDchar
 */
void utf_encodeWchar(wchar* s, dchar c)
{
    assert(s !is null);
    assert(utf_isValidDchar(c));

    if (c <= 0xFFFF)
    {
        s[0] = cast(wchar)c;
        return;
    }

    // Surrogate pair: split the 20-bit offset from 0x10000 into two 10-bit halves.
    const uint offset = c - 0x010000;
    s[0] = cast(wchar)(((offset >> 10) & 0x03FF) + 0xD800);
    s[1] = cast(wchar)((offset & 0x03FF) + 0xDC00);
}

/**
 * Writes the encoding of c to s, using the encoding selected by sz.
 *
 * Params:
 *      sz = the encoding: 1 = utf8, 2 = utf16, 4 = utf32
 *      s = destination, reinterpreted as char*, wchar* or dchar* according to sz
 *      c = code point to encode
 */
void utf_encode(int sz, void* s, dchar c)
{
    switch (sz)
    {
        case 1:
            utf_encodeChar(cast(char*)s, c);
            break;
        case 2:
            utf_encodeWchar(cast(wchar*)s, c);
            break;
        default:
            // UTF-32: the code point is stored as-is.
            assert(sz == 4);
            *(cast(dchar*)s) = c;
            break;
    }
}

/********************************************
 * Decode a UTF-8 sequence as a
single UTF-32 code point.
 * Params:
 *      s = UTF-8 sequence
 *      ridx = index in s[] of the first code unit to decode. On success it is
 *             advanced past the whole sequence; on error it is left one past
 *             the starting index
 *      rresult = set to the character decoded; on error it holds the first
 *                code unit of the sequence
 * Returns:
 *      null on success, otherwise error message string
 */
string utf_decodeChar(const(char)[] s, ref size_t ridx, out dchar rresult)
{
    // UTF-8 decoding errors
    static immutable string UTF8_DECODE_OK = null; // no error
    static immutable string UTF8_DECODE_OUTSIDE_CODE_SPACE = "Outside Unicode code space";
    static immutable string UTF8_DECODE_TRUNCATED_SEQUENCE = "Truncated UTF-8 sequence";
    static immutable string UTF8_DECODE_OVERLONG = "Overlong UTF-8 sequence";
    static immutable string UTF8_DECODE_INVALID_TRAILER = "Invalid trailing code unit";
    static immutable string UTF8_DECODE_INVALID_CODE_POINT = "Invalid code point decoded";

    /* Sequence length implied by the first code unit, indexed by that unit.
     * 0xFF marks units that cannot start a sequence. The 5 and 6 byte
     * forms are listed in the table but are rejected below:
     *  0xxxxxxx
     *  110xxxxx 10xxxxxx
     *  1110xxxx 10xxxxxx 10xxxxxx
     *  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     *  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
     */
    static immutable ubyte[256] UTF8_STRIDE =
    [
        // 0x00 .. 0x7F
        1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1,

        // 0x80 .. 0xBF
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,

        // 0xC0 .. 0xDF
        2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,
        2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2,

        // 0xE0 .. 0xEF
        3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3,

        // 0xF0 .. 0xFF
        4,4,4,4, 4,4,4,4, 5,5,5,5, 6,6,0xFF,0xFF
    ];

    assert(s !is null);
    const size_t start = ridx;
    ridx = start + 1;

    const char lead = s[start];
    // Pre-stage results for ASCII and error cases
    rresult = lead;

    // Get expected sequence length
    const size_t len = UTF8_STRIDE[lead];
    if (len == 1)
        return UTF8_DECODE_OK;                  // ASCII
    if (len < 2 || 4 < len)
        return UTF8_DECODE_OUTSIDE_CODE_SPACE;  // 5/6-byte form or 0xFF entry
    if (s.length < start + len)
        return UTF8_DECODE_TRUNCATED_SEQUENCE;  // source too short

    /* The following combinations are overlong, and illegal:
     *  1100000x (10xxxxxx)
     *  11100000 100xxxxx (10xxxxxx)
     *  11110000 1000xxxx (10xxxxxx 10xxxxxx)
     * The corresponding 5 and 6 byte patterns (lead 0xF8 / 0xFC) cannot
     * reach this point: those lead bytes were rejected by the length check.
     */
    const char second = s[start + 1];
    if ((lead & 0xFE) == 0xC0)
        return UTF8_DECODE_OVERLONG;
    if (lead == 0xE0 && (second & 0xE0) == 0x80)
        return UTF8_DECODE_OVERLONG;
    if (lead == 0xF0 && (second & 0xF0) == 0x80)
        return UTF8_DECODE_OVERLONG;

    // Pick off 7 - len low bits from first code unit
    dchar cp = lead & ((1 << (7 - len)) - 1);

    // Fold in 6 payload bits from each trailing code unit
    for (size_t k = 1; k < len; ++k)
    {
        const char trail = s[start + k];
        if ((trail & 0xC0) != 0x80) // trailing bytes are 10xxxxxx
            return UTF8_DECODE_INVALID_TRAILER;
        cp = (cp << 6) | (trail & 0x3F);
    }

    if (!utf_isValidDchar(cp))
        return UTF8_DECODE_INVALID_CODE_POINT;

    ridx = start + len;
    rresult = cp;
    return UTF8_DECODE_OK;
}

/********************************************
 * Decode a UTF-16 sequence as a single UTF-32 code point.
* Params:
 *      s = UTF-16 sequence
 *      ridx = index in s[] of the first code unit to decode. It is advanced
 *             by one for a single code unit (and on every error path), and
 *             by two when a surrogate pair is decoded
 *      rresult = set to the character decoded; on error it holds the first
 *                code unit of the sequence
 * Returns:
 *      null on success, otherwise error message string
 */
string utf_decodeWchar(const(wchar)[] s, ref size_t ridx, out dchar rresult)
{
    // UTF-16 decoding errors
    static immutable string UTF16_DECODE_OK = null; // no error
    static immutable string UTF16_DECODE_TRUNCATED_SEQUENCE = "Truncated UTF-16 sequence";
    static immutable string UTF16_DECODE_INVALID_SURROGATE = "Invalid low surrogate";
    static immutable string UTF16_DECODE_UNPAIRED_SURROGATE = "Unpaired surrogate";
    static immutable string UTF16_DECODE_INVALID_CODE_POINT = "Invalid code point decoded";

    assert(s !is null);
    size_t i = ridx++;

    // Pre-stage results for single wchar and error cases
    dchar u = rresult = s[i];
    if (u < 0xD800) // Single wchar codepoint
        return UTF16_DECODE_OK;
    if (0xD800 <= u && u <= 0xDBFF) // High surrogate: a low surrogate must follow
    {
        if (s.length <= i + 1)
            return UTF16_DECODE_TRUNCATED_SEQUENCE;
        const wchar u2 = s[i + 1];
        // The second unit must be a low surrogate, i.e. within [0xDC00,0xDFFF].
        // Both bounds are checked against u2; testing the upper bound against
        // the high surrogate u would always pass and let units above 0xDFFF
        // through.
        if (u2 < 0xDC00 || 0xDFFF < u2)
            return UTF16_DECODE_INVALID_SURROGATE;
        // 0xD7C0 == 0xD800 - (0x10000 >> 10), so this subtraction both strips
        // the high-surrogate tag and adds the 0x10000 offset in one step.
        u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
        ++ridx;
    }
    else if (0xDC00 <= u && u <= 0xDFFF) // Low surrogate with no preceding high one
        return UTF16_DECODE_UNPAIRED_SURROGATE;
    if (!utf_isValidDchar(u))
        return UTF16_DECODE_INVALID_CODE_POINT;
    rresult = u;
    return UTF16_DECODE_OK;
}