1 /**
2  * Functions related to UTF encoding.
3  *
4  * Copyright:   Copyright (C) 1999-2021 by The D Language Foundation, All Rights Reserved
5  * Authors:     $(LINK2 http://www.digitalmars.com, Walter Bright)
6  * License:     $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7  * Source:      $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/utf.d, _utf.d)
8  * Documentation:  https://dlang.org/phobos/dmd_utf.html
9  * Coverage:    https://codecov.io/gh/dlang/dmd/src/master/src/dmd/utf.d
10  */
11 
12 module dmd.utf;
13 
14 nothrow pure @nogc:
15 
/// Tells whether `c` lies inside the Unicode code space, i.e. the range of
/// code points [0x000000,0x10FFFF] with the UTF-16 surrogate range
/// [0xD800,0xDFFF] carved out.
bool utf_isValidDchar(dchar c)
{
    // TODO: Whether non-char code points should be rejected is pending review.
    // 0xFFFE and 0xFFFF are valid for internal use, like Phobos std.utf.isValidDChar
    // See also https://issues.dlang.org/show_bug.cgi?id=1357

    // Everything below the surrogates; covers almost all characters in a
    // typical document.
    const bool belowSurrogates = c < 0xD800;
    // Everything above the surrogates, up to the last code point.
    const bool aboveSurrogates = 0xDFFF < c && c <= 0x10FFFF;
    return belowSurrogates || aboveSurrogates;
}
29 
/*******************************
 * Tells whether c is a Unicode alphabetic character according to the
 * table from C99 Appendix D.
 * Params:
 *      c = code point to classify
 * Returns:
 *      true if c falls inside one of the table's inclusive ranges,
 *      false otherwise
 */
bool isUniAlpha(dchar c)
{
    // Inclusive [first, last] ranges, sorted in ascending order.
    static immutable wchar[2][] ALPHA_TABLE =
    [
        [0x00AA, 0x00AA],
        [0x00B5, 0x00B5],
        [0x00B7, 0x00B7],
        [0x00BA, 0x00BA],
        [0x00C0, 0x00D6],
        [0x00D8, 0x00F6],
        [0x00F8, 0x01F5],
        [0x01FA, 0x0217],
        [0x0250, 0x02A8],
        [0x02B0, 0x02B8],
        [0x02BB, 0x02BB],
        [0x02BD, 0x02C1],
        [0x02D0, 0x02D1],
        [0x02E0, 0x02E4],
        [0x037A, 0x037A],
        [0x0386, 0x0386],
        [0x0388, 0x038A],
        [0x038C, 0x038C],
        [0x038E, 0x03A1],
        [0x03A3, 0x03CE],
        [0x03D0, 0x03D6],
        [0x03DA, 0x03DA],
        [0x03DC, 0x03DC],
        [0x03DE, 0x03DE],
        [0x03E0, 0x03E0],
        [0x03E2, 0x03F3],
        [0x0401, 0x040C],
        [0x040E, 0x044F],
        [0x0451, 0x045C],
        [0x045E, 0x0481],
        [0x0490, 0x04C4],
        [0x04C7, 0x04C8],
        [0x04CB, 0x04CC],
        [0x04D0, 0x04EB],
        [0x04EE, 0x04F5],
        [0x04F8, 0x04F9],
        [0x0531, 0x0556],
        [0x0559, 0x0559],
        [0x0561, 0x0587],
        [0x05B0, 0x05B9],
        [0x05BB, 0x05BD],
        [0x05BF, 0x05BF],
        [0x05C1, 0x05C2],
        [0x05D0, 0x05EA],
        [0x05F0, 0x05F2],
        [0x0621, 0x063A],
        [0x0640, 0x0652],
        [0x0660, 0x0669],
        [0x0670, 0x06B7],
        [0x06BA, 0x06BE],
        [0x06C0, 0x06CE],
        [0x06D0, 0x06DC],
        [0x06E5, 0x06E8],
        [0x06EA, 0x06ED],
        [0x06F0, 0x06F9],
        [0x0901, 0x0903],
        [0x0905, 0x0939],
        [0x093D, 0x094D],
        [0x0950, 0x0952],
        [0x0958, 0x0963],
        [0x0966, 0x096F],
        [0x0981, 0x0983],
        [0x0985, 0x098C],
        [0x098F, 0x0990],
        [0x0993, 0x09A8],
        [0x09AA, 0x09B0],
        [0x09B2, 0x09B2],
        [0x09B6, 0x09B9],
        [0x09BE, 0x09C4],
        [0x09C7, 0x09C8],
        [0x09CB, 0x09CD],
        [0x09DC, 0x09DD],
        [0x09DF, 0x09E3],
        [0x09E6, 0x09F1],
        [0x0A02, 0x0A02],
        [0x0A05, 0x0A0A],
        [0x0A0F, 0x0A10],
        [0x0A13, 0x0A28],
        [0x0A2A, 0x0A30],
        [0x0A32, 0x0A33],
        [0x0A35, 0x0A36],
        [0x0A38, 0x0A39],
        [0x0A3E, 0x0A42],
        [0x0A47, 0x0A48],
        [0x0A4B, 0x0A4D],
        [0x0A59, 0x0A5C],
        [0x0A5E, 0x0A5E],
        [0x0A66, 0x0A6F],
        [0x0A74, 0x0A74],
        [0x0A81, 0x0A83],
        [0x0A85, 0x0A8B],
        [0x0A8D, 0x0A8D],
        [0x0A8F, 0x0A91],
        [0x0A93, 0x0AA8],
        [0x0AAA, 0x0AB0],
        [0x0AB2, 0x0AB3],
        [0x0AB5, 0x0AB9],
        [0x0ABD, 0x0AC5],
        [0x0AC7, 0x0AC9],
        [0x0ACB, 0x0ACD],
        [0x0AD0, 0x0AD0],
        [0x0AE0, 0x0AE0],
        [0x0AE6, 0x0AEF],
        [0x0B01, 0x0B03],
        [0x0B05, 0x0B0C],
        [0x0B0F, 0x0B10],
        [0x0B13, 0x0B28],
        [0x0B2A, 0x0B30],
        [0x0B32, 0x0B33],
        [0x0B36, 0x0B39],
        [0x0B3D, 0x0B43],
        [0x0B47, 0x0B48],
        [0x0B4B, 0x0B4D],
        [0x0B5C, 0x0B5D],
        [0x0B5F, 0x0B61],
        [0x0B66, 0x0B6F],
        [0x0B82, 0x0B83],
        [0x0B85, 0x0B8A],
        [0x0B8E, 0x0B90],
        [0x0B92, 0x0B95],
        [0x0B99, 0x0B9A],
        [0x0B9C, 0x0B9C],
        [0x0B9E, 0x0B9F],
        [0x0BA3, 0x0BA4],
        [0x0BA8, 0x0BAA],
        [0x0BAE, 0x0BB5],
        [0x0BB7, 0x0BB9],
        [0x0BBE, 0x0BC2],
        [0x0BC6, 0x0BC8],
        [0x0BCA, 0x0BCD],
        [0x0BE7, 0x0BEF],
        [0x0C01, 0x0C03],
        [0x0C05, 0x0C0C],
        [0x0C0E, 0x0C10],
        [0x0C12, 0x0C28],
        [0x0C2A, 0x0C33],
        [0x0C35, 0x0C39],
        [0x0C3E, 0x0C44],
        [0x0C46, 0x0C48],
        [0x0C4A, 0x0C4D],
        [0x0C60, 0x0C61],
        [0x0C66, 0x0C6F],
        [0x0C82, 0x0C83],
        [0x0C85, 0x0C8C],
        [0x0C8E, 0x0C90],
        [0x0C92, 0x0CA8],
        [0x0CAA, 0x0CB3],
        [0x0CB5, 0x0CB9],
        [0x0CBE, 0x0CC4],
        [0x0CC6, 0x0CC8],
        [0x0CCA, 0x0CCD],
        [0x0CDE, 0x0CDE],
        [0x0CE0, 0x0CE1],
        [0x0CE6, 0x0CEF],
        [0x0D02, 0x0D03],
        [0x0D05, 0x0D0C],
        [0x0D0E, 0x0D10],
        [0x0D12, 0x0D28],
        [0x0D2A, 0x0D39],
        [0x0D3E, 0x0D43],
        [0x0D46, 0x0D48],
        [0x0D4A, 0x0D4D],
        [0x0D60, 0x0D61],
        [0x0D66, 0x0D6F],
        [0x0E01, 0x0E3A],
        [0x0E40, 0x0E5B],
        [0x0E81, 0x0E82],
        [0x0E84, 0x0E84],
        [0x0E87, 0x0E88],
        [0x0E8A, 0x0E8A],
        [0x0E8D, 0x0E8D],
        [0x0E94, 0x0E97],
        [0x0E99, 0x0E9F],
        [0x0EA1, 0x0EA3],
        [0x0EA5, 0x0EA5],
        [0x0EA7, 0x0EA7],
        [0x0EAA, 0x0EAB],
        [0x0EAD, 0x0EAE],
        [0x0EB0, 0x0EB9],
        [0x0EBB, 0x0EBD],
        [0x0EC0, 0x0EC4],
        [0x0EC6, 0x0EC6],
        [0x0EC8, 0x0ECD],
        [0x0ED0, 0x0ED9],
        [0x0EDC, 0x0EDD],
        [0x0F00, 0x0F00],
        [0x0F18, 0x0F19],
        [0x0F20, 0x0F33],
        [0x0F35, 0x0F35],
        [0x0F37, 0x0F37],
        [0x0F39, 0x0F39],
        [0x0F3E, 0x0F47],
        [0x0F49, 0x0F69],
        [0x0F71, 0x0F84],
        [0x0F86, 0x0F8B],
        [0x0F90, 0x0F95],
        [0x0F97, 0x0F97],
        [0x0F99, 0x0FAD],
        [0x0FB1, 0x0FB7],
        [0x0FB9, 0x0FB9],
        [0x10A0, 0x10C5],
        [0x10D0, 0x10F6],
        [0x1E00, 0x1E9B],
        [0x1EA0, 0x1EF9],
        [0x1F00, 0x1F15],
        [0x1F18, 0x1F1D],
        [0x1F20, 0x1F45],
        [0x1F48, 0x1F4D],
        [0x1F50, 0x1F57],
        [0x1F59, 0x1F59],
        [0x1F5B, 0x1F5B],
        [0x1F5D, 0x1F5D],
        [0x1F5F, 0x1F7D],
        [0x1F80, 0x1FB4],
        [0x1FB6, 0x1FBC],
        [0x1FBE, 0x1FBE],
        [0x1FC2, 0x1FC4],
        [0x1FC6, 0x1FCC],
        [0x1FD0, 0x1FD3],
        [0x1FD6, 0x1FDB],
        [0x1FE0, 0x1FEC],
        [0x1FF2, 0x1FF4],
        [0x1FF6, 0x1FFC],
        [0x203F, 0x2040],
        [0x207F, 0x207F],
        [0x2102, 0x2102],
        [0x2107, 0x2107],
        [0x210A, 0x2113],
        [0x2115, 0x2115],
        [0x2118, 0x211D],
        [0x2124, 0x2124],
        [0x2126, 0x2126],
        [0x2128, 0x2128],
        [0x212A, 0x2131],
        [0x2133, 0x2138],
        [0x2160, 0x2182],
        [0x3005, 0x3007],
        [0x3021, 0x3029],
        [0x3041, 0x3093],
        [0x309B, 0x309C],
        [0x30A1, 0x30F6],
        [0x30FB, 0x30FC],
        [0x3105, 0x312C],
        [0x4E00, 0x9FA5],
        [0xAC00, 0xD7A3]
    ];

    // Anything outside the span covered by the table cannot match.
    if (c < ALPHA_TABLE[0][0] || c > ALPHA_TABLE[$ - 1][1])
        return false;

    // Binary search over the half-open index window [lo, hi).
    size_t lo = 0;
    size_t hi = ALPHA_TABLE.length;
    while (lo < hi)
    {
        const size_t mid = lo + (hi - lo) / 2;
        if (c < ALPHA_TABLE[mid][0])
            hi = mid;           // c sorts before this range
        else if (c > ALPHA_TABLE[mid][1])
            lo = mid + 1;       // c sorts after this range
        else
            return true;        // first <= c <= last
    }
    return false;
}
304 
/**
 * Returns the number of UTF-8 code units needed to encode c.
 * Asserts if c is greater than 0x10FFFF.
 */
int utf_codeLengthChar(dchar c)
{
    // Largest code point representable with 1, 2, 3 and 4 code units.
    static immutable dchar[4] UPPER_LIMITS = [0x7F, 0x7FF, 0xFFFF, 0x10FFFF];

    foreach (i, limit; UPPER_LIMITS)
    {
        if (c <= limit)
            return cast(int)i + 1;
    }
    assert(false);
}
320 
/**
 * Returns the number of UTF-16 code units needed to encode c:
 * 2 when c is above 0xFFFF, otherwise 1.
 */
int utf_codeLengthWchar(dchar c)
{
    if (c > 0xFFFF)
        return 2;
    return 1;
}
325 
/**
 * Returns the code length of c in code units for the encoding.
 * Params:
 *      sz = the encoding: 1 = utf8, 2 = utf16, 4 = utf32
 *      c = code point to measure
 * Returns:
 *      number of code units c occupies in the selected encoding
 */
int utf_codeLength(int sz, dchar c)
{
    switch (sz)
    {
    case 1:
        return utf_codeLengthChar(c);
    case 2:
        return utf_codeLengthWchar(c);
    default:
        // The only remaining supported encoding is UTF-32, where every
        // code point is exactly one code unit.
        assert(sz == 4);
        return 1;
    }
}
339 
/**
 * Writes the UTF-8 encoding of c to s.
 * Params:
 *      s = destination; must not be null. Between 1 and 4 code units are
 *          written depending on the magnitude of c.
 *      c = code point to encode; must satisfy utf_isValidDchar
 */
void utf_encodeChar(char* s, dchar c)
{
    assert(s !is null);
    assert(utf_isValidDchar(c));

    // 0xxxxxxx
    if (c <= 0x7F)
    {
        s[0] = cast(char)c;
        return;
    }

    // 110xxxxx 10xxxxxx
    if (c <= 0x07FF)
    {
        s[0] = cast(char)(0xC0 | (c >> 6));
        s[1] = cast(char)(0x80 | (c & 0x3F));
        return;
    }

    // 1110xxxx 10xxxxxx 10xxxxxx
    if (c <= 0xFFFF)
    {
        s[0] = cast(char)(0xE0 | (c >> 12));
        s[1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        s[2] = cast(char)(0x80 | (c & 0x3F));
        return;
    }

    // Nothing beyond the last code point can be encoded.
    if (c > 0x10FFFF)
        assert(0);

    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    s[0] = cast(char)(0xF0 | (c >> 18));
    s[1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
    s[2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
    s[3] = cast(char)(0x80 | (c & 0x3F));
}
369 
/**
 * Writes the UTF-16 encoding of c to s.
 * Params:
 *      s = destination; must not be null. One code unit is written when
 *          c <= 0xFFFF, otherwise a surrogate pair (two code units).
 *      c = code point to encode; must satisfy utf_isValidDchar
 */
void utf_encodeWchar(wchar* s, dchar c)
{
    assert(s !is null);
    assert(utf_isValidDchar(c));

    if (c > 0xFFFF)
    {
        // Split the offset above the BMP into two 10-bit halves.
        const uint offset = c - 0x010000;
        s[0] = cast(wchar)(((offset >> 10) & 0x03FF) + 0xD800); // high surrogate
        s[1] = cast(wchar)((offset & 0x03FF) + 0xDC00);         // low surrogate
        return;
    }

    s[0] = cast(wchar)c;
}
384 
/**
 * Encodes c into the buffer s using the encoding selected by sz.
 * Params:
 *      sz = the encoding: 1 = utf8, 2 = utf16, 4 = utf32
 *      s = destination buffer, reinterpreted as char*, wchar* or dchar*
 *          according to sz
 *      c = code point to encode
 */
void utf_encode(int sz, void* s, dchar c)
{
    switch (sz)
    {
    case 1:
        utf_encodeChar(cast(char*)s, c);
        break;
    case 2:
        utf_encodeWchar(cast(wchar*)s, c);
        break;
    default:
        // UTF-32: the code point is stored as a single code unit.
        assert(sz == 4);
        *(cast(dchar*)s) = c;
        break;
    }
}
397 
/********************************************
 * Decode one code point from a UTF-8 sequence.
 * Params:
 *      s = UTF-8 sequence
 *      ridx = index in s[] of the lead code unit; on success it is moved past
 *             the whole sequence, on error past the lead code unit only
 *      rresult = receives the decoded code point on success, or the lead code
 *                unit when an error is reported
 * Returns:
 *      null on success, otherwise error message string
 */
string utf_decodeChar(const(char)[] s, ref size_t ridx, out dchar rresult)
{
    // Possible outcomes; null stands for success.
    static immutable string UTF8_DECODE_OK = null; // no error
    static immutable string UTF8_DECODE_OUTSIDE_CODE_SPACE = "Outside Unicode code space";
    static immutable string UTF8_DECODE_TRUNCATED_SEQUENCE = "Truncated UTF-8 sequence";
    static immutable string UTF8_DECODE_OVERLONG = "Overlong UTF-8 sequence";
    static immutable string UTF8_DECODE_INVALID_TRAILER = "Invalid trailing code unit";
    static immutable string UTF8_DECODE_INVALID_CODE_POINT = "Invalid code point decoded";

    /* Sequence length implied by each possible lead code unit:
     *      0xxxxxxx                                               1
     *      10xxxxxx  (a trailing unit, not a lead)                0xFF
     *      110xxxxx 10xxxxxx                                      2
     *      1110xxxx 10xxxxxx 10xxxxxx                             3
     *      11110xxx 10xxxxxx 10xxxxxx 10xxxxxx                    4
     *      111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx           5
     *      1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx  6
     *      1111111x                                               0xFF
     * Only lengths 1 through 4 are accepted below.
     */
    static immutable ubyte[256] UTF8_STRIDE =
    [
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,

        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,
        1,1,1,1, 1,1,1,1,

        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
        0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,

        2,2,2,2, 2,2,2,2,
        2,2,2,2, 2,2,2,2,
        2,2,2,2, 2,2,2,2,
        2,2,2,2, 2,2,2,2,

        3,3,3,3, 3,3,3,3,
        3,3,3,3, 3,3,3,3,

        4,4,4,4, 4,4,4,4,
        5,5,5,5, 6,6,0xFF,0xFF
    ];

    assert(s !is null);
    const size_t start = ridx;
    // Step over the lead unit right away; this is where ridx stays on error.
    ridx = start + 1;

    const char lead = s[start];
    // Provisional result: correct for ASCII, and what callers see on error.
    rresult = lead;

    const size_t len = UTF8_STRIDE[lead];
    if (len == 1) // ASCII
        return UTF8_DECODE_OK;
    if (len < 2 || 4 < len) // 5- or 6-byte sequence, or not a lead unit at all
        return UTF8_DECODE_OUTSIDE_CODE_SPACE;

    const size_t end = start + len;
    if (s.length < end) // source too short
        return UTF8_DECODE_TRUNCATED_SEQUENCE;

    // The lead unit contributes its 7 - len low bits.
    dchar cp = lead & ((1 << (7 - len)) - 1);

    /* Overlong (and therefore illegal) encodings:
     *      1100000x (10xxxxxx)
     *      11100000 100xxxxx (10xxxxxx)
     *      11110000 1000xxxx (10xxxxxx 10xxxxxx)
     *      11111000 10000xxx (10xxxxxx 10xxxxxx 10xxxxxx)
     *      11111100 100000xx (10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx)
     * The last two cannot occur here because 5- and 6-byte leads were
     * rejected above; the tests are retained as in the original table.
     */
    const char second = s[start + 1];
    const bool overlong =
        (lead & 0xFE) == 0xC0 ||
        (lead == 0xE0 && (second & 0xE0) == 0x80) ||
        (lead == 0xF0 && (second & 0xF0) == 0x80) ||
        (lead == 0xF8 && (second & 0xF8) == 0x80) ||
        (lead == 0xFC && (second & 0xFC) == 0x80);
    if (overlong)
        return UTF8_DECODE_OVERLONG;

    // Fold in six bits from every trailing unit.
    for (size_t pos = start + 1; pos != end; ++pos)
    {
        const trail = s[pos];
        if ((trail & 0xC0) != 0x80) // trailing bytes are 10xxxxxx
            return UTF8_DECODE_INVALID_TRAILER;
        cp = (cp << 6) | (trail & 0x3F);
    }

    if (!utf_isValidDchar(cp))
        return UTF8_DECODE_INVALID_CODE_POINT;

    ridx = end;
    rresult = cp;
    return UTF8_DECODE_OK;
}
519 
/********************************************
 * Decode a UTF-16 sequence as a single UTF-32 code point.
 * Params:
 *      s = UTF-16 sequence; must not be null
 *      ridx = index in s[] of the first code unit to decode. On success it is
 *             advanced past the one or two code units consumed; on a
 *             surrogate-related error it is advanced by one code unit only.
 *      rresult = set to the character decoded; when an error is returned it
 *                holds the code unit found at the starting index
 * Returns:
 *      null on success, otherwise error message string
 */
string utf_decodeWchar(const(wchar)[] s, ref size_t ridx, out dchar rresult)
{
    // UTF-16 decoding errors
    static immutable string UTF16_DECODE_OK = null; // no error
    static immutable string UTF16_DECODE_TRUNCATED_SEQUENCE = "Truncated UTF-16 sequence";
    static immutable string UTF16_DECODE_INVALID_SURROGATE = "Invalid low surrogate";
    static immutable string UTF16_DECODE_UNPAIRED_SURROGATE = "Unpaired surrogate";
    static immutable string UTF16_DECODE_INVALID_CODE_POINT = "Invalid code point decoded";

    assert(s !is null);
    size_t i = ridx++;

    // Pre-stage results for single wchar and error cases
    dchar u = rresult = s[i];
    if (u < 0xD800) // Single wchar codepoint
        return UTF16_DECODE_OK;
    if (0xD800 <= u && u <= 0xDBFF) // High surrogate: a low surrogate must follow
    {
        if (s.length <= i + 1)
            return UTF16_DECODE_TRUNCATED_SEQUENCE;
        const wchar u2 = s[i + 1];
        // The second code unit must lie in the low surrogate range
        // [0xDC00,0xDFFF]. Both bounds are checked against u2; checking the
        // upper bound against u would never fire, because u <= 0xDBFF here,
        // and units such as 0xE000 would be decoded into a bogus code point.
        if (u2 < 0xDC00 || 0xDFFF < u2)
            return UTF16_DECODE_INVALID_SURROGATE;
        // 0xD7C0 == 0xD800 - (0x10000 >> 10): removes the surrogate bias and
        // adds the 0x10000 offset in one step.
        u = ((u - 0xD7C0) << 10) + (u2 - 0xDC00);
        ++ridx;
    }
    else if (0xDC00 <= u && u <= 0xDFFF) // Low surrogate without a preceding high one
        return UTF16_DECODE_UNPAIRED_SURROGATE;
    if (!utf_isValidDchar(u))
        return UTF16_DECODE_INVALID_CODE_POINT;
    rresult = u;
    return UTF16_DECODE_OK;
}