1 /**
2  * Contains various string related functions.
3  *
4  * Copyright: Copyright (C) 1999-2021 by The D Language Foundation, All Rights Reserved
5  * Authors:   Walter Bright, http://www.digitalmars.com
6  * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
7  * Source:    $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/root/string.d, root/_string.d)
8  * Documentation:  https://dlang.org/phobos/dmd_root_string.html
9  * Coverage:    https://codecov.io/gh/dlang/dmd/src/master/src/dmd/root/string.d
10  */
module dmd.root.string;
12 
/// Turns a `\0`-terminated C-string into a D slice over its characters
/// (the terminator is not part of the slice). A `null` pointer yields `null`.
inout(char)[] toDString (inout(char)* s) pure nothrow @nogc
{
    import core.stdc.string : strlen;

    if (s is null)
        return null;

    const count = strlen(s);
    return s[0 .. count];
}
19 
/**
Case-insensitive equality test for two slices

Every `char` is compared individually (through C's `toupper`); no UTF-8
decoding takes place, so the result is only meaningful for plain ASCII text.

Params:
s1 = first string to compare
s2 = second string to compare

Returns:
`true` if both slices have the same length and match character by
character once case is ignored, `false` otherwise
*/
extern(D) static bool iequals(const(char)[] s1, const(char)[] s2)
{
    import core.stdc.ctype : toupper;

    const count = s1.length;
    if (count != s2.length)
        return false;

    for (size_t pos = 0; pos < count; ++pos)
    {
        // Both slices hold exactly `count` elements, so raw pointer
        // indexing stays in range without needing a bounds check
        const lhs = s1.ptr[pos];
        const rhs = s2.ptr[pos];

        // Identical characters need no case folding
        if (lhs == rhs)
            continue;

        if (toupper(lhs) != toupper(rhs))
            return false;
    }
    return true;
}
50 
/**
Copy the content of `src` into a C-string ('\0' terminated) then call `dg`

The intent of this function is to provide an allocation-less
way to call a C function using a D slice.
A 512-byte stack buffer is used whenever `src` plus its terminator fits in it
(i.e. `src.length <= 511`); otherwise a buffer is allocated through
`mem.xmalloc` and released with `mem.xfree` when the function exits.

Note:
The argument to `dg` is `scope`. To keep the data around after `dg` exits,
one has to copy it.

Params:
src = Slice to use to call the C function
dg  = Delegate to call afterwards; it receives a slice holding the content
      of `src` followed by a single `'\0'`

Returns:
The return value of `dg`
*/
auto toCStringThen(alias dg)(const(char)[] src) nothrow
{
    import dmd.root.rmem : mem;

    // Room for the content plus the terminating '\0'
    const len = src.length + 1;
    char[512] small = void;
    // `len` already accounts for the terminator, so every request up to
    // `small.length` bytes fits in the stack buffer
    scope ptr = (len <= small.length)
                    ? small[0 .. len]
                    : (cast(char*)mem.xmalloc(len))[0 .. len];
    scope (exit)
    {
        // Only release the buffer when it did not come from the stack
        if (&ptr[0] != &small[0])
            mem.xfree(&ptr[0]);
    }
    ptr[0 .. src.length] = src[];
    ptr[src.length] = '\0';
    return dg(ptr);
}
87 
unittest
{
    // A regular slice gets exactly one terminator appended
    const plain = toCStringThen!((v) => v == "Hello world\0")("Hello world");
    assert(plain);

    // A terminator already present in the source is kept, and one more is added
    const terminated = toCStringThen!((v) => v == "Hello world\0\0")("Hello world\0");
    assert(terminated);

    // `null` behaves like an empty slice: only the terminator is passed on
    const empty = toCStringThen!((v) => v == "\0")(null);
    assert(empty);
}
94 
/**
 * Removes a single line terminator from the front of the given string, if
 * there is one.
 *
 * The following are what the Unicode standard considers as line terminators:
 *
 * | Name                | D Escape Sequence | Unicode Code Point |
 * |---------------------|-------------------|--------------------|
 * | Line feed           | `\n`              | `U+000A`           |
 * | Line tabulation     | `\v`              | `U+000B`           |
 * | Form feed           | `\f`              | `U+000C`           |
 * | Carriage return     | `\r`              | `U+000D`           |
 * | Next line           |                   | `U+0085`           |
 * | Line separator      |                   | `U+2028`           |
 * | Paragraph separator |                   | `U+2029`           |
 *
 * The two character sequence `\r\n` is stripped as one terminator.
 *
 * Params:
 *  str = the string to strip
 *
 * Returns:
 *  `str` without its leading line terminator, or `str` unchanged if it does
 *  not start with one
 */
string stripLeadingLineTerminator(string str) pure nothrow @nogc @safe
{
    // UTF-8 encodings of the terminators that have no escape sequence
    enum nextLine = "\xC2\x85";
    enum lineSeparator = "\xE2\x80\xA8";
    enum paragraphSeparator = "\xE2\x80\xA9";

    static assert(lineSeparator.length == paragraphSeparator.length);

    if (!str.length)
        return str;

    const first = str[0];

    // `\r\n` counts as one terminator, a lone `\r` as another
    if (first == '\r')
    {
        const isCrLf = str.length >= 2 && str[1] == '\n';
        return str[(isCrLf ? 2 : 1) .. $];
    }

    if (first == '\v' || first == '\f' || first == '\n')
        return str[1 .. $];

    // Number of leading bytes to drop for the multi-byte terminators
    size_t skip = 0;

    if (first == nextLine[0])
    {
        if (str.length >= 2 && str[0 .. 2] == nextLine)
            skip = 2;
    }
    else if (first == lineSeparator[0])
    {
        if (str.length >= lineSeparator.length)
        {
            const head = str[0 .. lineSeparator.length];

            if (head == lineSeparator || head == paragraphSeparator)
                skip = lineSeparator.length;
        }
    }

    return str[skip .. $];
}
157 
unittest
{
    // Each entry is [input, expected result]
    static immutable string[2][] cases = [
        // Nothing to strip
        ["", ""],
        ["foo", "foo"],
        // First byte of a multi-byte terminator, but not the full sequence
        ["\xC2foo", "\xC2foo"],
        ["\xE2foo", "\xE2foo"],
        // Single-byte terminators
        ["\nfoo", "foo"],
        ["\vfoo", "foo"],
        ["\ffoo", "foo"],
        ["\rfoo", "foo"],
        // Multi-byte terminators
        ["\u0085foo", "foo"],
        ["\u2028foo", "foo"],
        ["\u2029foo", "foo"],
        // Only one terminator is removed, except for the `\r\n` pair
        ["\n\rfoo", "\rfoo"],
        ["\r\nfoo", "foo"],
    ];

    foreach (entry; cases)
        assert(stripLeadingLineTerminator(entry[0]) == entry[1]);
}
174 
/**
 * A string comparison function that orders strings the same way as strcmp
 *
 * Note: Strings are compared based on their ASCII values, no UTF-8 decoding.
 *
 * Some C functions (e.g. `qsort`) require an `int` result for comparison.
 *
 * Params:
 *  s1 = first string to compare
 *  s2 = second string to compare
 *
 * Returns:
 *  `-1` if `s1` sorts before `s2`, `1` if it sorts after, `0` if both are
 *  equal. The result is the same at compile time and at run time.
 *
 * See_Also: Druntime's `core.internal.string`
 */
int dstrcmp()( scope const char[] s1, scope const char[] s2 ) @trusted
{
    immutable len = s1.length <= s2.length ? s1.length : s2.length;
    if (__ctfe)
    {
        // `memcmp` cannot be evaluated at compile time: compare manually
        foreach (const u; 0 .. len)
        {
            if (s1[u] != s2[u])
                return s1[u] > s2[u] ? 1 : -1;
        }
    }
    else if (len != 0) // an empty slice may have a `null` pointer; never hand it to `memcmp`
    {
        import core.stdc.string : memcmp;

        const ret = memcmp( s1.ptr, s2.ptr, len );
        // `memcmp` only guarantees the sign of its result; normalize it so
        // that the runtime path agrees with the CTFE path
        if ( ret )
            return ret < 0 ? -1 : 1;
    }
    // Common prefix is identical: the shorter string sorts first
    return s1.length < s2.length ? -1 : (s1.length > s2.length);
}
204 
//
unittest
{
    // Compile-time evaluation takes the manual comparison path
    static assert(dstrcmp("Baguette", "Croissant") == -1);

    // Run-time evaluation: equal, smaller and greater
    assert(dstrcmp("Croissant", "Baguette") == 1);
    assert(dstrcmp("Baguette", "Croissant") == -1);
    assert(dstrcmp("Fraise", "Fraise")      == 0);

    // Multi-byte UTF-8 content, checked at compile time and at run time
    static assert(dstrcmp("안녕하세요!", "안녕하세요!") == 0);
    assert(dstrcmp("안녕하세요!", "안녕하세요!") == 0);
}
218 
/**
 * Copies a string literal of inferred length `N` into a static array of
 * length `N + 1`, whose last element is a null character.
 *
 * Params:
 *  literal = string literal
 *
 * Returns:
 *  The content of `literal` followed by `'\0'`, as a `char[N + 1]`
 *
 * Notes:
 *  - LDC produces quite optimal code for short strings:
 *    - https://d.godbolt.org/z/M69Z1g
 *    - https://gist.github.com/PetarKirov/338e4ab9292b6b2b311a3070572a07fb (backup URL)
*/
char[N + 1] toStaticArray(size_t N)(scope const(char)[N] literal)
{
    // Every element is written below, so default initialization is skipped
    char[N + 1] terminated = void;
    terminated[0 .. N] = literal[];
    terminated[N] = '\0';
    return terminated;
}
239 
///
@safe pure nothrow @nogc
unittest
{
    // The result can be stored with any qualifier, or computed at compile time
    enum e = toStaticArray("123");
    immutable i = toStaticArray("123");
    const c = toStaticArray("123");
    auto m = toStaticArray("123");

    static assert(e == "123\0");
    assert(i == "123\0");
    assert(c == "123\0");
    assert(m == "123\0");

    // An empty literal yields just the terminator
    const empty = "".toStaticArray;
    static assert(empty.length == 1);
    static assert(empty[0] == '\0');
}
258 
/**
 * Tests whether the C string `p` begins with `needle`.
 *
 * Both `p` and `needle.ptr` must be non-null, and `needle` must not contain
 * a null character; these conditions are checked with assertions.
 *
 * Params:
 *     p = the C string to check
 *     needle = the string to look for
 * Returns:
 *    `true` if `p` starts with `needle`
 */
@system pure nothrow @nogc
bool startsWith(scope const(char)* p, scope const(char)[] needle)
in { assert(p && needle.ptr); }
do
{
    for (size_t pos = 0; pos < needle.length; ++pos)
    {
        const expected = needle[pos];
        assert(expected);
        // `expected` is non-zero, so reaching the terminator of `p` is a
        // mismatch and the scan stops before reading past it
        if (p[pos] != expected)
            return false;
    }
    return true;
}
281 
///
@system pure nothrow @nogc
unittest
{
    const buf = "123".toStaticArray;
    const(char)* cstr = buf.ptr;

    // Every prefix, including the empty one and the whole string, matches
    assert(startsWith(cstr, ""));
    assert(startsWith(cstr, "1"));
    assert(startsWith(cstr, "12"));
    assert(startsWith(cstr, "123"));

    // A needle longer than the C string does not match
    assert(!startsWith(cstr, "1234"));
}