1 /**
2  * Spell checker
3  *
4  * Does not have any dependencies on the rest of DMD.
5  *
6  * Copyright: Copyright (C) 1999-2021 by The D Language Foundation, All Rights Reserved
7  * Authors:   Walter Bright, http://www.digitalmars.com
8  * License:   $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0)
9  * Source:    $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/root/speller.d, root/_speller.d)
10  * Documentation:  https://dlang.org/phobos/dmd_root_speller.html
11  * Coverage:    https://codecov.io/gh/dlang/dmd/src/master/src/dmd/root/speller.d
12  */
13 
14 module dmd.root.speller;
15 
16 /**************************************************
17  * Looks for correct spelling.
18  * Looks a distance of up to two.
19  * This does an exhaustive search, so can potentially be very slow.
20  * Params:
21  *      seed = wrongly spelled word
22  *      dg = search delegate of the form `T delegate(const(char)[] p, out int cost)`
23  * Returns:
24  *      T.init = no correct spellings found,
25  *      otherwise the value returned by dg() for first possible correct spelling
26  */
27 auto speller(alias dg)(const(char)[] seed)
28 if (isSearchFunction!dg)
29 {
30     const size_t maxdist = seed.length < 4 ? seed.length / 2 : 2;
31     foreach (distance; 0 .. maxdist)
32     {
33         if (auto p = spellerX!dg(seed, distance != 0))
34             return p;
35         //      if (seedlen > 10)
36         //          break;
37     }
38     return null; // didn't find it
39 }
40 
41 private:
42 
43 import core.stdc.stdlib;
44 import core.stdc.string;
45 
46 enum isSearchFunction(alias fun) = is(searchFunctionType!fun);
47 alias searchFunctionType(alias fun) = typeof(() {int x; return fun("", x);}());
48 
49 /*************************************
50  * Spell check level 1.
51  * Params:
52  *      dg = delegate that looks up string in dictionary AA and returns value found
53  *      seed = starting string
54  *      flag = if true, do 2 level lookup otherwise 1 level
55  * Returns:
56  *      whatever dg returns, null if no match
57  */
58 auto spellerX(alias dg)(const(char)[] seed, bool flag)
59 {
60     if (!seed.length)
61         return null;
62 
63     /* Need buffer to store trial strings in
64      */
65     char[30] tmp = void;
66     char[] buf;
67     if (seed.length <= tmp.sizeof - 1)
68         buf = tmp;
69     else
70     {
71         buf = (cast(char*)alloca(seed.length + 1))[0 .. seed.length + 1]; // leave space for extra char
72         if (!buf.ptr)
73             return null; // no matches
74     }
75 
76     int cost = int.max;
77     searchFunctionType!dg p = null;
78 
79     /* Deletions */
80     buf[0 .. seed.length - 1] = seed[1 .. $];
81     foreach (i; 0 .. seed.length)
82     {
83         //printf("del buf = '%s'\n", buf);
84         int ncost;
85         auto np = flag ? spellerY!dg(buf[0 .. seed.length - 1], i, ncost)
86                        : dg(buf[0 .. seed.length - 1], ncost);
87         if (combineSpellerResult(p, cost, np, ncost))
88             return p;
89         buf[i] = seed[i];
90     }
91 
92     /* Transpositions */
93     if (!flag)
94     {
95         buf[0 .. seed.length] = seed;
96         for (size_t i = 0; i + 1 < seed.length; i++)
97         {
98             // swap [i] and [i + 1]
99             buf[i] = seed[i + 1];
100             buf[i + 1] = seed[i];
101             //printf("tra buf = '%s'\n", buf);
102             int ncost;
103             auto np = dg(buf[0 .. seed.length], ncost);
104             if (combineSpellerResult(p, cost, np, ncost))
105                 return p;
106             buf[i] = seed[i];
107         }
108     }
109 
110     /* Substitutions */
111     buf[0 .. seed.length] = seed;
112     foreach (i; 0 .. seed.length)
113     {
114         foreach (s; idchars)
115         {
116             buf[i] = s;
117             //printf("sub buf = '%s'\n", buf);
118             int ncost;
119             auto np = flag ? spellerY!dg(buf[0 .. seed.length], i + 1, ncost)
120                            : dg(buf[0 .. seed.length], ncost);
121             if (combineSpellerResult(p, cost, np, ncost))
122                 return p;
123         }
124         buf[i] = seed[i];
125     }
126 
127     /* Insertions */
128     buf[1 .. seed.length + 1] = seed;
129     foreach (i; 0 .. seed.length + 1) // yes, do seed.length+1 iterations
130     {
131         foreach (s; idchars)
132         {
133             buf[i] = s;
134             //printf("ins buf = '%s'\n", buf);
135             int ncost;
136             auto np = flag ? spellerY!dg(buf[0 .. seed.length + 1], i + 1, ncost)
137                            : dg(buf[0 .. seed.length + 1], ncost);
138             if (combineSpellerResult(p, cost, np, ncost))
139                 return p;
140         }
141         if (i < seed.length)
142             buf[i] = seed[i];
143     }
144 
145     return p; // return "best" result
146 }
147 
148 /**********************************************
149  * Do second level of spell matching.
150  * Params:
151  *      dg = delegate that looks up string in dictionary AA and returns value found
152  *      seed = starting string
153  *      index = index into seed[] that is where we will mutate it
154  *      cost = set to cost of match
155  * Returns:
156  *      whatever dg returns, null if no match
157  */
158 auto spellerY(alias dg)(const(char)[] seed, size_t index, out int cost)
159 {
160     if (!seed.length)
161         return null;
162 
163     /* Allocate a buf to store the new string to play with, needs
164      * space for an extra char for insertions
165      */
166     char[30] tmp = void;        // stack allocations are fastest
167     char[] buf;
168     if (seed.length <= tmp.sizeof - 1)
169         buf = tmp;
170     else
171     {
172         buf = (cast(char*)alloca(seed.length + 1))[0 .. seed.length + 1]; // leave space for extra char
173         if (!buf.ptr)
174             return null; // no matches
175     }
176     buf[0 .. index] = seed[0 .. index];
177 
178     cost = int.max;             // start with worst possible match
179     searchFunctionType!dg p = null;
180 
181     /* Delete character at seed[index] */
182     if (index < seed.length)
183     {
184         buf[index .. seed.length - 1] = seed[index + 1 .. $]; // seed[] with deleted character
185         int ncost;
186         auto np = dg(buf[0 .. seed.length - 1], ncost); // look it up
187         if (combineSpellerResult(p, cost, np, ncost))   // compare with prev match
188             return p;                                   // cannot get any better
189     }
190 
191     /* Substitute character at seed[index] */
192     if (index < seed.length)
193     {
194         buf[0 .. seed.length] = seed;
195         foreach (s; idchars)
196         {
197             buf[index] = s;     // seed[] with substituted character
198             //printf("sub buf = '%s'\n", buf);
199             int ncost;
200             auto np = dg(buf[0 .. seed.length], ncost);
201             if (combineSpellerResult(p, cost, np, ncost))
202                 return p;
203         }
204     }
205 
206     /* Insert character at seed[index] */
207     buf[index + 1 .. seed.length + 1] = seed[index .. $];
208     foreach (s; idchars)
209     {
210         buf[index] = s;
211         //printf("ins buf = '%s'\n", buf);
212         int ncost;
213         auto np = dg(buf[0 .. seed.length + 1], ncost);
214         if (combineSpellerResult(p, cost, np, ncost))
215             return p;
216     }
217     return p; // return "best" result
218 }
219 
220 
221 /* Characters used to substitute ones in the string we're checking
222  * the spelling on.
223  */
224 immutable string idchars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_";
225 
226 /**************************************************
227  * Combine a new result from the spell checker to
228  * find the one with the closest symbol with
229  * respect to the cost defined by the search function
230  * Params:
231  *      p = best found spelling so far, T.init if none found yet.
232  *          If np is better, p is replaced with np
233  *      cost = cost of p (int.max if none found yet).
234  *          If np is better, cost is replaced with ncost
235  *      np = current spelling to check against p, T.init if none
236  *      ncost = cost of np if np is not T.init
237  * Returns:
238  *      true    if the cost is less or equal 0, meaning we can stop looking
239  *      false   otherwise
240  */
241 bool combineSpellerResult(T)(ref T p, ref int cost, T np, int ncost)
242 {
243     if (np && ncost < cost) // if np is better
244     {
245         p = np;             // np is new best match
246         cost = ncost;
247         if (cost <= 0)
248             return true;    // meaning we can stop looking
249     }
250     return false;
251 }
252 
253 /************************************* Tests ***********************/
254 
255 unittest
256 {
257     static immutable string[][] cases =
258     [
259         ["hello", "hell", "y"],
260         ["hello", "hel", "y"],
261         ["hello", "ello", "y"],
262         ["hello", "llo", "y"],
263         ["hello", "hellox", "y"],
264         ["hello", "helloxy", "y"],
265         ["hello", "xhello", "y"],
266         ["hello", "xyhello", "y"],
267         ["hello", "ehllo", "y"],
268         ["hello", "helol", "y"],
269         ["hello", "abcd", "n"],
270         ["hello", "helxxlo", "y"],
271         ["hello", "ehlxxlo", "n"],
272         ["hello", "heaao", "y"],
273         ["_123456789_123456789_123456789_123456789", "_123456789_123456789_123456789_12345678", "y"],
274     ];
275     //printf("unittest_speller()\n");
276 
277     string dgarg;
278 
279     string speller_test(const(char)[] s, ref int cost)
280     {
281         assert(s[$-1] != '\0');
282         //printf("speller_test(%s, %s)\n", dgarg, s);
283         cost = 0;
284         if (dgarg == s)
285             return dgarg;
286         return null;
287     }
288 
289     dgarg = "hell";
290     auto p = speller!speller_test("hello");
291     assert(p !is null);
292     foreach (testCase; cases)
293     {
294         //printf("case [%d]\n", i);
295         dgarg = testCase[1];
296         auto p2 = speller!speller_test(testCase[0]);
297         if (p2)
298             assert(testCase[2][0] == 'y');
299         else
300             assert(testCase[2][0] == 'n');
301     }
302     //printf("unittest_speller() success\n");
303 }