1 /** 2 * Spell checker 3 * 4 * Does not have any dependencies on the rest of DMD. 5 * 6 * Copyright: Copyright (C) 1999-2021 by The D Language Foundation, All Rights Reserved 7 * Authors: Walter Bright, http://www.digitalmars.com 8 * License: $(LINK2 http://www.boost.org/LICENSE_1_0.txt, Boost License 1.0) 9 * Source: $(LINK2 https://github.com/dlang/dmd/blob/master/src/dmd/root/speller.d, root/_speller.d) 10 * Documentation: https://dlang.org/phobos/dmd_root_speller.html 11 * Coverage: https://codecov.io/gh/dlang/dmd/src/master/src/dmd/root/speller.d 12 */ 13 14 module dmd.root.speller; 15 16 /************************************************** 17 * Looks for correct spelling. 18 * Looks a distance of up to two. 19 * This does an exhaustive search, so can potentially be very slow. 20 * Params: 21 * seed = wrongly spelled word 22 * dg = search delegate of the form `T delegate(const(char)[] p, out int cost)` 23 * Returns: 24 * T.init = no correct spellings found, 25 * otherwise the value returned by dg() for first possible correct spelling 26 */ 27 auto speller(alias dg)(const(char)[] seed) 28 if (isSearchFunction!dg) 29 { 30 const size_t maxdist = seed.length < 4 ? seed.length / 2 : 2; 31 foreach (distance; 0 .. maxdist) 32 { 33 if (auto p = spellerX!dg(seed, distance != 0)) 34 return p; 35 // if (seedlen > 10) 36 // break; 37 } 38 return null; // didn't find it 39 } 40 41 private: 42 43 import core.stdc.stdlib; 44 import core.stdc.string; 45 46 enum isSearchFunction(alias fun) = is(searchFunctionType!fun); 47 alias searchFunctionType(alias fun) = typeof(() {int x; return fun("", x);}()); 48 49 /************************************* 50 * Spell check level 1. 51 * Params: 52 * dg = delegate that looks up string in dictionary AA and returns value found 53 * seed = starting string 54 * flag = if true, do 2 level lookup otherwise 1 level 55 * Returns: 56 * whatever dg returns, null if no match 57 */ 58 auto spellerX(alias dg)(const(char)[] seed, bool flag) 59 { 60 if (!seed.length) 61 return null; 62 63 /* Need buffer to store trial strings in 64 */ 65 char[30] tmp = void; 66 char[] buf; 67 if (seed.length <= tmp.sizeof - 1) 68 buf = tmp; 69 else 70 { 71 buf = (cast(char*)alloca(seed.length + 1))[0 .. seed.length + 1]; // leave space for extra char 72 if (!buf.ptr) 73 return null; // no matches 74 } 75 76 int cost = int.max; 77 searchFunctionType!dg p = null; 78 79 /* Deletions */ 80 buf[0 .. seed.length - 1] = seed[1 .. $]; 81 foreach (i; 0 .. seed.length) 82 { 83 //printf("del buf = '%s'\n", buf); 84 int ncost; 85 auto np = flag ? spellerY!dg(buf[0 .. seed.length - 1], i, ncost) 86 : dg(buf[0 .. seed.length - 1], ncost); 87 if (combineSpellerResult(p, cost, np, ncost)) 88 return p; 89 buf[i] = seed[i]; 90 } 91 92 /* Transpositions */ 93 if (!flag) 94 { 95 buf[0 .. seed.length] = seed; 96 for (size_t i = 0; i + 1 < seed.length; i++) 97 { 98 // swap [i] and [i + 1] 99 buf[i] = seed[i + 1]; 100 buf[i + 1] = seed[i]; 101 //printf("tra buf = '%s'\n", buf); 102 int ncost; 103 auto np = dg(buf[0 .. seed.length], ncost); 104 if (combineSpellerResult(p, cost, np, ncost)) 105 return p; 106 buf[i] = seed[i]; 107 } 108 } 109 110 /* Substitutions */ 111 buf[0 .. seed.length] = seed; 112 foreach (i; 0 .. seed.length) 113 { 114 foreach (s; idchars) 115 { 116 buf[i] = s; 117 //printf("sub buf = '%s'\n", buf); 118 int ncost; 119 auto np = flag ? spellerY!dg(buf[0 .. seed.length], i + 1, ncost) 120 : dg(buf[0 .. seed.length], ncost); 121 if (combineSpellerResult(p, cost, np, ncost)) 122 return p; 123 } 124 buf[i] = seed[i]; 125 } 126 127 /* Insertions */ 128 buf[1 .. seed.length + 1] = seed; 129 foreach (i; 0 .. seed.length + 1) // yes, do seed.length+1 iterations 130 { 131 foreach (s; idchars) 132 { 133 buf[i] = s; 134 //printf("ins buf = '%s'\n", buf); 135 int ncost; 136 auto np = flag ? spellerY!dg(buf[0 .. seed.length + 1], i + 1, ncost) 137 : dg(buf[0 .. seed.length + 1], ncost); 138 if (combineSpellerResult(p, cost, np, ncost)) 139 return p; 140 } 141 if (i < seed.length) 142 buf[i] = seed[i]; 143 } 144 145 return p; // return "best" result 146 } 147 148 /********************************************** 149 * Do second level of spell matching. 150 * Params: 151 * dg = delegate that looks up string in dictionary AA and returns value found 152 * seed = starting string 153 * index = index into seed[] that is where we will mutate it 154 * cost = set to cost of match 155 * Returns: 156 * whatever dg returns, null if no match 157 */ 158 auto spellerY(alias dg)(const(char)[] seed, size_t index, out int cost) 159 { 160 if (!seed.length) 161 return null; 162 163 /* Allocate a buf to store the new string to play with, needs 164 * space for an extra char for insertions 165 */ 166 char[30] tmp = void; // stack allocations are fastest 167 char[] buf; 168 if (seed.length <= tmp.sizeof - 1) 169 buf = tmp; 170 else 171 { 172 buf = (cast(char*)alloca(seed.length + 1))[0 .. seed.length + 1]; // leave space for extra char 173 if (!buf.ptr) 174 return null; // no matches 175 } 176 buf[0 .. index] = seed[0 .. index]; 177 178 cost = int.max; // start with worst possible match 179 searchFunctionType!dg p = null; 180 181 /* Delete character at seed[index] */ 182 if (index < seed.length) 183 { 184 buf[index .. seed.length - 1] = seed[index + 1 .. $]; // seed[] with deleted character 185 int ncost; 186 auto np = dg(buf[0 .. seed.length - 1], ncost); // look it up 187 if (combineSpellerResult(p, cost, np, ncost)) // compare with prev match 188 return p; // cannot get any better 189 } 190 191 /* Substitute character at seed[index] */ 192 if (index < seed.length) 193 { 194 buf[0 .. seed.length] = seed; 195 foreach (s; idchars) 196 { 197 buf[index] = s; // seed[] with substituted character 198 //printf("sub buf = '%s'\n", buf); 199 int ncost; 200 auto np = dg(buf[0 .. seed.length], ncost); 201 if (combineSpellerResult(p, cost, np, ncost)) 202 return p; 203 } 204 } 205 206 /* Insert character at seed[index] */ 207 buf[index + 1 .. seed.length + 1] = seed[index .. $]; 208 foreach (s; idchars) 209 { 210 buf[index] = s; 211 //printf("ins buf = '%s'\n", buf); 212 int ncost; 213 auto np = dg(buf[0 .. seed.length + 1], ncost); 214 if (combineSpellerResult(p, cost, np, ncost)) 215 return p; 216 } 217 return p; // return "best" result 218 } 219 220 221 /* Characters used to substitute ones in the string we're checking 222 * the spelling on. 223 */ 224 immutable string idchars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_"; 225 226 /************************************************** 227 * Combine a new result from the spell checker to 228 * find the one with the closest symbol with 229 * respect to the cost defined by the search function 230 * Params: 231 * p = best found spelling so far, T.init if none found yet. 232 * If np is better, p is replaced with np 233 * cost = cost of p (int.max if none found yet). 234 * If np is better, cost is replaced with ncost 235 * np = current spelling to check against p, T.init if none 236 * ncost = cost of np if np is not T.init 237 * Returns: 238 * true if the cost is less or equal 0, meaning we can stop looking 239 * false otherwise 240 */ 241 bool combineSpellerResult(T)(ref T p, ref int cost, T np, int ncost) 242 { 243 if (np && ncost < cost) // if np is better 244 { 245 p = np; // np is new best match 246 cost = ncost; 247 if (cost <= 0) 248 return true; // meaning we can stop looking 249 } 250 return false; 251 } 252 253 /************************************* Tests ***********************/ 254 255 unittest 256 { 257 static immutable string[][] cases = 258 [ 259 ["hello", "hell", "y"], 260 ["hello", "hel", "y"], 261 ["hello", "ello", "y"], 262 ["hello", "llo", "y"], 263 ["hello", "hellox", "y"], 264 ["hello", "helloxy", "y"], 265 ["hello", "xhello", "y"], 266 ["hello", "xyhello", "y"], 267 ["hello", "ehllo", "y"], 268 ["hello", "helol", "y"], 269 ["hello", "abcd", "n"], 270 ["hello", "helxxlo", "y"], 271 ["hello", "ehlxxlo", "n"], 272 ["hello", "heaao", "y"], 273 ["_123456789_123456789_123456789_123456789", "_123456789_123456789_123456789_12345678", "y"], 274 ]; 275 //printf("unittest_speller()\n"); 276 277 string dgarg; 278 279 string speller_test(const(char)[] s, ref int cost) 280 { 281 assert(s[$-1] != '\0'); 282 //printf("speller_test(%s, %s)\n", dgarg, s); 283 cost = 0; 284 if (dgarg == s) 285 return dgarg; 286 return null; 287 } 288 289 dgarg = "hell"; 290 auto p = speller!speller_test("hello"); 291 assert(p !is null); 292 foreach (testCase; cases) 293 { 294 //printf("case [%d]\n", i); 295 dgarg = testCase[1]; 296 auto p2 = speller!speller_test(testCase[0]); 297 if (p2) 298 assert(testCase[2][0] == 'y'); 299 else 300 assert(testCase[2][0] == 'n'); 301 } 302 //printf("unittest_speller() success\n"); 303 }