1 | <?php |
||
2 | |||
3 | /** |
||
4 | * webtrees: online genealogy |
||
5 | * Copyright (C) 2025 webtrees development team |
||
6 | * This program is free software: you can redistribute it and/or modify |
||
7 | * it under the terms of the GNU General Public License as published by |
||
8 | * the Free Software Foundation, either version 3 of the License, or |
||
9 | * (at your option) any later version. |
||
10 | * This program is distributed in the hope that it will be useful, |
||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
13 | * GNU General Public License for more details. |
||
14 | * You should have received a copy of the GNU General Public License |
||
15 | * along with this program. If not, see <https://www.gnu.org/licenses/>. |
||
16 | */ |
||
17 | |||
18 | declare(strict_types=1); |
||
19 | |||
20 | namespace Fisharebest\Webtrees; |
||
21 | |||
22 | use function array_slice; |
||
23 | use function count; |
||
24 | use function strlen; |
||
25 | |||
26 | /** |
||
27 | * Phonetic matching of strings. |
||
28 | */ |
||
29 | class Soundex |
||
30 | { |
||
31 | // Determine the Daitch–Mokotoff Soundex code for a word |
||
32 | // Original implementation by Gerry Kroll, and analysis by Meliza Amity |
||
33 | |||
// Maximum length of a DM_SOUNDS key, measured in bytes (NOT in UTF-8 characters).
// The table is scanned with byte-wise substr(), starting with a chunk of this many
// bytes and shortening it until a key matches. The longest key currently in the
// table ('SCHTSCH') is 7 bytes, so this value must grow if a longer key is added.
private const int MAXCHAR = 7;
36 | |||
/**
 * Name transformation rules.
 * Used to transform the name string to simplify the "sounds like" table.
 * This is especially useful in Hebrew.
 *
 * Each entry is a [from, to] pair. daitchMokotoffWord() applies the pairs in the
 * order listed, each one as a plain str_replace($from, $to, $name) call, so the
 * order of the entries matters.
 *
 * Note about the use of "\x01":
 * This byte, which can't legitimately occur in the kind of text we're dealing with,
 * is used as a place-holder so that conditional string replacements can be done:
 * any "\x01" in the input is removed first, the place-holder is inserted by the
 * conditional rules, and the final rule turns it back into real letters.
 *
 * NOTE(review): the two rules whose "from" value ends in "$" look like they were
 * written for a regular-expression replace (end-of-word anchor). With str_replace
 * the "$" is an ordinary character, so these rules only fire when the name contains
 * a literal "$". Confirm the intended behaviour before changing it: changing the
 * rules changes the codes produced for the affected names.
 */
private const array TRANSFORM_NAMES = [
    // Force Yiddish ligatures to be treated as separate letters
    ['װ', 'וו'],
    ['ײ', 'יי'],
    ['ױ', 'וי'],
    ['בו', 'בע'],
    ['פו', 'פע'],
    ['ומ', 'עמ'],
    ['ום', 'עם'],
    ['ונ', 'ענ'],
    ['ון', 'ען'],
    ['וו', 'ב'],
    // Remove any stray place-holder bytes from the input
    ["\x01", ''],
    // Conditional rules: protect the matched letters behind the place-holder
    ['ייה$', "\x01ה"],
    ['ייע$', "\x01ע"],
    ['יי', 'ע'],
    // Restore the letters protected by the place-holder
    ["\x01", 'יי'],
];
||
67 | |||
68 | /** |
||
69 | * The DM sound coding table is organized this way: |
||
70 | * key: a variable-length string that corresponds to the UTF-8 character sequence |
||
71 | * represented by the table entry. Currently, that string can be up to 7 |
||
72 | * bytes long. This maximum length is defined by the class constant |
||
73 | * self::MAXCHAR. |
||
74 | * |
||
75 | * value: an array as follows: |
||
76 | * [0]: zero if not a vowel |
||
77 | * [1]: sound value when this string is at the beginning of the word |
||
78 | * [2]: sound value when this string is followed by a vowel |
||
79 | * [3]: sound value for other cases |
||
80 | * [1],[2],[3] can be repeated several times to create branches in the code |
||
81 | * an empty sound value means "ignore in this state" |
||
82 | */ |
||
83 | private const array DM_SOUNDS = [ |
||
84 | 'A' => ['1', '0', '', ''], |
||
85 | 'À' => ['1', '0', '', ''], |
||
86 | 'Á' => ['1', '0', '', ''], |
||
87 | 'Â' => ['1', '0', '', ''], |
||
88 | 'Ã' => ['1', '0', '', ''], |
||
89 | 'Ä' => ['1', '0', '1', '', '0', '', ''], |
||
90 | 'Å' => ['1', '0', '', ''], |
||
91 | 'Ă' => ['1', '0', '', ''], |
||
92 | 'Ą' => ['1', '', '', '', '', '', '6'], |
||
93 | 'Ạ' => ['1', '0', '', ''], |
||
94 | 'Ả' => ['1', '0', '', ''], |
||
95 | 'Ấ' => ['1', '0', '', ''], |
||
96 | 'Ầ' => ['1', '0', '', ''], |
||
97 | 'Ẩ' => ['1', '0', '', ''], |
||
98 | 'Ẫ' => ['1', '0', '', ''], |
||
99 | 'Ậ' => ['1', '0', '', ''], |
||
100 | 'Ắ' => ['1', '0', '', ''], |
||
101 | 'Ằ' => ['1', '0', '', ''], |
||
102 | 'Ẳ' => ['1', '0', '', ''], |
||
103 | 'Ẵ' => ['1', '0', '', ''], |
||
104 | 'Ặ' => ['1', '0', '', ''], |
||
105 | 'AE' => ['1', '0', '1', ''], |
||
106 | 'Æ' => ['1', '0', '1', ''], |
||
107 | 'AI' => ['1', '0', '1', ''], |
||
108 | 'AJ' => ['1', '0', '1', ''], |
||
109 | 'AU' => ['1', '0', '7', ''], |
||
110 | 'AV' => ['1', '0', '7', '', '7', '7', '7'], |
||
111 | 'ÄU' => ['1', '0', '1', ''], |
||
112 | 'AY' => ['1', '0', '1', ''], |
||
113 | 'B' => ['0', '7', '7', '7'], |
||
114 | 'C' => ['0', '5', '5', '5', '34', '4', '4'], |
||
115 | 'Ć' => ['0', '4', '4', '4'], |
||
116 | 'Č' => ['0', '4', '4', '4'], |
||
117 | 'Ç' => ['0', '4', '4', '4'], |
||
118 | 'CH' => ['0', '5', '5', '5', '34', '4', '4'], |
||
119 | 'CHS' => ['0', '5', '54', '54'], |
||
120 | 'CK' => ['0', '5', '5', '5', '45', '45', '45'], |
||
121 | 'CCS' => ['0', '4', '4', '4'], |
||
122 | 'CS' => ['0', '4', '4', '4'], |
||
123 | 'CSZ' => ['0', '4', '4', '4'], |
||
124 | 'CZ' => ['0', '4', '4', '4'], |
||
125 | 'CZS' => ['0', '4', '4', '4'], |
||
126 | 'D' => ['0', '3', '3', '3'], |
||
127 | 'Ď' => ['0', '3', '3', '3'], |
||
128 | 'Đ' => ['0', '3', '3', '3'], |
||
129 | 'DRS' => ['0', '4', '4', '4'], |
||
130 | 'DRZ' => ['0', '4', '4', '4'], |
||
131 | 'DS' => ['0', '4', '4', '4'], |
||
132 | 'DSH' => ['0', '4', '4', '4'], |
||
133 | 'DSZ' => ['0', '4', '4', '4'], |
||
134 | 'DT' => ['0', '3', '3', '3'], |
||
135 | 'DDZ' => ['0', '4', '4', '4'], |
||
136 | 'DDZS' => ['0', '4', '4', '4'], |
||
137 | 'DZ' => ['0', '4', '4', '4'], |
||
138 | 'DŹ' => ['0', '4', '4', '4'], |
||
139 | 'DŻ' => ['0', '4', '4', '4'], |
||
140 | 'DZH' => ['0', '4', '4', '4'], |
||
141 | 'DZS' => ['0', '4', '4', '4'], |
||
142 | 'E' => ['1', '0', '', ''], |
||
143 | 'È' => ['1', '0', '', ''], |
||
144 | 'É' => ['1', '0', '', ''], |
||
145 | 'Ê' => ['1', '0', '', ''], |
||
146 | 'Ë' => ['1', '0', '', ''], |
||
147 | 'Ĕ' => ['1', '0', '', ''], |
||
148 | 'Ė' => ['1', '0', '', ''], |
||
149 | 'Ę' => ['1', '', '', '6', '', '', ''], |
||
150 | 'Ẹ' => ['1', '0', '', ''], |
||
151 | 'Ẻ' => ['1', '0', '', ''], |
||
152 | 'Ẽ' => ['1', '0', '', ''], |
||
153 | 'Ế' => ['1', '0', '', ''], |
||
154 | 'Ề' => ['1', '0', '', ''], |
||
155 | 'Ể' => ['1', '0', '', ''], |
||
156 | 'Ễ' => ['1', '0', '', ''], |
||
157 | 'Ệ' => ['1', '0', '', ''], |
||
158 | 'EAU' => ['1', '0', '', ''], |
||
159 | 'EI' => ['1', '0', '1', ''], |
||
160 | 'EJ' => ['1', '0', '1', ''], |
||
161 | 'EU' => ['1', '1', '1', ''], |
||
162 | 'EY' => ['1', '0', '1', ''], |
||
163 | 'F' => ['0', '7', '7', '7'], |
||
164 | 'FB' => ['0', '7', '7', '7'], |
||
165 | 'G' => ['0', '5', '5', '5', '34', '4', '4'], |
||
166 | 'Ğ' => ['0', '', '', ''], |
||
167 | 'GGY' => ['0', '5', '5', '5'], |
||
168 | 'GY' => ['0', '5', '5', '5'], |
||
169 | 'H' => ['0', '5', '5', '', '5', '5', '5'], |
||
170 | 'I' => ['1', '0', '', ''], |
||
171 | 'Ì' => ['1', '0', '', ''], |
||
172 | 'Í' => ['1', '0', '', ''], |
||
173 | 'Î' => ['1', '0', '', ''], |
||
174 | 'Ï' => ['1', '0', '', ''], |
||
175 | 'Ĩ' => ['1', '0', '', ''], |
||
176 | 'Į' => ['1', '0', '', ''], |
||
177 | 'İ' => ['1', '0', '', ''], |
||
178 | 'Ỉ' => ['1', '0', '', ''], |
||
179 | 'Ị' => ['1', '0', '', ''], |
||
180 | 'IA' => ['1', '1', '', ''], |
||
181 | 'IE' => ['1', '1', '', ''], |
||
182 | 'IO' => ['1', '1', '', ''], |
||
183 | 'IU' => ['1', '1', '', ''], |
||
184 | 'J' => ['0', '1', '', '', '4', '4', '4', '5', '5', ''], |
||
185 | 'K' => ['0', '5', '5', '5'], |
||
186 | 'KH' => ['0', '5', '5', '5'], |
||
187 | 'KS' => ['0', '5', '54', '54'], |
||
188 | 'L' => ['0', '8', '8', '8'], |
||
189 | 'Ľ' => ['0', '8', '8', '8'], |
||
190 | 'Ĺ' => ['0', '8', '8', '8'], |
||
191 | 'Ł' => ['0', '7', '7', '7', '8', '8', '8'], |
||
192 | 'LL' => ['0', '8', '8', '8', '58', '8', '8', '1', '8', '8'], |
||
193 | 'LLY' => ['0', '8', '8', '8', '1', '8', '8'], |
||
194 | 'LY' => ['0', '8', '8', '8', '1', '8', '8'], |
||
195 | 'M' => ['0', '6', '6', '6'], |
||
196 | 'MĔ' => ['0', '66', '66', '66'], |
||
197 | 'MN' => ['0', '66', '66', '66'], |
||
198 | 'N' => ['0', '6', '6', '6'], |
||
199 | 'Ń' => ['0', '6', '6', '6'], |
||
200 | 'Ň' => ['0', '6', '6', '6'], |
||
201 | 'Ñ' => ['0', '6', '6', '6'], |
||
202 | 'NM' => ['0', '66', '66', '66'], |
||
203 | 'O' => ['1', '0', '', ''], |
||
204 | 'Ò' => ['1', '0', '', ''], |
||
205 | 'Ó' => ['1', '0', '', ''], |
||
206 | 'Ô' => ['1', '0', '', ''], |
||
207 | 'Õ' => ['1', '0', '', ''], |
||
208 | 'Ö' => ['1', '0', '', ''], |
||
209 | 'Ø' => ['1', '0', '', ''], |
||
210 | 'Ő' => ['1', '0', '', ''], |
||
211 | 'Œ' => ['1', '0', '', ''], |
||
212 | 'Ơ' => ['1', '0', '', ''], |
||
213 | 'Ọ' => ['1', '0', '', ''], |
||
214 | 'Ỏ' => ['1', '0', '', ''], |
||
215 | 'Ố' => ['1', '0', '', ''], |
||
216 | 'Ồ' => ['1', '0', '', ''], |
||
217 | 'Ổ' => ['1', '0', '', ''], |
||
218 | 'Ỗ' => ['1', '0', '', ''], |
||
219 | 'Ộ' => ['1', '0', '', ''], |
||
220 | 'Ớ' => ['1', '0', '', ''], |
||
221 | 'Ờ' => ['1', '0', '', ''], |
||
222 | 'Ở' => ['1', '0', '', ''], |
||
223 | 'Ỡ' => ['1', '0', '', ''], |
||
224 | 'Ợ' => ['1', '0', '', ''], |
||
225 | 'OE' => ['1', '0', '', ''], |
||
226 | 'OI' => ['1', '0', '1', ''], |
||
227 | 'OJ' => ['1', '0', '1', ''], |
||
228 | 'OU' => ['1', '0', '', ''], |
||
229 | 'OY' => ['1', '0', '1', ''], |
||
230 | 'P' => ['0', '7', '7', '7'], |
||
231 | 'PF' => ['0', '7', '7', '7'], |
||
232 | 'PH' => ['0', '7', '7', '7'], |
||
233 | 'Q' => ['0', '5', '5', '5'], |
||
234 | 'R' => ['0', '9', '9', '9'], |
||
235 | 'Ř' => ['0', '4', '4', '4'], |
||
236 | 'RS' => ['0', '4', '4', '4', '94', '94', '94'], |
||
237 | 'RZ' => ['0', '4', '4', '4', '94', '94', '94'], |
||
238 | 'S' => ['0', '4', '4', '4'], |
||
239 | 'Ś' => ['0', '4', '4', '4'], |
||
240 | 'Š' => ['0', '4', '4', '4'], |
||
241 | 'Ş' => ['0', '4', '4', '4'], |
||
242 | 'SC' => ['0', '2', '4', '4'], |
||
243 | 'ŠČ' => ['0', '2', '4', '4'], |
||
244 | 'SCH' => ['0', '4', '4', '4'], |
||
245 | 'SCHD' => ['0', '2', '43', '43'], |
||
246 | 'SCHT' => ['0', '2', '43', '43'], |
||
247 | 'SCHTCH' => ['0', '2', '4', '4'], |
||
248 | 'SCHTSCH' => ['0', '2', '4', '4'], |
||
249 | 'SCHTSH' => ['0', '2', '4', '4'], |
||
250 | 'SD' => ['0', '2', '43', '43'], |
||
251 | 'SH' => ['0', '4', '4', '4'], |
||
252 | 'SHCH' => ['0', '2', '4', '4'], |
||
253 | 'SHD' => ['0', '2', '43', '43'], |
||
254 | 'SHT' => ['0', '2', '43', '43'], |
||
255 | 'SHTCH' => ['0', '2', '4', '4'], |
||
256 | 'SHTSH' => ['0', '2', '4', '4'], |
||
257 | 'ß' => ['0', '', '4', '4'], |
||
258 | 'ST' => ['0', '2', '43', '43'], |
||
259 | 'STCH' => ['0', '2', '4', '4'], |
||
260 | 'STRS' => ['0', '2', '4', '4'], |
||
261 | 'STRZ' => ['0', '2', '4', '4'], |
||
262 | 'STSCH' => ['0', '2', '4', '4'], |
||
263 | 'STSH' => ['0', '2', '4', '4'], |
||
264 | 'SSZ' => ['0', '4', '4', '4'], |
||
265 | 'SZ' => ['0', '4', '4', '4'], |
||
266 | 'SZCS' => ['0', '2', '4', '4'], |
||
267 | 'SZCZ' => ['0', '2', '4', '4'], |
||
268 | 'SZD' => ['0', '2', '43', '43'], |
||
269 | 'SZT' => ['0', '2', '43', '43'], |
||
270 | 'T' => ['0', '3', '3', '3'], |
||
271 | 'Ť' => ['0', '3', '3', '3'], |
||
272 | 'Ţ' => ['0', '3', '3', '3', '4', '4', '4'], |
||
273 | 'TC' => ['0', '4', '4', '4'], |
||
274 | 'TCH' => ['0', '4', '4', '4'], |
||
275 | 'TH' => ['0', '3', '3', '3'], |
||
276 | 'TRS' => ['0', '4', '4', '4'], |
||
277 | 'TRZ' => ['0', '4', '4', '4'], |
||
278 | 'TS' => ['0', '4', '4', '4'], |
||
279 | 'TSCH' => ['0', '4', '4', '4'], |
||
280 | 'TSH' => ['0', '4', '4', '4'], |
||
281 | 'TSZ' => ['0', '4', '4', '4'], |
||
282 | 'TTCH' => ['0', '4', '4', '4'], |
||
283 | 'TTS' => ['0', '4', '4', '4'], |
||
284 | 'TTSCH' => ['0', '4', '4', '4'], |
||
285 | 'TTSZ' => ['0', '4', '4', '4'], |
||
286 | 'TTZ' => ['0', '4', '4', '4'], |
||
287 | 'TZ' => ['0', '4', '4', '4'], |
||
288 | 'TZS' => ['0', '4', '4', '4'], |
||
289 | 'U' => ['1', '0', '', ''], |
||
290 | 'Ù' => ['1', '0', '', ''], |
||
291 | 'Ú' => ['1', '0', '', ''], |
||
292 | 'Û' => ['1', '0', '', ''], |
||
293 | 'Ü' => ['1', '0', '', ''], |
||
294 | 'Ũ' => ['1', '0', '', ''], |
||
295 | 'Ū' => ['1', '0', '', ''], |
||
296 | 'Ů' => ['1', '0', '', ''], |
||
297 | 'Ű' => ['1', '0', '', ''], |
||
298 | 'Ų' => ['1', '0', '', ''], |
||
299 | 'Ư' => ['1', '0', '', ''], |
||
300 | 'Ụ' => ['1', '0', '', ''], |
||
301 | 'Ủ' => ['1', '0', '', ''], |
||
302 | 'Ứ' => ['1', '0', '', ''], |
||
303 | 'Ừ' => ['1', '0', '', ''], |
||
304 | 'Ử' => ['1', '0', '', ''], |
||
305 | 'Ữ' => ['1', '0', '', ''], |
||
306 | 'Ự' => ['1', '0', '', ''], |
||
307 | 'UE' => ['1', '0', '', ''], |
||
308 | 'UI' => ['1', '0', '1', ''], |
||
309 | 'UJ' => ['1', '0', '1', ''], |
||
310 | 'UY' => ['1', '0', '1', ''], |
||
311 | 'UW' => ['1', '0', '1', '', '0', '7', '7'], |
||
312 | 'V' => ['0', '7', '7', '7'], |
||
313 | 'W' => ['0', '7', '7', '7'], |
||
314 | 'X' => ['0', '5', '54', '54'], |
||
315 | 'Y' => ['1', '1', '', ''], |
||
316 | 'Ý' => ['1', '1', '', ''], |
||
317 | 'Ỳ' => ['1', '1', '', ''], |
||
318 | 'Ỵ' => ['1', '1', '', ''], |
||
319 | 'Ỷ' => ['1', '1', '', ''], |
||
320 | 'Ỹ' => ['1', '1', '', ''], |
||
321 | 'Z' => ['0', '4', '4', '4'], |
||
322 | 'Ź' => ['0', '4', '4', '4'], |
||
323 | 'Ż' => ['0', '4', '4', '4'], |
||
324 | 'Ž' => ['0', '4', '4', '4'], |
||
325 | 'ZD' => ['0', '2', '43', '43'], |
||
326 | 'ZDZ' => ['0', '2', '4', '4'], |
||
327 | 'ZDZH' => ['0', '2', '4', '4'], |
||
328 | 'ZH' => ['0', '4', '4', '4'], |
||
329 | 'ZHD' => ['0', '2', '43', '43'], |
||
330 | 'ZHDZH' => ['0', '2', '4', '4'], |
||
331 | 'ZS' => ['0', '4', '4', '4'], |
||
332 | 'ZSCH' => ['0', '4', '4', '4'], |
||
333 | 'ZSH' => ['0', '4', '4', '4'], |
||
334 | 'ZZS' => ['0', '4', '4', '4'], |
||
335 | // Cyrillic alphabet |
||
336 | 'А' => ['1', '0', '', ''], |
||
337 | 'Б' => ['0', '7', '7', '7'], |
||
338 | 'В' => ['0', '7', '7', '7'], |
||
339 | 'Г' => ['0', '5', '5', '5'], |
||
340 | 'Д' => ['0', '3', '3', '3'], |
||
341 | 'ДЗ' => ['0', '4', '4', '4'], |
||
342 | 'Е' => ['1', '0', '', ''], |
||
343 | 'Ё' => ['1', '0', '', ''], |
||
344 | 'Ж' => ['0', '4', '4', '4'], |
||
345 | 'З' => ['0', '4', '4', '4'], |
||
346 | 'И' => ['1', '0', '', ''], |
||
347 | 'Й' => ['1', '1', '', '', '4', '4', '4'], |
||
348 | 'К' => ['0', '5', '5', '5'], |
||
349 | 'Л' => ['0', '8', '8', '8'], |
||
350 | 'М' => ['0', '6', '6', '6'], |
||
351 | 'Н' => ['0', '6', '6', '6'], |
||
352 | 'О' => ['1', '0', '', ''], |
||
353 | 'П' => ['0', '7', '7', '7'], |
||
354 | 'Р' => ['0', '9', '9', '9'], |
||
355 | 'РЖ' => ['0', '4', '4', '4'], |
||
356 | 'С' => ['0', '4', '4', '4'], |
||
357 | 'Т' => ['0', '3', '3', '3'], |
||
358 | 'У' => ['1', '0', '', ''], |
||
359 | 'Ф' => ['0', '7', '7', '7'], |
||
360 | 'Х' => ['0', '5', '5', '5'], |
||
361 | 'Ц' => ['0', '4', '4', '4'], |
||
362 | 'Ч' => ['0', '4', '4', '4'], |
||
363 | 'Ш' => ['0', '4', '4', '4'], |
||
364 | 'Щ' => ['0', '2', '4', '4'], |
||
365 | 'Ъ' => ['0', '', '', ''], |
||
366 | 'Ы' => ['0', '1', '', ''], |
||
367 | 'Ь' => ['0', '', '', ''], |
||
368 | 'Э' => ['1', '0', '', ''], |
||
369 | 'Ю' => ['0', '1', '', ''], |
||
370 | 'Я' => ['0', '1', '', ''], |
||
371 | // Greek alphabet |
||
372 | 'Α' => ['1', '0', '', ''], |
||
373 | 'Ά' => ['1', '0', '', ''], |
||
374 | 'ΑΙ' => ['1', '0', '1', ''], |
||
375 | 'ΑΥ' => ['1', '0', '1', ''], |
||
376 | 'Β' => ['0', '7', '7', '7'], |
||
377 | 'Γ' => ['0', '5', '5', '5'], |
||
378 | 'Δ' => ['0', '3', '3', '3'], |
||
379 | 'Ε' => ['1', '0', '', ''], |
||
380 | 'Έ' => ['1', '0', '', ''], |
||
381 | 'ΕΙ' => ['1', '0', '1', ''], |
||
382 | 'ΕΥ' => ['1', '1', '1', ''], |
||
383 | 'Ζ' => ['0', '4', '4', '4'], |
||
384 | 'Η' => ['1', '0', '', ''], |
||
385 | 'Ή' => ['1', '0', '', ''], |
||
386 | 'Θ' => ['0', '3', '3', '3'], |
||
387 | 'Ι' => ['1', '0', '', ''], |
||
388 | 'Ί' => ['1', '0', '', ''], |
||
389 | 'Ϊ' => ['1', '0', '', ''], |
||
390 | 'ΐ' => ['1', '0', '', ''], |
||
391 | 'Κ' => ['0', '5', '5', '5'], |
||
392 | 'Λ' => ['0', '8', '8', '8'], |
||
393 | 'Μ' => ['0', '6', '6', '6'], |
||
394 | 'ΜΠ' => ['0', '7', '7', '7'], |
||
395 | 'Ν' => ['0', '6', '6', '6'], |
||
396 | 'ΝΤ' => ['0', '3', '3', '3'], |
||
397 | 'Ξ' => ['0', '5', '54', '54'], |
||
398 | 'Ο' => ['1', '0', '', ''], |
||
399 | 'Ό' => ['1', '0', '', ''], |
||
400 | 'ΟΙ' => ['1', '0', '1', ''], |
||
401 | 'ΟΥ' => ['1', '0', '1', ''], |
||
402 | 'Π' => ['0', '7', '7', '7'], |
||
403 | 'Ρ' => ['0', '9', '9', '9'], |
||
404 | 'Σ' => ['0', '4', '4', '4'], |
||
405 | 'ς' => ['0', '', '', '4'], |
||
406 | 'Τ' => ['0', '3', '3', '3'], |
||
407 | 'ΤΖ' => ['0', '4', '4', '4'], |
||
408 | 'ΤΣ' => ['0', '4', '4', '4'], |
||
409 | 'Υ' => ['1', '1', '', ''], |
||
410 | 'Ύ' => ['1', '1', '', ''], |
||
411 | 'Ϋ' => ['1', '1', '', ''], |
||
412 | 'ΰ' => ['1', '1', '', ''], |
||
413 | 'ΥΚ' => ['1', '5', '5', '5'], |
||
414 | 'ΥΥ' => ['1', '65', '65', '65'], |
||
415 | 'Φ' => ['0', '7', '7', '7'], |
||
416 | 'Χ' => ['0', '5', '5', '5'], |
||
417 | 'Ψ' => ['0', '7', '7', '7'], |
||
418 | 'Ω' => ['1', '0', '', ''], |
||
419 | 'Ώ' => ['1', '0', '', ''], |
||
420 | // Hebrew alphabet |
||
421 | 'א' => ['1', '0', '', ''], |
||
422 | 'או' => ['1', '0', '7', ''], |
||
423 | 'אג' => ['1', '4', '4', '4', '5', '5', '5', '34', '34', '34'], |
||
424 | 'בב' => ['0', '7', '7', '7', '77', '77', '77'], |
||
425 | 'ב' => ['0', '7', '7', '7'], |
||
426 | 'גג' => ['0', '4', '4', '4', '5', '5', '5', '45', '45', '45', '55', '55', '55', '54', '54', '54'], |
||
427 | 'גד' => ['0', '43', '43', '43', '53', '53', '53'], |
||
428 | 'גה' => ['0', '45', '45', '45', '55', '55', '55'], |
||
429 | 'גז' => ['0', '44', '44', '44', '45', '45', '45'], |
||
430 | 'גח' => ['0', '45', '45', '45', '55', '55', '55'], |
||
431 | 'גכ' => ['0', '45', '45', '45', '55', '55', '55'], |
||
432 | 'גך' => ['0', '45', '45', '45', '55', '55', '55'], |
||
433 | 'גצ' => ['0', '44', '44', '44', '45', '45', '45'], |
||
434 | 'גץ' => ['0', '44', '44', '44', '45', '45', '45'], |
||
435 | 'גק' => ['0', '45', '45', '45', '54', '54', '54'], |
||
436 | 'גש' => ['0', '44', '44', '44', '54', '54', '54'], |
||
437 | 'גת' => ['0', '43', '43', '43', '53', '53', '53'], |
||
438 | 'ג' => ['0', '4', '4', '4', '5', '5', '5'], |
||
439 | 'דז' => ['0', '4', '4', '4'], |
||
440 | 'דד' => ['0', '3', '3', '3', '33', '33', '33'], |
||
441 | 'דט' => ['0', '33', '33', '33'], |
||
442 | 'דש' => ['0', '4', '4', '4'], |
||
443 | 'דצ' => ['0', '4', '4', '4'], |
||
444 | 'דץ' => ['0', '4', '4', '4'], |
||
445 | 'ד' => ['0', '3', '3', '3'], |
||
446 | 'הג' => ['0', '54', '54', '54', '55', '55', '55'], |
||
447 | 'הכ' => ['0', '55', '55', '55'], |
||
448 | 'הח' => ['0', '55', '55', '55'], |
||
449 | 'הק' => ['0', '55', '55', '55', '5', '5', '5'], |
||
450 | 'הה' => ['0', '5', '5', '', '55', '55', ''], |
||
451 | 'ה' => ['0', '5', '5', ''], |
||
452 | 'וי' => ['1', '', '', '', '7', '7', '7'], |
||
453 | 'ו' => ['1', '7', '7', '7', '7', '', ''], |
||
454 | 'וו' => ['1', '7', '7', '7', '7', '', ''], |
||
455 | 'וופ' => ['1', '7', '7', '7', '77', '77', '77'], |
||
456 | 'זש' => ['0', '4', '4', '4', '44', '44', '44'], |
||
457 | 'זדז' => ['0', '2', '4', '4'], |
||
458 | 'ז' => ['0', '4', '4', '4'], |
||
459 | 'זג' => ['0', '44', '44', '44', '45', '45', '45'], |
||
460 | 'זז' => ['0', '4', '4', '4', '44', '44', '44'], |
||
461 | 'זס' => ['0', '44', '44', '44'], |
||
462 | 'זצ' => ['0', '44', '44', '44'], |
||
463 | 'זץ' => ['0', '44', '44', '44'], |
||
464 | 'חג' => ['0', '54', '54', '54', '53', '53', '53'], |
||
465 | 'חח' => ['0', '5', '5', '5', '55', '55', '55'], |
||
466 | 'חק' => ['0', '55', '55', '55', '5', '5', '5'], |
||
467 | 'חכ' => ['0', '45', '45', '45', '55', '55', '55'], |
||
468 | 'חס' => ['0', '5', '54', '54'], |
||
469 | 'חש' => ['0', '5', '54', '54'], |
||
470 | 'ח' => ['0', '5', '5', '5'], |
||
471 | 'טש' => ['0', '4', '4', '4'], |
||
472 | 'טד' => ['0', '33', '33', '33'], |
||
473 | 'טי' => ['0', '3', '3', '3', '4', '4', '4', '3', '3', '34'], |
||
474 | 'טת' => ['0', '33', '33', '33'], |
||
475 | 'טט' => ['0', '3', '3', '3', '33', '33', '33'], |
||
476 | 'ט' => ['0', '3', '3', '3'], |
||
477 | 'י' => ['1', '1', '', ''], |
||
478 | 'יא' => ['1', '1', '', '', '1', '1', '1'], |
||
479 | 'כג' => ['0', '55', '55', '55', '54', '54', '54'], |
||
480 | 'כש' => ['0', '5', '54', '54'], |
||
481 | 'כס' => ['0', '5', '54', '54'], |
||
482 | 'ככ' => ['0', '5', '5', '5', '55', '55', '55'], |
||
483 | 'כך' => ['0', '5', '5', '5', '55', '55', '55'], |
||
484 | 'כ' => ['0', '5', '5', '5'], |
||
485 | 'כח' => ['0', '55', '55', '55', '5', '5', '5'], |
||
486 | 'ך' => ['0', '', '5', '5'], |
||
487 | 'ל' => ['0', '8', '8', '8'], |
||
488 | 'לל' => ['0', '88', '88', '88', '8', '8', '8'], |
||
489 | 'מנ' => ['0', '66', '66', '66'], |
||
490 | 'מן' => ['0', '66', '66', '66'], |
||
491 | 'ממ' => ['0', '6', '6', '6', '66', '66', '66'], |
||
492 | 'מם' => ['0', '6', '6', '6', '66', '66', '66'], |
||
493 | 'מ' => ['0', '6', '6', '6'], |
||
494 | 'ם' => ['0', '', '6', '6'], |
||
495 | 'נמ' => ['0', '66', '66', '66'], |
||
496 | 'נם' => ['0', '66', '66', '66'], |
||
497 | 'ננ' => ['0', '6', '6', '6', '66', '66', '66'], |
||
498 | 'נן' => ['0', '6', '6', '6', '66', '66', '66'], |
||
499 | 'נ' => ['0', '6', '6', '6'], |
||
500 | 'ן' => ['0', '', '6', '6'], |
||
501 | 'סתש' => ['0', '2', '4', '4'], |
||
502 | 'סתז' => ['0', '2', '4', '4'], |
||
503 | 'סטז' => ['0', '2', '4', '4'], |
||
504 | 'סטש' => ['0', '2', '4', '4'], |
||
505 | 'סצד' => ['0', '2', '4', '4'], |
||
506 | 'סט' => ['0', '2', '4', '4', '43', '43', '43'], |
||
507 | 'סת' => ['0', '2', '4', '4', '43', '43', '43'], |
||
508 | 'סג' => ['0', '44', '44', '44', '4', '4', '4'], |
||
509 | 'סס' => ['0', '4', '4', '4', '44', '44', '44'], |
||
510 | 'סצ' => ['0', '44', '44', '44'], |
||
511 | 'סץ' => ['0', '44', '44', '44'], |
||
512 | 'סז' => ['0', '44', '44', '44'], |
||
513 | 'סש' => ['0', '44', '44', '44'], |
||
514 | 'ס' => ['0', '4', '4', '4'], |
||
515 | 'ע' => ['1', '0', '', ''], |
||
516 | 'פב' => ['0', '7', '7', '7', '77', '77', '77'], |
||
517 | 'פוו' => ['0', '7', '7', '7', '77', '77', '77'], |
||
518 | 'פפ' => ['0', '7', '7', '7', '77', '77', '77'], |
||
519 | 'פף' => ['0', '7', '7', '7', '77', '77', '77'], |
||
520 | 'פ' => ['0', '7', '7', '7'], |
||
521 | 'ף' => ['0', '', '7', '7'], |
||
522 | 'צג' => ['0', '44', '44', '44', '45', '45', '45'], |
||
523 | 'צז' => ['0', '44', '44', '44'], |
||
524 | 'צס' => ['0', '44', '44', '44'], |
||
525 | 'צצ' => ['0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54', '45', '45', '45'], |
||
526 | 'צץ' => ['0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54'], |
||
527 | 'צש' => ['0', '44', '44', '44', '4', '4', '4', '5', '5', '5'], |
||
528 | 'צ' => ['0', '4', '4', '4', '5', '5', '5'], |
||
529 | 'ץ' => ['0', '', '4', '4'], |
||
530 | 'קה' => ['0', '55', '55', '5'], |
||
531 | 'קס' => ['0', '5', '54', '54'], |
||
532 | 'קש' => ['0', '5', '54', '54'], |
||
533 | 'קק' => ['0', '5', '5', '5', '55', '55', '55'], |
||
534 | 'קח' => ['0', '55', '55', '55'], |
||
535 | 'קכ' => ['0', '55', '55', '55'], |
||
536 | 'קך' => ['0', '55', '55', '55'], |
||
537 | 'קג' => ['0', '55', '55', '55', '54', '54', '54'], |
||
538 | 'ק' => ['0', '5', '5', '5'], |
||
539 | 'רר' => ['0', '99', '99', '99', '9', '9', '9'], |
||
540 | 'ר' => ['0', '9', '9', '9'], |
||
541 | 'שטז' => ['0', '2', '4', '4'], |
||
542 | 'שתש' => ['0', '2', '4', '4'], |
||
543 | 'שתז' => ['0', '2', '4', '4'], |
||
544 | 'שטש' => ['0', '2', '4', '4'], |
||
545 | 'שד' => ['0', '2', '43', '43'], |
||
546 | 'שז' => ['0', '44', '44', '44'], |
||
547 | 'שס' => ['0', '44', '44', '44'], |
||
548 | 'שת' => ['0', '2', '43', '43'], |
||
549 | 'שג' => ['0', '4', '4', '4', '44', '44', '44', '4', '43', '43'], |
||
550 | 'שט' => ['0', '2', '43', '43', '44', '44', '44'], |
||
551 | 'שצ' => ['0', '44', '44', '44', '45', '45', '45'], |
||
552 | 'שץ' => ['0', '44', '', '44', '45', '', '45'], |
||
553 | 'שש' => ['0', '4', '4', '4', '44', '44', '44'], |
||
554 | 'ש' => ['0', '4', '4', '4'], |
||
555 | 'תג' => ['0', '34', '34', '34'], |
||
556 | 'תז' => ['0', '34', '34', '34'], |
||
557 | 'תש' => ['0', '4', '4', '4'], |
||
558 | 'תת' => ['0', '3', '3', '3', '4', '4', '4', '33', '33', '33', '44', '44', '44', '34', '34', '34', '43', '43', '43'], |
||
559 | 'ת' => ['0', '3', '3', '3', '4', '4', '4'], |
||
560 | // Arabic alphabet |
||
561 | 'ا' => ['1', '0', '', ''], |
||
562 | 'ب' => ['0', '7', '7', '7'], |
||
563 | 'ت' => ['0', '3', '3', '3'], |
||
564 | 'ث' => ['0', '3', '3', '3'], |
||
565 | 'ج' => ['0', '4', '4', '4'], |
||
566 | 'ح' => ['0', '5', '5', '5'], |
||
567 | 'خ' => ['0', '5', '5', '5'], |
||
568 | 'د' => ['0', '3', '3', '3'], |
||
569 | 'ذ' => ['0', '3', '3', '3'], |
||
570 | 'ر' => ['0', '9', '9', '9'], |
||
571 | 'ز' => ['0', '4', '4', '4'], |
||
572 | 'س' => ['0', '4', '4', '4'], |
||
573 | 'ش' => ['0', '4', '4', '4'], |
||
574 | 'ص' => ['0', '4', '4', '4'], |
||
575 | 'ض' => ['0', '3', '3', '3'], |
||
576 | 'ط' => ['0', '3', '3', '3'], |
||
577 | 'ظ' => ['0', '4', '4', '4'], |
||
578 | 'ع' => ['1', '0', '', ''], |
||
579 | 'غ' => ['0', '0', '', ''], |
||
580 | 'ف' => ['0', '7', '7', '7'], |
||
581 | 'ق' => ['0', '5', '5', '5'], |
||
582 | 'ك' => ['0', '5', '5', '5'], |
||
583 | 'ل' => ['0', '8', '8', '8'], |
||
584 | 'لا' => ['0', '8', '8', '8'], |
||
585 | 'م' => ['0', '6', '6', '6'], |
||
586 | 'ن' => ['0', '6', '6', '6'], |
||
587 | 'هن' => ['0', '66', '66', '66'], |
||
588 | 'ه' => ['0', '5', '5', ''], |
||
589 | 'و' => ['1', '', '', '', '7', '', ''], |
||
590 | 'ي' => ['0', '1', '', ''], |
||
591 | 'آ' => ['0', '1', '', ''], |
||
592 | 'ة' => ['0', '', '', '3'], |
||
593 | 'ی' => ['0', '1', '', ''], |
||
594 | 'ى' => ['1', '1', '', ''], |
||
595 | ]; |
||
596 | |||
/**
 * List the phonetic algorithms that this class implements.
 *
 * @return array<string,string> Translated algorithm names, keyed by algorithm code.
 */
public static function getAlgorithms(): array
{
    $algorithms = [];

    /* I18N: https://en.wikipedia.org/wiki/Soundex */
    $algorithms['std'] = I18N::translate('Russell');

    /* I18N: https://en.wikipedia.org/wiki/Daitch–Mokotoff_Soundex */
    $algorithms['dm'] = I18N::translate('Daitch-Mokotoff');

    return $algorithms;
}
||
611 | |||
/**
 * Do two colon-separated lists of soundex codes share at least one code?
 *
 * @param string $soundex1 Colon-separated soundex codes
 * @param string $soundex2 Colon-separated soundex codes
 *
 * @return bool True when both lists are non-empty and have a code in common.
 */
public static function compare(string $soundex1, string $soundex2): bool
{
    // An empty list can never match anything
    if ($soundex1 === '' || $soundex2 === '') {
        return false;
    }

    $codes1 = explode(':', $soundex1);
    $codes2 = explode(':', $soundex2);
    $shared = array_intersect($codes1, $codes2);

    return count($shared) > 0;
}
||
628 | |||
/**
 * Generate Russell soundex codes for a given text.
 *
 * The text is split on single spaces and a code is generated for each word.
 * When there is more than one word, a code is also generated for the words
 * joined together, e.g. “New York” as “Newyork”.
 *
 * Codes that do not represent a recognisable sound (an empty code or "0000")
 * are never returned - neither for the individual words nor for the joined text.
 *
 * @param string $text
 *
 * @return string Colon-separated list of unique 4-character codes (at most 51),
 *                or an empty string when nothing in the text can be coded.
 */
public static function russell(string $text): string
{
    $words = explode(' ', $text);

    // Code each word, then (for multi-word text) the words joined together.
    $candidates = $words;

    if (count($words) > 1) {
        $candidates[] = str_replace(' ', '', $text);
    }

    $soundex_array = [];

    foreach ($candidates as $candidate) {
        $soundex = soundex($candidate);

        // Only return codes from recognisable sounds. Empty words (e.g. from
        // consecutive spaces) and text without letters must not produce a code,
        // otherwise unrelated names would match each other in compare().
        if ($soundex !== '' && $soundex !== '0000') {
            $soundex_array[] = $soundex;
        }
    }

    // A varchar(255) column can only hold 51 4-character codes (plus 50 delimiters)
    $soundex_array = array_slice(array_unique($soundex_array), 0, 51);

    return implode(':', $soundex_array);
}
||
660 | |||
/**
 * Generate Daitch–Mokotoff soundex codes for a given text.
 *
 * The text is split on single spaces and codes are generated for each word.
 * When there is more than one word, codes are also generated for the words
 * joined together, e.g. “New York” as “Newyork”.
 *
 * @param string $text
 *
 * @return string Colon-separated list of unique codes (at most 36).
 */
public static function daitchMokotoff(string $text): string
{
    $names = explode(' ', $text);

    // Combine words, e.g. “New York” as “Newyork”
    if (count($names) > 1) {
        $names[] = str_replace(' ', '', $text);
    }

    // Collect the codes of every name, in order
    $codes = [];

    foreach ($names as $name) {
        foreach (self::daitchMokotoffWord($name) as $code) {
            $codes[] = $code;
        }
    }

    // A varchar(255) column can only hold 36 6-character codes (plus 35 delimiters)
    $codes = array_unique($codes);
    $codes = array_slice($codes, 0, 36);

    return implode(':', $codes);
}
||
686 | |||
/**
 * Calculate the Daitch-Mokotoff soundex for a word.
 *
 * The word is scanned from left to right. At each position the longest matching
 * key of DM_SOUNDS is looked up and its sound value for the current state (start
 * of word / before a vowel / other) is appended to every partial code. A table
 * entry with several triplets of sound values creates one branch per triplet, so
 * a single word can produce several codes.
 *
 * The scan works on bytes, not characters: chunks are cut with substr() and the
 * position advances by strlen() of the matched key, or by one byte when nothing
 * matches.
 *
 * @param string $name
 *
 * @return array<string> List of possible DM codes for the word. Each code is 6
 *                       characters long; the list may contain duplicates and is
 *                       empty when no sound was recognised.
 */
private static function daitchMokotoffWord(string $name): array
{
    // Apply special transformation rules to the input string.
    // The rules are applied one after another as plain string replacements.
    // NOTE(review): two rules in TRANSFORM_NAMES end in "$", which str_replace treats
    // as a literal character rather than an end-of-word anchor - confirm intent.
    $name = I18N::strtoupper($name);
    foreach (self::TRANSFORM_NAMES as $transformRule) {
        $name = str_replace($transformRule[0], $transformRule[1], $name);
    }

    // Initialize
    // Hebrew and Arabic names get special handling of repeated sounds (see below).
    $name_script = I18N::textScript($name);
    $noVowels = $name_script === 'Hebr' || $name_script === 'Arab';

    $lastPos = strlen($name) - 1; // byte offset of the last byte; -1 for an empty name
    $currPos = 0; // byte offset of the chunk being examined
    $state = 1; // 1: start of input string, 2: before vowel, 3: other
    $result = []; // accumulate complete 6-digit D-M codes here
    $partialResult = []; // accumulate incomplete D-M codes here
    $partialResult[] = ['!']; // initialize 1st partial result ('!' stops "duplicate sound" check)

    // Loop through the input string.
    // Stop when the string is exhausted or when no more partial results remain
    while ($partialResult !== [] && $currPos <= $lastPos) {
        // Find the DM coding table entry for the chunk at the current position
        $thisEntry = substr($name, $currPos, self::MAXCHAR); // Get maximum length chunk
        while ($thisEntry !== '') {
            if (isset(self::DM_SOUNDS[$thisEntry])) {
                break;
            }
            $thisEntry = substr($thisEntry, 0, -1); // Not in table: try a shorter chunk
        }
        if ($thisEntry === '') {
            $currPos++; // Not in table: advance pointer to next byte
            continue; // and try again
        }

        $soundTableEntry = self::DM_SOUNDS[$thisEntry];
        $workingResult = $partialResult; // partial codes built so far
        $partialResult = []; // rebuilt below, one entry per partial code and branch
        $currPos += strlen($thisEntry);

        // Not at beginning of input string.
        // After the first matched chunk, $state has been advanced past 1 by the
        // triplet loop below, so from then on it is recalculated for every chunk.
        if ($state !== 1) {
            if ($currPos <= $lastPos) {
                // Determine whether the next chunk is a vowel
                $nextEntry = substr($name, $currPos, self::MAXCHAR); // Get maximum length chunk
                while ($nextEntry !== '') {
                    if (isset(self::DM_SOUNDS[$nextEntry])) {
                        break;
                    }
                    $nextEntry = substr($nextEntry, 0, -1); // Not in table: try a shorter chunk
                }
            } else {
                $nextEntry = '';
            }
            if ($nextEntry !== '' && self::DM_SOUNDS[$nextEntry][0] !== '0') {
                // Next chunk is a vowel
                $state = 2;
            } else {
                // Next chunk is not a vowel, is not in the table, or there is no next chunk
                $state = 3;
            }
        }

        // Visit the sound value for this state in every triplet of the table entry:
        // indexes $state, $state + 3, $state + 6, ... Each triplet is one branch.
        while ($state < count($soundTableEntry)) {
            // empty means 'ignore this sound in this state'
            if ($soundTableEntry[$state] === '') {
                foreach ($workingResult as $workingEntry) {
                    $tempEntry = $workingEntry;
                    $tempEntry[count($tempEntry) - 1] .= '!'; // Prevent false 'doubles'
                    $partialResult[] = $tempEntry;
                }
            } else {
                foreach ($workingResult as $workingEntry) {
                    if ($soundTableEntry[$state] !== $workingEntry[count($workingEntry) - 1]) {
                        // Incoming sound isn't a duplicate of the previous sound
                        $workingEntry[] = $soundTableEntry[$state];
                    } elseif ($noVowels) {
                        // Incoming sound is a duplicate of the previous sound.
                        // For Hebrew and Arabic the duplicate sound is kept.
                        // NOTE(review): the original comment here described creating a pair of
                        // codes (one with a single occurrence, one with both); the code only
                        // produces the variant with both occurrences - confirm intent.
                        $workingEntry[] = $soundTableEntry[$state];
                    }

                    if (count($workingEntry) < 7) {
                        $partialResult[] = $workingEntry;
                    } else {
                        // This is the 6th code in the sequence
                        // We're looking for 7 entries because the first is '!' and doesn't count
                        $tempResult = str_replace('!', '', implode('', $workingEntry));
                        // Only return codes from recognisable sounds
                        if ($tempResult !== '') {
                            // Pad with zeros and cut to exactly 6 characters
                            $result[] = substr($tempResult . '000000', 0, 6);
                        }
                    }
                }
            }
            $state += 3; // Advance to next triplet while keeping the same basic state
        }
    }

    // Zero-fill and copy all remaining partial results
    foreach ($partialResult as $workingEntry) {
        $tempResult = str_replace('!', '', implode('', $workingEntry));
        // Only return codes from recognisable sounds
        if ($tempResult !== '') {
            $result[] = substr($tempResult . '000000', 0, 6);
        }
    }

    return $result;
}
||
805 | } |
||
806 |