Soundex   A
last analyzed

Complexity

Total Complexity 35

Size/Duplication

Total Lines 775
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 610
c 0
b 0
f 0
dl 0
loc 775
rs 9.59
wmc 35

5 Methods

Rating   Name   Duplication   Size   Complexity  
A getAlgorithms() 0 7 1
F daitchMokotoffWord() 0 110 24
A russell() 0 23 4
A compare() 0 7 3
A daitchMokotoff() 0 17 3
1
<?php
2
3
/**
4
 * webtrees: online genealogy
5
 * Copyright (C) 2025 webtrees development team
6
 * This program is free software: you can redistribute it and/or modify
7
 * it under the terms of the GNU General Public License as published by
8
 * the Free Software Foundation, either version 3 of the License, or
9
 * (at your option) any later version.
10
 * This program is distributed in the hope that it will be useful,
11
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13
 * GNU General Public License for more details.
14
 * You should have received a copy of the GNU General Public License
15
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
16
 */
17
18
declare(strict_types=1);
19
20
namespace Fisharebest\Webtrees;
21
22
use function array_slice;
23
use function count;
24
use function strlen;
25
26
/**
27
 * Phonetic matching of strings.
28
 */
29
class Soundex
30
{
31
    // Determine the Daitch–Mokotoff Soundex code for a word
32
    // Original implementation by Gerry Kroll, and analysis by Meliza Amity
33
34
    // Max. table key length (in ASCII bytes -- NOT in UTF-8 characters!)
35
    private const MAXCHAR = 7;
36
37
    /**
38
     * Name transformation arrays.
39
     * Used to transform the Name string to simplify the "sounds like" table.
40
     * This is especially useful in Hebrew.
41
     *
42
     * Each array entry defines the "from" and "to" arguments of a str_replace($from, $to, $text)
43
     * function call to achieve the desired transformations.
44
     *
45
     * Note about the use of "\x01":
46
     * This code, which can’t legitimately occur in the kind of text we're dealing with,
47
     * is used as a place-holder so that conditional string replacements can be done.
48
     */
49
    private const TRANSFORM_NAMES = [
50
        // Force Yiddish ligatures to be treated as separate letters
51
        ['װ', 'וו'],
52
        ['ײ', 'יי'],
53
        ['ױ', 'וי'],
54
        ['בו', 'בע'],
55
        ['פו', 'פע'],
56
        ['ומ', 'עמ'],
57
        ['ום', 'עם'],
58
        ['ונ', 'ענ'],
59
        ['ון', 'ען'],
60
        ['וו', 'ב'],
61
        ["\x01", ''],
62
        ['ייה$', "\x01ה"],
63
        ['ייע$', "\x01ע"],
64
        ['יי', 'ע'],
65
        ["\x01", 'יי'],
66
    ];
67
68
    /**
69
     * The DM sound coding table is organized this way:
70
     * key: a variable-length string that corresponds to the UTF-8 character sequence
71
     * represented by the table entry. Currently, that string can be up to 7
72
     * bytes long. This maximum length is defined by the class constant
73
     * self::MAXCHAR.
74
     *
75
     * value: an array as follows:
76
     * [0]:  zero if not a vowel
77
     * [1]:  sound value when this string is at the beginning of the word
78
     * [2]:  sound value when this string is followed by a vowel
79
     * [3]:  sound value for other cases
80
     * [1],[2],[3] can be repeated several times to create branches in the code
81
     * an empty sound value means "ignore in this state"
82
     */
83
    private const DM_SOUNDS = [
84
        'A'       => ['1', '0', '', ''],
85
        'À'       => ['1', '0', '', ''],
86
        'Á'       => ['1', '0', '', ''],
87
        'Â'       => ['1', '0', '', ''],
88
        'Ã'       => ['1', '0', '', ''],
89
        'Ä'       => ['1', '0', '1', '', '0', '', ''],
90
        'Å'       => ['1', '0', '', ''],
91
        'Ă'       => ['1', '0', '', ''],
92
        'Ą'       => ['1', '', '', '', '', '', '6'],
93
        'Ạ'       => ['1', '0', '', ''],
94
        'Ả'       => ['1', '0', '', ''],
95
        'Ấ'       => ['1', '0', '', ''],
96
        'Ầ'       => ['1', '0', '', ''],
97
        'Ẩ'       => ['1', '0', '', ''],
98
        'Ẫ'       => ['1', '0', '', ''],
99
        'Ậ'       => ['1', '0', '', ''],
100
        'Ắ'       => ['1', '0', '', ''],
101
        'Ằ'       => ['1', '0', '', ''],
102
        'Ẳ'       => ['1', '0', '', ''],
103
        'Ẵ'       => ['1', '0', '', ''],
104
        'Ặ'       => ['1', '0', '', ''],
105
        'AE'      => ['1', '0', '1', ''],
106
        'Æ'       => ['1', '0', '1', ''],
107
        'AI'      => ['1', '0', '1', ''],
108
        'AJ'      => ['1', '0', '1', ''],
109
        'AU'      => ['1', '0', '7', ''],
110
        'AV'      => ['1', '0', '7', '', '7', '7', '7'],
111
        'ÄU'      => ['1', '0', '1', ''],
112
        'AY'      => ['1', '0', '1', ''],
113
        'B'       => ['0', '7', '7', '7'],
114
        'C'       => ['0', '5', '5', '5', '34', '4', '4'],
115
        'Ć'       => ['0', '4', '4', '4'],
116
        'Č'       => ['0', '4', '4', '4'],
117
        'Ç'       => ['0', '4', '4', '4'],
118
        'CH'      => ['0', '5', '5', '5', '34', '4', '4'],
119
        'CHS'     => ['0', '5', '54', '54'],
120
        'CK'      => ['0', '5', '5', '5', '45', '45', '45'],
121
        'CCS'     => ['0', '4', '4', '4'],
122
        'CS'      => ['0', '4', '4', '4'],
123
        'CSZ'     => ['0', '4', '4', '4'],
124
        'CZ'      => ['0', '4', '4', '4'],
125
        'CZS'     => ['0', '4', '4', '4'],
126
        'D'       => ['0', '3', '3', '3'],
127
        'Ď'       => ['0', '3', '3', '3'],
128
        'Đ'       => ['0', '3', '3', '3'],
129
        'DRS'     => ['0', '4', '4', '4'],
130
        'DRZ'     => ['0', '4', '4', '4'],
131
        'DS'      => ['0', '4', '4', '4'],
132
        'DSH'     => ['0', '4', '4', '4'],
133
        'DSZ'     => ['0', '4', '4', '4'],
134
        'DT'      => ['0', '3', '3', '3'],
135
        'DDZ'     => ['0', '4', '4', '4'],
136
        'DDZS'    => ['0', '4', '4', '4'],
137
        'DZ'      => ['0', '4', '4', '4'],
138
        'DŹ'      => ['0', '4', '4', '4'],
139
        'DŻ'      => ['0', '4', '4', '4'],
140
        'DZH'     => ['0', '4', '4', '4'],
141
        'DZS'     => ['0', '4', '4', '4'],
142
        'E'       => ['1', '0', '', ''],
143
        'È'       => ['1', '0', '', ''],
144
        'É'       => ['1', '0', '', ''],
145
        'Ê'       => ['1', '0', '', ''],
146
        'Ë'       => ['1', '0', '', ''],
147
        'Ĕ'       => ['1', '0', '', ''],
148
        'Ė'       => ['1', '0', '', ''],
149
        'Ę'       => ['1', '', '', '6', '', '', ''],
150
        'Ẹ'       => ['1', '0', '', ''],
151
        'Ẻ'       => ['1', '0', '', ''],
152
        'Ẽ'       => ['1', '0', '', ''],
153
        'Ế'       => ['1', '0', '', ''],
154
        'Ề'       => ['1', '0', '', ''],
155
        'Ể'       => ['1', '0', '', ''],
156
        'Ễ'       => ['1', '0', '', ''],
157
        'Ệ'       => ['1', '0', '', ''],
158
        'EAU'     => ['1', '0', '', ''],
159
        'EI'      => ['1', '0', '1', ''],
160
        'EJ'      => ['1', '0', '1', ''],
161
        'EU'      => ['1', '1', '1', ''],
162
        'EY'      => ['1', '0', '1', ''],
163
        'F'       => ['0', '7', '7', '7'],
164
        'FB'      => ['0', '7', '7', '7'],
165
        'G'       => ['0', '5', '5', '5', '34', '4', '4'],
166
        'Ğ'       => ['0', '', '', ''],
167
        'GGY'     => ['0', '5', '5', '5'],
168
        'GY'      => ['0', '5', '5', '5'],
169
        'H'       => ['0', '5', '5', '', '5', '5', '5'],
170
        'I'       => ['1', '0', '', ''],
171
        'Ì'       => ['1', '0', '', ''],
172
        'Í'       => ['1', '0', '', ''],
173
        'Î'       => ['1', '0', '', ''],
174
        'Ï'       => ['1', '0', '', ''],
175
        'Ĩ'       => ['1', '0', '', ''],
176
        'Į'       => ['1', '0', '', ''],
177
        'İ'       => ['1', '0', '', ''],
178
        'Ỉ'       => ['1', '0', '', ''],
179
        'Ị'       => ['1', '0', '', ''],
180
        'IA'      => ['1', '1', '', ''],
181
        'IE'      => ['1', '1', '', ''],
182
        'IO'      => ['1', '1', '', ''],
183
        'IU'      => ['1', '1', '', ''],
184
        'J'       => ['0', '1', '', '', '4', '4', '4', '5', '5', ''],
185
        'K'       => ['0', '5', '5', '5'],
186
        'KH'      => ['0', '5', '5', '5'],
187
        'KS'      => ['0', '5', '54', '54'],
188
        'L'       => ['0', '8', '8', '8'],
189
        'Ľ'       => ['0', '8', '8', '8'],
190
        'Ĺ'       => ['0', '8', '8', '8'],
191
        'Ł'       => ['0', '7', '7', '7', '8', '8', '8'],
192
        'LL'      => ['0', '8', '8', '8', '58', '8', '8', '1', '8', '8'],
193
        'LLY'     => ['0', '8', '8', '8', '1', '8', '8'],
194
        'LY'      => ['0', '8', '8', '8', '1', '8', '8'],
195
        'M'       => ['0', '6', '6', '6'],
196
        'MĔ'      => ['0', '66', '66', '66'],
197
        'MN'      => ['0', '66', '66', '66'],
198
        'N'       => ['0', '6', '6', '6'],
199
        'Ń'       => ['0', '6', '6', '6'],
200
        'Ň'       => ['0', '6', '6', '6'],
201
        'Ñ'       => ['0', '6', '6', '6'],
202
        'NM'      => ['0', '66', '66', '66'],
203
        'O'       => ['1', '0', '', ''],
204
        'Ò'       => ['1', '0', '', ''],
205
        'Ó'       => ['1', '0', '', ''],
206
        'Ô'       => ['1', '0', '', ''],
207
        'Õ'       => ['1', '0', '', ''],
208
        'Ö'       => ['1', '0', '', ''],
209
        'Ø'       => ['1', '0', '', ''],
210
        'Ő'       => ['1', '0', '', ''],
211
        'Œ'       => ['1', '0', '', ''],
212
        'Ơ'       => ['1', '0', '', ''],
213
        'Ọ'       => ['1', '0', '', ''],
214
        'Ỏ'       => ['1', '0', '', ''],
215
        'Ố'       => ['1', '0', '', ''],
216
        'Ồ'       => ['1', '0', '', ''],
217
        'Ổ'       => ['1', '0', '', ''],
218
        'Ỗ'       => ['1', '0', '', ''],
219
        'Ộ'       => ['1', '0', '', ''],
220
        'Ớ'       => ['1', '0', '', ''],
221
        'Ờ'       => ['1', '0', '', ''],
222
        'Ở'       => ['1', '0', '', ''],
223
        'Ỡ'       => ['1', '0', '', ''],
224
        'Ợ'       => ['1', '0', '', ''],
225
        'OE'      => ['1', '0', '', ''],
226
        'OI'      => ['1', '0', '1', ''],
227
        'OJ'      => ['1', '0', '1', ''],
228
        'OU'      => ['1', '0', '', ''],
229
        'OY'      => ['1', '0', '1', ''],
230
        'P'       => ['0', '7', '7', '7'],
231
        'PF'      => ['0', '7', '7', '7'],
232
        'PH'      => ['0', '7', '7', '7'],
233
        'Q'       => ['0', '5', '5', '5'],
234
        'R'       => ['0', '9', '9', '9'],
235
        'Ř'       => ['0', '4', '4', '4'],
236
        'RS'      => ['0', '4', '4', '4', '94', '94', '94'],
237
        'RZ'      => ['0', '4', '4', '4', '94', '94', '94'],
238
        'S'       => ['0', '4', '4', '4'],
239
        'Ś'       => ['0', '4', '4', '4'],
240
        'Š'       => ['0', '4', '4', '4'],
241
        'Ş'       => ['0', '4', '4', '4'],
242
        'SC'      => ['0', '2', '4', '4'],
243
        'ŠČ'      => ['0', '2', '4', '4'],
244
        'SCH'     => ['0', '4', '4', '4'],
245
        'SCHD'    => ['0', '2', '43', '43'],
246
        'SCHT'    => ['0', '2', '43', '43'],
247
        'SCHTCH'  => ['0', '2', '4', '4'],
248
        'SCHTSCH' => ['0', '2', '4', '4'],
249
        'SCHTSH'  => ['0', '2', '4', '4'],
250
        'SD'      => ['0', '2', '43', '43'],
251
        'SH'      => ['0', '4', '4', '4'],
252
        'SHCH'    => ['0', '2', '4', '4'],
253
        'SHD'     => ['0', '2', '43', '43'],
254
        'SHT'     => ['0', '2', '43', '43'],
255
        'SHTCH'   => ['0', '2', '4', '4'],
256
        'SHTSH'   => ['0', '2', '4', '4'],
257
        'ß'       => ['0', '', '4', '4'],
258
        'ST'      => ['0', '2', '43', '43'],
259
        'STCH'    => ['0', '2', '4', '4'],
260
        'STRS'    => ['0', '2', '4', '4'],
261
        'STRZ'    => ['0', '2', '4', '4'],
262
        'STSCH'   => ['0', '2', '4', '4'],
263
        'STSH'    => ['0', '2', '4', '4'],
264
        'SSZ'     => ['0', '4', '4', '4'],
265
        'SZ'      => ['0', '4', '4', '4'],
266
        'SZCS'    => ['0', '2', '4', '4'],
267
        'SZCZ'    => ['0', '2', '4', '4'],
268
        'SZD'     => ['0', '2', '43', '43'],
269
        'SZT'     => ['0', '2', '43', '43'],
270
        'T'       => ['0', '3', '3', '3'],
271
        'Ť'       => ['0', '3', '3', '3'],
272
        'Ţ'       => ['0', '3', '3', '3', '4', '4', '4'],
273
        'TC'      => ['0', '4', '4', '4'],
274
        'TCH'     => ['0', '4', '4', '4'],
275
        'TH'      => ['0', '3', '3', '3'],
276
        'TRS'     => ['0', '4', '4', '4'],
277
        'TRZ'     => ['0', '4', '4', '4'],
278
        'TS'      => ['0', '4', '4', '4'],
279
        'TSCH'    => ['0', '4', '4', '4'],
280
        'TSH'     => ['0', '4', '4', '4'],
281
        'TSZ'     => ['0', '4', '4', '4'],
282
        'TTCH'    => ['0', '4', '4', '4'],
283
        'TTS'     => ['0', '4', '4', '4'],
284
        'TTSCH'   => ['0', '4', '4', '4'],
285
        'TTSZ'    => ['0', '4', '4', '4'],
286
        'TTZ'     => ['0', '4', '4', '4'],
287
        'TZ'      => ['0', '4', '4', '4'],
288
        'TZS'     => ['0', '4', '4', '4'],
289
        'U'       => ['1', '0', '', ''],
290
        'Ù'       => ['1', '0', '', ''],
291
        'Ú'       => ['1', '0', '', ''],
292
        'Û'       => ['1', '0', '', ''],
293
        'Ü'       => ['1', '0', '', ''],
294
        'Ũ'       => ['1', '0', '', ''],
295
        'Ū'       => ['1', '0', '', ''],
296
        'Ů'       => ['1', '0', '', ''],
297
        'Ű'       => ['1', '0', '', ''],
298
        'Ų'       => ['1', '0', '', ''],
299
        'Ư'       => ['1', '0', '', ''],
300
        'Ụ'       => ['1', '0', '', ''],
301
        'Ủ'       => ['1', '0', '', ''],
302
        'Ứ'       => ['1', '0', '', ''],
303
        'Ừ'       => ['1', '0', '', ''],
304
        'Ử'       => ['1', '0', '', ''],
305
        'Ữ'       => ['1', '0', '', ''],
306
        'Ự'       => ['1', '0', '', ''],
307
        'UE'      => ['1', '0', '', ''],
308
        'UI'      => ['1', '0', '1', ''],
309
        'UJ'      => ['1', '0', '1', ''],
310
        'UY'      => ['1', '0', '1', ''],
311
        'UW'      => ['1', '0', '1', '', '0', '7', '7'],
312
        'V'       => ['0', '7', '7', '7'],
313
        'W'       => ['0', '7', '7', '7'],
314
        'X'       => ['0', '5', '54', '54'],
315
        'Y'       => ['1', '1', '', ''],
316
        'Ý'       => ['1', '1', '', ''],
317
        'Ỳ'       => ['1', '1', '', ''],
318
        'Ỵ'       => ['1', '1', '', ''],
319
        'Ỷ'       => ['1', '1', '', ''],
320
        'Ỹ'       => ['1', '1', '', ''],
321
        'Z'       => ['0', '4', '4', '4'],
322
        'Ź'       => ['0', '4', '4', '4'],
323
        'Ż'       => ['0', '4', '4', '4'],
324
        'Ž'       => ['0', '4', '4', '4'],
325
        'ZD'      => ['0', '2', '43', '43'],
326
        'ZDZ'     => ['0', '2', '4', '4'],
327
        'ZDZH'    => ['0', '2', '4', '4'],
328
        'ZH'      => ['0', '4', '4', '4'],
329
        'ZHD'     => ['0', '2', '43', '43'],
330
        'ZHDZH'   => ['0', '2', '4', '4'],
331
        'ZS'      => ['0', '4', '4', '4'],
332
        'ZSCH'    => ['0', '4', '4', '4'],
333
        'ZSH'     => ['0', '4', '4', '4'],
334
        'ZZS'     => ['0', '4', '4', '4'],
335
        // Cyrillic alphabet
336
        'А'       => ['1', '0', '', ''],
337
        'Б'       => ['0', '7', '7', '7'],
338
        'В'       => ['0', '7', '7', '7'],
339
        'Г'       => ['0', '5', '5', '5'],
340
        'Д'       => ['0', '3', '3', '3'],
341
        'ДЗ'      => ['0', '4', '4', '4'],
342
        'Е'       => ['1', '0', '', ''],
343
        'Ё'       => ['1', '0', '', ''],
344
        'Ж'       => ['0', '4', '4', '4'],
345
        'З'       => ['0', '4', '4', '4'],
346
        'И'       => ['1', '0', '', ''],
347
        'Й'       => ['1', '1', '', '', '4', '4', '4'],
348
        'К'       => ['0', '5', '5', '5'],
349
        'Л'       => ['0', '8', '8', '8'],
350
        'М'       => ['0', '6', '6', '6'],
351
        'Н'       => ['0', '6', '6', '6'],
352
        'О'       => ['1', '0', '', ''],
353
        'П'       => ['0', '7', '7', '7'],
354
        'Р'       => ['0', '9', '9', '9'],
355
        'РЖ'      => ['0', '4', '4', '4'],
356
        'С'       => ['0', '4', '4', '4'],
357
        'Т'       => ['0', '3', '3', '3'],
358
        'У'       => ['1', '0', '', ''],
359
        'Ф'       => ['0', '7', '7', '7'],
360
        'Х'       => ['0', '5', '5', '5'],
361
        'Ц'       => ['0', '4', '4', '4'],
362
        'Ч'       => ['0', '4', '4', '4'],
363
        'Ш'       => ['0', '4', '4', '4'],
364
        'Щ'       => ['0', '2', '4', '4'],
365
        'Ъ'       => ['0', '', '', ''],
366
        'Ы'       => ['0', '1', '', ''],
367
        'Ь'       => ['0', '', '', ''],
368
        'Э'       => ['1', '0', '', ''],
369
        'Ю'       => ['0', '1', '', ''],
370
        'Я'       => ['0', '1', '', ''],
371
        // Greek alphabet
372
        'Α'       => ['1', '0', '', ''],
373
        'Ά'       => ['1', '0', '', ''],
374
        'ΑΙ'      => ['1', '0', '1', ''],
375
        'ΑΥ'      => ['1', '0', '1', ''],
376
        'Β'       => ['0', '7', '7', '7'],
377
        'Γ'       => ['0', '5', '5', '5'],
378
        'Δ'       => ['0', '3', '3', '3'],
379
        'Ε'       => ['1', '0', '', ''],
380
        'Έ'       => ['1', '0', '', ''],
381
        'ΕΙ'      => ['1', '0', '1', ''],
382
        'ΕΥ'      => ['1', '1', '1', ''],
383
        'Ζ'       => ['0', '4', '4', '4'],
384
        'Η'       => ['1', '0', '', ''],
385
        'Ή'       => ['1', '0', '', ''],
386
        'Θ'       => ['0', '3', '3', '3'],
387
        'Ι'       => ['1', '0', '', ''],
388
        'Ί'       => ['1', '0', '', ''],
389
        'Ϊ'       => ['1', '0', '', ''],
390
        'ΐ'       => ['1', '0', '', ''],
391
        'Κ'       => ['0', '5', '5', '5'],
392
        'Λ'       => ['0', '8', '8', '8'],
393
        'Μ'       => ['0', '6', '6', '6'],
394
        'ΜΠ'      => ['0', '7', '7', '7'],
395
        'Ν'       => ['0', '6', '6', '6'],
396
        'ΝΤ'      => ['0', '3', '3', '3'],
397
        'Ξ'       => ['0', '5', '54', '54'],
398
        'Ο'       => ['1', '0', '', ''],
399
        'Ό'       => ['1', '0', '', ''],
400
        'ΟΙ'      => ['1', '0', '1', ''],
401
        'ΟΥ'      => ['1', '0', '1', ''],
402
        'Π'       => ['0', '7', '7', '7'],
403
        'Ρ'       => ['0', '9', '9', '9'],
404
        'Σ'       => ['0', '4', '4', '4'],
405
        'ς'       => ['0', '', '', '4'],
406
        'Τ'       => ['0', '3', '3', '3'],
407
        'ΤΖ'      => ['0', '4', '4', '4'],
408
        'ΤΣ'      => ['0', '4', '4', '4'],
409
        'Υ'       => ['1', '1', '', ''],
410
        'Ύ'       => ['1', '1', '', ''],
411
        'Ϋ'       => ['1', '1', '', ''],
412
        'ΰ'       => ['1', '1', '', ''],
413
        'ΥΚ'      => ['1', '5', '5', '5'],
414
        'ΥΥ'      => ['1', '65', '65', '65'],
415
        'Φ'       => ['0', '7', '7', '7'],
416
        'Χ'       => ['0', '5', '5', '5'],
417
        'Ψ'       => ['0', '7', '7', '7'],
418
        'Ω'       => ['1', '0', '', ''],
419
        'Ώ'       => ['1', '0', '', ''],
420
        // Hebrew alphabet
421
        'א'       => ['1', '0', '', ''],
422
        'או'      => ['1', '0', '7', ''],
423
        'אג'      => ['1', '4', '4', '4', '5', '5', '5', '34', '34', '34'],
424
        'בב'      => ['0', '7', '7', '7', '77', '77', '77'],
425
        'ב'       => ['0', '7', '7', '7'],
426
        'גג'      => ['0', '4', '4', '4', '5', '5', '5', '45', '45', '45', '55', '55', '55', '54', '54', '54'],
427
        'גד'      => ['0', '43', '43', '43', '53', '53', '53'],
428
        'גה'      => ['0', '45', '45', '45', '55', '55', '55'],
429
        'גז'      => ['0', '44', '44', '44', '45', '45', '45'],
430
        'גח'      => ['0', '45', '45', '45', '55', '55', '55'],
431
        'גכ'      => ['0', '45', '45', '45', '55', '55', '55'],
432
        'גך'      => ['0', '45', '45', '45', '55', '55', '55'],
433
        'גצ'      => ['0', '44', '44', '44', '45', '45', '45'],
434
        'גץ'      => ['0', '44', '44', '44', '45', '45', '45'],
435
        'גק'      => ['0', '45', '45', '45', '54', '54', '54'],
436
        'גש'      => ['0', '44', '44', '44', '54', '54', '54'],
437
        'גת'      => ['0', '43', '43', '43', '53', '53', '53'],
438
        'ג'       => ['0', '4', '4', '4', '5', '5', '5'],
439
        'דז'      => ['0', '4', '4', '4'],
440
        'דד'      => ['0', '3', '3', '3', '33', '33', '33'],
441
        'דט'      => ['0', '33', '33', '33'],
442
        'דש'      => ['0', '4', '4', '4'],
443
        'דצ'      => ['0', '4', '4', '4'],
444
        'דץ'      => ['0', '4', '4', '4'],
445
        'ד'       => ['0', '3', '3', '3'],
446
        'הג'      => ['0', '54', '54', '54', '55', '55', '55'],
447
        'הכ'      => ['0', '55', '55', '55'],
448
        'הח'      => ['0', '55', '55', '55'],
449
        'הק'      => ['0', '55', '55', '55', '5', '5', '5'],
450
        'הה'      => ['0', '5', '5', '', '55', '55', ''],
451
        'ה'       => ['0', '5', '5', ''],
452
        'וי'      => ['1', '', '', '', '7', '7', '7'],
453
        'ו'       => ['1', '7', '7', '7', '7', '', ''],
454
        'וו'      => ['1', '7', '7', '7', '7', '', ''],
455
        'וופ'     => ['1', '7', '7', '7', '77', '77', '77'],
456
        'זש'      => ['0', '4', '4', '4', '44', '44', '44'],
457
        'זדז'     => ['0', '2', '4', '4'],
458
        'ז'       => ['0', '4', '4', '4'],
459
        'זג'      => ['0', '44', '44', '44', '45', '45', '45'],
460
        'זז'      => ['0', '4', '4', '4', '44', '44', '44'],
461
        'זס'      => ['0', '44', '44', '44'],
462
        'זצ'      => ['0', '44', '44', '44'],
463
        'זץ'      => ['0', '44', '44', '44'],
464
        'חג'      => ['0', '54', '54', '54', '53', '53', '53'],
465
        'חח'      => ['0', '5', '5', '5', '55', '55', '55'],
466
        'חק'      => ['0', '55', '55', '55', '5', '5', '5'],
467
        'חכ'      => ['0', '45', '45', '45', '55', '55', '55'],
468
        'חס'      => ['0', '5', '54', '54'],
469
        'חש'      => ['0', '5', '54', '54'],
470
        'ח'       => ['0', '5', '5', '5'],
471
        'טש'      => ['0', '4', '4', '4'],
472
        'טד'      => ['0', '33', '33', '33'],
473
        'טי'      => ['0', '3', '3', '3', '4', '4', '4', '3', '3', '34'],
474
        'טת'      => ['0', '33', '33', '33'],
475
        'טט'      => ['0', '3', '3', '3', '33', '33', '33'],
476
        'ט'       => ['0', '3', '3', '3'],
477
        'י'       => ['1', '1', '', ''],
478
        'יא'      => ['1', '1', '', '', '1', '1', '1'],
479
        'כג'      => ['0', '55', '55', '55', '54', '54', '54'],
480
        'כש'      => ['0', '5', '54', '54'],
481
        'כס'      => ['0', '5', '54', '54'],
482
        'ככ'      => ['0', '5', '5', '5', '55', '55', '55'],
483
        'כך'      => ['0', '5', '5', '5', '55', '55', '55'],
484
        'כ'       => ['0', '5', '5', '5'],
485
        'כח'      => ['0', '55', '55', '55', '5', '5', '5'],
486
        'ך'       => ['0', '', '5', '5'],
487
        'ל'       => ['0', '8', '8', '8'],
488
        'לל'      => ['0', '88', '88', '88', '8', '8', '8'],
489
        'מנ'      => ['0', '66', '66', '66'],
490
        'מן'      => ['0', '66', '66', '66'],
491
        'ממ'      => ['0', '6', '6', '6', '66', '66', '66'],
492
        'מם'      => ['0', '6', '6', '6', '66', '66', '66'],
493
        'מ'       => ['0', '6', '6', '6'],
494
        'ם'       => ['0', '', '6', '6'],
495
        'נמ'      => ['0', '66', '66', '66'],
496
        'נם'      => ['0', '66', '66', '66'],
497
        'ננ'      => ['0', '6', '6', '6', '66', '66', '66'],
498
        'נן'      => ['0', '6', '6', '6', '66', '66', '66'],
499
        'נ'       => ['0', '6', '6', '6'],
500
        'ן'       => ['0', '', '6', '6'],
501
        'סתש'     => ['0', '2', '4', '4'],
502
        'סתז'     => ['0', '2', '4', '4'],
503
        'סטז'     => ['0', '2', '4', '4'],
504
        'סטש'     => ['0', '2', '4', '4'],
505
        'סצד'     => ['0', '2', '4', '4'],
506
        'סט'      => ['0', '2', '4', '4', '43', '43', '43'],
507
        'סת'      => ['0', '2', '4', '4', '43', '43', '43'],
508
        'סג'      => ['0', '44', '44', '44', '4', '4', '4'],
509
        'סס'      => ['0', '4', '4', '4', '44', '44', '44'],
510
        'סצ'      => ['0', '44', '44', '44'],
511
        'סץ'      => ['0', '44', '44', '44'],
512
        'סז'      => ['0', '44', '44', '44'],
513
        'סש'      => ['0', '44', '44', '44'],
514
        'ס'       => ['0', '4', '4', '4'],
515
        'ע'       => ['1', '0', '', ''],
516
        'פב'      => ['0', '7', '7', '7', '77', '77', '77'],
517
        'פוו'     => ['0', '7', '7', '7', '77', '77', '77'],
518
        'פפ'      => ['0', '7', '7', '7', '77', '77', '77'],
519
        'פף'      => ['0', '7', '7', '7', '77', '77', '77'],
520
        'פ'       => ['0', '7', '7', '7'],
521
        'ף'       => ['0', '', '7', '7'],
522
        'צג'      => ['0', '44', '44', '44', '45', '45', '45'],
523
        'צז'      => ['0', '44', '44', '44'],
524
        'צס'      => ['0', '44', '44', '44'],
525
        'צצ'      => ['0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54', '45', '45', '45'],
526
        'צץ'      => ['0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54'],
527
        'צש'      => ['0', '44', '44', '44', '4', '4', '4', '5', '5', '5'],
528
        'צ'       => ['0', '4', '4', '4', '5', '5', '5'],
529
        'ץ'       => ['0', '', '4', '4'],
530
        'קה'      => ['0', '55', '55', '5'],
531
        'קס'      => ['0', '5', '54', '54'],
532
        'קש'      => ['0', '5', '54', '54'],
533
        'קק'      => ['0', '5', '5', '5', '55', '55', '55'],
534
        'קח'      => ['0', '55', '55', '55'],
535
        'קכ'      => ['0', '55', '55', '55'],
536
        'קך'      => ['0', '55', '55', '55'],
537
        'קג'      => ['0', '55', '55', '55', '54', '54', '54'],
538
        'ק'       => ['0', '5', '5', '5'],
539
        'רר'      => ['0', '99', '99', '99', '9', '9', '9'],
540
        'ר'       => ['0', '9', '9', '9'],
541
        'שטז'     => ['0', '2', '4', '4'],
542
        'שתש'     => ['0', '2', '4', '4'],
543
        'שתז'     => ['0', '2', '4', '4'],
544
        'שטש'     => ['0', '2', '4', '4'],
545
        'שד'      => ['0', '2', '43', '43'],
546
        'שז'      => ['0', '44', '44', '44'],
547
        'שס'      => ['0', '44', '44', '44'],
548
        'שת'      => ['0', '2', '43', '43'],
549
        'שג'      => ['0', '4', '4', '4', '44', '44', '44', '4', '43', '43'],
550
        'שט'      => ['0', '2', '43', '43', '44', '44', '44'],
551
        'שצ'      => ['0', '44', '44', '44', '45', '45', '45'],
552
        'שץ'      => ['0', '44', '', '44', '45', '', '45'],
553
        'שש'      => ['0', '4', '4', '4', '44', '44', '44'],
554
        'ש'       => ['0', '4', '4', '4'],
555
        'תג'      => ['0', '34', '34', '34'],
556
        'תז'      => ['0', '34', '34', '34'],
557
        'תש'      => ['0', '4', '4', '4'],
558
        'תת'      => ['0', '3', '3', '3', '4', '4', '4', '33', '33', '33', '44', '44', '44', '34', '34', '34', '43', '43', '43'],
559
        'ת'       => ['0', '3', '3', '3', '4', '4', '4'],
560
        // Arabic alphabet
561
        'ا'       => ['1', '0', '', ''],
562
        'ب'       => ['0', '7', '7', '7'],
563
        'ت'       => ['0', '3', '3', '3'],
564
        'ث'       => ['0', '3', '3', '3'],
565
        'ج'       => ['0', '4', '4', '4'],
566
        'ح'       => ['0', '5', '5', '5'],
567
        'خ'       => ['0', '5', '5', '5'],
568
        'د'       => ['0', '3', '3', '3'],
569
        'ذ'       => ['0', '3', '3', '3'],
570
        'ر'       => ['0', '9', '9', '9'],
571
        'ز'       => ['0', '4', '4', '4'],
572
        'س'       => ['0', '4', '4', '4'],
573
        'ش'       => ['0', '4', '4', '4'],
574
        'ص'       => ['0', '4', '4', '4'],
575
        'ض'       => ['0', '3', '3', '3'],
576
        'ط'       => ['0', '3', '3', '3'],
577
        'ظ'       => ['0', '4', '4', '4'],
578
        'ع'       => ['1', '0', '', ''],
579
        'غ'       => ['0', '0', '', ''],
580
        'ف'       => ['0', '7', '7', '7'],
581
        'ق'       => ['0', '5', '5', '5'],
582
        'ك'       => ['0', '5', '5', '5'],
583
        'ل'       => ['0', '8', '8', '8'],
584
        'لا'      => ['0', '8', '8', '8'],
585
        'م'       => ['0', '6', '6', '6'],
586
        'ن'       => ['0', '6', '6', '6'],
587
        'هن'      => ['0', '66', '66', '66'],
588
        'ه'       => ['0', '5', '5', ''],
589
        'و'       => ['1', '', '', '', '7', '', ''],
590
        'ي'       => ['0', '1', '', ''],
591
        'آ'       => ['0', '1', '', ''],
592
        'ة'       => ['0', '', '', '3'],
593
        'ی'       => ['0', '1', '', ''],
594
        'ى'       => ['1', '1', '', ''],
595
    ];
596
597
    /**
598
     * Which algorithms are supported.
599
     *
600
     * @return array<string>
601
     */
602
    public static function getAlgorithms(): array
    {
        // Map each algorithm identifier to the name returned by I18N::translate().
        $algorithms = [];

        /* I18N: https://en.wikipedia.org/wiki/Soundex */
        $algorithms['std'] = I18N::translate('Russell');

        /* I18N: https://en.wikipedia.org/wiki/Daitch–Mokotoff_Soundex */
        $algorithms['dm'] = I18N::translate('Daitch-Mokotoff');

        return $algorithms;
    }
611
612
    /**
613
     * Is there a match between two soundex codes?
614
     *
615
     * @param string $soundex1
616
     * @param string $soundex2
617
     *
618
     * @return bool
619
     */
620
    public static function compare(string $soundex1, string $soundex2): bool
    {
        // An empty code list can never match anything.
        if ($soundex1 === '' || $soundex2 === '') {
            return false;
        }

        // Each argument is a ':'-delimited list of codes; one shared code is enough.
        $first_codes  = explode(':', $soundex1);
        $second_codes = explode(':', $soundex2);
        $shared_codes = array_intersect($first_codes, $second_codes);

        return $shared_codes !== [];
    }
628
629
    /**
630
     * Generate Russell soundex codes for a given text.
631
     *
632
     * @param string $text
633
     *
634
     * @return string
635
     */
636
    public static function russell(string $text): string
    {
        // Code every space-separated word of the text on its own.
        $candidates = explode(' ', $text);

        // Combine words, e.g. “New York” as “Newyork”
        if (count($candidates) > 1) {
            $candidates[] = str_replace(' ', '', $text);
        }

        $soundex_array = [];

        foreach ($candidates as $candidate) {
            $soundex = soundex($candidate);

            // Only return codes from recognisable sounds. This applies to the
            // combined word as well as to the individual words:
            // - '0000' is what soundex() produces for text containing no letters;
            // - ''     is skipped so that empty "words" (consecutive, leading or
            //          trailing spaces) can never add an empty code to the list,
            //          which compare() would otherwise treat as a match.
            if ($soundex !== '' && $soundex !== '0000') {
                $soundex_array[] = $soundex;
            }
        }

        // A varchar(255) column can only hold 51 4-character codes (plus 50 delimiters)
        $soundex_array = array_slice(array_unique($soundex_array), 0, 51);

        // Codes are returned as a ':'-delimited list; an empty string means "no codes".
        return implode(':', $soundex_array);
    }
660
661
    /**
662
     * Generate Daitch–Mokotoff soundex codes for a given text.
663
     *
664
     * @param string $text
665
     *
666
     * @return string
667
     */
668
    public static function daitchMokotoff(string $text): string
    {
        $pieces = explode(' ', $text);

        // Combine words, e.g. “New York” as “Newyork”
        if (count($pieces) > 1) {
            $pieces[] = str_replace(' ', '', $text);
        }

        // Code each piece separately, keeping the per-piece results in input order.
        $codes_per_piece = [];
        foreach ($pieces as $piece) {
            $codes_per_piece[] = self::daitchMokotoffWord($piece);
        }

        // Flatten into one list and drop repeated codes (first occurrence wins).
        $codes = array_unique(array_merge([], ...$codes_per_piece));

        // A varchar(255) column can only hold 36 6-character codes (plus 35 delimiters)
        return implode(':', array_slice($codes, 0, 36));
    }
686
687
    /**
     * Calculate the Daitch-Mokotoff soundex for a word.
     *
     * The word is upper-cased, rewritten using the TRANSFORM_NAMES rules and then
     * scanned from left to right. At each position the longest chunk found in
     * DM_SOUNDS is consumed, and every alternative sound that the table offers for
     * the current state extends each partial code built so far. A code is complete
     * once it holds six sounds; codes that are still incomplete when the word ends
     * are padded with zeros.
     *
     * @param string $name
     *
     * @return array<string> List of possible DM codes for the word.
     */
    private static function daitchMokotoffWord(string $name): array
    {
        // Apply special transformation rules to the input string
        $name = I18N::strtoupper($name);
        foreach (self::TRANSFORM_NAMES as $transformRule) {
            $name = str_replace($transformRule[0], $transformRule[1], $name);
        }

        // Initialize
        $name_script = I18N::textScript($name);
        $noVowels    = $name_script === 'Hebr' || $name_script === 'Arab';

        $lastPos       = strlen($name) - 1; // positions are byte offsets, as are the table keys
        $currPos       = 0;
        $state         = 1; // 1: start of input string, 2: before vowel, 3: other
        $result        = []; // accumulate complete 6-digit D-M codes here
        $partialResult = [['!']]; // incomplete D-M codes; the initial '!' stops the "duplicate sound" check

        // Loop through the input string.
        // Stop when the string is exhausted or when no more partial results remain
        while ($partialResult !== [] && $currPos <= $lastPos) {
            // Find the DM coding table entry for the chunk at the current position
            $thisEntry = self::longestSoundChunk($name, $currPos);

            if ($thisEntry === '') {
                $currPos++; // Not in table: advance pointer to next byte
                continue; // and try again
            }

            $soundTableEntry = self::DM_SOUNDS[$thisEntry];
            $workingResult   = $partialResult;
            $partialResult   = [];
            $currPos += strlen($thisEntry);

            // Not at beginning of input string
            if ($state !== 1) {
                // Look ahead at the chunk that follows the one just consumed
                $nextEntry = $currPos <= $lastPos ? self::longestSoundChunk($name, $currPos) : '';

                if ($nextEntry !== '' && self::DM_SOUNDS[$nextEntry][0] !== '0') {
                    // The next chunk's first table field is not '0': "before vowel"
                    $state = 2;
                } else {
                    // End of input, unknown chunk, or first table field is '0': "other"
                    $state = 3;
                }
            }

            while ($state < count($soundTableEntry)) {
                $sound = $soundTableEntry[$state];

                if ($sound === '') {
                    // empty means 'ignore this sound in this state'
                    foreach ($workingResult as $workingEntry) {
                        $workingEntry[count($workingEntry) - 1] .= '!'; // Prevent false 'doubles'
                        $partialResult[] = $workingEntry;
                    }
                } else {
                    foreach ($workingResult as $workingEntry) {
                        $isRepeat = $sound === $workingEntry[count($workingEntry) - 1];

                        // A sound that repeats the previous one is dropped, except for
                        // Hebrew and Arabic, where it is kept and so appears twice.
                        // NOTE(review): only the doubled variant is produced for these
                        // scripts; no second code with a single occurrence is emitted.
                        // Confirm that this is the intended behaviour.
                        if (!$isRepeat || $noVowels) {
                            $workingEntry[] = $sound;
                        }

                        if (count($workingEntry) < 7) {
                            $partialResult[] = $workingEntry;
                        } else {
                            // This is the 6th code in the sequence
                            // We're looking for 7 entries because the first is '!' and doesn't count
                            $code = self::paddedSoundCode($workingEntry);

                            // Only return codes from recognisable sounds
                            if ($code !== '') {
                                $result[] = $code;
                            }
                        }
                    }
                }

                $state += 3; // Advance to next triplet while keeping the same basic state
            }
        }

        // Zero-fill and copy all remaining partial results
        foreach ($partialResult as $workingEntry) {
            $code = self::paddedSoundCode($workingEntry);

            // Only return codes from recognisable sounds
            if ($code !== '') {
                $result[] = $code;
            }
        }

        return $result;
    }

    /**
     * Find the longest chunk of a name, starting at a byte offset, that is a key of DM_SOUNDS.
     *
     * @param string $name
     * @param int    $offset Byte offset into $name.
     *
     * @return string The matching chunk, or an empty string if no chunk matches.
     */
    private static function longestSoundChunk(string $name, int $offset): string
    {
        $chunk = substr($name, $offset, self::MAXCHAR); // Get maximum length chunk

        while ($chunk !== '' && !isset(self::DM_SOUNDS[$chunk])) {
            $chunk = substr($chunk, 0, -1); // Not in table: try a shorter chunk
        }

        return $chunk;
    }

    /**
     * Convert a list of sounds to a 6-character code: remove the '!' markers and pad with zeros.
     *
     * @param array<int,string> $entry
     *
     * @return string The 6-character code, or an empty string if the entry holds no sounds.
     */
    private static function paddedSoundCode(array $entry): string
    {
        $sounds = str_replace('!', '', implode('', $entry));

        if ($sounds === '') {
            return '';
        }

        return substr($sounds . '000000', 0, 6);
    }
805
}
806