1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
/** |
4
|
|
|
* webtrees: online genealogy |
5
|
|
|
* Copyright (C) 2025 webtrees development team |
6
|
|
|
* This program is free software: you can redistribute it and/or modify |
7
|
|
|
* it under the terms of the GNU General Public License as published by |
8
|
|
|
* the Free Software Foundation, either version 3 of the License, or |
9
|
|
|
* (at your option) any later version. |
10
|
|
|
* This program is distributed in the hope that it will be useful, |
11
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
12
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
13
|
|
|
* GNU General Public License for more details. |
14
|
|
|
* You should have received a copy of the GNU General Public License |
15
|
|
|
* along with this program. If not, see <https://www.gnu.org/licenses/>. |
16
|
|
|
*/ |
17
|
|
|
|
18
|
|
|
declare(strict_types=1); |
19
|
|
|
|
20
|
|
|
namespace Fisharebest\Webtrees; |
21
|
|
|
|
22
|
|
|
use function array_slice; |
23
|
|
|
use function count; |
24
|
|
|
use function strlen; |
25
|
|
|
|
26
|
|
|
/** |
27
|
|
|
* Phonetic matching of strings. |
28
|
|
|
*/ |
29
|
|
|
class Soundex |
30
|
|
|
{ |
31
|
|
|
// Determine the Daitch–Mokotoff Soundex code for a word |
32
|
|
|
// Original implementation by Gerry Kroll, and analysis by Meliza Amity |
33
|
|
|
|
34
|
|
|
// Max. table key length (in ASCII bytes -- NOT in UTF-8 characters!) |
35
|
|
|
private const MAXCHAR = 7; |
36
|
|
|
|
37
|
|
|
/** |
38
|
|
|
* Name transformation arrays. |
39
|
|
|
* Used to transform the Name string to simplify the "sounds like" table. |
40
|
|
|
* This is especially useful in Hebrew. |
41
|
|
|
* |
42
|
|
|
* Each array entry defines the "from" and "to" arguments of a str_replace($from, $to, $text) |
43
|
|
|
* function call to achieve the desired transformations. |
44
|
|
|
* |
45
|
|
|
* Note about the use of "\x01": |
46
|
|
|
* This code, which can’t legitimately occur in the kind of text we're dealing with, |
47
|
|
|
* is used as a place-holder so that conditional string replacements can be done. |
48
|
|
|
*/ |
49
|
|
|
private const TRANSFORM_NAMES = [ |
50
|
|
|
// Force Yiddish ligatures to be treated as separate letters |
51
|
|
|
['װ', 'וו'], |
52
|
|
|
['ײ', 'יי'], |
53
|
|
|
['ױ', 'וי'], |
54
|
|
|
['בו', 'בע'], |
55
|
|
|
['פו', 'פע'], |
56
|
|
|
['ומ', 'עמ'], |
57
|
|
|
['ום', 'עם'], |
58
|
|
|
['ונ', 'ענ'], |
59
|
|
|
['ון', 'ען'], |
60
|
|
|
['וו', 'ב'], |
61
|
|
|
["\x01", ''], |
62
|
|
|
['ייה$', "\x01ה"], |
63
|
|
|
['ייע$', "\x01ע"], |
64
|
|
|
['יי', 'ע'], |
65
|
|
|
["\x01", 'יי'], |
66
|
|
|
]; |
67
|
|
|
|
68
|
|
|
/** |
69
|
|
|
* The DM sound coding table is organized this way: |
70
|
|
|
* key: a variable-length string that corresponds to the UTF-8 character sequence |
71
|
|
|
* represented by the table entry. Currently, that string can be up to 7 |
72
|
|
|
* bytes long. This maximum length is defined by the value of the class constant |
73
|
|
|
* self::MAXCHAR. |
74
|
|
|
* |
75
|
|
|
* value: an array as follows: |
76
|
|
|
* [0]: zero if not a vowel |
77
|
|
|
* [1]: sound value when this string is at the beginning of the word |
78
|
|
|
* [2]: sound value when this string is followed by a vowel |
79
|
|
|
* [3]: sound value for other cases |
80
|
|
|
* [1],[2],[3] can be repeated several times to create branches in the code |
81
|
|
|
* an empty sound value means "ignore in this state" |
82
|
|
|
*/ |
83
|
|
|
private const DM_SOUNDS = [ |
84
|
|
|
'A' => ['1', '0', '', ''], |
85
|
|
|
'À' => ['1', '0', '', ''], |
86
|
|
|
'Á' => ['1', '0', '', ''], |
87
|
|
|
'Â' => ['1', '0', '', ''], |
88
|
|
|
'Ã' => ['1', '0', '', ''], |
89
|
|
|
'Ä' => ['1', '0', '1', '', '0', '', ''], |
90
|
|
|
'Å' => ['1', '0', '', ''], |
91
|
|
|
'Ă' => ['1', '0', '', ''], |
92
|
|
|
'Ą' => ['1', '', '', '', '', '', '6'], |
93
|
|
|
'Ạ' => ['1', '0', '', ''], |
94
|
|
|
'Ả' => ['1', '0', '', ''], |
95
|
|
|
'Ấ' => ['1', '0', '', ''], |
96
|
|
|
'Ầ' => ['1', '0', '', ''], |
97
|
|
|
'Ẩ' => ['1', '0', '', ''], |
98
|
|
|
'Ẫ' => ['1', '0', '', ''], |
99
|
|
|
'Ậ' => ['1', '0', '', ''], |
100
|
|
|
'Ắ' => ['1', '0', '', ''], |
101
|
|
|
'Ằ' => ['1', '0', '', ''], |
102
|
|
|
'Ẳ' => ['1', '0', '', ''], |
103
|
|
|
'Ẵ' => ['1', '0', '', ''], |
104
|
|
|
'Ặ' => ['1', '0', '', ''], |
105
|
|
|
'AE' => ['1', '0', '1', ''], |
106
|
|
|
'Æ' => ['1', '0', '1', ''], |
107
|
|
|
'AI' => ['1', '0', '1', ''], |
108
|
|
|
'AJ' => ['1', '0', '1', ''], |
109
|
|
|
'AU' => ['1', '0', '7', ''], |
110
|
|
|
'AV' => ['1', '0', '7', '', '7', '7', '7'], |
111
|
|
|
'ÄU' => ['1', '0', '1', ''], |
112
|
|
|
'AY' => ['1', '0', '1', ''], |
113
|
|
|
'B' => ['0', '7', '7', '7'], |
114
|
|
|
'C' => ['0', '5', '5', '5', '34', '4', '4'], |
115
|
|
|
'Ć' => ['0', '4', '4', '4'], |
116
|
|
|
'Č' => ['0', '4', '4', '4'], |
117
|
|
|
'Ç' => ['0', '4', '4', '4'], |
118
|
|
|
'CH' => ['0', '5', '5', '5', '34', '4', '4'], |
119
|
|
|
'CHS' => ['0', '5', '54', '54'], |
120
|
|
|
'CK' => ['0', '5', '5', '5', '45', '45', '45'], |
121
|
|
|
'CCS' => ['0', '4', '4', '4'], |
122
|
|
|
'CS' => ['0', '4', '4', '4'], |
123
|
|
|
'CSZ' => ['0', '4', '4', '4'], |
124
|
|
|
'CZ' => ['0', '4', '4', '4'], |
125
|
|
|
'CZS' => ['0', '4', '4', '4'], |
126
|
|
|
'D' => ['0', '3', '3', '3'], |
127
|
|
|
'Ď' => ['0', '3', '3', '3'], |
128
|
|
|
'Đ' => ['0', '3', '3', '3'], |
129
|
|
|
'DRS' => ['0', '4', '4', '4'], |
130
|
|
|
'DRZ' => ['0', '4', '4', '4'], |
131
|
|
|
'DS' => ['0', '4', '4', '4'], |
132
|
|
|
'DSH' => ['0', '4', '4', '4'], |
133
|
|
|
'DSZ' => ['0', '4', '4', '4'], |
134
|
|
|
'DT' => ['0', '3', '3', '3'], |
135
|
|
|
'DDZ' => ['0', '4', '4', '4'], |
136
|
|
|
'DDZS' => ['0', '4', '4', '4'], |
137
|
|
|
'DZ' => ['0', '4', '4', '4'], |
138
|
|
|
'DŹ' => ['0', '4', '4', '4'], |
139
|
|
|
'DŻ' => ['0', '4', '4', '4'], |
140
|
|
|
'DZH' => ['0', '4', '4', '4'], |
141
|
|
|
'DZS' => ['0', '4', '4', '4'], |
142
|
|
|
'E' => ['1', '0', '', ''], |
143
|
|
|
'È' => ['1', '0', '', ''], |
144
|
|
|
'É' => ['1', '0', '', ''], |
145
|
|
|
'Ê' => ['1', '0', '', ''], |
146
|
|
|
'Ë' => ['1', '0', '', ''], |
147
|
|
|
'Ĕ' => ['1', '0', '', ''], |
148
|
|
|
'Ė' => ['1', '0', '', ''], |
149
|
|
|
'Ę' => ['1', '', '', '6', '', '', ''], |
150
|
|
|
'Ẹ' => ['1', '0', '', ''], |
151
|
|
|
'Ẻ' => ['1', '0', '', ''], |
152
|
|
|
'Ẽ' => ['1', '0', '', ''], |
153
|
|
|
'Ế' => ['1', '0', '', ''], |
154
|
|
|
'Ề' => ['1', '0', '', ''], |
155
|
|
|
'Ể' => ['1', '0', '', ''], |
156
|
|
|
'Ễ' => ['1', '0', '', ''], |
157
|
|
|
'Ệ' => ['1', '0', '', ''], |
158
|
|
|
'EAU' => ['1', '0', '', ''], |
159
|
|
|
'EI' => ['1', '0', '1', ''], |
160
|
|
|
'EJ' => ['1', '0', '1', ''], |
161
|
|
|
'EU' => ['1', '1', '1', ''], |
162
|
|
|
'EY' => ['1', '0', '1', ''], |
163
|
|
|
'F' => ['0', '7', '7', '7'], |
164
|
|
|
'FB' => ['0', '7', '7', '7'], |
165
|
|
|
'G' => ['0', '5', '5', '5', '34', '4', '4'], |
166
|
|
|
'Ğ' => ['0', '', '', ''], |
167
|
|
|
'GGY' => ['0', '5', '5', '5'], |
168
|
|
|
'GY' => ['0', '5', '5', '5'], |
169
|
|
|
'H' => ['0', '5', '5', '', '5', '5', '5'], |
170
|
|
|
'I' => ['1', '0', '', ''], |
171
|
|
|
'Ì' => ['1', '0', '', ''], |
172
|
|
|
'Í' => ['1', '0', '', ''], |
173
|
|
|
'Î' => ['1', '0', '', ''], |
174
|
|
|
'Ï' => ['1', '0', '', ''], |
175
|
|
|
'Ĩ' => ['1', '0', '', ''], |
176
|
|
|
'Į' => ['1', '0', '', ''], |
177
|
|
|
'İ' => ['1', '0', '', ''], |
178
|
|
|
'Ỉ' => ['1', '0', '', ''], |
179
|
|
|
'Ị' => ['1', '0', '', ''], |
180
|
|
|
'IA' => ['1', '1', '', ''], |
181
|
|
|
'IE' => ['1', '1', '', ''], |
182
|
|
|
'IO' => ['1', '1', '', ''], |
183
|
|
|
'IU' => ['1', '1', '', ''], |
184
|
|
|
'J' => ['0', '1', '', '', '4', '4', '4', '5', '5', ''], |
185
|
|
|
'K' => ['0', '5', '5', '5'], |
186
|
|
|
'KH' => ['0', '5', '5', '5'], |
187
|
|
|
'KS' => ['0', '5', '54', '54'], |
188
|
|
|
'L' => ['0', '8', '8', '8'], |
189
|
|
|
'Ľ' => ['0', '8', '8', '8'], |
190
|
|
|
'Ĺ' => ['0', '8', '8', '8'], |
191
|
|
|
'Ł' => ['0', '7', '7', '7', '8', '8', '8'], |
192
|
|
|
'LL' => ['0', '8', '8', '8', '58', '8', '8', '1', '8', '8'], |
193
|
|
|
'LLY' => ['0', '8', '8', '8', '1', '8', '8'], |
194
|
|
|
'LY' => ['0', '8', '8', '8', '1', '8', '8'], |
195
|
|
|
'M' => ['0', '6', '6', '6'], |
196
|
|
|
'MĔ' => ['0', '66', '66', '66'], |
197
|
|
|
'MN' => ['0', '66', '66', '66'], |
198
|
|
|
'N' => ['0', '6', '6', '6'], |
199
|
|
|
'Ń' => ['0', '6', '6', '6'], |
200
|
|
|
'Ň' => ['0', '6', '6', '6'], |
201
|
|
|
'Ñ' => ['0', '6', '6', '6'], |
202
|
|
|
'NM' => ['0', '66', '66', '66'], |
203
|
|
|
'O' => ['1', '0', '', ''], |
204
|
|
|
'Ò' => ['1', '0', '', ''], |
205
|
|
|
'Ó' => ['1', '0', '', ''], |
206
|
|
|
'Ô' => ['1', '0', '', ''], |
207
|
|
|
'Õ' => ['1', '0', '', ''], |
208
|
|
|
'Ö' => ['1', '0', '', ''], |
209
|
|
|
'Ø' => ['1', '0', '', ''], |
210
|
|
|
'Ő' => ['1', '0', '', ''], |
211
|
|
|
'Œ' => ['1', '0', '', ''], |
212
|
|
|
'Ơ' => ['1', '0', '', ''], |
213
|
|
|
'Ọ' => ['1', '0', '', ''], |
214
|
|
|
'Ỏ' => ['1', '0', '', ''], |
215
|
|
|
'Ố' => ['1', '0', '', ''], |
216
|
|
|
'Ồ' => ['1', '0', '', ''], |
217
|
|
|
'Ổ' => ['1', '0', '', ''], |
218
|
|
|
'Ỗ' => ['1', '0', '', ''], |
219
|
|
|
'Ộ' => ['1', '0', '', ''], |
220
|
|
|
'Ớ' => ['1', '0', '', ''], |
221
|
|
|
'Ờ' => ['1', '0', '', ''], |
222
|
|
|
'Ở' => ['1', '0', '', ''], |
223
|
|
|
'Ỡ' => ['1', '0', '', ''], |
224
|
|
|
'Ợ' => ['1', '0', '', ''], |
225
|
|
|
'OE' => ['1', '0', '', ''], |
226
|
|
|
'OI' => ['1', '0', '1', ''], |
227
|
|
|
'OJ' => ['1', '0', '1', ''], |
228
|
|
|
'OU' => ['1', '0', '', ''], |
229
|
|
|
'OY' => ['1', '0', '1', ''], |
230
|
|
|
'P' => ['0', '7', '7', '7'], |
231
|
|
|
'PF' => ['0', '7', '7', '7'], |
232
|
|
|
'PH' => ['0', '7', '7', '7'], |
233
|
|
|
'Q' => ['0', '5', '5', '5'], |
234
|
|
|
'R' => ['0', '9', '9', '9'], |
235
|
|
|
'Ř' => ['0', '4', '4', '4'], |
236
|
|
|
'RS' => ['0', '4', '4', '4', '94', '94', '94'], |
237
|
|
|
'RZ' => ['0', '4', '4', '4', '94', '94', '94'], |
238
|
|
|
'S' => ['0', '4', '4', '4'], |
239
|
|
|
'Ś' => ['0', '4', '4', '4'], |
240
|
|
|
'Š' => ['0', '4', '4', '4'], |
241
|
|
|
'Ş' => ['0', '4', '4', '4'], |
242
|
|
|
'SC' => ['0', '2', '4', '4'], |
243
|
|
|
'ŠČ' => ['0', '2', '4', '4'], |
244
|
|
|
'SCH' => ['0', '4', '4', '4'], |
245
|
|
|
'SCHD' => ['0', '2', '43', '43'], |
246
|
|
|
'SCHT' => ['0', '2', '43', '43'], |
247
|
|
|
'SCHTCH' => ['0', '2', '4', '4'], |
248
|
|
|
'SCHTSCH' => ['0', '2', '4', '4'], |
249
|
|
|
'SCHTSH' => ['0', '2', '4', '4'], |
250
|
|
|
'SD' => ['0', '2', '43', '43'], |
251
|
|
|
'SH' => ['0', '4', '4', '4'], |
252
|
|
|
'SHCH' => ['0', '2', '4', '4'], |
253
|
|
|
'SHD' => ['0', '2', '43', '43'], |
254
|
|
|
'SHT' => ['0', '2', '43', '43'], |
255
|
|
|
'SHTCH' => ['0', '2', '4', '4'], |
256
|
|
|
'SHTSH' => ['0', '2', '4', '4'], |
257
|
|
|
'ß' => ['0', '', '4', '4'], |
258
|
|
|
'ST' => ['0', '2', '43', '43'], |
259
|
|
|
'STCH' => ['0', '2', '4', '4'], |
260
|
|
|
'STRS' => ['0', '2', '4', '4'], |
261
|
|
|
'STRZ' => ['0', '2', '4', '4'], |
262
|
|
|
'STSCH' => ['0', '2', '4', '4'], |
263
|
|
|
'STSH' => ['0', '2', '4', '4'], |
264
|
|
|
'SSZ' => ['0', '4', '4', '4'], |
265
|
|
|
'SZ' => ['0', '4', '4', '4'], |
266
|
|
|
'SZCS' => ['0', '2', '4', '4'], |
267
|
|
|
'SZCZ' => ['0', '2', '4', '4'], |
268
|
|
|
'SZD' => ['0', '2', '43', '43'], |
269
|
|
|
'SZT' => ['0', '2', '43', '43'], |
270
|
|
|
'T' => ['0', '3', '3', '3'], |
271
|
|
|
'Ť' => ['0', '3', '3', '3'], |
272
|
|
|
'Ţ' => ['0', '3', '3', '3', '4', '4', '4'], |
273
|
|
|
'TC' => ['0', '4', '4', '4'], |
274
|
|
|
'TCH' => ['0', '4', '4', '4'], |
275
|
|
|
'TH' => ['0', '3', '3', '3'], |
276
|
|
|
'TRS' => ['0', '4', '4', '4'], |
277
|
|
|
'TRZ' => ['0', '4', '4', '4'], |
278
|
|
|
'TS' => ['0', '4', '4', '4'], |
279
|
|
|
'TSCH' => ['0', '4', '4', '4'], |
280
|
|
|
'TSH' => ['0', '4', '4', '4'], |
281
|
|
|
'TSZ' => ['0', '4', '4', '4'], |
282
|
|
|
'TTCH' => ['0', '4', '4', '4'], |
283
|
|
|
'TTS' => ['0', '4', '4', '4'], |
284
|
|
|
'TTSCH' => ['0', '4', '4', '4'], |
285
|
|
|
'TTSZ' => ['0', '4', '4', '4'], |
286
|
|
|
'TTZ' => ['0', '4', '4', '4'], |
287
|
|
|
'TZ' => ['0', '4', '4', '4'], |
288
|
|
|
'TZS' => ['0', '4', '4', '4'], |
289
|
|
|
'U' => ['1', '0', '', ''], |
290
|
|
|
'Ù' => ['1', '0', '', ''], |
291
|
|
|
'Ú' => ['1', '0', '', ''], |
292
|
|
|
'Û' => ['1', '0', '', ''], |
293
|
|
|
'Ü' => ['1', '0', '', ''], |
294
|
|
|
'Ũ' => ['1', '0', '', ''], |
295
|
|
|
'Ū' => ['1', '0', '', ''], |
296
|
|
|
'Ů' => ['1', '0', '', ''], |
297
|
|
|
'Ű' => ['1', '0', '', ''], |
298
|
|
|
'Ų' => ['1', '0', '', ''], |
299
|
|
|
'Ư' => ['1', '0', '', ''], |
300
|
|
|
'Ụ' => ['1', '0', '', ''], |
301
|
|
|
'Ủ' => ['1', '0', '', ''], |
302
|
|
|
'Ứ' => ['1', '0', '', ''], |
303
|
|
|
'Ừ' => ['1', '0', '', ''], |
304
|
|
|
'Ử' => ['1', '0', '', ''], |
305
|
|
|
'Ữ' => ['1', '0', '', ''], |
306
|
|
|
'Ự' => ['1', '0', '', ''], |
307
|
|
|
'UE' => ['1', '0', '', ''], |
308
|
|
|
'UI' => ['1', '0', '1', ''], |
309
|
|
|
'UJ' => ['1', '0', '1', ''], |
310
|
|
|
'UY' => ['1', '0', '1', ''], |
311
|
|
|
'UW' => ['1', '0', '1', '', '0', '7', '7'], |
312
|
|
|
'V' => ['0', '7', '7', '7'], |
313
|
|
|
'W' => ['0', '7', '7', '7'], |
314
|
|
|
'X' => ['0', '5', '54', '54'], |
315
|
|
|
'Y' => ['1', '1', '', ''], |
316
|
|
|
'Ý' => ['1', '1', '', ''], |
317
|
|
|
'Ỳ' => ['1', '1', '', ''], |
318
|
|
|
'Ỵ' => ['1', '1', '', ''], |
319
|
|
|
'Ỷ' => ['1', '1', '', ''], |
320
|
|
|
'Ỹ' => ['1', '1', '', ''], |
321
|
|
|
'Z' => ['0', '4', '4', '4'], |
322
|
|
|
'Ź' => ['0', '4', '4', '4'], |
323
|
|
|
'Ż' => ['0', '4', '4', '4'], |
324
|
|
|
'Ž' => ['0', '4', '4', '4'], |
325
|
|
|
'ZD' => ['0', '2', '43', '43'], |
326
|
|
|
'ZDZ' => ['0', '2', '4', '4'], |
327
|
|
|
'ZDZH' => ['0', '2', '4', '4'], |
328
|
|
|
'ZH' => ['0', '4', '4', '4'], |
329
|
|
|
'ZHD' => ['0', '2', '43', '43'], |
330
|
|
|
'ZHDZH' => ['0', '2', '4', '4'], |
331
|
|
|
'ZS' => ['0', '4', '4', '4'], |
332
|
|
|
'ZSCH' => ['0', '4', '4', '4'], |
333
|
|
|
'ZSH' => ['0', '4', '4', '4'], |
334
|
|
|
'ZZS' => ['0', '4', '4', '4'], |
335
|
|
|
// Cyrillic alphabet |
336
|
|
|
'А' => ['1', '0', '', ''], |
337
|
|
|
'Б' => ['0', '7', '7', '7'], |
338
|
|
|
'В' => ['0', '7', '7', '7'], |
339
|
|
|
'Г' => ['0', '5', '5', '5'], |
340
|
|
|
'Д' => ['0', '3', '3', '3'], |
341
|
|
|
'ДЗ' => ['0', '4', '4', '4'], |
342
|
|
|
'Е' => ['1', '0', '', ''], |
343
|
|
|
'Ё' => ['1', '0', '', ''], |
344
|
|
|
'Ж' => ['0', '4', '4', '4'], |
345
|
|
|
'З' => ['0', '4', '4', '4'], |
346
|
|
|
'И' => ['1', '0', '', ''], |
347
|
|
|
'Й' => ['1', '1', '', '', '4', '4', '4'], |
348
|
|
|
'К' => ['0', '5', '5', '5'], |
349
|
|
|
'Л' => ['0', '8', '8', '8'], |
350
|
|
|
'М' => ['0', '6', '6', '6'], |
351
|
|
|
'Н' => ['0', '6', '6', '6'], |
352
|
|
|
'О' => ['1', '0', '', ''], |
353
|
|
|
'П' => ['0', '7', '7', '7'], |
354
|
|
|
'Р' => ['0', '9', '9', '9'], |
355
|
|
|
'РЖ' => ['0', '4', '4', '4'], |
356
|
|
|
'С' => ['0', '4', '4', '4'], |
357
|
|
|
'Т' => ['0', '3', '3', '3'], |
358
|
|
|
'У' => ['1', '0', '', ''], |
359
|
|
|
'Ф' => ['0', '7', '7', '7'], |
360
|
|
|
'Х' => ['0', '5', '5', '5'], |
361
|
|
|
'Ц' => ['0', '4', '4', '4'], |
362
|
|
|
'Ч' => ['0', '4', '4', '4'], |
363
|
|
|
'Ш' => ['0', '4', '4', '4'], |
364
|
|
|
'Щ' => ['0', '2', '4', '4'], |
365
|
|
|
'Ъ' => ['0', '', '', ''], |
366
|
|
|
'Ы' => ['0', '1', '', ''], |
367
|
|
|
'Ь' => ['0', '', '', ''], |
368
|
|
|
'Э' => ['1', '0', '', ''], |
369
|
|
|
'Ю' => ['0', '1', '', ''], |
370
|
|
|
'Я' => ['0', '1', '', ''], |
371
|
|
|
// Greek alphabet |
372
|
|
|
'Α' => ['1', '0', '', ''], |
373
|
|
|
'Ά' => ['1', '0', '', ''], |
374
|
|
|
'ΑΙ' => ['1', '0', '1', ''], |
375
|
|
|
'ΑΥ' => ['1', '0', '1', ''], |
376
|
|
|
'Β' => ['0', '7', '7', '7'], |
377
|
|
|
'Γ' => ['0', '5', '5', '5'], |
378
|
|
|
'Δ' => ['0', '3', '3', '3'], |
379
|
|
|
'Ε' => ['1', '0', '', ''], |
380
|
|
|
'Έ' => ['1', '0', '', ''], |
381
|
|
|
'ΕΙ' => ['1', '0', '1', ''], |
382
|
|
|
'ΕΥ' => ['1', '1', '1', ''], |
383
|
|
|
'Ζ' => ['0', '4', '4', '4'], |
384
|
|
|
'Η' => ['1', '0', '', ''], |
385
|
|
|
'Ή' => ['1', '0', '', ''], |
386
|
|
|
'Θ' => ['0', '3', '3', '3'], |
387
|
|
|
'Ι' => ['1', '0', '', ''], |
388
|
|
|
'Ί' => ['1', '0', '', ''], |
389
|
|
|
'Ϊ' => ['1', '0', '', ''], |
390
|
|
|
'ΐ' => ['1', '0', '', ''], |
391
|
|
|
'Κ' => ['0', '5', '5', '5'], |
392
|
|
|
'Λ' => ['0', '8', '8', '8'], |
393
|
|
|
'Μ' => ['0', '6', '6', '6'], |
394
|
|
|
'ΜΠ' => ['0', '7', '7', '7'], |
395
|
|
|
'Ν' => ['0', '6', '6', '6'], |
396
|
|
|
'ΝΤ' => ['0', '3', '3', '3'], |
397
|
|
|
'Ξ' => ['0', '5', '54', '54'], |
398
|
|
|
'Ο' => ['1', '0', '', ''], |
399
|
|
|
'Ό' => ['1', '0', '', ''], |
400
|
|
|
'ΟΙ' => ['1', '0', '1', ''], |
401
|
|
|
'ΟΥ' => ['1', '0', '1', ''], |
402
|
|
|
'Π' => ['0', '7', '7', '7'], |
403
|
|
|
'Ρ' => ['0', '9', '9', '9'], |
404
|
|
|
'Σ' => ['0', '4', '4', '4'], |
405
|
|
|
'ς' => ['0', '', '', '4'], |
406
|
|
|
'Τ' => ['0', '3', '3', '3'], |
407
|
|
|
'ΤΖ' => ['0', '4', '4', '4'], |
408
|
|
|
'ΤΣ' => ['0', '4', '4', '4'], |
409
|
|
|
'Υ' => ['1', '1', '', ''], |
410
|
|
|
'Ύ' => ['1', '1', '', ''], |
411
|
|
|
'Ϋ' => ['1', '1', '', ''], |
412
|
|
|
'ΰ' => ['1', '1', '', ''], |
413
|
|
|
'ΥΚ' => ['1', '5', '5', '5'], |
414
|
|
|
'ΥΥ' => ['1', '65', '65', '65'], |
415
|
|
|
'Φ' => ['0', '7', '7', '7'], |
416
|
|
|
'Χ' => ['0', '5', '5', '5'], |
417
|
|
|
'Ψ' => ['0', '7', '7', '7'], |
418
|
|
|
'Ω' => ['1', '0', '', ''], |
419
|
|
|
'Ώ' => ['1', '0', '', ''], |
420
|
|
|
// Hebrew alphabet |
421
|
|
|
'א' => ['1', '0', '', ''], |
422
|
|
|
'או' => ['1', '0', '7', ''], |
423
|
|
|
'אג' => ['1', '4', '4', '4', '5', '5', '5', '34', '34', '34'], |
424
|
|
|
'בב' => ['0', '7', '7', '7', '77', '77', '77'], |
425
|
|
|
'ב' => ['0', '7', '7', '7'], |
426
|
|
|
'גג' => ['0', '4', '4', '4', '5', '5', '5', '45', '45', '45', '55', '55', '55', '54', '54', '54'], |
427
|
|
|
'גד' => ['0', '43', '43', '43', '53', '53', '53'], |
428
|
|
|
'גה' => ['0', '45', '45', '45', '55', '55', '55'], |
429
|
|
|
'גז' => ['0', '44', '44', '44', '45', '45', '45'], |
430
|
|
|
'גח' => ['0', '45', '45', '45', '55', '55', '55'], |
431
|
|
|
'גכ' => ['0', '45', '45', '45', '55', '55', '55'], |
432
|
|
|
'גך' => ['0', '45', '45', '45', '55', '55', '55'], |
433
|
|
|
'גצ' => ['0', '44', '44', '44', '45', '45', '45'], |
434
|
|
|
'גץ' => ['0', '44', '44', '44', '45', '45', '45'], |
435
|
|
|
'גק' => ['0', '45', '45', '45', '54', '54', '54'], |
436
|
|
|
'גש' => ['0', '44', '44', '44', '54', '54', '54'], |
437
|
|
|
'גת' => ['0', '43', '43', '43', '53', '53', '53'], |
438
|
|
|
'ג' => ['0', '4', '4', '4', '5', '5', '5'], |
439
|
|
|
'דז' => ['0', '4', '4', '4'], |
440
|
|
|
'דד' => ['0', '3', '3', '3', '33', '33', '33'], |
441
|
|
|
'דט' => ['0', '33', '33', '33'], |
442
|
|
|
'דש' => ['0', '4', '4', '4'], |
443
|
|
|
'דצ' => ['0', '4', '4', '4'], |
444
|
|
|
'דץ' => ['0', '4', '4', '4'], |
445
|
|
|
'ד' => ['0', '3', '3', '3'], |
446
|
|
|
'הג' => ['0', '54', '54', '54', '55', '55', '55'], |
447
|
|
|
'הכ' => ['0', '55', '55', '55'], |
448
|
|
|
'הח' => ['0', '55', '55', '55'], |
449
|
|
|
'הק' => ['0', '55', '55', '55', '5', '5', '5'], |
450
|
|
|
'הה' => ['0', '5', '5', '', '55', '55', ''], |
451
|
|
|
'ה' => ['0', '5', '5', ''], |
452
|
|
|
'וי' => ['1', '', '', '', '7', '7', '7'], |
453
|
|
|
'ו' => ['1', '7', '7', '7', '7', '', ''], |
454
|
|
|
'וו' => ['1', '7', '7', '7', '7', '', ''], |
455
|
|
|
'וופ' => ['1', '7', '7', '7', '77', '77', '77'], |
456
|
|
|
'זש' => ['0', '4', '4', '4', '44', '44', '44'], |
457
|
|
|
'זדז' => ['0', '2', '4', '4'], |
458
|
|
|
'ז' => ['0', '4', '4', '4'], |
459
|
|
|
'זג' => ['0', '44', '44', '44', '45', '45', '45'], |
460
|
|
|
'זז' => ['0', '4', '4', '4', '44', '44', '44'], |
461
|
|
|
'זס' => ['0', '44', '44', '44'], |
462
|
|
|
'זצ' => ['0', '44', '44', '44'], |
463
|
|
|
'זץ' => ['0', '44', '44', '44'], |
464
|
|
|
'חג' => ['0', '54', '54', '54', '53', '53', '53'], |
465
|
|
|
'חח' => ['0', '5', '5', '5', '55', '55', '55'], |
466
|
|
|
'חק' => ['0', '55', '55', '55', '5', '5', '5'], |
467
|
|
|
'חכ' => ['0', '45', '45', '45', '55', '55', '55'], |
468
|
|
|
'חס' => ['0', '5', '54', '54'], |
469
|
|
|
'חש' => ['0', '5', '54', '54'], |
470
|
|
|
'ח' => ['0', '5', '5', '5'], |
471
|
|
|
'טש' => ['0', '4', '4', '4'], |
472
|
|
|
'טד' => ['0', '33', '33', '33'], |
473
|
|
|
'טי' => ['0', '3', '3', '3', '4', '4', '4', '3', '3', '34'], |
474
|
|
|
'טת' => ['0', '33', '33', '33'], |
475
|
|
|
'טט' => ['0', '3', '3', '3', '33', '33', '33'], |
476
|
|
|
'ט' => ['0', '3', '3', '3'], |
477
|
|
|
'י' => ['1', '1', '', ''], |
478
|
|
|
'יא' => ['1', '1', '', '', '1', '1', '1'], |
479
|
|
|
'כג' => ['0', '55', '55', '55', '54', '54', '54'], |
480
|
|
|
'כש' => ['0', '5', '54', '54'], |
481
|
|
|
'כס' => ['0', '5', '54', '54'], |
482
|
|
|
'ככ' => ['0', '5', '5', '5', '55', '55', '55'], |
483
|
|
|
'כך' => ['0', '5', '5', '5', '55', '55', '55'], |
484
|
|
|
'כ' => ['0', '5', '5', '5'], |
485
|
|
|
'כח' => ['0', '55', '55', '55', '5', '5', '5'], |
486
|
|
|
'ך' => ['0', '', '5', '5'], |
487
|
|
|
'ל' => ['0', '8', '8', '8'], |
488
|
|
|
'לל' => ['0', '88', '88', '88', '8', '8', '8'], |
489
|
|
|
'מנ' => ['0', '66', '66', '66'], |
490
|
|
|
'מן' => ['0', '66', '66', '66'], |
491
|
|
|
'ממ' => ['0', '6', '6', '6', '66', '66', '66'], |
492
|
|
|
'מם' => ['0', '6', '6', '6', '66', '66', '66'], |
493
|
|
|
'מ' => ['0', '6', '6', '6'], |
494
|
|
|
'ם' => ['0', '', '6', '6'], |
495
|
|
|
'נמ' => ['0', '66', '66', '66'], |
496
|
|
|
'נם' => ['0', '66', '66', '66'], |
497
|
|
|
'ננ' => ['0', '6', '6', '6', '66', '66', '66'], |
498
|
|
|
'נן' => ['0', '6', '6', '6', '66', '66', '66'], |
499
|
|
|
'נ' => ['0', '6', '6', '6'], |
500
|
|
|
'ן' => ['0', '', '6', '6'], |
501
|
|
|
'סתש' => ['0', '2', '4', '4'], |
502
|
|
|
'סתז' => ['0', '2', '4', '4'], |
503
|
|
|
'סטז' => ['0', '2', '4', '4'], |
504
|
|
|
'סטש' => ['0', '2', '4', '4'], |
505
|
|
|
'סצד' => ['0', '2', '4', '4'], |
506
|
|
|
'סט' => ['0', '2', '4', '4', '43', '43', '43'], |
507
|
|
|
'סת' => ['0', '2', '4', '4', '43', '43', '43'], |
508
|
|
|
'סג' => ['0', '44', '44', '44', '4', '4', '4'], |
509
|
|
|
'סס' => ['0', '4', '4', '4', '44', '44', '44'], |
510
|
|
|
'סצ' => ['0', '44', '44', '44'], |
511
|
|
|
'סץ' => ['0', '44', '44', '44'], |
512
|
|
|
'סז' => ['0', '44', '44', '44'], |
513
|
|
|
'סש' => ['0', '44', '44', '44'], |
514
|
|
|
'ס' => ['0', '4', '4', '4'], |
515
|
|
|
'ע' => ['1', '0', '', ''], |
516
|
|
|
'פב' => ['0', '7', '7', '7', '77', '77', '77'], |
517
|
|
|
'פוו' => ['0', '7', '7', '7', '77', '77', '77'], |
518
|
|
|
'פפ' => ['0', '7', '7', '7', '77', '77', '77'], |
519
|
|
|
'פף' => ['0', '7', '7', '7', '77', '77', '77'], |
520
|
|
|
'פ' => ['0', '7', '7', '7'], |
521
|
|
|
'ף' => ['0', '', '7', '7'], |
522
|
|
|
'צג' => ['0', '44', '44', '44', '45', '45', '45'], |
523
|
|
|
'צז' => ['0', '44', '44', '44'], |
524
|
|
|
'צס' => ['0', '44', '44', '44'], |
525
|
|
|
'צצ' => ['0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54', '45', '45', '45'], |
526
|
|
|
'צץ' => ['0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54'], |
527
|
|
|
'צש' => ['0', '44', '44', '44', '4', '4', '4', '5', '5', '5'], |
528
|
|
|
'צ' => ['0', '4', '4', '4', '5', '5', '5'], |
529
|
|
|
'ץ' => ['0', '', '4', '4'], |
530
|
|
|
'קה' => ['0', '55', '55', '5'], |
531
|
|
|
'קס' => ['0', '5', '54', '54'], |
532
|
|
|
'קש' => ['0', '5', '54', '54'], |
533
|
|
|
'קק' => ['0', '5', '5', '5', '55', '55', '55'], |
534
|
|
|
'קח' => ['0', '55', '55', '55'], |
535
|
|
|
'קכ' => ['0', '55', '55', '55'], |
536
|
|
|
'קך' => ['0', '55', '55', '55'], |
537
|
|
|
'קג' => ['0', '55', '55', '55', '54', '54', '54'], |
538
|
|
|
'ק' => ['0', '5', '5', '5'], |
539
|
|
|
'רר' => ['0', '99', '99', '99', '9', '9', '9'], |
540
|
|
|
'ר' => ['0', '9', '9', '9'], |
541
|
|
|
'שטז' => ['0', '2', '4', '4'], |
542
|
|
|
'שתש' => ['0', '2', '4', '4'], |
543
|
|
|
'שתז' => ['0', '2', '4', '4'], |
544
|
|
|
'שטש' => ['0', '2', '4', '4'], |
545
|
|
|
'שד' => ['0', '2', '43', '43'], |
546
|
|
|
'שז' => ['0', '44', '44', '44'], |
547
|
|
|
'שס' => ['0', '44', '44', '44'], |
548
|
|
|
'שת' => ['0', '2', '43', '43'], |
549
|
|
|
'שג' => ['0', '4', '4', '4', '44', '44', '44', '4', '43', '43'], |
550
|
|
|
'שט' => ['0', '2', '43', '43', '44', '44', '44'], |
551
|
|
|
'שצ' => ['0', '44', '44', '44', '45', '45', '45'], |
552
|
|
|
'שץ' => ['0', '44', '', '44', '45', '', '45'], |
553
|
|
|
'שש' => ['0', '4', '4', '4', '44', '44', '44'], |
554
|
|
|
'ש' => ['0', '4', '4', '4'], |
555
|
|
|
'תג' => ['0', '34', '34', '34'], |
556
|
|
|
'תז' => ['0', '34', '34', '34'], |
557
|
|
|
'תש' => ['0', '4', '4', '4'], |
558
|
|
|
'תת' => ['0', '3', '3', '3', '4', '4', '4', '33', '33', '33', '44', '44', '44', '34', '34', '34', '43', '43', '43'], |
559
|
|
|
'ת' => ['0', '3', '3', '3', '4', '4', '4'], |
560
|
|
|
// Arabic alphabet |
561
|
|
|
'ا' => ['1', '0', '', ''], |
562
|
|
|
'ب' => ['0', '7', '7', '7'], |
563
|
|
|
'ت' => ['0', '3', '3', '3'], |
564
|
|
|
'ث' => ['0', '3', '3', '3'], |
565
|
|
|
'ج' => ['0', '4', '4', '4'], |
566
|
|
|
'ح' => ['0', '5', '5', '5'], |
567
|
|
|
'خ' => ['0', '5', '5', '5'], |
568
|
|
|
'د' => ['0', '3', '3', '3'], |
569
|
|
|
'ذ' => ['0', '3', '3', '3'], |
570
|
|
|
'ر' => ['0', '9', '9', '9'], |
571
|
|
|
'ز' => ['0', '4', '4', '4'], |
572
|
|
|
'س' => ['0', '4', '4', '4'], |
573
|
|
|
'ش' => ['0', '4', '4', '4'], |
574
|
|
|
'ص' => ['0', '4', '4', '4'], |
575
|
|
|
'ض' => ['0', '3', '3', '3'], |
576
|
|
|
'ط' => ['0', '3', '3', '3'], |
577
|
|
|
'ظ' => ['0', '4', '4', '4'], |
578
|
|
|
'ع' => ['1', '0', '', ''], |
579
|
|
|
'غ' => ['0', '0', '', ''], |
580
|
|
|
'ف' => ['0', '7', '7', '7'], |
581
|
|
|
'ق' => ['0', '5', '5', '5'], |
582
|
|
|
'ك' => ['0', '5', '5', '5'], |
583
|
|
|
'ل' => ['0', '8', '8', '8'], |
584
|
|
|
'لا' => ['0', '8', '8', '8'], |
585
|
|
|
'م' => ['0', '6', '6', '6'], |
586
|
|
|
'ن' => ['0', '6', '6', '6'], |
587
|
|
|
'هن' => ['0', '66', '66', '66'], |
588
|
|
|
'ه' => ['0', '5', '5', ''], |
589
|
|
|
'و' => ['1', '', '', '', '7', '', ''], |
590
|
|
|
'ي' => ['0', '1', '', ''], |
591
|
|
|
'آ' => ['0', '1', '', ''], |
592
|
|
|
'ة' => ['0', '', '', '3'], |
593
|
|
|
'ی' => ['0', '1', '', ''], |
594
|
|
|
'ى' => ['1', '1', '', ''], |
595
|
|
|
]; |
596
|
|
|
|
/**
 * List the phonetic algorithms that this class implements.
 *
 * @return array<string> Display names (as returned by I18N::translate()), keyed by
 *                       the short algorithm code: 'std' or 'dm'.
 */
public static function getAlgorithms(): array
{
    $algorithms = [];

    /* I18N: https://en.wikipedia.org/wiki/Soundex */
    $algorithms['std'] = I18N::translate('Russell');

    /* I18N: https://en.wikipedia.org/wiki/Daitch–Mokotoff_Soundex */
    $algorithms['dm'] = I18N::translate('Daitch-Mokotoff');

    return $algorithms;
}
611
|
|
|
|
/**
 * Do two lists of soundex codes share at least one code?
 *
 * @param string $soundex1 Colon-separated list of codes
 * @param string $soundex2 Colon-separated list of codes
 *
 * @return bool False if either list is the empty string, or if no code occurs in both lists
 */
public static function compare(string $soundex1, string $soundex2): bool
{
    // An empty list never matches anything
    if ($soundex1 === '' || $soundex2 === '') {
        return false;
    }

    $codes1 = explode(':', $soundex1);
    $codes2 = explode(':', $soundex2);
    $shared = array_intersect($codes1, $codes2);

    return $shared !== [];
}
628
|
|
|
|
/**
 * Generate Russell soundex codes for a given text.
 *
 * The text is split on spaces and each word is coded separately. When there is
 * more than one word, the text with all spaces removed is coded as well.
 *
 * @param string $text
 *
 * @return string Colon-separated list of unique codes (at most 51), or an empty
 *                string if nothing in the text could be coded.
 */
public static function russell(string $text): string
{
    $words = explode(' ', $text);

    // Code every word on its own
    $candidates = [];

    foreach ($words as $word) {
        $candidates[] = soundex($word);
    }

    // Combine words, e.g. “New York” as “Newyork”
    if (count($words) > 1) {
        $candidates[] = soundex(str_replace(' ', '', $text));
    }

    // Only return codes from recognisable sounds.
    // '0000' is the code for text without any letters. An empty code comes from an
    // empty word (consecutive, leading or trailing spaces); keeping it would put an
    // empty item in the list, which compare() would then treat as a matching code.
    $soundex_array = [];

    foreach ($candidates as $soundex) {
        if ($soundex !== '' && $soundex !== '0000') {
            $soundex_array[] = $soundex;
        }
    }

    // A varchar(255) column can only hold 51 4-character codes (plus 50 delimiters)
    $soundex_array = array_slice(array_unique($soundex_array), 0, 51);

    return implode(':', $soundex_array);
}
660
|
|
|
|
/**
 * Generate Daitch–Mokotoff soundex codes for a given text.
 *
 * The text is split on spaces and each word is coded separately. When there is
 * more than one word, the text with all spaces removed is coded as well.
 *
 * @param string $text
 *
 * @return string Colon-separated list of unique codes (at most 36)
 */
public static function daitchMokotoff(string $text): string
{
    $parts = explode(' ', $text);

    // One list of codes per word, in word order
    $code_lists = array_map(
        static fn (string $part): array => self::daitchMokotoffWord($part),
        $parts
    );

    // Combine words, e.g. “New York” as “Newyork”
    if (count($parts) > 1) {
        $code_lists[] = self::daitchMokotoffWord(str_replace(' ', '', $text));
    }

    $unique_codes = array_unique(array_merge(...$code_lists));

    // A varchar(255) column can only hold 36 6-character codes (plus 35 delimiters)
    return implode(':', array_slice($unique_codes, 0, 36));
}
686
|
|
|
|
/**
 * Calculate the Daitch-Mokotoff soundex for a word.
 *
 * The word is upper-cased, rewritten using TRANSFORM_NAMES, and then consumed from
 * left to right in chunks, always taking the longest chunk that is a key of DM_SOUNDS.
 * Each chunk contributes the sound value for its position (start of word, before a
 * vowel, or other). Table entries that contain several triplets of sound values
 * create alternative codes, so one word can produce several codes.
 *
 * @param string $name
 *
 * @return array<string> List of possible DM codes for the word.
 */
private static function daitchMokotoffWord(string $name): array
{
    // Apply special transformation rules to the input string
    $name = I18N::strtoupper($name);

    // NOTE(review): the rules are applied with str_replace(), so the trailing "$" in
    // some TRANSFORM_NAMES entries is matched as a literal dollar sign, not as an
    // end-of-word anchor. Confirm whether a regular-expression replacement was intended
    // before changing this: existing stored codes depend on the current behaviour.
    foreach (self::TRANSFORM_NAMES as [$search, $replace]) {
        $name = str_replace($search, $replace, $name);
    }

    // Initialize
    $name_script = I18N::textScript($name);
    $noVowels    = $name_script === 'Hebr' || $name_script === 'Arab';

    $lastPos       = strlen($name) - 1; // byte offset of the last byte
    $currPos       = 0; // byte offset of the next unread byte
    $state         = 1; // 1: start of input string, 2: before vowel, 3: other
    $result        = []; // accumulate complete 6-digit D-M codes here
    $partialResult = [['!']]; // accumulate incomplete D-M codes here ('!' stops "duplicate sound" check)

    // Loop through the input string.
    // Stop when the string is exhausted or when no more partial results remain
    while ($partialResult !== [] && $currPos <= $lastPos) {
        // Find the DM coding table entry for the chunk at the current position
        $thisEntry = self::longestTableMatch($name, $currPos);

        if ($thisEntry === '') {
            $currPos++; // Not in table: advance pointer to next byte
            continue; // and try again
        }

        $soundTableEntry = self::DM_SOUNDS[$thisEntry];
        $workingResult   = $partialResult;
        $partialResult   = [];
        $currPos += strlen($thisEntry);

        // Not at beginning of input string.
        // ($state is left above 3 by the triplet loop below, so this is true for every
        // chunk after the first recognised one.)
        if ($state !== 1) {
            // Determine whether the next chunk is a vowel
            $nextEntry = $currPos <= $lastPos ? self::longestTableMatch($name, $currPos) : '';

            if ($nextEntry !== '' && self::DM_SOUNDS[$nextEntry][0] !== '0') {
                // Next chunk is a vowel
                $state = 2;
            } else {
                // Next chunk is not a vowel, is not in the table, or does not exist
                $state = 3;
            }
        }

        while ($state < count($soundTableEntry)) {
            if ($soundTableEntry[$state] === '') {
                // empty means 'ignore this sound in this state'
                foreach ($workingResult as $workingEntry) {
                    $tempEntry = $workingEntry;
                    $tempEntry[count($tempEntry) - 1] .= '!'; // Prevent false 'doubles'
                    $partialResult[] = $tempEntry;
                }
            } else {
                foreach ($workingResult as $workingEntry) {
                    if ($soundTableEntry[$state] !== $workingEntry[count($workingEntry) - 1]) {
                        // Incoming sound isn't a duplicate of the previous sound
                        $workingEntry[] = $soundTableEntry[$state];
                    } elseif ($noVowels) {
                        // Incoming sound is a duplicate of the previous sound.
                        // For Hebrew and Arabic, the duplicate sound is kept.
                        $workingEntry[] = $soundTableEntry[$state];
                    }

                    if (count($workingEntry) < 7) {
                        $partialResult[] = $workingEntry;
                    } else {
                        // This is the 6th code in the sequence
                        // We're looking for 7 entries because the first is '!' and doesn't count
                        $tempResult = str_replace('!', '', implode('', $workingEntry));
                        // Only return codes from recognisable sounds
                        if ($tempResult !== '') {
                            $result[] = substr($tempResult . '000000', 0, 6);
                        }
                    }
                }
            }

            $state += 3; // Advance to next triplet while keeping the same basic state
        }
    }

    // Zero-fill and copy all remaining partial results
    foreach ($partialResult as $workingEntry) {
        $tempResult = str_replace('!', '', implode('', $workingEntry));
        // Only return codes from recognisable sounds
        if ($tempResult !== '') {
            $result[] = substr($tempResult . '000000', 0, 6);
        }
    }

    return $result;
}

/**
 * Find the longest key of DM_SOUNDS that starts at a given byte offset.
 *
 * @param string $name   Text to search
 * @param int    $offset Byte offset into $name; must not be beyond the end of $name
 *
 * @return string The matching key, or an empty string if no key matches at this offset
 */
private static function longestTableMatch(string $name, int $offset): string
{
    // Start with the longest possible key, and shorten it one byte at a time
    $chunk = substr($name, $offset, self::MAXCHAR);

    while ($chunk !== '' && !isset(self::DM_SOUNDS[$chunk])) {
        $chunk = substr($chunk, 0, -1);
    }

    return $chunk;
}
805
|
|
|
} |
806
|
|
|
|