|
1
|
|
|
<?php |
|
2
|
|
|
|
|
3
|
|
|
/** |
|
4
|
|
|
* webtrees: online genealogy |
|
5
|
|
|
* Copyright (C) 2021 webtrees development team |
|
6
|
|
|
* This program is free software: you can redistribute it and/or modify |
|
7
|
|
|
* it under the terms of the GNU General Public License as published by |
|
8
|
|
|
* the Free Software Foundation, either version 3 of the License, or |
|
9
|
|
|
* (at your option) any later version. |
|
10
|
|
|
* This program is distributed in the hope that it will be useful, |
|
11
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
12
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
13
|
|
|
* GNU General Public License for more details. |
|
14
|
|
|
* You should have received a copy of the GNU General Public License |
|
15
|
|
|
* along with this program. If not, see <https://www.gnu.org/licenses/>. |
|
16
|
|
|
*/ |
|
17
|
|
|
|
|
18
|
|
|
declare(strict_types=1); |
|
19
|
|
|
|
|
20
|
|
|
namespace Fisharebest\Webtrees; |
|
21
|
|
|
|
|
22
|
|
|
/** |
|
23
|
|
|
* Phonetic matching of strings. |
|
24
|
|
|
*/ |
|
25
|
|
|
class Soundex |
|
26
|
|
|
{ |
|
27
|
|
|
// Determine the Daitch–Mokotoff Soundex code for a word |
|
28
|
|
|
// Original implementation by Gerry Kroll, and analysis by Meliza Amity |
|
29
|
|
|
|
|
30
|
|
|
// Max. table key length (in ASCII bytes -- NOT in UTF-8 characters!) |
|
31
|
|
|
private const MAXCHAR = 7; |
|
32
|
|
|
|
|
33
|
|
|
/** |
|
34
|
|
|
* Name transformation arrays. |
|
35
|
|
|
* Used to transform the Name string to simplify the "sounds like" table. |
|
36
|
|
|
* This is especially useful in Hebrew. |
|
37
|
|
|
* |
|
38
|
|
|
* Each array entry defines the "from" and "to" arguments of a str_replace($from, $to, $text) |
|
39
|
|
|
* function call to achieve the desired transformations. |
|
40
|
|
|
* |
|
41
|
|
|
* Note about the use of "\x01": |
|
42
|
|
|
* This code, which can’t legitimately occur in the kind of text we're dealing with, |
|
43
|
|
|
* is used as a place-holder so that conditional string replacements can be done. |
|
44
|
|
|
*/ |
|
45
|
|
|
private const TRANSFORM_NAMES = [ |
|
46
|
|
|
// Force Yiddish ligatures to be treated as separate letters |
|
47
|
|
|
['װ', 'וו'], |
|
48
|
|
|
['ײ', 'יי'], |
|
49
|
|
|
['ױ', 'וי'], |
|
50
|
|
|
['בו', 'בע'], |
|
51
|
|
|
['פו', 'פע'], |
|
52
|
|
|
['ומ', 'עמ'], |
|
53
|
|
|
['ום', 'עם'], |
|
54
|
|
|
['ונ', 'ענ'], |
|
55
|
|
|
['ון', 'ען'], |
|
56
|
|
|
['וו', 'ב'], |
|
57
|
|
|
["\x01", ''], |
|
58
|
|
|
['ייה$', "\x01ה"], |
|
59
|
|
|
['ייע$', "\x01ע"], |
|
60
|
|
|
['יי', 'ע'], |
|
61
|
|
|
["\x01", 'יי'], |
|
62
|
|
|
]; |
|
63
|
|
|
|
|
64
|
|
|
/** |
|
65
|
|
|
* The DM sound coding table is organized this way: |
|
66
|
|
|
* key: a variable-length string that corresponds to the UTF-8 character sequence |
|
67
|
|
|
* represented by the table entry. Currently, that string can be up to 7 |
|
68
|
|
|
* bytes long. This maximum length is defined by the class constant |
|
68
|
|
|
* self::MAXCHAR. |
|
70
|
|
|
* |
|
71
|
|
|
* value: an array as follows: |
|
72
|
|
|
* [0]: zero if not a vowel |
|
73
|
|
|
* [1]: sound value when this string is at the beginning of the word |
|
74
|
|
|
* [2]: sound value when this string is followed by a vowel |
|
75
|
|
|
* [3]: sound value for other cases |
|
76
|
|
|
* [1],[2],[3] can be repeated several times to create branches in the code |
|
77
|
|
|
* an empty sound value means "ignore in this state" |
|
78
|
|
|
*/ |
|
79
|
|
|
private const DM_SOUNDS = [ |
|
80
|
|
|
'A' => ['1', '0', '', ''], |
|
81
|
|
|
'À' => ['1', '0', '', ''], |
|
82
|
|
|
'Á' => ['1', '0', '', ''], |
|
83
|
|
|
'Â' => ['1', '0', '', ''], |
|
84
|
|
|
'Ã' => ['1', '0', '', ''], |
|
85
|
|
|
'Ä' => ['1', '0', '1', '', '0', '', ''], |
|
86
|
|
|
'Å' => ['1', '0', '', ''], |
|
87
|
|
|
'Ă' => ['1', '0', '', ''], |
|
88
|
|
|
'Ą' => ['1', '', '', '', '', '', '6'], |
|
89
|
|
|
'Ạ' => ['1', '0', '', ''], |
|
90
|
|
|
'Ả' => ['1', '0', '', ''], |
|
91
|
|
|
'Ấ' => ['1', '0', '', ''], |
|
92
|
|
|
'Ầ' => ['1', '0', '', ''], |
|
93
|
|
|
'Ẩ' => ['1', '0', '', ''], |
|
94
|
|
|
'Ẫ' => ['1', '0', '', ''], |
|
95
|
|
|
'Ậ' => ['1', '0', '', ''], |
|
96
|
|
|
'Ắ' => ['1', '0', '', ''], |
|
97
|
|
|
'Ằ' => ['1', '0', '', ''], |
|
98
|
|
|
'Ẳ' => ['1', '0', '', ''], |
|
99
|
|
|
'Ẵ' => ['1', '0', '', ''], |
|
100
|
|
|
'Ặ' => ['1', '0', '', ''], |
|
101
|
|
|
'AE' => ['1', '0', '1', ''], |
|
102
|
|
|
'Æ' => ['1', '0', '1', ''], |
|
103
|
|
|
'AI' => ['1', '0', '1', ''], |
|
104
|
|
|
'AJ' => ['1', '0', '1', ''], |
|
105
|
|
|
'AU' => ['1', '0', '7', ''], |
|
106
|
|
|
'AV' => ['1', '0', '7', '', '7', '7', '7'], |
|
107
|
|
|
'ÄU' => ['1', '0', '1', ''], |
|
108
|
|
|
'AY' => ['1', '0', '1', ''], |
|
109
|
|
|
'B' => ['0', '7', '7', '7'], |
|
110
|
|
|
'C' => ['0', '5', '5', '5', '34', '4', '4'], |
|
111
|
|
|
'Ć' => ['0', '4', '4', '4'], |
|
112
|
|
|
'Č' => ['0', '4', '4', '4'], |
|
113
|
|
|
'Ç' => ['0', '4', '4', '4'], |
|
114
|
|
|
'CH' => ['0', '5', '5', '5', '34', '4', '4'], |
|
115
|
|
|
'CHS' => ['0', '5', '54', '54'], |
|
116
|
|
|
'CK' => ['0', '5', '5', '5', '45', '45', '45'], |
|
117
|
|
|
'CCS' => ['0', '4', '4', '4'], |
|
118
|
|
|
'CS' => ['0', '4', '4', '4'], |
|
119
|
|
|
'CSZ' => ['0', '4', '4', '4'], |
|
120
|
|
|
'CZ' => ['0', '4', '4', '4'], |
|
121
|
|
|
'CZS' => ['0', '4', '4', '4'], |
|
122
|
|
|
'D' => ['0', '3', '3', '3'], |
|
123
|
|
|
'Ď' => ['0', '3', '3', '3'], |
|
124
|
|
|
'Đ' => ['0', '3', '3', '3'], |
|
125
|
|
|
'DRS' => ['0', '4', '4', '4'], |
|
126
|
|
|
'DRZ' => ['0', '4', '4', '4'], |
|
127
|
|
|
'DS' => ['0', '4', '4', '4'], |
|
128
|
|
|
'DSH' => ['0', '4', '4', '4'], |
|
129
|
|
|
'DSZ' => ['0', '4', '4', '4'], |
|
130
|
|
|
'DT' => ['0', '3', '3', '3'], |
|
131
|
|
|
'DDZ' => ['0', '4', '4', '4'], |
|
132
|
|
|
'DDZS' => ['0', '4', '4', '4'], |
|
133
|
|
|
'DZ' => ['0', '4', '4', '4'], |
|
134
|
|
|
'DŹ' => ['0', '4', '4', '4'], |
|
135
|
|
|
'DŻ' => ['0', '4', '4', '4'], |
|
136
|
|
|
'DZH' => ['0', '4', '4', '4'], |
|
137
|
|
|
'DZS' => ['0', '4', '4', '4'], |
|
138
|
|
|
'E' => ['1', '0', '', ''], |
|
139
|
|
|
'È' => ['1', '0', '', ''], |
|
140
|
|
|
'É' => ['1', '0', '', ''], |
|
141
|
|
|
'Ê' => ['1', '0', '', ''], |
|
142
|
|
|
'Ë' => ['1', '0', '', ''], |
|
143
|
|
|
'Ĕ' => ['1', '0', '', ''], |
|
144
|
|
|
'Ė' => ['1', '0', '', ''], |
|
145
|
|
|
'Ę' => ['1', '', '', '6', '', '', ''], |
|
146
|
|
|
'Ẹ' => ['1', '0', '', ''], |
|
147
|
|
|
'Ẻ' => ['1', '0', '', ''], |
|
148
|
|
|
'Ẽ' => ['1', '0', '', ''], |
|
149
|
|
|
'Ế' => ['1', '0', '', ''], |
|
150
|
|
|
'Ề' => ['1', '0', '', ''], |
|
151
|
|
|
'Ể' => ['1', '0', '', ''], |
|
152
|
|
|
'Ễ' => ['1', '0', '', ''], |
|
153
|
|
|
'Ệ' => ['1', '0', '', ''], |
|
154
|
|
|
'EAU' => ['1', '0', '', ''], |
|
155
|
|
|
'EI' => ['1', '0', '1', ''], |
|
156
|
|
|
'EJ' => ['1', '0', '1', ''], |
|
157
|
|
|
'EU' => ['1', '1', '1', ''], |
|
158
|
|
|
'EY' => ['1', '0', '1', ''], |
|
159
|
|
|
'F' => ['0', '7', '7', '7'], |
|
160
|
|
|
'FB' => ['0', '7', '7', '7'], |
|
161
|
|
|
'G' => ['0', '5', '5', '5', '34', '4', '4'], |
|
162
|
|
|
'Ğ' => ['0', '', '', ''], |
|
163
|
|
|
'GGY' => ['0', '5', '5', '5'], |
|
164
|
|
|
'GY' => ['0', '5', '5', '5'], |
|
165
|
|
|
'H' => ['0', '5', '5', '', '5', '5', '5'], |
|
166
|
|
|
'I' => ['1', '0', '', ''], |
|
167
|
|
|
'Ì' => ['1', '0', '', ''], |
|
168
|
|
|
'Í' => ['1', '0', '', ''], |
|
169
|
|
|
'Î' => ['1', '0', '', ''], |
|
170
|
|
|
'Ï' => ['1', '0', '', ''], |
|
171
|
|
|
'Ĩ' => ['1', '0', '', ''], |
|
172
|
|
|
'Į' => ['1', '0', '', ''], |
|
173
|
|
|
'İ' => ['1', '0', '', ''], |
|
174
|
|
|
'Ỉ' => ['1', '0', '', ''], |
|
175
|
|
|
'Ị' => ['1', '0', '', ''], |
|
176
|
|
|
'IA' => ['1', '1', '', ''], |
|
177
|
|
|
'IE' => ['1', '1', '', ''], |
|
178
|
|
|
'IO' => ['1', '1', '', ''], |
|
179
|
|
|
'IU' => ['1', '1', '', ''], |
|
180
|
|
|
'J' => ['0', '1', '', '', '4', '4', '4', '5', '5', ''], |
|
181
|
|
|
'K' => ['0', '5', '5', '5'], |
|
182
|
|
|
'KH' => ['0', '5', '5', '5'], |
|
183
|
|
|
'KS' => ['0', '5', '54', '54'], |
|
184
|
|
|
'L' => ['0', '8', '8', '8'], |
|
185
|
|
|
'Ľ' => ['0', '8', '8', '8'], |
|
186
|
|
|
'Ĺ' => ['0', '8', '8', '8'], |
|
187
|
|
|
'Ł' => ['0', '7', '7', '7', '8', '8', '8'], |
|
188
|
|
|
'LL' => ['0', '8', '8', '8', '58', '8', '8', '1', '8', '8'], |
|
189
|
|
|
'LLY' => ['0', '8', '8', '8', '1', '8', '8'], |
|
190
|
|
|
'LY' => ['0', '8', '8', '8', '1', '8', '8'], |
|
191
|
|
|
'M' => ['0', '6', '6', '6'], |
|
192
|
|
|
'MĔ' => ['0', '66', '66', '66'], |
|
193
|
|
|
'MN' => ['0', '66', '66', '66'], |
|
194
|
|
|
'N' => ['0', '6', '6', '6'], |
|
195
|
|
|
'Ń' => ['0', '6', '6', '6'], |
|
196
|
|
|
'Ň' => ['0', '6', '6', '6'], |
|
197
|
|
|
'Ñ' => ['0', '6', '6', '6'], |
|
198
|
|
|
'NM' => ['0', '66', '66', '66'], |
|
199
|
|
|
'O' => ['1', '0', '', ''], |
|
200
|
|
|
'Ò' => ['1', '0', '', ''], |
|
201
|
|
|
'Ó' => ['1', '0', '', ''], |
|
202
|
|
|
'Ô' => ['1', '0', '', ''], |
|
203
|
|
|
'Õ' => ['1', '0', '', ''], |
|
204
|
|
|
'Ö' => ['1', '0', '', ''], |
|
205
|
|
|
'Ø' => ['1', '0', '', ''], |
|
206
|
|
|
'Ő' => ['1', '0', '', ''], |
|
207
|
|
|
'Œ' => ['1', '0', '', ''], |
|
208
|
|
|
'Ơ' => ['1', '0', '', ''], |
|
209
|
|
|
'Ọ' => ['1', '0', '', ''], |
|
210
|
|
|
'Ỏ' => ['1', '0', '', ''], |
|
211
|
|
|
'Ố' => ['1', '0', '', ''], |
|
212
|
|
|
'Ồ' => ['1', '0', '', ''], |
|
213
|
|
|
'Ổ' => ['1', '0', '', ''], |
|
214
|
|
|
'Ỗ' => ['1', '0', '', ''], |
|
215
|
|
|
'Ộ' => ['1', '0', '', ''], |
|
216
|
|
|
'Ớ' => ['1', '0', '', ''], |
|
217
|
|
|
'Ờ' => ['1', '0', '', ''], |
|
218
|
|
|
'Ở' => ['1', '0', '', ''], |
|
219
|
|
|
'Ỡ' => ['1', '0', '', ''], |
|
220
|
|
|
'Ợ' => ['1', '0', '', ''], |
|
221
|
|
|
'OE' => ['1', '0', '', ''], |
|
222
|
|
|
'OI' => ['1', '0', '1', ''], |
|
223
|
|
|
'OJ' => ['1', '0', '1', ''], |
|
224
|
|
|
'OU' => ['1', '0', '', ''], |
|
225
|
|
|
'OY' => ['1', '0', '1', ''], |
|
226
|
|
|
'P' => ['0', '7', '7', '7'], |
|
227
|
|
|
'PF' => ['0', '7', '7', '7'], |
|
228
|
|
|
'PH' => ['0', '7', '7', '7'], |
|
229
|
|
|
'Q' => ['0', '5', '5', '5'], |
|
230
|
|
|
'R' => ['0', '9', '9', '9'], |
|
231
|
|
|
'Ř' => ['0', '4', '4', '4'], |
|
232
|
|
|
'RS' => ['0', '4', '4', '4', '94', '94', '94'], |
|
233
|
|
|
'RZ' => ['0', '4', '4', '4', '94', '94', '94'], |
|
234
|
|
|
'S' => ['0', '4', '4', '4'], |
|
235
|
|
|
'Ś' => ['0', '4', '4', '4'], |
|
236
|
|
|
'Š' => ['0', '4', '4', '4'], |
|
237
|
|
|
'Ş' => ['0', '4', '4', '4'], |
|
238
|
|
|
'SC' => ['0', '2', '4', '4'], |
|
239
|
|
|
'ŠČ' => ['0', '2', '4', '4'], |
|
240
|
|
|
'SCH' => ['0', '4', '4', '4'], |
|
241
|
|
|
'SCHD' => ['0', '2', '43', '43'], |
|
242
|
|
|
'SCHT' => ['0', '2', '43', '43'], |
|
243
|
|
|
'SCHTCH' => ['0', '2', '4', '4'], |
|
244
|
|
|
'SCHTSCH' => ['0', '2', '4', '4'], |
|
245
|
|
|
'SCHTSH' => ['0', '2', '4', '4'], |
|
246
|
|
|
'SD' => ['0', '2', '43', '43'], |
|
247
|
|
|
'SH' => ['0', '4', '4', '4'], |
|
248
|
|
|
'SHCH' => ['0', '2', '4', '4'], |
|
249
|
|
|
'SHD' => ['0', '2', '43', '43'], |
|
250
|
|
|
'SHT' => ['0', '2', '43', '43'], |
|
251
|
|
|
'SHTCH' => ['0', '2', '4', '4'], |
|
252
|
|
|
'SHTSH' => ['0', '2', '4', '4'], |
|
253
|
|
|
'ß' => ['0', '', '4', '4'], |
|
254
|
|
|
'ST' => ['0', '2', '43', '43'], |
|
255
|
|
|
'STCH' => ['0', '2', '4', '4'], |
|
256
|
|
|
'STRS' => ['0', '2', '4', '4'], |
|
257
|
|
|
'STRZ' => ['0', '2', '4', '4'], |
|
258
|
|
|
'STSCH' => ['0', '2', '4', '4'], |
|
259
|
|
|
'STSH' => ['0', '2', '4', '4'], |
|
260
|
|
|
'SSZ' => ['0', '4', '4', '4'], |
|
261
|
|
|
'SZ' => ['0', '4', '4', '4'], |
|
262
|
|
|
'SZCS' => ['0', '2', '4', '4'], |
|
263
|
|
|
'SZCZ' => ['0', '2', '4', '4'], |
|
264
|
|
|
'SZD' => ['0', '2', '43', '43'], |
|
265
|
|
|
'SZT' => ['0', '2', '43', '43'], |
|
266
|
|
|
'T' => ['0', '3', '3', '3'], |
|
267
|
|
|
'Ť' => ['0', '3', '3', '3'], |
|
268
|
|
|
'Ţ' => ['0', '3', '3', '3', '4', '4', '4'], |
|
269
|
|
|
'TC' => ['0', '4', '4', '4'], |
|
270
|
|
|
'TCH' => ['0', '4', '4', '4'], |
|
271
|
|
|
'TH' => ['0', '3', '3', '3'], |
|
272
|
|
|
'TRS' => ['0', '4', '4', '4'], |
|
273
|
|
|
'TRZ' => ['0', '4', '4', '4'], |
|
274
|
|
|
'TS' => ['0', '4', '4', '4'], |
|
275
|
|
|
'TSCH' => ['0', '4', '4', '4'], |
|
276
|
|
|
'TSH' => ['0', '4', '4', '4'], |
|
277
|
|
|
'TSZ' => ['0', '4', '4', '4'], |
|
278
|
|
|
'TTCH' => ['0', '4', '4', '4'], |
|
279
|
|
|
'TTS' => ['0', '4', '4', '4'], |
|
280
|
|
|
'TTSCH' => ['0', '4', '4', '4'], |
|
281
|
|
|
'TTSZ' => ['0', '4', '4', '4'], |
|
282
|
|
|
'TTZ' => ['0', '4', '4', '4'], |
|
283
|
|
|
'TZ' => ['0', '4', '4', '4'], |
|
284
|
|
|
'TZS' => ['0', '4', '4', '4'], |
|
285
|
|
|
'U' => ['1', '0', '', ''], |
|
286
|
|
|
'Ù' => ['1', '0', '', ''], |
|
287
|
|
|
'Ú' => ['1', '0', '', ''], |
|
288
|
|
|
'Û' => ['1', '0', '', ''], |
|
289
|
|
|
'Ü' => ['1', '0', '', ''], |
|
290
|
|
|
'Ũ' => ['1', '0', '', ''], |
|
291
|
|
|
'Ū' => ['1', '0', '', ''], |
|
292
|
|
|
'Ů' => ['1', '0', '', ''], |
|
293
|
|
|
'Ű' => ['1', '0', '', ''], |
|
294
|
|
|
'Ų' => ['1', '0', '', ''], |
|
295
|
|
|
'Ư' => ['1', '0', '', ''], |
|
296
|
|
|
'Ụ' => ['1', '0', '', ''], |
|
297
|
|
|
'Ủ' => ['1', '0', '', ''], |
|
298
|
|
|
'Ứ' => ['1', '0', '', ''], |
|
299
|
|
|
'Ừ' => ['1', '0', '', ''], |
|
300
|
|
|
'Ử' => ['1', '0', '', ''], |
|
301
|
|
|
'Ữ' => ['1', '0', '', ''], |
|
302
|
|
|
'Ự' => ['1', '0', '', ''], |
|
303
|
|
|
'UE' => ['1', '0', '', ''], |
|
304
|
|
|
'UI' => ['1', '0', '1', ''], |
|
305
|
|
|
'UJ' => ['1', '0', '1', ''], |
|
306
|
|
|
'UY' => ['1', '0', '1', ''], |
|
307
|
|
|
'UW' => ['1', '0', '1', '', '0', '7', '7'], |
|
308
|
|
|
'V' => ['0', '7', '7', '7'], |
|
309
|
|
|
'W' => ['0', '7', '7', '7'], |
|
310
|
|
|
'X' => ['0', '5', '54', '54'], |
|
311
|
|
|
'Y' => ['1', '1', '', ''], |
|
312
|
|
|
'Ý' => ['1', '1', '', ''], |
|
313
|
|
|
'Ỳ' => ['1', '1', '', ''], |
|
314
|
|
|
'Ỵ' => ['1', '1', '', ''], |
|
315
|
|
|
'Ỷ' => ['1', '1', '', ''], |
|
316
|
|
|
'Ỹ' => ['1', '1', '', ''], |
|
317
|
|
|
'Z' => ['0', '4', '4', '4'], |
|
318
|
|
|
'Ź' => ['0', '4', '4', '4'], |
|
319
|
|
|
'Ż' => ['0', '4', '4', '4'], |
|
320
|
|
|
'Ž' => ['0', '4', '4', '4'], |
|
321
|
|
|
'ZD' => ['0', '2', '43', '43'], |
|
322
|
|
|
'ZDZ' => ['0', '2', '4', '4'], |
|
323
|
|
|
'ZDZH' => ['0', '2', '4', '4'], |
|
324
|
|
|
'ZH' => ['0', '4', '4', '4'], |
|
325
|
|
|
'ZHD' => ['0', '2', '43', '43'], |
|
326
|
|
|
'ZHDZH' => ['0', '2', '4', '4'], |
|
327
|
|
|
'ZS' => ['0', '4', '4', '4'], |
|
328
|
|
|
'ZSCH' => ['0', '4', '4', '4'], |
|
329
|
|
|
'ZSH' => ['0', '4', '4', '4'], |
|
330
|
|
|
'ZZS' => ['0', '4', '4', '4'], |
|
331
|
|
|
// Cyrillic alphabet |
|
332
|
|
|
'А' => ['1', '0', '', ''], |
|
333
|
|
|
'Б' => ['0', '7', '7', '7'], |
|
334
|
|
|
'В' => ['0', '7', '7', '7'], |
|
335
|
|
|
'Г' => ['0', '5', '5', '5'], |
|
336
|
|
|
'Д' => ['0', '3', '3', '3'], |
|
337
|
|
|
'ДЗ' => ['0', '4', '4', '4'], |
|
338
|
|
|
'Е' => ['1', '0', '', ''], |
|
339
|
|
|
'Ё' => ['1', '0', '', ''], |
|
340
|
|
|
'Ж' => ['0', '4', '4', '4'], |
|
341
|
|
|
'З' => ['0', '4', '4', '4'], |
|
342
|
|
|
'И' => ['1', '0', '', ''], |
|
343
|
|
|
'Й' => ['1', '1', '', '', '4', '4', '4'], |
|
344
|
|
|
'К' => ['0', '5', '5', '5'], |
|
345
|
|
|
'Л' => ['0', '8', '8', '8'], |
|
346
|
|
|
'М' => ['0', '6', '6', '6'], |
|
347
|
|
|
'Н' => ['0', '6', '6', '6'], |
|
348
|
|
|
'О' => ['1', '0', '', ''], |
|
349
|
|
|
'П' => ['0', '7', '7', '7'], |
|
350
|
|
|
'Р' => ['0', '9', '9', '9'], |
|
351
|
|
|
'РЖ' => ['0', '4', '4', '4'], |
|
352
|
|
|
'С' => ['0', '4', '4', '4'], |
|
353
|
|
|
'Т' => ['0', '3', '3', '3'], |
|
354
|
|
|
'У' => ['1', '0', '', ''], |
|
355
|
|
|
'Ф' => ['0', '7', '7', '7'], |
|
356
|
|
|
'Х' => ['0', '5', '5', '5'], |
|
357
|
|
|
'Ц' => ['0', '4', '4', '4'], |
|
358
|
|
|
'Ч' => ['0', '4', '4', '4'], |
|
359
|
|
|
'Ш' => ['0', '4', '4', '4'], |
|
360
|
|
|
'Щ' => ['0', '2', '4', '4'], |
|
361
|
|
|
'Ъ' => ['0', '', '', ''], |
|
362
|
|
|
'Ы' => ['0', '1', '', ''], |
|
363
|
|
|
'Ь' => ['0', '', '', ''], |
|
364
|
|
|
'Э' => ['1', '0', '', ''], |
|
365
|
|
|
'Ю' => ['0', '1', '', ''], |
|
366
|
|
|
'Я' => ['0', '1', '', ''], |
|
367
|
|
|
// Greek alphabet |
|
368
|
|
|
'Α' => ['1', '0', '', ''], |
|
369
|
|
|
'Ά' => ['1', '0', '', ''], |
|
370
|
|
|
'ΑΙ' => ['1', '0', '1', ''], |
|
371
|
|
|
'ΑΥ' => ['1', '0', '1', ''], |
|
372
|
|
|
'Β' => ['0', '7', '7', '7'], |
|
373
|
|
|
'Γ' => ['0', '5', '5', '5'], |
|
374
|
|
|
'Δ' => ['0', '3', '3', '3'], |
|
375
|
|
|
'Ε' => ['1', '0', '', ''], |
|
376
|
|
|
'Έ' => ['1', '0', '', ''], |
|
377
|
|
|
'ΕΙ' => ['1', '0', '1', ''], |
|
378
|
|
|
'ΕΥ' => ['1', '1', '1', ''], |
|
379
|
|
|
'Ζ' => ['0', '4', '4', '4'], |
|
380
|
|
|
'Η' => ['1', '0', '', ''], |
|
381
|
|
|
'Ή' => ['1', '0', '', ''], |
|
382
|
|
|
'Θ' => ['0', '3', '3', '3'], |
|
383
|
|
|
'Ι' => ['1', '0', '', ''], |
|
384
|
|
|
'Ί' => ['1', '0', '', ''], |
|
385
|
|
|
'Ϊ' => ['1', '0', '', ''], |
|
386
|
|
|
'ΐ' => ['1', '0', '', ''], |
|
387
|
|
|
'Κ' => ['0', '5', '5', '5'], |
|
388
|
|
|
'Λ' => ['0', '8', '8', '8'], |
|
389
|
|
|
'Μ' => ['0', '6', '6', '6'], |
|
390
|
|
|
'ΜΠ' => ['0', '7', '7', '7'], |
|
391
|
|
|
'Ν' => ['0', '6', '6', '6'], |
|
392
|
|
|
'ΝΤ' => ['0', '3', '3', '3'], |
|
393
|
|
|
'Ξ' => ['0', '5', '54', '54'], |
|
394
|
|
|
'Ο' => ['1', '0', '', ''], |
|
395
|
|
|
'Ό' => ['1', '0', '', ''], |
|
396
|
|
|
'ΟΙ' => ['1', '0', '1', ''], |
|
397
|
|
|
'ΟΥ' => ['1', '0', '1', ''], |
|
398
|
|
|
'Π' => ['0', '7', '7', '7'], |
|
399
|
|
|
'Ρ' => ['0', '9', '9', '9'], |
|
400
|
|
|
'Σ' => ['0', '4', '4', '4'], |
|
401
|
|
|
'ς' => ['0', '', '', '4'], |
|
402
|
|
|
'Τ' => ['0', '3', '3', '3'], |
|
403
|
|
|
'ΤΖ' => ['0', '4', '4', '4'], |
|
404
|
|
|
'ΤΣ' => ['0', '4', '4', '4'], |
|
405
|
|
|
'Υ' => ['1', '1', '', ''], |
|
406
|
|
|
'Ύ' => ['1', '1', '', ''], |
|
407
|
|
|
'Ϋ' => ['1', '1', '', ''], |
|
408
|
|
|
'ΰ' => ['1', '1', '', ''], |
|
409
|
|
|
'ΥΚ' => ['1', '5', '5', '5'], |
|
410
|
|
|
'ΥΥ' => ['1', '65', '65', '65'], |
|
411
|
|
|
'Φ' => ['0', '7', '7', '7'], |
|
412
|
|
|
'Χ' => ['0', '5', '5', '5'], |
|
413
|
|
|
'Ψ' => ['0', '7', '7', '7'], |
|
414
|
|
|
'Ω' => ['1', '0', '', ''], |
|
415
|
|
|
'Ώ' => ['1', '0', '', ''], |
|
416
|
|
|
// Hebrew alphabet |
|
417
|
|
|
'א' => ['1', '0', '', ''], |
|
418
|
|
|
'או' => ['1', '0', '7', ''], |
|
419
|
|
|
'אג' => ['1', '4', '4', '4', '5', '5', '5', '34', '34', '34'], |
|
420
|
|
|
'בב' => ['0', '7', '7', '7', '77', '77', '77'], |
|
421
|
|
|
'ב' => ['0', '7', '7', '7'], |
|
422
|
|
|
'גג' => ['0', '4', '4', '4', '5', '5', '5', '45', '45', '45', '55', '55', '55', '54', '54', '54'], |
|
423
|
|
|
'גד' => ['0', '43', '43', '43', '53', '53', '53'], |
|
424
|
|
|
'גה' => ['0', '45', '45', '45', '55', '55', '55'], |
|
425
|
|
|
'גז' => ['0', '44', '44', '44', '45', '45', '45'], |
|
426
|
|
|
'גח' => ['0', '45', '45', '45', '55', '55', '55'], |
|
427
|
|
|
'גכ' => ['0', '45', '45', '45', '55', '55', '55'], |
|
428
|
|
|
'גך' => ['0', '45', '45', '45', '55', '55', '55'], |
|
429
|
|
|
'גצ' => ['0', '44', '44', '44', '45', '45', '45'], |
|
430
|
|
|
'גץ' => ['0', '44', '44', '44', '45', '45', '45'], |
|
431
|
|
|
'גק' => ['0', '45', '45', '45', '54', '54', '54'], |
|
432
|
|
|
'גש' => ['0', '44', '44', '44', '54', '54', '54'], |
|
433
|
|
|
'גת' => ['0', '43', '43', '43', '53', '53', '53'], |
|
434
|
|
|
'ג' => ['0', '4', '4', '4', '5', '5', '5'], |
|
435
|
|
|
'דז' => ['0', '4', '4', '4'], |
|
436
|
|
|
'דד' => ['0', '3', '3', '3', '33', '33', '33'], |
|
437
|
|
|
'דט' => ['0', '33', '33', '33'], |
|
438
|
|
|
'דש' => ['0', '4', '4', '4'], |
|
439
|
|
|
'דצ' => ['0', '4', '4', '4'], |
|
440
|
|
|
'דץ' => ['0', '4', '4', '4'], |
|
441
|
|
|
'ד' => ['0', '3', '3', '3'], |
|
442
|
|
|
'הג' => ['0', '54', '54', '54', '55', '55', '55'], |
|
443
|
|
|
'הכ' => ['0', '55', '55', '55'], |
|
444
|
|
|
'הח' => ['0', '55', '55', '55'], |
|
445
|
|
|
'הק' => ['0', '55', '55', '55', '5', '5', '5'], |
|
446
|
|
|
'הה' => ['0', '5', '5', '', '55', '55', ''], |
|
447
|
|
|
'ה' => ['0', '5', '5', ''], |
|
448
|
|
|
'וי' => ['1', '', '', '', '7', '7', '7'], |
|
449
|
|
|
'ו' => ['1', '7', '7', '7', '7', '', ''], |
|
450
|
|
|
'וו' => ['1', '7', '7', '7', '7', '', ''], |
|
451
|
|
|
'וופ' => ['1', '7', '7', '7', '77', '77', '77'], |
|
452
|
|
|
'זש' => ['0', '4', '4', '4', '44', '44', '44'], |
|
453
|
|
|
'זדז' => ['0', '2', '4', '4'], |
|
454
|
|
|
'ז' => ['0', '4', '4', '4'], |
|
455
|
|
|
'זג' => ['0', '44', '44', '44', '45', '45', '45'], |
|
456
|
|
|
'זז' => ['0', '4', '4', '4', '44', '44', '44'], |
|
457
|
|
|
'זס' => ['0', '44', '44', '44'], |
|
458
|
|
|
'זצ' => ['0', '44', '44', '44'], |
|
459
|
|
|
'זץ' => ['0', '44', '44', '44'], |
|
460
|
|
|
'חג' => ['0', '54', '54', '54', '53', '53', '53'], |
|
461
|
|
|
'חח' => ['0', '5', '5', '5', '55', '55', '55'], |
|
462
|
|
|
'חק' => ['0', '55', '55', '55', '5', '5', '5'], |
|
463
|
|
|
'חכ' => ['0', '45', '45', '45', '55', '55', '55'], |
|
464
|
|
|
'חס' => ['0', '5', '54', '54'], |
|
465
|
|
|
'חש' => ['0', '5', '54', '54'], |
|
466
|
|
|
'ח' => ['0', '5', '5', '5'], |
|
467
|
|
|
'טש' => ['0', '4', '4', '4'], |
|
468
|
|
|
'טד' => ['0', '33', '33', '33'], |
|
469
|
|
|
'טי' => ['0', '3', '3', '3', '4', '4', '4', '3', '3', '34'], |
|
470
|
|
|
'טת' => ['0', '33', '33', '33'], |
|
471
|
|
|
'טט' => ['0', '3', '3', '3', '33', '33', '33'], |
|
472
|
|
|
'ט' => ['0', '3', '3', '3'], |
|
473
|
|
|
'י' => ['1', '1', '', ''], |
|
474
|
|
|
'יא' => ['1', '1', '', '', '1', '1', '1'], |
|
475
|
|
|
'כג' => ['0', '55', '55', '55', '54', '54', '54'], |
|
476
|
|
|
'כש' => ['0', '5', '54', '54'], |
|
477
|
|
|
'כס' => ['0', '5', '54', '54'], |
|
478
|
|
|
'ככ' => ['0', '5', '5', '5', '55', '55', '55'], |
|
479
|
|
|
'כך' => ['0', '5', '5', '5', '55', '55', '55'], |
|
480
|
|
|
'כ' => ['0', '5', '5', '5'], |
|
481
|
|
|
'כח' => ['0', '55', '55', '55', '5', '5', '5'], |
|
482
|
|
|
'ך' => ['0', '', '5', '5'], |
|
483
|
|
|
'ל' => ['0', '8', '8', '8'], |
|
484
|
|
|
'לל' => ['0', '88', '88', '88', '8', '8', '8'], |
|
485
|
|
|
'מנ' => ['0', '66', '66', '66'], |
|
486
|
|
|
'מן' => ['0', '66', '66', '66'], |
|
487
|
|
|
'ממ' => ['0', '6', '6', '6', '66', '66', '66'], |
|
488
|
|
|
'מם' => ['0', '6', '6', '6', '66', '66', '66'], |
|
489
|
|
|
'מ' => ['0', '6', '6', '6'], |
|
490
|
|
|
'ם' => ['0', '', '6', '6'], |
|
491
|
|
|
'נמ' => ['0', '66', '66', '66'], |
|
492
|
|
|
'נם' => ['0', '66', '66', '66'], |
|
493
|
|
|
'ננ' => ['0', '6', '6', '6', '66', '66', '66'], |
|
494
|
|
|
'נן' => ['0', '6', '6', '6', '66', '66', '66'], |
|
495
|
|
|
'נ' => ['0', '6', '6', '6'], |
|
496
|
|
|
'ן' => ['0', '', '6', '6'], |
|
497
|
|
|
'סתש' => ['0', '2', '4', '4'], |
|
498
|
|
|
'סתז' => ['0', '2', '4', '4'], |
|
499
|
|
|
'סטז' => ['0', '2', '4', '4'], |
|
500
|
|
|
'סטש' => ['0', '2', '4', '4'], |
|
501
|
|
|
'סצד' => ['0', '2', '4', '4'], |
|
502
|
|
|
'סט' => ['0', '2', '4', '4', '43', '43', '43'], |
|
503
|
|
|
'סת' => ['0', '2', '4', '4', '43', '43', '43'], |
|
504
|
|
|
'סג' => ['0', '44', '44', '44', '4', '4', '4'], |
|
505
|
|
|
'סס' => ['0', '4', '4', '4', '44', '44', '44'], |
|
506
|
|
|
'סצ' => ['0', '44', '44', '44'], |
|
507
|
|
|
'סץ' => ['0', '44', '44', '44'], |
|
508
|
|
|
'סז' => ['0', '44', '44', '44'], |
|
509
|
|
|
'סש' => ['0', '44', '44', '44'], |
|
510
|
|
|
'ס' => ['0', '4', '4', '4'], |
|
511
|
|
|
'ע' => ['1', '0', '', ''], |
|
512
|
|
|
'פב' => ['0', '7', '7', '7', '77', '77', '77'], |
|
513
|
|
|
'פוו' => ['0', '7', '7', '7', '77', '77', '77'], |
|
514
|
|
|
'פפ' => ['0', '7', '7', '7', '77', '77', '77'], |
|
515
|
|
|
'פף' => ['0', '7', '7', '7', '77', '77', '77'], |
|
516
|
|
|
'פ' => ['0', '7', '7', '7'], |
|
517
|
|
|
'ף' => ['0', '', '7', '7'], |
|
518
|
|
|
'צג' => ['0', '44', '44', '44', '45', '45', '45'], |
|
519
|
|
|
'צז' => ['0', '44', '44', '44'], |
|
520
|
|
|
'צס' => ['0', '44', '44', '44'], |
|
521
|
|
|
'צצ' => ['0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54', '45', '45', '45'], |
|
522
|
|
|
'צץ' => ['0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54'], |
|
523
|
|
|
'צש' => ['0', '44', '44', '44', '4', '4', '4', '5', '5', '5'], |
|
524
|
|
|
'צ' => ['0', '4', '4', '4', '5', '5', '5'], |
|
525
|
|
|
'ץ' => ['0', '', '4', '4'], |
|
526
|
|
|
'קה' => ['0', '55', '55', '5'], |
|
527
|
|
|
'קס' => ['0', '5', '54', '54'], |
|
528
|
|
|
'קש' => ['0', '5', '54', '54'], |
|
529
|
|
|
'קק' => ['0', '5', '5', '5', '55', '55', '55'], |
|
530
|
|
|
'קח' => ['0', '55', '55', '55'], |
|
531
|
|
|
'קכ' => ['0', '55', '55', '55'], |
|
532
|
|
|
'קך' => ['0', '55', '55', '55'], |
|
533
|
|
|
'קג' => ['0', '55', '55', '55', '54', '54', '54'], |
|
534
|
|
|
'ק' => ['0', '5', '5', '5'], |
|
535
|
|
|
'רר' => ['0', '99', '99', '99', '9', '9', '9'], |
|
536
|
|
|
'ר' => ['0', '9', '9', '9'], |
|
537
|
|
|
'שטז' => ['0', '2', '4', '4'], |
|
538
|
|
|
'שתש' => ['0', '2', '4', '4'], |
|
539
|
|
|
'שתז' => ['0', '2', '4', '4'], |
|
540
|
|
|
'שטש' => ['0', '2', '4', '4'], |
|
541
|
|
|
'שד' => ['0', '2', '43', '43'], |
|
542
|
|
|
'שז' => ['0', '44', '44', '44'], |
|
543
|
|
|
'שס' => ['0', '44', '44', '44'], |
|
544
|
|
|
'שת' => ['0', '2', '43', '43'], |
|
545
|
|
|
'שג' => ['0', '4', '4', '4', '44', '44', '44', '4', '43', '43'], |
|
546
|
|
|
'שט' => ['0', '2', '43', '43', '44', '44', '44'], |
|
547
|
|
|
'שצ' => ['0', '44', '44', '44', '45', '45', '45'], |
|
548
|
|
|
'שץ' => ['0', '44', '', '44', '45', '', '45'], |
|
549
|
|
|
'שש' => ['0', '4', '4', '4', '44', '44', '44'], |
|
550
|
|
|
'ש' => ['0', '4', '4', '4'], |
|
551
|
|
|
'תג' => ['0', '34', '34', '34'], |
|
552
|
|
|
'תז' => ['0', '34', '34', '34'], |
|
553
|
|
|
'תש' => ['0', '4', '4', '4'], |
|
554
|
|
|
'תת' => ['0', '3', '3', '3', '4', '4', '4', '33', '33', '33', '44', '44', '44', '34', '34', '34', '43', '43', '43'], |
|
555
|
|
|
'ת' => ['0', '3', '3', '3', '4', '4', '4'], |
|
556
|
|
|
// Arabic alphabet |
|
557
|
|
|
'ا' => ['1', '0', '', ''], |
|
558
|
|
|
'ب' => ['0', '7', '7', '7'], |
|
559
|
|
|
'ت' => ['0', '3', '3', '3'], |
|
560
|
|
|
'ث' => ['0', '3', '3', '3'], |
|
561
|
|
|
'ج' => ['0', '4', '4', '4'], |
|
562
|
|
|
'ح' => ['0', '5', '5', '5'], |
|
563
|
|
|
'خ' => ['0', '5', '5', '5'], |
|
564
|
|
|
'د' => ['0', '3', '3', '3'], |
|
565
|
|
|
'ذ' => ['0', '3', '3', '3'], |
|
566
|
|
|
'ر' => ['0', '9', '9', '9'], |
|
567
|
|
|
'ز' => ['0', '4', '4', '4'], |
|
568
|
|
|
'س' => ['0', '4', '4', '4'], |
|
569
|
|
|
'ش' => ['0', '4', '4', '4'], |
|
570
|
|
|
'ص' => ['0', '4', '4', '4'], |
|
571
|
|
|
'ض' => ['0', '3', '3', '3'], |
|
572
|
|
|
'ط' => ['0', '3', '3', '3'], |
|
573
|
|
|
'ظ' => ['0', '4', '4', '4'], |
|
574
|
|
|
'ع' => ['1', '0', '', ''], |
|
575
|
|
|
'غ' => ['0', '0', '', ''], |
|
576
|
|
|
'ف' => ['0', '7', '7', '7'], |
|
577
|
|
|
'ق' => ['0', '5', '5', '5'], |
|
578
|
|
|
'ك' => ['0', '5', '5', '5'], |
|
579
|
|
|
'ل' => ['0', '8', '8', '8'], |
|
580
|
|
|
'لا' => ['0', '8', '8', '8'], |
|
581
|
|
|
'م' => ['0', '6', '6', '6'], |
|
582
|
|
|
'ن' => ['0', '6', '6', '6'], |
|
583
|
|
|
'هن' => ['0', '66', '66', '66'], |
|
584
|
|
|
'ه' => ['0', '5', '5', ''], |
|
585
|
|
|
'و' => ['1', '', '', '', '7', '', ''], |
|
586
|
|
|
'ي' => ['0', '1', '', ''], |
|
587
|
|
|
'آ' => ['0', '1', '', ''], |
|
588
|
|
|
'ة' => ['0', '', '', '3'], |
|
589
|
|
|
'ی' => ['0', '1', '', ''], |
|
590
|
|
|
'ى' => ['1', '1', '', ''], |
|
591
|
|
|
]; |
|
592
|
|
|
|
|
593
|
|
|
/**
 * List the phonetic algorithms that this class can calculate.
 *
 * @return array<string,string> Translated algorithm names, keyed by algorithm code ('std', 'dm').
 */
public static function getAlgorithms(): array
{
    $algorithms = [];

    /* I18N: https://en.wikipedia.org/wiki/Soundex */
    $algorithms['std'] = I18N::translate('Russell');

    /* I18N: https://en.wikipedia.org/wiki/Daitch–Mokotoff_Soundex */
    $algorithms['dm'] = I18N::translate('Daitch-Mokotoff');

    return $algorithms;
}
|
607
|
|
|
|
|
608
|
|
|
/**
 * Do two lists of soundex codes have at least one code in common?
 *
 * Each argument is a list of codes separated by ":".
 *
 * @param string $soundex1
 * @param string $soundex2
 *
 * @return bool True when both lists are non-empty and share a code.
 */
public static function compare(string $soundex1, string $soundex2): bool
{
    // An empty list can never match anything.
    if ($soundex1 === '' || $soundex2 === '') {
        return false;
    }

    $codes1 = explode(':', $soundex1);
    $codes2 = explode(':', $soundex2);
    $shared = array_intersect($codes1, $codes2);

    return count($shared) > 0;
}
|
624
|
|
|
|
|
625
|
|
|
/**
 * Generate Russell soundex codes for a given text.
 *
 * Each space-separated word is coded on its own. Text containing more than
 * one word is additionally coded with the spaces removed. Unusable codes are
 * discarded, duplicates are removed, and the remaining codes are joined
 * with ":".
 *
 * @param string $text
 *
 * @return string Colon-separated list of codes; empty when nothing in the text can be coded.
 */
public static function russell(string $text): string
{
    $words = explode(' ', $text);

    $soundex_array = [];

    foreach ($words as $word) {
        $soundex_array[] = soundex($word);
    }

    // Combine words, e.g. “New York” as “Newyork”
    if (count($words) > 1) {
        $soundex_array[] = soundex(str_replace(' ', '', $text));
    }

    // Only return codes from recognisable sounds.
    // '0000' is the code for text without any letters. An empty word (from leading,
    // trailing or repeated spaces) gives '' - or false before PHP 8.0 - which would
    // otherwise leave an empty entry in the list (e.g. "N000::Y620").
    $soundex_array = array_filter($soundex_array, static function ($soundex): bool {
        return is_string($soundex) && $soundex !== '' && $soundex !== '0000';
    });

    // A varchar(255) column can only hold 51 4-character codes (plus 50 delimiters)
    $soundex_array = array_slice(array_unique($soundex_array), 0, 51);

    return implode(':', $soundex_array);
}
|
656
|
|
|
|
|
657
|
|
|
/**
 * Generate Daitch–Mokotoff soundex codes for a given text.
 *
 * Each space-separated word is coded on its own. Text containing more than
 * one word is additionally coded with the spaces removed. Duplicate codes
 * are removed and the remaining codes are joined with ":".
 *
 * @param string $text
 *
 * @return string Colon-separated list of codes.
 */
public static function daitchMokotoff(string $text): string
{
    $words = explode(' ', $text);

    // One list of codes per word, in word order.
    $codes_per_word = [];

    foreach ($words as $word) {
        $codes_per_word[] = self::daitchMokotoffWord($word);
    }

    // Combine words, e.g. “New York” as “Newyork”
    if (count($words) > 1) {
        $joined_words     = str_replace(' ', '', $text);
        $codes_per_word[] = self::daitchMokotoffWord($joined_words);
    }

    $all_codes    = array_merge(...$codes_per_word);
    $unique_codes = array_unique($all_codes);

    // A varchar(255) column can only hold 36 6-character codes (plus 35 delimiters)
    $stored_codes = array_slice($unique_codes, 0, 36);

    return implode(':', $stored_codes);
}
|
682
|
|
|
|
|
683
|
|
|
/**
 * Calculate the Daitch-Mokotoff soundex for a word.
 *
 * The word is upper-cased, rewritten with the TRANSFORM_NAMES rules and then
 * consumed from left to right. At each step the longest chunk (at most
 * MAXCHAR bytes) that has an entry in DM_SOUNDS is taken. A sound table entry
 * can hold more than one triplet of codes, so several partial codes may be
 * carried forward at once; every one that survives becomes a 6-digit code.
 *
 * A word that yields no codes at all (empty or unrecognised input) returns
 * an empty list. A word whose only code is '0' is kept and padded to '000000'.
 *
 * @param string $name
 *
 * @return array<string> List of possible DM codes for the word.
 */
private static function daitchMokotoffWord(string $name): array
{
    // Apply special transformation rules to the input string
    $name = I18N::strtoupper($name);
    foreach (self::TRANSFORM_NAMES as $transformRule) {
        $name = str_replace($transformRule[0], $transformRule[1], $name);
    }

    // Hebrew and Arabic text keeps repeated sounds (see the duplicate check below)
    $name_script = I18N::textScript($name);
    $noVowels    = $name_script === 'Hebr' || $name_script === 'Arab';

    // All positions are byte offsets, matching the byte-based MAXCHAR limit
    $lastPos       = strlen($name) - 1;
    $currPos       = 0;
    $state         = 1; // 1: start of input string, 2: before vowel, 3: other
    $result        = []; // accumulate complete 6-digit D-M codes here
    $partialResult = [['!']]; // incomplete D-M codes; the leading '!' stops the "duplicate sound" check

    // Loop through the input string.
    // Stop when the string is exhausted or when no more partial results remain
    while (count($partialResult) !== 0 && $currPos <= $lastPos) {
        // Find the DM coding table entry for the chunk at the current position
        $thisEntry = self::longestSoundKey($name, $currPos);
        if ($thisEntry === '') {
            $currPos++; // Not in table: advance pointer to next byte
            continue; // and try again
        }

        $soundTableEntry = self::DM_SOUNDS[$thisEntry];
        $workingResult   = $partialResult;
        $partialResult   = [];
        $currPos         += strlen($thisEntry);

        // Not at beginning of input string: the state depends on what follows this chunk.
        // After the first chunk $state has been advanced past 1, so this always runs from then on.
        if ($state !== 1) {
            $nextEntry = $currPos <= $lastPos ? self::longestSoundKey($name, $currPos) : '';

            if ($nextEntry !== '' && self::DM_SOUNDS[$nextEntry][0] !== '0') {
                // Element 0 of the next chunk's table entry is not '0': "before vowel"
                $state = 2;
            } else {
                // End of input, unknown chunk, or an entry flagged '0': "other"
                $state = 3;
            }
        }

        // Elements 1-3 form the first triplet of codes (one per state);
        // any further triplets are alternative codings of the same chunk.
        while ($state < count($soundTableEntry)) {
            $code = $soundTableEntry[$state];

            if ($code === '') {
                // empty means 'ignore this sound in this state'
                foreach ($workingResult as $workingEntry) {
                    $tempEntry = $workingEntry;
                    $tempEntry[count($tempEntry) - 1] .= '!'; // Prevent false 'doubles'
                    $partialResult[] = $tempEntry;
                }
            } else {
                foreach ($workingResult as $workingEntry) {
                    // A sound identical to the previous one is normally collapsed.
                    // For Hebrew and Arabic the duplicate is appended instead.
                    // NOTE(review): the previous comment here described producing a pair of
                    // codes (with and without the duplicate); only the doubled form is produced.
                    if ($code !== $workingEntry[count($workingEntry) - 1] || $noVowels) {
                        $workingEntry[] = $code;
                    }

                    if (count($workingEntry) < 7) {
                        $partialResult[] = $workingEntry;
                    } else {
                        // This is the 6th code in the sequence
                        // We're looking for 7 entries because the first is '!' and doesn't count
                        $tempResult = str_replace('!', '', implode('', $workingEntry));
                        // Only return codes from recognisable sounds
                        if ($tempResult !== '') {
                            $result[] = substr($tempResult . '000000', 0, 6);
                        }
                    }
                }
            }

            $state += 3; // Advance to next triplet while keeping the same basic state
        }
    }

    // Zero-fill and copy all remaining partial results
    foreach ($partialResult as $workingEntry) {
        $tempResult = str_replace('!', '', implode('', $workingEntry));
        // Only return codes from recognisable sounds.
        // Compare against '' explicitly: the string '0' is falsy in PHP, so a plain
        // truthiness test would discard a word whose only code is '0'.
        if ($tempResult !== '') {
            $result[] = substr($tempResult . '000000', 0, 6);
        }
    }

    return $result;
}

/**
 * Find the longest chunk of $name, starting at byte $offset, that is a key of DM_SOUNDS.
 *
 * The caller must ensure that $offset lies inside the string.
 *
 * @param string $name   Transformed, upper-case name
 * @param int    $offset Byte offset at which the chunk starts
 *
 * @return string The matching key, or an empty string when no prefix is in the table.
 */
private static function longestSoundKey(string $name, int $offset): string
{
    $chunk = substr($name, $offset, self::MAXCHAR); // Get maximum length chunk

    while ($chunk !== '' && !isset(self::DM_SOUNDS[$chunk])) {
        $chunk = substr($chunk, 0, -1); // Not in table: try a shorter chunk
    }

    return $chunk;
}
|
801
|
|
|
} |
|
802
|
|
|
|