fisharebest /
webtrees
| 1 | <?php |
||
| 2 | |||
| 3 | /** |
||
| 4 | * webtrees: online genealogy |
||
| 5 | * Copyright (C) 2025 webtrees development team |
||
| 6 | * This program is free software: you can redistribute it and/or modify |
||
| 7 | * it under the terms of the GNU General Public License as published by |
||
| 8 | * the Free Software Foundation, either version 3 of the License, or |
||
| 9 | * (at your option) any later version. |
||
| 10 | * This program is distributed in the hope that it will be useful, |
||
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
| 13 | * GNU General Public License for more details. |
||
| 14 | * You should have received a copy of the GNU General Public License |
||
| 15 | * along with this program. If not, see <https://www.gnu.org/licenses/>. |
||
| 16 | */ |
||
| 17 | |||
| 18 | declare(strict_types=1); |
||
| 19 | |||
| 20 | namespace Fisharebest\Webtrees; |
||
| 21 | |||
| 22 | use function array_slice; |
||
| 23 | use function count; |
||
| 24 | use function strlen; |
||
| 25 | |||
| 26 | /** |
||
| 27 | * Phonetic matching of strings. |
||
| 28 | */ |
||
| 29 | class Soundex |
||
| 30 | { |
||
| 31 | // Determine the Daitch–Mokotoff Soundex code for a word |
||
| 32 | // Original implementation by Gerry Kroll, and analysis by Meliza Amity |
||
| 33 | |||
| 34 | // Max. table key length (in ASCII bytes -- NOT in UTF-8 characters!) |
||
| 35 | private const int MAXCHAR = 7; |
||
|
0 ignored issues
–
show
Bug
introduced
by
Loading history...
|
|||
| 36 | |||
| 37 | /** |
||
| 38 | * Name transformation arrays. |
||
| 39 | * Used to transform the Name string to simplify the "sounds like" table. |
||
| 40 | * This is especially useful in Hebrew. |
||
| 41 | * |
||
| 42 | * Each array entry defines the "from" and "to" arguments of a str_replace($from, $to, $text) |
||
| 43 | * function call to achieve the desired transformations. |
||
| 44 | * |
||
| 45 | * Note about the use of "\x01": |
||
| 46 | * This code, which can’t legitimately occur in the kind of text we're dealing with, |
||
| 47 | * is used as a place-holder so that conditional string replacements can be done. |
||
| 48 | */ |
||
| 49 | private const array TRANSFORM_NAMES = [ |
||
| 50 | // Force Yiddish ligatures to be treated as separate letters |
||
| 51 | ['װ', 'וו'], |
||
| 52 | ['ײ', 'יי'], |
||
| 53 | ['ױ', 'וי'], |
||
| 54 | ['בו', 'בע'], |
||
| 55 | ['פו', 'פע'], |
||
| 56 | ['ומ', 'עמ'], |
||
| 57 | ['ום', 'עם'], |
||
| 58 | ['ונ', 'ענ'], |
||
| 59 | ['ון', 'ען'], |
||
| 60 | ['וו', 'ב'], |
||
| 61 | ["\x01", ''], |
||
| 62 | ['ייה$', "\x01ה"], |
||
| 63 | ['ייע$', "\x01ע"], |
||
| 64 | ['יי', 'ע'], |
||
| 65 | ["\x01", 'יי'], |
||
| 66 | ]; |
||
| 67 | |||
| 68 | /** |
||
| 69 | * The DM sound coding table is organized this way: |
||
| 70 | * key: a variable-length string that corresponds to the UTF-8 character sequence |
||
| 71 | * represented by the table entry. Currently, that string can be up to 7 |
||
| 72 | * bytes long. This maximum length is defined by the value of the class |
||
| 73 | * constant MAXCHAR. |
||
| 74 | * |
||
| 75 | * value: an array as follows: |
||
| 76 | * [0]: zero if not a vowel |
||
| 77 | * [1]: sound value when this string is at the beginning of the word |
||
| 78 | * [2]: sound value when this string is followed by a vowel |
||
| 79 | * [3]: sound value for other cases |
||
| 80 | * [1],[2],[3] can be repeated several times to create branches in the code |
||
| 81 | * an empty sound value means "ignore in this state" |
||
| 82 | */ |
||
| 83 | private const array DM_SOUNDS = [ |
||
| 84 | 'A' => ['1', '0', '', ''], |
||
| 85 | 'À' => ['1', '0', '', ''], |
||
| 86 | 'Á' => ['1', '0', '', ''], |
||
| 87 | 'Â' => ['1', '0', '', ''], |
||
| 88 | 'Ã' => ['1', '0', '', ''], |
||
| 89 | 'Ä' => ['1', '0', '1', '', '0', '', ''], |
||
| 90 | 'Å' => ['1', '0', '', ''], |
||
| 91 | 'Ă' => ['1', '0', '', ''], |
||
| 92 | 'Ą' => ['1', '', '', '', '', '', '6'], |
||
| 93 | 'Ạ' => ['1', '0', '', ''], |
||
| 94 | 'Ả' => ['1', '0', '', ''], |
||
| 95 | 'Ấ' => ['1', '0', '', ''], |
||
| 96 | 'Ầ' => ['1', '0', '', ''], |
||
| 97 | 'Ẩ' => ['1', '0', '', ''], |
||
| 98 | 'Ẫ' => ['1', '0', '', ''], |
||
| 99 | 'Ậ' => ['1', '0', '', ''], |
||
| 100 | 'Ắ' => ['1', '0', '', ''], |
||
| 101 | 'Ằ' => ['1', '0', '', ''], |
||
| 102 | 'Ẳ' => ['1', '0', '', ''], |
||
| 103 | 'Ẵ' => ['1', '0', '', ''], |
||
| 104 | 'Ặ' => ['1', '0', '', ''], |
||
| 105 | 'AE' => ['1', '0', '1', ''], |
||
| 106 | 'Æ' => ['1', '0', '1', ''], |
||
| 107 | 'AI' => ['1', '0', '1', ''], |
||
| 108 | 'AJ' => ['1', '0', '1', ''], |
||
| 109 | 'AU' => ['1', '0', '7', ''], |
||
| 110 | 'AV' => ['1', '0', '7', '', '7', '7', '7'], |
||
| 111 | 'ÄU' => ['1', '0', '1', ''], |
||
| 112 | 'AY' => ['1', '0', '1', ''], |
||
| 113 | 'B' => ['0', '7', '7', '7'], |
||
| 114 | 'C' => ['0', '5', '5', '5', '34', '4', '4'], |
||
| 115 | 'Ć' => ['0', '4', '4', '4'], |
||
| 116 | 'Č' => ['0', '4', '4', '4'], |
||
| 117 | 'Ç' => ['0', '4', '4', '4'], |
||
| 118 | 'CH' => ['0', '5', '5', '5', '34', '4', '4'], |
||
| 119 | 'CHS' => ['0', '5', '54', '54'], |
||
| 120 | 'CK' => ['0', '5', '5', '5', '45', '45', '45'], |
||
| 121 | 'CCS' => ['0', '4', '4', '4'], |
||
| 122 | 'CS' => ['0', '4', '4', '4'], |
||
| 123 | 'CSZ' => ['0', '4', '4', '4'], |
||
| 124 | 'CZ' => ['0', '4', '4', '4'], |
||
| 125 | 'CZS' => ['0', '4', '4', '4'], |
||
| 126 | 'D' => ['0', '3', '3', '3'], |
||
| 127 | 'Ď' => ['0', '3', '3', '3'], |
||
| 128 | 'Đ' => ['0', '3', '3', '3'], |
||
| 129 | 'DRS' => ['0', '4', '4', '4'], |
||
| 130 | 'DRZ' => ['0', '4', '4', '4'], |
||
| 131 | 'DS' => ['0', '4', '4', '4'], |
||
| 132 | 'DSH' => ['0', '4', '4', '4'], |
||
| 133 | 'DSZ' => ['0', '4', '4', '4'], |
||
| 134 | 'DT' => ['0', '3', '3', '3'], |
||
| 135 | 'DDZ' => ['0', '4', '4', '4'], |
||
| 136 | 'DDZS' => ['0', '4', '4', '4'], |
||
| 137 | 'DZ' => ['0', '4', '4', '4'], |
||
| 138 | 'DŹ' => ['0', '4', '4', '4'], |
||
| 139 | 'DŻ' => ['0', '4', '4', '4'], |
||
| 140 | 'DZH' => ['0', '4', '4', '4'], |
||
| 141 | 'DZS' => ['0', '4', '4', '4'], |
||
| 142 | 'E' => ['1', '0', '', ''], |
||
| 143 | 'È' => ['1', '0', '', ''], |
||
| 144 | 'É' => ['1', '0', '', ''], |
||
| 145 | 'Ê' => ['1', '0', '', ''], |
||
| 146 | 'Ë' => ['1', '0', '', ''], |
||
| 147 | 'Ĕ' => ['1', '0', '', ''], |
||
| 148 | 'Ė' => ['1', '0', '', ''], |
||
| 149 | 'Ę' => ['1', '', '', '6', '', '', ''], |
||
| 150 | 'Ẹ' => ['1', '0', '', ''], |
||
| 151 | 'Ẻ' => ['1', '0', '', ''], |
||
| 152 | 'Ẽ' => ['1', '0', '', ''], |
||
| 153 | 'Ế' => ['1', '0', '', ''], |
||
| 154 | 'Ề' => ['1', '0', '', ''], |
||
| 155 | 'Ể' => ['1', '0', '', ''], |
||
| 156 | 'Ễ' => ['1', '0', '', ''], |
||
| 157 | 'Ệ' => ['1', '0', '', ''], |
||
| 158 | 'EAU' => ['1', '0', '', ''], |
||
| 159 | 'EI' => ['1', '0', '1', ''], |
||
| 160 | 'EJ' => ['1', '0', '1', ''], |
||
| 161 | 'EU' => ['1', '1', '1', ''], |
||
| 162 | 'EY' => ['1', '0', '1', ''], |
||
| 163 | 'F' => ['0', '7', '7', '7'], |
||
| 164 | 'FB' => ['0', '7', '7', '7'], |
||
| 165 | 'G' => ['0', '5', '5', '5', '34', '4', '4'], |
||
| 166 | 'Ğ' => ['0', '', '', ''], |
||
| 167 | 'GGY' => ['0', '5', '5', '5'], |
||
| 168 | 'GY' => ['0', '5', '5', '5'], |
||
| 169 | 'H' => ['0', '5', '5', '', '5', '5', '5'], |
||
| 170 | 'I' => ['1', '0', '', ''], |
||
| 171 | 'Ì' => ['1', '0', '', ''], |
||
| 172 | 'Í' => ['1', '0', '', ''], |
||
| 173 | 'Î' => ['1', '0', '', ''], |
||
| 174 | 'Ï' => ['1', '0', '', ''], |
||
| 175 | 'Ĩ' => ['1', '0', '', ''], |
||
| 176 | 'Į' => ['1', '0', '', ''], |
||
| 177 | 'İ' => ['1', '0', '', ''], |
||
| 178 | 'Ỉ' => ['1', '0', '', ''], |
||
| 179 | 'Ị' => ['1', '0', '', ''], |
||
| 180 | 'IA' => ['1', '1', '', ''], |
||
| 181 | 'IE' => ['1', '1', '', ''], |
||
| 182 | 'IO' => ['1', '1', '', ''], |
||
| 183 | 'IU' => ['1', '1', '', ''], |
||
| 184 | 'J' => ['0', '1', '', '', '4', '4', '4', '5', '5', ''], |
||
| 185 | 'K' => ['0', '5', '5', '5'], |
||
| 186 | 'KH' => ['0', '5', '5', '5'], |
||
| 187 | 'KS' => ['0', '5', '54', '54'], |
||
| 188 | 'L' => ['0', '8', '8', '8'], |
||
| 189 | 'Ľ' => ['0', '8', '8', '8'], |
||
| 190 | 'Ĺ' => ['0', '8', '8', '8'], |
||
| 191 | 'Ł' => ['0', '7', '7', '7', '8', '8', '8'], |
||
| 192 | 'LL' => ['0', '8', '8', '8', '58', '8', '8', '1', '8', '8'], |
||
| 193 | 'LLY' => ['0', '8', '8', '8', '1', '8', '8'], |
||
| 194 | 'LY' => ['0', '8', '8', '8', '1', '8', '8'], |
||
| 195 | 'M' => ['0', '6', '6', '6'], |
||
| 196 | 'MĔ' => ['0', '66', '66', '66'], |
||
| 197 | 'MN' => ['0', '66', '66', '66'], |
||
| 198 | 'N' => ['0', '6', '6', '6'], |
||
| 199 | 'Ń' => ['0', '6', '6', '6'], |
||
| 200 | 'Ň' => ['0', '6', '6', '6'], |
||
| 201 | 'Ñ' => ['0', '6', '6', '6'], |
||
| 202 | 'NM' => ['0', '66', '66', '66'], |
||
| 203 | 'O' => ['1', '0', '', ''], |
||
| 204 | 'Ò' => ['1', '0', '', ''], |
||
| 205 | 'Ó' => ['1', '0', '', ''], |
||
| 206 | 'Ô' => ['1', '0', '', ''], |
||
| 207 | 'Õ' => ['1', '0', '', ''], |
||
| 208 | 'Ö' => ['1', '0', '', ''], |
||
| 209 | 'Ø' => ['1', '0', '', ''], |
||
| 210 | 'Ő' => ['1', '0', '', ''], |
||
| 211 | 'Œ' => ['1', '0', '', ''], |
||
| 212 | 'Ơ' => ['1', '0', '', ''], |
||
| 213 | 'Ọ' => ['1', '0', '', ''], |
||
| 214 | 'Ỏ' => ['1', '0', '', ''], |
||
| 215 | 'Ố' => ['1', '0', '', ''], |
||
| 216 | 'Ồ' => ['1', '0', '', ''], |
||
| 217 | 'Ổ' => ['1', '0', '', ''], |
||
| 218 | 'Ỗ' => ['1', '0', '', ''], |
||
| 219 | 'Ộ' => ['1', '0', '', ''], |
||
| 220 | 'Ớ' => ['1', '0', '', ''], |
||
| 221 | 'Ờ' => ['1', '0', '', ''], |
||
| 222 | 'Ở' => ['1', '0', '', ''], |
||
| 223 | 'Ỡ' => ['1', '0', '', ''], |
||
| 224 | 'Ợ' => ['1', '0', '', ''], |
||
| 225 | 'OE' => ['1', '0', '', ''], |
||
| 226 | 'OI' => ['1', '0', '1', ''], |
||
| 227 | 'OJ' => ['1', '0', '1', ''], |
||
| 228 | 'OU' => ['1', '0', '', ''], |
||
| 229 | 'OY' => ['1', '0', '1', ''], |
||
| 230 | 'P' => ['0', '7', '7', '7'], |
||
| 231 | 'PF' => ['0', '7', '7', '7'], |
||
| 232 | 'PH' => ['0', '7', '7', '7'], |
||
| 233 | 'Q' => ['0', '5', '5', '5'], |
||
| 234 | 'R' => ['0', '9', '9', '9'], |
||
| 235 | 'Ř' => ['0', '4', '4', '4'], |
||
| 236 | 'RS' => ['0', '4', '4', '4', '94', '94', '94'], |
||
| 237 | 'RZ' => ['0', '4', '4', '4', '94', '94', '94'], |
||
| 238 | 'S' => ['0', '4', '4', '4'], |
||
| 239 | 'Ś' => ['0', '4', '4', '4'], |
||
| 240 | 'Š' => ['0', '4', '4', '4'], |
||
| 241 | 'Ş' => ['0', '4', '4', '4'], |
||
| 242 | 'SC' => ['0', '2', '4', '4'], |
||
| 243 | 'ŠČ' => ['0', '2', '4', '4'], |
||
| 244 | 'SCH' => ['0', '4', '4', '4'], |
||
| 245 | 'SCHD' => ['0', '2', '43', '43'], |
||
| 246 | 'SCHT' => ['0', '2', '43', '43'], |
||
| 247 | 'SCHTCH' => ['0', '2', '4', '4'], |
||
| 248 | 'SCHTSCH' => ['0', '2', '4', '4'], |
||
| 249 | 'SCHTSH' => ['0', '2', '4', '4'], |
||
| 250 | 'SD' => ['0', '2', '43', '43'], |
||
| 251 | 'SH' => ['0', '4', '4', '4'], |
||
| 252 | 'SHCH' => ['0', '2', '4', '4'], |
||
| 253 | 'SHD' => ['0', '2', '43', '43'], |
||
| 254 | 'SHT' => ['0', '2', '43', '43'], |
||
| 255 | 'SHTCH' => ['0', '2', '4', '4'], |
||
| 256 | 'SHTSH' => ['0', '2', '4', '4'], |
||
| 257 | 'ß' => ['0', '', '4', '4'], |
||
| 258 | 'ST' => ['0', '2', '43', '43'], |
||
| 259 | 'STCH' => ['0', '2', '4', '4'], |
||
| 260 | 'STRS' => ['0', '2', '4', '4'], |
||
| 261 | 'STRZ' => ['0', '2', '4', '4'], |
||
| 262 | 'STSCH' => ['0', '2', '4', '4'], |
||
| 263 | 'STSH' => ['0', '2', '4', '4'], |
||
| 264 | 'SSZ' => ['0', '4', '4', '4'], |
||
| 265 | 'SZ' => ['0', '4', '4', '4'], |
||
| 266 | 'SZCS' => ['0', '2', '4', '4'], |
||
| 267 | 'SZCZ' => ['0', '2', '4', '4'], |
||
| 268 | 'SZD' => ['0', '2', '43', '43'], |
||
| 269 | 'SZT' => ['0', '2', '43', '43'], |
||
| 270 | 'T' => ['0', '3', '3', '3'], |
||
| 271 | 'Ť' => ['0', '3', '3', '3'], |
||
| 272 | 'Ţ' => ['0', '3', '3', '3', '4', '4', '4'], |
||
| 273 | 'TC' => ['0', '4', '4', '4'], |
||
| 274 | 'TCH' => ['0', '4', '4', '4'], |
||
| 275 | 'TH' => ['0', '3', '3', '3'], |
||
| 276 | 'TRS' => ['0', '4', '4', '4'], |
||
| 277 | 'TRZ' => ['0', '4', '4', '4'], |
||
| 278 | 'TS' => ['0', '4', '4', '4'], |
||
| 279 | 'TSCH' => ['0', '4', '4', '4'], |
||
| 280 | 'TSH' => ['0', '4', '4', '4'], |
||
| 281 | 'TSZ' => ['0', '4', '4', '4'], |
||
| 282 | 'TTCH' => ['0', '4', '4', '4'], |
||
| 283 | 'TTS' => ['0', '4', '4', '4'], |
||
| 284 | 'TTSCH' => ['0', '4', '4', '4'], |
||
| 285 | 'TTSZ' => ['0', '4', '4', '4'], |
||
| 286 | 'TTZ' => ['0', '4', '4', '4'], |
||
| 287 | 'TZ' => ['0', '4', '4', '4'], |
||
| 288 | 'TZS' => ['0', '4', '4', '4'], |
||
| 289 | 'U' => ['1', '0', '', ''], |
||
| 290 | 'Ù' => ['1', '0', '', ''], |
||
| 291 | 'Ú' => ['1', '0', '', ''], |
||
| 292 | 'Û' => ['1', '0', '', ''], |
||
| 293 | 'Ü' => ['1', '0', '', ''], |
||
| 294 | 'Ũ' => ['1', '0', '', ''], |
||
| 295 | 'Ū' => ['1', '0', '', ''], |
||
| 296 | 'Ů' => ['1', '0', '', ''], |
||
| 297 | 'Ű' => ['1', '0', '', ''], |
||
| 298 | 'Ų' => ['1', '0', '', ''], |
||
| 299 | 'Ư' => ['1', '0', '', ''], |
||
| 300 | 'Ụ' => ['1', '0', '', ''], |
||
| 301 | 'Ủ' => ['1', '0', '', ''], |
||
| 302 | 'Ứ' => ['1', '0', '', ''], |
||
| 303 | 'Ừ' => ['1', '0', '', ''], |
||
| 304 | 'Ử' => ['1', '0', '', ''], |
||
| 305 | 'Ữ' => ['1', '0', '', ''], |
||
| 306 | 'Ự' => ['1', '0', '', ''], |
||
| 307 | 'UE' => ['1', '0', '', ''], |
||
| 308 | 'UI' => ['1', '0', '1', ''], |
||
| 309 | 'UJ' => ['1', '0', '1', ''], |
||
| 310 | 'UY' => ['1', '0', '1', ''], |
||
| 311 | 'UW' => ['1', '0', '1', '', '0', '7', '7'], |
||
| 312 | 'V' => ['0', '7', '7', '7'], |
||
| 313 | 'W' => ['0', '7', '7', '7'], |
||
| 314 | 'X' => ['0', '5', '54', '54'], |
||
| 315 | 'Y' => ['1', '1', '', ''], |
||
| 316 | 'Ý' => ['1', '1', '', ''], |
||
| 317 | 'Ỳ' => ['1', '1', '', ''], |
||
| 318 | 'Ỵ' => ['1', '1', '', ''], |
||
| 319 | 'Ỷ' => ['1', '1', '', ''], |
||
| 320 | 'Ỹ' => ['1', '1', '', ''], |
||
| 321 | 'Z' => ['0', '4', '4', '4'], |
||
| 322 | 'Ź' => ['0', '4', '4', '4'], |
||
| 323 | 'Ż' => ['0', '4', '4', '4'], |
||
| 324 | 'Ž' => ['0', '4', '4', '4'], |
||
| 325 | 'ZD' => ['0', '2', '43', '43'], |
||
| 326 | 'ZDZ' => ['0', '2', '4', '4'], |
||
| 327 | 'ZDZH' => ['0', '2', '4', '4'], |
||
| 328 | 'ZH' => ['0', '4', '4', '4'], |
||
| 329 | 'ZHD' => ['0', '2', '43', '43'], |
||
| 330 | 'ZHDZH' => ['0', '2', '4', '4'], |
||
| 331 | 'ZS' => ['0', '4', '4', '4'], |
||
| 332 | 'ZSCH' => ['0', '4', '4', '4'], |
||
| 333 | 'ZSH' => ['0', '4', '4', '4'], |
||
| 334 | 'ZZS' => ['0', '4', '4', '4'], |
||
| 335 | // Cyrillic alphabet |
||
| 336 | 'А' => ['1', '0', '', ''], |
||
| 337 | 'Б' => ['0', '7', '7', '7'], |
||
| 338 | 'В' => ['0', '7', '7', '7'], |
||
| 339 | 'Г' => ['0', '5', '5', '5'], |
||
| 340 | 'Д' => ['0', '3', '3', '3'], |
||
| 341 | 'ДЗ' => ['0', '4', '4', '4'], |
||
| 342 | 'Е' => ['1', '0', '', ''], |
||
| 343 | 'Ё' => ['1', '0', '', ''], |
||
| 344 | 'Ж' => ['0', '4', '4', '4'], |
||
| 345 | 'З' => ['0', '4', '4', '4'], |
||
| 346 | 'И' => ['1', '0', '', ''], |
||
| 347 | 'Й' => ['1', '1', '', '', '4', '4', '4'], |
||
| 348 | 'К' => ['0', '5', '5', '5'], |
||
| 349 | 'Л' => ['0', '8', '8', '8'], |
||
| 350 | 'М' => ['0', '6', '6', '6'], |
||
| 351 | 'Н' => ['0', '6', '6', '6'], |
||
| 352 | 'О' => ['1', '0', '', ''], |
||
| 353 | 'П' => ['0', '7', '7', '7'], |
||
| 354 | 'Р' => ['0', '9', '9', '9'], |
||
| 355 | 'РЖ' => ['0', '4', '4', '4'], |
||
| 356 | 'С' => ['0', '4', '4', '4'], |
||
| 357 | 'Т' => ['0', '3', '3', '3'], |
||
| 358 | 'У' => ['1', '0', '', ''], |
||
| 359 | 'Ф' => ['0', '7', '7', '7'], |
||
| 360 | 'Х' => ['0', '5', '5', '5'], |
||
| 361 | 'Ц' => ['0', '4', '4', '4'], |
||
| 362 | 'Ч' => ['0', '4', '4', '4'], |
||
| 363 | 'Ш' => ['0', '4', '4', '4'], |
||
| 364 | 'Щ' => ['0', '2', '4', '4'], |
||
| 365 | 'Ъ' => ['0', '', '', ''], |
||
| 366 | 'Ы' => ['0', '1', '', ''], |
||
| 367 | 'Ь' => ['0', '', '', ''], |
||
| 368 | 'Э' => ['1', '0', '', ''], |
||
| 369 | 'Ю' => ['0', '1', '', ''], |
||
| 370 | 'Я' => ['0', '1', '', ''], |
||
| 371 | // Greek alphabet |
||
| 372 | 'Α' => ['1', '0', '', ''], |
||
| 373 | 'Ά' => ['1', '0', '', ''], |
||
| 374 | 'ΑΙ' => ['1', '0', '1', ''], |
||
| 375 | 'ΑΥ' => ['1', '0', '1', ''], |
||
| 376 | 'Β' => ['0', '7', '7', '7'], |
||
| 377 | 'Γ' => ['0', '5', '5', '5'], |
||
| 378 | 'Δ' => ['0', '3', '3', '3'], |
||
| 379 | 'Ε' => ['1', '0', '', ''], |
||
| 380 | 'Έ' => ['1', '0', '', ''], |
||
| 381 | 'ΕΙ' => ['1', '0', '1', ''], |
||
| 382 | 'ΕΥ' => ['1', '1', '1', ''], |
||
| 383 | 'Ζ' => ['0', '4', '4', '4'], |
||
| 384 | 'Η' => ['1', '0', '', ''], |
||
| 385 | 'Ή' => ['1', '0', '', ''], |
||
| 386 | 'Θ' => ['0', '3', '3', '3'], |
||
| 387 | 'Ι' => ['1', '0', '', ''], |
||
| 388 | 'Ί' => ['1', '0', '', ''], |
||
| 389 | 'Ϊ' => ['1', '0', '', ''], |
||
| 390 | 'ΐ' => ['1', '0', '', ''], |
||
| 391 | 'Κ' => ['0', '5', '5', '5'], |
||
| 392 | 'Λ' => ['0', '8', '8', '8'], |
||
| 393 | 'Μ' => ['0', '6', '6', '6'], |
||
| 394 | 'ΜΠ' => ['0', '7', '7', '7'], |
||
| 395 | 'Ν' => ['0', '6', '6', '6'], |
||
| 396 | 'ΝΤ' => ['0', '3', '3', '3'], |
||
| 397 | 'Ξ' => ['0', '5', '54', '54'], |
||
| 398 | 'Ο' => ['1', '0', '', ''], |
||
| 399 | 'Ό' => ['1', '0', '', ''], |
||
| 400 | 'ΟΙ' => ['1', '0', '1', ''], |
||
| 401 | 'ΟΥ' => ['1', '0', '1', ''], |
||
| 402 | 'Π' => ['0', '7', '7', '7'], |
||
| 403 | 'Ρ' => ['0', '9', '9', '9'], |
||
| 404 | 'Σ' => ['0', '4', '4', '4'], |
||
| 405 | 'ς' => ['0', '', '', '4'], |
||
| 406 | 'Τ' => ['0', '3', '3', '3'], |
||
| 407 | 'ΤΖ' => ['0', '4', '4', '4'], |
||
| 408 | 'ΤΣ' => ['0', '4', '4', '4'], |
||
| 409 | 'Υ' => ['1', '1', '', ''], |
||
| 410 | 'Ύ' => ['1', '1', '', ''], |
||
| 411 | 'Ϋ' => ['1', '1', '', ''], |
||
| 412 | 'ΰ' => ['1', '1', '', ''], |
||
| 413 | 'ΥΚ' => ['1', '5', '5', '5'], |
||
| 414 | 'ΥΥ' => ['1', '65', '65', '65'], |
||
| 415 | 'Φ' => ['0', '7', '7', '7'], |
||
| 416 | 'Χ' => ['0', '5', '5', '5'], |
||
| 417 | 'Ψ' => ['0', '7', '7', '7'], |
||
| 418 | 'Ω' => ['1', '0', '', ''], |
||
| 419 | 'Ώ' => ['1', '0', '', ''], |
||
| 420 | // Hebrew alphabet |
||
| 421 | 'א' => ['1', '0', '', ''], |
||
| 422 | 'או' => ['1', '0', '7', ''], |
||
| 423 | 'אג' => ['1', '4', '4', '4', '5', '5', '5', '34', '34', '34'], |
||
| 424 | 'בב' => ['0', '7', '7', '7', '77', '77', '77'], |
||
| 425 | 'ב' => ['0', '7', '7', '7'], |
||
| 426 | 'גג' => ['0', '4', '4', '4', '5', '5', '5', '45', '45', '45', '55', '55', '55', '54', '54', '54'], |
||
| 427 | 'גד' => ['0', '43', '43', '43', '53', '53', '53'], |
||
| 428 | 'גה' => ['0', '45', '45', '45', '55', '55', '55'], |
||
| 429 | 'גז' => ['0', '44', '44', '44', '45', '45', '45'], |
||
| 430 | 'גח' => ['0', '45', '45', '45', '55', '55', '55'], |
||
| 431 | 'גכ' => ['0', '45', '45', '45', '55', '55', '55'], |
||
| 432 | 'גך' => ['0', '45', '45', '45', '55', '55', '55'], |
||
| 433 | 'גצ' => ['0', '44', '44', '44', '45', '45', '45'], |
||
| 434 | 'גץ' => ['0', '44', '44', '44', '45', '45', '45'], |
||
| 435 | 'גק' => ['0', '45', '45', '45', '54', '54', '54'], |
||
| 436 | 'גש' => ['0', '44', '44', '44', '54', '54', '54'], |
||
| 437 | 'גת' => ['0', '43', '43', '43', '53', '53', '53'], |
||
| 438 | 'ג' => ['0', '4', '4', '4', '5', '5', '5'], |
||
| 439 | 'דז' => ['0', '4', '4', '4'], |
||
| 440 | 'דד' => ['0', '3', '3', '3', '33', '33', '33'], |
||
| 441 | 'דט' => ['0', '33', '33', '33'], |
||
| 442 | 'דש' => ['0', '4', '4', '4'], |
||
| 443 | 'דצ' => ['0', '4', '4', '4'], |
||
| 444 | 'דץ' => ['0', '4', '4', '4'], |
||
| 445 | 'ד' => ['0', '3', '3', '3'], |
||
| 446 | 'הג' => ['0', '54', '54', '54', '55', '55', '55'], |
||
| 447 | 'הכ' => ['0', '55', '55', '55'], |
||
| 448 | 'הח' => ['0', '55', '55', '55'], |
||
| 449 | 'הק' => ['0', '55', '55', '55', '5', '5', '5'], |
||
| 450 | 'הה' => ['0', '5', '5', '', '55', '55', ''], |
||
| 451 | 'ה' => ['0', '5', '5', ''], |
||
| 452 | 'וי' => ['1', '', '', '', '7', '7', '7'], |
||
| 453 | 'ו' => ['1', '7', '7', '7', '7', '', ''], |
||
| 454 | 'וו' => ['1', '7', '7', '7', '7', '', ''], |
||
| 455 | 'וופ' => ['1', '7', '7', '7', '77', '77', '77'], |
||
| 456 | 'זש' => ['0', '4', '4', '4', '44', '44', '44'], |
||
| 457 | 'זדז' => ['0', '2', '4', '4'], |
||
| 458 | 'ז' => ['0', '4', '4', '4'], |
||
| 459 | 'זג' => ['0', '44', '44', '44', '45', '45', '45'], |
||
| 460 | 'זז' => ['0', '4', '4', '4', '44', '44', '44'], |
||
| 461 | 'זס' => ['0', '44', '44', '44'], |
||
| 462 | 'זצ' => ['0', '44', '44', '44'], |
||
| 463 | 'זץ' => ['0', '44', '44', '44'], |
||
| 464 | 'חג' => ['0', '54', '54', '54', '53', '53', '53'], |
||
| 465 | 'חח' => ['0', '5', '5', '5', '55', '55', '55'], |
||
| 466 | 'חק' => ['0', '55', '55', '55', '5', '5', '5'], |
||
| 467 | 'חכ' => ['0', '45', '45', '45', '55', '55', '55'], |
||
| 468 | 'חס' => ['0', '5', '54', '54'], |
||
| 469 | 'חש' => ['0', '5', '54', '54'], |
||
| 470 | 'ח' => ['0', '5', '5', '5'], |
||
| 471 | 'טש' => ['0', '4', '4', '4'], |
||
| 472 | 'טד' => ['0', '33', '33', '33'], |
||
| 473 | 'טי' => ['0', '3', '3', '3', '4', '4', '4', '3', '3', '34'], |
||
| 474 | 'טת' => ['0', '33', '33', '33'], |
||
| 475 | 'טט' => ['0', '3', '3', '3', '33', '33', '33'], |
||
| 476 | 'ט' => ['0', '3', '3', '3'], |
||
| 477 | 'י' => ['1', '1', '', ''], |
||
| 478 | 'יא' => ['1', '1', '', '', '1', '1', '1'], |
||
| 479 | 'כג' => ['0', '55', '55', '55', '54', '54', '54'], |
||
| 480 | 'כש' => ['0', '5', '54', '54'], |
||
| 481 | 'כס' => ['0', '5', '54', '54'], |
||
| 482 | 'ככ' => ['0', '5', '5', '5', '55', '55', '55'], |
||
| 483 | 'כך' => ['0', '5', '5', '5', '55', '55', '55'], |
||
| 484 | 'כ' => ['0', '5', '5', '5'], |
||
| 485 | 'כח' => ['0', '55', '55', '55', '5', '5', '5'], |
||
| 486 | 'ך' => ['0', '', '5', '5'], |
||
| 487 | 'ל' => ['0', '8', '8', '8'], |
||
| 488 | 'לל' => ['0', '88', '88', '88', '8', '8', '8'], |
||
| 489 | 'מנ' => ['0', '66', '66', '66'], |
||
| 490 | 'מן' => ['0', '66', '66', '66'], |
||
| 491 | 'ממ' => ['0', '6', '6', '6', '66', '66', '66'], |
||
| 492 | 'מם' => ['0', '6', '6', '6', '66', '66', '66'], |
||
| 493 | 'מ' => ['0', '6', '6', '6'], |
||
| 494 | 'ם' => ['0', '', '6', '6'], |
||
| 495 | 'נמ' => ['0', '66', '66', '66'], |
||
| 496 | 'נם' => ['0', '66', '66', '66'], |
||
| 497 | 'ננ' => ['0', '6', '6', '6', '66', '66', '66'], |
||
| 498 | 'נן' => ['0', '6', '6', '6', '66', '66', '66'], |
||
| 499 | 'נ' => ['0', '6', '6', '6'], |
||
| 500 | 'ן' => ['0', '', '6', '6'], |
||
| 501 | 'סתש' => ['0', '2', '4', '4'], |
||
| 502 | 'סתז' => ['0', '2', '4', '4'], |
||
| 503 | 'סטז' => ['0', '2', '4', '4'], |
||
| 504 | 'סטש' => ['0', '2', '4', '4'], |
||
| 505 | 'סצד' => ['0', '2', '4', '4'], |
||
| 506 | 'סט' => ['0', '2', '4', '4', '43', '43', '43'], |
||
| 507 | 'סת' => ['0', '2', '4', '4', '43', '43', '43'], |
||
| 508 | 'סג' => ['0', '44', '44', '44', '4', '4', '4'], |
||
| 509 | 'סס' => ['0', '4', '4', '4', '44', '44', '44'], |
||
| 510 | 'סצ' => ['0', '44', '44', '44'], |
||
| 511 | 'סץ' => ['0', '44', '44', '44'], |
||
| 512 | 'סז' => ['0', '44', '44', '44'], |
||
| 513 | 'סש' => ['0', '44', '44', '44'], |
||
| 514 | 'ס' => ['0', '4', '4', '4'], |
||
| 515 | 'ע' => ['1', '0', '', ''], |
||
| 516 | 'פב' => ['0', '7', '7', '7', '77', '77', '77'], |
||
| 517 | 'פוו' => ['0', '7', '7', '7', '77', '77', '77'], |
||
| 518 | 'פפ' => ['0', '7', '7', '7', '77', '77', '77'], |
||
| 519 | 'פף' => ['0', '7', '7', '7', '77', '77', '77'], |
||
| 520 | 'פ' => ['0', '7', '7', '7'], |
||
| 521 | 'ף' => ['0', '', '7', '7'], |
||
| 522 | 'צג' => ['0', '44', '44', '44', '45', '45', '45'], |
||
| 523 | 'צז' => ['0', '44', '44', '44'], |
||
| 524 | 'צס' => ['0', '44', '44', '44'], |
||
| 525 | 'צצ' => ['0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54', '45', '45', '45'], |
||
| 526 | 'צץ' => ['0', '4', '4', '4', '5', '5', '5', '44', '44', '44', '54', '54', '54'], |
||
| 527 | 'צש' => ['0', '44', '44', '44', '4', '4', '4', '5', '5', '5'], |
||
| 528 | 'צ' => ['0', '4', '4', '4', '5', '5', '5'], |
||
| 529 | 'ץ' => ['0', '', '4', '4'], |
||
| 530 | 'קה' => ['0', '55', '55', '5'], |
||
| 531 | 'קס' => ['0', '5', '54', '54'], |
||
| 532 | 'קש' => ['0', '5', '54', '54'], |
||
| 533 | 'קק' => ['0', '5', '5', '5', '55', '55', '55'], |
||
| 534 | 'קח' => ['0', '55', '55', '55'], |
||
| 535 | 'קכ' => ['0', '55', '55', '55'], |
||
| 536 | 'קך' => ['0', '55', '55', '55'], |
||
| 537 | 'קג' => ['0', '55', '55', '55', '54', '54', '54'], |
||
| 538 | 'ק' => ['0', '5', '5', '5'], |
||
| 539 | 'רר' => ['0', '99', '99', '99', '9', '9', '9'], |
||
| 540 | 'ר' => ['0', '9', '9', '9'], |
||
| 541 | 'שטז' => ['0', '2', '4', '4'], |
||
| 542 | 'שתש' => ['0', '2', '4', '4'], |
||
| 543 | 'שתז' => ['0', '2', '4', '4'], |
||
| 544 | 'שטש' => ['0', '2', '4', '4'], |
||
| 545 | 'שד' => ['0', '2', '43', '43'], |
||
| 546 | 'שז' => ['0', '44', '44', '44'], |
||
| 547 | 'שס' => ['0', '44', '44', '44'], |
||
| 548 | 'שת' => ['0', '2', '43', '43'], |
||
| 549 | 'שג' => ['0', '4', '4', '4', '44', '44', '44', '4', '43', '43'], |
||
| 550 | 'שט' => ['0', '2', '43', '43', '44', '44', '44'], |
||
| 551 | 'שצ' => ['0', '44', '44', '44', '45', '45', '45'], |
||
| 552 | 'שץ' => ['0', '44', '', '44', '45', '', '45'], |
||
| 553 | 'שש' => ['0', '4', '4', '4', '44', '44', '44'], |
||
| 554 | 'ש' => ['0', '4', '4', '4'], |
||
| 555 | 'תג' => ['0', '34', '34', '34'], |
||
| 556 | 'תז' => ['0', '34', '34', '34'], |
||
| 557 | 'תש' => ['0', '4', '4', '4'], |
||
| 558 | 'תת' => ['0', '3', '3', '3', '4', '4', '4', '33', '33', '33', '44', '44', '44', '34', '34', '34', '43', '43', '43'], |
||
| 559 | 'ת' => ['0', '3', '3', '3', '4', '4', '4'], |
||
| 560 | // Arabic alphabet |
||
| 561 | 'ا' => ['1', '0', '', ''], |
||
| 562 | 'ب' => ['0', '7', '7', '7'], |
||
| 563 | 'ت' => ['0', '3', '3', '3'], |
||
| 564 | 'ث' => ['0', '3', '3', '3'], |
||
| 565 | 'ج' => ['0', '4', '4', '4'], |
||
| 566 | 'ح' => ['0', '5', '5', '5'], |
||
| 567 | 'خ' => ['0', '5', '5', '5'], |
||
| 568 | 'د' => ['0', '3', '3', '3'], |
||
| 569 | 'ذ' => ['0', '3', '3', '3'], |
||
| 570 | 'ر' => ['0', '9', '9', '9'], |
||
| 571 | 'ز' => ['0', '4', '4', '4'], |
||
| 572 | 'س' => ['0', '4', '4', '4'], |
||
| 573 | 'ش' => ['0', '4', '4', '4'], |
||
| 574 | 'ص' => ['0', '4', '4', '4'], |
||
| 575 | 'ض' => ['0', '3', '3', '3'], |
||
| 576 | 'ط' => ['0', '3', '3', '3'], |
||
| 577 | 'ظ' => ['0', '4', '4', '4'], |
||
| 578 | 'ع' => ['1', '0', '', ''], |
||
| 579 | 'غ' => ['0', '0', '', ''], |
||
| 580 | 'ف' => ['0', '7', '7', '7'], |
||
| 581 | 'ق' => ['0', '5', '5', '5'], |
||
| 582 | 'ك' => ['0', '5', '5', '5'], |
||
| 583 | 'ل' => ['0', '8', '8', '8'], |
||
| 584 | 'لا' => ['0', '8', '8', '8'], |
||
| 585 | 'م' => ['0', '6', '6', '6'], |
||
| 586 | 'ن' => ['0', '6', '6', '6'], |
||
| 587 | 'هن' => ['0', '66', '66', '66'], |
||
| 588 | 'ه' => ['0', '5', '5', ''], |
||
| 589 | 'و' => ['1', '', '', '', '7', '', ''], |
||
| 590 | 'ي' => ['0', '1', '', ''], |
||
| 591 | 'آ' => ['0', '1', '', ''], |
||
| 592 | 'ة' => ['0', '', '', '3'], |
||
| 593 | 'ی' => ['0', '1', '', ''], |
||
| 594 | 'ى' => ['1', '1', '', ''], |
||
| 595 | ]; |
||
| 596 | |||
/**
 * List the phonetic algorithms this class implements.
 *
 * The keys are the internal algorithm identifiers; the values are the
 * translated names returned by I18N::translate().
 *
 * @return array<string>
 */
public static function getAlgorithms(): array
{
    $algorithms = [];

    /* I18N: https://en.wikipedia.org/wiki/Soundex */
    $algorithms['std'] = I18N::translate('Russell');

    /* I18N: https://en.wikipedia.org/wiki/Daitch–Mokotoff_Soundex */
    $algorithms['dm'] = I18N::translate('Daitch-Mokotoff');

    return $algorithms;
}
||
| 611 | |||
/**
 * Do two soundex code lists share at least one code?
 *
 * Each argument is a colon-separated list of codes, in the format produced
 * by russell() and daitchMokotoff(). An empty string never matches anything.
 *
 * @param string $soundex1
 * @param string $soundex2
 *
 * @return bool
 */
public static function compare(string $soundex1, string $soundex2): bool
{
    // Nothing to compare when either side has no codes at all.
    if ($soundex1 === '' || $soundex2 === '') {
        return false;
    }

    $codes1 = explode(':', $soundex1);
    $codes2 = explode(':', $soundex2);
    $shared = array_intersect($codes1, $codes2);

    return $shared !== [];
}
||
| 628 | |||
/**
 * Generate Russell soundex codes for a given text.
 *
 * One code is produced per space-separated word, plus one for all the words
 * joined together when there is more than one. Duplicate codes are removed
 * and the result is a colon-separated list.
 *
 * Codes that carry no sound information are never returned:
 *  - '' (PHP's soundex() of an empty string, e.g. the "word" between two
 *    consecutive spaces), and
 *  - '0000' (a word containing no letters A-Z).
 * An empty code in the list would make compare() report a match between any
 * two lists that both contain one.
 *
 * @param string $text
 *
 * @return string
 */
public static function russell(string $text): string
{
    $candidates = explode(' ', $text);

    // Combine words, e.g. “New York” as “Newyork”
    if (count($candidates) > 1) {
        $candidates[] = str_replace(' ', '', $text);
    }

    $soundex_array = [];

    foreach ($candidates as $candidate) {
        $soundex = soundex($candidate);

        // Only return codes from recognisable sounds
        if ($soundex !== '' && $soundex !== '0000') {
            $soundex_array[] = $soundex;
        }
    }

    // A varchar(255) column can only hold 51 4-character codes (plus 50 delimiters)
    $soundex_array = array_slice(array_unique($soundex_array), 0, 51);

    return implode(':', $soundex_array);
}
||
| 660 | |||
/**
 * Generate Daitch–Mokotoff soundex codes for a given text.
 *
 * Codes are collected for every space-separated word and, when there is more
 * than one word, for the words joined together. Duplicates are dropped and
 * the remaining codes are returned as a colon-separated list.
 *
 * @param string $text
 *
 * @return string
 */
public static function daitchMokotoff(string $text): string
{
    $inputs = explode(' ', $text);

    // Combine words, e.g. “New York” as “Newyork”
    if (count($inputs) > 1) {
        $inputs[] = str_replace(' ', '', $text);
    }

    $codes = [];

    foreach ($inputs as $input) {
        foreach (self::daitchMokotoffWord($input) as $code) {
            $codes[] = $code;
        }
    }

    // A varchar(255) column can only hold 36 6-character codes (plus 35 delimiters)
    $codes = array_slice(array_unique($codes), 0, 36);

    return implode(':', $codes);
}
||
| 686 | |||
/**
 * Calculate the Daitch-Mokotoff soundex for a word.
 *
 * The word is upper-cased, passed through the TRANSFORM_NAMES replacements
 * and then scanned from left to right. At each position the longest matching
 * DM_SOUNDS key is consumed and its sound value is appended to every partial
 * code built so far. A table entry that lists several alternatives (repeated
 * triplets) forks each partial code into one branch per alternative.
 *
 * The returned list may contain duplicates.
 *
 * @param string $name
 *
 * @return array<string> List of possible DM codes for the word.
 */
private static function daitchMokotoffWord(string $name): array
{
    // Apply special transformation rules to the input string
    $name = I18N::strtoupper($name);

    // NOTE(review): str_replace() matches literally, so the TRANSFORM_NAMES rules
    // whose "from" value ends in "$" only fire if the name contains a literal "$".
    // They look like end-of-word anchors - confirm the intent before changing this,
    // because doing so would alter the codes generated for the affected names.
    foreach (self::TRANSFORM_NAMES as [$from, $to]) {
        $name = str_replace($from, $to, $name);
    }

    // Initialize
    $script   = I18N::textScript($name);
    $noVowels = $script === 'Hebr' || $script === 'Arab';

    $length        = strlen($name); // in bytes, like every offset below
    $offset        = 0;
    $atStart       = true;      // no table entry has been consumed yet
    $result        = [];        // accumulate complete 6-digit D-M codes here
    $partialResult = [['!']];   // incomplete codes ('!' stops the "duplicate sound" check)

    // Loop through the input string.
    // Stop when the string is exhausted or when no more partial results remain
    while ($partialResult !== [] && $offset < $length) {
        // Find the DM coding table entry for the chunk at the current position
        $key = self::longestSoundKey($name, $offset);

        if ($key === '') {
            $offset++; // Not in table: advance pointer to next byte
            continue;  // and try again
        }

        $sounds        = self::DM_SOUNDS[$key];
        $workingResult = $partialResult;
        $partialResult = [];
        $offset        += strlen($key);

        // Choose the column of the table entry to use:
        // 1: first recognised chunk of the word, 2: followed by a vowel, 3: other
        if ($atStart) {
            $state   = 1;
            $atStart = false;
        } else {
            $nextKey     = $offset < $length ? self::longestSoundKey($name, $offset) : '';
            $nextIsVowel = $nextKey !== '' && self::DM_SOUNDS[$nextKey][0] !== '0';
            $state       = $nextIsVowel ? 2 : 3;
        }

        // Visit the chosen column in every triplet of the table entry.
        // Each triplet after the first creates another branch of codes.
        for ($column = $state; $column < count($sounds); $column += 3) {
            $sound = $sounds[$column];

            if ($sound === '') {
                // empty means 'ignore this sound in this state'
                foreach ($workingResult as $parts) {
                    $parts[count($parts) - 1] .= '!'; // Prevent false 'doubles'
                    $partialResult[] = $parts;
                }

                continue;
            }

            foreach ($workingResult as $parts) {
                // A sound equal to the previous one is normally dropped.
                // For Hebrew and Arabic text the repeated sound is kept.
                if ($noVowels || $sound !== $parts[count($parts) - 1]) {
                    $parts[] = $sound;
                }

                if (count($parts) < 7) {
                    $partialResult[] = $parts;
                    continue;
                }

                // This is the 6th code in the sequence
                // We're looking for 7 entries because the first is '!' and doesn't count
                $code = self::formatDmCode($parts);

                if ($code !== '') {
                    $result[] = $code;
                }
            }
        }
    }

    // Zero-fill and copy all remaining partial results
    foreach ($partialResult as $parts) {
        $code = self::formatDmCode($parts);

        if ($code !== '') {
            $result[] = $code;
        }
    }

    return $result;
}

/**
 * Find the longest DM_SOUNDS key that starts at a given byte offset.
 *
 * @param string $name   Text to search, already upper-cased and transformed.
 * @param int    $offset Byte offset at which the key must start.
 *
 * @return string The matching key, or '' when no key starts at this offset.
 */
private static function longestSoundKey(string $name, int $offset): string
{
    // Start with the maximum length chunk and drop one byte at a time.
    for ($key = substr($name, $offset, self::MAXCHAR); $key !== ''; $key = substr($key, 0, -1)) {
        if (isset(self::DM_SOUNDS[$key])) {
            return $key;
        }
    }

    return '';
}

/**
 * Turn the accumulated sound values of one branch into a 6-character code.
 *
 * @param array<string> $parts Sound values, possibly containing '!' markers.
 *
 * @return string The code, zero-padded or truncated to 6 characters,
 *                or '' when no recognisable sound was collected.
 */
private static function formatDmCode(array $parts): string
{
    $digits = str_replace('!', '', implode('', $parts));

    // Only return codes from recognisable sounds
    if ($digits === '') {
        return '';
    }

    return substr($digits . '000000', 0, 6);
}
||
| 805 | } |
||
| 806 |