| Total Complexity | 1056 |
| Total Lines | 6147 |
| Duplicated Lines | 1.43 % |
| Changes | 0 | ||
Duplicate code is one of the most pungent code smells. A commonly used rule is to restructure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
Complex classes like abydos.phonetic often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
| 1 | # -*- coding: utf-8 -*- |
||
|
|
|||
| 2 | |||
| 3 | # Copyright 2014-2018 by Christopher C. Little. |
||
| 4 | # This file is part of Abydos. |
||
| 5 | # |
||
| 6 | # Abydos is free software: you can redistribute it and/or modify |
||
| 7 | # it under the terms of the GNU General Public License as published by |
||
| 8 | # the Free Software Foundation, either version 3 of the License, or |
||
| 9 | # (at your option) any later version. |
||
| 10 | # |
||
| 11 | # Abydos is distributed in the hope that it will be useful, |
||
| 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
| 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
| 14 | # GNU General Public License for more details. |
||
| 15 | # |
||
| 16 | # You should have received a copy of the GNU General Public License |
||
| 17 | # along with Abydos. If not, see <http://www.gnu.org/licenses/>. |
||
| 18 | |||
| 19 | """abydos.phonetic. |
||
| 20 | |||
| 21 | The phonetic module implements phonetic algorithms including: |
||
| 22 | |||
| 23 | - Robert C. Russell's Index |
||
| 24 | - American Soundex |
||
| 25 | - Refined Soundex |
||
| 26 | - Daitch-Mokotoff Soundex |
||
| 27 | - Kölner Phonetik |
||
| 28 | - NYSIIS |
||
| 29 | - Match Rating Algorithm |
||
| 30 | - Metaphone |
||
| 31 | - Double Metaphone |
||
| 32 | - Caverphone |
||
| 33 | - Alpha Search Inquiry System |
||
| 34 | - Fuzzy Soundex |
||
| 35 | - Phonex |
||
| 36 | - Phonem |
||
| 37 | - Phonix |
||
| 38 | - SfinxBis |
||
| 39 | - phonet |
||
| 40 | - Standardized Phonetic Frequency Code |
||
| 41 | - Statistics Canada |
||
| 42 | - Lein |
||
| 43 | - Roger Root |
||
| 44 | - Oxford Name Compression Algorithm (ONCA) |
||
| 45 | - Eudex phonetic hash |
||
| 46 | - Haase Phonetik |
||
| 47 | - Reth-Schek Phonetik |
||
| 48 | - FONEM |
||
| 49 | - Parmar-Kumbharana |
||
| 50 | - Davidson's Consonant Code |
||
| 51 | - SoundD |
||
| 52 | - PSHP Soundex/Viewex Coding |
||
| 53 | - an early version of Henry Code |
||
| 54 | - Norphone |
||
| 55 | - Dolby Code |
||
| 56 | - Beider-Morse Phonetic Matching |
||
| 57 | """ |
||
| 58 | |||
| 59 | from __future__ import division, unicode_literals |
||
| 60 | |||
| 61 | import re |
||
| 62 | import unicodedata |
||
| 63 | from collections import Counter |
||
| 64 | from itertools import groupby, product |
||
| 65 | |||
| 66 | from six import text_type |
||
| 67 | from six.moves import range |
||
| 68 | |||
| 69 | from ._bm import _bmpm |
||
| 70 | |||
# Positive infinity; passed as a maxlength value to mean "no length limit"
_INFINITY = float('inf')
||
| 72 | |||
| 73 | |||
| 74 | def _delete_consecutive_repeats(word): |
||
| 75 | """Delete consecutive repeated characters in a word. |
||
| 76 | |||
| 77 | :param str word: the word to transform |
||
| 78 | :returns: word with consecutive repeating characters collapsed to |
||
| 79 | a single instance |
||
| 80 | :rtype: str |
||
| 81 | """ |
||
| 82 | return ''.join(char for char, _ in groupby(word)) |
||
| 83 | |||
| 84 | |||
def russell_index(word):
    """Compute the Russell Index of a word as an integer.

    The procedure is Robert C. Russell's Index algorithm, as described in
    US Patent 1,261,167 (1917)

    :param str word: the word to transform
    :returns: the Russell Index value; NaN (a float) if no codable letter
        remains after filtering
    :rtype: int

    >>> russell_index('Christopher')
    3813428
    >>> russell_index('Niall')
    715
    >>> russell_index('Smith')
    3614
    >>> russell_index('Schmidt')
    3614
    """
    # code point -> digit table in the form str.translate expects; letters
    # absent from the table (H, J, W) have no Russell code
    code_for = {ord(letter): digit for letter, digit in
                zip('ABCDEFGIKLMNOPQRSTUVXYZ', '12341231356712383412313')}

    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = word.replace('GH', '')  # discard gh (rule 3)
    word = word.rstrip('SZ')  # strip trailing S/Z characters (rule 3)

    # keep only the letters that have a code, then map them to digits
    codable = ''.join(char for char in word if ord(char) in code_for)
    sdx = codable.translate(code_for)

    # retain the first 1 but drop every 1 that follows it; when there is no
    # 1 at all, partition leaves the string untouched
    head, first_one, tail = sdx.partition('1')
    sdx = head + first_one + tail.replace('1', '')

    # collapse runs of the same digit
    sdx = _delete_consecutive_repeats(sdx)

    if not sdx:
        return float('NaN')
    return int(sdx)
||
| 129 | |||
| 130 | |||
def russell_index_num_to_alpha(num):
    """Render a numeric Russell Index as its alphabetic equivalent.

    The procedure is Robert C. Russell's Index algorithm, as described in
    US Patent 1,261,167 (1917)

    :param int num: a Russell Index integer value
    :returns: the Russell Index as an alphabetic string
    :rtype: str

    >>> russell_index_num_to_alpha(3813428)
    'CRACDBR'
    >>> russell_index_num_to_alpha(715)
    'NAL'
    >>> russell_index_num_to_alpha(3614)
    'CMAD'
    """
    letter_for = dict(zip('12345678', 'ABCDLMNR'))
    # characters other than the digits 1-8 are skipped, so an input with no
    # such digit yields the empty string
    return ''.join(letter_for[digit] for digit in text_type(num)
                   if digit in letter_for)
||
| 155 | |||
| 156 | |||
def russell_index_alpha(word):
    """Compute the Russell Index of a word as an alphabetic string.

    The procedure is Robert C. Russell's Index algorithm, as described in
    US Patent 1,261,167 (1917)

    :param str word: the word to transform
    :returns: the Russell Index value as an alphabetic string
    :rtype: str

    >>> russell_index_alpha('Christopher')
    'CRACDBR'
    >>> russell_index_alpha('Niall')
    'NAL'
    >>> russell_index_alpha('Smith')
    'CMAD'
    >>> russell_index_alpha('Schmidt')
    'CMAD'
    """
    # a falsy word has nothing to encode
    if not word:
        return ''
    numeric_index = russell_index(word)
    return russell_index_num_to_alpha(numeric_index)
||
| 179 | |||
| 180 | |||
def soundex(word, maxlength=4, var='American', reverse=False, zero_pad=True):
    """Compute the Soundex code of a word.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 4)
    :param str var: the variant of the algorithm to employ (defaults to
        'American'):

        - 'American' follows the American Soundex algorithm, as described at
          http://www.archives.gov/publications/general-info-leaflets/55-census.html
          and in Knuth(1998:394); this is also called Miracode
        - 'special' follows the rules from the 1880-1910 US Census
          retrospective re-analysis, in which h & w are not treated as blocking
          consonants but as vowels.
          Cf. http://creativyst.com/Doc/Articles/SoundEx1/SoundEx1.htm
        - 'Census' follows the rules laid out in GIL 55 by the US Census,
          including coding prefixed and unprefixed versions of some names

    :param bool reverse: reverse the word before computing the selected Soundex
        (defaults to False); This results in "Reverse Soundex"
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Soundex value; for var='Census' and a word with one of the
        recognised prefixes, a 2-tuple of the codes with and without the
        prefix
    :rtype: str

    >>> soundex("Christopher")
    'C623'
    >>> soundex("Niall")
    'N400'
    >>> soundex('Smith')
    'S530'
    >>> soundex('Schmidt')
    'S530'


    >>> soundex('Christopher', maxlength=_INFINITY)
    'C623160000000000000000000000000000000000000000000000000000000000'
    >>> soundex('Christopher', maxlength=_INFINITY, zero_pad=False)
    'C62316'

    >>> soundex('Christopher', reverse=True)
    'R132'

    >>> soundex('Ashcroft')
    'A261'
    >>> soundex('Asicroft')
    'A226'
    >>> soundex('Ashcroft', var='special')
    'A226'
    >>> soundex('Asicroft', var='special')
    'A226'
    """
    # code point -> digit table; 0 marks vowels and Y, 9 marks H and W
    digit_for = {ord(letter): digit for letter, digit in
                 zip('ABCDEFGHIJKLMNOPQRSTUVWXYZ',
                     '01230129022455012623019202')}

    # Clamp maxlength to the range 4..64; None means the maximum of 64
    if maxlength is None:
        maxlength = 64
    else:
        maxlength = min(max(4, maxlength), 64)

    # uppercase, normalize and decompose
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    if var == 'Census':
        # Should these prefixes be supplemented? (VANDE, DELA, VON)
        # A prefixed name is coded twice: once whole and once with the
        # prefix removed. The name must be at least two characters longer
        # than the prefix.
        for prefix_len, prefixes in ((3, {'VAN', 'CON'}),
                                     (2, {'DE', 'DI', 'LA', 'LE'})):
            if word[:prefix_len] in prefixes and len(word) > prefix_len + 1:
                return (soundex(word, maxlength, 'American', reverse,
                                zero_pad),
                        soundex(word[prefix_len:], maxlength, 'American',
                                reverse, zero_pad))
        # Otherwise, proceed as usual (var='American' mode, ostensibly)

    # filter out everything that is not A-Z
    word = ''.join(char for char in word if ord(char) in digit_for)

    # Nothing left to encode: return the all-zero base case
    if not word:
        return '0' * maxlength if zero_pad else '0'

    # Reverse Soundex works on the reversed word
    if reverse:
        word = word[::-1]

    sdx = word.translate(digit_for)

    if var == 'special':
        # special rule for 1880-1910 census: H and W behave like vowels
        sdx = sdx.replace('9', '0')
    else:
        sdx = sdx.replace('9', '')  # rule 1
    sdx = _delete_consecutive_repeats(sdx)  # rule 3

    # The code begins with the initial letter itself. Unless that letter is
    # H or W, the first digit of sdx is the initial's own code and is dropped.
    initial = word[0]
    remainder = sdx if initial in 'HW' else sdx[1:]
    sdx = (initial + remainder).replace('0', '')  # rule 1

    if zero_pad:
        sdx += '0' * maxlength  # rule 4

    return sdx[:maxlength]
||
| 293 | |||
| 294 | |||
def refined_soundex(word, maxlength=_INFINITY, reverse=False, zero_pad=False,
                    retain_vowels=False):
    """Return the Refined Soundex code for a word.

    This is Soundex, but with more character classes. It was defined by
    Carolyn B. Boyce:
    https://web.archive.org/web/20010513121003/http://www.bluepoof.com:80/Soundex/info2.html

    :param word: the word to transform
    :param maxlength: the length of the code returned (defaults to unlimited)
    :param reverse: reverse the word before computing the selected Soundex
        (defaults to False); This results in "Reverse Soundex"
    :param zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :param retain_vowels: retain vowels (as 0) in the resulting code
    :returns: the Refined Soundex value; the empty string (before any zero
        padding) if the word contains no A-Z letters
    :rtype: str

    >>> refined_soundex('Christopher')
    'C393619'
    >>> refined_soundex('Niall')
    'N87'
    >>> refined_soundex('Smith')
    'S386'
    >>> refined_soundex('Schmidt')
    'S386'

    >>> refined_soundex('Christopher', retain_vowels=True)
    'C3090360109'
    >>> refined_soundex('')
    ''
    """
    # code point -> digit table; 0 marks vowels, H, W and Y
    _ref_soundex_translation = dict(zip((ord(_) for _ in
                                         'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                        '01360240043788015936020505'))

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    # Reverse word if computing Reverse Soundex
    if reverse:
        word = word[::-1]

    # apply the Soundex algorithm: the initial letter followed by the digit
    # code of every letter (including the initial). word[:1] rather than
    # word[0] so that an empty word yields '' instead of raising IndexError.
    sdx = word[:1] + word.translate(_ref_soundex_translation)
    sdx = _delete_consecutive_repeats(sdx)
    if not retain_vowels:
        sdx = sdx.replace('0', '')  # Delete vowels, H, W, Y

    if maxlength < _INFINITY:
        if zero_pad:
            sdx += ('0' * maxlength)
        # a maxlength of 0 leaves the code untruncated
        if maxlength:
            sdx = sdx[:maxlength]

    return sdx
||
| 351 | |||
| 352 | |||
def dm_soundex(word, maxlength=6, reverse=False, zero_pad=True):
    """Return the Daitch-Mokotoff Soundex code for a word.

    Returns values of a word as a set. A collection is necessary since there
    can be multiple values for a single word.

    :param word: the word to transform
    :param maxlength: the length of the code returned (defaults to 6); values
        are clamped to the range 6..64 and None is treated as 64
    :param reverse: reverse the word before computing the selected Soundex
        (defaults to False); This results in "Reverse Soundex"
    :param zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Daitch-Mokotoff Soundex values
    :rtype: set

    >>> dm_soundex('Christopher')
    {'494379', '594379'}
    >>> dm_soundex('Niall')
    {'680000'}
    >>> dm_soundex('Smith')
    {'463000'}
    >>> dm_soundex('Schmidt')
    {'463000'}

    >>> dm_soundex('The quick brown fox', maxlength=20, zero_pad=False)
    {'35457976754', '3557976754'}
    """
    # Substring -> (code at start of word, code before a vowel, code
    # elsewhere). '_' is a placeholder that is stripped from the final code;
    # a pair in place of a single code means both alternatives are generated.
    _dms_table = {'STCH': (2, 4, 4), 'DRZ': (4, 4, 4), 'ZH': (4, 4, 4),
                  'ZHDZH': (2, 4, 4), 'DZH': (4, 4, 4), 'DRS': (4, 4, 4),
                  'DZS': (4, 4, 4), 'SCHTCH': (2, 4, 4), 'SHTSH': (2, 4, 4),
                  'SZCZ': (2, 4, 4), 'TZS': (4, 4, 4), 'SZCS': (2, 4, 4),
                  'STSH': (2, 4, 4), 'SHCH': (2, 4, 4), 'D': (3, 3, 3),
                  'H': (5, 5, '_'), 'TTSCH': (4, 4, 4), 'THS': (4, 4, 4),
                  'L': (8, 8, 8), 'P': (7, 7, 7), 'CHS': (5, 54, 54),
                  'T': (3, 3, 3), 'X': (5, 54, 54), 'OJ': (0, 1, '_'),
                  'OI': (0, 1, '_'), 'SCHTSH': (2, 4, 4), 'OY': (0, 1, '_'),
                  'Y': (1, '_', '_'), 'TSH': (4, 4, 4), 'ZDZ': (2, 4, 4),
                  'TSZ': (4, 4, 4), 'SHT': (2, 43, 43), 'SCHTSCH': (2, 4, 4),
                  'TTSZ': (4, 4, 4), 'TTZ': (4, 4, 4), 'SCH': (4, 4, 4),
                  'TTS': (4, 4, 4), 'SZD': (2, 43, 43), 'AI': (0, 1, '_'),
                  'PF': (7, 7, 7), 'TCH': (4, 4, 4), 'PH': (7, 7, 7),
                  'TTCH': (4, 4, 4), 'SZT': (2, 43, 43), 'ZDZH': (2, 4, 4),
                  'EI': (0, 1, '_'), 'G': (5, 5, 5), 'EJ': (0, 1, '_'),
                  'ZD': (2, 43, 43), 'IU': (1, '_', '_'), 'K': (5, 5, 5),
                  'O': (0, '_', '_'), 'SHTCH': (2, 4, 4), 'S': (4, 4, 4),
                  'TRZ': (4, 4, 4), 'SHD': (2, 43, 43), 'DSH': (4, 4, 4),
                  'CSZ': (4, 4, 4), 'EU': (1, 1, '_'), 'TRS': (4, 4, 4),
                  'ZS': (4, 4, 4), 'STRZ': (2, 4, 4), 'UY': (0, 1, '_'),
                  'STRS': (2, 4, 4), 'CZS': (4, 4, 4),
                  'MN': ('6_6', '6_6', '6_6'), 'UI': (0, 1, '_'),
                  'UJ': (0, 1, '_'), 'UE': (0, '_', '_'), 'EY': (0, 1, '_'),
                  'W': (7, 7, 7), 'IA': (1, '_', '_'), 'FB': (7, 7, 7),
                  'STSCH': (2, 4, 4), 'SCHT': (2, 43, 43),
                  'NM': ('6_6', '6_6', '6_6'), 'SCHD': (2, 43, 43),
                  'B': (7, 7, 7), 'DSZ': (4, 4, 4), 'F': (7, 7, 7),
                  'N': (6, 6, 6), 'CZ': (4, 4, 4), 'R': (9, 9, 9),
                  'U': (0, '_', '_'), 'V': (7, 7, 7), 'CS': (4, 4, 4),
                  'Z': (4, 4, 4), 'SZ': (4, 4, 4), 'TSCH': (4, 4, 4),
                  'KH': (5, 5, 5), 'ST': (2, 43, 43), 'KS': (5, 54, 54),
                  'SH': (4, 4, 4), 'SC': (2, 4, 4), 'SD': (2, 43, 43),
                  'DZ': (4, 4, 4), 'ZHD': (2, 43, 43), 'DT': (3, 3, 3),
                  'ZSH': (4, 4, 4), 'DS': (4, 4, 4), 'TZ': (4, 4, 4),
                  'TS': (4, 4, 4), 'TH': (3, 3, 3), 'TC': (4, 4, 4),
                  'A': (0, '_', '_'), 'E': (0, '_', '_'), 'I': (0, '_', '_'),
                  'AJ': (0, 1, '_'), 'M': (6, 6, 6), 'Q': (5, 5, 5),
                  'AU': (0, 7, '_'), 'IO': (1, '_', '_'), 'AY': (0, 1, '_'),
                  'IE': (1, '_', '_'), 'ZSCH': (4, 4, 4),
                  'CH': ((5, 4), (5, 4), (5, 4)),
                  'CK': ((5, 45), (5, 45), (5, 45)),
                  'C': ((5, 4), (5, 4), (5, 4)),
                  'J': ((1, 4), ('_', 4), ('_', 4)),
                  'RZ': ((94, 4), (94, 4), (94, 4)),
                  'RS': ((94, 4), (94, 4), (94, 4))}

    # Initial letter -> candidate substrings, in the order they are tried;
    # the first candidate that matches at the current position is used.
    # NOTE: single-element entries such as ('B') are plain strings, not
    # tuples; iterating them still yields the one-character candidate.
    _dms_order = {'A': ('AI', 'AJ', 'AU', 'AY', 'A'),
                  'B': ('B'),
                  'C': ('CHS', 'CSZ', 'CZS', 'CH', 'CK', 'CS', 'CZ', 'C'),
                  'D': ('DRS', 'DRZ', 'DSH', 'DSZ', 'DZH', 'DZS', 'DS', 'DT',
                        'DZ', 'D'),
                  'E': ('EI', 'EJ', 'EU', 'EY', 'E'),
                  'F': ('FB', 'F'),
                  'G': ('G'),
                  'H': ('H'),
                  'I': ('IA', 'IE', 'IO', 'IU', 'I'),
                  'J': ('J'),
                  'K': ('KH', 'KS', 'K'),
                  'L': ('L'),
                  'M': ('MN', 'M'),
                  'N': ('NM', 'N'),
                  'O': ('OI', 'OJ', 'OY', 'O'),
                  'P': ('PF', 'PH', 'P'),
                  'Q': ('Q'),
                  'R': ('RS', 'RZ', 'R'),
                  'S': ('SCHTSCH', 'SCHTCH', 'SCHTSH', 'SHTCH', 'SHTSH',
                        'STSCH', 'SCHD', 'SCHT', 'SHCH', 'STCH', 'STRS',
                        'STRZ', 'STSH', 'SZCS', 'SZCZ', 'SCH', 'SHD', 'SHT',
                        'SZD', 'SZT', 'SC', 'SD', 'SH', 'ST', 'SZ', 'S'),
                  'T': ('TTSCH', 'TSCH', 'TTCH', 'TTSZ', 'TCH', 'THS', 'TRS',
                        'TRZ', 'TSH', 'TSZ', 'TTS', 'TTZ', 'TZS', 'TC', 'TH',
                        'TS', 'TZ', 'T'),
                  'U': ('UE', 'UI', 'UJ', 'UY', 'U'),
                  'V': ('V'),
                  'W': ('W'),
                  'X': ('X'),
                  'Y': ('Y'),
                  'Z': ('ZHDZH', 'ZDZH', 'ZSCH', 'ZDZ', 'ZHD', 'ZSH', 'ZD',
                        'ZH', 'ZS', 'Z')}

    # Letters that select the "before a vowel" code variant
    _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}
    dms = ['']  # initialize empty code list

    # Require a maxlength of at least 6 and not more than 64
    if maxlength is not None:
        maxlength = min(max(6, maxlength), 64)
    else:
        maxlength = 64

    # uppercase, normalize, decompose, and filter non-A-Z
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    # Nothing to convert, return base case
    if not word:
        if zero_pad:
            return {'0'*maxlength}
        return {'0'}

    # Reverse word if computing Reverse Soundex
    if reverse:
        word = word[::-1]

    pos = 0
    while pos < len(word):
        # Iterate through _dms_order, which specifies the possible substrings
        # for which codes exist in the Daitch-Mokotoff coding
        for sstr in _dms_order[word[pos]]:  # pragma: no branch
            if word[pos:].startswith(sstr):
                # Having determined a valid substring start, retrieve the code
                dm_val = _dms_table[sstr]

                # Having retrieved the code (triple), determine the correct
                # positional variant (first, pre-vocalic, elsewhere)
                if pos == 0:
                    dm_val = dm_val[0]
                elif (pos+len(sstr) < len(word) and
                      word[pos+len(sstr)] in _vowels):
                    dm_val = dm_val[1]
                else:
                    dm_val = dm_val[2]

                # Build the code strings; a tuple value holds two
                # alternatives, so every code built so far is extended once
                # with each alternative
                if isinstance(dm_val, tuple):
                    dms = [_ + text_type(dm_val[0]) for _ in dms] \
                            + [_ + text_type(dm_val[1]) for _ in dms]
                else:
                    dms = [_ + text_type(dm_val) for _ in dms]
                # continue after the matched substring
                pos += len(sstr)
                break

    # Filter out double letters and _ placeholders
    dms = (''.join(c for c in _delete_consecutive_repeats(_) if c != '_')
           for _ in dms)

    # Trim codes and return set
    if zero_pad:
        dms = ((_ + ('0'*maxlength))[:maxlength] for _ in dms)
    else:
        dms = (_[:maxlength] for _ in dms)
    return set(dms)
||
| 526 | |||
| 527 | |||
def koelner_phonetik(word):
    """Compute the Kölner Phonetik code of a word as a numeric string.

    Based on the algorithm described at
    https://de.wikipedia.org/wiki/Kölner_Phonetik

    While the output code is numeric, it is still a str because 0s can lead
    the code.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as a numeric string
    :rtype: str

    >>> koelner_phonetik('Christopher')
    '478237'
    >>> koelner_phonetik('Niall')
    '65'
    >>> koelner_phonetik('Smith')
    '862'
    >>> koelner_phonetik('Schmidt')
    '862'
    >>> koelner_phonetik('Müller')
    '657'
    >>> koelner_phonetik('Zimmermann')
    '86766'
    """
    # pylint: disable=too-many-branches
    # Letters whose code does not depend on neighbouring letters. C, D, P, T
    # and X are context-sensitive and handled in the loop; H has no code.
    context_free = {}
    for letters, digit in (('AEIJOUY', '0'), ('B', '1'), ('FVW', '3'),
                           ('GKQ', '4'), ('L', '5'), ('MN', '6'),
                           ('R', '7'), ('SZ', '8')):
        for letter in letters:
            context_free[letter] = digit

    uppercase = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    # NOTE(review): NFKD above presumably has already split precomposed
    # umlauts into base letter + combining mark, in which case these three
    # replacements never match - confirm
    word = word.replace('Ä', 'AE')
    word = word.replace('Ö', 'OE')
    word = word.replace('Ü', 'UE')
    word = ''.join(char for char in word if char in uppercase)

    # Nothing to convert, return base case
    if not word:
        return ''

    pieces = []
    final = len(word) - 1
    for pos, char in enumerate(word):
        # neighbours; None at the word edges never matches a letter set
        before = word[pos - 1] if pos else None
        after = word[pos + 1] if pos < final else None

        if char in context_free:
            code = context_free[char]
        elif char == 'P':
            code = '3' if after == 'H' else '1'
        elif char in {'D', 'T'}:
            code = '8' if after in {'C', 'S', 'Z'} else '2'
        elif char == 'C':
            if before in {'S', 'Z'}:
                code = '8'
            elif pos == 0:
                # word-initial C accepts two extra following letters (L, R)
                code = '4' if after in {'A', 'H', 'K', 'L', 'O', 'Q', 'R',
                                        'U', 'X'} else '8'
            else:
                code = '4' if after in {'A', 'H', 'K', 'O', 'Q', 'U',
                                        'X'} else '8'
        elif char == 'X':
            code = '8' if before in {'C', 'K', 'Q'} else '48'
        else:
            code = ''  # H contributes nothing
        pieces.append(code)

    sdx = _delete_consecutive_repeats(''.join(pieces))

    # a 0 is kept only in the leading position
    if sdx:
        sdx = sdx[0] + sdx[1:].replace('0', '')

    return sdx
||
| 638 | |||
| 639 | |||
def koelner_phonetik_num_to_alpha(num):
    """Render a numeric Kölner Phonetik code as an alphabetic one.

    :param str num: a numeric Kölner Phonetik representation
    :returns: an alphabetic representation of the same word
    :rtype: str

    >>> koelner_phonetik_num_to_alpha(862)
    'SNT'
    >>> koelner_phonetik_num_to_alpha(657)
    'NLR'
    >>> koelner_phonetik_num_to_alpha(86766)
    'SNRNN'
    """
    letter_for = dict(zip('012345678', 'APTFKLNRS'))
    # anything that is not one of the digits 0-8 is skipped
    return ''.join(letter_for[digit] for digit in text_type(num)
                   if digit in letter_for)
||
| 659 | |||
| 660 | |||
def koelner_phonetik_alpha(word):
    """Compute the Kölner Phonetik code of a word as an alphabetic string.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as an alphabetic string
    :rtype: str

    >>> koelner_phonetik_alpha('Smith')
    'SNT'
    >>> koelner_phonetik_alpha('Schmidt')
    'SNT'
    >>> koelner_phonetik_alpha('Müller')
    'NLR'
    >>> koelner_phonetik_alpha('Zimmermann')
    'SNRNN'
    """
    # encode numerically first, then map the digits onto letters
    numeric_code = koelner_phonetik(word)
    return koelner_phonetik_num_to_alpha(numeric_code)
||
| 678 | |||
| 679 | |||
| 680 | def nysiis(word, maxlength=6, modified=False): |
||
| 681 | """Return the NYSIIS code for a word. |
||
| 682 | |||
| 683 | A description of the New York State Identification and Intelligence System |
||
| 684 | algorithm can be found at |
||
| 685 | https://en.wikipedia.org/wiki/New_York_State_Identification_and_Intelligence_System |
||
| 686 | |||
| 687 | The modified version of this algorithm is described in Appendix B of |
||
| 688 | Lynch, Billy T. and William L. Arends. `Selection of a Surname Coding |
||
| 689 | Procedure for the SRS Record Linkage System.` Statistical Reporting |
||
| 690 | Service, U.S. Department of Agriculture, Washington, D.C. February 1977. |
||
| 691 | https://naldc.nal.usda.gov/download/27833/PDF |
||
| 692 | |||
| 693 | :param str word: the word to transform |
||
| 694 | :param int maxlength: the maximum length (default 6) of the code to return |
||
| 695 | :param bool modified: indicates whether to use USDA modified NYSIIS |
||
| 696 | :returns: the NYSIIS value |
||
| 697 | :rtype: str |
||
| 698 | |||
| 699 | >>> nysiis('Christopher') |
||
| 700 | 'CRASTA' |
||
| 701 | >>> nysiis('Niall') |
||
| 702 | 'NAL' |
||
| 703 | >>> nysiis('Smith') |
||
| 704 | 'SNAT' |
||
| 705 | >>> nysiis('Schmidt') |
||
| 706 | 'SNAD' |
||
| 707 | |||
| 708 | >>> nysiis('Christopher', maxlength=_INFINITY) |
||
| 709 | 'CRASTAFAR' |
||
| 710 | |||
| 711 | >>> nysiis('Christopher', maxlength=8, modified=True) |
||
| 712 | 'CRASTAFA' |
||
| 713 | >>> nysiis('Niall', maxlength=8, modified=True) |
||
| 714 | 'NAL' |
||
| 715 | >>> nysiis('Smith', maxlength=8, modified=True) |
||
| 716 | 'SNAT' |
||
| 717 | >>> nysiis('Schmidt', maxlength=8, modified=True) |
||
| 718 | 'SNAD' |
||
| 719 | """ |
||
| 720 | # Require a maxlength of at least 6 |
||
| 721 | if maxlength: |
||
| 722 | maxlength = max(6, maxlength) |
||
| 723 | |||
| 724 | _vowels = {'A', 'E', 'I', 'O', 'U'} |
||
| 725 | |||
| 726 | word = ''.join(c for c in word.upper() if c.isalpha()) |
||
| 727 | word = word.replace('ß', 'SS') |
||
| 728 | |||
| 729 | # exit early if there are no alphas |
||
| 730 | if not word: |
||
| 731 | return '' |
||
| 732 | |||
| 733 | if modified: |
||
| 734 | original_first_char = word[0] |
||
| 735 | |||
| 736 | if word[:3] == 'MAC': |
||
| 737 | word = 'MCC'+word[3:] |
||
| 738 | elif word[:2] == 'KN': |
||
| 739 | word = 'NN'+word[2:] |
||
| 740 | elif word[:1] == 'K': |
||
| 741 | word = 'C'+word[1:] |
||
| 742 | elif word[:2] in {'PH', 'PF'}: |
||
| 743 | word = 'FF'+word[2:] |
||
| 744 | elif word[:3] == 'SCH': |
||
| 745 | word = 'SSS'+word[3:] |
||
| 746 | elif modified: |
||
| 747 | if word[:2] == 'WR': |
||
| 748 | word = 'RR'+word[2:] |
||
| 749 | elif word[:2] == 'RH': |
||
| 750 | word = 'RR'+word[2:] |
||
| 751 | elif word[:2] == 'DG': |
||
| 752 | word = 'GG'+word[2:] |
||
| 753 | elif word[:1] in _vowels: |
||
| 754 | word = 'A'+word[1:] |
||
| 755 | |||
| 756 | if modified and word[-1] in {'S', 'Z'}: |
||
| 757 | word = word[:-1] |
||
| 758 | |||
| 759 | if word[-2:] == 'EE' or word[-2:] == 'IE' or (modified and |
||
| 760 | word[-2:] == 'YE'): |
||
| 761 | word = word[:-2]+'Y' |
||
| 762 | elif word[-2:] in {'DT', 'RT', 'RD'}: |
||
| 763 | word = word[:-2]+'D' |
||
| 764 | elif word[-2:] in {'NT', 'ND'}: |
||
| 765 | word = word[:-2]+('N' if modified else 'D') |
||
| 766 | elif modified: |
||
| 767 | if word[-2:] == 'IX': |
||
| 768 | word = word[:-2]+'ICK' |
||
| 769 | elif word[-2:] == 'EX': |
||
| 770 | word = word[:-2]+'ECK' |
||
| 771 | elif word[-2:] in {'JR', 'SR'}: |
||
| 772 | return 'ERROR' # TODO: decide how best to return an error |
||
| 773 | |||
| 774 | key = word[0] |
||
| 775 | |||
| 776 | skip = 0 |
||
| 777 | for i in range(1, len(word)): |
||
| 778 | if i >= len(word): |
||
| 779 | continue |
||
| 780 | elif skip: |
||
| 781 | skip -= 1 |
||
| 782 | continue |
||
| 783 | elif word[i:i+2] == 'EV': |
||
| 784 | word = word[:i] + 'AF' + word[i+2:] |
||
| 785 | skip = 1 |
||
| 786 | elif word[i] in _vowels: |
||
| 787 | word = word[:i] + 'A' + word[i+1:] |
||
| 788 | elif modified and i != len(word)-1 and word[i] == 'Y': |
||
| 789 | word = word[:i] + 'A' + word[i+1:] |
||
| 790 | elif word[i] == 'Q': |
||
| 791 | word = word[:i] + 'G' + word[i+1:] |
||
| 792 | elif word[i] == 'Z': |
||
| 793 | word = word[:i] + 'S' + word[i+1:] |
||
| 794 | elif word[i] == 'M': |
||
| 795 | word = word[:i] + 'N' + word[i+1:] |
||
| 796 | elif word[i:i+2] == 'KN': |
||
| 797 | word = word[:i] + 'N' + word[i+2:] |
||
| 798 | elif word[i] == 'K': |
||
| 799 | word = word[:i] + 'C' + word[i+1:] |
||
| 800 | elif modified and i == len(word)-3 and word[i:i+3] == 'SCH': |
||
| 801 | word = word[:i] + 'SSA' |
||
| 802 | skip = 2 |
||
| 803 | elif word[i:i+3] == 'SCH': |
||
| 804 | word = word[:i] + 'SSS' + word[i+3:] |
||
| 805 | skip = 2 |
||
| 806 | elif modified and i == len(word)-2 and word[i:i+2] == 'SH': |
||
| 807 | word = word[:i] + 'SA' |
||
| 808 | skip = 1 |
||
| 809 | elif word[i:i+2] == 'SH': |
||
| 810 | word = word[:i] + 'SS' + word[i+2:] |
||
| 811 | skip = 1 |
||
| 812 | elif word[i:i+2] == 'PH': |
||
| 813 | word = word[:i] + 'FF' + word[i+2:] |
||
| 814 | skip = 1 |
||
| 815 | elif modified and word[i:i+3] == 'GHT': |
||
| 816 | word = word[:i] + 'TTT' + word[i+3:] |
||
| 817 | skip = 2 |
||
| 818 | elif modified and word[i:i+2] == 'DG': |
||
| 819 | word = word[:i] + 'GG' + word[i+2:] |
||
| 820 | skip = 1 |
||
| 821 | elif modified and word[i:i+2] == 'WR': |
||
| 822 | word = word[:i] + 'RR' + word[i+2:] |
||
| 823 | skip = 1 |
||
| 824 | elif word[i] == 'H' and (word[i-1] not in _vowels or |
||
| 825 | word[i+1:i+2] not in _vowels): |
||
| 826 | word = word[:i] + word[i-1] + word[i+1:] |
||
| 827 | elif word[i] == 'W' and word[i-1] in _vowels: |
||
| 828 | word = word[:i] + word[i-1] + word[i+1:] |
||
| 829 | |||
| 830 | if word[i:i+skip+1] != key[-1:]: |
||
| 831 | key += word[i:i+skip+1] |
||
| 832 | |||
| 833 | key = _delete_consecutive_repeats(key) |
||
| 834 | |||
| 835 | if key[-1] == 'S': |
||
| 836 | key = key[:-1] |
||
| 837 | if key[-2:] == 'AY': |
||
| 838 | key = key[:-2] + 'Y' |
||
| 839 | if key[-1:] == 'A': |
||
| 840 | key = key[:-1] |
||
| 841 | if modified and key[0] == 'A': |
||
| 842 | key = original_first_char + key[1:] |
||
| 843 | |||
| 844 | if maxlength and maxlength < _INFINITY: |
||
| 845 | key = key[:maxlength] |
||
| 846 | |||
| 847 | return key |
||
| 848 | |||
| 849 | |||
def mra(word):
    """Return the MRA personal numeric identifier (PNI) for a word.

    The Western Airlines Surname Match Rating Algorithm is described on
    page 18 of
    https://archive.org/details/accessingindivid00moor

    The word is upper-cased, 'ß' is expanded to 'SS', every vowel after the
    first character is dropped, runs of repeated characters are collapsed,
    and codes longer than six characters are reduced to their first three
    plus their last three characters.

    :param str word: the word to transform
    :returns: the MRA PNI (a falsy input is returned unchanged)
    :rtype: str

    >>> mra('Christopher')
    'CHRPHR'
    >>> mra('Niall')
    'NL'
    >>> mra('Smith')
    'SMTH'
    >>> mra('Schmidt')
    'SCHMDT'
    """
    # Nothing to encode: hand the (empty/None) input straight back
    if not word:
        return word

    vowels = ('A', 'E', 'I', 'O', 'U')
    upper = word.upper().replace('ß', 'SS')

    # The leading character is always kept, even when it is a vowel
    head, tail = upper[0], upper[1:]
    consonants = [char for char in tail if char not in vowels]

    code = _delete_consecutive_repeats(head + ''.join(consonants))

    # Codes are capped at six characters: first three + last three
    if len(code) <= 6:
        return code
    return code[:3] + code[-3:]
||
| 880 | |||
| 881 | |||
def metaphone(word, maxlength=_INFINITY):
    """Return the Metaphone code for a word.

    Based on Lawrence Philips' Pick BASIC code from 1990:
    http://aspell.net/metaphone/metaphone.basic
    This incorporates some corrections to the above code, particularly
    some of those suggested by Michael Kuhn in:
    http://aspell.net/metaphone/metaphone-kuhn.txt

    :param str word: the word to transform
    :param int maxlength: the maximum length of the returned Metaphone code
        (defaults to unlimited, but in Philips' original implementation
        this was 4); values below 4 are raised to 4 and None is treated
        as 64
    :returns: the Metaphone value, never longer than maxlength
    :rtype: str


    >>> metaphone('Christopher')
    'KRSTFR'
    >>> metaphone('Niall')
    'NL'
    >>> metaphone('Smith')
    'SM0'
    >>> metaphone('Schmidt')
    'SKMTT'
    >>> metaphone('Braddox', 4)
    'BRTK'
    """
    # pylint: disable=too-many-branches
    _vowels = {'A', 'E', 'I', 'O', 'U'}
    _frontv = {'E', 'I', 'Y'}
    # As in variable sound--those modified by adding an "h"
    _varson = {'C', 'G', 'P', 'S', 'T'}

    # Require a maxlength of at least 4 (None falls back to 64)
    if maxlength is not None:
        maxlength = max(4, maxlength)
    else:
        maxlength = 64

    # Delete nonalphanumeric characters and make all caps
    ename = ''.join(c for c in word.upper() if c.isalnum())
    ename = ename.replace('ß', 'SS')

    # exit early if nothing is left to encode
    if not ename:
        return ''

    # Normalize special word-initial letter combinations
    if ename[0:2] in {'PN', 'AE', 'KN', 'GN', 'WR'}:
        ename = ename[1:]
    elif ename[0] == 'X':
        ename = 'S' + ename[1:]
    elif ename[0:2] == 'WH':
        ename = 'W' + ename[2:]

    # Convert to metaph
    elen = len(ename)-1  # index of the last character
    metaph = ''
    for i in range(len(ename)):
        if len(metaph) >= maxlength:
            break

        char = ename[i]

        # Skip doubled letters, except G and T which have their own rules
        if char not in {'G', 'T'} and i > 0 and ename[i-1] == char:
            continue

        # Vowels are only encoded when they start the word
        if char in _vowels and i == 0:
            metaph = char

        elif char == 'B':
            # Final B after M (e.g. "-MB") is silent
            if i != elen or ename[i-1] != 'M':
                metaph += char

        elif char == 'C':
            # C is silent in S + C + front vowel
            if not (i > 0 and ename[i-1] == 'S' and
                    ename[i+1:i+2] in _frontv):
                if ename[i+1:i+3] == 'IA':
                    metaph += 'X'
                elif ename[i+1:i+2] in _frontv:
                    metaph += 'S'
                elif i > 0 and ename[i-1:i+2] == 'SCH':
                    metaph += 'K'
                elif ename[i+1:i+2] == 'H':
                    if (i == 0 and i+1 < elen and
                            ename[i+2:i+3] not in _vowels):
                        metaph += 'K'
                    else:
                        metaph += 'X'
                else:
                    metaph += 'K'

        elif char == 'D':
            if ename[i+1:i+2] == 'G' and ename[i+2:i+3] in _frontv:
                metaph += 'J'
            else:
                metaph += 'T'

        elif char == 'G':
            if ename[i+1:i+2] == 'H' and not (i+1 == elen or
                                               ename[i+2:i+3] not in _vowels):
                continue
            elif i > 0 and ((i+1 == elen and ename[i+1] == 'N') or
                            (i+3 == elen and ename[i+1:i+4] == 'NED')):
                continue
            elif (i-1 > 0 and i+1 <= elen and ename[i-1] == 'D' and
                  ename[i+1] in _frontv):
                continue
            elif ename[i+1:i+2] == 'G':
                continue
            elif ename[i+1:i+2] in _frontv:
                if i == 0 or ename[i-1] != 'G':
                    metaph += 'J'
                else:
                    metaph += 'K'
            else:
                metaph += 'K'

        elif char == 'H':
            if (i > 0 and ename[i-1] in _vowels and
                    ename[i+1:i+2] not in _vowels):
                continue
            elif i > 0 and ename[i-1] in _varson:
                # The H was already accounted for by the preceding consonant
                continue
            else:
                metaph += 'H'

        elif char in {'F', 'J', 'L', 'M', 'N', 'R'}:
            metaph += char

        elif char == 'K':
            # K after C is silent ("CK")
            if i > 0 and ename[i-1] == 'C':
                continue
            else:
                metaph += 'K'

        elif char == 'P':
            if ename[i+1:i+2] == 'H':
                metaph += 'F'
            else:
                metaph += 'P'

        elif char == 'Q':
            metaph += 'K'

        elif char == 'S':
            if (i > 0 and i+2 <= elen and ename[i+1] == 'I' and
                    ename[i+2] in 'OA'):
                metaph += 'X'
            elif ename[i+1:i+2] == 'H':
                metaph += 'X'
            else:
                metaph += 'S'

        elif char == 'T':
            if (i > 0 and i+2 <= elen and ename[i+1] == 'I' and
                    ename[i+2] in {'A', 'O'}):
                metaph += 'X'
            elif ename[i+1:i+2] == 'H':
                # '0' stands for the "th" sound
                metaph += '0'
            elif ename[i+1:i+3] != 'CH':
                if ename[i-1:i] != 'T':
                    metaph += 'T'

        elif char == 'V':
            metaph += 'F'

        elif char in 'WY':
            # W and Y are only kept before a vowel
            if ename[i+1:i+2] in _vowels:
                metaph += char

        elif char == 'X':
            metaph += 'KS'

        elif char == 'Z':
            metaph += 'S'

    # 'X' appends two symbols ('KS'), so the code can overshoot maxlength by
    # one; trim it.  The comparison is False for an infinite maxlength, so
    # int() is only ever applied to a finite bound.
    if len(metaph) > maxlength:
        metaph = metaph[:int(maxlength)]

    return metaph
||
| 1052 | |||
| 1053 | |||
| 1054 | def double_metaphone(word, maxlength=_INFINITY): |
||
| 1055 | """Return the Double Metaphone code for a word. |
||
| 1056 | |||
| 1057 | Based on Lawrence Philips' (Visual) C++ code from 1999: |
||
| 1058 | http://aspell.net/metaphone/dmetaph.cpp |
||
| 1059 | |||
| 1060 | :param word: the word to transform |
||
| 1061 | :param maxlength: the maximum length of the returned Double Metaphone codes |
||
| 1062 | (defaults to unlimited, but in Philips' original implementation this |
||
| 1063 | was 4) |
||
| 1064 | :returns: the Double Metaphone value(s) |
||
| 1065 | :rtype: tuple |
||
| 1066 | |||
| 1067 | >>> double_metaphone('Christopher') |
||
| 1068 | ('KRSTFR', '') |
||
| 1069 | >>> double_metaphone('Niall') |
||
| 1070 | ('NL', '') |
||
| 1071 | >>> double_metaphone('Smith') |
||
| 1072 | ('SM0', 'XMT') |
||
| 1073 | >>> double_metaphone('Schmidt') |
||
| 1074 | ('XMT', 'SMT') |
||
| 1075 | """ |
||
| 1076 | # pylint: disable=too-many-branches |
||
| 1077 | # Require a maxlength of at least 4 |
||
| 1078 | if maxlength is not None: |
||
| 1079 | maxlength = max(4, maxlength) |
||
| 1080 | else: |
||
| 1081 | maxlength = 64 |
||
| 1082 | |||
| 1083 | primary = '' |
||
| 1084 | secondary = '' |
||
| 1085 | |||
| 1086 | def _slavo_germanic(): |
||
| 1087 | """Return True if the word appears to be Slavic or Germanic.""" |
||
| 1088 | if 'W' in word or 'K' in word or 'CZ' in word: |
||
| 1089 | return True |
||
| 1090 | return False |
||
| 1091 | |||
| 1092 | def _metaph_add(pri, sec=''): |
||
| 1093 | """Return a new metaphone tuple with the supplied elements.""" |
||
| 1094 | newpri = primary |
||
| 1095 | newsec = secondary |
||
| 1096 | if pri: |
||
| 1097 | newpri += pri |
||
| 1098 | if sec: |
||
| 1099 | if sec != ' ': |
||
| 1100 | newsec += sec |
||
| 1101 | else: |
||
| 1102 | newsec += pri |
||
| 1103 | return (newpri, newsec) |
||
| 1104 | |||
| 1105 | def _is_vowel(pos): |
||
| 1106 | """Return True if the character at word[pos] is a vowel.""" |
||
| 1107 | if pos >= 0 and word[pos] in {'A', 'E', 'I', 'O', 'U', 'Y'}: |
||
| 1108 | return True |
||
| 1109 | return False |
||
| 1110 | |||
| 1111 | def _get_at(pos): |
||
| 1112 | """Return the character at word[pos].""" |
||
| 1113 | return word[pos] |
||
| 1114 | |||
| 1115 | def _string_at(pos, slen, substrings): |
||
| 1116 | """Return True if word[pos:pos+slen] is in substrings.""" |
||
| 1117 | if pos < 0: |
||
| 1118 | return False |
||
| 1119 | return word[pos:pos+slen] in substrings |
||
| 1120 | |||
| 1121 | current = 0 |
||
| 1122 | length = len(word) |
||
| 1123 | if length < 1: |
||
| 1124 | return ('', '') |
||
| 1125 | last = length - 1 |
||
| 1126 | |||
| 1127 | word = word.upper() |
||
| 1128 | word = word.replace('ß', 'SS') |
||
| 1129 | |||
| 1130 | # Pad the original string so that we can index beyond the edge of the world |
||
| 1131 | word += ' ' |
||
| 1132 | |||
| 1133 | # Skip these when at start of word |
||
| 1134 | if word[0:2] in {'GN', 'KN', 'PN', 'WR', 'PS'}: |
||
| 1135 | current += 1 |
||
| 1136 | |||
| 1137 | # Initial 'X' is pronounced 'Z' e.g. 'Xavier' |
||
| 1138 | if _get_at(0) == 'X': |
||
| 1139 | (primary, secondary) = _metaph_add('S') # 'Z' maps to 'S' |
||
| 1140 | current += 1 |
||
| 1141 | |||
| 1142 | # Main loop |
||
| 1143 | while True: |
||
| 1144 | if current >= length: |
||
| 1145 | break |
||
| 1146 | |||
| 1147 | if _get_at(current) in {'A', 'E', 'I', 'O', 'U', 'Y'}: |
||
| 1148 | if current == 0: |
||
| 1149 | # All init vowels now map to 'A' |
||
| 1150 | (primary, secondary) = _metaph_add('A') |
||
| 1151 | current += 1 |
||
| 1152 | continue |
||
| 1153 | |||
| 1154 | elif _get_at(current) == 'B': |
||
| 1155 | # "-mb", e.g", "dumb", already skipped over... |
||
| 1156 | (primary, secondary) = _metaph_add('P') |
||
| 1157 | if _get_at(current + 1) == 'B': |
||
| 1158 | current += 2 |
||
| 1159 | else: |
||
| 1160 | current += 1 |
||
| 1161 | continue |
||
| 1162 | |||
| 1163 | elif _get_at(current) == 'Ç': |
||
| 1164 | (primary, secondary) = _metaph_add('S') |
||
| 1165 | current += 1 |
||
| 1166 | continue |
||
| 1167 | |||
| 1168 | elif _get_at(current) == 'C': |
||
| 1169 | # Various Germanic |
||
| 1170 | if (current > 1 and not _is_vowel(current - 2) and |
||
| 1171 | _string_at((current - 1), 3, {'ACH'}) and |
||
| 1172 | ((_get_at(current + 2) != 'I') and |
||
| 1173 | ((_get_at(current + 2) != 'E') or |
||
| 1174 | _string_at((current - 2), 6, |
||
| 1175 | {'BACHER', 'MACHER'})))): |
||
| 1176 | (primary, secondary) = _metaph_add('K') |
||
| 1177 | current += 2 |
||
| 1178 | continue |
||
| 1179 | |||
| 1180 | # Special case 'caesar' |
||
| 1181 | elif current == 0 and _string_at(current, 6, {'CAESAR'}): |
||
| 1182 | (primary, secondary) = _metaph_add('S') |
||
| 1183 | current += 2 |
||
| 1184 | continue |
||
| 1185 | |||
| 1186 | # Italian 'chianti' |
||
| 1187 | elif _string_at(current, 4, {'CHIA'}): |
||
| 1188 | (primary, secondary) = _metaph_add('K') |
||
| 1189 | current += 2 |
||
| 1190 | continue |
||
| 1191 | |||
| 1192 | elif _string_at(current, 2, {'CH'}): |
||
| 1193 | # Find 'Michael' |
||
| 1194 | if current > 0 and _string_at(current, 4, {'CHAE'}): |
||
| 1195 | (primary, secondary) = _metaph_add('K', 'X') |
||
| 1196 | current += 2 |
||
| 1197 | continue |
||
| 1198 | |||
| 1199 | # Greek roots e.g. 'chemistry', 'chorus' |
||
| 1200 | elif (current == 0 and |
||
| 1201 | (_string_at((current + 1), 5, |
||
| 1202 | {'HARAC', 'HARIS'}) or |
||
| 1203 | _string_at((current + 1), 3, |
||
| 1204 | {'HOR', 'HYM', 'HIA', 'HEM'})) and |
||
| 1205 | not _string_at(0, 5, {'CHORE'})): |
||
| 1206 | (primary, secondary) = _metaph_add('K') |
||
| 1207 | current += 2 |
||
| 1208 | continue |
||
| 1209 | |||
| 1210 | # Germanic, Greek, or otherwise 'ch' for 'kh' sound |
||
| 1211 | elif ((_string_at(0, 4, {'VAN ', 'VON '}) or |
||
| 1212 | _string_at(0, 3, {'SCH'})) or |
||
| 1213 | # 'architect but not 'arch', 'orchestra', 'orchid' |
||
| 1214 | _string_at((current - 2), 6, |
||
| 1215 | {'ORCHES', 'ARCHIT', 'ORCHID'}) or |
||
| 1216 | _string_at((current + 2), 1, {'T', 'S'}) or |
||
| 1217 | ((_string_at((current - 1), 1, |
||
| 1218 | {'A', 'O', 'U', 'E'}) or |
||
| 1219 | (current == 0)) and |
||
| 1220 | # e.g., 'wachtler', 'wechsler', but not 'tichner' |
||
| 1221 | _string_at((current + 2), 1, |
||
| 1222 | {'L', 'R', 'N', 'M', 'B', 'H', 'F', 'V', 'W', |
||
| 1223 | ' '}))): |
||
| 1224 | (primary, secondary) = _metaph_add('K') |
||
| 1225 | |||
| 1226 | else: |
||
| 1227 | if current > 0: |
||
| 1228 | if _string_at(0, 2, {'MC'}): |
||
| 1229 | # e.g., "McHugh" |
||
| 1230 | (primary, secondary) = _metaph_add('K') |
||
| 1231 | else: |
||
| 1232 | (primary, secondary) = _metaph_add('X', 'K') |
||
| 1233 | else: |
||
| 1234 | (primary, secondary) = _metaph_add('X') |
||
| 1235 | |||
| 1236 | current += 2 |
||
| 1237 | continue |
||
| 1238 | |||
| 1239 | # e.g, 'czerny' |
||
| 1240 | elif (_string_at(current, 2, {'CZ'}) and |
||
| 1241 | not _string_at((current - 2), 4, {'WICZ'})): |
||
| 1242 | (primary, secondary) = _metaph_add('S', 'X') |
||
| 1243 | current += 2 |
||
| 1244 | continue |
||
| 1245 | |||
| 1246 | # e.g., 'focaccia' |
||
| 1247 | elif _string_at((current + 1), 3, {'CIA'}): |
||
| 1248 | (primary, secondary) = _metaph_add('X') |
||
| 1249 | current += 3 |
||
| 1250 | |||
| 1251 | # double 'C', but not if e.g. 'McClellan' |
||
| 1252 | elif (_string_at(current, 2, {'CC'}) and |
||
| 1253 | not ((current == 1) and (_get_at(0) == 'M'))): |
||
| 1254 | # 'bellocchio' but not 'bacchus' |
||
| 1255 | if ((_string_at((current + 2), 1, |
||
| 1256 | {'I', 'E', 'H'}) and |
||
| 1257 | not _string_at((current + 2), 2, ['HU']))): |
||
| 1258 | # 'accident', 'accede' 'succeed' |
||
| 1259 | if ((((current == 1) and _get_at(current - 1) == 'A') or |
||
| 1260 | _string_at((current - 1), 5, |
||
| 1261 | {'UCCEE', 'UCCES'}))): |
||
| 1262 | (primary, secondary) = _metaph_add('KS') |
||
| 1263 | # 'bacci', 'bertucci', other italian |
||
| 1264 | else: |
||
| 1265 | (primary, secondary) = _metaph_add('X') |
||
| 1266 | current += 3 |
||
| 1267 | continue |
||
| 1268 | else: # Pierce's rule |
||
| 1269 | (primary, secondary) = _metaph_add('K') |
||
| 1270 | current += 2 |
||
| 1271 | continue |
||
| 1272 | |||
| 1273 | elif _string_at(current, 2, {'CK', 'CG', 'CQ'}): |
||
| 1274 | (primary, secondary) = _metaph_add('K') |
||
| 1275 | current += 2 |
||
| 1276 | continue |
||
| 1277 | |||
| 1278 | elif _string_at(current, 2, {'CI', 'CE', 'CY'}): |
||
| 1279 | # Italian vs. English |
||
| 1280 | if _string_at(current, 3, {'CIO', 'CIE', 'CIA'}): |
||
| 1281 | (primary, secondary) = _metaph_add('S', 'X') |
||
| 1282 | else: |
||
| 1283 | (primary, secondary) = _metaph_add('S') |
||
| 1284 | current += 2 |
||
| 1285 | continue |
||
| 1286 | |||
| 1287 | # else |
||
| 1288 | else: |
||
| 1289 | (primary, secondary) = _metaph_add('K') |
||
| 1290 | |||
| 1291 | # name sent in 'mac caffrey', 'mac gregor |
||
| 1292 | if _string_at((current + 1), 2, {' C', ' Q', ' G'}): |
||
| 1293 | current += 3 |
||
| 1294 | elif (_string_at((current + 1), 1, |
||
| 1295 | {'C', 'K', 'Q'}) and |
||
| 1296 | not _string_at((current + 1), 2, {'CE', 'CI'})): |
||
| 1297 | current += 2 |
||
| 1298 | else: |
||
| 1299 | current += 1 |
||
| 1300 | continue |
||
| 1301 | |||
| 1302 | elif _get_at(current) == 'D': |
||
| 1303 | if _string_at(current, 2, {'DG'}): |
||
| 1304 | if _string_at((current + 2), 1, {'I', 'E', 'Y'}): |
||
| 1305 | # e.g. 'edge' |
||
| 1306 | (primary, secondary) = _metaph_add('J') |
||
| 1307 | current += 3 |
||
| 1308 | continue |
||
| 1309 | else: |
||
| 1310 | # e.g. 'edgar' |
||
| 1311 | (primary, secondary) = _metaph_add('TK') |
||
| 1312 | current += 2 |
||
| 1313 | continue |
||
| 1314 | |||
| 1315 | elif _string_at(current, 2, {'DT', 'DD'}): |
||
| 1316 | (primary, secondary) = _metaph_add('T') |
||
| 1317 | current += 2 |
||
| 1318 | continue |
||
| 1319 | |||
| 1320 | # else |
||
| 1321 | else: |
||
| 1322 | (primary, secondary) = _metaph_add('T') |
||
| 1323 | current += 1 |
||
| 1324 | continue |
||
| 1325 | |||
| 1326 | elif _get_at(current) == 'F': |
||
| 1327 | if _get_at(current + 1) == 'F': |
||
| 1328 | current += 2 |
||
| 1329 | else: |
||
| 1330 | current += 1 |
||
| 1331 | (primary, secondary) = _metaph_add('F') |
||
| 1332 | continue |
||
| 1333 | |||
| 1334 | elif _get_at(current) == 'G': |
||
| 1335 | if _get_at(current + 1) == 'H': |
||
| 1336 | if (current > 0) and not _is_vowel(current - 1): |
||
| 1337 | (primary, secondary) = _metaph_add('K') |
||
| 1338 | current += 2 |
||
| 1339 | continue |
||
| 1340 | |||
| 1341 | # 'ghislane', ghiradelli |
||
| 1342 | elif current == 0: |
||
| 1343 | if _get_at(current + 2) == 'I': |
||
| 1344 | (primary, secondary) = _metaph_add('J') |
||
| 1345 | else: |
||
| 1346 | (primary, secondary) = _metaph_add('K') |
||
| 1347 | current += 2 |
||
| 1348 | continue |
||
| 1349 | |||
| 1350 | # Parker's rule (with some further refinements) - e.g., 'hugh' |
||
| 1351 | elif (((current > 1) and |
||
| 1352 | _string_at((current - 2), 1, {'B', 'H', 'D'})) or |
||
| 1353 | # e.g., 'bough' |
||
| 1354 | ((current > 2) and |
||
| 1355 | _string_at((current - 3), 1, {'B', 'H', 'D'})) or |
||
| 1356 | # e.g., 'broughton' |
||
| 1357 | ((current > 3) and |
||
| 1358 | _string_at((current - 4), 1, {'B', 'H'}))): |
||
| 1359 | current += 2 |
||
| 1360 | continue |
||
| 1361 | else: |
||
| 1362 | # e.g. 'laugh', 'McLaughlin', 'cough', |
||
| 1363 | # 'gough', 'rough', 'tough' |
||
| 1364 | if ((current > 2) and |
||
| 1365 | (_get_at(current - 1) == 'U') and |
||
| 1366 | (_string_at((current - 3), 1, |
||
| 1367 | {'C', 'G', 'L', 'R', 'T'}))): |
||
| 1368 | (primary, secondary) = _metaph_add('F') |
||
| 1369 | elif (current > 0) and _get_at(current - 1) != 'I': |
||
| 1370 | (primary, secondary) = _metaph_add('K') |
||
| 1371 | current += 2 |
||
| 1372 | continue |
||
| 1373 | |||
| 1374 | elif _get_at(current + 1) == 'N': |
||
| 1375 | if (current == 1) and _is_vowel(0) and not _slavo_germanic(): |
||
| 1376 | (primary, secondary) = _metaph_add('KN', 'N') |
||
| 1377 | # not e.g. 'cagney' |
||
| 1378 | elif (not _string_at((current + 2), 2, {'EY'}) and |
||
| 1379 | (_get_at(current + 1) != 'Y') and |
||
| 1380 | not _slavo_germanic()): |
||
| 1381 | (primary, secondary) = _metaph_add('N', 'KN') |
||
| 1382 | else: |
||
| 1383 | (primary, secondary) = _metaph_add('KN') |
||
| 1384 | current += 2 |
||
| 1385 | continue |
||
| 1386 | |||
| 1387 | # 'tagliaro' |
||
| 1388 | elif (_string_at((current + 1), 2, {'LI'}) and |
||
| 1389 | not _slavo_germanic()): |
||
| 1390 | (primary, secondary) = _metaph_add('KL', 'L') |
||
| 1391 | current += 2 |
||
| 1392 | continue |
||
| 1393 | |||
| 1394 | # -ges-, -gep-, -gel-, -gie- at beginning |
||
| 1395 | elif ((current == 0) and |
||
| 1396 | ((_get_at(current + 1) == 'Y') or |
||
| 1397 | _string_at((current + 1), 2, {'ES', 'EP', 'EB', 'EL', 'EY', |
||
| 1398 | 'IB', 'IL', 'IN', 'IE', 'EI', |
||
| 1399 | 'ER'}))): |
||
| 1400 | (primary, secondary) = _metaph_add('K', 'J') |
||
| 1401 | current += 2 |
||
| 1402 | continue |
||
| 1403 | |||
| 1404 | # -ger-, -gy- |
||
| 1405 | elif ((_string_at((current + 1), 2, {'ER'}) or |
||
| 1406 | (_get_at(current + 1) == 'Y')) and not |
||
| 1407 | _string_at(0, 6, {'DANGER', 'RANGER', 'MANGER'}) and not |
||
| 1408 | _string_at((current - 1), 1, {'E', 'I'}) and not |
||
| 1409 | _string_at((current - 1), 3, {'RGY', 'OGY'})): |
||
| 1410 | (primary, secondary) = _metaph_add('K', 'J') |
||
| 1411 | current += 2 |
||
| 1412 | continue |
||
| 1413 | |||
| 1414 | # italian e.g, 'biaggi' |
||
| 1415 | elif (_string_at((current + 1), 1, {'E', 'I', 'Y'}) or |
||
| 1416 | _string_at((current - 1), 4, {'AGGI', 'OGGI'})): |
||
| 1417 | # obvious germanic |
||
| 1418 | if (((_string_at(0, 4, {'VAN ', 'VON '}) or |
||
| 1419 | _string_at(0, 3, {'SCH'})) or |
||
| 1420 | _string_at((current + 1), 2, {'ET'}))): |
||
| 1421 | (primary, secondary) = _metaph_add('K') |
||
| 1422 | elif _string_at((current + 1), 4, {'IER '}): |
||
| 1423 | (primary, secondary) = _metaph_add('J') |
||
| 1424 | else: |
||
| 1425 | (primary, secondary) = _metaph_add('J', 'K') |
||
| 1426 | current += 2 |
||
| 1427 | continue |
||
| 1428 | |||
| 1429 | else: |
||
| 1430 | if _get_at(current + 1) == 'G': |
||
| 1431 | current += 2 |
||
| 1432 | else: |
||
| 1433 | current += 1 |
||
| 1434 | (primary, secondary) = _metaph_add('K') |
||
| 1435 | continue |
||
| 1436 | |||
| 1437 | elif _get_at(current) == 'H': |
||
| 1438 | # only keep if first & before vowel or btw. 2 vowels |
||
| 1439 | if ((((current == 0) or _is_vowel(current - 1)) and |
||
| 1440 | _is_vowel(current + 1))): |
||
| 1441 | (primary, secondary) = _metaph_add('H') |
||
| 1442 | current += 2 |
||
| 1443 | else: # also takes care of 'HH' |
||
| 1444 | current += 1 |
||
| 1445 | continue |
||
| 1446 | |||
| 1447 | elif _get_at(current) == 'J': |
||
| 1448 | # obvious spanish, 'jose', 'san jacinto' |
||
| 1449 | if _string_at(current, 4, ['JOSE']) or _string_at(0, 4, {'SAN '}): |
||
| 1450 | if ((((current == 0) and (_get_at(current + 4) == ' ')) or |
||
| 1451 | _string_at(0, 4, ['SAN ']))): |
||
| 1452 | (primary, secondary) = _metaph_add('H') |
||
| 1453 | else: |
||
| 1454 | (primary, secondary) = _metaph_add('J', 'H') |
||
| 1455 | current += 1 |
||
| 1456 | continue |
||
| 1457 | |||
| 1458 | elif (current == 0) and not _string_at(current, 4, {'JOSE'}): |
||
| 1459 | # Yankelovich/Jankelowicz |
||
| 1460 | (primary, secondary) = _metaph_add('J', 'A') |
||
| 1461 | # Spanish pron. of e.g. 'bajador' |
||
| 1462 | elif (_is_vowel(current - 1) and |
||
| 1463 | not _slavo_germanic() and |
||
| 1464 | ((_get_at(current + 1) == 'A') or |
||
| 1465 | (_get_at(current + 1) == 'O'))): |
||
| 1466 | (primary, secondary) = _metaph_add('J', 'H') |
||
| 1467 | elif current == last: |
||
| 1468 | (primary, secondary) = _metaph_add('J', ' ') |
||
| 1469 | elif (not _string_at((current + 1), 1, |
||
| 1470 | {'L', 'T', 'K', 'S', 'N', 'M', 'B', 'Z'}) and |
||
| 1471 | not _string_at((current - 1), 1, {'S', 'K', 'L'})): |
||
| 1472 | (primary, secondary) = _metaph_add('J') |
||
| 1473 | |||
| 1474 | if _get_at(current + 1) == 'J': # it could happen! |
||
| 1475 | current += 2 |
||
| 1476 | else: |
||
| 1477 | current += 1 |
||
| 1478 | continue |
||
| 1479 | |||
| 1480 | elif _get_at(current) == 'K': |
||
| 1481 | if _get_at(current + 1) == 'K': |
||
| 1482 | current += 2 |
||
| 1483 | else: |
||
| 1484 | current += 1 |
||
| 1485 | (primary, secondary) = _metaph_add('K') |
||
| 1486 | continue |
||
| 1487 | |||
| 1488 | elif _get_at(current) == 'L': |
||
| 1489 | if _get_at(current + 1) == 'L': |
||
| 1490 | # Spanish e.g. 'cabrillo', 'gallegos' |
||
| 1491 | if (((current == (length - 3)) and |
||
| 1492 | _string_at((current - 1), 4, {'ILLO', 'ILLA', 'ALLE'})) or |
||
| 1493 | ((_string_at((last - 1), 2, {'AS', 'OS'}) or |
||
| 1494 | _string_at(last, 1, {'A', 'O'})) and |
||
| 1495 | _string_at((current - 1), 4, {'ALLE'}))): |
||
| 1496 | (primary, secondary) = _metaph_add('L', ' ') |
||
| 1497 | current += 2 |
||
| 1498 | continue |
||
| 1499 | current += 2 |
||
| 1500 | else: |
||
| 1501 | current += 1 |
||
| 1502 | (primary, secondary) = _metaph_add('L') |
||
| 1503 | continue |
||
| 1504 | |||
| 1505 | elif _get_at(current) == 'M': |
||
| 1506 | if (((_string_at((current - 1), 3, {'UMB'}) and |
||
| 1507 | (((current + 1) == last) or |
||
| 1508 | _string_at((current + 2), 2, {'ER'}))) or |
||
| 1509 | # 'dumb', 'thumb' |
||
| 1510 | (_get_at(current + 1) == 'M'))): |
||
| 1511 | current += 2 |
||
| 1512 | else: |
||
| 1513 | current += 1 |
||
| 1514 | (primary, secondary) = _metaph_add('M') |
||
| 1515 | continue |
||
| 1516 | |||
| 1517 | elif _get_at(current) == 'N': |
||
| 1518 | if _get_at(current + 1) == 'N': |
||
| 1519 | current += 2 |
||
| 1520 | else: |
||
| 1521 | current += 1 |
||
| 1522 | (primary, secondary) = _metaph_add('N') |
||
| 1523 | continue |
||
| 1524 | |||
| 1525 | elif _get_at(current) == 'Ñ': |
||
| 1526 | current += 1 |
||
| 1527 | (primary, secondary) = _metaph_add('N') |
||
| 1528 | continue |
||
| 1529 | |||
| 1530 | elif _get_at(current) == 'P': |
||
| 1531 | if _get_at(current + 1) == 'H': |
||
| 1532 | (primary, secondary) = _metaph_add('F') |
||
| 1533 | current += 2 |
||
| 1534 | continue |
||
| 1535 | |||
| 1536 | # also account for "campbell", "raspberry" |
||
| 1537 | elif _string_at((current + 1), 1, {'P', 'B'}): |
||
| 1538 | current += 2 |
||
| 1539 | else: |
||
| 1540 | current += 1 |
||
| 1541 | (primary, secondary) = _metaph_add('P') |
||
| 1542 | continue |
||
| 1543 | |||
| 1544 | elif _get_at(current) == 'Q': |
||
| 1545 | if _get_at(current + 1) == 'Q': |
||
| 1546 | current += 2 |
||
| 1547 | else: |
||
| 1548 | current += 1 |
||
| 1549 | (primary, secondary) = _metaph_add('K') |
||
| 1550 | continue |
||
| 1551 | |||
| 1552 | elif _get_at(current) == 'R': |
||
| 1553 | # french e.g. 'rogier', but exclude 'hochmeier' |
||
| 1554 | if (((current == last) and |
||
| 1555 | not _slavo_germanic() and |
||
| 1556 | _string_at((current - 2), 2, {'IE'}) and |
||
| 1557 | not _string_at((current - 4), 2, {'ME', 'MA'}))): |
||
| 1558 | (primary, secondary) = _metaph_add('', 'R') |
||
| 1559 | else: |
||
| 1560 | (primary, secondary) = _metaph_add('R') |
||
| 1561 | |||
| 1562 | if _get_at(current + 1) == 'R': |
||
| 1563 | current += 2 |
||
| 1564 | else: |
||
| 1565 | current += 1 |
||
| 1566 | continue |
||
| 1567 | |||
| 1568 | elif _get_at(current) == 'S': |
||
| 1569 | # special cases 'island', 'isle', 'carlisle', 'carlysle' |
||
| 1570 | if _string_at((current - 1), 3, {'ISL', 'YSL'}): |
||
| 1571 | current += 1 |
||
| 1572 | continue |
||
| 1573 | |||
| 1574 | # special case 'sugar-' |
||
| 1575 | elif (current == 0) and _string_at(current, 5, {'SUGAR'}): |
||
| 1576 | (primary, secondary) = _metaph_add('X', 'S') |
||
| 1577 | current += 1 |
||
| 1578 | continue |
||
| 1579 | |||
| 1580 | elif _string_at(current, 2, {'SH'}): |
||
| 1581 | # Germanic |
||
| 1582 | if _string_at((current + 1), 4, |
||
| 1583 | {'HEIM', 'HOEK', 'HOLM', 'HOLZ'}): |
||
| 1584 | (primary, secondary) = _metaph_add('S') |
||
| 1585 | else: |
||
| 1586 | (primary, secondary) = _metaph_add('X') |
||
| 1587 | current += 2 |
||
| 1588 | continue |
||
| 1589 | |||
| 1590 | # Italian & Armenian |
||
| 1591 | elif (_string_at(current, 3, {'SIO', 'SIA'}) or |
||
| 1592 | _string_at(current, 4, {'SIAN'})): |
||
| 1593 | if not _slavo_germanic(): |
||
| 1594 | (primary, secondary) = _metaph_add('S', 'X') |
||
| 1595 | else: |
||
| 1596 | (primary, secondary) = _metaph_add('S') |
||
| 1597 | current += 3 |
||
| 1598 | continue |
||
| 1599 | |||
| 1600 | # German & anglicisations, e.g. 'smith' match 'schmidt', |
||
| 1601 | # 'snider' match 'schneider' |
||
| 1602 | # also, -sz- in Slavic language although in Hungarian it is |
||
| 1603 | # pronounced 's' |
||
| 1604 | elif (((current == 0) and |
||
| 1605 | _string_at((current + 1), 1, {'M', 'N', 'L', 'W'})) or |
||
| 1606 | _string_at((current + 1), 1, {'Z'})): |
||
| 1607 | (primary, secondary) = _metaph_add('S', 'X') |
||
| 1608 | if _string_at((current + 1), 1, {'Z'}): |
||
| 1609 | current += 2 |
||
| 1610 | else: |
||
| 1611 | current += 1 |
||
| 1612 | continue |
||
| 1613 | |||
| 1614 | elif _string_at(current, 2, {'SC'}): |
||
| 1615 | # Schlesinger's rule |
||
| 1616 | if _get_at(current + 2) == 'H': |
||
| 1617 | # dutch origin, e.g. 'school', 'schooner' |
||
| 1618 | if _string_at((current + 3), 2, |
||
| 1619 | {'OO', 'ER', 'EN', 'UY', 'ED', 'EM'}): |
||
| 1620 | # 'schermerhorn', 'schenker' |
||
| 1621 | if _string_at((current + 3), 2, {'ER', 'EN'}): |
||
| 1622 | (primary, secondary) = _metaph_add('X', 'SK') |
||
| 1623 | else: |
||
| 1624 | (primary, secondary) = _metaph_add('SK') |
||
| 1625 | current += 3 |
||
| 1626 | continue |
||
| 1627 | else: |
||
| 1628 | if (((current == 0) and not _is_vowel(3) and |
||
| 1629 | (_get_at(3) != 'W'))): |
||
| 1630 | (primary, secondary) = _metaph_add('X', 'S') |
||
| 1631 | else: |
||
| 1632 | (primary, secondary) = _metaph_add('X') |
||
| 1633 | current += 3 |
||
| 1634 | continue |
||
| 1635 | |||
| 1636 | elif _string_at((current + 2), 1, {'I', 'E', 'Y'}): |
||
| 1637 | (primary, secondary) = _metaph_add('S') |
||
| 1638 | current += 3 |
||
| 1639 | continue |
||
| 1640 | |||
| 1641 | # else |
||
| 1642 | else: |
||
| 1643 | (primary, secondary) = _metaph_add('SK') |
||
| 1644 | current += 3 |
||
| 1645 | continue |
||
| 1646 | |||
| 1647 | else: |
||
| 1648 | # french e.g. 'resnais', 'artois' |
||
| 1649 | if (current == last) and _string_at((current - 2), 2, |
||
| 1650 | {'AI', 'OI'}): |
||
| 1651 | (primary, secondary) = _metaph_add('', 'S') |
||
| 1652 | else: |
||
| 1653 | (primary, secondary) = _metaph_add('S') |
||
| 1654 | |||
| 1655 | if _string_at((current + 1), 1, {'S', 'Z'}): |
||
| 1656 | current += 2 |
||
| 1657 | else: |
||
| 1658 | current += 1 |
||
| 1659 | continue |
||
| 1660 | |||
| 1661 | elif _get_at(current) == 'T': |
||
| 1662 | if _string_at(current, 4, {'TION'}): |
||
| 1663 | (primary, secondary) = _metaph_add('X') |
||
| 1664 | current += 3 |
||
| 1665 | continue |
||
| 1666 | |||
| 1667 | elif _string_at(current, 3, {'TIA', 'TCH'}): |
||
| 1668 | (primary, secondary) = _metaph_add('X') |
||
| 1669 | current += 3 |
||
| 1670 | continue |
||
| 1671 | |||
| 1672 | elif (_string_at(current, 2, {'TH'}) or |
||
| 1673 | _string_at(current, 3, {'TTH'})): |
||
| 1674 | # special case 'thomas', 'thames' or germanic |
||
| 1675 | if ((_string_at((current + 2), 2, {'OM', 'AM'}) or |
||
| 1676 | _string_at(0, 4, {'VAN ', 'VON '}) or |
||
| 1677 | _string_at(0, 3, {'SCH'}))): |
||
| 1678 | (primary, secondary) = _metaph_add('T') |
||
| 1679 | else: |
||
| 1680 | (primary, secondary) = _metaph_add('0', 'T') |
||
| 1681 | current += 2 |
||
| 1682 | continue |
||
| 1683 | |||
| 1684 | elif _string_at((current + 1), 1, {'T', 'D'}): |
||
| 1685 | current += 2 |
||
| 1686 | else: |
||
| 1687 | current += 1 |
||
| 1688 | (primary, secondary) = _metaph_add('T') |
||
| 1689 | continue |
||
| 1690 | |||
| 1691 | elif _get_at(current) == 'V': |
||
| 1692 | if _get_at(current + 1) == 'V': |
||
| 1693 | current += 2 |
||
| 1694 | else: |
||
| 1695 | current += 1 |
||
| 1696 | (primary, secondary) = _metaph_add('F') |
||
| 1697 | continue |
||
| 1698 | |||
| 1699 | elif _get_at(current) == 'W': |
||
| 1700 | # can also be in middle of word |
||
| 1701 | if _string_at(current, 2, {'WR'}): |
||
| 1702 | (primary, secondary) = _metaph_add('R') |
||
| 1703 | current += 2 |
||
| 1704 | continue |
||
| 1705 | elif ((current == 0) and |
||
| 1706 | (_is_vowel(current + 1) or _string_at(current, 2, {'WH'}))): |
||
| 1707 | # Wasserman should match Vasserman |
||
| 1708 | if _is_vowel(current + 1): |
||
| 1709 | (primary, secondary) = _metaph_add('A', 'F') |
||
| 1710 | else: |
||
| 1711 | # need Uomo to match Womo |
||
| 1712 | (primary, secondary) = _metaph_add('A') |
||
| 1713 | |||
| 1714 | # Arnow should match Arnoff |
||
| 1715 | if ((((current == last) and _is_vowel(current - 1)) or |
||
| 1716 | _string_at((current - 1), 5, |
||
| 1717 | {'EWSKI', 'EWSKY', 'OWSKI', 'OWSKY'}) or |
||
| 1718 | _string_at(0, 3, ['SCH']))): |
||
| 1719 | (primary, secondary) = _metaph_add('', 'F') |
||
| 1720 | current += 1 |
||
| 1721 | continue |
||
| 1722 | # Polish e.g. 'filipowicz' |
||
| 1723 | elif _string_at(current, 4, {'WICZ', 'WITZ'}): |
||
| 1724 | (primary, secondary) = _metaph_add('TS', 'FX') |
||
| 1725 | current += 4 |
||
| 1726 | continue |
||
| 1727 | # else skip it |
||
| 1728 | else: |
||
| 1729 | current += 1 |
||
| 1730 | continue |
||
| 1731 | |||
| 1732 | elif _get_at(current) == 'X': |
||
| 1733 | # French e.g. breaux |
||
| 1734 | if (not ((current == last) and |
||
| 1735 | (_string_at((current - 3), 3, {'IAU', 'EAU'}) or |
||
| 1736 | _string_at((current - 2), 2, {'AU', 'OU'})))): |
||
| 1737 | (primary, secondary) = _metaph_add('KS') |
||
| 1738 | |||
| 1739 | if _string_at((current + 1), 1, {'C', 'X'}): |
||
| 1740 | current += 2 |
||
| 1741 | else: |
||
| 1742 | current += 1 |
||
| 1743 | continue |
||
| 1744 | |||
| 1745 | elif _get_at(current) == 'Z': |
||
| 1746 | # Chinese Pinyin e.g. 'zhao' |
||
| 1747 | if _get_at(current + 1) == 'H': |
||
| 1748 | (primary, secondary) = _metaph_add('J') |
||
| 1749 | current += 2 |
||
| 1750 | continue |
||
| 1751 | elif (_string_at((current + 1), 2, {'ZO', 'ZI', 'ZA'}) or |
||
| 1752 | (_slavo_germanic() and ((current > 0) and |
||
| 1753 | _get_at(current - 1) != 'T'))): |
||
| 1754 | (primary, secondary) = _metaph_add('S', 'TS') |
||
| 1755 | else: |
||
| 1756 | (primary, secondary) = _metaph_add('S') |
||
| 1757 | |||
| 1758 | if _get_at(current + 1) == 'Z': |
||
| 1759 | current += 2 |
||
| 1760 | else: |
||
| 1761 | current += 1 |
||
| 1762 | continue |
||
| 1763 | |||
| 1764 | else: |
||
| 1765 | current += 1 |
||
| 1766 | |||
| 1767 | if maxlength and maxlength < _INFINITY: |
||
| 1768 | primary = primary[:maxlength] |
||
| 1769 | secondary = secondary[:maxlength] |
||
| 1770 | if primary == secondary: |
||
| 1771 | secondary = '' |
||
| 1772 | |||
| 1773 | return (primary, secondary) |
||
| 1774 | |||
| 1775 | |||
def caverphone(word, version=2):
    """Return the Caverphone code for a word.

    A description of version 1 of the algorithm can be found at:
    http://caversham.otago.ac.nz/files/working/ctp060902.pdf

    A description of version 2 of the algorithm can be found at:
    http://caversham.otago.ac.nz/files/working/ctp150804.pdf

    The word is lower-cased and stripped of everything except a-z, then
    rewritten by an ordered series of substitutions in which '2' and '3'
    act as temporary placeholders that are deleted at the end. Version 1
    codes are 6 characters long; any other version value yields the
    10-character version 2 code. Short codes are padded with '1'.

    :param str word: the word to transform
    :param int version: the version of Caverphone to employ for encoding
        (defaults to 2)
    :returns: the Caverphone value
    :rtype: str

    >>> caverphone('Christopher')
    'KRSTFA1111'
    >>> caverphone('Niall')
    'NA11111111'
    >>> caverphone('Smith')
    'SMT1111111'
    >>> caverphone('Schmidt')
    'SKMT111111'

    >>> caverphone('Christopher', 1)
    'KRSTF1'
    >>> caverphone('Niall', 1)
    'N11111'
    >>> caverphone('Smith', 1)
    'SMT111'
    >>> caverphone('Schmidt', 1)
    'SKMT11'
    """
    legacy = version == 1

    def _collapse(text, char):
        """Squeeze each run of char to one upper-cased instance."""
        pair = char + char
        while pair in text:
            text = text.replace(pair, char)
        return text.replace(char, char.upper())

    code = ''.join(ch for ch in word.lower()
                   if ch in 'abcdefghijklmnopqrstuvwxyz')

    # Version 2 drops a single final 'e' before anything else happens.
    if not legacy and code.endswith('e'):
        code = code[:-1]

    if code:
        # Irregular '-ough' prefixes ('trough' is a version 2 addition).
        irregular = [('cough', 'cou2f'), ('rough', 'rou2f'),
                     ('tough', 'tou2f'), ('enough', 'enou2f')]
        if not legacy:
            irregular.append(('trough', 'trou2f'))
        for prefix, replacement in irregular:
            if code.startswith(prefix):
                code = replacement + code[len(prefix):]

        if code.startswith('gn'):
            code = '2n' + code[2:]
        if code.endswith('mb'):
            code = code[:-1] + '2'

        # Order matters: digraph/trigraph rules must precede the
        # single-letter rules that would otherwise consume them.
        for old, new in (('cq', '2q'), ('ci', 'si'), ('ce', 'se'),
                         ('cy', 'sy'), ('tch', '2ch'), ('c', 'k'),
                         ('q', 'k'), ('x', 'k'), ('v', 'f'), ('dg', '2g'),
                         ('tio', 'sio'), ('tia', 'sia'), ('d', 't'),
                         ('ph', 'fh'), ('b', 'p'), ('sh', 's2'),
                         ('z', 's')):
            code = code.replace(old, new)

        # An initial vowel becomes 'A'; every other vowel becomes '3'.
        if code[0] in 'aeiou':
            code = 'A' + code[1:]
        for vowel in 'aeiou':
            code = code.replace(vowel, '3')

        # Version 2 resolves j/y here; version 1 does so near the end.
        if not legacy:
            code = code.replace('j', 'y')
            if code.startswith('y3'):
                code = 'Y3' + code[2:]
            if code.startswith('y'):
                code = 'A' + code[1:]
            code = code.replace('y', '3')

        code = code.replace('3gh3', '3kh3')
        code = code.replace('gh', '22')
        code = code.replace('g', 'k')

        for consonant in 'stpkfmn':
            code = _collapse(code, consonant)

        # w: kept (upper-cased) before a vowel, otherwise silenced.
        code = code.replace('w3', 'W3')
        if legacy:
            code = code.replace('wy', 'Wy')
        code = code.replace('wh3', 'Wh3')
        if legacy:
            code = code.replace('why', 'Why')
        elif code.endswith('w'):
            code = code[:-1] + '3'
        code = code.replace('w', '2')

        # h: an initial h becomes 'A', all others are silenced.
        if code.startswith('h'):
            code = 'A' + code[1:]
        code = code.replace('h', '2')

        # r and l follow one shared pattern.
        for liquid in 'rl':
            kept = liquid.upper()
            code = code.replace(liquid + '3', kept + '3')
            if legacy:
                code = code.replace(liquid + 'y', kept + 'y')
            elif code.endswith(liquid):
                code = code[:-1] + '3'
            code = code.replace(liquid, '2')

        if legacy:
            code = code.replace('j', 'y')
            code = code.replace('y3', 'Y3')
            code = code.replace('y', '2')

        # Drop the placeholders; version 2 keeps a final vowel as 'A'.
        code = code.replace('2', '')
        if not legacy and code.endswith('3'):
            code = code[:-1] + 'A'
        code = code.replace('3', '')

    # Pad with 1s, then cut to the length required by the version.
    width = 6 if legacy else 10
    return (code + '1' * 10)[:width]
||
| 1925 | |||
| 1926 | |||
def alpha_sis(word, maxlength=14):
    """Return the IBM Alpha Search Inquiry System code for a word.

    Based on the algorithm described in "Accessing individual records from
    personal data files using non-unique identifiers" / Gwendolyn B. Moore,
    et al.; prepared for the Institute for Computer Sciences and Technology,
    National Bureau of Standards, Washington, D.C (1977):
    https://archive.org/stream/accessingindivid00moor#page/15/mode/1up

    Several letter groups have more than one possible code, so a word can
    yield several values. They are returned as a tuple because the order is
    significant: the first value is the primary coding.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 14);
        clamped to the range [4, 64], with None treated as 64
    :returns: the Alpha SIS value
    :rtype: tuple

    >>> alpha_sis('Christopher')
    ('06401840000000', '07040184000000', '04018400000000')
    >>> alpha_sis('Niall')
    ('02500000000000',)
    >>> alpha_sis('Smith')
    ('03100000000000',)
    >>> alpha_sis('Schmidt')
    ('06310000000000',)
    """
    # Word-initial groups, in matching priority order (longest first).
    initial_codes = (('GF', '08'), ('GM', '03'), ('GN', '02'), ('KN', '02'),
                     ('PF', '08'), ('PN', '02'), ('PS', '00'), ('WR', '04'),
                     ('A', '1'), ('E', '1'), ('H', '2'), ('I', '1'),
                     ('J', '3'), ('O', '1'), ('U', '1'), ('W', '4'),
                     ('Y', '5'))
    # General groups, in matching priority order. Each entry lists every
    # alternative code for the group; the first alternative is primary.
    basic_codes = (('SCH', ('6',)), ('CZ', ('70', '6', '0')),
                   ('CH', ('6', '70', '0')), ('CK', ('7', '6')),
                   ('DS', ('0', '10')), ('DZ', ('0', '10')),
                   ('TS', ('0', '10')), ('TZ', ('0', '10')),
                   ('CI', ('0',)), ('CY', ('0',)), ('CE', ('0',)),
                   ('SH', ('6',)), ('DG', ('7',)), ('PH', ('8',)),
                   ('C', ('7', '6')), ('K', ('7', '6')), ('Z', ('0',)),
                   ('S', ('0',)), ('D', ('1',)), ('T', ('1',)),
                   ('N', ('2',)), ('M', ('3',)), ('R', ('4',)),
                   ('L', ('5',)), ('J', ('6',)), ('G', ('7',)),
                   ('Q', ('7',)), ('X', ('7',)), ('F', ('8',)),
                   ('V', ('8',)), ('B', ('9',)), ('P', ('9',)))

    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(ch for ch in word if ch in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    # Clamp maxlength to [4, 64]
    if maxlength is None:
        maxlength = 64
    else:
        maxlength = min(max(4, maxlength), 64)

    # Encode a special initial group if there is one; otherwise every
    # code starts with '0'.
    codes = ['0']
    pos = 0
    for prefix, digits in initial_codes:
        if word.startswith(prefix):
            codes = [digits]
            pos = len(prefix)
            break

    # Main encoding loop over the remainder of the word.
    while pos < len(word):
        for group, alternatives in basic_codes:
            if word.startswith(group, pos):
                # Fan out: one copy of every code so far per alternative,
                # grouped by alternative so the primary codings stay first.
                codes = [code + alt for alt in alternatives
                         for code in codes]
                pos += len(group)
                break
        else:
            # Unencoded letter: leave a placeholder so that equal digits
            # on either side of it are not treated as a doublet.
            codes = [code + '_' for code in codes]
            pos += 1

    # Trim doublets and placeholders
    trimmed = []
    for code in codes:
        idx = 1
        while idx < len(code):
            if code[idx] == code[idx - 1]:
                code = code[:idx] + code[idx + 1:]
            # NOTE(review): the index advances even after a deletion, so a
            # run of three or more identical characters is not collapsed
            # all the way to one -- confirm whether this is intended.
            idx += 1
        trimmed.append(code.replace('_', ''))

    # Zero-pad, trim to length and return as a tuple
    padding = '0' * maxlength
    return tuple((code + padding)[:maxlength] for code in trimmed)
||
| 2032 | |||
| 2033 | |||
def fuzzy_soundex(word, maxlength=5, zero_pad=True):
    """Return the Fuzzy Soundex code for a word.

    Fuzzy Soundex is an algorithm derived from Soundex, defined in:
    Holmes, David and M. Catherine McCabe. "Improving Precision and Recall for
    Soundex Retrieval."
    http://wayback.archive.org/web/20100629121128/http://www.ir.iit.edu/publications/downloads/IEEESoundexV5.pdf

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 5);
        clamped to the range [4, 64], with None treated as 64
    :param bool zero_pad: pad the end of the return value with 0s to achieve
        a maxlength string
    :returns: the Fuzzy Soundex value
    :rtype: str

    >>> fuzzy_soundex('Christopher')
    'K6931'
    >>> fuzzy_soundex('Niall')
    'N4000'
    >>> fuzzy_soundex('Smith')
    'S5300'
    >>> fuzzy_soundex('Schmidt')
    'S5300'
    """
    # Letter -> digit table; '-' marks letters (H, W, Y) that are dropped.
    digit_for = {ord(letter): digit for letter, digit in
                 zip('ABCDEFGHIJKLMNOPQRSTUVWXYZ',
                     '0193017-07745501769301-7-9')}
    # Applied in this order anywhere in the word.
    ngram_rules = (('CA', 'KA'), ('CC', 'KK'), ('CK', 'KK'), ('CE', 'SE'),
                   ('CHL', 'KL'), ('CL', 'KL'), ('CHR', 'KR'), ('CR', 'KR'),
                   ('CI', 'SI'), ('CO', 'KO'), ('CU', 'KU'), ('CY', 'SY'),
                   ('DG', 'GG'), ('GH', 'HH'), ('MAC', 'MK'), ('MC', 'MK'),
                   ('NST', 'NSS'), ('PF', 'FF'), ('PH', 'FF'),
                   ('SCH', 'SSS'), ('TIO', 'SIO'), ('TIA', 'SIO'),
                   ('TCH', 'CHH'))

    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    # Clamp maxlength to [4, 64]
    if maxlength is None:
        maxlength = 64
    else:
        maxlength = min(max(4, maxlength), 64)

    if not word:
        return '0' * maxlength if zero_pad else '0'

    # Word-initial digraph substitutions (at most one applies).
    head = word[:2]
    if head in {'CS', 'CZ', 'TS', 'TZ'}:
        word = 'SS' + word[2:]
    elif head in {'GN', 'KN', 'NG'}:
        word = 'NN' + word[2:]
    elif head in {'HR', 'WR'}:
        word = 'RR' + word[2:]
    elif head == 'HW':
        word = 'WW' + word[2:]

    # Word-final substitutions (at most one applies).
    if word.endswith('CH'):
        word = word[:-2] + 'KK'
    elif word.endswith('NT'):
        word = word[:-2] + 'TT'
    elif word.endswith('RT'):
        word = word[:-2] + 'RR'
    elif word.endswith('RDT'):
        word = word[:-3] + 'RR'

    for old, new in ngram_rules:
        word = word.replace(old, new)

    # Translate to digits, drop the ignored letters, then remove
    # repeating characters.
    sdx = word.translate(digit_for).replace('-', '')
    sdx = _delete_consecutive_repeats(sdx)

    # The code always starts with the first letter itself. H, W and Y
    # produced no digit, so there is nothing to overwrite for them.
    first = word[0]
    if first in {'H', 'W', 'Y'}:
        sdx = first + sdx
    else:
        sdx = first + sdx[1:]

    sdx = sdx.replace('0', '')

    if zero_pad:
        sdx += '0' * maxlength

    return sdx[:maxlength]
||
| 2137 | |||
| 2138 | |||
def phonex(word, maxlength=4, zero_pad=True):
    """Return the Phonex code for a word.

    Phonex is an algorithm derived from Soundex, defined in:
    Lait, A. J. and B. Randell. "An Assessment of Name Matching Algorithms".
    http://homepages.cs.ncl.ac.uk/brian.randell/Genealogy/NameMatching.pdf

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 4);
        clamped to the range [4, 64], with None treated as 64
    :param bool zero_pad: pad the end of the return value with 0s to achieve
        a maxlength string
    :returns: the Phonex value
    :rtype: str

    >>> phonex('Christopher')
    'C623'
    >>> phonex('Niall')
    'N400'
    >>> phonex('Schmidt')
    'S253'
    >>> phonex('Smith')
    'S530'
    """
    # Sets (not strings) are used for membership tests below because some
    # of the tested slices can be empty.
    vowels = {'A', 'E', 'I', 'O', 'U', 'Y'}
    # Groups of first letters and the letter that represents each group.
    initial_groups = ((vowels, 'A'), ({'B', 'P'}, 'B'), ({'V', 'F'}, 'F'),
                      ({'C', 'K', 'Q'}, 'C'), ({'G', 'J'}, 'G'),
                      ({'S', 'Z'}, 'S'))

    name = unicodedata.normalize('NFKD', text_type(word.upper()))
    name = name.replace('ß', 'SS')

    # Clamp maxlength to [4, 64]
    if maxlength is None:
        maxlength = 64
    else:
        maxlength = min(max(4, maxlength), 64)

    # Remove any trailing Ss
    name = name.rstrip('S')

    # Phonetic equivalents of the first 2 characters
    leading = name[:2]
    if leading == 'KN':
        name = 'N' + name[2:]  # KN.. == N..
    elif leading == 'PH':
        name = 'F' + name[2:]  # PH.. == F.. (H ignored anyway)
    elif leading == 'WR':
        name = 'R' + name[2:]  # WR.. == R..

    # Special case: ignore an H in first position
    if name[:1] == 'H':
        name = name[1:]

    name_code = ''
    if name:
        # Phonetic equivalents of the first character
        for members, representative in initial_groups:
            if name[0] in members:
                name = representative + name[1:]
                break

        name_code = name[0]

        # Modified Soundex coding of the remaining letters
        for i in range(1, len(name)):
            letter = name[i]
            following = name[i + 1:i + 2]  # '' at the end of the name
            code = '0'
            if letter in {'B', 'F', 'P', 'V'}:
                code = '1'
            elif letter in {'C', 'G', 'J', 'K', 'Q', 'S', 'X', 'Z'}:
                code = '2'
            elif letter in {'D', 'T'}:
                if following != 'C':
                    code = '3'
            elif letter == 'L':
                # Only coded before a vowel or at the end of the name
                if not following or following in vowels:
                    code = '4'
            elif letter in {'M', 'N'}:
                if following in {'D', 'G'}:
                    # "Delete" the D/G by overwriting it with this letter;
                    # the copy codes to '5' again and is dropped as a
                    # duplicate on the next iteration.
                    name = name[:i + 1] + letter + name[i + 2:]
                code = '5'
            elif letter == 'R':
                # Only coded before a vowel or at the end of the name
                if not following or following in vowels:
                    code = '6'

            # Emit the digit unless it is a zero or repeats the most
            # recently emitted character.
            if code != '0' and code != name_code[-1]:
                name_code += code

    if zero_pad:
        name_code += '0' * maxlength
    if not name_code:
        name_code = '0'
    return name_code[:maxlength]
||
| 2246 | |||
| 2247 | |||
def phonem(word):
    """Return the Phonem code for a word.

    Phonem is defined in:
    Wilde, Georg and Carsten Meyer. 1988. "Nicht wörtlich genommen,
    'Schreibweisentolerante' Suchroutine in dBASE implementiert." c't Magazin
    für Computer Technik. Oct. 1988. 126--131.

    This version is based on the Perl implementation documented at:
    http://ifl.phil-fak.uni-koeln.de/sites/linguistik/Phonetik/import/Phonetik_Files/Allgemeine_Dateien/Martin_Wilz.pdf
    It includes some enhancements presented in the Java port at:
    https://github.com/dcm4che/dcm4che/blob/master/dcm4che-soundex/src/main/java/org/dcm4che3/soundex/Phonem.java

    Phonem is intended chiefly for German names/words.

    :param str word: the word to transform
    :returns: the Phonem value
    :rtype: str

    >>> phonem('Christopher')
    'CRYSDOVR'
    >>> phonem('Niall')
    'NYAL'
    >>> phonem('Smith')
    'SMYD'
    >>> phonem('Schmidt')
    'CMYD'
    """
    # Two-letter sequences rewritten first, in this order. '§' is an
    # intermediate symbol that the single-character table below maps to U.
    digraph_rules = (('SC', 'C'), ('SZ', 'C'), ('CZ', 'C'),
                     ('TZ', 'C'), ('TS', 'C'), ('KS', 'X'),
                     ('PF', 'V'), ('QU', 'KW'), ('PH', 'V'),
                     ('UE', 'Y'), ('AE', 'E'), ('OE', 'Ö'),
                     ('EI', 'AY'), ('EY', 'AY'), ('EU', 'OY'),
                     ('AU', 'A§'), ('OU', '§'))
    # Single-character folding: sources and their same-position targets.
    sources = 'ZKGQÇÑßFWPTÁÀÂÃÅÄÆÉÈÊËIJÌÍÎÏÜݧÚÙÛÔÒÓÕØ'
    targets = 'CCCCCNSVVBDAAAAAEEEEEEYYYYYYYYUUUUOOOOÖ'
    char_table = {ord(src): tgt for src, tgt in zip(sources, targets)}
    # Only these symbols may appear in the final code.
    retained = {'A', 'B', 'C', 'D', 'L', 'M', 'N', 'O', 'R', 'S',
                'U', 'V', 'W', 'X', 'Y', 'Ö'}

    word = unicodedata.normalize('NFC', text_type(word.upper()))
    for old, new in digraph_rules:
        word = word.replace(old, new)
    word = word.translate(char_table)

    deduplicated = _delete_consecutive_repeats(word)
    return ''.join(ch for ch in deduplicated if ch in retained)
||
| 2294 | |||
| 2295 | |||
| 2296 | def phonix(word, maxlength=4, zero_pad=True): |
||
| 2297 | """Return the Phonix code for a word. |
||
| 2298 | |||
| 2299 | Phonix is a Soundex-like algorithm defined in: |
||
| 2300 | T.N. Gadd: PHONIX --- The Algorithm, Program 24/4, 1990, p.363-366. |
||
| 2301 | |||
| 2302 | This implementation is based on |
||
| 2303 | http://cpansearch.perl.org/src/ULPFR/WAIT-1.800/soundex.c |
||
| 2304 | http://cs.anu.edu.au/people/Peter.Christen/Febrl/febrl-0.4.01/encode.py |
||
| 2305 | and |
||
| 2306 | https://metacpan.org/pod/Text::Phonetic::Phonix |
||
| 2307 | |||
| 2308 | :param str word: the word to transform |
||
| 2309 | :param int maxlength: the length of the code returned (defaults to 4) |
||
| 2310 | :param bool zero_pad: pad the end of the return value with 0s to achieve |
||
| 2311 | a maxlength string |
||
| 2312 | :returns: the Phonix value |
||
| 2313 | :rtype: str |
||
| 2314 | |||
| 2315 | >>> phonix('Christopher') |
||
| 2316 | 'K683' |
||
| 2317 | >>> phonix('Niall') |
||
| 2318 | 'N400' |
||
| 2319 | >>> phonix('Smith') |
||
| 2320 | 'S530' |
||
| 2321 | >>> phonix('Schmidt') |
||
| 2322 | 'S530' |
||
| 2323 | """ |
||
| 2324 | # pylint: disable=too-many-branches |
||
| 2325 | def _start_repl(word, src, tar, post=None): |
||
| 2326 | r"""Replace src with tar at the start of word.""" |
||
| 2327 | if post: |
||
| 2328 | for i in post: |
||
| 2329 | if word.startswith(src+i): |
||
| 2330 | return tar + word[len(src):] |
||
| 2331 | elif word.startswith(src): |
||
| 2332 | return tar + word[len(src):] |
||
| 2333 | return word |
||
| 2334 | |||
| 2335 | def _end_repl(word, src, tar, pre=None): |
||
| 2336 | r"""Replace src with tar at the end of word.""" |
||
| 2337 | if pre: |
||
| 2338 | for i in pre: |
||
| 2339 | if word.endswith(i+src): |
||
| 2340 | return word[:-len(src)] + tar |
||
| 2341 | elif word.endswith(src): |
||
| 2342 | return word[:-len(src)] + tar |
||
| 2343 | return word |
||
| 2344 | |||
| 2345 | def _mid_repl(word, src, tar, pre=None, post=None): |
||
| 2346 | r"""Replace src with tar in the middle of word.""" |
||
| 2347 | if pre or post: |
||
| 2348 | if not pre: |
||
| 2349 | return word[0] + _all_repl(word[1:], src, tar, pre, post) |
||
| 2350 | elif not post: |
||
| 2351 | return _all_repl(word[:-1], src, tar, pre, post) + word[-1] |
||
| 2352 | return _all_repl(word, src, tar, pre, post) |
||
| 2353 | return (word[0] + _all_repl(word[1:-1], src, tar, pre, post) + |
||
| 2354 | word[-1]) |
||
| 2355 | |||
| 2356 | def _all_repl(word, src, tar, pre=None, post=None): |
||
| 2357 | r"""Replace src with tar anywhere in word.""" |
||
| 2358 | if pre or post: |
||
| 2359 | if post: |
||
| 2360 | post = post |
||
| 2361 | else: |
||
| 2362 | post = frozenset(('',)) |
||
| 2363 | if pre: |
||
| 2364 | pre = pre |
||
| 2365 | else: |
||
| 2366 | pre = frozenset(('',)) |
||
| 2367 | |||
| 2368 | for i, j in ((i, j) for i in pre for j in post): |
||
| 2369 | word = word.replace(i+src+j, i+tar+j) |
||
| 2370 | return word |
||
| 2371 | else: |
||
| 2372 | return word.replace(src, tar) |
||
| 2373 | |||
| 2374 | _vow = {'A', 'E', 'I', 'O', 'U'} |
||
| 2375 | _con = {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', |
||
| 2376 | 'R', 'S', 'T', 'V', 'W', 'X', 'Y', 'Z'} |
||
| 2377 | |||
| 2378 | _phonix_substitutions = ((_all_repl, 'DG', 'G'), |
||
| 2379 | (_all_repl, 'CO', 'KO'), |
||
| 2380 | (_all_repl, 'CA', 'KA'), |
||
| 2381 | (_all_repl, 'CU', 'KU'), |
||
| 2382 | (_all_repl, 'CY', 'SI'), |
||
| 2383 | (_all_repl, 'CI', 'SI'), |
||
| 2384 | (_all_repl, 'CE', 'SE'), |
||
| 2385 | (_start_repl, 'CL', 'KL', _vow), |
||
| 2386 | (_all_repl, 'CK', 'K'), |
||
| 2387 | (_end_repl, 'GC', 'K'), |
||
| 2388 | (_end_repl, 'JC', 'K'), |
||
| 2389 | (_start_repl, 'CHR', 'KR', _vow), |
||
| 2390 | (_start_repl, 'CR', 'KR', _vow), |
||
| 2391 | (_start_repl, 'WR', 'R'), |
||
| 2392 | (_all_repl, 'NC', 'NK'), |
||
| 2393 | (_all_repl, 'CT', 'KT'), |
||
| 2394 | (_all_repl, 'PH', 'F'), |
||
| 2395 | (_all_repl, 'AA', 'AR'), |
||
| 2396 | (_all_repl, 'SCH', 'SH'), |
||
| 2397 | (_all_repl, 'BTL', 'TL'), |
||
| 2398 | (_all_repl, 'GHT', 'T'), |
||
| 2399 | (_all_repl, 'AUGH', 'ARF'), |
||
| 2400 | (_mid_repl, 'LJ', 'LD', _vow, _vow), |
||
| 2401 | (_all_repl, 'LOUGH', 'LOW'), |
||
| 2402 | (_start_repl, 'Q', 'KW'), |
||
| 2403 | (_start_repl, 'KN', 'N'), |
||
| 2404 | (_end_repl, 'GN', 'N'), |
||
| 2405 | (_all_repl, 'GHN', 'N'), |
||
| 2406 | (_end_repl, 'GNE', 'N'), |
||
| 2407 | (_all_repl, 'GHNE', 'NE'), |
||
| 2408 | (_end_repl, 'GNES', 'NS'), |
||
| 2409 | (_start_repl, 'GN', 'N'), |
||
| 2410 | (_mid_repl, 'GN', 'N', None, _con), |
||
| 2411 | (_end_repl, 'GN', 'N'), |
||
| 2412 | (_start_repl, 'PS', 'S'), |
||
| 2413 | (_start_repl, 'PT', 'T'), |
||
| 2414 | (_start_repl, 'CZ', 'C'), |
||
| 2415 | (_mid_repl, 'WZ', 'Z', _vow), |
||
| 2416 | (_mid_repl, 'CZ', 'CH'), |
||
| 2417 | (_all_repl, 'LZ', 'LSH'), |
||
| 2418 | (_all_repl, 'RZ', 'RSH'), |
||
| 2419 | (_mid_repl, 'Z', 'S', None, _vow), |
||
| 2420 | (_all_repl, 'ZZ', 'TS'), |
||
| 2421 | (_mid_repl, 'Z', 'TS', _con), |
||
| 2422 | (_all_repl, 'HROUG', 'REW'), |
||
| 2423 | (_all_repl, 'OUGH', 'OF'), |
||
| 2424 | (_mid_repl, 'Q', 'KW', _vow, _vow), |
||
| 2425 | (_mid_repl, 'J', 'Y', _vow, _vow), |
||
| 2426 | (_start_repl, 'YJ', 'Y', _vow), |
||
| 2427 | (_start_repl, 'GH', 'G'), |
||
| 2428 | (_end_repl, 'GH', 'E', _vow), |
||
| 2429 | (_start_repl, 'CY', 'S'), |
||
| 2430 | (_all_repl, 'NX', 'NKS'), |
||
| 2431 | (_start_repl, 'PF', 'F'), |
||
| 2432 | (_end_repl, 'DT', 'T'), |
||
| 2433 | (_end_repl, 'TL', 'TIL'), |
||
| 2434 | (_end_repl, 'DL', 'DIL'), |
||
| 2435 | (_all_repl, 'YTH', 'ITH'), |
||
| 2436 | (_start_repl, 'TJ', 'CH', _vow), |
||
| 2437 | (_start_repl, 'TSJ', 'CH', _vow), |
||
| 2438 | (_start_repl, 'TS', 'T', _vow), |
||
| 2439 | (_all_repl, 'TCH', 'CH'), |
||
| 2440 | (_mid_repl, 'WSK', 'VSKIE', _vow), |
||
| 2441 | (_end_repl, 'WSK', 'VSKIE', _vow), |
||
| 2442 | (_start_repl, 'MN', 'N', _vow), |
||
| 2443 | (_start_repl, 'PN', 'N', _vow), |
||
| 2444 | (_mid_repl, 'STL', 'SL', _vow), |
||
| 2445 | (_end_repl, 'STL', 'SL', _vow), |
||
| 2446 | (_end_repl, 'TNT', 'ENT'), |
||
| 2447 | (_end_repl, 'EAUX', 'OH'), |
||
| 2448 | (_all_repl, 'EXCI', 'ECS'), |
||
| 2449 | (_all_repl, 'X', 'ECS'), |
||
| 2450 | (_end_repl, 'NED', 'ND'), |
||
| 2451 | (_all_repl, 'JR', 'DR'), |
||
| 2452 | (_end_repl, 'EE', 'EA'), |
||
| 2453 | (_all_repl, 'ZS', 'S'), |
||
| 2454 | (_mid_repl, 'R', 'AH', _vow, _con), |
||
| 2455 | (_end_repl, 'R', 'AH', _vow), |
||
| 2456 | (_mid_repl, 'HR', 'AH', _vow, _con), |
||
| 2457 | (_end_repl, 'HR', 'AH', _vow), |
||
| 2458 | (_end_repl, 'HR', 'AH', _vow), |
||
| 2459 | (_end_repl, 'RE', 'AR'), |
||
| 2460 | (_end_repl, 'R', 'AH', _vow), |
||
| 2461 | (_all_repl, 'LLE', 'LE'), |
||
| 2462 | (_end_repl, 'LE', 'ILE', _con), |
||
| 2463 | (_end_repl, 'LES', 'ILES', _con), |
||
| 2464 | (_end_repl, 'E', ''), |
||
| 2465 | (_end_repl, 'ES', 'S'), |
||
| 2466 | (_end_repl, 'SS', 'AS', _vow), |
||
| 2467 | (_end_repl, 'MB', 'M', _vow), |
||
| 2468 | (_all_repl, 'MPTS', 'MPS'), |
||
| 2469 | (_all_repl, 'MPS', 'MS'), |
||
| 2470 | (_all_repl, 'MPT', 'MT')) |
||
| 2471 | |||
| 2472 | _phonix_translation = dict(zip((ord(_) for _ in |
||
| 2473 | 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'), |
||
| 2474 | '01230720022455012683070808')) |
||
| 2475 | |||
| 2476 | sdx = '' |
||
| 2477 | |||
| 2478 | word = unicodedata.normalize('NFKD', text_type(word.upper())) |
||
| 2479 | word = word.replace('ß', 'SS') |
||
| 2480 | word = ''.join(c for c in word if c in |
||
| 2481 | {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
||
| 2482 | 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
||
| 2483 | 'Y', 'Z'}) |
||
| 2484 | if word: |
||
| 2485 | for trans in _phonix_substitutions: |
||
| 2486 | word = trans[0](word, *trans[1:]) |
||
| 2487 | if word[0] in {'A', 'E', 'I', 'O', 'U', 'Y'}: |
||
| 2488 | sdx = 'v' + word[1:].translate(_phonix_translation) |
||
| 2489 | else: |
||
| 2490 | sdx = word[0] + word[1:].translate(_phonix_translation) |
||
| 2491 | sdx = _delete_consecutive_repeats(sdx) |
||
| 2492 | sdx = sdx.replace('0', '') |
||
| 2493 | |||
| 2494 | # Clamp maxlength to [4, 64] |
||
| 2495 | if maxlength is not None: |
||
| 2496 | maxlength = min(max(4, maxlength), 64) |
||
| 2497 | else: |
||
| 2498 | maxlength = 64 |
||
| 2499 | |||
| 2500 | if zero_pad: |
||
| 2501 | sdx += '0' * maxlength |
||
| 2502 | if not sdx: |
||
| 2503 | sdx = '0' |
||
| 2504 | return sdx[:maxlength] |
||
| 2505 | |||
| 2506 | |||
def sfinxbis(word, maxlength=None):
    """Return the SfinxBis code for a word.

    SfinxBis is a Soundex-like algorithm defined in:
    http://www.swami.se/download/18.248ad5af12aa8136533800091/SfinxBis.pdf

    This implementation follows the reference implementation:
    http://www.swami.se/download/18.248ad5af12aa8136533800093/swamiSfinxBis.java.txt

    SfinxBis is intended chiefly for Swedish names.

    The input is upper-cased, hyphens are treated as spaces, nobility
    particles (e.g. 'VON', 'DE LA') are dropped, and every remaining
    whitespace-separated token is encoded on its own.  Each code consists of
    one character for the initial sound ('$' for any vowel, '#' for the
    sje-/tje-like initial clusters, otherwise a letter) followed by the digits
    assigned to the rest of the token.

    The vowel and consonant classes are iterated in a fixed order, so the
    result for a given input is the same on every run.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to
        unlimited)
    :returns: the SfinxBis value, one code per remaining token; ``('',)`` if
        no token is left after the prefixes have been removed
    :rtype: tuple

    >>> sfinxbis('Christopher')
    ('K68376',)
    >>> sfinxbis('Niall')
    ('N4',)
    >>> sfinxbis('Smith')
    ('S53',)
    >>> sfinxbis('Schmidt')
    ('S53',)

    >>> sfinxbis('Johansson')
    ('J585',)
    >>> sfinxbis('Sjöberg')
    ('#162',)
    """
    # Nobility particles, padded with spaces so that only whole tokens match.
    adelstitler = (' DE LA ', ' DE LAS ', ' DE LOS ', ' VAN DE ', ' VAN DEN ',
                   ' VAN DER ', ' VON DEM ', ' VON DER ',
                   ' AF ', ' AV ', ' DA ', ' DE ', ' DEL ', ' DEN ', ' DES ',
                   ' DI ', ' DO ', ' DON ', ' DOS ', ' DU ', ' E ', ' IN ',
                   ' LA ', ' LE ', ' MAC ', ' MC ', ' VAN ', ' VON ', ' Y ',
                   ' S:T ')

    # The letter classes are ordered tuples, not sets, on purpose:
    # _foersvensker() iterates over them, and the soft-vowel rewrite is
    # order-dependent ('EIY' becomes 'EJY' if 'E' is handled first, but 'EJJ'
    # if 'I' is).  Set iteration order changes with string hash
    # randomization, which would make the code differ between runs.
    # NOTE(review): declared order is assumed to mirror the reference
    # implementation -- confirm against swamiSfinxBis.java.
    _harde_vokaler = ('A', 'O', 'U', 'Å')
    _mjuka_vokaler = ('E', 'I', 'Y', 'Ä', 'Ö')
    _vokaler = _harde_vokaler + _mjuka_vokaler
    _konsonanter = ('B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P',
                    'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Z')
    _alfabet = frozenset(('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
                          'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
                          'U', 'V', 'W', 'X', 'Y', 'Z', 'Ä', 'Å', 'Ö'))

    # Letter -> digit table for everything after the first sound; all vowels
    # map to '9', which is stripped again in step 11.
    _sfinxbis_translation = dict(zip((ord(_) for _ in
                                      'BCDFGHJKLMNPQRSTVZAOUÅEIYÄÖ'),
                                     '123729224551268378999999999'))

    # Folds W/Z and accented letters onto the letters handled above.
    _sfinxbis_substitutions = dict(zip((ord(_) for _ in
                                        'WZÀÁÂÃÆÇÈÉÊËÌÍÎÏÑÒÓÔÕØÙÚÛÜÝ'),
                                       'VSAAAAÄCEEEEIIIINOOOOÖUUUYY'))

    def _foersvensker(ordet):
        """Return the Swedish-ized form of the word."""
        for gammal, ny in (('STIERN', 'STJÄRN'),
                           ('HIE', 'HJ'),
                           ('SIÖ', 'SJÖ'),
                           ('SCH', 'SH'),
                           ('QU', 'KV'),
                           ('IO', 'JO'),
                           ('PH', 'F')):
            ordet = ordet.replace(gammal, ny)

        # A vowel followed by Ü, Y or I: the second letter becomes J.
        # Hard vowels are processed before soft ones, as in the original.
        for vokal in _vokaler:
            for halvvokal in ('Ü', 'Y', 'I'):
                ordet = ordet.replace(vokal + halvvokal, vokal + 'J')

        # Drop an H that directly precedes a consonant.
        if 'H' in ordet:
            for konsonant in _konsonanter:
                ordet = ordet.replace('H' + konsonant, konsonant)

        ordet = ordet.translate(_sfinxbis_substitutions)

        ordet = ordet.replace('Ð', 'ETH')
        ordet = ordet.replace('Þ', 'TH')
        ordet = ordet.replace('ß', 'SS')

        return ordet

    def _koda_foersta_ljudet(ordet):
        """Return the word with the first sound coded."""
        # Slices (not indexing) are used so that an empty token falls
        # through every test and is returned unchanged.
        ett = ordet[0:1]
        andra = ordet[1:2]
        tva = ordet[0:2]
        tre = ordet[0:3]

        if ett in _vokaler:
            ordet = '$' + ordet[1:]
        elif tva in ('DJ', 'GJ', 'HJ', 'LJ'):
            ordet = 'J' + ordet[2:]
        elif ett == 'G' and andra in _mjuka_vokaler:
            ordet = 'J' + ordet[1:]
        elif ett == 'Q':
            ordet = 'K' + ordet[1:]
        elif tva == 'CH' and ordet[2:3] in _vokaler:
            ordet = '#' + ordet[2:]
        elif ett == 'C' and andra in _harde_vokaler:
            ordet = 'K' + ordet[1:]
        elif ett == 'C' and andra in _konsonanter:
            ordet = 'K' + ordet[1:]
        elif ett == 'X':
            ordet = 'S' + ordet[1:]
        elif ett == 'C' and andra in _mjuka_vokaler:
            ordet = 'S' + ordet[1:]
        elif tre in ('SKJ', 'STJ', 'SCH'):
            ordet = '#' + ordet[3:]
        elif tva in ('SH', 'KJ', 'TJ', 'SJ'):
            ordet = '#' + ordet[2:]
        elif tva == 'SK' and ordet[2:3] in _mjuka_vokaler:
            ordet = '#' + ordet[2:]
        elif ett == 'K' and andra in _mjuka_vokaler:
            ordet = '#' + ordet[1:]
        return ordet

    # Step 1: upper-case (NFC keeps Å/Ä/Ö as single code points)
    word = unicodedata.normalize('NFC', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = word.replace('-', ' ')

    # Step 2: remove nobility prefixes, both inside the name and at its start
    for adelstitel in adelstitler:
        while adelstitel in word:
            word = word.replace(adelstitel, ' ')
        if word.startswith(adelstitel[1:]):
            word = word[len(adelstitel)-1:]

    # Split word into tokens
    ordlista = word.split()

    # Step 3: collapse doubled letters
    ordlista = [_delete_consecutive_repeats(ordet) for ordet in ordlista]
    if not ordlista:
        return ('',)

    # Step 4: Swedish-ize the spelling
    ordlista = [_foersvensker(ordet) for ordet in ordlista]

    # Step 5: remove every character that is not A-Ö (65-90, 196, 197, 214)
    ordlista = [''.join(c for c in ordet if c in _alfabet)
                for ordet in ordlista]

    # Step 6: code the first sound
    ordlista = [_koda_foersta_ljudet(ordet) for ordet in ordlista]

    # Step 7: split each name into first character and remainder
    rest = [ordet[1:] for ordet in ordlista]

    # Step 8: phonetic transformation of the remainder
    rest = [ordet.replace('DT', 'T') for ordet in rest]
    rest = [ordet.replace('X', 'KS') for ordet in rest]

    # Step 9: code the remainder as digits; C before a soft vowel is coded
    # '8' here, before the general table would turn it into '2'
    for vokal in _mjuka_vokaler:
        rest = [ordet.replace('C'+vokal, '8'+vokal) for ordet in rest]
    rest = [ordet.translate(_sfinxbis_translation) for ordet in rest]

    # Step 10: remove adjacent duplicates
    rest = [_delete_consecutive_repeats(ordet) for ordet in rest]

    # Step 11: remove every '9' (the vowel placeholder)
    rest = [ordet.replace('9', '') for ordet in rest]

    # Step 12: join first character and coded remainder again
    ordlista = [ordet[0:1] + kod for ordet, kod in zip(ordlista, rest)]

    # truncate, if maxlength is set
    if maxlength and maxlength < _INFINITY:
        ordlista = [ordet[:maxlength] for ordet in ordlista]

    return tuple(ordlista)
||
| 2680 | |||
| 2681 | |||
| 2682 | def phonet(word, mode=1, lang='de', trace=False): |
||
| 2683 | """Return the phonet code for a word. |
||
| 2684 | |||
| 2685 | phonet ("Hannoveraner Phonetik") was developed by Jörg Michael and |
||
| 2686 | documented in c't magazine vol. 25/1999, p. 252. It is a phonetic |
||
| 2687 | algorithm designed primarily for German. |
||
| 2688 | Cf. http://www.heise.de/ct/ftp/99/25/252/ |
||
| 2689 | |||
| 2690 | This is a port of Jesper Zedlitz's code, which is licensed LGPL: |
||
| 2691 | https://github.com/jze/phonet4java/blob/master/src/main/java/de/zedlitz/phonet4java/Phonet.java |
||
| 2692 | |||
| 2693 | That is, in turn, based on Michael's C code, which is also licensed LGPL: |
||
| 2694 | ftp://ftp.heise.de/pub/ct/listings/phonet.zip |
||
| 2695 | |||
| 2696 | :param str word: the word to transform |
||
| 2697 | :param int mode: the ponet variant to employ (1 or 2) |
||
| 2698 | :param str lang: 'de' (default) for German |
||
| 2699 | 'none' for no language |
||
| 2700 | :param bool trace: prints debugging info if True |
||
| 2701 | :returns: the phonet value |
||
| 2702 | :rtype: str |
||
| 2703 | |||
| 2704 | >>> phonet('Christopher') |
||
| 2705 | 'KRISTOFA' |
||
| 2706 | >>> phonet('Niall') |
||
| 2707 | 'NIAL' |
||
| 2708 | >>> phonet('Smith') |
||
| 2709 | 'SMIT' |
||
| 2710 | >>> phonet('Schmidt') |
||
| 2711 | 'SHMIT' |
||
| 2712 | |||
| 2713 | >>> phonet('Christopher', mode=2) |
||
| 2714 | 'KRIZTUFA' |
||
| 2715 | >>> phonet('Niall', mode=2) |
||
| 2716 | 'NIAL' |
||
| 2717 | >>> phonet('Smith', mode=2) |
||
| 2718 | 'ZNIT' |
||
| 2719 | >>> phonet('Schmidt', mode=2) |
||
| 2720 | 'ZNIT' |
||
| 2721 | |||
| 2722 | >>> phonet('Christopher', lang='none') |
||
| 2723 | 'CHRISTOPHER' |
||
| 2724 | >>> phonet('Niall', lang='none') |
||
| 2725 | 'NIAL' |
||
| 2726 | >>> phonet('Smith', lang='none') |
||
| 2727 | 'SMITH' |
||
| 2728 | >>> phonet('Schmidt', lang='none') |
||
| 2729 | 'SCHMIDT' |
||
| 2730 | """ |
||
| 2731 | # pylint: disable=too-many-branches |
||
| 2732 | |||
| 2733 | _phonet_rules_no_lang = ( # separator chars |
||
| 2734 | '´', ' ', ' ', |
||
| 2735 | '"', ' ', ' ', |
||
| 2736 | '`$', '', '', |
||
| 2737 | '\'', ' ', ' ', |
||
| 2738 | ',', ',', ',', |
||
| 2739 | ';', ',', ',', |
||
| 2740 | '-', ' ', ' ', |
||
| 2741 | ' ', ' ', ' ', |
||
| 2742 | '.', '.', '.', |
||
| 2743 | ':', '.', '.', |
||
| 2744 | # German umlauts |
||
| 2745 | 'Ä', 'AE', 'AE', |
||
| 2746 | 'Ö', 'OE', 'OE', |
||
| 2747 | 'Ü', 'UE', 'UE', |
||
| 2748 | 'ß', 'S', 'S', |
||
| 2749 | # international umlauts |
||
| 2750 | 'À', 'A', 'A', |
||
| 2751 | 'Á', 'A', 'A', |
||
| 2752 | 'Â', 'A', 'A', |
||
| 2753 | 'Ã', 'A', 'A', |
||
| 2754 | 'Å', 'A', 'A', |
||
| 2755 | 'Æ', 'AE', 'AE', |
||
| 2756 | 'Ç', 'C', 'C', |
||
| 2757 | 'Ð', 'DJ', 'DJ', |
||
| 2758 | 'È', 'E', 'E', |
||
| 2759 | 'É', 'E', 'E', |
||
| 2760 | 'Ê', 'E', 'E', |
||
| 2761 | 'Ë', 'E', 'E', |
||
| 2762 | 'Ì', 'I', 'I', |
||
| 2763 | 'Í', 'I', 'I', |
||
| 2764 | 'Î', 'I', 'I', |
||
| 2765 | 'Ï', 'I', 'I', |
||
| 2766 | 'Ñ', 'NH', 'NH', |
||
| 2767 | 'Ò', 'O', 'O', |
||
| 2768 | 'Ó', 'O', 'O', |
||
| 2769 | 'Ô', 'O', 'O', |
||
| 2770 | 'Õ', 'O', 'O', |
||
| 2771 | 'Œ', 'OE', 'OE', |
||
| 2772 | 'Ø', 'OE', 'OE', |
||
| 2773 | 'Š', 'SH', 'SH', |
||
| 2774 | 'Þ', 'TH', 'TH', |
||
| 2775 | 'Ù', 'U', 'U', |
||
| 2776 | 'Ú', 'U', 'U', |
||
| 2777 | 'Û', 'U', 'U', |
||
| 2778 | 'Ý', 'Y', 'Y', |
||
| 2779 | 'Ÿ', 'Y', 'Y', |
||
| 2780 | # 'normal' letters (A-Z) |
||
| 2781 | 'MC^', 'MAC', 'MAC', |
||
| 2782 | 'MC^', 'MAC', 'MAC', |
||
| 2783 | 'M´^', 'MAC', 'MAC', |
||
| 2784 | 'M\'^', 'MAC', 'MAC', |
||
| 2785 | 'O´^', 'O', 'O', |
||
| 2786 | 'O\'^', 'O', 'O', |
||
| 2787 | 'VAN DEN ^', 'VANDEN', 'VANDEN', |
||
| 2788 | None, None, None) |
||
| 2789 | |||
| 2790 | _phonet_rules_german = ( # separator chars |
||
| 2791 | '´', ' ', ' ', |
||
| 2792 | '"', ' ', ' ', |
||
| 2793 | '`$', '', '', |
||
| 2794 | '\'', ' ', ' ', |
||
| 2795 | ',', ' ', ' ', |
||
| 2796 | ';', ' ', ' ', |
||
| 2797 | '-', ' ', ' ', |
||
| 2798 | ' ', ' ', ' ', |
||
| 2799 | '.', '.', '.', |
||
| 2800 | ':', '.', '.', |
||
| 2801 | # German umlauts |
||
| 2802 | 'ÄE', 'E', 'E', |
||
| 2803 | 'ÄU<', 'EU', 'EU', |
||
| 2804 | 'ÄV(AEOU)-<', 'EW', None, |
||
| 2805 | 'Ä$', 'Ä', None, |
||
| 2806 | 'Ä<', None, 'E', |
||
| 2807 | 'Ä', 'E', None, |
||
| 2808 | 'ÖE', 'Ö', 'Ö', |
||
| 2809 | 'ÖU', 'Ö', 'Ö', |
||
| 2810 | 'ÖVER--<', 'ÖW', None, |
||
| 2811 | 'ÖV(AOU)-', 'ÖW', None, |
||
| 2812 | 'ÜBEL(GNRW)-^^', 'ÜBL ', 'IBL ', |
||
| 2813 | 'ÜBER^^', 'ÜBA', 'IBA', |
||
| 2814 | 'ÜE', 'Ü', 'I', |
||
| 2815 | 'ÜVER--<', 'ÜW', None, |
||
| 2816 | 'ÜV(AOU)-', 'ÜW', None, |
||
| 2817 | 'Ü', None, 'I', |
||
| 2818 | 'ßCH<', None, 'Z', |
||
| 2819 | 'ß<', 'S', 'Z', |
||
| 2820 | # international umlauts |
||
| 2821 | 'À<', 'A', 'A', |
||
| 2822 | 'Á<', 'A', 'A', |
||
| 2823 | 'Â<', 'A', 'A', |
||
| 2824 | 'Ã<', 'A', 'A', |
||
| 2825 | 'Å<', 'A', 'A', |
||
| 2826 | 'ÆER-', 'E', 'E', |
||
| 2827 | 'ÆU<', 'EU', 'EU', |
||
| 2828 | 'ÆV(AEOU)-<', 'EW', None, |
||
| 2829 | 'Æ$', 'Ä', None, |
||
| 2830 | 'Æ<', None, 'E', |
||
| 2831 | 'Æ', 'E', None, |
||
| 2832 | 'Ç', 'Z', 'Z', |
||
| 2833 | 'ÐÐ-', '', '', |
||
| 2834 | 'Ð', 'DI', 'TI', |
||
| 2835 | 'È<', 'E', 'E', |
||
| 2836 | 'É<', 'E', 'E', |
||
| 2837 | 'Ê<', 'E', 'E', |
||
| 2838 | 'Ë', 'E', 'E', |
||
| 2839 | 'Ì<', 'I', 'I', |
||
| 2840 | 'Í<', 'I', 'I', |
||
| 2841 | 'Î<', 'I', 'I', |
||
| 2842 | 'Ï', 'I', 'I', |
||
| 2843 | 'ÑÑ-', '', '', |
||
| 2844 | 'Ñ', 'NI', 'NI', |
||
| 2845 | 'Ò<', 'O', 'U', |
||
| 2846 | 'Ó<', 'O', 'U', |
||
| 2847 | 'Ô<', 'O', 'U', |
||
| 2848 | 'Õ<', 'O', 'U', |
||
| 2849 | 'Œ<', 'Ö', 'Ö', |
||
| 2850 | 'Ø(IJY)-<', 'E', 'E', |
||
| 2851 | 'Ø<', 'Ö', 'Ö', |
||
| 2852 | 'Š', 'SH', 'Z', |
||
| 2853 | 'Þ', 'T', 'T', |
||
| 2854 | 'Ù<', 'U', 'U', |
||
| 2855 | 'Ú<', 'U', 'U', |
||
| 2856 | 'Û<', 'U', 'U', |
||
| 2857 | 'Ý<', 'I', 'I', |
||
| 2858 | 'Ÿ<', 'I', 'I', |
||
| 2859 | # 'normal' letters (A-Z) |
||
| 2860 | 'ABELLE$', 'ABL', 'ABL', |
||
| 2861 | 'ABELL$', 'ABL', 'ABL', |
||
| 2862 | 'ABIENNE$', 'ABIN', 'ABIN', |
||
| 2863 | 'ACHME---^', 'ACH', 'AK', |
||
| 2864 | 'ACEY$', 'AZI', 'AZI', |
||
| 2865 | 'ADV', 'ATW', None, |
||
| 2866 | 'AEGL-', 'EK', None, |
||
| 2867 | 'AEU<', 'EU', 'EU', |
||
| 2868 | 'AE2', 'E', 'E', |
||
| 2869 | 'AFTRAUBEN------', 'AFT ', 'AFT ', |
||
| 2870 | 'AGL-1', 'AK', None, |
||
| 2871 | 'AGNI-^', 'AKN', 'AKN', |
||
| 2872 | 'AGNIE-', 'ANI', 'ANI', |
||
| 2873 | 'AGN(AEOU)-$', 'ANI', 'ANI', |
||
| 2874 | 'AH(AIOÖUÜY)-', 'AH', None, |
||
| 2875 | 'AIA2', 'AIA', 'AIA', |
||
| 2876 | 'AIE$', 'E', 'E', |
||
| 2877 | 'AILL(EOU)-', 'ALI', 'ALI', |
||
| 2878 | 'AINE$', 'EN', 'EN', |
||
| 2879 | 'AIRE$', 'ER', 'ER', |
||
| 2880 | 'AIR-', 'E', 'E', |
||
| 2881 | 'AISE$', 'ES', 'EZ', |
||
| 2882 | 'AISSANCE$', 'ESANS', 'EZANZ', |
||
| 2883 | 'AISSE$', 'ES', 'EZ', |
||
| 2884 | 'AIX$', 'EX', 'EX', |
||
| 2885 | 'AJ(AÄEÈÉÊIOÖUÜ)--', 'A', 'A', |
||
| 2886 | 'AKTIE', 'AXIE', 'AXIE', |
||
| 2887 | 'AKTUEL', 'AKTUEL', None, |
||
| 2888 | 'ALOI^', 'ALOI', 'ALUI', # Don't merge these rules |
||
| 2889 | 'ALOY^', 'ALOI', 'ALUI', # needed by 'check_rules' |
||
| 2890 | 'AMATEU(RS)-', 'AMATÖ', 'ANATÖ', |
||
| 2891 | 'ANCH(OEI)-', 'ANSH', 'ANZ', |
||
| 2892 | 'ANDERGEGANG----', 'ANDA GE', 'ANTA KE', |
||
| 2893 | 'ANDERGEHE----', 'ANDA ', 'ANTA ', |
||
| 2894 | 'ANDERGESETZ----', 'ANDA GE', 'ANTA KE', |
||
| 2895 | 'ANDERGING----', 'ANDA ', 'ANTA ', |
||
| 2896 | 'ANDERSETZ(ET)-----', 'ANDA ', 'ANTA ', |
||
| 2897 | 'ANDERZUGEHE----', 'ANDA ZU ', 'ANTA ZU ', |
||
| 2898 | 'ANDERZUSETZE-----', 'ANDA ZU ', 'ANTA ZU ', |
||
| 2899 | 'ANER(BKO)---^^', 'AN', None, |
||
| 2900 | 'ANHAND---^$', 'AN H', 'AN ', |
||
| 2901 | 'ANH(AÄEIOÖUÜY)--^^', 'AN', None, |
||
| 2902 | 'ANIELLE$', 'ANIEL', 'ANIL', |
||
| 2903 | 'ANIEL', 'ANIEL', None, |
||
| 2904 | 'ANSTELLE----^$', 'AN ST', 'AN ZT', |
||
| 2905 | 'ANTI^^', 'ANTI', 'ANTI', |
||
| 2906 | 'ANVER^^', 'ANFA', 'ANFA', |
||
| 2907 | 'ATIA$', 'ATIA', 'ATIA', |
||
| 2908 | 'ATIA(NS)--', 'ATI', 'ATI', |
||
| 2909 | 'ATI(AÄOÖUÜ)-', 'AZI', 'AZI', |
||
| 2910 | 'AUAU--', '', '', |
||
| 2911 | 'AUERE$', 'AUERE', None, |
||
| 2912 | 'AUERE(NS)-$', 'AUERE', None, |
||
| 2913 | 'AUERE(AIOUY)--', 'AUER', None, |
||
| 2914 | 'AUER(AÄIOÖUÜY)-', 'AUER', None, |
||
| 2915 | 'AUER<', 'AUA', 'AUA', |
||
| 2916 | 'AUF^^', 'AUF', 'AUF', |
||
| 2917 | 'AULT$', 'O', 'U', |
||
| 2918 | 'AUR(BCDFGKLMNQSTVWZ)-', 'AUA', 'AUA', |
||
| 2919 | 'AUR$', 'AUA', 'AUA', |
||
| 2920 | 'AUSSE$', 'OS', 'UZ', |
||
| 2921 | 'AUS(ST)-^', 'AUS', 'AUS', |
||
| 2922 | 'AUS^^', 'AUS', 'AUS', |
||
| 2923 | 'AUTOFAHR----', 'AUTO ', 'AUTU ', |
||
| 2924 | 'AUTO^^', 'AUTO', 'AUTU', |
||
| 2925 | 'AUX(IY)-', 'AUX', 'AUX', |
||
| 2926 | 'AUX', 'O', 'U', |
||
| 2927 | 'AU', 'AU', 'AU', |
||
| 2928 | 'AVER--<', 'AW', None, |
||
| 2929 | 'AVIER$', 'AWIE', 'AFIE', |
||
| 2930 | 'AV(EÈÉÊI)-^', 'AW', None, |
||
| 2931 | 'AV(AOU)-', 'AW', None, |
||
| 2932 | 'AYRE$', 'EIRE', 'EIRE', |
||
| 2933 | 'AYRE(NS)-$', 'EIRE', 'EIRE', |
||
| 2934 | 'AYRE(AIOUY)--', 'EIR', 'EIR', |
||
| 2935 | 'AYR(AÄIOÖUÜY)-', 'EIR', 'EIR', |
||
| 2936 | 'AYR<', 'EIA', 'EIA', |
||
| 2937 | 'AYER--<', 'EI', 'EI', |
||
| 2938 | 'AY(AÄEIOÖUÜY)--', 'A', 'A', |
||
| 2939 | 'AË', 'E', 'E', |
||
| 2940 | 'A(IJY)<', 'EI', 'EI', |
||
| 2941 | 'BABY^$', 'BEBI', 'BEBI', |
||
| 2942 | 'BAB(IY)^', 'BEBI', 'BEBI', |
||
| 2943 | 'BEAU^$', 'BO', None, |
||
| 2944 | 'BEA(BCMNRU)-^', 'BEA', 'BEA', |
||
| 2945 | 'BEAT(AEIMORU)-^', 'BEAT', 'BEAT', |
||
| 2946 | 'BEE$', 'BI', 'BI', |
||
| 2947 | 'BEIGE^$', 'BESH', 'BEZ', |
||
| 2948 | 'BENOIT--', 'BENO', 'BENU', |
||
| 2949 | 'BER(DT)-', 'BER', None, |
||
| 2950 | 'BERN(DT)-', 'BERN', None, |
||
| 2951 | 'BE(LMNRST)-^', 'BE', 'BE', |
||
| 2952 | 'BETTE$', 'BET', 'BET', |
||
| 2953 | 'BEVOR^$', 'BEFOR', None, |
||
| 2954 | 'BIC$', 'BIZ', 'BIZ', |
||
| 2955 | 'BOWL(EI)-', 'BOL', 'BUL', |
||
| 2956 | 'BP(AÄEÈÉÊIÌÍÎOÖRUÜY)-', 'B', 'B', |
||
| 2957 | 'BRINGEND-----^', 'BRI', 'BRI', |
||
| 2958 | 'BRINGEND-----', ' BRI', ' BRI', |
||
| 2959 | 'BROW(NS)-', 'BRAU', 'BRAU', |
||
| 2960 | 'BUDGET7', 'BÜGE', 'BIKE', |
||
| 2961 | 'BUFFET7', 'BÜFE', 'BIFE', |
||
| 2962 | 'BYLLE$', 'BILE', 'BILE', |
||
| 2963 | 'BYLL$', 'BIL', 'BIL', |
||
| 2964 | 'BYPA--^', 'BEI', 'BEI', |
||
| 2965 | 'BYTE<', 'BEIT', 'BEIT', |
||
| 2966 | 'BY9^', 'BÜ', None, |
||
| 2967 | 'B(SßZ)$', 'BS', None, |
||
| 2968 | 'CACH(EI)-^', 'KESH', 'KEZ', |
||
| 2969 | 'CAE--', 'Z', 'Z', |
||
| 2970 | 'CA(IY)$', 'ZEI', 'ZEI', |
||
| 2971 | 'CE(EIJUY)--', 'Z', 'Z', |
||
| 2972 | 'CENT<', 'ZENT', 'ZENT', |
||
| 2973 | 'CERST(EI)----^', 'KE', 'KE', |
||
| 2974 | 'CER$', 'ZA', 'ZA', |
||
| 2975 | 'CE3', 'ZE', 'ZE', |
||
| 2976 | 'CH\'S$', 'X', 'X', |
||
| 2977 | 'CH´S$', 'X', 'X', |
||
| 2978 | 'CHAO(ST)-', 'KAO', 'KAU', |
||
| 2979 | 'CHAMPIO-^', 'SHEMPI', 'ZENBI', |
||
| 2980 | 'CHAR(AI)-^', 'KAR', 'KAR', |
||
| 2981 | 'CHAU(CDFSVWXZ)-', 'SHO', 'ZU', |
||
| 2982 | 'CHÄ(CF)-', 'SHE', 'ZE', |
||
| 2983 | 'CHE(CF)-', 'SHE', 'ZE', |
||
| 2984 | 'CHEM-^', 'KE', 'KE', # or: 'CHE', 'KE' |
||
| 2985 | 'CHEQUE<', 'SHEK', 'ZEK', |
||
| 2986 | 'CHI(CFGPVW)-', 'SHI', 'ZI', |
||
| 2987 | 'CH(AEUY)-<^', 'SH', 'Z', |
||
| 2988 | 'CHK-', '', '', |
||
| 2989 | 'CHO(CKPS)-^', 'SHO', 'ZU', |
||
| 2990 | 'CHRIS-', 'KRI', None, |
||
| 2991 | 'CHRO-', 'KR', None, |
||
| 2992 | 'CH(LOR)-<^', 'K', 'K', |
||
| 2993 | 'CHST-', 'X', 'X', |
||
| 2994 | 'CH(SßXZ)3', 'X', 'X', |
||
| 2995 | 'CHTNI-3', 'CHN', 'KN', |
||
| 2996 | 'CH^', 'K', 'K', # or: 'CH', 'K' |
||
| 2997 | 'CH', 'CH', 'K', |
||
| 2998 | 'CIC$', 'ZIZ', 'ZIZ', |
||
| 2999 | 'CIENCEFICT----', 'EIENS ', 'EIENZ ', |
||
| 3000 | 'CIENCE$', 'EIENS', 'EIENZ', |
||
| 3001 | 'CIER$', 'ZIE', 'ZIE', |
||
| 3002 | 'CYB-^', 'ZEI', 'ZEI', |
||
| 3003 | 'CY9^', 'ZÜ', 'ZI', |
||
| 3004 | 'C(IJY)-<3', 'Z', 'Z', |
||
| 3005 | 'CLOWN-', 'KLAU', 'KLAU', |
||
| 3006 | 'CCH', 'Z', 'Z', |
||
| 3007 | 'CCE-', 'X', 'X', |
||
| 3008 | 'C(CK)-', '', '', |
||
| 3009 | 'CLAUDET---', 'KLO', 'KLU', |
||
| 3010 | 'CLAUDINE^$', 'KLODIN', 'KLUTIN', |
||
| 3011 | 'COACH', 'KOSH', 'KUZ', |
||
| 3012 | 'COLE$', 'KOL', 'KUL', |
||
| 3013 | 'COUCH', 'KAUSH', 'KAUZ', |
||
| 3014 | 'COW', 'KAU', 'KAU', |
||
| 3015 | 'CQUES$', 'K', 'K', |
||
| 3016 | 'CQUE', 'K', 'K', |
||
| 3017 | 'CRASH--9', 'KRE', 'KRE', |
||
| 3018 | 'CREAT-^', 'KREA', 'KREA', |
||
| 3019 | 'CST', 'XT', 'XT', |
||
| 3020 | 'CS<^', 'Z', 'Z', |
||
| 3021 | 'C(SßX)', 'X', 'X', |
||
| 3022 | 'CT\'S$', 'X', 'X', |
||
| 3023 | 'CT(SßXZ)', 'X', 'X', |
||
| 3024 | 'CZ<', 'Z', 'Z', |
||
| 3025 | 'C(ÈÉÊÌÍÎÝ)3', 'Z', 'Z', |
||
| 3026 | 'C.^', 'C.', 'C.', |
||
| 3027 | 'CÄ-', 'Z', 'Z', |
||
| 3028 | 'CÜ$', 'ZÜ', 'ZI', |
||
| 3029 | 'C\'S$', 'X', 'X', |
||
| 3030 | 'C<', 'K', 'K', |
||
| 3031 | 'DAHER^$', 'DAHER', None, |
||
| 3032 | 'DARAUFFOLGE-----', 'DARAUF ', 'TARAUF ', |
||
| 3033 | 'DAVO(NR)-^$', 'DAFO', 'TAFU', |
||
| 3034 | 'DD(SZ)--<', '', '', |
||
| 3035 | 'DD9', 'D', None, |
||
| 3036 | 'DEPOT7', 'DEPO', 'TEBU', |
||
| 3037 | 'DESIGN', 'DISEIN', 'TIZEIN', |
||
| 3038 | 'DE(LMNRST)-3^', 'DE', 'TE', |
||
| 3039 | 'DETTE$', 'DET', 'TET', |
||
| 3040 | 'DH$', 'T', None, |
||
| 3041 | 'DIC$', 'DIZ', 'TIZ', |
||
| 3042 | 'DIDR-^', 'DIT', None, |
||
| 3043 | 'DIEDR-^', 'DIT', None, |
||
| 3044 | 'DJ(AEIOU)-^', 'I', 'I', |
||
| 3045 | 'DMITR-^', 'DIMIT', 'TINIT', |
||
| 3046 | 'DRY9^', 'DRÜ', None, |
||
| 3047 | 'DT-', '', '', |
||
| 3048 | 'DUIS-^', 'DÜ', 'TI', |
||
| 3049 | 'DURCH^^', 'DURCH', 'TURK', |
||
| 3050 | 'DVA$', 'TWA', None, |
||
| 3051 | 'DY9^', 'DÜ', None, |
||
| 3052 | 'DYS$', 'DIS', None, |
||
| 3053 | 'DS(CH)--<', 'T', 'T', |
||
| 3054 | 'DST', 'ZT', 'ZT', |
||
| 3055 | 'DZS(CH)--', 'T', 'T', |
||
| 3056 | 'D(SßZ)', 'Z', 'Z', |
||
| 3057 | 'D(AÄEIOÖRUÜY)-', 'D', None, |
||
| 3058 | 'D(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'D', None, |
||
| 3059 | 'D\'H^', 'D', 'T', |
||
| 3060 | 'D´H^', 'D', 'T', |
||
| 3061 | 'D`H^', 'D', 'T', |
||
| 3062 | 'D\'S3$', 'Z', 'Z', |
||
| 3063 | 'D´S3$', 'Z', 'Z', |
||
| 3064 | 'D^', 'D', None, |
||
| 3065 | 'D', 'T', 'T', |
||
| 3066 | 'EAULT$', 'O', 'U', |
||
| 3067 | 'EAUX$', 'O', 'U', |
||
| 3068 | 'EAU', 'O', 'U', |
||
| 3069 | 'EAV', 'IW', 'IF', |
||
| 3070 | 'EAS3$', 'EAS', None, |
||
| 3071 | 'EA(AÄEIOÖÜY)-3', 'EA', 'EA', |
||
| 3072 | 'EA3$', 'EA', 'EA', |
||
| 3073 | 'EA3', 'I', 'I', |
||
| 3074 | 'EBENSO^$', 'EBNSO', 'EBNZU', |
||
| 3075 | 'EBENSO^^', 'EBNSO ', 'EBNZU ', |
||
| 3076 | 'EBEN^^', 'EBN', 'EBN', |
||
| 3077 | 'EE9', 'E', 'E', |
||
| 3078 | 'EGL-1', 'EK', None, |
||
| 3079 | 'EHE(IUY)--1', 'EH', None, |
||
| 3080 | 'EHUNG---1', 'E', None, |
||
| 3081 | 'EH(AÄIOÖUÜY)-1', 'EH', None, |
||
| 3082 | 'EIEI--', '', '', |
||
| 3083 | 'EIERE^$', 'EIERE', None, |
||
| 3084 | 'EIERE$', 'EIERE', None, |
||
| 3085 | 'EIERE(NS)-$', 'EIERE', None, |
||
| 3086 | 'EIERE(AIOUY)--', 'EIER', None, |
||
| 3087 | 'EIER(AÄIOÖUÜY)-', 'EIER', None, |
||
| 3088 | 'EIER<', 'EIA', None, |
||
| 3089 | 'EIGL-1', 'EIK', None, |
||
| 3090 | 'EIGH$', 'EI', 'EI', |
||
| 3091 | 'EIH--', 'E', 'E', |
||
| 3092 | 'EILLE$', 'EI', 'EI', |
||
| 3093 | 'EIR(BCDFGKLMNQSTVWZ)-', 'EIA', 'EIA', |
||
| 3094 | 'EIR$', 'EIA', 'EIA', |
||
| 3095 | 'EITRAUBEN------', 'EIT ', 'EIT ', |
||
| 3096 | 'EI', 'EI', 'EI', |
||
| 3097 | 'EJ$', 'EI', 'EI', |
||
| 3098 | 'ELIZ^', 'ELIS', None, |
||
| 3099 | 'ELZ^', 'ELS', None, |
||
| 3100 | 'EL-^', 'E', 'E', |
||
| 3101 | 'ELANG----1', 'E', 'E', |
||
| 3102 | 'EL(DKL)--1', 'E', 'E', |
||
| 3103 | 'EL(MNT)--1$', 'E', 'E', |
||
| 3104 | 'ELYNE$', 'ELINE', 'ELINE', |
||
| 3105 | 'ELYN$', 'ELIN', 'ELIN', |
||
| 3106 | 'EL(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'EL', 'EL', |
||
| 3107 | 'EL-1', 'L', 'L', |
||
| 3108 | 'EM-^', None, 'E', |
||
| 3109 | 'EM(DFKMPQT)--1', None, 'E', |
||
| 3110 | 'EM(AÄEÈÉÊIÌÍÎOÖUÜY)--1', None, 'E', |
||
| 3111 | 'EM-1', None, 'N', |
||
| 3112 | 'ENGAG-^', 'ANGA', 'ANKA', |
||
| 3113 | 'EN-^', 'E', 'E', |
||
| 3114 | 'ENTUEL', 'ENTUEL', None, |
||
| 3115 | 'EN(CDGKQSTZ)--1', 'E', 'E', |
||
| 3116 | 'EN(AÄEÈÉÊIÌÍÎNOÖUÜY)-1', 'EN', 'EN', |
||
| 3117 | 'EN-1', '', '', |
||
| 3118 | 'ERH(AÄEIOÖUÜ)-^', 'ERH', 'ER', |
||
| 3119 | 'ER-^', 'E', 'E', |
||
| 3120 | 'ERREGEND-----', ' ER', ' ER', |
||
| 3121 | 'ERT1$', 'AT', None, |
||
| 3122 | 'ER(DGLKMNRQTZß)-1', 'ER', None, |
||
| 3123 | 'ER(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'ER', 'A', |
||
| 3124 | 'ER1$', 'A', 'A', |
||
| 3125 | 'ER<1', 'A', 'A', |
||
| 3126 | 'ETAT7', 'ETA', 'ETA', |
||
| 3127 | 'ETI(AÄOÖÜU)-', 'EZI', 'EZI', |
||
| 3128 | 'EUERE$', 'EUERE', None, |
||
| 3129 | 'EUERE(NS)-$', 'EUERE', None, |
||
| 3130 | 'EUERE(AIOUY)--', 'EUER', None, |
||
| 3131 | 'EUER(AÄIOÖUÜY)-', 'EUER', None, |
||
| 3132 | 'EUER<', 'EUA', None, |
||
| 3133 | 'EUEU--', '', '', |
||
| 3134 | 'EUILLE$', 'Ö', 'Ö', |
||
| 3135 | 'EUR$', 'ÖR', 'ÖR', |
||
| 3136 | 'EUX', 'Ö', 'Ö', |
||
| 3137 | 'EUSZ$', 'EUS', None, |
||
| 3138 | 'EUTZ$', 'EUS', None, |
||
| 3139 | 'EUYS$', 'EUS', 'EUZ', |
||
| 3140 | 'EUZ$', 'EUS', None, |
||
| 3141 | 'EU', 'EU', 'EU', |
||
| 3142 | 'EVER--<1', 'EW', None, |
||
| 3143 | 'EV(ÄOÖUÜ)-1', 'EW', None, |
||
| 3144 | 'EYER<', 'EIA', 'EIA', |
||
| 3145 | 'EY<', 'EI', 'EI', |
||
| 3146 | 'FACETTE', 'FASET', 'FAZET', |
||
| 3147 | 'FANS--^$', 'FE', 'FE', |
||
| 3148 | 'FAN-^$', 'FE', 'FE', |
||
| 3149 | 'FAULT-', 'FOL', 'FUL', |
||
| 3150 | 'FEE(DL)-', 'FI', 'FI', |
||
| 3151 | 'FEHLER', 'FELA', 'FELA', |
||
| 3152 | 'FE(LMNRST)-3^', 'FE', 'FE', |
||
| 3153 | 'FOERDERN---^', 'FÖRD', 'FÖRT', |
||
| 3154 | 'FOERDERN---', ' FÖRD', ' FÖRT', |
||
| 3155 | 'FOND7', 'FON', 'FUN', |
||
| 3156 | 'FRAIN$', 'FRA', 'FRA', |
||
| 3157 | 'FRISEU(RS)-', 'FRISÖ', 'FRIZÖ', |
||
| 3158 | 'FY9^', 'FÜ', None, |
||
| 3159 | 'FÖRDERN---^', 'FÖRD', 'FÖRT', |
||
| 3160 | 'FÖRDERN---', ' FÖRD', ' FÖRT', |
||
| 3161 | 'GAGS^$', 'GEX', 'KEX', |
||
| 3162 | 'GAG^$', 'GEK', 'KEK', |
||
| 3163 | 'GD', 'KT', 'KT', |
||
| 3164 | 'GEGEN^^', 'GEGN', 'KEKN', |
||
| 3165 | 'GEGENGEKOM-----', 'GEGN ', 'KEKN ', |
||
| 3166 | 'GEGENGESET-----', 'GEGN ', 'KEKN ', |
||
| 3167 | 'GEGENKOMME-----', 'GEGN ', 'KEKN ', |
||
| 3168 | 'GEGENZUKOM---', 'GEGN ZU ', 'KEKN ZU ', |
||
| 3169 | 'GENDETWAS-----$', 'GENT ', 'KENT ', |
||
| 3170 | 'GENRE', 'IORE', 'IURE', |
||
| 3171 | 'GE(LMNRST)-3^', 'GE', 'KE', |
||
| 3172 | 'GER(DKT)-', 'GER', None, |
||
| 3173 | 'GETTE$', 'GET', 'KET', |
||
| 3174 | 'GGF.', 'GF.', None, |
||
| 3175 | 'GG-', '', '', |
||
| 3176 | 'GH', 'G', None, |
||
| 3177 | 'GI(AOU)-^', 'I', 'I', |
||
| 3178 | 'GION-3', 'KIO', 'KIU', |
||
| 3179 | 'G(CK)-', '', '', |
||
| 3180 | 'GJ(AEIOU)-^', 'I', 'I', |
||
| 3181 | 'GMBH^$', 'GMBH', 'GMBH', |
||
| 3182 | 'GNAC$', 'NIAK', 'NIAK', |
||
| 3183 | 'GNON$', 'NION', 'NIUN', |
||
| 3184 | 'GN$', 'N', 'N', |
||
| 3185 | 'GONCAL-^', 'GONZA', 'KUNZA', |
||
| 3186 | 'GRY9^', 'GRÜ', None, |
||
| 3187 | 'G(SßXZ)-<', 'K', 'K', |
||
| 3188 | 'GUCK-', 'KU', 'KU', |
||
| 3189 | 'GUISEP-^', 'IUSE', 'IUZE', |
||
| 3190 | 'GUI-^', 'G', 'K', |
||
| 3191 | 'GUTAUSSEH------^', 'GUT ', 'KUT ', |
||
| 3192 | 'GUTGEHEND------^', 'GUT ', 'KUT ', |
||
| 3193 | 'GY9^', 'GÜ', None, |
||
| 3194 | 'G(AÄEILOÖRUÜY)-', 'G', None, |
||
| 3195 | 'G(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'G', None, |
||
| 3196 | 'G\'S$', 'X', 'X', |
||
| 3197 | 'G´S$', 'X', 'X', |
||
| 3198 | 'G^', 'G', None, |
||
| 3199 | 'G', 'K', 'K', |
||
| 3200 | 'HA(HIUY)--1', 'H', None, |
||
| 3201 | 'HANDVOL---^', 'HANT ', 'ANT ', |
||
| 3202 | 'HANNOVE-^', 'HANOF', None, |
||
| 3203 | 'HAVEN7$', 'HAFN', None, |
||
| 3204 | 'HEAD-', 'HE', 'E', |
||
| 3205 | 'HELIEGEN------', 'E ', 'E ', |
||
| 3206 | 'HESTEHEN------', 'E ', 'E ', |
||
| 3207 | 'HE(LMNRST)-3^', 'HE', 'E', |
||
| 3208 | 'HE(LMN)-1', 'E', 'E', |
||
| 3209 | 'HEUR1$', 'ÖR', 'ÖR', |
||
| 3210 | 'HE(HIUY)--1', 'H', None, |
||
| 3211 | 'HIH(AÄEIOÖUÜY)-1', 'IH', None, |
||
| 3212 | 'HLH(AÄEIOÖUÜY)-1', 'LH', None, |
||
| 3213 | 'HMH(AÄEIOÖUÜY)-1', 'MH', None, |
||
| 3214 | 'HNH(AÄEIOÖUÜY)-1', 'NH', None, |
||
| 3215 | 'HOBBY9^', 'HOBI', None, |
||
| 3216 | 'HOCHBEGAB-----^', 'HOCH ', 'UK ', |
||
| 3217 | 'HOCHTALEN-----^', 'HOCH ', 'UK ', |
||
| 3218 | 'HOCHZUFRI-----^', 'HOCH ', 'UK ', |
||
| 3219 | 'HO(HIY)--1', 'H', None, |
||
| 3220 | 'HRH(AÄEIOÖUÜY)-1', 'RH', None, |
||
| 3221 | 'HUH(AÄEIOÖUÜY)-1', 'UH', None, |
||
| 3222 | 'HUIS^^', 'HÜS', 'IZ', |
||
| 3223 | 'HUIS$', 'ÜS', 'IZ', |
||
| 3224 | 'HUI--1', 'H', None, |
||
| 3225 | 'HYGIEN^', 'HÜKIEN', None, |
||
| 3226 | 'HY9^', 'HÜ', None, |
||
| 3227 | 'HY(BDGMNPST)-', 'Ü', None, |
||
| 3228 | 'H.^', None, 'H.', |
||
| 3229 | 'HÄU--1', 'H', None, |
||
| 3230 | 'H^', 'H', '', |
||
| 3231 | 'H', '', '', |
||
| 3232 | 'ICHELL---', 'ISH', 'IZ', |
||
| 3233 | 'ICHI$', 'ISHI', 'IZI', |
||
| 3234 | 'IEC$', 'IZ', 'IZ', |
||
| 3235 | 'IEDENSTELLE------', 'IDN ', 'ITN ', |
||
| 3236 | 'IEI-3', '', '', |
||
| 3237 | 'IELL3', 'IEL', 'IEL', |
||
| 3238 | 'IENNE$', 'IN', 'IN', |
||
| 3239 | 'IERRE$', 'IER', 'IER', |
||
| 3240 | 'IERZULAN---', 'IR ZU ', 'IR ZU ', |
||
| 3241 | 'IETTE$', 'IT', 'IT', |
||
| 3242 | 'IEU', 'IÖ', 'IÖ', |
||
| 3243 | 'IE<4', 'I', 'I', |
||
| 3244 | 'IGL-1', 'IK', None, |
||
| 3245 | 'IGHT3$', 'EIT', 'EIT', |
||
| 3246 | 'IGNI(EO)-', 'INI', 'INI', |
||
| 3247 | 'IGN(AEOU)-$', 'INI', 'INI', |
||
| 3248 | 'IHER(DGLKRT)--1', 'IHE', None, |
||
| 3249 | 'IHE(IUY)--', 'IH', None, |
||
| 3250 | 'IH(AIOÖUÜY)-', 'IH', None, |
||
| 3251 | 'IJ(AOU)-', 'I', 'I', |
||
| 3252 | 'IJ$', 'I', 'I', |
||
| 3253 | 'IJ<', 'EI', 'EI', |
||
| 3254 | 'IKOLE$', 'IKOL', 'IKUL', |
||
| 3255 | 'ILLAN(STZ)--4', 'ILIA', 'ILIA', |
||
| 3256 | 'ILLAR(DT)--4', 'ILIA', 'ILIA', |
||
| 3257 | 'IMSTAN----^', 'IM ', 'IN ', |
||
| 3258 | 'INDELERREGE------', 'INDL ', 'INTL ', |
||
| 3259 | 'INFRAGE-----^$', 'IN ', 'IN ', |
||
| 3260 | 'INTERN(AOU)-^', 'INTAN', 'INTAN', |
||
| 3261 | 'INVER-', 'INWE', 'INFE', |
||
| 3262 | 'ITI(AÄIOÖUÜ)-', 'IZI', 'IZI', |
||
| 3263 | 'IUSZ$', 'IUS', None, |
||
| 3264 | 'IUTZ$', 'IUS', None, |
||
| 3265 | 'IUZ$', 'IUS', None, |
||
| 3266 | 'IVER--<', 'IW', None, |
||
| 3267 | 'IVIER$', 'IWIE', 'IFIE', |
||
| 3268 | 'IV(ÄOÖUÜ)-', 'IW', None, |
||
| 3269 | 'IV<3', 'IW', None, |
||
| 3270 | 'IY2', 'I', None, |
||
| 3271 | 'I(ÈÉÊ)<4', 'I', 'I', |
||
| 3272 | 'JAVIE---<^', 'ZA', 'ZA', |
||
| 3273 | 'JEANS^$', 'JINS', 'INZ', |
||
| 3274 | 'JEANNE^$', 'IAN', 'IAN', |
||
| 3275 | 'JEAN-^', 'IA', 'IA', |
||
| 3276 | 'JER-^', 'IE', 'IE', |
||
| 3277 | 'JE(LMNST)-', 'IE', 'IE', |
||
| 3278 | 'JI^', 'JI', None, |
||
| 3279 | 'JOR(GK)^$', 'IÖRK', 'IÖRK', |
||
| 3280 | 'J', 'I', 'I', |
||
| 3281 | 'KC(ÄEIJ)-', 'X', 'X', |
||
| 3282 | 'KD', 'KT', None, |
||
| 3283 | 'KE(LMNRST)-3^', 'KE', 'KE', |
||
| 3284 | 'KG(AÄEILOÖRUÜY)-', 'K', None, |
||
| 3285 | 'KH<^', 'K', 'K', |
||
| 3286 | 'KIC$', 'KIZ', 'KIZ', |
||
| 3287 | 'KLE(LMNRST)-3^', 'KLE', 'KLE', |
||
| 3288 | 'KOTELE-^', 'KOTL', 'KUTL', |
||
| 3289 | 'KREAT-^', 'KREA', 'KREA', |
||
| 3290 | 'KRÜS(TZ)--^', 'KRI', None, |
||
| 3291 | 'KRYS(TZ)--^', 'KRI', None, |
||
| 3292 | 'KRY9^', 'KRÜ', None, |
||
| 3293 | 'KSCH---', 'K', 'K', |
||
| 3294 | 'KSH--', 'K', 'K', |
||
| 3295 | 'K(SßXZ)7', 'X', 'X', # implies 'KST' -> 'XT' |
||
| 3296 | 'KT\'S$', 'X', 'X', |
||
| 3297 | 'KTI(AIOU)-3', 'XI', 'XI', |
||
| 3298 | 'KT(SßXZ)', 'X', 'X', |
||
| 3299 | 'KY9^', 'KÜ', None, |
||
| 3300 | 'K\'S$', 'X', 'X', |
||
| 3301 | 'K´S$', 'X', 'X', |
||
| 3302 | 'LANGES$', ' LANGES', ' LANKEZ', |
||
| 3303 | 'LANGE$', ' LANGE', ' LANKE', |
||
| 3304 | 'LANG$', ' LANK', ' LANK', |
||
| 3305 | 'LARVE-', 'LARF', 'LARF', |
||
| 3306 | 'LD(SßZ)$', 'LS', 'LZ', |
||
| 3307 | 'LD\'S$', 'LS', 'LZ', |
||
| 3308 | 'LD´S$', 'LS', 'LZ', |
||
| 3309 | 'LEAND-^', 'LEAN', 'LEAN', |
||
| 3310 | 'LEERSTEHE-----^', 'LER ', 'LER ', |
||
| 3311 | 'LEICHBLEIB-----', 'LEICH ', 'LEIK ', |
||
| 3312 | 'LEICHLAUTE-----', 'LEICH ', 'LEIK ', |
||
| 3313 | 'LEIDERREGE------', 'LEIT ', 'LEIT ', |
||
| 3314 | 'LEIDGEPR----^', 'LEIT ', 'LEIT ', |
||
| 3315 | 'LEINSTEHE-----', 'LEIN ', 'LEIN ', |
||
| 3316 | 'LEL-', 'LE', 'LE', |
||
| 3317 | 'LE(MNRST)-3^', 'LE', 'LE', |
||
| 3318 | 'LETTE$', 'LET', 'LET', |
||
| 3319 | 'LFGNAG-', 'LFGAN', 'LFKAN', |
||
| 3320 | 'LICHERWEIS----', 'LICHA ', 'LIKA ', |
||
| 3321 | 'LIC$', 'LIZ', 'LIZ', |
||
| 3322 | 'LIVE^$', 'LEIF', 'LEIF', |
||
| 3323 | 'LT(SßZ)$', 'LS', 'LZ', |
||
| 3324 | 'LT\'S$', 'LS', 'LZ', |
||
| 3325 | 'LT´S$', 'LS', 'LZ', |
||
| 3326 | 'LUI(GS)--', 'LU', 'LU', |
||
| 3327 | 'LV(AIO)-', 'LW', None, |
||
| 3328 | 'LY9^', 'LÜ', None, |
||
| 3329 | 'LSTS$', 'LS', 'LZ', |
||
| 3330 | 'LZ(BDFGKLMNPQRSTVWX)-', 'LS', None, |
||
| 3331 | 'L(SßZ)$', 'LS', None, |
||
| 3332 | 'MAIR-<', 'MEI', 'NEI', |
||
| 3333 | 'MANAG-', 'MENE', 'NENE', |
||
| 3334 | 'MANUEL', 'MANUEL', None, |
||
| 3335 | 'MASSEU(RS)-', 'MASÖ', 'NAZÖ', |
||
| 3336 | 'MATCH', 'MESH', 'NEZ', |
||
| 3337 | 'MAURICE', 'MORIS', 'NURIZ', |
||
| 3338 | 'MBH^$', 'MBH', 'MBH', |
||
| 3339 | 'MB(ßZ)$', 'MS', None, |
||
| 3340 | 'MB(SßTZ)-', 'M', 'N', |
||
| 3341 | 'MCG9^', 'MAK', 'NAK', |
||
| 3342 | 'MC9^', 'MAK', 'NAK', |
||
| 3343 | 'MEMOIR-^', 'MEMOA', 'NENUA', |
||
| 3344 | 'MERHAVEN$', 'MAHAFN', None, |
||
| 3345 | 'ME(LMNRST)-3^', 'ME', 'NE', |
||
| 3346 | 'MEN(STZ)--3', 'ME', None, |
||
| 3347 | 'MEN$', 'MEN', None, |
||
| 3348 | 'MIGUEL-', 'MIGE', 'NIKE', |
||
| 3349 | 'MIKE^$', 'MEIK', 'NEIK', |
||
| 3350 | 'MITHILFE----^$', 'MIT H', 'NIT ', |
||
| 3351 | 'MN$', 'M', None, |
||
| 3352 | 'MN', 'N', 'N', |
||
| 3353 | 'MPJUTE-', 'MPUT', 'NBUT', |
||
| 3354 | 'MP(ßZ)$', 'MS', None, |
||
| 3355 | 'MP(SßTZ)-', 'M', 'N', |
||
| 3356 | 'MP(BDJLMNPQVW)-', 'MB', 'NB', |
||
| 3357 | 'MY9^', 'MÜ', None, |
||
| 3358 | 'M(ßZ)$', 'MS', None, |
||
| 3359 | 'M´G7^', 'MAK', 'NAK', |
||
| 3360 | 'M\'G7^', 'MAK', 'NAK', |
||
| 3361 | 'M´^', 'MAK', 'NAK', |
||
| 3362 | 'M\'^', 'MAK', 'NAK', |
||
| 3363 | 'M', None, 'N', |
||
| 3364 | 'NACH^^', 'NACH', 'NAK', |
||
| 3365 | 'NADINE', 'NADIN', 'NATIN', |
||
| 3366 | 'NAIV--', 'NA', 'NA', |
||
| 3367 | 'NAISE$', 'NESE', 'NEZE', |
||
| 3368 | 'NAUGENOMM------', 'NAU ', 'NAU ', |
||
| 3369 | 'NAUSOGUT$', 'NAUSO GUT', 'NAUZU KUT', |
||
| 3370 | 'NCH$', 'NSH', 'NZ', |
||
| 3371 | 'NCOISE$', 'SOA', 'ZUA', |
||
| 3372 | 'NCOIS$', 'SOA', 'ZUA', |
||
| 3373 | 'NDAR$', 'NDA', 'NTA', |
||
| 3374 | 'NDERINGEN------', 'NDE ', 'NTE ', |
||
| 3375 | 'NDRO(CDKTZ)-', 'NTRO', None, |
||
| 3376 | 'ND(BFGJLMNPQVW)-', 'NT', None, |
||
| 3377 | 'ND(SßZ)$', 'NS', 'NZ', |
||
| 3378 | 'ND\'S$', 'NS', 'NZ', |
||
| 3379 | 'ND´S$', 'NS', 'NZ', |
||
| 3380 | 'NEBEN^^', 'NEBN', 'NEBN', |
||
| 3381 | 'NENGELERN------', 'NEN ', 'NEN ', |
||
| 3382 | 'NENLERN(ET)---', 'NEN LE', 'NEN LE', |
||
| 3383 | 'NENZULERNE---', 'NEN ZU LE', 'NEN ZU LE', |
||
| 3384 | 'NE(LMNRST)-3^', 'NE', 'NE', |
||
| 3385 | 'NEN-3', 'NE', 'NE', |
||
| 3386 | 'NETTE$', 'NET', 'NET', |
||
| 3387 | 'NGU^^', 'NU', 'NU', |
||
| 3388 | 'NG(BDFJLMNPQRTVW)-', 'NK', 'NK', |
||
| 3389 | 'NH(AUO)-$', 'NI', 'NI', |
||
| 3390 | 'NICHTSAHNEN-----', 'NIX ', 'NIX ', |
||
| 3391 | 'NICHTSSAGE----', 'NIX ', 'NIX ', |
||
| 3392 | 'NICHTS^^', 'NIX', 'NIX', |
||
| 3393 | 'NICHT^^', 'NICHT', 'NIKT', |
||
| 3394 | 'NINE$', 'NIN', 'NIN', |
||
| 3395 | 'NON^^', 'NON', 'NUN', |
||
| 3396 | 'NOTLEIDE-----^', 'NOT ', 'NUT ', |
||
| 3397 | 'NOT^^', 'NOT', 'NUT', |
||
| 3398 | 'NTI(AIOU)-3', 'NZI', 'NZI', |
||
| 3399 | 'NTIEL--3', 'NZI', 'NZI', |
||
| 3400 | 'NT(SßZ)$', 'NS', 'NZ', |
||
| 3401 | 'NT\'S$', 'NS', 'NZ', |
||
| 3402 | 'NT´S$', 'NS', 'NZ', |
||
| 3403 | 'NYLON', 'NEILON', 'NEILUN', |
||
| 3404 | 'NY9^', 'NÜ', None, |
||
| 3405 | 'NSTZUNEH---', 'NST ZU ', 'NZT ZU ', |
||
| 3406 | 'NSZ-', 'NS', None, |
||
| 3407 | 'NSTS$', 'NS', 'NZ', |
||
| 3408 | 'NZ(BDFGKLMNPQRSTVWX)-', 'NS', None, |
||
| 3409 | 'N(SßZ)$', 'NS', None, |
||
| 3410 | 'OBERE-', 'OBER', None, |
||
| 3411 | 'OBER^^', 'OBA', 'UBA', |
||
| 3412 | 'OEU2', 'Ö', 'Ö', |
||
| 3413 | 'OE<2', 'Ö', 'Ö', |
||
| 3414 | 'OGL-', 'OK', None, |
||
| 3415 | 'OGNIE-', 'ONI', 'UNI', |
||
| 3416 | 'OGN(AEOU)-$', 'ONI', 'UNI', |
||
| 3417 | 'OH(AIOÖUÜY)-', 'OH', None, |
||
| 3418 | 'OIE$', 'Ö', 'Ö', |
||
| 3419 | 'OIRE$', 'OA', 'UA', |
||
| 3420 | 'OIR$', 'OA', 'UA', |
||
| 3421 | 'OIX', 'OA', 'UA', |
||
| 3422 | 'OI<3', 'EU', 'EU', |
||
| 3423 | 'OKAY^$', 'OKE', 'UKE', |
||
| 3424 | 'OLYN$', 'OLIN', 'ULIN', |
||
| 3425 | 'OO(DLMZ)-', 'U', None, |
||
| 3426 | 'OO$', 'U', None, |
||
| 3427 | 'OO-', '', '', |
||
| 3428 | 'ORGINAL-----', 'ORI', 'URI', |
||
| 3429 | 'OTI(AÄOÖUÜ)-', 'OZI', 'UZI', |
||
| 3430 | 'OUI^', 'WI', 'FI', |
||
| 3431 | 'OUILLE$', 'ULIE', 'ULIE', |
||
| 3432 | 'OU(DT)-^', 'AU', 'AU', |
||
| 3433 | 'OUSE$', 'AUS', 'AUZ', |
||
| 3434 | 'OUT-', 'AU', 'AU', |
||
| 3435 | 'OU', 'U', 'U', |
||
| 3436 | 'O(FV)$', 'AU', 'AU', # due to 'OW$' -> 'AU' |
||
| 3437 | 'OVER--<', 'OW', None, |
||
| 3438 | 'OV(AOU)-', 'OW', None, |
||
| 3439 | 'OW$', 'AU', 'AU', |
||
| 3440 | 'OWS$', 'OS', 'UZ', |
||
| 3441 | 'OJ(AÄEIOÖUÜ)--', 'O', 'U', |
||
| 3442 | 'OYER', 'OIA', None, |
||
| 3443 | 'OY(AÄEIOÖUÜ)--', 'O', 'U', |
||
| 3444 | 'O(JY)<', 'EU', 'EU', |
||
| 3445 | 'OZ$', 'OS', None, |
||
| 3446 | 'O´^', 'O', 'U', |
||
| 3447 | 'O\'^', 'O', 'U', |
||
| 3448 | 'O', None, 'U', |
||
| 3449 | 'PATIEN--^', 'PAZI', 'PAZI', |
||
| 3450 | 'PENSIO-^', 'PANSI', 'PANZI', |
||
| 3451 | 'PE(LMNRST)-3^', 'PE', 'PE', |
||
| 3452 | 'PFER-^', 'FE', 'FE', |
||
| 3453 | 'P(FH)<', 'F', 'F', |
||
| 3454 | 'PIC^$', 'PIK', 'PIK', |
||
| 3455 | 'PIC$', 'PIZ', 'PIZ', |
||
| 3456 | 'PIPELINE', 'PEIBLEIN', 'PEIBLEIN', |
||
| 3457 | 'POLYP-', 'POLÜ', None, |
||
| 3458 | 'POLY^^', 'POLI', 'PULI', |
||
| 3459 | 'PORTRAIT7', 'PORTRE', 'PURTRE', |
||
| 3460 | 'POWER7', 'PAUA', 'PAUA', |
||
| 3461 | 'PP(FH)--<', 'B', 'B', |
||
| 3462 | 'PP-', '', '', |
||
| 3463 | 'PRODUZ-^', 'PRODU', 'BRUTU', |
||
| 3464 | 'PRODUZI--', ' PRODU', ' BRUTU', |
||
| 3465 | 'PRIX^$', 'PRI', 'PRI', |
||
| 3466 | 'PS-^^', 'P', None, |
||
| 3467 | 'P(SßZ)^', None, 'Z', |
||
| 3468 | 'P(SßZ)$', 'BS', None, |
||
| 3469 | 'PT-^', '', '', |
||
| 3470 | 'PTI(AÄOÖUÜ)-3', 'BZI', 'BZI', |
||
| 3471 | 'PY9^', 'PÜ', None, |
||
| 3472 | 'P(AÄEIOÖRUÜY)-', 'P', 'P', |
||
| 3473 | 'P(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'P', None, |
||
| 3474 | 'P.^', None, 'P.', |
||
| 3475 | 'P^', 'P', None, |
||
| 3476 | 'P', 'B', 'B', |
||
| 3477 | 'QI-', 'Z', 'Z', |
||
| 3478 | 'QUARANT--', 'KARA', 'KARA', |
||
| 3479 | 'QUE(LMNRST)-3', 'KWE', 'KFE', |
||
| 3480 | 'QUE$', 'K', 'K', |
||
| 3481 | 'QUI(NS)$', 'KI', 'KI', |
||
| 3482 | 'QUIZ7', 'KWIS', None, |
||
| 3483 | 'Q(UV)7', 'KW', 'KF', |
||
| 3484 | 'Q<', 'K', 'K', |
||
| 3485 | 'RADFAHR----', 'RAT ', 'RAT ', |
||
| 3486 | 'RAEFTEZEHRE-----', 'REFTE ', 'REFTE ', |
||
| 3487 | 'RCH', 'RCH', 'RK', |
||
| 3488 | 'REA(DU)---3^', 'R', None, |
||
| 3489 | 'REBSERZEUG------', 'REBS ', 'REBZ ', |
||
| 3490 | 'RECHERCH^', 'RESHASH', 'REZAZ', |
||
| 3491 | 'RECYCL--', 'RIZEI', 'RIZEI', |
||
| 3492 | 'RE(ALST)-3^', 'RE', None, |
||
| 3493 | 'REE$', 'RI', 'RI', |
||
| 3494 | 'RER$', 'RA', 'RA', |
||
| 3495 | 'RE(MNR)-4', 'RE', 'RE', |
||
| 3496 | 'RETTE$', 'RET', 'RET', |
||
| 3497 | 'REUZ$', 'REUZ', None, |
||
| 3498 | 'REW$', 'RU', 'RU', |
||
| 3499 | 'RH<^', 'R', 'R', |
||
| 3500 | 'RJA(MN)--', 'RI', 'RI', |
||
| 3501 | 'ROWD-^', 'RAU', 'RAU', |
||
| 3502 | 'RTEMONNAIE-', 'RTMON', 'RTNUN', |
||
| 3503 | 'RTI(AÄOÖUÜ)-3', 'RZI', 'RZI', |
||
| 3504 | 'RTIEL--3', 'RZI', 'RZI', |
||
| 3505 | 'RV(AEOU)-3', 'RW', None, |
||
| 3506 | 'RY(KN)-$', 'RI', 'RI', |
||
| 3507 | 'RY9^', 'RÜ', None, |
||
| 3508 | 'RÄFTEZEHRE-----', 'REFTE ', 'REFTE ', |
||
| 3509 | 'SAISO-^', 'SES', 'ZEZ', |
||
| 3510 | 'SAFE^$', 'SEIF', 'ZEIF', |
||
| 3511 | 'SAUCE-^', 'SOS', 'ZUZ', |
||
| 3512 | 'SCHLAGGEBEN-----<', 'SHLAK ', 'ZLAK ', |
||
| 3513 | 'SCHSCH---7', '', '', |
||
| 3514 | 'SCHTSCH', 'SH', 'Z', |
||
| 3515 | 'SC(HZ)<', 'SH', 'Z', |
||
| 3516 | 'SC', 'SK', 'ZK', |
||
| 3517 | 'SELBSTST--7^^', 'SELB', 'ZELB', |
||
| 3518 | 'SELBST7^^', 'SELBST', 'ZELBZT', |
||
| 3519 | 'SERVICE7^', 'SÖRWIS', 'ZÖRFIZ', |
||
| 3520 | 'SERVI-^', 'SERW', None, |
||
| 3521 | 'SE(LMNRST)-3^', 'SE', 'ZE', |
||
| 3522 | 'SETTE$', 'SET', 'ZET', |
||
| 3523 | 'SHP-^', 'S', 'Z', |
||
| 3524 | 'SHST', 'SHT', 'ZT', |
||
| 3525 | 'SHTSH', 'SH', 'Z', |
||
| 3526 | 'SHT', 'ST', 'Z', |
||
| 3527 | 'SHY9^', 'SHÜ', None, |
||
| 3528 | 'SH^^', 'SH', None, |
||
| 3529 | 'SH3', 'SH', 'Z', |
||
| 3530 | 'SICHERGEGAN-----^', 'SICHA ', 'ZIKA ', |
||
| 3531 | 'SICHERGEHE----^', 'SICHA ', 'ZIKA ', |
||
| 3532 | 'SICHERGESTEL------^', 'SICHA ', 'ZIKA ', |
||
| 3533 | 'SICHERSTELL-----^', 'SICHA ', 'ZIKA ', |
||
| 3534 | 'SICHERZU(GS)--^', 'SICHA ZU ', 'ZIKA ZU ', |
||
| 3535 | 'SIEGLI-^', 'SIKL', 'ZIKL', |
||
| 3536 | 'SIGLI-^', 'SIKL', 'ZIKL', |
||
| 3537 | 'SIGHT', 'SEIT', 'ZEIT', |
||
| 3538 | 'SIGN', 'SEIN', 'ZEIN', |
||
| 3539 | 'SKI(NPZ)-', 'SKI', 'ZKI', |
||
| 3540 | 'SKI<^', 'SHI', 'ZI', |
||
| 3541 | 'SODASS^$', 'SO DAS', 'ZU TAZ', |
||
| 3542 | 'SODAß^$', 'SO DAS', 'ZU TAZ', |
||
| 3543 | 'SOGENAN--^', 'SO GEN', 'ZU KEN', |
||
| 3544 | 'SOUND-', 'SAUN', 'ZAUN', |
||
| 3545 | 'STAATS^^', 'STAZ', 'ZTAZ', |
||
| 3546 | 'STADT^^', 'STAT', 'ZTAT', |
||
| 3547 | 'STANDE$', ' STANDE', ' ZTANTE', |
||
| 3548 | 'START^^', 'START', 'ZTART', |
||
| 3549 | 'STAURANT7', 'STORAN', 'ZTURAN', |
||
| 3550 | 'STEAK-', 'STE', 'ZTE', |
||
| 3551 | 'STEPHEN-^$', 'STEW', None, |
||
| 3552 | 'STERN', 'STERN', None, |
||
| 3553 | 'STRAF^^', 'STRAF', 'ZTRAF', |
||
| 3554 | 'ST\'S$', 'Z', 'Z', |
||
| 3555 | 'ST´S$', 'Z', 'Z', |
||
| 3556 | 'STST--', '', '', |
||
| 3557 | 'STS(ACEÈÉÊHIÌÍÎOUÄÜÖ)--', 'ST', 'ZT', |
||
| 3558 | 'ST(SZ)', 'Z', 'Z', |
||
| 3559 | 'SPAREN---^', 'SPA', 'ZPA', |
||
| 3560 | 'SPAREND----', ' SPA', ' ZPA', |
||
| 3561 | 'S(PTW)-^^', 'S', None, |
||
| 3562 | 'SP', 'SP', None, |
||
| 3563 | 'STYN(AE)-$', 'STIN', 'ZTIN', |
||
| 3564 | 'ST', 'ST', 'ZT', |
||
| 3565 | 'SUITE<', 'SIUT', 'ZIUT', |
||
| 3566 | 'SUKE--$', 'S', 'Z', |
||
| 3567 | 'SURF(EI)-', 'SÖRF', 'ZÖRF', |
||
| 3568 | 'SV(AEÈÉÊIÌÍÎOU)-<^', 'SW', None, |
||
| 3569 | 'SYB(IY)--^', 'SIB', None, |
||
| 3570 | 'SYL(KVW)--^', 'SI', None, |
||
| 3571 | 'SY9^', 'SÜ', None, |
||
| 3572 | 'SZE(NPT)-^', 'ZE', 'ZE', |
||
| 3573 | 'SZI(ELN)-^', 'ZI', 'ZI', |
||
| 3574 | 'SZCZ<', 'SH', 'Z', |
||
| 3575 | 'SZT<', 'ST', 'ZT', |
||
| 3576 | 'SZ<3', 'SH', 'Z', |
||
| 3577 | 'SÜL(KVW)--^', 'SI', None, |
||
| 3578 | 'S', None, 'Z', |
||
| 3579 | 'TCH', 'SH', 'Z', |
||
| 3580 | 'TD(AÄEIOÖRUÜY)-', 'T', None, |
||
| 3581 | 'TD(ÀÁÂÃÅÈÉÊËÌÍÎÏÒÓÔÕØÙÚÛÝŸ)-', 'T', None, |
||
| 3582 | 'TEAT-^', 'TEA', 'TEA', |
||
| 3583 | 'TERRAI7^', 'TERA', 'TERA', |
||
| 3584 | 'TE(LMNRST)-3^', 'TE', 'TE', |
||
| 3585 | 'TH<', 'T', 'T', |
||
| 3586 | 'TICHT-', 'TIK', 'TIK', |
||
| 3587 | 'TICH$', 'TIK', 'TIK', |
||
| 3588 | 'TIC$', 'TIZ', 'TIZ', |
||
| 3589 | 'TIGGESTELL-------', 'TIK ', 'TIK ', |
||
| 3590 | 'TIGSTELL-----', 'TIK ', 'TIK ', |
||
| 3591 | 'TOAS-^', 'TO', 'TU', |
||
| 3592 | 'TOILET-', 'TOLE', 'TULE', |
||
| 3593 | 'TOIN-', 'TOA', 'TUA', |
||
| 3594 | 'TRAECHTI-^', 'TRECHT', 'TREKT', |
||
| 3595 | 'TRAECHTIG--', ' TRECHT', ' TREKT', |
||
| 3596 | 'TRAINI-', 'TREN', 'TREN', |
||
| 3597 | 'TRÄCHTI-^', 'TRECHT', 'TREKT', |
||
| 3598 | 'TRÄCHTIG--', ' TRECHT', ' TREKT', |
||
| 3599 | 'TSCH', 'SH', 'Z', |
||
| 3600 | 'TSH', 'SH', 'Z', |
||
| 3601 | 'TST', 'ZT', 'ZT', |
||
| 3602 | 'T(Sß)', 'Z', 'Z', |
||
| 3603 | 'TT(SZ)--<', '', '', |
||
| 3604 | 'TT9', 'T', 'T', |
||
| 3605 | 'TV^$', 'TV', 'TV', |
||
| 3606 | 'TX(AEIOU)-3', 'SH', 'Z', |
||
| 3607 | 'TY9^', 'TÜ', None, |
||
| 3608 | 'TZ-', '', '', |
||
| 3609 | 'T\'S3$', 'Z', 'Z', |
||
| 3610 | 'T´S3$', 'Z', 'Z', |
||
| 3611 | 'UEBEL(GNRW)-^^', 'ÜBL ', 'IBL ', |
||
| 3612 | 'UEBER^^', 'ÜBA', 'IBA', |
||
| 3613 | 'UE2', 'Ü', 'I', |
||
| 3614 | 'UGL-', 'UK', None, |
||
| 3615 | 'UH(AOÖUÜY)-', 'UH', None, |
||
| 3616 | 'UIE$', 'Ü', 'I', |
||
| 3617 | 'UM^^', 'UM', 'UN', |
||
| 3618 | 'UNTERE--3', 'UNTE', 'UNTE', |
||
| 3619 | 'UNTER^^', 'UNTA', 'UNTA', |
||
| 3620 | 'UNVER^^', 'UNFA', 'UNFA', |
||
| 3621 | 'UN^^', 'UN', 'UN', |
||
| 3622 | 'UTI(AÄOÖUÜ)-', 'UZI', 'UZI', |
||
| 3623 | 'UVE-4', 'UW', None, |
||
| 3624 | 'UY2', 'UI', None, |
||
| 3625 | 'UZZ', 'AS', 'AZ', |
||
| 3626 | 'VACL-^', 'WAZ', 'FAZ', |
||
| 3627 | 'VAC$', 'WAZ', 'FAZ', |
||
| 3628 | 'VAN DEN ^', 'FANDN', 'FANTN', |
||
| 3629 | 'VANES-^', 'WANE', None, |
||
| 3630 | 'VATRO-', 'WATR', None, |
||
| 3631 | 'VA(DHJNT)--^', 'F', None, |
||
| 3632 | 'VEDD-^', 'FE', 'FE', |
||
| 3633 | 'VE(BEHIU)--^', 'F', None, |
||
| 3634 | 'VEL(BDLMNT)-^', 'FEL', None, |
||
| 3635 | 'VENTZ-^', 'FEN', None, |
||
| 3636 | 'VEN(NRSZ)-^', 'FEN', None, |
||
| 3637 | 'VER(AB)-^$', 'WER', None, |
||
| 3638 | 'VERBAL^$', 'WERBAL', None, |
||
| 3639 | 'VERBAL(EINS)-^', 'WERBAL', None, |
||
| 3640 | 'VERTEBR--', 'WERTE', None, |
||
| 3641 | 'VEREIN-----', 'F', None, |
||
| 3642 | 'VEREN(AEIOU)-^', 'WEREN', None, |
||
| 3643 | 'VERIFI', 'WERIFI', None, |
||
| 3644 | 'VERON(AEIOU)-^', 'WERON', None, |
||
| 3645 | 'VERSEN^', 'FERSN', 'FAZN', |
||
| 3646 | 'VERSIERT--^', 'WERSI', None, |
||
| 3647 | 'VERSIO--^', 'WERS', None, |
||
| 3648 | 'VERSUS', 'WERSUS', None, |
||
| 3649 | 'VERTI(GK)-', 'WERTI', None, |
||
| 3650 | 'VER^^', 'FER', 'FA', |
||
| 3651 | 'VERSPRECHE-------', ' FER', ' FA', |
||
| 3652 | 'VER$', 'WA', None, |
||
| 3653 | 'VER', 'FA', 'FA', |
||
| 3654 | 'VET(HT)-^', 'FET', 'FET', |
||
| 3655 | 'VETTE$', 'WET', 'FET', |
||
| 3656 | 'VE^', 'WE', None, |
||
| 3657 | 'VIC$', 'WIZ', 'FIZ', |
||
| 3658 | 'VIELSAGE----', 'FIL ', 'FIL ', |
||
| 3659 | 'VIEL', 'FIL', 'FIL', |
||
| 3660 | 'VIEW', 'WIU', 'FIU', |
||
| 3661 | 'VILL(AE)-', 'WIL', None, |
||
| 3662 | 'VIS(ACEIKUVWZ)-<^', 'WIS', None, |
||
| 3663 | 'VI(ELS)--^', 'F', None, |
||
| 3664 | 'VILLON--', 'WILI', 'FILI', |
||
| 3665 | 'VIZE^^', 'FIZE', 'FIZE', |
||
| 3666 | 'VLIE--^', 'FL', None, |
||
| 3667 | 'VL(AEIOU)--', 'W', None, |
||
| 3668 | 'VOKA-^', 'WOK', None, |
||
| 3669 | 'VOL(ATUVW)--^', 'WO', None, |
||
| 3670 | 'VOR^^', 'FOR', 'FUR', |
||
| 3671 | 'VR(AEIOU)--', 'W', None, |
||
| 3672 | 'VV9', 'W', None, |
||
| 3673 | 'VY9^', 'WÜ', 'FI', |
||
| 3674 | 'V(ÜY)-', 'W', None, |
||
| 3675 | 'V(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'W', None, |
||
| 3676 | 'V(AEIJLRU)-<', 'W', None, |
||
| 3677 | 'V.^', 'V.', None, |
||
| 3678 | 'V<', 'F', 'F', |
||
| 3679 | 'WEITERENTWI-----^', 'WEITA ', 'FEITA ', |
||
| 3680 | 'WEITREICH-----^', 'WEIT ', 'FEIT ', |
||
| 3681 | 'WEITVER^', 'WEIT FER', 'FEIT FA', |
||
| 3682 | 'WE(LMNRST)-3^', 'WE', 'FE', |
||
| 3683 | 'WER(DST)-', 'WER', None, |
||
| 3684 | 'WIC$', 'WIZ', 'FIZ', |
||
| 3685 | 'WIEDERU--', 'WIDE', 'FITE', |
||
| 3686 | 'WIEDER^$', 'WIDA', 'FITA', |
||
| 3687 | 'WIEDER^^', 'WIDA ', 'FITA ', |
||
| 3688 | 'WIEVIEL', 'WI FIL', 'FI FIL', |
||
| 3689 | 'WISUEL', 'WISUEL', None, |
||
| 3690 | 'WR-^', 'W', None, |
||
| 3691 | 'WY9^', 'WÜ', 'FI', |
||
| 3692 | 'W(BDFGJKLMNPQRSTZ)-', 'F', None, |
||
| 3693 | 'W$', 'F', None, |
||
| 3694 | 'W', None, 'F', |
||
| 3695 | 'X<^', 'Z', 'Z', |
||
| 3696 | 'XHAVEN$', 'XAFN', None, |
||
| 3697 | 'X(CSZ)', 'X', 'X', |
||
| 3698 | 'XTS(CH)--', 'XT', 'XT', |
||
| 3699 | 'XT(SZ)', 'Z', 'Z', |
||
| 3700 | 'YE(LMNRST)-3^', 'IE', 'IE', |
||
| 3701 | 'YE-3', 'I', 'I', |
||
| 3702 | 'YOR(GK)^$', 'IÖRK', 'IÖRK', |
||
| 3703 | 'Y(AOU)-<7', 'I', 'I', |
||
| 3704 | 'Y(BKLMNPRSTX)-1', 'Ü', None, |
||
| 3705 | 'YVES^$', 'IF', 'IF', |
||
| 3706 | 'YVONNE^$', 'IWON', 'IFUN', |
||
| 3707 | 'Y.^', 'Y.', None, |
||
| 3708 | 'Y', 'I', 'I', |
||
| 3709 | 'ZC(AOU)-', 'SK', 'ZK', |
||
| 3710 | 'ZE(LMNRST)-3^', 'ZE', 'ZE', |
||
| 3711 | 'ZIEJ$', 'ZI', 'ZI', |
||
| 3712 | 'ZIGERJA(HR)-3', 'ZIGA IA', 'ZIKA IA', |
||
| 3713 | 'ZL(AEIOU)-', 'SL', None, |
||
| 3714 | 'ZS(CHT)--', '', '', |
||
| 3715 | 'ZS', 'SH', 'Z', |
||
| 3716 | 'ZUERST', 'ZUERST', 'ZUERST', |
||
| 3717 | 'ZUGRUNDE^$', 'ZU GRUNDE', 'ZU KRUNTE', |
||
| 3718 | 'ZUGRUNDE', 'ZU GRUNDE ', 'ZU KRUNTE ', |
||
| 3719 | 'ZUGUNSTEN', 'ZU GUNSTN', 'ZU KUNZTN', |
||
| 3720 | 'ZUHAUSE-', 'ZU HAUS', 'ZU AUZ', |
||
| 3721 | 'ZULASTEN^$', 'ZU LASTN', 'ZU LAZTN', |
||
| 3722 | 'ZURUECK^^', 'ZURÜK', 'ZURIK', |
||
| 3723 | 'ZURZEIT', 'ZUR ZEIT', 'ZUR ZEIT', |
||
| 3724 | 'ZURÜCK^^', 'ZURÜK', 'ZURIK', |
||
| 3725 | 'ZUSTANDE', 'ZU STANDE', 'ZU ZTANTE', |
||
| 3726 | 'ZUTAGE', 'ZU TAGE', 'ZU TAKE', |
||
| 3727 | 'ZUVER^^', 'ZUFA', 'ZUFA', |
||
| 3728 | 'ZUVIEL', 'ZU FIL', 'ZU FIL', |
||
| 3729 | 'ZUWENIG', 'ZU WENIK', 'ZU FENIK', |
||
| 3730 | 'ZY9^', 'ZÜ', None, |
||
| 3731 | 'ZYK3$', 'ZIK', None, |
||
| 3732 | 'Z(VW)7^', 'SW', None, |
||
| 3733 | None, None, None) |
||
| 3734 | |||
# Lookup tables that _initialize_phonet fills in and _phonet reads.
# They are Counters, so looking up a key that was never stored yields 0
# instead of raising KeyError.
phonet_hash, alpha_pos = Counter(), Counter()
phonet_hash_1, phonet_hash_2 = Counter(), Counter()

# Table for str.translate: maps the code point of each lower-case letter
# to its upper-case character ('ß' is deliberately mapped to itself).
_phonet_upper_translation = {
    ord(lower): upper
    for lower, upper in zip(
        'abcdefghijklmnopqrstuvwxyzàáâãåäæçðèéêëìíîïñòóôõöøœšßþùúûüýÿ',
        'ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÅÄÆÇÐÈÉÊËÌÍÎÏÑÒÓÔÕÖØŒŠßÞÙÚÛÜÝŸ')
}
||
| 3746 | |||
def _trinfo(text, rule, err_text, lang):
    """Print one line of trace output describing a phonet rule.

    :param str text: label printed in front of the rule number
    :param int rule: index of the rule's search pattern within the rule
        tuple; the two replacement strings are read from the following
        two positions
    :param str err_text: remark printed after the rule
    :param str lang: 'none' selects _phonet_rules_no_lang; any other value
        selects _phonet_rules_german
    """
    if lang == 'none':
        _phonet_rules = _phonet_rules_no_lang
    else:
        _phonet_rules = _phonet_rules_german

    # A missing (None) pattern or replacement is shown as '(NULL)'.
    from_rule, to_rule1, to_rule2 = (
        '(NULL)' if _phonet_rules[rule + offset] is None
        else _phonet_rules[rule + offset]
        for offset in range(3))

    # Rules are stored as triples, so the 1-based rule number is the index
    # floor-divided by 3.  Floor division keeps it an integer under true
    # division (Python 3 / ``from __future__ import division``), where
    # ``rule / 3`` would print as e.g. ``2.0``.
    rule_number = rule // 3 + 1

    print('"{} {}: "{}"{}"{}" {}'.format(text, rule_number,
                                         from_rule, to_rule1, to_rule2,
                                         err_text))
||
| 3763 | |||
def _initialize_phonet(lang):
    """Fill the phonet lookup tables for the selected rule set.

    Writes to alpha_pos, phonet_hash, phonet_hash_1 and phonet_hash_2:

    - alpha_pos maps 'A'-'Z' to 2..27 and each listed umlaut to 1
    - phonet_hash maps a leading character to the index of its first rule
      that has at least one non-empty replacement (-1 if there is none)
    - phonet_hash_1 / phonet_hash_2 map (first letter, second letter)
      position pairs to the first / last rule index recorded for the pair;
      column 0 stands for "any second letter"

    :param str lang: 'none' selects _phonet_rules_no_lang; any other value
        selects _phonet_rules_german
    """
    _phonet_rules = (_phonet_rules_no_lang if lang == 'none'
                     else _phonet_rules_german)

    phonet_hash[''] = -1

    # German and international umlauts all share alphabet position 1
    for umlaut in 'ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßŒŠŸ':
        alpha_pos[umlaut] = 1
        phonet_hash[umlaut] = -1

    # "normal" letters ('A'-'Z') get positions 2..27
    for position, letter in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ', 2):
        alpha_pos[letter] = position
        phonet_hash[letter] = -1

    # mark every (first letter, second letter) slot as "no rule yet"
    for row in range(26):
        for col in range(28):
            phonet_hash_1[row, col] = -1
            phonet_hash_2[row, col] = -1

    # Rules are stored as triples; every third entry is a search pattern.
    for idx in range(0, len(_phonet_rules), 3):
        pattern = _phonet_rules[idx]
        if not pattern:
            continue

        # first hash value: first rule with a replacement for this char
        # NOTE(review): phonet_hash is a Counter, so a leading character
        # that was not seeded with -1 above reads as 0 and never passes
        # this test -- confirm that is intended for non-letter patterns.
        lead = pattern[0]
        if phonet_hash[lead] < 0 and (_phonet_rules[idx + 1] or
                                      _phonet_rules[idx + 2]):
            phonet_hash[lead] = idx

        # second hash values exist only for patterns starting with 'A'-'Z'
        if alpha_pos[lead] < 2:
            continue
        row = alpha_pos[lead] - 2

        # Work out which characters may follow the leading letter: a
        # single blank for one-character patterns, the contents of a
        # leading '(...)' group, or just the second character.
        tail = pattern[1:]
        if not tail:
            followers = ' '
        elif tail[0] == '(':
            followers = tail[1:]
        else:
            followers = tail[0]

        for follower in followers:
            if follower == ')':
                break

            col = alpha_pos[follower]
            if col > 0:
                # add hash value for this letter
                if phonet_hash_1[row, col] < 0:
                    phonet_hash_1[row, col] = idx
                    phonet_hash_2[row, col] = idx

                if phonet_hash_2[row, col] >= idx - 30:
                    phonet_hash_2[row, col] = idx
                else:
                    # more than 30 entries after the last rule recorded
                    # for this pair: file it under "all letters" instead
                    col = -1

            if col <= 0:
                # add hash value for all letters
                if phonet_hash_1[row, 0] < 0:
                    phonet_hash_1[row, 0] = idx
                phonet_hash_2[row, 0] = idx
||
| 3838 | |||
| 3839 | def _phonet(term, mode, lang, trace): |
||
| 3840 | """Return the phonet coded form of a term.""" |
||
| 3841 | if lang == 'none': |
||
| 3842 | _phonet_rules = _phonet_rules_no_lang |
||
| 3843 | else: |
||
| 3844 | _phonet_rules = _phonet_rules_german |
||
| 3845 | |||
| 3846 | char0 = '' |
||
| 3847 | dest = term |
||
| 3848 | |||
| 3849 | if not term: |
||
| 3850 | return '' |
||
| 3851 | |||
| 3852 | term_length = len(term) |
||
| 3853 | |||
| 3854 | # convert input string to upper-case |
||
| 3855 | src = term.translate(_phonet_upper_translation) |
||
| 3856 | |||
| 3857 | # check "src" |
||
| 3858 | i = 0 |
||
| 3859 | j = 0 |
||
| 3860 | zeta = 0 |
||
| 3861 | |||
| 3862 | while i < len(src): |
||
| 3863 | char = src[i] |
||
| 3864 | |||
| 3865 | if trace: |
||
| 3866 | print('\ncheck position {}: src = "{}", dest = "{}"'.format |
||
| 3867 | (j, src[i:], dest[:j])) |
||
| 3868 | |||
| 3869 | pos = alpha_pos[char] |
||
| 3870 | |||
| 3871 | if pos >= 2: |
||
| 3872 | xpos = pos-2 |
||
| 3873 | |||
| 3874 | if i+1 == len(src): |
||
| 3875 | pos = alpha_pos[''] |
||
| 3876 | else: |
||
| 3877 | pos = alpha_pos[src[i+1]] |
||
| 3878 | |||
| 3879 | start1 = phonet_hash_1[xpos, pos] |
||
| 3880 | start2 = phonet_hash_1[xpos, 0] |
||
| 3881 | end1 = phonet_hash_2[xpos, pos] |
||
| 3882 | end2 = phonet_hash_2[xpos, 0] |
||
| 3883 | |||
| 3884 | # preserve rule priorities |
||
| 3885 | if (start2 >= 0) and ((start1 < 0) or (start2 < start1)): |
||
| 3886 | pos = start1 |
||
| 3887 | start1 = start2 |
||
| 3888 | start2 = pos |
||
| 3889 | pos = end1 |
||
| 3890 | end1 = end2 |
||
| 3891 | end2 = pos |
||
| 3892 | |||
| 3893 | if (end1 >= start2) and (start2 >= 0): |
||
| 3894 | if end2 > end1: |
||
| 3895 | end1 = end2 |
||
| 3896 | |||
| 3897 | start2 = -1 |
||
| 3898 | end2 = -1 |
||
| 3899 | else: |
||
| 3900 | pos = phonet_hash[char] |
||
| 3901 | start1 = pos |
||
| 3902 | end1 = 10000 |
||
| 3903 | start2 = -1 |
||
| 3904 | end2 = -1 |
||
| 3905 | |||
| 3906 | pos = start1 |
||
| 3907 | zeta0 = 0 |
||
| 3908 | |||
| 3909 | if pos >= 0: |
||
| 3910 | # check rules for this char |
||
| 3911 | while ((_phonet_rules[pos] is None) or |
||
| 3912 | (_phonet_rules[pos][0] == char)): |
||
| 3913 | if pos > end1: |
||
| 3914 | if start2 > 0: |
||
| 3915 | pos = start2 |
||
| 3916 | start1 = start2 |
||
| 3917 | start2 = -1 |
||
| 3918 | end1 = end2 |
||
| 3919 | end2 = -1 |
||
| 3920 | continue |
||
| 3921 | |||
| 3922 | break |
||
| 3923 | |||
| 3924 | if (((_phonet_rules[pos] is None) or |
||
| 3925 | (_phonet_rules[pos + mode] is None))): |
||
| 3926 | # no conversion rule available |
||
| 3927 | pos += 3 |
||
| 3928 | continue |
||
| 3929 | |||
| 3930 | if trace: |
||
| 3931 | _trinfo('> rule no.', pos, 'is being checked', lang) |
||
| 3932 | |||
| 3933 | # check whole string |
||
| 3934 | matches = 1 # number of matching letters |
||
| 3935 | priority = 5 # default priority |
||
| 3936 | rule = _phonet_rules[pos] |
||
| 3937 | rule = rule[1:] |
||
| 3938 | |||
| 3939 | while (rule and |
||
| 3940 | (len(src) > (i + matches)) and |
||
| 3941 | (src[i + matches] == rule[0]) and |
||
| 3942 | not rule[0].isdigit() and |
||
| 3943 | (rule not in '(-<^$')): |
||
| 3944 | matches += 1 |
||
| 3945 | rule = rule[1:] |
||
| 3946 | |||
| 3947 | if rule and (rule[0] == '('): |
||
| 3948 | # check an array of letters |
||
| 3949 | if (((len(src) > (i + matches)) and |
||
| 3950 | src[i + matches].isalpha() and |
||
| 3951 | (src[i + matches] in rule[1:]))): |
||
| 3952 | matches += 1 |
||
| 3953 | |||
| 3954 | while rule and rule[0] != ')': |
||
| 3955 | rule = rule[1:] |
||
| 3956 | |||
| 3957 | # if rule[0] == ')': |
||
| 3958 | rule = rule[1:] |
||
| 3959 | |||
| 3960 | if rule: |
||
| 3961 | priority0 = ord(rule[0]) |
||
| 3962 | else: |
||
| 3963 | priority0 = 0 |
||
| 3964 | |||
| 3965 | matches0 = matches |
||
| 3966 | |||
| 3967 | while rule and rule[0] == '-' and matches > 1: |
||
| 3968 | matches -= 1 |
||
| 3969 | rule = rule[1:] |
||
| 3970 | |||
| 3971 | if rule and rule[0] == '<': |
||
| 3972 | rule = rule[1:] |
||
| 3973 | |||
| 3974 | if rule and rule[0].isdigit(): |
||
| 3975 | # read priority |
||
| 3976 | priority = int(rule[0]) |
||
| 3977 | rule = rule[1:] |
||
| 3978 | |||
| 3979 | if rule and rule[0:2] == '^^': |
||
| 3980 | rule = rule[1:] |
||
| 3981 | |||
| 3982 | if (not rule or |
||
| 3983 | ((rule[0] == '^') and |
||
| 3984 | ((i == 0) or not src[i-1].isalpha()) and |
||
| 3985 | ((rule[1:2] != '$') or |
||
| 3986 | (not (src[i+matches0:i+matches0+1].isalpha()) and |
||
| 3987 | (src[i+matches0:i+matches0+1] != '.')))) or |
||
| 3988 | ((rule[0] == '$') and (i > 0) and |
||
| 3989 | src[i-1].isalpha() and |
||
| 3990 | ((not src[i+matches0:i+matches0+1].isalpha()) and |
||
| 3991 | (src[i+matches0:i+matches0+1] != '.')))): |
||
| 3992 | # look for continuation, if: |
||
| 3993 | # matches > 1 und NO '-' in first string */ |
||
| 3994 | pos0 = -1 |
||
| 3995 | |||
| 3996 | start3 = 0 |
||
| 3997 | start4 = 0 |
||
| 3998 | end3 = 0 |
||
| 3999 | end4 = 0 |
||
| 4000 | |||
| 4001 | if (((matches > 1) and |
||
| 4002 | src[i+matches:i+matches+1] and |
||
| 4003 | (priority0 != ord('-')))): |
||
| 4004 | char0 = src[i+matches-1] |
||
| 4005 | pos0 = alpha_pos[char0] |
||
| 4006 | |||
| 4007 | if pos0 >= 2 and src[i+matches]: |
||
| 4008 | xpos = pos0 - 2 |
||
| 4009 | pos0 = alpha_pos[src[i+matches]] |
||
| 4010 | start3 = phonet_hash_1[xpos, pos0] |
||
| 4011 | start4 = phonet_hash_1[xpos, 0] |
||
| 4012 | end3 = phonet_hash_2[xpos, pos0] |
||
| 4013 | end4 = phonet_hash_2[xpos, 0] |
||
| 4014 | |||
| 4015 | # preserve rule priorities |
||
| 4016 | if (((start4 >= 0) and |
||
| 4017 | ((start3 < 0) or (start4 < start3)))): |
||
| 4018 | pos0 = start3 |
||
| 4019 | start3 = start4 |
||
| 4020 | start4 = pos0 |
||
| 4021 | pos0 = end3 |
||
| 4022 | end3 = end4 |
||
| 4023 | end4 = pos0 |
||
| 4024 | |||
| 4025 | if (end3 >= start4) and (start4 >= 0): |
||
| 4026 | if end4 > end3: |
||
| 4027 | end3 = end4 |
||
| 4028 | |||
| 4029 | start4 = -1 |
||
| 4030 | end4 = -1 |
||
| 4031 | else: |
||
| 4032 | pos0 = phonet_hash[char0] |
||
| 4033 | start3 = pos0 |
||
| 4034 | end3 = 10000 |
||
| 4035 | start4 = -1 |
||
| 4036 | end4 = -1 |
||
| 4037 | |||
| 4038 | pos0 = start3 |
||
| 4039 | |||
| 4040 | # check continuation rules for src[i+matches] |
||
| 4041 | if pos0 >= 0: |
||
| 4042 | while ((_phonet_rules[pos0] is None) or |
||
| 4043 | (_phonet_rules[pos0][0] == char0)): |
||
| 4044 | if pos0 > end3: |
||
| 4045 | if start4 > 0: |
||
| 4046 | pos0 = start4 |
||
| 4047 | start3 = start4 |
||
| 4048 | start4 = -1 |
||
| 4049 | end3 = end4 |
||
| 4050 | end4 = -1 |
||
| 4051 | continue |
||
| 4052 | |||
| 4053 | priority0 = -1 |
||
| 4054 | |||
| 4055 | # important |
||
| 4056 | break |
||
| 4057 | |||
| 4058 | if (((_phonet_rules[pos0] is None) or |
||
| 4059 | (_phonet_rules[pos0 + mode] is None))): |
||
| 4060 | # no conversion rule available |
||
| 4061 | pos0 += 3 |
||
| 4062 | continue |
||
| 4063 | |||
| 4064 | if trace: |
||
| 4065 | _trinfo('> > continuation rule no.', pos0, |
||
| 4066 | 'is being checked', lang) |
||
| 4067 | |||
| 4068 | # check whole string |
||
| 4069 | matches0 = matches |
||
| 4070 | priority0 = 5 |
||
| 4071 | rule = _phonet_rules[pos0] |
||
| 4072 | rule = rule[1:] |
||
| 4073 | |||
| 4074 | while (rule and |
||
| 4075 | (src[i+matches0:i+matches0+1] == |
||
| 4076 | rule[0]) and |
||
| 4077 | (not rule[0].isdigit() or |
||
| 4078 | (rule in '(-<^$'))): |
||
| 4079 | matches0 += 1 |
||
| 4080 | rule = rule[1:] |
||
| 4081 | |||
| 4082 | if rule and rule[0] == '(': |
||
| 4083 | # check an array of letters |
||
| 4084 | if ((src[i+matches0:i+matches0+1] |
||
| 4085 | .isalpha() and |
||
| 4086 | (src[i+matches0] in rule[1:]))): |
||
| 4087 | matches0 += 1 |
||
| 4088 | |||
| 4089 | while rule and rule[0] != ')': |
||
| 4090 | rule = rule[1:] |
||
| 4091 | |||
| 4092 | # if rule[0] == ')': |
||
| 4093 | rule = rule[1:] |
||
| 4094 | |||
| 4095 | while rule and rule[0] == '-': |
||
| 4096 | # "matches0" is NOT decremented |
||
| 4097 | # because of "if (matches0 == matches)" |
||
| 4098 | rule = rule[1:] |
||
| 4099 | |||
| 4100 | if rule and rule[0] == '<': |
||
| 4101 | rule = rule[1:] |
||
| 4102 | |||
| 4103 | if rule and rule[0].isdigit(): |
||
| 4104 | priority0 = int(rule[0]) |
||
| 4105 | rule = rule[1:] |
||
| 4106 | |||
| 4107 | if (not rule or |
||
| 4108 | # rule == '^' is not possible here |
||
| 4109 | ((rule[0] == '$') and not |
||
| 4110 | src[i+matches0:i+matches0+1] |
||
| 4111 | .isalpha() and |
||
| 4112 | (src[i+matches0:i+matches0+1] |
||
| 4113 | != '.'))): |
||
| 4114 | if matches0 == matches: |
||
| 4115 | # this is only a partial string |
||
| 4116 | if trace: |
||
| 4117 | _trinfo('> > continuation ' + |
||
| 4118 | 'rule no.', |
||
| 4119 | pos0, |
||
| 4120 | 'not used (too short)', |
||
| 4121 | lang) |
||
| 4122 | |||
| 4123 | pos0 += 3 |
||
| 4124 | continue |
||
| 4125 | |||
| 4126 | if priority0 < priority: |
||
| 4127 | # priority is too low |
||
| 4128 | if trace: |
||
| 4129 | _trinfo('> > continuation ' + |
||
| 4130 | 'rule no.', |
||
| 4131 | pos0, |
||
| 4132 | 'not used (priority)', |
||
| 4133 | lang) |
||
| 4134 | |||
| 4135 | pos0 += 3 |
||
| 4136 | continue |
||
| 4137 | |||
| 4138 | # continuation rule found |
||
| 4139 | break |
||
| 4140 | |||
| 4141 | if trace: |
||
| 4142 | _trinfo('> > continuation rule no.', pos0, |
||
| 4143 | 'not used', lang) |
||
| 4144 | |||
| 4145 | pos0 += 3 |
||
| 4146 | |||
| 4147 | # end of "while" |
||
| 4148 | if ((priority0 >= priority) and |
||
| 4149 | ((_phonet_rules[pos0] is not None) and |
||
| 4150 | (_phonet_rules[pos0][0] == char0))): |
||
| 4151 | |||
| 4152 | if trace: |
||
| 4153 | _trinfo('> rule no.', pos, '', lang) |
||
| 4154 | _trinfo('> not used because of ' + |
||
| 4155 | 'continuation', pos0, '', lang) |
||
| 4156 | |||
| 4157 | pos += 3 |
||
| 4158 | continue |
||
| 4159 | |||
| 4160 | # replace string |
||
| 4161 | if trace: |
||
| 4162 | _trinfo('Rule no.', pos, 'is applied', lang) |
||
| 4163 | |||
| 4164 | if ((_phonet_rules[pos] and |
||
| 4165 | ('<' in _phonet_rules[pos][1:]))): |
||
| 4166 | priority0 = 1 |
||
| 4167 | else: |
||
| 4168 | priority0 = 0 |
||
| 4169 | |||
| 4170 | rule = _phonet_rules[pos + mode] |
||
| 4171 | |||
| 4172 | if (priority0 == 1) and (zeta == 0): |
||
| 4173 | # rule with '<' is applied |
||
| 4174 | if ((j > 0) and rule and |
||
| 4175 | ((dest[j-1] == char) or |
||
| 4176 | (dest[j-1] == rule[0]))): |
||
| 4177 | j -= 1 |
||
| 4178 | |||
| 4179 | zeta0 = 1 |
||
| 4180 | zeta += 1 |
||
| 4181 | matches0 = 0 |
||
| 4182 | |||
| 4183 | while rule and src[i+matches0]: |
||
| 4184 | src = (src[0:i+matches0] + rule[0] + |
||
| 4185 | src[i+matches0+1:]) |
||
| 4186 | matches0 += 1 |
||
| 4187 | rule = rule[1:] |
||
| 4188 | |||
| 4189 | if matches0 < matches: |
||
| 4190 | src = (src[0:i+matches0] + |
||
| 4191 | src[i+matches:]) |
||
| 4192 | |||
| 4193 | char = src[i] |
||
| 4194 | else: |
||
| 4195 | i = i + matches - 1 |
||
| 4196 | zeta = 0 |
||
| 4197 | |||
| 4198 | while len(rule) > 1: |
||
| 4199 | if (j == 0) or (dest[j - 1] != rule[0]): |
||
| 4200 | dest = (dest[0:j] + rule[0] + |
||
| 4201 | dest[min(len(dest), j+1):]) |
||
| 4202 | j += 1 |
||
| 4203 | |||
| 4204 | rule = rule[1:] |
||
| 4205 | |||
| 4206 | # new "current char" |
||
| 4207 | if not rule: |
||
| 4208 | rule = '' |
||
| 4209 | char = '' |
||
| 4210 | else: |
||
| 4211 | char = rule[0] |
||
| 4212 | |||
| 4213 | if ((_phonet_rules[pos] and |
||
| 4214 | '^^' in _phonet_rules[pos][1:])): |
||
| 4215 | if char: # pragma: no branch |
||
| 4216 | dest = (dest[0:j] + char + |
||
| 4217 | dest[min(len(dest), j + 1):]) |
||
| 4218 | j += 1 |
||
| 4219 | |||
| 4220 | src = src[i + 1:] |
||
| 4221 | i = 0 |
||
| 4222 | zeta0 = 1 |
||
| 4223 | |||
| 4224 | break |
||
| 4225 | |||
| 4226 | pos += 3 |
||
| 4227 | |||
| 4228 | if pos > end1 and start2 > 0: |
||
| 4229 | pos = start2 |
||
| 4230 | start1 = start2 |
||
| 4231 | end1 = end2 |
||
| 4232 | start2 = -1 |
||
| 4233 | end2 = -1 |
||
| 4234 | |||
| 4235 | if zeta0 == 0: |
||
| 4236 | if char and ((j == 0) or (dest[j-1] != char)): |
||
| 4237 | # delete multiple letters only |
||
| 4238 | dest = dest[0:j] + char + dest[min(j+1, term_length):] |
||
| 4239 | j += 1 |
||
| 4240 | |||
| 4241 | i += 1 |
||
| 4242 | zeta = 0 |
||
| 4243 | |||
| 4244 | dest = dest[0:j] |
||
| 4245 | |||
| 4246 | return dest |
||
| 4247 | |||
| 4248 | _initialize_phonet(lang) |
||
| 4249 | |||
| 4250 | word = unicodedata.normalize('NFKC', text_type(word)) |
||
| 4251 | return _phonet(word, mode, lang, trace) |
||
| 4252 | |||
| 4253 | |||
def spfc(word):
    """Return the Standardized Phonetic Frequency Code (SPFC) of a word.

    Standardized Phonetic Frequency Code is roughly Soundex-like.
    This implementation is based on page 19-21 of
    https://archive.org/stream/accessingindivid00moor#page/19/mode/1up

    :param word: the name to transform: either one string in which a period
        or a space divides the first name from the last name, or a two-item
        tuple/list of (first name, last name)
    :returns: the SPFC value ('' if word is empty)
    :rtype: str
    :raises AttributeError: if word cannot be divided into exactly two names

    >>> spfc('Christopher Smith')
    '01160'
    >>> spfc('Christopher Schmidt')
    '01160'
    >>> spfc('Niall Smith')
    '01660'
    >>> spfc('Niall Schmidt')
    '01660'

    >>> spfc('L.Smith')
    '01960'
    >>> spfc('R.Miller')
    '65490'

    >>> spfc(('L', 'Smith'))
    '01960'
    >>> spfc(('R', 'Miller'))
    '65490'
    """
    # Translation tables PF1-PF3 (code point -> digit)
    pf1 = dict(zip((ord(_) for _ in 'SZCKQVFPUWABLORDHIEMNXGJT'),
                   '0011112222334445556666777'))
    pf2 = dict(zip((ord(_) for _ in 'SZCKQFPXABORDHIMNGJTUVWEL'),
                   '0011122233445556677788899'))
    pf3 = dict(zip((ord(_) for _ in 'BCKQVDTFLPGJXMNRSZAEHIOUWY'),
                   '00000112223334456677777777'))

    digraphs = (('DK', 'K'), ('DT', 'T'), ('SC', 'S'), ('KN', 'N'),
                ('MN', 'N'))

    # Multi-letter name endings and their second-digit codes. The order is
    # significant: the first ending that matches wins (e.g. 'STN' must be
    # tried before 'TN', and 'STR' before 'TR').
    endings = (('STN', '8'), ('PRS', '8'), ('SN', '8'),
               ('STR', '9'), ('SR', '9'), ('TN', '9'), ('TD', '9'),
               ('DRS', '7'), ('TR', '7'), ('MN', '7'))

    alphabet = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    silent = frozenset('AEHIOUWY')

    bad_word_msg = ('word attribute must be a string with a space '
                    'or period dividing the first and last names '
                    'or a tuple/list consisting of the first and '
                    'last names')

    if not word:
        return ''

    # Divide the input into [first name, last name]
    if isinstance(word, (str, text_type)):
        names = word.split('.', 1)
        if len(names) != 2:
            names = word.split(' ', 1)
        if len(names) != 2:
            raise AttributeError(bad_word_msg)
    elif hasattr(word, '__iter__') and len(word) == 2:
        names = word
    else:
        raise AttributeError(bad_word_msg)

    def _condense(name):
        """Normalize a name and perform the first three steps of SPFC."""
        name = unicodedata.normalize(
            'NFKD', text_type(name.strip().replace('ß', 'SS').upper()))

        # keep A-Z only
        kept = ''.join(ch for ch in name if ch in alphabet)

        # 1. In the field, convert DK to K, DT to T, SC to S, KN to N,
        # and MN to N
        for old, new in digraphs:
            kept = kept.replace(old, new)

        # 2. In the name field, replace multiple letters with a single letter
        kept = _delete_consecutive_repeats(kept)

        # 3. Remove vowels, W, H, and Y, but keep the first letter in the name
        # field.
        if kept:
            kept = kept[0] + ''.join(ch for ch in kept[1:]
                                     if ch not in silent)
        return kept

    first, last = [_condense(name) for name in names]
    digits = []

    # 4. The first digit of the code is obtained using PF1 and the first letter
    # of the name field. Remove this letter after coding.
    if last:
        digits.append(last[0].translate(pf1))
        last = last[1:]

    # 5. Using the last letters of the name, use Table PF3 to obtain the
    # second digit of the code. Use as many letters as possible and remove
    # after coding.
    if last:
        for ending, digit in endings:
            if last.endswith(ending):
                digits.append(digit)
                last = last[:-len(ending)]
                break
        else:
            digits.append(last[-1].translate(pf3))
            last = last[:-1]

    # 6. The third digit is found using Table PF2 and the first character of
    # the first name. Remove after coding.
    if first:
        digits.append(first[0].translate(pf2))
        first = first[1:]

    # 7. The fourth digit is found using Table PF2 and the first character of
    # the name field. If no letters remain use zero. After coding remove the
    # letter.
    # 8. The fifth digit is found in the same manner as the fourth using the
    # remaining characters of the name field if any.
    for _ in range(2):
        if last:
            digits.append(last[0].translate(pf2))
            last = last[1:]
        else:
            digits.append('0')

    return ''.join(digits)
||
| 4400 | |||
| 4401 | |||
def statistics_canada(word, maxlength=4):
    """Return the Statistics Canada code for a word.

    The original description of this algorithm could not be located, and
    may only have been specified in an unpublished TR. The coding does not
    appear to be in use by Statistics Canada any longer. In its place, this is
    an implementation of the "Census modified Statistics Canada name coding
    procedure".

    The modified version of this algorithm is described in Appendix B of
    Lynch, Billy T. and William L. Arends. `Selection of a Surname Coding
    Procedure for the SRS Record Linkage System.` Statistical Reporting
    Service, U.S. Department of Agriculture, Washington, D.C. February 1977.
    https://naldc.nal.usda.gov/download/27833/PDF

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 4) of the code to return
    :returns: the Statistics Canada name code value ('' if word contains no
        letters A-Z after normalization)
    :rtype: str

    >>> statistics_canada('Christopher')
    'CHRS'
    >>> statistics_canada('Niall')
    'NL'
    >>> statistics_canada('Smith')
    'SMTH'
    >>> statistics_canada('Schmidt')
    'SCHM'
    """
    alphabet = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    # uppercase, normalize, decompose, and filter non-A-Z out
    normalized = unicodedata.normalize('NFKD', text_type(word.upper()))
    normalized = normalized.replace('ß', 'SS')
    letters = ''.join(ch for ch in normalized if ch in alphabet)
    if not letters:
        return ''

    # The first letter is always kept; vowels (and Y) are dropped from the
    # remainder before runs of identical letters are collapsed.
    head, tail = letters[0], letters[1:]
    tail = ''.join(ch for ch in tail if ch not in 'AEIOUY')
    code = _delete_consecutive_repeats(head + tail)
    code = code.replace(' ', '')

    return code[:maxlength]
||
| 4450 | |||
| 4451 | |||
def lein(word, maxlength=4, zero_pad=True):
    """Return the Lein code for a word.

    This is Lein name coding, based on
    https://naldc.nal.usda.gov/download/27833/PDF

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 4) of the code to return
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Lein code ('' if word contains no letters A-Z after
        normalization)
    :rtype: str

    >>> lein('Christopher')
    'C351'
    >>> lein('Niall')
    'N300'
    >>> lein('Smith')
    'S210'
    >>> lein('Schmidt')
    'S521'
    """
    alphabet = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    # consonant code point -> Lein digit
    digit_for = dict(zip((ord(_) for _ in 'BCDFGJKLMNPQRSTVXZ'),
                         '451455532245351455'))
    # characters deleted by Rule 2 (space, vowels, H, W, Y)
    dropped = dict.fromkeys(ord(_) for _ in ' AEHIOUWY')

    # uppercase, normalize, decompose, and filter non-A-Z out
    normalized = unicodedata.normalize('NFKD', text_type(word.upper()))
    normalized = normalized.replace('ß', 'SS')
    letters = ''.join(ch for ch in normalized if ch in alphabet)

    if not letters:
        return ''

    initial, rest = letters[0], letters[1:]           # Rule 1
    rest = rest.translate(dropped)                    # Rule 2
    rest = _delete_consecutive_repeats(rest)          # Rule 3
    code = initial + rest.translate(digit_for)        # Rule 4

    if zero_pad:
        # over-pad, then let the final slice cut to exactly maxlength
        code += '0' * maxlength

    return code[:maxlength]
||
| 4500 | |||
| 4501 | |||
def roger_root(word, maxlength=5, zero_pad=True):
    """Return the Roger Root code for a word.

    This is Roger Root name coding, based on
    https://naldc.nal.usda.gov/download/27833/PDF

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 5) of the code to return
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Roger Root code ('' if word contains no letters A-Z after
        normalization)
    :rtype: str

    >>> roger_root('Christopher')
    '06401'
    >>> roger_root('Niall')
    '02500'
    >>> roger_root('Smith')
    '00310'
    >>> roger_root('Schmidt')
    '06310'
    """
    alphabet = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    # uppercase, normalize, decompose, and filter non-A-Z out
    normalized = unicodedata.normalize('NFKD', text_type(word.upper()))
    normalized = normalized.replace('ß', 'SS')
    word = ''.join(ch for ch in normalized if ch in alphabet)

    if not word:
        return ''

    # Codes for the start of the word, keyed by pattern length.
    # '*' is used to prevent combining by _delete_consecutive_repeats()
    initial_codes = {4: {'TSCH': '06'},
                     3: {'TSH': '06', 'SCH': '06'},
                     2: {'CE': '0*0', 'CH': '06', 'CI': '0*0', 'CY': '0*0',
                         'DG': '07', 'GF': '08', 'GM': '03', 'GN': '02',
                         'KN': '02', 'PF': '08', 'PH': '08', 'PN': '02',
                         'SH': '06', 'TS': '0*0', 'WR': '04'},
                     1: {'A': '1', 'B': '09', 'C': '07', 'D': '01', 'E': '1',
                         'F': '08', 'G': '07', 'H': '2', 'I': '1', 'J': '3',
                         'K': '07', 'L': '05', 'M': '03', 'N': '02', 'O': '1',
                         'P': '09', 'Q': '07', 'R': '04', 'S': '0*0',
                         'T': '01', 'U': '1', 'V': '08', 'W': '4', 'X': '07',
                         'Y': '5', 'Z': '0*0'}}

    # Codes for the rest of the word, keyed by pattern length.
    medial_codes = {4: {'TSCH': '6'},
                    3: {'TSH': '6', 'SCH': '6'},
                    2: {'CE': '0', 'CH': '6', 'CI': '0', 'CY': '0', 'DG': '7',
                        'PH': '8', 'SH': '6', 'TS': '0'},
                    1: {'B': '9', 'C': '7', 'D': '1', 'F': '8', 'G': '7',
                        'J': '6', 'K': '7', 'L': '5', 'M': '3', 'N': '2',
                        'P': '9', 'Q': '7', 'R': '4', 'S': '0', 'T': '1',
                        'V': '8', 'X': '7', 'Z': '0',
                        'A': '*', 'E': '*', 'H': '*', 'I': '*', 'O': '*',
                        'U': '*', 'W': '*', 'Y': '*'}}

    def _longest_match(tables, start):
        """Return (code, consumed length) for the longest pattern at start.

        If nothing is recognized, an empty code is returned and a single
        character is consumed.
        """
        for size in (4, 3, 2, 1):
            chunk = word[start:start + size]
            if chunk in tables[size]:
                return tables[size][chunk], size
        return '', 1

    # Do first digit(s) first
    code, pos = _longest_match(initial_codes, 0)

    # Then code subsequent digits
    while pos < len(word):
        piece, consumed = _longest_match(medial_codes, pos)
        code += piece
        pos += consumed

    # Collapse repeated digits, then drop the '*' separators
    code = _delete_consecutive_repeats(code).replace('*', '')

    if zero_pad:
        # over-pad, then let the final slice cut to exactly maxlength
        code += '0' * maxlength

    return code[:maxlength]
||
| 4589 | |||
| 4590 | |||
def onca(word, maxlength=4, zero_pad=True):
    """Return the Oxford Name Compression Algorithm (ONCA) code for a word.

    This is the Oxford Name Compression Algorithm, based on:
    Gill, Leicester E. 1997. "OX-LINK: The Oxford Medical Record Linkage
    System." In ``Record Linkage Techniques -- 1997``. Arlington, VA. March
    20--21, 1997.
    https://nces.ed.gov/FCSM/pdf/RLT97.pdf

    I can find no complete description of the "anglicised version of the NYSIIS
    method" identified as the first step in this algorithm, so this is likely
    not a correct implementation, in that it employs the standard NYSIIS
    algorithm.

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 4) of the code to return
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the ONCA code
    :rtype: str

    >>> onca('Christopher')
    'C623'
    >>> onca('Niall')
    'N400'
    >>> onca('Smith')
    'S530'
    >>> onca('Schmidt')
    'S530'
    """
    # First stage: NYSIIS. In the most extreme case, 3 characters of NYSIIS
    # input can be compressed to one character of output, so give it triple
    # the maxlength.
    nysiis_code = nysiis(word, maxlength=maxlength*3)

    # Second stage: Soundex of the NYSIIS code
    return soundex(nysiis_code, maxlength, zero_pad=zero_pad)
||
| 4625 | |||
| 4626 | |||
def eudex(word, maxlength=8):
    """Return the eudex phonetic hash of a word.

    This implementation of eudex phonetic hashing is based on the specification
    (not the reference implementation) at:
    Ticki. 2017. "Eudex: A blazingly fast phonetic reduction/hashing
    algorithm." https://docs.rs/crate/eudex

    Further details can be found at
    http://ticki.github.io/blog/the-eudex-algorithm/

    Each retained character contributes one byte to the hash. The first
    character occupies the most significant byte, the trailing characters are
    right-aligned in the least significant bytes, and any unused bytes in
    between are zero.

    If word is empty or contains no character known to the phone tables, it
    is hashed as if it were the single character '÷' (whose phone value has
    all bits set) instead of raising an IndexError.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned, in bytes
        (defaults to 8)
    :returns: the eudex hash
    :rtype: int

    >>> eudex('Niall')
    648518346341351840
    >>> eudex('Smith')
    720575940412906756
    >>> eudex('')
    18374686479671623680
    """
    _trailing_phones = {
        'a': 0,  # a
        'b': 0b01001000,  # b
        'c': 0b00001100,  # c
        'd': 0b00011000,  # d
        'e': 0,  # e
        'f': 0b01000100,  # f
        'g': 0b00001000,  # g
        'h': 0b00000100,  # h
        'i': 1,  # i
        'j': 0b00000101,  # j
        'k': 0b00001001,  # k
        'l': 0b10100000,  # l
        'm': 0b00000010,  # m
        'n': 0b00010010,  # n
        'o': 0,  # o
        'p': 0b01001001,  # p
        'q': 0b10101000,  # q
        'r': 0b10100001,  # r
        's': 0b00010100,  # s
        't': 0b00011101,  # t
        'u': 1,  # u
        'v': 0b01000101,  # v
        'w': 0b00000000,  # w
        'x': 0b10000100,  # x
        'y': 1,  # y
        'z': 0b10010100,  # z

        'ß': 0b00010101,  # ß
        'à': 0,  # à
        'á': 0,  # á
        'â': 0,  # â
        'ã': 0,  # ã
        'ä': 0,  # ä[æ]
        'å': 1,  # å[oː]
        'æ': 0,  # æ[æ]
        'ç': 0b10010101,  # ç[t͡ʃ]
        'è': 1,  # è
        'é': 1,  # é
        'ê': 1,  # ê
        'ë': 1,  # ë
        'ì': 1,  # ì
        'í': 1,  # í
        'î': 1,  # î
        'ï': 1,  # ï
        'ð': 0b00010101,  # ð[ð̠](represented as a non-plosive T)
        'ñ': 0b00010111,  # ñ[nj](represented as a combination of n and j)
        'ò': 0,  # ò
        'ó': 0,  # ó
        'ô': 0,  # ô
        'õ': 0,  # õ
        'ö': 1,  # ö[ø]
        '÷': 0b11111111,  # ÷
        'ø': 1,  # ø[ø]
        'ù': 1,  # ù
        'ú': 1,  # ú
        'û': 1,  # û
        'ü': 1,  # ü
        'ý': 1,  # ý
        'þ': 0b00010101,  # þ[ð̠](represented as a non-plosive T)
        'ÿ': 1,  # ÿ
    }

    _initial_phones = {
        'a': 0b10000100,  # a*
        'b': 0b00100100,  # b
        'c': 0b00000110,  # c
        'd': 0b00001100,  # d
        'e': 0b11011000,  # e*
        'f': 0b00100010,  # f
        'g': 0b00000100,  # g
        'h': 0b00000010,  # h
        'i': 0b11111000,  # i*
        'j': 0b00000011,  # j
        'k': 0b00000101,  # k
        'l': 0b01010000,  # l
        'm': 0b00000001,  # m
        'n': 0b00001001,  # n
        'o': 0b10010100,  # o*
        'p': 0b00100101,  # p
        'q': 0b01010100,  # q
        'r': 0b01010001,  # r
        's': 0b00001010,  # s
        't': 0b00001110,  # t
        'u': 0b11100000,  # u*
        'v': 0b00100011,  # v
        'w': 0b00000000,  # w
        'x': 0b01000010,  # x
        'y': 0b11100100,  # y*
        'z': 0b01001010,  # z

        'ß': 0b00001011,  # ß
        'à': 0b10000101,  # à
        'á': 0b10000101,  # á
        'â': 0b10000000,  # â
        'ã': 0b10000110,  # ã
        'ä': 0b10100110,  # ä [æ]
        'å': 0b11000010,  # å [oː]
        'æ': 0b10100111,  # æ [æ]
        'ç': 0b01010100,  # ç [t͡ʃ]
        'è': 0b11011001,  # è
        'é': 0b11011001,  # é
        'ê': 0b11011001,  # ê
        'ë': 0b11000110,  # ë [ə] or [œ]
        'ì': 0b11111001,  # ì
        'í': 0b11111001,  # í
        'î': 0b11111001,  # î
        'ï': 0b11111001,  # ï
        'ð': 0b00001011,  # ð [ð̠] (represented as a non-plosive T)
        'ñ': 0b00001011,  # ñ [nj] (represented as a combination of n and j)
        'ò': 0b10010101,  # ò
        'ó': 0b10010101,  # ó
        'ô': 0b10010101,  # ô
        'õ': 0b10010101,  # õ
        'ö': 0b11011100,  # ö [œ] or [ø]
        '÷': 0b11111111,  # ÷
        'ø': 0b11011101,  # ø [œ] or [ø]
        'ù': 0b11100001,  # ù
        'ú': 0b11100001,  # ú
        'û': 0b11100001,  # û
        'ü': 0b11100101,  # ü
        'ý': 0b11100101,  # ý
        'þ': 0b00001011,  # þ [ð̠] (represented as a non-plosive T)
        'ÿ': 0b11100101,  # ÿ
    }

    # Lowercase input & filter unknown characters
    word = ''.join(char for char in word.lower() if char in _initial_phones)

    # Without this guard, word[0] below raises IndexError for empty input or
    # input made up solely of unknown characters (digits, punctuation, ...).
    if not word:
        word = '÷'

    # Perform initial eudex coding of each character
    values = [_initial_phones[word[0]]]
    values += [_trailing_phones[char] for char in word[1:]]

    # Right-shift by one to determine if second instance should be skipped:
    # adjacent values that differ only in their lowest bit are treated as
    # duplicates and only the first of them is kept.
    shifted_values = [_ >> 1 for _ in values]
    condensed_values = [values[0]]
    for n in range(1, len(shifted_values)):
        if shifted_values[n] != shifted_values[n-1]:
            condensed_values.append(values[n])

    # Add padding after first character & trim beyond maxlength
    values = ([condensed_values[0]] +
              [0]*max(0, maxlength - len(condensed_values)) +
              condensed_values[1:maxlength])

    # Combine individual character values into eudex hash
    hash_value = 0
    for val in values:
        hash_value = (hash_value << 8) | val

    return hash_value
||
| 4793 | |||
| 4794 | |||
def haase_phonetik(word, primary_only=False):
    """Return the Haase Phonetik (numeric output) code for a word.

    Based on the algorithm described at
    https://github.com/elastic/elasticsearch/blob/master/plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/phonetic/HaasePhonetik.java

    Based on the original
    Haase, Martin and Kai Heitmann. 2000. Die Erweiterte Kölner Phonetik.

    While the output code is numeric, it is still a str.

    :param str word: the word to transform
    :param bool primary_only: if True, only the word itself is encoded; if
        False (the default), codes are generated for every combination of the
        alternative spellings recognized in the word
    :returns: the Haase Phonetik value(s) as numeric strings, one per encoded
        spelling; the empty string '' if word contains no letters A-Z after
        normalization
    :rtype: tuple of str (or '' for empty input)
    """
    def _after(word, i, letters):
        """Return True if word[i] follows one of the supplied letters."""
        return i > 0 and word[i-1] in letters

    def _before(word, i, letters):
        """Return True if word[i] precedes one of the supplied letters."""
        return i+1 < len(word) and word[i+1] in letters

    _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}

    # Compose first (NFKC) so that umlauts are single code points, whether
    # the caller supplied them precomposed or as letter + combining mark.
    # The umlaut expansions must happen BEFORE the NFKD decomposition below:
    # once 'Ä' has been decomposed into 'A' + U+0308 the replacements can no
    # longer match.
    word = unicodedata.normalize('NFKC', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    word = word.replace('Ä', 'AE')
    word = word.replace('Ö', 'OE')
    word = word.replace('Ü', 'UE')

    # Decompose the remaining accented letters and filter non-A-Z out
    word = unicodedata.normalize('NFKD', word)
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    # Nothing to convert, return base case
    if not word:
        return ''

    variants = []
    if primary_only:
        variants = [word]
    else:
        # Split the word into segments; each segment is a tuple holding the
        # original spelling and, where one is defined, an alternative.
        pos = 0
        if word[:2] == 'CH':
            variants.append(('CH', 'SCH'))
            pos += 2
        len_3_vars = {'OWN': 'AUN', 'WSK': 'RSK', 'SCH': 'CH', 'GLI': 'LI',
                      'AUX': 'O', 'EUX': 'O'}
        while pos < len(word):
            if word[pos:pos+4] == 'ILLE':
                variants.append(('ILLE', 'I'))
                pos += 4
            elif word[pos:pos+3] in len_3_vars:
                variants.append((word[pos:pos+3],
                                 len_3_vars[word[pos:pos+3]]))
                pos += 3
            elif word[pos:pos+2] == 'RB':
                variants.append(('RB', 'RW'))
                pos += 2
            elif word[pos:] == 'EAU':
                # only at the very end of the word
                variants.append(('EAU', 'O'))
                pos += 3
            elif word[pos:] == 'O':
                # only as the final letter
                variants.append(('O', 'OW'))
                pos += 1
            elif word[pos:] == 'A':
                # only as the final letter
                variants.append(('A', 'AR'))
                pos += 1
            else:
                variants.append((word[pos],))
                pos += 1

        # Every combination of segment spellings is one variant of the word
        variants = [''.join(letters) for letters in product(*variants)]

    def _haase_code(word):
        """Return the numeric code for a single spelling of the word."""
        sdx = ''
        for i, char in enumerate(word):
            if char in _vowels:
                sdx += '9'
            elif char == 'B':
                sdx += '1'
            elif char == 'P':
                if _before(word, i, {'H'}):
                    sdx += '3'
                else:
                    sdx += '1'
            elif char in {'D', 'T'}:
                if _before(word, i, {'C', 'S', 'Z'}):
                    sdx += '8'
                else:
                    sdx += '2'
            elif char in {'F', 'V', 'W'}:
                sdx += '3'
            elif char in {'G', 'K', 'Q'}:
                sdx += '4'
            elif char == 'C':
                if _after(word, i, {'S', 'Z'}):
                    sdx += '8'
                elif i == 0:
                    # word-initial C: 'L' and 'R' also trigger the '4' code
                    if _before(word, i, {'A', 'H', 'K', 'L', 'O', 'Q', 'R',
                                         'U', 'X'}):
                        sdx += '4'
                    else:
                        sdx += '8'
                elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}):
                    sdx += '4'
                else:
                    sdx += '8'
            elif char == 'X':
                if _after(word, i, {'C', 'K', 'Q'}):
                    sdx += '8'
                else:
                    sdx += '48'
            elif char == 'L':
                sdx += '5'
            elif char in {'M', 'N'}:
                sdx += '6'
            elif char == 'R':
                sdx += '7'
            elif char in {'S', 'Z'}:
                sdx += '8'
            # any other letter (i.e. 'H') contributes nothing

        # The vowel code '9' is kept in the result; only repeats are removed.
        sdx = _delete_consecutive_repeats(sdx)

        return sdx

    return tuple(_haase_code(word) for word in variants)
||
| 4930 | |||
| 4931 | |||
def reth_schek_phonetik(word):
    """Return Reth-Schek Phonetik code for a word.

    This algorithm is proposed in:
    von Reth, Hans-Peter and Schek, Hans-Jörg. 1977. "Eine Zugriffsmethode für
    die phonetische Ähnlichkeitssuche." Heidelberg Scientific Center technical
    reports 77.03.002. IBM Deutschland GmbH.

    Since I couldn't secure a copy of that document (maybe I'll look for it
    next time I'm in Germany), this implementation is based on what I could
    glean from the implementations published by German Record Linkage
    Center (www.record-linkage.de):
    - Privacy-preserving Record Linkage (PPRL) (in R)
    - Merge ToolBox (in Java)

    Rules that are unclear:
    - Should 'C' become 'G' or 'Z'? (PPRL has both, 'Z' rule blocked)
    - Should 'CC' become 'G'? (PPRL has blocked 'CK' that may be typo)
    - Should 'TUI' -> 'ZUI' rule exist? (PPRL has rule, but I can't
        think of a German word with '-tui-' in it.)
    - Should we really change 'SCH' -> 'CH' and then 'CH' -> 'SCH'?

    :param str word: the word to transform
    :returns: the Reth-Schek Phonetik code (an empty string for empty input)
    :rtype: str
    """
    # Replacement tables keyed by the length of the matched substring;
    # longer matches are tried first at each position.
    replacements = {3: {'AEH': 'E', 'IEH': 'I', 'OEH': 'OE', 'UEH': 'UE',
                        'SCH': 'CH', 'ZIO': 'TIO', 'TIU': 'TIO', 'ZIU': 'TIO',
                        'CHS': 'X', 'CKS': 'X', 'AEU': 'OI'},
                    2: {'LL': 'L', 'AA': 'A', 'AH': 'A', 'BB': 'B', 'PP': 'B',
                        'BP': 'B', 'PB': 'B', 'DD': 'D', 'DT': 'D', 'TT': 'D',
                        'TH': 'D', 'EE': 'E', 'EH': 'E', 'AE': 'E', 'FF': 'F',
                        'PH': 'F', 'KK': 'K', 'GG': 'G', 'GK': 'G', 'KG': 'G',
                        'CK': 'G', 'CC': 'C', 'IE': 'I', 'IH': 'I', 'MM': 'M',
                        'NN': 'N', 'OO': 'O', 'OH': 'O', 'SZ': 'S', 'UH': 'U',
                        'GS': 'X', 'KS': 'X', 'TZ': 'Z', 'AY': 'AI',
                        'EI': 'AI', 'EY': 'AI', 'EU': 'OI', 'RR': 'R',
                        'SS': 'S', 'KW': 'QU'},
                    1: {'P': 'B', 'T': 'D', 'V': 'F', 'W': 'F', 'C': 'G',
                        'K': 'G', 'Y': 'I'}}

    # Uppercase
    word = word.upper()

    # Replace umlauts/eszett
    word = word.replace('Ä', 'AE')
    word = word.replace('Ö', 'OE')
    word = word.replace('Ü', 'UE')
    word = word.replace('ß', 'SS')

    # Main loop, using above replacements table
    pos = 0
    while pos < len(word):
        for num in range(3, 0, -1):
            chunk = word[pos:pos+num]
            if chunk in replacements[num]:
                word = word[:pos] + replacements[num][chunk] + word[pos+num:]
                # Advance by one character only, so the tail of a
                # multi-character replacement can still be rewritten.
                pos += 1
                break
        else:
            pos += 1  # Advance if nothing is recognized

    # Change 'CH' back(?) to 'SCH'
    word = word.replace('CH', 'SCH')

    # Replace final sequences; slices (not indexing) keep this safe when the
    # word is empty.
    if word[-2:] == 'ER':
        word = word[:-2]+'R'
    elif word[-2:] == 'EL':
        word = word[:-2]+'L'
    elif word[-1:] == 'H':
        word = word[:-1]

    return word
||
| 5005 | |||
| 5006 | |||
def fonem(word):
    """Return the FONEM code of a word.

    FONEM is a phonetic algorithm designed for French (particularly surnames in
    Saguenay, Canada), defined in:
    Bouchard, Gérard, Patrick Brard, and Yolande Lavoie. 1981. "FONEM: Un code
    de transcription phonétique pour la reconstitution automatique des
    familles saguenayennes." Population. 36(6). 1085--1103.
    https://doi.org/10.2307/1532326
    http://www.persee.fr/doc/pop_0032-4663_1981_num_36_6_17248

    Guillaume Plique's Javascript implementation at
    https://github.com/Yomguithereal/talisman/blob/master/src/phonetics/french/fonem.js
    was also consulted for this implementation.

    The word is upper-cased, NFKD-normalized and filtered down to A-Z and
    '-'; the rules named in ``rule_order`` are then applied one after
    another, each to the result of the previous one.

    :param str word: the word to transform
    :returns: the FONEM code
    :rtype: str
    """
    # I don't see a sane way of doing this without regexps :(
    #
    # Each entry maps a rule name to a (pattern, replacement) pair. The
    # pattern is either a compiled regex (applied with .sub) or a plain
    # string (applied with str.replace); the replacement is a string or, for
    # C-29, a callable receiving the match object.
    rule_table = {
        # Vowels & groups of vowels
        'V-1': (re.compile('E?AU'), 'O'),
        'V-2,5': (re.compile('(E?AU|O)L[TX]$'), 'O'),
        'V-3,4': (re.compile('E?AU[TX]$'), 'O'),
        'V-6': (re.compile('E?AUL?D$'), 'O'),
        'V-7': (re.compile(r'(?<!G)AY$'), 'E'),
        'V-8': (re.compile('EUX$'), 'EU'),
        'V-9': (re.compile('EY(?=$|[BCDFGHJKLMNPQRSTVWXZ])'), 'E'),
        'V-10': ('Y', 'I'),
        'V-11': (re.compile('(?<=[AEIOUY])I(?=[AEIOUY])'), 'Y'),
        'V-12': (re.compile('(?<=[AEIOUY])ILL'), 'Y'),
        'V-13': (re.compile('OU(?=[AEOU]|I(?!LL))'), 'W'),
        # Collapses a doubled vowel: the first of two identical vowels is
        # removed.
        'V-14': (re.compile(r'([AEIOUY])(?=\1)'), ''),
        # Nasal vowels
        'V-15': (re.compile('[AE]M(?=[BCDFGHJKLMPQRSTVWXZ])(?!$)'), 'EN'),
        'V-16': (re.compile('OM(?=[BCDFGHJKLMPQRSTVWXZ])'), 'ON'),
        'V-17': (re.compile('AN(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'EN'),
        'V-18': (re.compile('(AI[MN]|EIN)(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'),
                 'IN'),
        'V-19': (re.compile('B(O|U|OU)RNE?$'), 'BURN'),
        'V-20': (re.compile('(^IM|(?<=[BCDFGHJKLMNPQRSTVWXZ])IM(?=[BCDFGHJKLMPQRSTVWXZ]))'),
                 'IN'),
        # Consonants and groups of consonants
        'C-1': ('BV', 'V'),
        'C-2': (re.compile('(?<=[AEIOUY])C(?=[EIY])'), 'SS'),
        'C-3': (re.compile('(?<=[BDFGHJKLMNPQRSTVWZ])C(?=[EIY])'), 'S'),
        'C-4': (re.compile('^C(?=[EIY])'), 'S'),
        'C-5': (re.compile('^C(?=[OUA])'), 'K'),
        'C-6': (re.compile('(?<=[AEIOUY])C$'), 'K'),
        'C-7': (re.compile('C(?=[BDFGJKLMNPQRSTVWXZ])'), 'K'),
        'C-8': (re.compile('CC(?=[AOU])'), 'K'),
        'C-9': (re.compile('CC(?=[EIY])'), 'X'),
        'C-10': (re.compile('G(?=[EIY])'), 'J'),
        # '#' is a placeholder that protects the rewritten text from later
        # rules; it is turned back into letters by C-34/C-35.
        'C-11': (re.compile('GA(?=I?[MN])'), 'G#'),
        'C-12': (re.compile('GE(O|AU)'), 'JO'),
        'C-13': (re.compile('GNI(?=[AEIOUY])'), 'GN'),
        'C-14': (re.compile('(?<![PCS])H'), ''),
        'C-15': ('JEA', 'JA'),
        'C-16': (re.compile('^MAC(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'MA#'),
        'C-17': (re.compile('^MC'), 'MA#'),
        'C-18': ('PH', 'F'),
        'C-19': ('QU', 'K'),
        'C-20': (re.compile('^SC(?=[EIY])'), 'S'),
        'C-21': (re.compile('(?<=.)SC(?=[EIY])'), 'SS'),
        'C-22': (re.compile('(?<=.)SC(?=[AOU])'), 'SK'),
        'C-23': ('SH', 'CH'),
        'C-24': (re.compile('TIA$'), 'SSIA'),
        'C-25': (re.compile('(?<=[AIOUY])W'), ''),
        'C-26': (re.compile('X[CSZ]'), 'X'),
        'C-27': (re.compile('(?<=[AEIOUY])Z|(?<=[BCDFGHJKLMNPQRSTVWXZ])Z(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'S'),
        # C-28 and its variants reduce doubled consonants.
        'C-28': (re.compile(r'([BDFGHJKMNPQRTVWXZ])\1'), r'\1'),
        'C-28a': (re.compile('CC(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'C'),
        'C-28b': (re.compile('((?<=[BCDFGHJKLMNPQRSTVWXZ])|^)SS'), 'S'),
        'C-28bb': (re.compile('SS(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'S'),
        'C-28c': (re.compile('((?<=[^I])|^)LL'), 'L'),
        'C-28d': (re.compile('ILE$'), 'ILLE'),
        # The replacement keeps whichever capture group matched: the listed
        # final cluster is kept whole (group 1), otherwise only the first of
        # two final consonants is kept (group 2).
        'C-29': (re.compile('(ILS|[CS]H|[MN]P|R[CFKLNSX])$|([BCDFGHJKLMNPQRSTVWXZ])[BCDFGHJKLMNPQRSTVWXZ]$'),
                 lambda m: (m.group(1) or '') + (m.group(2) or '')),
        'C-30,32': (re.compile('^(SA?INT?|SEI[NM]|CINQ?|ST)(?!E)-?'), 'ST-'),
        'C-31,33': (re.compile('^(SAINTE|STE)-?'), 'STE-'),
        # Rules to undo rule bleeding prevention in C-11, C-16, C-17
        'C-34': ('G#', 'GA'),
        'C-35': ('MA#', 'MAC')
    }
    # Order in which the rules are applied. V-14 and the C-28 family appear
    # both at the start and near the end, so doubled letters are reduced
    # before and after the other rewrites.
    rule_order = [
        'V-14', 'C-28', 'C-28a', 'C-28b', 'C-28bb', 'C-28c', 'C-28d',
        'C-12',
        'C-8', 'C-9', 'C-10',
        'C-16', 'C-17', 'C-2', 'C-3', 'C-7',
        'V-2,5', 'V-3,4', 'V-6',
        'V-1', 'C-14',
        'C-31,33', 'C-30,32',
        'C-11', 'V-15', 'V-17', 'V-18',
        'V-7', 'V-8', 'V-9', 'V-10', 'V-11', 'V-12', 'V-13', 'V-16',
        'V-19', 'V-20',
        'C-1', 'C-4', 'C-5', 'C-6', 'C-13', 'C-15',
        'C-18', 'C-19', 'C-20', 'C-21', 'C-22', 'C-23', 'C-24',
        'C-25', 'C-26', 'C-27',
        'C-29',
        'V-14', 'C-28', 'C-28a', 'C-28b', 'C-28bb', 'C-28c', 'C-28d',
        'C-34', 'C-35'
    ]

    # normalize, upper-case, and filter non-French letters
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    # 198 and 338 are the code points of the ligatures 'Æ' and 'Œ'.
    word = word.translate({198: 'AE', 338: 'OE'})
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z', '-'})

    for rule in rule_order:
        regex, repl = rule_table[rule]
        # Plain-string rules use str.replace; compiled patterns use .sub.
        if isinstance(regex, text_type):
            word = word.replace(regex, repl)
        else:
            word = regex.sub(repl, word)

    return word
||
| 5128 | |||
| 5129 | |||
def parmar_kumbharana(word):
    """Return the Parmar-Kumbharana encoding of a word.

    This is based on the phonetic algorithm proposed in
    Parmar, Vimal P. and CK Kumbharana. 2014. "Study Existing Various Phonetic
    Algorithms and Designing and Development of a working model for the New
    Developed Algorithm and Comparison by implementing it with Existing
    Algorithm(s)." International Journal of Computer Applications. 98(19).
    https://doi.org/10.5120/17295-7795

    :param str word: the word to transform
    :returns: the Parmar-Kumbharana encoding (the empty input is returned
        unchanged)
    :rtype: str
    """
    # Substitution rules keyed by the length of the matched substring;
    # longer matches are tried first at each position.
    rule_table = {4: {'OUGH': 'F'},
                  3: {'DGE': 'J',
                      'OUL': 'U',
                      'GHT': 'T'},
                  2: {'CE': 'S', 'CI': 'S', 'CY': 'S',
                      'GE': 'J', 'GI': 'J', 'GY': 'J',
                      'WR': 'R',
                      'GN': 'N', 'KN': 'N', 'PN': 'N',
                      'CK': 'K',
                      'SH': 'S'}}
    # Code points of A, E, I, O, U, Y, each mapped to the empty string.
    vowel_trans = {65: '', 69: '', 73: '', 79: '', 85: '', 89: ''}

    word = word.upper()  # Rule 3
    if not word:
        # Nothing to encode; also avoids indexing into an empty string below.
        return word
    word = _delete_consecutive_repeats(word)  # Rule 4

    # Rule 5
    i = 0
    while i < len(word):
        for match_len in range(4, 1, -1):
            fragment = word[i:i+match_len]
            if fragment in rule_table[match_len]:
                repl = rule_table[match_len][fragment]
                word = word[:i] + repl + word[i+match_len:]
                # Skip past the replacement so it is not rewritten again.
                i += len(repl)
                break
        else:
            i += 1

    # Rule 6: drop vowels everywhere except in the first position
    word = word[:1] + word[1:].translate(vowel_trans)
    return word
||
| 5172 | |||
| 5173 | |||
def davidson(lname, fname='.', omit_fname=False):
    """Return Davidson's Consonant Code.

    This is based on the name compression system described in:
    Davidson, Leon. 1962. "Retrieval of Misspelled Names in an Airline
    Passenger Record System." Communications of the ACM. 5(3). 169--171.
    https://dl.acm.org/citation.cfm?id=366913

    Dolby (1970) identifies this as having been the name compression algorithm
    used by SABRE.

    :param str lname: Last name (or word) to be encoded
    :param str fname: First name (optional), of which the first character is
        included in the code.
    :param bool omit_fname: Set to True to completely omit the first character
        of the first name
    :return: Davidson's Consonant Code: four characters derived from the last
        name (space-padded), followed by the first-name initial unless omitted
    """
    # Code points of A, E, I, O, U, H, W, Y: removed after the first letter.
    dropped = dict.fromkeys((65, 69, 73, 79, 85, 72, 87, 89), '')

    surname = lname.upper()
    initial, remainder = surname[:1], surname[1:]
    squeezed = _delete_consecutive_repeats(initial +
                                           remainder.translate(dropped))

    # Truncate to four characters, padding shorter codes with spaces.
    padded = squeezed[:4] + ' ' * (4 - len(squeezed))

    if omit_fname:
        return padded
    return padded + fname[:1].upper()
||
| 5202 | |||
| 5203 | |||
def sound_d(word, maxlength=4):
    """Return the SoundD code.

    SoundD is defined in
    Varol, Cihan and Coskun Bayrak. 2012. "Hybrid Matching Algorithm for
    Personal Names." Journal of Data and Information Quality, 3(4).
    doi:10.1145/2348828.2348830

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 4);
        None disables padding and truncation
    :return: the SoundD code, a string of digits
    """
    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    allowed = frozenset(alphabet)
    # Letter code point -> digit, position by position.
    digit_for = {ord(letter): digit for letter, digit in
                 zip(alphabet, '01230120022455012623010202')}

    # Upper-case, decompose and keep only A-Z.
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(ch for ch in word if ch in allowed)

    # Initial-letter adjustments.
    if word.startswith(('KN', 'GN', 'PN', 'AC', 'WR')):
        word = word[1:]
    elif word.startswith('X'):
        word = 'S'+word[1:]
    elif word.startswith('WH'):
        word = 'W'+word[2:]

    # Letter clusters that are coded as a unit, applied in this order.
    for cluster, digits in (('DGE', '20'), ('DGI', '20'), ('GH', '0')):
        word = word.replace(cluster, digits)

    word = word.translate(digit_for)
    word = _delete_consecutive_repeats(word)
    word = word.replace('0', '')

    if maxlength is None:
        return word
    if len(word) < maxlength:
        return word + '0' * (maxlength-len(word))
    return word[:maxlength]
||
| 5247 | |||
| 5248 | |||
def pshp_soundex_last(lname, maxlength=4, german=False):
    """Calculate the PSHP Soundex/Viewex Coding of a last name.

    This coding is based on Hershberg, Theodore, Alan Burstein, and Robert
    Dockhorn. 1976. "Record Linkage." Historical Methods Newsletter.
    9(2-3). 137--163. doi:10.1080/00182494.1976.10112639

    Reference was also made to the German version of the same:
    Hershberg, Theodore, Alan Burstein, and Robert Dockhorn. 1976. "Verkettung
    von Daten: Record Linkage am Beispiel des Philadelphia Social History
    Project." Moderne Stadtgeschichte. Stuttgart: Klett-Cotta, 1979.
    http://nbn-resolving.de/urn:nbn:de:0168-ssoar-327824

    A separate function, pshp_soundex_first() is used for first names.

    :param lname: the last name to encode
    :param maxlength: the length of the code returned (defaults to 4); None
        disables padding and truncation
    :param german: set to True if the name is German (different rules apply)
    :return: the PSHP Soundex/Viewex code
    """
    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    allowed = frozenset(alphabet)

    # Upper-case, decompose and keep only A-Z.
    lname = unicodedata.normalize('NFKD', text_type(lname.upper()))
    lname = lname.replace('ß', 'SS')
    lname = ''.join(ch for ch in lname if ch in allowed)

    # A. Prefix treatment
    if lname.startswith(('VON', 'VAN')):
        lname = lname[3:].strip()

    # The rule implemented below says "MC, MAC become 1". I believe it meant to
    # say they become M except in German data (where superscripted 1 indicates
    # "except in German data"). It doesn't make sense for them to become 1
    # (BPFV -> 1) or to apply outside German. Unfortunately, both articles have
    # this error(?).
    if not german:
        if lname.startswith('MAC'):
            lname = 'M'+lname[3:]
        elif lname.startswith('MC'):
            lname = 'M'+lname[2:]

    # The non-German-only rule to strip ' is unnecessary due to filtering

    # First matching rule wins; only the initial letter is replaced.
    initial_rules = (
        (('E', 'I', 'O', 'U'), 'A'),
        (('GE', 'GI', 'GY'), 'J'),
        (('CE', 'CI', 'CY'), 'S'),
        (('CHR',), 'K'),
    )
    for prefixes, new_initial in initial_rules:
        if lname.startswith(prefixes):
            lname = new_initial + lname[1:]
            break
    else:
        # Any other initial C becomes K, unless it starts 'CH'.
        if lname.startswith('C') and not lname.startswith('CH'):
            lname = 'K' + lname[1:]

    for prefixes, new_initial in ((('KN',), 'N'),
                                  (('PH',), 'F'),
                                  (('WIE', 'WEI'), 'V')):
        if lname.startswith(prefixes):
            lname = new_initial + lname[1:]
            break

    german_initials = {'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}
    if german and lname[:1] in german_initials:
        lname = german_initials[lname[0]]+lname[1:]

    # The first letter of the code is fixed before the suffix rules run.
    code = lname[:1]

    # B. Postfix treatment
    if lname.endswith('R'):
        lname = lname[:-1] + 'N'
    elif lname.endswith(('SE', 'CE')):
        lname = lname[:-2]
    if lname.endswith('SS'):
        lname = lname[:-2]
    elif lname.endswith('S'):
        lname = lname[:-1]

    if not german:
        suffix_tables = (
            (5, {'STOWN': 'SAWON', 'MPSON': 'MASON'}),
            (4, {'NSEN': 'ASEN', 'MSON': 'ASON', 'STEN': 'SAEN',
                 'STON': 'SAON'}),
        )
        # Five-letter endings take precedence over four-letter ones.
        for width, table in suffix_tables:
            ending = lname[-width:]
            if ending in table:
                lname = lname[:-width] + table[ending]
                break

    if lname.endswith(('NG', 'ND')):
        lname = lname[:-1]
    if not german and lname.endswith(('GAN', 'GEN')):
        lname = lname[:-3]+'A'+lname[-2:]

    if german:
        if lname.endswith('TES'):
            lname = lname[:-3]
        elif lname.endswith('TS'):
            lname = lname[:-2]
        if lname.endswith('TZE'):
            lname = lname[:-3]
        elif lname.endswith('ZE'):
            lname = lname[:-2]
        if lname.endswith('Z'):
            lname = lname[:-1]
        elif lname.endswith('TE'):
            lname = lname[:-2]

    # C. Infix Treatment (applied in this order)
    for cluster, reduced in (('CK', 'C'), ('SCH', 'S'), ('DT', 'T'),
                             ('ND', 'N'), ('NG', 'N'), ('LM', 'M'),
                             ('MN', 'M'), ('WIE', 'VIE'), ('WEI', 'VEI')):
        lname = lname.replace(cluster, reduced)

    # D. Soundexing
    # code for X & Y are unspecified, but presumably are 2 & 0
    digit_for = {ord(letter): digit for letter, digit in
                 zip(alphabet, '01230120022455012523010202')}

    lname = lname.translate(digit_for)
    lname = _delete_consecutive_repeats(lname)

    code += lname[1:]
    code = code.replace('0', '')  # rule 1

    if maxlength is None:
        return code
    if len(code) < maxlength:
        return code + '0' * (maxlength-len(code))
    return code[:maxlength]
||
| 5383 | |||
| 5384 | |||
def pshp_soundex_first(fname, maxlength=4, german=False):
    """Calculate the PSHP Soundex/Viewex Coding of a first name.

    This coding is based on Hershberg, Theodore, Alan Burstein, and Robert
    Dockhorn. 1976. "Record Linkage." Historical Methods Newsletter.
    9(2-3). 137--163. doi:10.1080/00182494.1976.10112639

    Reference was also made to the German version of the same:
    Hershberg, Theodore, Alan Burstein, and Robert Dockhorn. 1976. "Verkettung
    von Daten: Record Linkage am Beispiel des Philadelphia Social History
    Project." Moderne Stadtgeschichte. Stuttgart: Klett-Cotta, 1979.
    http://nbn-resolving.de/urn:nbn:de:0168-ssoar-327824

    A separate function, pshp_soundex_last() is used for last names.

    :param fname: the first name to encode
    :param maxlength: the length of the code returned (defaults to 4); None
        disables padding and truncation
    :param german: set to True if the name is German (different rules apply)
    :return: the PSHP Soundex/Viewex code
    """
    # Upper-case, decompose and keep only A-Z.
    fname = unicodedata.normalize('NFKD', text_type(fname.upper()))
    fname = fname.replace('ß', 'SS')
    fname = ''.join(c for c in fname if c in
                    {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
                     'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
                     'W', 'X', 'Y', 'Z'})

    # special rules
    if fname == 'JAMES':
        code = 'J7'
    elif fname == 'PAT':
        code = 'P7'

    else:
        # A. Prefix treatment
        if fname[:2] in {'GE', 'GI', 'GY'}:
            fname = 'J' + fname[1:]
        elif fname[:2] in {'CE', 'CI', 'CY'}:
            fname = 'S' + fname[1:]
        elif fname[:3] == 'CHR':
            fname = 'K' + fname[1:]
        elif fname[:1] == 'C' and fname[:2] != 'CH':
            fname = 'K' + fname[1:]

        if fname[:2] == 'KN':
            fname = 'N' + fname[1:]
        elif fname[:2] == 'PH':
            fname = 'F' + fname[1:]
        elif fname[:3] in {'WIE', 'WEI'}:
            fname = 'V' + fname[1:]

        if german and fname[:1] in {'W', 'M', 'Y', 'Z'}:
            fname = ({'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}[fname[0]] +
                     fname[1:])

        # The first letter of the code is kept as a letter.
        code = fname[:1]

        # B. Soundex coding
        # code for Y unspecified, but presumably is 0
        _pshp_translation = dict(zip((ord(_) for _ in
                                      'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                     '01230120022455012523010202'))

        fname = fname.translate(_pshp_translation)
        fname = _delete_consecutive_repeats(fname)

        code += fname[1:]
        # Truncate the code just after its first '0' when a second '0'
        # follows.
        # NOTE(review): syl2_ptr is relative to the slice while syl_ptr is an
        # absolute index, so the comparison mixes two index bases -- confirm
        # the intended rule against the paper before changing it.
        syl_ptr = code.find('0')
        syl2_ptr = code[syl_ptr + 1:].find('0')
        if syl_ptr != -1 and syl2_ptr != -1 and syl2_ptr - syl_ptr > -1:
            code = code[:syl_ptr + 2]

        code = code.replace('0', '')  # rule 1

    if maxlength is not None:
        if len(code) < maxlength:
            code += '0' * (maxlength-len(code))
        else:
            code = code[:maxlength]

    return code
||
| 5465 | |||
| 5466 | |||
def henry_early(word, maxlength=3):
    """Calculate the early version of the Henry code for a word.

    The early version of Henry coding is given in:
    Légaré, Jacques, Yolande Lavoie, and Hubert Charbonneau. 1972. "The Early
    Canadian Population: Problems in Automatic Record Linkage." Canadian
    Historical Review, 53(4). 427--442.
    doi:10.3138/CHR-053-04-03

    :param word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 3);
        None disables truncation
    :return: the early Henry code (an empty string if the word contains no
        letters A-Z)
    """
    _cons = {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q',
             'R', 'S', 'T', 'V', 'W', 'X', 'Z'}
    _vows = {'A', 'E', 'I', 'O', 'U', 'Y'}
    _diph = {'AI': 'E', 'AY': 'E', 'EI': 'E', 'AU': 'O', 'OI': 'O', 'OU': 'O',
             'EU': 'U'}
    _unaltered = {'B', 'D', 'F', 'J', 'K', 'L', 'M', 'N', 'R', 'T', 'V'}
    # 'Z' (not 'V') belongs here: V is already emitted unchanged via
    # _unaltered, so a 'V' entry could never be reached, while Z would
    # otherwise never be encoded at all.
    _simple = {'W': 'V', 'X': 'S', 'Z': 'S'}
    # Consonants other than the liquids L and R (hoisted out of the loop).
    _cons_not_lr = _cons-{'L', 'R'}

    # Upper-case, decompose and keep only A-Z.
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    # Nothing to encode; also avoids indexing into an empty string below.
    if not word:
        return ''

    # Rule Ia seems to be covered entirely in II

    # Rule Ib
    if word[0] in _vows:
        # Ib1
        if (((word[1:2] in _cons-{'M', 'N'} and word[2:3] in _cons) or
             (word[1:2] in _cons and word[2:3] not in _cons))):
            if word[0] == 'Y':
                word = 'I'+word[1:]
        # Ib2
        elif word[1:2] in {'M', 'N'} and word[2:3] in _cons:
            if word[0] == 'E':
                word = 'A'+word[1:]
            elif word[0] in {'I', 'U', 'Y'}:
                word = 'E'+word[1:]
        # Ib3
        elif word[:2] in _diph:
            word = _diph[word[:2]]+word[2:]
        # Ib4
        elif word[1:2] in _vows and word[0] == 'Y':
            word = 'I' + word[1:]

    code = ''
    skip = 0

    # Rule II
    for pos, char in enumerate(word):
        # Neighbouring characters are taken from the word; both slices are
        # empty at the respective word boundary.
        nxch = word[pos+1:pos+2]
        prev = word[pos-1:pos]

        if skip:
            skip -= 1
        elif char in _vows:
            code += char
        # IIc
        elif char == nxch:
            skip = 1
            code += char
        elif word[pos:pos+2] in {'CQ', 'DT', 'SC'}:
            skip = 1
            code += word[pos+1]
        # IId
        elif char == 'H' and prev in _cons:
            continue
        elif char == 'S' and nxch in _cons:
            continue
        elif char in _cons_not_lr and nxch in _cons_not_lr:
            continue
        elif char == 'L' and nxch in {'M', 'N'}:
            continue
        elif char in {'M', 'N'} and prev in _vows and nxch in _cons:
            continue
        # IIa
        elif char in _unaltered:
            code += char
        # IIb
        elif char in _simple:
            code += _simple[char]
        elif char in {'C', 'G', 'P', 'Q', 'S'}:
            if char == 'C':
                if nxch in {'A', 'O', 'U', 'L', 'R'}:
                    code += 'K'
                elif nxch in {'E', 'I', 'Y'}:
                    code += 'J'
                elif nxch == 'H':
                    if word[pos+2:pos+3] in _vows:
                        code += 'C'
                    elif word[pos+2:pos+3] in {'R', 'L'}:
                        code += 'K'
            elif char == 'G':
                if nxch in {'A', 'O', 'U', 'L', 'R'}:
                    code += 'G'
                elif nxch in {'E', 'I', 'Y'}:
                    code += 'J'
                elif nxch == 'N':
                    code += 'N'
            elif char == 'P':
                if nxch != 'H':
                    code += 'P'
                else:
                    code += 'F'
            elif char == 'Q':
                # Look at the two letters following Q and append the result
                # to the code being built.
                if word[pos+1:pos+3] in {'UE', 'UI', 'UY'}:
                    code += 'G'
                elif word[pos+1:pos+3] in {'UA', 'UO'}:
                    code += 'K'
            elif char == 'S':
                if word[pos:pos+6] == 'SAINTE':
                    code += 'X'
                    skip = 5
                elif word[pos:pos+5] == 'SAINT':
                    code += 'X'
                    skip = 4
                elif word[pos:pos+3] == 'STE':
                    code += 'X'
                    skip = 2
                elif word[pos:pos+2] == 'ST':
                    code += 'X'
                    skip = 1
                else:
                    code += 'S'
        else:  # only H not preceded by a consonant reaches this branch
            continue

    # IIe1
    if code[-4:] in {'AULT', 'EULT', 'OULT'}:
        code = code[:-2]
    elif code[-4:-3] in _vows and code[-3:] == 'MPS':
        code = code[:-3]
    elif code[-3:-2] in _vows and code[-2:] in {'MB', 'MP', 'ND', 'NS', 'NT'}:
        code = code[:-2]
    elif code[-2:-1] == 'R' and code[-1:] in _cons:
        code = code[:-1]
    # IIe2
    elif code[-2:-1] in _vows and code[-1:] in {'D', 'M', 'N', 'S', 'T'}:
        code = code[:-1]
    elif code[-2:] == 'ER':
        code = code[:-1]

    # Drop non-initial vowels (code points of A, E, I, O, U, Y)
    code = code[:1]+code[1:].translate({65: '', 69: '', 73: '', 79: '', 85: '',
                                        89: ''})

    if maxlength is not None:
        code = code[:maxlength]

    return code
||
| 5621 | |||
| 5622 | |||
def norphone(word):
    """Return the Norphone code.

    The reference implementation by Lars Marius Garshol is available at
    https://github.com/larsga/Duke/blob/master/duke-core/src/main/java/no/priv/garshol/duke/comparators/NorphoneComparator.java

    Norphone was designed for Norwegian, but this implementation has been
    extended to support Swedish vowels as well. This function incorporates
    the "not implemented" rules from the above file's rule set.

    :param word: the word to transform
    :return: the Norphone code
    """
    _vowels = {'A', 'E', 'I', 'O', 'U', 'Y', 'Å', 'Æ', 'Ø', 'Ä', 'Ö'}

    # Substitutions keyed by the length of the matched substring.
    replacements = {4: {'SKEI': 'X'},
                    3: {'SKJ': 'X', 'KEI': 'X'},
                    2: {'CH': 'K', 'CK': 'K', 'GJ': 'J', 'GH': 'K', 'HG': 'K',
                        'HJ': 'J', 'HL': 'L', 'HR': 'R', 'KJ': 'X', 'KI': 'X',
                        'LD': 'L', 'ND': 'N', 'PH': 'F', 'TH': 'T', 'SJ': 'X'},
                    1: {'W': 'V', 'X': 'KS', 'Z': 'S', 'D': 'T', 'G': 'K'}}

    # Word-initial rules, tried in order; the first match seeds the code and
    # the matched letters are skipped by the main loop.
    initial_rules = (('AA', 'Å'), ('GI', 'J'), ('SKY', 'X'), ('EI', 'Æ'),
                     ('KY', 'X'), ('C', 'K'), ('Ä', 'Æ'), ('Ö', 'Ø'))

    word = word.upper()

    code = ''
    skip = 0
    for prefix, symbol in initial_rules:
        if word.startswith(prefix):
            code = symbol
            skip = len(prefix)
            break

    if word.endswith('DT'):
        word = word[:-2]+'T'
    # Though the rules indicate this rule applies in all positions, the
    # reference implementation indicates it applies only in final position.
    elif word[-2:-1] in _vowels and word[-1:] == 'D':
        word = word[:-2]

    for pos, char in enumerate(word):
        if skip:
            skip -= 1
            continue
        # Longest substitution first.
        for length in (4, 3, 2, 1):
            chunk = word[pos:pos+length]
            if chunk in replacements[length]:
                code += replacements[length][chunk]
                skip = length-1
                break
        else:
            # No substitution applies: keep the character unless it is a
            # non-initial vowel.
            if pos == 0 or char not in _vowels:
                code += char

    return _delete_consecutive_repeats(code)
||
| 5698 | |||
| 5699 | |||
def dolby(word, maxlength=None, keep_vowels=False, vowel_char='*'):
    """Return the Dolby Code of a name.

    This follows "A Spelling Equivalent Abbreviation Algorithm For Personal
    Names" from:
    Dolby, James L. 1970. "An Algorithm for Variable-Length Proper-Name
    Compression." Journal of Library Automation, 3(4).
    doi:10.6017/ital.v3i4.5259

    The rule numbers in the comments below come in two flavours: "Rule n"
    for the variable-length rule set and "FLn" for the fixed-length rule set.
    Rules tagged only "FLn" are applied solely when maxlength is set.

    :param word: the word to encode
    :param maxlength: maximum length of the returned Dolby code -- this also
        activates the fixed-length code mode
    :param keep_vowels: if True, retains all vowel markers
    :param vowel_char: the vowel marker character (default to *)
    :return: the Dolby code; when maxlength is set, the code is truncated
        and/or right-padded with spaces to exactly maxlength characters
    """
    # Y is treated as a vowel throughout this function
    _vowels = {'A', 'E', 'I', 'O', 'U', 'Y'}

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    # Rule 1 (FL2)
    # the leading prefixes MCG/MAG/MAC and MC are all rewritten to MK
    if word[:3] in {'MCG', 'MAG', 'MAC'}:
        word = 'MK'+word[3:]
    elif word[:2] == 'MC':
        word = 'MK'+word[2:]

    # Rule 2 (FL3)
    # Scan right to left; when one of the listed pairs is found, drop its
    # second letter.  The pos += 1 cancels the decrement below so that the
    # same position is examined again against the shortened word.
    pos = len(word)-2
    while pos > -1:
        if word[pos:pos+2] in {'DT', 'LD', 'ND', 'NT', 'RC', 'RD', 'RT', 'SC',
                               'SK', 'ST'}:
            word = word[:pos+1]+word[pos+2:]
            pos += 1
        pos -= 1

    # Rule 3 (FL4)
    # Although the rule indicates "after the first letter", the test cases make
    # it clear that these apply to the first letter also.
    word = word.replace('X', 'KS')
    word = word.replace('CE', 'SE')
    word = word.replace('CI', 'SI')
    word = word.replace('CY', 'SI')

    # not in the rule set, but they seem to have intended it
    word = word.replace('TCH', 'CH')

    # every CH found from index 1 onwards that is not preceded by a vowel
    # becomes SH (only the C is replaced)
    pos = word.find('CH', 1)
    while pos != -1:
        if word[pos-1:pos] not in _vowels:
            word = word[:pos]+'S'+word[pos+1:]
        pos = word.find('CH', pos+1)

    # these run after the CE/CI/CY/CH handling above, so they only affect
    # the Cs that those steps left in place
    word = word.replace('C', 'K')
    word = word.replace('Z', 'S')

    word = word.replace('WR', 'R')
    word = word.replace('DG', 'G')
    word = word.replace('QU', 'K')
    word = word.replace('T', 'D')
    word = word.replace('PH', 'F')

    # Rule 4 (FL5)
    # Although the rule indicates "after the first letter", the test cases make
    # it clear that these apply to the first letter also.
    # For each K at index 2 or later whose preceding letter is neither a vowel
    # nor L, N, or R, that preceding letter is deleted; pos is decremented so
    # that it keeps pointing at the same K before searching for the next one.
    pos = word.find('K', 0)
    while pos != -1:
        if pos > 1 and word[pos-1:pos] not in _vowels | {'L', 'N', 'R'}:
            word = word[:pos-1]+word[pos:]
            pos -= 1
        pos = word.find('K', pos+1)

    # Rule FL6
    # fixed-length mode only: drop a final E
    if maxlength and word[-1:] == 'E':
        word = word[:-1]

    # Rule 5 (FL7)
    word = _delete_consecutive_repeats(word)

    # Rule 6 (FL8)
    # leading PF -> F; trailing PF -> P; trailing GH -> F after a vowel,
    # otherwise G; any GH still present afterwards is removed entirely
    if word[:2] == 'PF':
        word = word[1:]
    if word[-2:] == 'PF':
        word = word[:-1]
    elif word[-2:] == 'GH':
        if word[-3:-2] in _vowels:
            word = word[:-2]+'F'
        else:
            word = word[:-2]+'G'
    word = word.replace('GH', '')

    # Rule FL9
    if maxlength:
        word = word.replace('V', 'F')

    # Rules 7-9 (FL10-FL12)
    # `first` is the number of vowels still allowed to be emitted as markers
    # when keep_vowels is False: one in variable-length mode, two in
    # fixed-length mode.  W and H are dropped everywhere except position 0.
    first = 1 + (1 if maxlength else 0)
    code = ''
    for pos, char in enumerate(word):
        if char in _vowels:
            if first or keep_vowels:
                code += vowel_char
                first -= 1
            else:
                continue
        elif pos > 0 and char in {'W', 'H'}:
            continue
        else:
            code += char

    if maxlength:
        # Rule FL13
        # an over-long code loses a final S before any other truncation
        if len(code) > maxlength and code[-1:] == 'S':
            code = code[:-1]
        if keep_vowels:
            code = code[:maxlength]
        else:
            # Rule FL14
            code = code[:maxlength + 2]
            # Rule FL15
            # Each pass keeps only the first `vowels` vowel markers and then
            # truncates to maxlength + excess, which is strictly shorter than
            # the code was on entry to the pass, so the loop terminates.
            while len(code) > maxlength:
                vowels = len(code) - maxlength
                excess = vowels - 1
                word = code
                code = ''
                for char in word:
                    if char == vowel_char:
                        if vowels:
                            code += char
                            vowels -= 1
                    else:
                        code += char
                code = code[:maxlength + excess]

        # Rule FL16
        # right-pad with spaces up to maxlength
        code += ' ' * (maxlength - len(code))

    return code
| 5843 | |||
| 5844 | |||
def phonetic_spanish(word, maxlength=None):
    """Return the PhoneticSpanish coding of word.

    This follows the coding described in:
    Amón, Iván, Francisco Moreno, and Jaime Echeverri. 2012. "Algoritmo
    fonético para detección de cadenas de texto duplicadas en el idioma
    español." Revista Ingenierías Universidad de Medellín, 11(20). 127--138.
    ISSN:1692-3324

    and:
    del Pilar Angeles, María, Adrián Espino-Gamez, and Jonathan Gil-Moncada.
    2015. "Comparison of a Modified Spanish Phonetic, Soundex, and Phonex
    coding functions during data matching process." 2015 International
    Conference on Informatics, Electronics Vision (ICIEV). 1--5.
    doi:10.1109/ICIEV.2015.7334028

    :param word: the word to encode
    :param maxlength: maximum length of the returned code; a falsy value
        (None, the default, or 0) leaves the code untruncated
    :return: the PhoneticSpanish code, a string of digits with one digit per
        retained consonant
    """
    # the only letters that are encoded; vowels and W are discarded
    _consonants = 'BCDFGHJKLMNPQRSTVXYZ'
    _es_soundex_translation = dict(zip((ord(_) for _ in _consonants),
                                       '14328287566079431454'))

    # uppercase, normalize, and decompose, filter to A-Z minus vowels & W
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = ''.join(c for c in word if c in set(_consonants))

    # merge repeated Ls & Rs
    # (the R case used to be replace('R', 'R'), a no-op that left RR
    # unmerged, contrary to the intent stated above)
    word = word.replace('LL', 'L')
    word = word.replace('RR', 'R')

    # apply the Soundex algorithm
    sdx = word.translate(_es_soundex_translation)

    if maxlength:
        sdx = sdx[:maxlength]

    return sdx
| 5885 | |||
| 5886 | |||
def spanish_metaphone(word, maxlength=6, modified=False):
    """Return the Spanish Metaphone of a word.

    This is a quick rewrite of the Spanish Metaphone Algorithm, as presented at
    https://github.com/amsqr/Spanish-Metaphone

    Mosquera, Alejandro, Elena Lloret, and Paloma Moreda. 2012. "Towards
    Facilitating the Accessibility of Web 2.0 Texts through Text
    Normalisation." Proceedings of the LREC workshop: Natural Language
    Processing for Improving Textual Accessibility (NLP4ITA) ; Istanbul,
    Turkey. 9--14.
    http://www.taln.upf.edu/pages/nlp4ita/pdfs/mosquera-nlp4ita2012.pdf

    Modified version based on:
    del Pilar Angeles, María and Noemi Bailón-Miguel. 2016. "Performance of
    Spanish Encoding Functions during Record Linkage." DATA ANALYTICS 2016:
    The Fifth International Conference on Data Analysis. 1--7
    https://core.ac.uk/download/pdf/55855695.pdf#page=14

    :param word: the word to encode
    :param maxlength: encoding stops as soon as the key has reached this
        many characters
    :param modified: Set to True to use del Pilar Angeles & Bailón-Miguel's
        modified version of the algorithm
    :return: the Spanish Metaphone key
    """
    vowels = frozenset('AEIOU')
    # consonants copied to the key unchanged (a doubled one is emitted once)
    plain_consonants = frozenset('DFJKMNPTVLY')
    # consonants that always map to one fixed key character
    fixed_codes = {'W': 'U', 'R': 'R', 'Z': 'Z'}

    word = unicodedata.normalize('NFC', text_type(word.upper()))

    # extra rewrites that only the modified version performs
    if modified:
        for old, new in (('MB', 'NB'), ('MP', 'NP'), ('BS', 'S')):
            word = word.replace(old, new)
        if word[:2] == 'PS':
            word = word[1:]

    # simple replacements, applied strictly in this order
    for old, new in (('Á', 'A'), ('CH', 'X'), ('Ç', 'S'), ('É', 'E'),
                     ('Í', 'I'), ('Ó', 'O'), ('Ú', 'U'), ('Ñ', 'NY'),
                     ('GÜ', 'W'), ('Ü', 'U'), ('B', 'V'), ('LL', 'Y')):
        word = word.replace(old, new)

    length = len(word)
    key = ''
    idx = 0

    while len(key) < maxlength and idx < length:
        letter = word[idx]
        # one-character look-ahead; the empty string at the end of the word
        following = word[idx+1:idx+2]
        vowel_follows = following in vowels

        if idx == 0 and letter in vowels:
            # a vowel is only encoded when it starts the word
            key += letter
            idx += 1
        elif letter in plain_consonants:
            key += letter
            idx += 2 if following == letter else 1
        elif letter == 'C':
            if following == 'C':
                # 'acción', 'reacción', etc.
                key += 'X'
                idx += 2
            elif following in {'E', 'I'}:
                # 'cesar', 'cien', 'cid', 'conciencia'
                key += 'Z'
                idx += 2
            else:
                key += 'K'
                idx += 1
        elif letter == 'G':
            if following in {'E', 'I'}:
                # 'gente', 'ecologia', etc.
                key += 'J'
                idx += 2
            else:
                key += 'G'
                idx += 1
        elif letter == 'H':
            # 'H' is silent in Spanish: encode the vowel after it instead
            if vowel_follows:
                key += following
                idx += 2
            else:
                key += 'H'
                idx += 1
        elif letter == 'Q':
            # 'QU' is consumed as a unit; either way the key gets a 'K'
            key += 'K'
            idx += 2 if following == 'U' else 1
        elif letter in fixed_codes:
            key += fixed_codes[letter]
            idx += 1
        elif letter == 'S':
            # word-initial S not followed by a vowel is written as 'ES'
            key += 'ES' if idx == 0 and not vowel_follows else 'S'
            idx += 1
        elif letter == 'X':
            # word-initial X followed by a non-vowel is written as 'EX'
            if length > 1 and idx == 0 and not vowel_follows:
                key += 'EX'
            else:
                key += 'X'
            idx += 1
        else:
            # anything else (non-initial vowels included) is skipped
            idx += 1

    # Final change from S to Z in modified version
    if modified:
        key = key.replace('S', 'Z')

    return key
| 6036 | |||
| 6037 | |||
def metasoundex(word, language='en'):
    """Return the MetaSoundex code for a word.

    This is based on:
    Koneru, Keerthi and Cihan Varol. 2017. "Privacy Preserving Record Linkage
    using MetaSoundex Algorithm." 2017 16th IEEE International Conference on
    Machine Learning and Applications (ICMLA). 443--447.
    doi:10.1109/ICMLA.2017.0-121

    :param word: the word to encode
    :param language: either 'en' for English or 'es' for Spanish; any value
        other than 'es' selects the English encoding
    :return: for 'es', the phonetic_spanish code of the word's
        spanish_metaphone key; otherwise the soundex code of the word's
        metaphone key, with its first character translated to a digit
    """
    # Spanish: PhoneticSpanish applied on top of Spanish Metaphone
    if language == 'es':
        return phonetic_spanish(spanish_metaphone(word))

    # English: Soundex applied on top of Metaphone; only the first character
    # of the Soundex code is passed through this letter-to-digit table
    first_char_digits = {ord(letter): digit for letter, digit in
                         zip('ABCDEFGHIJKLMNOPQRSTUVWXYZ',
                             '07430755015866075943077514')}
    sdx = soundex(metaphone(word))

    return sdx[0].translate(first_char_digits) + sdx[1:]
| 6064 | |||
| 6065 | |||
| 6066 | |||
def bmpm(word, language_arg=0, name_mode='gen', match_mode='approx',
         concat=False, filter_langs=False):
    """Return the Beider-Morse Phonetic Matching algorithm code for a word.

    This is a thin public wrapper: every argument is forwarded unchanged, in
    the same order, to _bmpm, which does the actual work.

    The Beider-Morse Phonetic Matching algorithm is described at:
    http://stevemorse.org/phonetics/bmpm.htm
    The reference implementation is licensed under GPLv3 and available at:
    http://stevemorse.org/phoneticinfo.htm

    :param str word: the word to transform
    :param language_arg: the language of the term (default 0; how that
        default is interpreted is decided by _bmpm); supported values
        include:

            - 'any'
            - 'arabic'
            - 'cyrillic'
            - 'czech'
            - 'dutch'
            - 'english'
            - 'french'
            - 'german'
            - 'greek'
            - 'greeklatin'
            - 'hebrew'
            - 'hungarian'
            - 'italian'
            - 'polish'
            - 'portuguese'
            - 'romanian'
            - 'russian'
            - 'spanish'
            - 'turkish'
            - 'germandjsg'
            - 'polishdjskp'
            - 'russiandjsre'

    :param str name_mode: the name mode of the algorithm:

            - 'gen' -- general (default)
            - 'ash' -- Ashkenazi
            - 'sep' -- Sephardic

    :param str match_mode: matching mode: 'approx' or 'exact'
    :param bool concat: concatenation mode
    :param bool filter_langs: filter out incompatible languages
    :returns: the BMPM value(s), exactly as returned by _bmpm
    :rtype: str (the examples below show a single space-separated string)

    >>> bmpm('Christopher')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir xristopi xritopir xritopi xristofi xritofir xritofi tzristopir
    tzristofir zristopir zristopi zritopir zritopi zristofir zristofi zritofir
    zritofi'
    >>> bmpm('Niall')
    'nial niol'
    >>> bmpm('Smith')
    'zmit'
    >>> bmpm('Schmidt')
    'zmit stzmit'

    >>> bmpm('Christopher', language_arg='German')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir'
    >>> bmpm('Christopher', language_arg='English')
    'tzristofir tzrQstofir tzristafir tzrQstafir xristofir xrQstofir xristafir
    xrQstafir'
    >>> bmpm('Christopher', language_arg='German', name_mode='ash')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir'

    >>> bmpm('Christopher', language_arg='German', match_mode='exact')
    'xriStopher xriStofer xristopher xristofer'
    """
    # NOTE(review): the return type was previously documented as tuple, but
    # the examples above show strings -- confirm against _bmpm.
    return _bmpm(word, language_arg, name_mode, match_mode,
                 concat, filter_langs)
| 6142 | |||
| 6143 | |||
# When this module is executed as a script, run the doctests embedded in its
# docstrings.
if __name__ == '__main__':
    # imported inside the guard, so doctest is only loaded for this run
    import doctest
    doctest.testmod()
| 6147 |