Total Complexity | 1071 |
Total Lines | 6461 |
Duplicated Lines | 1.36 % |
Changes | 0 |
Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems, and corresponding solutions are:
Complex classes like abydos.phonetic often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
1 | # -*- coding: utf-8 -*- |
||
|
|||
2 | |||
3 | # Copyright 2014-2018 by Christopher C. Little. |
||
4 | # This file is part of Abydos. |
||
5 | # |
||
6 | # Abydos is free software: you can redistribute it and/or modify |
||
7 | # it under the terms of the GNU General Public License as published by |
||
8 | # the Free Software Foundation, either version 3 of the License, or |
||
9 | # (at your option) any later version. |
||
10 | # |
||
11 | # Abydos is distributed in the hope that it will be useful, |
||
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
14 | # GNU General Public License for more details. |
||
15 | # |
||
16 | # You should have received a copy of the GNU General Public License |
||
17 | # along with Abydos. If not, see <http://www.gnu.org/licenses/>. |
||
18 | |||
19 | """abydos.phonetic. |
||
20 | |||
21 | The phonetic module implements phonetic algorithms including: |
||
22 | |||
23 | - Robert C. Russell's Index |
||
24 | - American Soundex |
||
25 | - Refined Soundex |
||
26 | - Daitch-Mokotoff Soundex |
||
27 | - Kölner Phonetik |
||
28 | - NYSIIS |
||
29 | - Match Rating Algorithm |
||
30 | - Metaphone |
||
31 | - Double Metaphone |
||
32 | - Caverphone |
||
33 | - Alpha Search Inquiry System |
||
34 | - Fuzzy Soundex |
||
35 | - Phonex |
||
36 | - Phonem |
||
37 | - Phonix |
||
38 | - SfinxBis |
||
39 | - phonet |
||
40 | - Standardized Phonetic Frequency Code |
||
41 | - Statistics Canada |
||
42 | - Lein |
||
43 | - Roger Root |
||
44 | - Oxford Name Compression Algorithm (ONCA) |
||
45 | - Eudex phonetic hash |
||
46 | - Haase Phonetik |
||
47 | - Reth-Schek Phonetik |
||
48 | - FONEM |
||
49 | - Parmar-Kumbharana |
||
50 | - Davidson's Consonant Code |
||
51 | - SoundD |
||
52 | - PSHP Soundex/Viewex Coding |
||
53 | - an early version of Henry Code |
||
54 | - Norphone |
||
55 | - Dolby Code |
||
56 | - Phonetic Spanish |
||
57 | - Spanish Metaphone |
||
58 | - MetaSoundex |
||
59 | - SoundexBR |
||
60 | - NRL English-to-phoneme |
||
61 | - Beider-Morse Phonetic Matching |
||
62 | """ |
||
63 | |||
64 | from __future__ import division, unicode_literals |
||
65 | |||
66 | from collections import Counter |
||
67 | from itertools import groupby, product |
||
68 | from re import compile as re_compile |
||
69 | from re import match as re_match |
||
70 | from unicodedata import normalize |
||
71 | |||
72 | from six import text_type |
||
73 | from six.moves import range |
||
74 | |||
75 | from ._bm import _bmpm |
||
76 | |||
77 | _INFINITY = float('inf') |
||
78 | |||
79 | __all__ = ['alpha_sis', 'bmpm', 'caverphone', 'davidson', 'dm_soundex', |
||
80 | 'dolby', 'double_metaphone', 'eudex', 'fonem', 'fuzzy_soundex', |
||
81 | 'haase_phonetik', 'henry_early', 'koelner_phonetik', |
||
82 | 'koelner_phonetik_alpha', 'koelner_phonetik_num_to_alpha', 'lein', |
||
83 | 'metaphone', 'metasoundex', 'mra', 'norphone', 'nrl', 'nysiis', |
||
84 | 'onca', 'parmar_kumbharana', 'phonem', 'phonet', 'phonetic_spanish', |
||
85 | 'phonex', 'phonix', 'pshp_soundex_first', 'pshp_soundex_last', |
||
86 | 'refined_soundex', 'reth_schek_phonetik', 'roger_root', |
||
87 | 'russell_index', 'russell_index_alpha', |
||
88 | 'russell_index_num_to_alpha', 'sfinxbis', 'sound_d', 'soundex', |
||
89 | 'soundex_br', 'spanish_metaphone', 'spfc', 'statistics_canada'] |
||
90 | |||
91 | |||
92 | def _delete_consecutive_repeats(word): |
||
93 | """Delete consecutive repeated characters in a word. |
||
94 | |||
95 | :param str word: the word to transform |
||
96 | :returns: word with consecutive repeating characters collapsed to |
||
97 | a single instance |
||
98 | :rtype: str |
||
99 | """ |
||
100 | return ''.join(char for char, _ in groupby(word)) |
||
101 | |||
102 | |||
103 | def russell_index(word): |
||
104 | """Return the Russell Index (integer output) of a word. |
||
105 | |||
106 | This follows Robert C. Russell's Index algorithm, as described in |
||
107 | :cite:`Russell:1917`. |
||
108 | |||
109 | :param str word: the word to transform |
||
110 | :returns: the Russell Index value |
||
111 | :rtype: int |
||
112 | |||
113 | >>> russell_index('Christopher') |
||
114 | 3813428 |
||
115 | >>> russell_index('Niall') |
||
116 | 715 |
||
117 | >>> russell_index('Smith') |
||
118 | 3614 |
||
119 | >>> russell_index('Schmidt') |
||
120 | 3614 |
||
121 | """ |
||
122 | _russell_translation = dict(zip((ord(_) for _ in |
||
123 | 'ABCDEFGIKLMNOPQRSTUVXYZ'), |
||
124 | '12341231356712383412313')) |
||
125 | |||
126 | word = normalize('NFKD', text_type(word.upper())) |
||
127 | word = word.replace('ß', 'SS') |
||
128 | word = word.replace('GH', '') # discard gh (rule 3) |
||
129 | word = word.rstrip('SZ') # discard /[sz]$/ (rule 3) |
||
130 | |||
131 | # translate according to Russell's mapping |
||
132 | word = ''.join(c for c in word if c in |
||
133 | {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'I', 'K', 'L', 'M', 'N', |
||
134 | 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'X', 'Y', 'Z'}) |
||
135 | sdx = word.translate(_russell_translation) |
||
136 | |||
137 | # remove any 1s after the first occurrence |
||
138 | one = sdx.find('1')+1 |
||
139 | if one: |
||
140 | sdx = sdx[:one] + ''.join(c for c in sdx[one:] if c != '1') |
||
141 | |||
142 | # remove repeating characters |
||
143 | sdx = _delete_consecutive_repeats(sdx) |
||
144 | |||
145 | # return as an int |
||
146 | return int(sdx) if sdx else float('NaN') |
||
147 | |||
148 | |||
149 | def russell_index_num_to_alpha(num): |
||
150 | """Convert the Russell Index integer to an alphabetic string. |
||
151 | |||
152 | This follows Robert C. Russell's Index algorithm, as described in |
||
153 | :cite:`Russell:1917`. |
||
154 | |||
155 | :param int num: a Russell Index integer value |
||
156 | :returns: the Russell Index as an alphabetic string |
||
157 | :rtype: str |
||
158 | |||
159 | >>> russell_index_num_to_alpha(3813428) |
||
160 | 'CRACDBR' |
||
161 | >>> russell_index_num_to_alpha(715) |
||
162 | 'NAL' |
||
163 | >>> russell_index_num_to_alpha(3614) |
||
164 | 'CMAD' |
||
165 | """ |
||
166 | _russell_num_translation = dict(zip((ord(_) for _ in '12345678'), |
||
167 | 'ABCDLMNR')) |
||
168 | num = ''.join(c for c in text_type(num) if c in {'1', '2', '3', '4', '5', |
||
169 | '6', '7', '8'}) |
||
170 | if num: |
||
171 | return num.translate(_russell_num_translation) |
||
172 | return '' |
||
173 | |||
174 | |||
175 | def russell_index_alpha(word): |
||
176 | """Return the Russell Index (alphabetic output) for the word. |
||
177 | |||
178 | This follows Robert C. Russell's Index algorithm, as described in |
||
179 | :cite:`Russell:1917`. |
||
180 | |||
181 | :param str word: the word to transform |
||
182 | :returns: the Russell Index value as an alphabetic string |
||
183 | :rtype: str |
||
184 | |||
185 | >>> russell_index_alpha('Christopher') |
||
186 | 'CRACDBR' |
||
187 | >>> russell_index_alpha('Niall') |
||
188 | 'NAL' |
||
189 | >>> russell_index_alpha('Smith') |
||
190 | 'CMAD' |
||
191 | >>> russell_index_alpha('Schmidt') |
||
192 | 'CMAD' |
||
193 | """ |
||
194 | if word: |
||
195 | return russell_index_num_to_alpha(russell_index(word)) |
||
196 | return '' |
||
197 | |||
198 | |||
199 | def soundex(word, maxlength=4, var='American', reverse=False, zero_pad=True): |
||
200 | """Return the Soundex code for a word. |
||
201 | |||
202 | :param str word: the word to transform |
||
203 | :param int maxlength: the length of the code returned (defaults to 4) |
||
204 | :param str var: the variant of the algorithm to employ (defaults to |
||
205 | 'American'): |
||
206 | |||
207 | - 'American' follows the American Soundex algorithm, as described at |
||
208 | :cite:`US:2007` and in :cite:`Knuth:1998`; this is also called |
||
209 | Miracode |
||
210 | - 'special' follows the rules from the 1880-1910 US Census |
||
211 | retrospective re-analysis, in which h & w are not treated as blocking |
||
212 | consonants but as vowels. Cf. :cite:`Repici:2013`. |
||
213 | - 'Census' follows the rules laid out in GIL 55 :cite:`US:1997` by the |
||
214 | US Census, including coding prefixed and unprefixed versions of some |
||
215 | names |
||
216 | |||
217 | :param bool reverse: reverse the word before computing the selected Soundex |
||
218 | (defaults to False); This results in "Reverse Soundex", which is useful |
||
219 | for blocking in cases where the initial elements may be in error. |
||
220 | :param bool zero_pad: pad the end of the return value with 0s to achieve a |
||
221 | maxlength string |
||
222 | :returns: the Soundex value |
||
223 | :rtype: str |
||
224 | |||
225 | >>> soundex("Christopher") |
||
226 | 'C623' |
||
227 | >>> soundex("Niall") |
||
228 | 'N400' |
||
229 | >>> soundex('Smith') |
||
230 | 'S530' |
||
231 | >>> soundex('Schmidt') |
||
232 | 'S530' |
||
233 | |||
234 | |||
235 | >>> soundex('Christopher', maxlength=_INFINITY) |
||
236 | 'C623160000000000000000000000000000000000000000000000000000000000' |
||
237 | >>> soundex('Christopher', maxlength=_INFINITY, zero_pad=False) |
||
238 | 'C62316' |
||
239 | |||
240 | >>> soundex('Christopher', reverse=True) |
||
241 | 'R132' |
||
242 | |||
243 | >>> soundex('Ashcroft') |
||
244 | 'A261' |
||
245 | >>> soundex('Asicroft') |
||
246 | 'A226' |
||
247 | >>> soundex('Ashcroft', var='special') |
||
248 | 'A226' |
||
249 | >>> soundex('Asicroft', var='special') |
||
250 | 'A226' |
||
251 | """ |
||
252 | _soundex_translation = dict(zip((ord(_) for _ in |
||
253 | 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'), |
||
254 | '01230129022455012623019202')) |
||
255 | |||
256 | # Require a maxlength of at least 4 and not more than 64 |
||
257 | if maxlength is not None: |
||
258 | maxlength = min(max(4, maxlength), 64) |
||
259 | else: |
||
260 | maxlength = 64 |
||
261 | |||
262 | # uppercase, normalize, decompose, and filter non-A-Z out |
||
263 | word = normalize('NFKD', text_type(word.upper())) |
||
264 | word = word.replace('ß', 'SS') |
||
265 | |||
266 | if var == 'Census': |
||
267 | # TODO: Should these prefixes be supplemented? (VANDE, DELA, VON) |
||
268 | if word[:3] in {'VAN', 'CON'} and len(word) > 4: |
||
269 | return (soundex(word, maxlength, 'American', reverse, zero_pad), |
||
270 | soundex(word[3:], maxlength, 'American', reverse, |
||
271 | zero_pad)) |
||
272 | if word[:2] in {'DE', 'DI', 'LA', 'LE'} and len(word) > 3: |
||
273 | return (soundex(word, maxlength, 'American', reverse, zero_pad), |
||
274 | soundex(word[2:], maxlength, 'American', reverse, |
||
275 | zero_pad)) |
||
276 | # Otherwise, proceed as usual (var='American' mode, ostensibly) |
||
277 | |||
278 | word = ''.join(c for c in word if c in |
||
279 | {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
||
280 | 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
||
281 | 'Y', 'Z'}) |
||
282 | |||
283 | # Nothing to convert, return base case |
||
284 | if not word: |
||
285 | if zero_pad: |
||
286 | return '0'*maxlength |
||
287 | return '0' |
||
288 | |||
289 | # Reverse word if computing Reverse Soundex |
||
290 | if reverse: |
||
291 | word = word[::-1] |
||
292 | |||
293 | # apply the Soundex algorithm |
||
294 | sdx = word.translate(_soundex_translation) |
||
295 | |||
296 | if var == 'special': |
||
297 | sdx = sdx.replace('9', '0') # special rule for 1880-1910 census |
||
298 | else: |
||
299 | sdx = sdx.replace('9', '') # rule 1 |
||
300 | sdx = _delete_consecutive_repeats(sdx) # rule 3 |
||
301 | |||
302 | if word[0] in 'HW': |
||
303 | sdx = word[0] + sdx |
||
304 | else: |
||
305 | sdx = word[0] + sdx[1:] |
||
306 | sdx = sdx.replace('0', '') # rule 1 |
||
307 | |||
308 | if zero_pad: |
||
309 | sdx += ('0'*maxlength) # rule 4 |
||
310 | |||
311 | return sdx[:maxlength] |
||
312 | |||
313 | |||
314 | def refined_soundex(word, maxlength=_INFINITY, zero_pad=False, |
||
315 | retain_vowels=False): |
||
316 | """Return the Refined Soundex code for a word. |
||
317 | |||
318 | This is Soundex, but with more character classes. It was defined at |
||
319 | :cite:`Boyce:1998`. |
||
320 | |||
321 | :param word: the word to transform |
||
322 | :param maxlength: the length of the code returned (defaults to unlimited) |
||
323 | :param zero_pad: pad the end of the return value with 0s to achieve a |
||
324 | maxlength string |
||
325 | :param retain_vowels: retain vowels (as 0) in the resulting code |
||
326 | :returns: the Refined Soundex value |
||
327 | :rtype: str |
||
328 | |||
329 | >>> refined_soundex('Christopher') |
||
330 | 'C393619' |
||
331 | >>> refined_soundex('Niall') |
||
332 | 'N87' |
||
333 | >>> refined_soundex('Smith') |
||
334 | 'S386' |
||
335 | >>> refined_soundex('Schmidt') |
||
336 | 'S386' |
||
337 | """ |
||
338 | _ref_soundex_translation = dict(zip((ord(_) for _ in |
||
339 | 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'), |
||
340 | '01360240043788015936020505')) |
||
341 | |||
342 | # uppercase, normalize, decompose, and filter non-A-Z out |
||
343 | word = normalize('NFKD', text_type(word.upper())) |
||
344 | word = word.replace('ß', 'SS') |
||
345 | word = ''.join(c for c in word if c in |
||
346 | {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
||
347 | 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
||
348 | 'Y', 'Z'}) |
||
349 | |||
350 | # apply the Soundex algorithm |
||
351 | sdx = word[:1] + word.translate(_ref_soundex_translation) |
||
352 | sdx = _delete_consecutive_repeats(sdx) |
||
353 | if not retain_vowels: |
||
354 | sdx = sdx.replace('0', '') # Delete vowels, H, W, Y |
||
355 | |||
356 | if maxlength < _INFINITY: |
||
357 | if zero_pad: |
||
358 | sdx += ('0' * maxlength) |
||
359 | sdx = sdx[:maxlength] |
||
360 | |||
361 | return sdx |
||
362 | |||
363 | |||
364 | def dm_soundex(word, maxlength=6, zero_pad=True): |
||
365 | """Return the Daitch-Mokotoff Soundex code for a word. |
||
366 | |||
367 | Based on Daitch-Mokotoff Soundex :cite:`Mokotoff:1997`, this returns values |
||
368 | of a word as a set. A collection is necessary since there can be multiple |
||
369 | values for a single word. |
||
370 | |||
371 | :param word: the word to transform |
||
372 | :param maxlength: the length of the code returned (defaults to 6) |
||
373 | :param zero_pad: pad the end of the return value with 0s to achieve a |
||
374 | maxlength string |
||
375 | :returns: the Daitch-Mokotoff Soundex value |
||
376 | :rtype: str |
||
377 | |||
378 | >>> sorted(dm_soundex('Christopher')) |
||
379 | ['494379', '594379'] |
||
380 | >>> dm_soundex('Niall') |
||
381 | {'680000'} |
||
382 | >>> dm_soundex('Smith') |
||
383 | {'463000'} |
||
384 | >>> dm_soundex('Schmidt') |
||
385 | {'463000'} |
||
386 | |||
387 | >>> sorted(dm_soundex('The quick brown fox', maxlength=20, zero_pad=False)) |
||
388 | ['35457976754', '3557976754'] |
||
389 | """ |
||
390 | _dms_table = {'STCH': (2, 4, 4), 'DRZ': (4, 4, 4), 'ZH': (4, 4, 4), |
||
391 | 'ZHDZH': (2, 4, 4), 'DZH': (4, 4, 4), 'DRS': (4, 4, 4), |
||
392 | 'DZS': (4, 4, 4), 'SCHTCH': (2, 4, 4), 'SHTSH': (2, 4, 4), |
||
393 | 'SZCZ': (2, 4, 4), 'TZS': (4, 4, 4), 'SZCS': (2, 4, 4), |
||
394 | 'STSH': (2, 4, 4), 'SHCH': (2, 4, 4), 'D': (3, 3, 3), |
||
395 | 'H': (5, 5, '_'), 'TTSCH': (4, 4, 4), 'THS': (4, 4, 4), |
||
396 | 'L': (8, 8, 8), 'P': (7, 7, 7), 'CHS': (5, 54, 54), |
||
397 | 'T': (3, 3, 3), 'X': (5, 54, 54), 'OJ': (0, 1, '_'), |
||
398 | 'OI': (0, 1, '_'), 'SCHTSH': (2, 4, 4), 'OY': (0, 1, '_'), |
||
399 | 'Y': (1, '_', '_'), 'TSH': (4, 4, 4), 'ZDZ': (2, 4, 4), |
||
400 | 'TSZ': (4, 4, 4), 'SHT': (2, 43, 43), 'SCHTSCH': (2, 4, 4), |
||
401 | 'TTSZ': (4, 4, 4), 'TTZ': (4, 4, 4), 'SCH': (4, 4, 4), |
||
402 | 'TTS': (4, 4, 4), 'SZD': (2, 43, 43), 'AI': (0, 1, '_'), |
||
403 | 'PF': (7, 7, 7), 'TCH': (4, 4, 4), 'PH': (7, 7, 7), |
||
404 | 'TTCH': (4, 4, 4), 'SZT': (2, 43, 43), 'ZDZH': (2, 4, 4), |
||
405 | 'EI': (0, 1, '_'), 'G': (5, 5, 5), 'EJ': (0, 1, '_'), |
||
406 | 'ZD': (2, 43, 43), 'IU': (1, '_', '_'), 'K': (5, 5, 5), |
||
407 | 'O': (0, '_', '_'), 'SHTCH': (2, 4, 4), 'S': (4, 4, 4), |
||
408 | 'TRZ': (4, 4, 4), 'SHD': (2, 43, 43), 'DSH': (4, 4, 4), |
||
409 | 'CSZ': (4, 4, 4), 'EU': (1, 1, '_'), 'TRS': (4, 4, 4), |
||
410 | 'ZS': (4, 4, 4), 'STRZ': (2, 4, 4), 'UY': (0, 1, '_'), |
||
411 | 'STRS': (2, 4, 4), 'CZS': (4, 4, 4), |
||
412 | 'MN': ('6_6', '6_6', '6_6'), 'UI': (0, 1, '_'), |
||
413 | 'UJ': (0, 1, '_'), 'UE': (0, '_', '_'), 'EY': (0, 1, '_'), |
||
414 | 'W': (7, 7, 7), 'IA': (1, '_', '_'), 'FB': (7, 7, 7), |
||
415 | 'STSCH': (2, 4, 4), 'SCHT': (2, 43, 43), |
||
416 | 'NM': ('6_6', '6_6', '6_6'), 'SCHD': (2, 43, 43), |
||
417 | 'B': (7, 7, 7), 'DSZ': (4, 4, 4), 'F': (7, 7, 7), |
||
418 | 'N': (6, 6, 6), 'CZ': (4, 4, 4), 'R': (9, 9, 9), |
||
419 | 'U': (0, '_', '_'), 'V': (7, 7, 7), 'CS': (4, 4, 4), |
||
420 | 'Z': (4, 4, 4), 'SZ': (4, 4, 4), 'TSCH': (4, 4, 4), |
||
421 | 'KH': (5, 5, 5), 'ST': (2, 43, 43), 'KS': (5, 54, 54), |
||
422 | 'SH': (4, 4, 4), 'SC': (2, 4, 4), 'SD': (2, 43, 43), |
||
423 | 'DZ': (4, 4, 4), 'ZHD': (2, 43, 43), 'DT': (3, 3, 3), |
||
424 | 'ZSH': (4, 4, 4), 'DS': (4, 4, 4), 'TZ': (4, 4, 4), |
||
425 | 'TS': (4, 4, 4), 'TH': (3, 3, 3), 'TC': (4, 4, 4), |
||
426 | 'A': (0, '_', '_'), 'E': (0, '_', '_'), 'I': (0, '_', '_'), |
||
427 | 'AJ': (0, 1, '_'), 'M': (6, 6, 6), 'Q': (5, 5, 5), |
||
428 | 'AU': (0, 7, '_'), 'IO': (1, '_', '_'), 'AY': (0, 1, '_'), |
||
429 | 'IE': (1, '_', '_'), 'ZSCH': (4, 4, 4), |
||
430 | 'CH': ((5, 4), (5, 4), (5, 4)), |
||
431 | 'CK': ((5, 45), (5, 45), (5, 45)), |
||
432 | 'C': ((5, 4), (5, 4), (5, 4)), |
||
433 | 'J': ((1, 4), ('_', 4), ('_', 4)), |
||
434 | 'RZ': ((94, 4), (94, 4), (94, 4)), |
||
435 | 'RS': ((94, 4), (94, 4), (94, 4))} |
||
436 | |||
437 | _dms_order = {'A': ('AI', 'AJ', 'AU', 'AY', 'A'), |
||
438 | 'B': ('B'), |
||
439 | 'C': ('CHS', 'CSZ', 'CZS', 'CH', 'CK', 'CS', 'CZ', 'C'), |
||
440 | 'D': ('DRS', 'DRZ', 'DSH', 'DSZ', 'DZH', 'DZS', 'DS', 'DT', |
||
441 | 'DZ', 'D'), |
||
442 | 'E': ('EI', 'EJ', 'EU', 'EY', 'E'), |
||
443 | 'F': ('FB', 'F'), |
||
444 | 'G': ('G'), |
||
445 | 'H': ('H'), |
||
446 | 'I': ('IA', 'IE', 'IO', 'IU', 'I'), |
||
447 | 'J': ('J'), |
||
448 | 'K': ('KH', 'KS', 'K'), |
||
449 | 'L': ('L'), |
||
450 | 'M': ('MN', 'M'), |
||
451 | 'N': ('NM', 'N'), |
||
452 | 'O': ('OI', 'OJ', 'OY', 'O'), |
||
453 | 'P': ('PF', 'PH', 'P'), |
||
454 | 'Q': ('Q'), |
||
455 | 'R': ('RS', 'RZ', 'R'), |
||
456 | 'S': ('SCHTSCH', 'SCHTCH', 'SCHTSH', 'SHTCH', 'SHTSH', |
||
457 | 'STSCH', 'SCHD', 'SCHT', 'SHCH', 'STCH', 'STRS', |
||
458 | 'STRZ', 'STSH', 'SZCS', 'SZCZ', 'SCH', 'SHD', 'SHT', |
||
459 | 'SZD', 'SZT', 'SC', 'SD', 'SH', 'ST', 'SZ', 'S'), |
||
460 | 'T': ('TTSCH', 'TSCH', 'TTCH', 'TTSZ', 'TCH', 'THS', 'TRS', |
||
461 | 'TRZ', 'TSH', 'TSZ', 'TTS', 'TTZ', 'TZS', 'TC', 'TH', |
||
462 | 'TS', 'TZ', 'T'), |
||
463 | 'U': ('UE', 'UI', 'UJ', 'UY', 'U'), |
||
464 | 'V': ('V'), |
||
465 | 'W': ('W'), |
||
466 | 'X': ('X'), |
||
467 | 'Y': ('Y'), |
||
468 | 'Z': ('ZHDZH', 'ZDZH', 'ZSCH', 'ZDZ', 'ZHD', 'ZSH', 'ZD', |
||
469 | 'ZH', 'ZS', 'Z')} |
||
470 | |||
471 | _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'} |
||
472 | dms = [''] # initialize empty code list |
||
473 | |||
474 | # Require a maxlength of at least 6 and not more than 64 |
||
475 | if maxlength is not None: |
||
476 | maxlength = min(max(6, maxlength), 64) |
||
477 | else: |
||
478 | maxlength = 64 |
||
479 | |||
480 | # uppercase, normalize, decompose, and filter non-A-Z |
||
481 | word = normalize('NFKD', text_type(word.upper())) |
||
482 | word = word.replace('ß', 'SS') |
||
483 | word = ''.join(c for c in word if c in |
||
484 | {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
||
485 | 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
||
486 | 'Y', 'Z'}) |
||
487 | |||
488 | # Nothing to convert, return base case |
||
489 | if not word: |
||
490 | if zero_pad: |
||
491 | return {'0'*maxlength} |
||
492 | return {'0'} |
||
493 | |||
494 | pos = 0 |
||
495 | while pos < len(word): |
||
496 | # Iterate through _dms_order, which specifies the possible substrings |
||
497 | # for which codes exist in the Daitch-Mokotoff coding |
||
498 | for sstr in _dms_order[word[pos]]: |
||
499 | if word[pos:].startswith(sstr): |
||
500 | # Having determined a valid substring start, retrieve the code |
||
501 | dm_val = _dms_table[sstr] |
||
502 | |||
503 | # Having retried the code (triple), determine the correct |
||
504 | # positional variant (first, pre-vocalic, elsewhere) |
||
505 | if pos == 0: |
||
506 | dm_val = dm_val[0] |
||
507 | elif (pos+len(sstr) < len(word) and |
||
508 | word[pos+len(sstr)] in _vowels): |
||
509 | dm_val = dm_val[1] |
||
510 | else: |
||
511 | dm_val = dm_val[2] |
||
512 | |||
513 | # Build the code strings |
||
514 | if isinstance(dm_val, tuple): |
||
515 | dms = [_ + text_type(dm_val[0]) for _ in dms] \ |
||
516 | + [_ + text_type(dm_val[1]) for _ in dms] |
||
517 | else: |
||
518 | dms = [_ + text_type(dm_val) for _ in dms] |
||
519 | pos += len(sstr) |
||
520 | break |
||
521 | |||
522 | # Filter out double letters and _ placeholders |
||
523 | dms = (''.join(c for c in _delete_consecutive_repeats(_) if c != '_') |
||
524 | for _ in dms) |
||
525 | |||
526 | # Trim codes and return set |
||
527 | if zero_pad: |
||
528 | dms = ((_ + ('0'*maxlength))[:maxlength] for _ in dms) |
||
529 | else: |
||
530 | dms = (_[:maxlength] for _ in dms) |
||
531 | return set(dms) |
||
532 | |||
533 | |||
534 | def koelner_phonetik(word): |
||
535 | """Return the Kölner Phonetik (numeric output) code for a word. |
||
536 | |||
537 | Based on the algorithm defined by :cite:`Postel:1969`. |
||
538 | |||
539 | While the output code is numeric, it is still a str because 0s can lead |
||
540 | the code. |
||
541 | |||
542 | :param str word: the word to transform |
||
543 | :returns: the Kölner Phonetik value as a numeric string |
||
544 | :rtype: str |
||
545 | |||
546 | >>> koelner_phonetik('Christopher') |
||
547 | '478237' |
||
548 | >>> koelner_phonetik('Niall') |
||
549 | '65' |
||
550 | >>> koelner_phonetik('Smith') |
||
551 | '862' |
||
552 | >>> koelner_phonetik('Schmidt') |
||
553 | '862' |
||
554 | >>> koelner_phonetik('Müller') |
||
555 | '657' |
||
556 | >>> koelner_phonetik('Zimmermann') |
||
557 | '86766' |
||
558 | """ |
||
559 | def _after(word, i, letters): |
||
560 | """Return True if word[i] follows one of the supplied letters.""" |
||
561 | if i > 0 and word[i-1] in letters: |
||
562 | return True |
||
563 | return False |
||
564 | |||
565 | def _before(word, i, letters): |
||
566 | """Return True if word[i] precedes one of the supplied letters.""" |
||
567 | if i+1 < len(word) and word[i+1] in letters: |
||
568 | return True |
||
569 | return False |
||
570 | |||
571 | _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'} |
||
572 | |||
573 | sdx = '' |
||
574 | |||
575 | word = normalize('NFKD', text_type(word.upper())) |
||
576 | word = word.replace('ß', 'SS') |
||
577 | |||
578 | word = word.replace('Ä', 'AE') |
||
579 | word = word.replace('Ö', 'OE') |
||
580 | word = word.replace('Ü', 'UE') |
||
581 | word = ''.join(c for c in word if c in |
||
582 | {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
||
583 | 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
||
584 | 'Y', 'Z'}) |
||
585 | |||
586 | # Nothing to convert, return base case |
||
587 | if not word: |
||
588 | return sdx |
||
589 | |||
590 | for i in range(len(word)): |
||
591 | View Code Duplication | if word[i] in _vowels: |
|
592 | sdx += '0' |
||
593 | elif word[i] == 'B': |
||
594 | sdx += '1' |
||
595 | elif word[i] == 'P': |
||
596 | if _before(word, i, {'H'}): |
||
597 | sdx += '3' |
||
598 | else: |
||
599 | sdx += '1' |
||
600 | elif word[i] in {'D', 'T'}: |
||
601 | if _before(word, i, {'C', 'S', 'Z'}): |
||
602 | sdx += '8' |
||
603 | else: |
||
604 | sdx += '2' |
||
605 | elif word[i] in {'F', 'V', 'W'}: |
||
606 | sdx += '3' |
||
607 | elif word[i] in {'G', 'K', 'Q'}: |
||
608 | sdx += '4' |
||
609 | elif word[i] == 'C': |
||
610 | if _after(word, i, {'S', 'Z'}): |
||
611 | sdx += '8' |
||
612 | elif i == 0: |
||
613 | if _before(word, i, {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', |
||
614 | 'X'}): |
||
615 | sdx += '4' |
||
616 | else: |
||
617 | sdx += '8' |
||
618 | elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}): |
||
619 | sdx += '4' |
||
620 | else: |
||
621 | sdx += '8' |
||
622 | elif word[i] == 'X': |
||
623 | if _after(word, i, {'C', 'K', 'Q'}): |
||
624 | sdx += '8' |
||
625 | else: |
||
626 | sdx += '48' |
||
627 | elif word[i] == 'L': |
||
628 | sdx += '5' |
||
629 | elif word[i] in {'M', 'N'}: |
||
630 | sdx += '6' |
||
631 | elif word[i] == 'R': |
||
632 | sdx += '7' |
||
633 | elif word[i] in {'S', 'Z'}: |
||
634 | sdx += '8' |
||
635 | |||
636 | sdx = _delete_consecutive_repeats(sdx) |
||
637 | |||
638 | if sdx: |
||
639 | sdx = sdx[:1] + sdx[1:].replace('0', '') |
||
640 | |||
641 | return sdx |
||
642 | |||
643 | |||
644 | def koelner_phonetik_num_to_alpha(num): |
||
645 | """Convert a Kölner Phonetik code from numeric to alphabetic. |
||
646 | |||
647 | :param str num: a numeric Kölner Phonetik representation |
||
648 | :returns: an alphabetic representation of the same word |
||
649 | :rtype: str |
||
650 | |||
651 | >>> koelner_phonetik_num_to_alpha(862) |
||
652 | 'SNT' |
||
653 | >>> koelner_phonetik_num_to_alpha(657) |
||
654 | 'NLR' |
||
655 | >>> koelner_phonetik_num_to_alpha(86766) |
||
656 | 'SNRNN' |
||
657 | """ |
||
658 | _koelner_num_translation = dict(zip((ord(_) for _ in '012345678'), |
||
659 | 'APTFKLNRS')) |
||
660 | num = ''.join(c for c in text_type(num) if c in {'0', '1', '2', '3', '4', |
||
661 | '5', '6', '7', '8'}) |
||
662 | return num.translate(_koelner_num_translation) |
||
663 | |||
664 | |||
665 | def koelner_phonetik_alpha(word): |
||
666 | """Return the Kölner Phonetik (alphabetic output) code for a word. |
||
667 | |||
668 | :param str word: the word to transform |
||
669 | :returns: the Kölner Phonetik value as an alphabetic string |
||
670 | :rtype: str |
||
671 | |||
672 | >>> koelner_phonetik_alpha('Smith') |
||
673 | 'SNT' |
||
674 | >>> koelner_phonetik_alpha('Schmidt') |
||
675 | 'SNT' |
||
676 | >>> koelner_phonetik_alpha('Müller') |
||
677 | 'NLR' |
||
678 | >>> koelner_phonetik_alpha('Zimmermann') |
||
679 | 'SNRNN' |
||
680 | """ |
||
681 | return koelner_phonetik_num_to_alpha(koelner_phonetik(word)) |
||
682 | |||
683 | |||
684 | def nysiis(word, maxlength=6, modified=False): |
||
685 | """Return the NYSIIS code for a word. |
||
686 | |||
687 | The New York State Identification and Intelligence System algorithm is |
||
688 | defined in :cite:`Taft:1970`. |
||
689 | |||
690 | The modified version of this algorithm is described in Appendix B of |
||
691 | :cite:`Lynch:1977`. |
||
692 | |||
693 | :param str word: the word to transform |
||
694 | :param int maxlength: the maximum length (default 6) of the code to return |
||
695 | :param bool modified: indicates whether to use USDA modified NYSIIS |
||
696 | :returns: the NYSIIS value |
||
697 | :rtype: str |
||
698 | |||
699 | >>> nysiis('Christopher') |
||
700 | 'CRASTA' |
||
701 | >>> nysiis('Niall') |
||
702 | 'NAL' |
||
703 | >>> nysiis('Smith') |
||
704 | 'SNAT' |
||
705 | >>> nysiis('Schmidt') |
||
706 | 'SNAD' |
||
707 | |||
708 | >>> nysiis('Christopher', maxlength=_INFINITY) |
||
709 | 'CRASTAFAR' |
||
710 | |||
711 | >>> nysiis('Christopher', maxlength=8, modified=True) |
||
712 | 'CRASTAFA' |
||
713 | >>> nysiis('Niall', maxlength=8, modified=True) |
||
714 | 'NAL' |
||
715 | >>> nysiis('Smith', maxlength=8, modified=True) |
||
716 | 'SNAT' |
||
717 | >>> nysiis('Schmidt', maxlength=8, modified=True) |
||
718 | 'SNAD' |
||
719 | """ |
||
720 | # Require a maxlength of at least 6 |
||
721 | if maxlength: |
||
722 | maxlength = max(6, maxlength) |
||
723 | |||
724 | _vowels = {'A', 'E', 'I', 'O', 'U'} |
||
725 | |||
726 | word = ''.join(c for c in word.upper() if c.isalpha()) |
||
727 | word = word.replace('ß', 'SS') |
||
728 | |||
729 | # exit early if there are no alphas |
||
730 | if not word: |
||
731 | return '' |
||
732 | |||
733 | if modified: |
||
734 | original_first_char = word[0] |
||
735 | |||
736 | if word[:3] == 'MAC': |
||
737 | word = 'MCC'+word[3:] |
||
738 | elif word[:2] == 'KN': |
||
739 | word = 'NN'+word[2:] |
||
740 | elif word[:1] == 'K': |
||
741 | word = 'C'+word[1:] |
||
742 | elif word[:2] in {'PH', 'PF'}: |
||
743 | word = 'FF'+word[2:] |
||
744 | elif word[:3] == 'SCH': |
||
745 | word = 'SSS'+word[3:] |
||
746 | elif modified: |
||
747 | if word[:2] == 'WR': |
||
748 | word = 'RR'+word[2:] |
||
749 | elif word[:2] == 'RH': |
||
750 | word = 'RR'+word[2:] |
||
751 | elif word[:2] == 'DG': |
||
752 | word = 'GG'+word[2:] |
||
753 | elif word[:1] in _vowels: |
||
754 | word = 'A'+word[1:] |
||
755 | |||
756 | if modified and word[-1:] in {'S', 'Z'}: |
||
757 | word = word[:-1] |
||
758 | |||
759 | if word[-2:] == 'EE' or word[-2:] == 'IE' or (modified and |
||
760 | word[-2:] == 'YE'): |
||
761 | word = word[:-2]+'Y' |
||
762 | elif word[-2:] in {'DT', 'RT', 'RD'}: |
||
763 | word = word[:-2]+'D' |
||
764 | elif word[-2:] in {'NT', 'ND'}: |
||
765 | word = word[:-2]+('N' if modified else 'D') |
||
766 | elif modified: |
||
767 | if word[-2:] == 'IX': |
||
768 | word = word[:-2]+'ICK' |
||
769 | elif word[-2:] == 'EX': |
||
770 | word = word[:-2]+'ECK' |
||
771 | elif word[-2:] in {'JR', 'SR'}: |
||
772 | return 'ERROR' |
||
773 | |||
774 | key = word[:1] |
||
775 | |||
776 | skip = 0 |
||
777 | for i in range(1, len(word)): |
||
778 | if i >= len(word): |
||
779 | continue |
||
780 | elif skip: |
||
781 | skip -= 1 |
||
782 | continue |
||
783 | elif word[i:i+2] == 'EV': |
||
784 | word = word[:i] + 'AF' + word[i+2:] |
||
785 | skip = 1 |
||
786 | elif word[i] in _vowels: |
||
787 | word = word[:i] + 'A' + word[i+1:] |
||
788 | elif modified and i != len(word)-1 and word[i] == 'Y': |
||
789 | word = word[:i] + 'A' + word[i+1:] |
||
790 | elif word[i] == 'Q': |
||
791 | word = word[:i] + 'G' + word[i+1:] |
||
792 | elif word[i] == 'Z': |
||
793 | word = word[:i] + 'S' + word[i+1:] |
||
794 | elif word[i] == 'M': |
||
795 | word = word[:i] + 'N' + word[i+1:] |
||
796 | elif word[i:i+2] == 'KN': |
||
797 | word = word[:i] + 'N' + word[i+2:] |
||
798 | elif word[i] == 'K': |
||
799 | word = word[:i] + 'C' + word[i+1:] |
||
800 | elif modified and i == len(word)-3 and word[i:i+3] == 'SCH': |
||
801 | word = word[:i] + 'SSA' |
||
802 | skip = 2 |
||
803 | elif word[i:i+3] == 'SCH': |
||
804 | word = word[:i] + 'SSS' + word[i+3:] |
||
805 | skip = 2 |
||
806 | elif modified and i == len(word)-2 and word[i:i+2] == 'SH': |
||
807 | word = word[:i] + 'SA' |
||
808 | skip = 1 |
||
809 | elif word[i:i+2] == 'SH': |
||
810 | word = word[:i] + 'SS' + word[i+2:] |
||
811 | skip = 1 |
||
812 | elif word[i:i+2] == 'PH': |
||
813 | word = word[:i] + 'FF' + word[i+2:] |
||
814 | skip = 1 |
||
815 | elif modified and word[i:i+3] == 'GHT': |
||
816 | word = word[:i] + 'TTT' + word[i+3:] |
||
817 | skip = 2 |
||
818 | elif modified and word[i:i+2] == 'DG': |
||
819 | word = word[:i] + 'GG' + word[i+2:] |
||
820 | skip = 1 |
||
821 | elif modified and word[i:i+2] == 'WR': |
||
822 | word = word[:i] + 'RR' + word[i+2:] |
||
823 | skip = 1 |
||
824 | elif word[i] == 'H' and (word[i-1] not in _vowels or |
||
825 | word[i+1:i+2] not in _vowels): |
||
826 | word = word[:i] + word[i-1] + word[i+1:] |
||
827 | elif word[i] == 'W' and word[i-1] in _vowels: |
||
828 | word = word[:i] + word[i-1] + word[i+1:] |
||
829 | |||
830 | if word[i:i+skip+1] != key[-1:]: |
||
831 | key += word[i:i+skip+1] |
||
832 | |||
833 | key = _delete_consecutive_repeats(key) |
||
834 | |||
835 | if key[-1:] == 'S': |
||
836 | key = key[:-1] |
||
837 | if key[-2:] == 'AY': |
||
838 | key = key[:-2] + 'Y' |
||
839 | if key[-1:] == 'A': |
||
840 | key = key[:-1] |
||
841 | if modified and key[:1] == 'A': |
||
842 | key = original_first_char + key[1:] |
||
843 | |||
844 | if maxlength and maxlength < _INFINITY: |
||
845 | key = key[:maxlength] |
||
846 | |||
847 | return key |
||
848 | |||
849 | |||
850 | def mra(word): |
||
851 | """Return the MRA personal numeric identifier (PNI) for a word. |
||
852 | |||
853 | A description of the Western Airlines Surname Match Rating Algorithm can |
||
854 | be found on page 18 of :cite:`Moore:1977`. |
||
855 | |||
856 | :param str word: the word to transform |
||
857 | :returns: the MRA PNI |
||
858 | :rtype: str |
||
859 | |||
860 | >>> mra('Christopher') |
||
861 | 'CHRPHR' |
||
862 | >>> mra('Niall') |
||
863 | 'NL' |
||
864 | >>> mra('Smith') |
||
865 | 'SMTH' |
||
866 | >>> mra('Schmidt') |
||
867 | 'SCHMDT' |
||
868 | """ |
||
869 | if not word: |
||
870 | return word |
||
871 | word = word.upper() |
||
872 | word = word.replace('ß', 'SS') |
||
873 | word = word[0]+''.join(c for c in word[1:] if |
||
874 | c not in {'A', 'E', 'I', 'O', 'U'}) |
||
875 | word = _delete_consecutive_repeats(word) |
||
876 | if len(word) > 6: |
||
877 | word = word[:3]+word[-3:] |
||
878 | return word |
||
879 | |||
880 | |||
881 | def metaphone(word, maxlength=_INFINITY): |
||
882 | """Return the Metaphone code for a word. |
||
883 | |||
884 | Based on Lawrence Philips' Pick BASIC code from 1990 :cite:`Philips:1990`, |
||
885 | as described in :cite:`Philips:1990b`. |
||
886 | This incorporates some corrections to the above code, particularly |
||
887 | some of those suggested by Michael Kuhn in :cite:`Kuhn:1995`. |
||
888 | |||
889 | :param str word: the word to transform |
||
890 | :param int maxlength: the maximum length of the returned Metaphone code |
||
891 | (defaults to unlimited, but in Philips' original implementation |
||
892 | this was 4) |
||
893 | :returns: the Metaphone value |
||
894 | :rtype: str |
||
895 | |||
896 | |||
897 | >>> metaphone('Christopher') |
||
898 | 'KRSTFR' |
||
899 | >>> metaphone('Niall') |
||
900 | 'NL' |
||
901 | >>> metaphone('Smith') |
||
902 | 'SM0' |
||
903 | >>> metaphone('Schmidt') |
||
904 | 'SKMTT' |
||
905 | """ |
||
906 | _vowels = {'A', 'E', 'I', 'O', 'U'} |
||
907 | _frontv = {'E', 'I', 'Y'} |
||
908 | _varson = {'C', 'G', 'P', 'S', 'T'} |
||
909 | |||
910 | # Require a maxlength of at least 4 |
||
911 | if maxlength is not None: |
||
912 | maxlength = max(4, maxlength) |
||
913 | else: |
||
914 | maxlength = 64 |
||
915 | |||
916 | # As in variable sound--those modified by adding an "h" |
||
917 | ename = ''.join(c for c in word.upper() if c.isalnum()) |
||
918 | ename = ename.replace('ß', 'SS') |
||
919 | |||
920 | # Delete nonalphanumeric characters and make all caps |
||
921 | if not ename: |
||
922 | return '' |
||
923 | if ename[0:2] in {'PN', 'AE', 'KN', 'GN', 'WR'}: |
||
924 | ename = ename[1:] |
||
925 | elif ename[0] == 'X': |
||
926 | ename = 'S' + ename[1:] |
||
927 | elif ename[0:2] == 'WH': |
||
928 | ename = 'W' + ename[2:] |
||
929 | |||
930 | # Convert to metaph |
||
931 | elen = len(ename)-1 |
||
932 | metaph = '' |
||
933 | for i in range(len(ename)): |
||
934 | if len(metaph) >= maxlength: |
||
935 | break |
||
936 | if ((ename[i] not in {'G', 'T'} and |
||
937 | i > 0 and ename[i-1] == ename[i])): |
||
938 | continue |
||
939 | |||
940 | if ename[i] in _vowels and i == 0: |
||
941 | metaph = ename[i] |
||
942 | |||
943 | elif ename[i] == 'B': |
||
944 | if i != elen or ename[i-1] != 'M': |
||
945 | metaph += ename[i] |
||
946 | |||
947 | elif ename[i] == 'C': |
||
948 | if not (i > 0 and ename[i-1] == 'S' and ename[i+1:i+2] in _frontv): |
||
949 | if ename[i+1:i+3] == 'IA': |
||
950 | metaph += 'X' |
||
951 | elif ename[i+1:i+2] in _frontv: |
||
952 | metaph += 'S' |
||
953 | elif i > 0 and ename[i-1:i+2] == 'SCH': |
||
954 | metaph += 'K' |
||
955 | elif ename[i+1:i+2] == 'H': |
||
956 | if i == 0 and i+1 < elen and ename[i+2:i+3] not in _vowels: |
||
957 | metaph += 'K' |
||
958 | else: |
||
959 | metaph += 'X' |
||
960 | else: |
||
961 | metaph += 'K' |
||
962 | |||
963 | elif ename[i] == 'D': |
||
964 | if ename[i+1:i+2] == 'G' and ename[i+2:i+3] in _frontv: |
||
965 | metaph += 'J' |
||
966 | else: |
||
967 | metaph += 'T' |
||
968 | |||
969 | elif ename[i] == 'G': |
||
970 | if ename[i+1:i+2] == 'H' and not (i+1 == elen or |
||
971 | ename[i+2:i+3] not in _vowels): |
||
972 | continue |
||
973 | elif i > 0 and ((i+1 == elen and ename[i+1] == 'N') or |
||
974 | (i+3 == elen and ename[i+1:i+4] == 'NED')): |
||
975 | continue |
||
976 | elif (i-1 > 0 and i+1 <= elen and ename[i-1] == 'D' and |
||
977 | ename[i+1] in _frontv): |
||
978 | continue |
||
979 | elif ename[i+1:i+2] == 'G': |
||
980 | continue |
||
981 | elif ename[i+1:i+2] in _frontv: |
||
982 | if i == 0 or ename[i-1] != 'G': |
||
983 | metaph += 'J' |
||
984 | else: |
||
985 | metaph += 'K' |
||
986 | else: |
||
987 | metaph += 'K' |
||
988 | |||
989 | elif ename[i] == 'H': |
||
990 | if ((i > 0 and ename[i-1] in _vowels and |
||
991 | ename[i+1:i+2] not in _vowels)): |
||
992 | continue |
||
993 | elif i > 0 and ename[i-1] in _varson: |
||
994 | continue |
||
995 | else: |
||
996 | metaph += 'H' |
||
997 | |||
998 | elif ename[i] in {'F', 'J', 'L', 'M', 'N', 'R'}: |
||
999 | metaph += ename[i] |
||
1000 | |||
1001 | elif ename[i] == 'K': |
||
1002 | if i > 0 and ename[i-1] == 'C': |
||
1003 | continue |
||
1004 | else: |
||
1005 | metaph += 'K' |
||
1006 | |||
1007 | elif ename[i] == 'P': |
||
1008 | if ename[i+1:i+2] == 'H': |
||
1009 | metaph += 'F' |
||
1010 | else: |
||
1011 | metaph += 'P' |
||
1012 | |||
1013 | elif ename[i] == 'Q': |
||
1014 | metaph += 'K' |
||
1015 | |||
1016 | elif ename[i] == 'S': |
||
1017 | if ((i > 0 and i+2 <= elen and ename[i+1] == 'I' and |
||
1018 | ename[i+2] in 'OA')): |
||
1019 | metaph += 'X' |
||
1020 | elif ename[i+1:i+2] == 'H': |
||
1021 | metaph += 'X' |
||
1022 | else: |
||
1023 | metaph += 'S' |
||
1024 | |||
1025 | elif ename[i] == 'T': |
||
1026 | if ((i > 0 and i+2 <= elen and ename[i+1] == 'I' and |
||
1027 | ename[i+2] in {'A', 'O'})): |
||
1028 | metaph += 'X' |
||
1029 | elif ename[i+1:i+2] == 'H': |
||
1030 | metaph += '0' |
||
1031 | elif ename[i+1:i+3] != 'CH': |
||
1032 | if ename[i-1:i] != 'T': |
||
1033 | metaph += 'T' |
||
1034 | |||
1035 | elif ename[i] == 'V': |
||
1036 | metaph += 'F' |
||
1037 | |||
1038 | elif ename[i] in 'WY': |
||
1039 | if ename[i+1:i+2] in _vowels: |
||
1040 | metaph += ename[i] |
||
1041 | |||
1042 | elif ename[i] == 'X': |
||
1043 | metaph += 'KS' |
||
1044 | |||
1045 | elif ename[i] == 'Z': |
||
1046 | metaph += 'S' |
||
1047 | |||
1048 | return metaph |
||
1049 | |||
1050 | |||
1051 | def double_metaphone(word, maxlength=_INFINITY): |
||
1052 | """Return the Double Metaphone code for a word. |
||
1053 | |||
1054 | Based on Lawrence Philips' (Visual) C++ code from 1999 |
||
1055 | :cite:`Philips:2000`. |
||
1056 | |||
1057 | :param word: the word to transform |
||
1058 | :param maxlength: the maximum length of the returned Double Metaphone codes |
||
1059 | (defaults to unlimited, but in Philips' original implementation this |
||
1060 | was 4) |
||
1061 | :returns: the Double Metaphone value(s) |
||
1062 | :rtype: tuple |
||
1063 | |||
1064 | >>> double_metaphone('Christopher') |
||
1065 | ('KRSTFR', '') |
||
1066 | >>> double_metaphone('Niall') |
||
1067 | ('NL', '') |
||
1068 | >>> double_metaphone('Smith') |
||
1069 | ('SM0', 'XMT') |
||
1070 | >>> double_metaphone('Schmidt') |
||
1071 | ('XMT', 'SMT') |
||
1072 | """ |
||
1073 | # Require a maxlength of at least 4 |
||
1074 | if maxlength is not None: |
||
1075 | maxlength = max(4, maxlength) |
||
1076 | else: |
||
1077 | maxlength = 64 |
||
1078 | |||
1079 | primary = '' |
||
1080 | secondary = '' |
||
1081 | |||
1082 | def _slavo_germanic(): |
||
1083 | """Return True if the word appears to be Slavic or Germanic.""" |
||
1084 | if 'W' in word or 'K' in word or 'CZ' in word: |
||
1085 | return True |
||
1086 | return False |
||
1087 | |||
1088 | def _metaph_add(pri, sec=''): |
||
1089 | """Return a new metaphone tuple with the supplied elements.""" |
||
1090 | newpri = primary |
||
1091 | newsec = secondary |
||
1092 | if pri: |
||
1093 | newpri += pri |
||
1094 | if sec: |
||
1095 | if sec != ' ': |
||
1096 | newsec += sec |
||
1097 | else: |
||
1098 | newsec += pri |
||
1099 | return (newpri, newsec) |
||
1100 | |||
1101 | def _is_vowel(pos): |
||
1102 | """Return True if the character at word[pos] is a vowel.""" |
||
1103 | if pos >= 0 and word[pos] in {'A', 'E', 'I', 'O', 'U', 'Y'}: |
||
1104 | return True |
||
1105 | return False |
||
1106 | |||
1107 | def _get_at(pos): |
||
1108 | """Return the character at word[pos].""" |
||
1109 | return word[pos] |
||
1110 | |||
1111 | def _string_at(pos, slen, substrings): |
||
1112 | """Return True if word[pos:pos+slen] is in substrings.""" |
||
1113 | if pos < 0: |
||
1114 | return False |
||
1115 | return word[pos:pos+slen] in substrings |
||
1116 | |||
1117 | current = 0 |
||
1118 | length = len(word) |
||
1119 | if length < 1: |
||
1120 | return ('', '') |
||
1121 | last = length - 1 |
||
1122 | |||
1123 | word = word.upper() |
||
1124 | word = word.replace('ß', 'SS') |
||
1125 | |||
1126 | # Pad the original string so that we can index beyond the edge of the world |
||
1127 | word += ' ' |
||
1128 | |||
1129 | # Skip these when at start of word |
||
1130 | if word[0:2] in {'GN', 'KN', 'PN', 'WR', 'PS'}: |
||
1131 | current += 1 |
||
1132 | |||
1133 | # Initial 'X' is pronounced 'Z' e.g. 'Xavier' |
||
1134 | if _get_at(0) == 'X': |
||
1135 | (primary, secondary) = _metaph_add('S') # 'Z' maps to 'S' |
||
1136 | current += 1 |
||
1137 | |||
1138 | # Main loop |
||
1139 | while True: |
||
1140 | if current >= length: |
||
1141 | break |
||
1142 | |||
1143 | if _get_at(current) in {'A', 'E', 'I', 'O', 'U', 'Y'}: |
||
1144 | if current == 0: |
||
1145 | # All init vowels now map to 'A' |
||
1146 | (primary, secondary) = _metaph_add('A') |
||
1147 | current += 1 |
||
1148 | continue |
||
1149 | |||
1150 | elif _get_at(current) == 'B': |
||
1151 | # "-mb", e.g", "dumb", already skipped over... |
||
1152 | (primary, secondary) = _metaph_add('P') |
||
1153 | if _get_at(current + 1) == 'B': |
||
1154 | current += 2 |
||
1155 | else: |
||
1156 | current += 1 |
||
1157 | continue |
||
1158 | |||
1159 | elif _get_at(current) == 'Ç': |
||
1160 | (primary, secondary) = _metaph_add('S') |
||
1161 | current += 1 |
||
1162 | continue |
||
1163 | |||
1164 | elif _get_at(current) == 'C': |
||
1165 | # Various Germanic |
||
1166 | if (current > 1 and not _is_vowel(current - 2) and |
||
1167 | _string_at((current - 1), 3, {'ACH'}) and |
||
1168 | ((_get_at(current + 2) != 'I') and |
||
1169 | ((_get_at(current + 2) != 'E') or |
||
1170 | _string_at((current - 2), 6, |
||
1171 | {'BACHER', 'MACHER'})))): |
||
1172 | (primary, secondary) = _metaph_add('K') |
||
1173 | current += 2 |
||
1174 | continue |
||
1175 | |||
1176 | # Special case 'caesar' |
||
1177 | elif current == 0 and _string_at(current, 6, {'CAESAR'}): |
||
1178 | (primary, secondary) = _metaph_add('S') |
||
1179 | current += 2 |
||
1180 | continue |
||
1181 | |||
1182 | # Italian 'chianti' |
||
1183 | elif _string_at(current, 4, {'CHIA'}): |
||
1184 | (primary, secondary) = _metaph_add('K') |
||
1185 | current += 2 |
||
1186 | continue |
||
1187 | |||
1188 | elif _string_at(current, 2, {'CH'}): |
||
1189 | # Find 'Michael' |
||
1190 | if current > 0 and _string_at(current, 4, {'CHAE'}): |
||
1191 | (primary, secondary) = _metaph_add('K', 'X') |
||
1192 | current += 2 |
||
1193 | continue |
||
1194 | |||
1195 | # Greek roots e.g. 'chemistry', 'chorus' |
||
1196 | elif (current == 0 and |
||
1197 | (_string_at((current + 1), 5, |
||
1198 | {'HARAC', 'HARIS'}) or |
||
1199 | _string_at((current + 1), 3, |
||
1200 | {'HOR', 'HYM', 'HIA', 'HEM'})) and |
||
1201 | not _string_at(0, 5, {'CHORE'})): |
||
1202 | (primary, secondary) = _metaph_add('K') |
||
1203 | current += 2 |
||
1204 | continue |
||
1205 | |||
1206 | # Germanic, Greek, or otherwise 'ch' for 'kh' sound |
||
1207 | elif ((_string_at(0, 4, {'VAN ', 'VON '}) or |
||
1208 | _string_at(0, 3, {'SCH'})) or |
||
1209 | # 'architect but not 'arch', 'orchestra', 'orchid' |
||
1210 | _string_at((current - 2), 6, |
||
1211 | {'ORCHES', 'ARCHIT', 'ORCHID'}) or |
||
1212 | _string_at((current + 2), 1, {'T', 'S'}) or |
||
1213 | ((_string_at((current - 1), 1, |
||
1214 | {'A', 'O', 'U', 'E'}) or |
||
1215 | (current == 0)) and |
||
1216 | # e.g., 'wachtler', 'wechsler', but not 'tichner' |
||
1217 | _string_at((current + 2), 1, |
||
1218 | {'L', 'R', 'N', 'M', 'B', 'H', 'F', 'V', 'W', |
||
1219 | ' '}))): |
||
1220 | (primary, secondary) = _metaph_add('K') |
||
1221 | |||
1222 | else: |
||
1223 | if current > 0: |
||
1224 | if _string_at(0, 2, {'MC'}): |
||
1225 | # e.g., "McHugh" |
||
1226 | (primary, secondary) = _metaph_add('K') |
||
1227 | else: |
||
1228 | (primary, secondary) = _metaph_add('X', 'K') |
||
1229 | else: |
||
1230 | (primary, secondary) = _metaph_add('X') |
||
1231 | |||
1232 | current += 2 |
||
1233 | continue |
||
1234 | |||
1235 | # e.g, 'czerny' |
||
1236 | elif (_string_at(current, 2, {'CZ'}) and |
||
1237 | not _string_at((current - 2), 4, {'WICZ'})): |
||
1238 | (primary, secondary) = _metaph_add('S', 'X') |
||
1239 | current += 2 |
||
1240 | continue |
||
1241 | |||
1242 | # e.g., 'focaccia' |
||
1243 | elif _string_at((current + 1), 3, {'CIA'}): |
||
1244 | (primary, secondary) = _metaph_add('X') |
||
1245 | current += 3 |
||
1246 | |||
1247 | # double 'C', but not if e.g. 'McClellan' |
||
1248 | elif (_string_at(current, 2, {'CC'}) and |
||
1249 | not ((current == 1) and (_get_at(0) == 'M'))): |
||
1250 | # 'bellocchio' but not 'bacchus' |
||
1251 | if ((_string_at((current + 2), 1, |
||
1252 | {'I', 'E', 'H'}) and |
||
1253 | not _string_at((current + 2), 2, ['HU']))): |
||
1254 | # 'accident', 'accede' 'succeed' |
||
1255 | if ((((current == 1) and _get_at(current - 1) == 'A') or |
||
1256 | _string_at((current - 1), 5, |
||
1257 | {'UCCEE', 'UCCES'}))): |
||
1258 | (primary, secondary) = _metaph_add('KS') |
||
1259 | # 'bacci', 'bertucci', other italian |
||
1260 | else: |
||
1261 | (primary, secondary) = _metaph_add('X') |
||
1262 | current += 3 |
||
1263 | continue |
||
1264 | else: # Pierce's rule |
||
1265 | (primary, secondary) = _metaph_add('K') |
||
1266 | current += 2 |
||
1267 | continue |
||
1268 | |||
1269 | elif _string_at(current, 2, {'CK', 'CG', 'CQ'}): |
||
1270 | (primary, secondary) = _metaph_add('K') |
||
1271 | current += 2 |
||
1272 | continue |
||
1273 | |||
1274 | elif _string_at(current, 2, {'CI', 'CE', 'CY'}): |
||
1275 | # Italian vs. English |
||
1276 | if _string_at(current, 3, {'CIO', 'CIE', 'CIA'}): |
||
1277 | (primary, secondary) = _metaph_add('S', 'X') |
||
1278 | else: |
||
1279 | (primary, secondary) = _metaph_add('S') |
||
1280 | current += 2 |
||
1281 | continue |
||
1282 | |||
1283 | # else |
||
1284 | else: |
||
1285 | (primary, secondary) = _metaph_add('K') |
||
1286 | |||
1287 | # name sent in 'mac caffrey', 'mac gregor |
||
1288 | if _string_at((current + 1), 2, {' C', ' Q', ' G'}): |
||
1289 | current += 3 |
||
1290 | elif (_string_at((current + 1), 1, |
||
1291 | {'C', 'K', 'Q'}) and |
||
1292 | not _string_at((current + 1), 2, {'CE', 'CI'})): |
||
1293 | current += 2 |
||
1294 | else: |
||
1295 | current += 1 |
||
1296 | continue |
||
1297 | |||
1298 | elif _get_at(current) == 'D': |
||
1299 | if _string_at(current, 2, {'DG'}): |
||
1300 | if _string_at((current + 2), 1, {'I', 'E', 'Y'}): |
||
1301 | # e.g. 'edge' |
||
1302 | (primary, secondary) = _metaph_add('J') |
||
1303 | current += 3 |
||
1304 | continue |
||
1305 | else: |
||
1306 | # e.g. 'edgar' |
||
1307 | (primary, secondary) = _metaph_add('TK') |
||
1308 | current += 2 |
||
1309 | continue |
||
1310 | |||
1311 | elif _string_at(current, 2, {'DT', 'DD'}): |
||
1312 | (primary, secondary) = _metaph_add('T') |
||
1313 | current += 2 |
||
1314 | continue |
||
1315 | |||
1316 | # else |
||
1317 | else: |
||
1318 | (primary, secondary) = _metaph_add('T') |
||
1319 | current += 1 |
||
1320 | continue |
||
1321 | |||
1322 | elif _get_at(current) == 'F': |
||
1323 | if _get_at(current + 1) == 'F': |
||
1324 | current += 2 |
||
1325 | else: |
||
1326 | current += 1 |
||
1327 | (primary, secondary) = _metaph_add('F') |
||
1328 | continue |
||
1329 | |||
1330 | elif _get_at(current) == 'G': |
||
1331 | if _get_at(current + 1) == 'H': |
||
1332 | if (current > 0) and not _is_vowel(current - 1): |
||
1333 | (primary, secondary) = _metaph_add('K') |
||
1334 | current += 2 |
||
1335 | continue |
||
1336 | |||
1337 | # 'ghislane', ghiradelli |
||
1338 | elif current == 0: |
||
1339 | if _get_at(current + 2) == 'I': |
||
1340 | (primary, secondary) = _metaph_add('J') |
||
1341 | else: |
||
1342 | (primary, secondary) = _metaph_add('K') |
||
1343 | current += 2 |
||
1344 | continue |
||
1345 | |||
1346 | # Parker's rule (with some further refinements) - e.g., 'hugh' |
||
1347 | elif (((current > 1) and |
||
1348 | _string_at((current - 2), 1, {'B', 'H', 'D'})) or |
||
1349 | # e.g., 'bough' |
||
1350 | ((current > 2) and |
||
1351 | _string_at((current - 3), 1, {'B', 'H', 'D'})) or |
||
1352 | # e.g., 'broughton' |
||
1353 | ((current > 3) and |
||
1354 | _string_at((current - 4), 1, {'B', 'H'}))): |
||
1355 | current += 2 |
||
1356 | continue |
||
1357 | else: |
||
1358 | # e.g. 'laugh', 'McLaughlin', 'cough', |
||
1359 | # 'gough', 'rough', 'tough' |
||
1360 | if ((current > 2) and |
||
1361 | (_get_at(current - 1) == 'U') and |
||
1362 | (_string_at((current - 3), 1, |
||
1363 | {'C', 'G', 'L', 'R', 'T'}))): |
||
1364 | (primary, secondary) = _metaph_add('F') |
||
1365 | elif (current > 0) and _get_at(current - 1) != 'I': |
||
1366 | (primary, secondary) = _metaph_add('K') |
||
1367 | current += 2 |
||
1368 | continue |
||
1369 | |||
1370 | elif _get_at(current + 1) == 'N': |
||
1371 | if (current == 1) and _is_vowel(0) and not _slavo_germanic(): |
||
1372 | (primary, secondary) = _metaph_add('KN', 'N') |
||
1373 | # not e.g. 'cagney' |
||
1374 | elif (not _string_at((current + 2), 2, {'EY'}) and |
||
1375 | (_get_at(current + 1) != 'Y') and |
||
1376 | not _slavo_germanic()): |
||
1377 | (primary, secondary) = _metaph_add('N', 'KN') |
||
1378 | else: |
||
1379 | (primary, secondary) = _metaph_add('KN') |
||
1380 | current += 2 |
||
1381 | continue |
||
1382 | |||
1383 | # 'tagliaro' |
||
1384 | elif (_string_at((current + 1), 2, {'LI'}) and |
||
1385 | not _slavo_germanic()): |
||
1386 | (primary, secondary) = _metaph_add('KL', 'L') |
||
1387 | current += 2 |
||
1388 | continue |
||
1389 | |||
1390 | # -ges-, -gep-, -gel-, -gie- at beginning |
||
1391 | elif ((current == 0) and |
||
1392 | ((_get_at(current + 1) == 'Y') or |
||
1393 | _string_at((current + 1), 2, {'ES', 'EP', 'EB', 'EL', 'EY', |
||
1394 | 'IB', 'IL', 'IN', 'IE', 'EI', |
||
1395 | 'ER'}))): |
||
1396 | (primary, secondary) = _metaph_add('K', 'J') |
||
1397 | current += 2 |
||
1398 | continue |
||
1399 | |||
1400 | # -ger-, -gy- |
||
1401 | elif ((_string_at((current + 1), 2, {'ER'}) or |
||
1402 | (_get_at(current + 1) == 'Y')) and not |
||
1403 | _string_at(0, 6, {'DANGER', 'RANGER', 'MANGER'}) and not |
||
1404 | _string_at((current - 1), 1, {'E', 'I'}) and not |
||
1405 | _string_at((current - 1), 3, {'RGY', 'OGY'})): |
||
1406 | (primary, secondary) = _metaph_add('K', 'J') |
||
1407 | current += 2 |
||
1408 | continue |
||
1409 | |||
1410 | # italian e.g, 'biaggi' |
||
1411 | elif (_string_at((current + 1), 1, {'E', 'I', 'Y'}) or |
||
1412 | _string_at((current - 1), 4, {'AGGI', 'OGGI'})): |
||
1413 | # obvious germanic |
||
1414 | if (((_string_at(0, 4, {'VAN ', 'VON '}) or |
||
1415 | _string_at(0, 3, {'SCH'})) or |
||
1416 | _string_at((current + 1), 2, {'ET'}))): |
||
1417 | (primary, secondary) = _metaph_add('K') |
||
1418 | elif _string_at((current + 1), 4, {'IER '}): |
||
1419 | (primary, secondary) = _metaph_add('J') |
||
1420 | else: |
||
1421 | (primary, secondary) = _metaph_add('J', 'K') |
||
1422 | current += 2 |
||
1423 | continue |
||
1424 | |||
1425 | else: |
||
1426 | if _get_at(current + 1) == 'G': |
||
1427 | current += 2 |
||
1428 | else: |
||
1429 | current += 1 |
||
1430 | (primary, secondary) = _metaph_add('K') |
||
1431 | continue |
||
1432 | |||
1433 | elif _get_at(current) == 'H': |
||
1434 | # only keep if first & before vowel or btw. 2 vowels |
||
1435 | if ((((current == 0) or _is_vowel(current - 1)) and |
||
1436 | _is_vowel(current + 1))): |
||
1437 | (primary, secondary) = _metaph_add('H') |
||
1438 | current += 2 |
||
1439 | else: # also takes care of 'HH' |
||
1440 | current += 1 |
||
1441 | continue |
||
1442 | |||
1443 | elif _get_at(current) == 'J': |
||
1444 | # obvious spanish, 'jose', 'san jacinto' |
||
1445 | if _string_at(current, 4, ['JOSE']) or _string_at(0, 4, {'SAN '}): |
||
1446 | if ((((current == 0) and (_get_at(current + 4) == ' ')) or |
||
1447 | _string_at(0, 4, ['SAN ']))): |
||
1448 | (primary, secondary) = _metaph_add('H') |
||
1449 | else: |
||
1450 | (primary, secondary) = _metaph_add('J', 'H') |
||
1451 | current += 1 |
||
1452 | continue |
||
1453 | |||
1454 | elif (current == 0) and not _string_at(current, 4, {'JOSE'}): |
||
1455 | # Yankelovich/Jankelowicz |
||
1456 | (primary, secondary) = _metaph_add('J', 'A') |
||
1457 | # Spanish pron. of e.g. 'bajador' |
||
1458 | elif (_is_vowel(current - 1) and |
||
1459 | not _slavo_germanic() and |
||
1460 | ((_get_at(current + 1) == 'A') or |
||
1461 | (_get_at(current + 1) == 'O'))): |
||
1462 | (primary, secondary) = _metaph_add('J', 'H') |
||
1463 | elif current == last: |
||
1464 | (primary, secondary) = _metaph_add('J', ' ') |
||
1465 | elif (not _string_at((current + 1), 1, |
||
1466 | {'L', 'T', 'K', 'S', 'N', 'M', 'B', 'Z'}) and |
||
1467 | not _string_at((current - 1), 1, {'S', 'K', 'L'})): |
||
1468 | (primary, secondary) = _metaph_add('J') |
||
1469 | |||
1470 | if _get_at(current + 1) == 'J': # it could happen! |
||
1471 | current += 2 |
||
1472 | else: |
||
1473 | current += 1 |
||
1474 | continue |
||
1475 | |||
1476 | elif _get_at(current) == 'K': |
||
1477 | if _get_at(current + 1) == 'K': |
||
1478 | current += 2 |
||
1479 | else: |
||
1480 | current += 1 |
||
1481 | (primary, secondary) = _metaph_add('K') |
||
1482 | continue |
||
1483 | |||
1484 | elif _get_at(current) == 'L': |
||
1485 | if _get_at(current + 1) == 'L': |
||
1486 | # Spanish e.g. 'cabrillo', 'gallegos' |
||
1487 | if (((current == (length - 3)) and |
||
1488 | _string_at((current - 1), 4, {'ILLO', 'ILLA', 'ALLE'})) or |
||
1489 | ((_string_at((last - 1), 2, {'AS', 'OS'}) or |
||
1490 | _string_at(last, 1, {'A', 'O'})) and |
||
1491 | _string_at((current - 1), 4, {'ALLE'}))): |
||
1492 | (primary, secondary) = _metaph_add('L', ' ') |
||
1493 | current += 2 |
||
1494 | continue |
||
1495 | current += 2 |
||
1496 | else: |
||
1497 | current += 1 |
||
1498 | (primary, secondary) = _metaph_add('L') |
||
1499 | continue |
||
1500 | |||
1501 | elif _get_at(current) == 'M': |
||
1502 | if (((_string_at((current - 1), 3, {'UMB'}) and |
||
1503 | (((current + 1) == last) or |
||
1504 | _string_at((current + 2), 2, {'ER'}))) or |
||
1505 | # 'dumb', 'thumb' |
||
1506 | (_get_at(current + 1) == 'M'))): |
||
1507 | current += 2 |
||
1508 | else: |
||
1509 | current += 1 |
||
1510 | (primary, secondary) = _metaph_add('M') |
||
1511 | continue |
||
1512 | |||
1513 | elif _get_at(current) == 'N': |
||
1514 | if _get_at(current + 1) == 'N': |
||
1515 | current += 2 |
||
1516 | else: |
||
1517 | current += 1 |
||
1518 | (primary, secondary) = _metaph_add('N') |
||
1519 | continue |
||
1520 | |||
1521 | elif _get_at(current) == 'Ñ': |
||
1522 | current += 1 |
||
1523 | (primary, secondary) = _metaph_add('N') |
||
1524 | continue |
||
1525 | |||
1526 | elif _get_at(current) == 'P': |
||
1527 | if _get_at(current + 1) == 'H': |
||
1528 | (primary, secondary) = _metaph_add('F') |
||
1529 | current += 2 |
||
1530 | continue |
||
1531 | |||
1532 | # also account for "campbell", "raspberry" |
||
1533 | elif _string_at((current + 1), 1, {'P', 'B'}): |
||
1534 | current += 2 |
||
1535 | else: |
||
1536 | current += 1 |
||
1537 | (primary, secondary) = _metaph_add('P') |
||
1538 | continue |
||
1539 | |||
1540 | elif _get_at(current) == 'Q': |
||
1541 | if _get_at(current + 1) == 'Q': |
||
1542 | current += 2 |
||
1543 | else: |
||
1544 | current += 1 |
||
1545 | (primary, secondary) = _metaph_add('K') |
||
1546 | continue |
||
1547 | |||
1548 | elif _get_at(current) == 'R': |
||
1549 | # French e.g. 'rogier', but exclude 'hochmeier' |
||
1550 | if (((current == last) and |
||
1551 | not _slavo_germanic() and |
||
1552 | _string_at((current - 2), 2, {'IE'}) and |
||
1553 | not _string_at((current - 4), 2, {'ME', 'MA'}))): |
||
1554 | (primary, secondary) = _metaph_add('', 'R') |
||
1555 | else: |
||
1556 | (primary, secondary) = _metaph_add('R') |
||
1557 | |||
1558 | if _get_at(current + 1) == 'R': |
||
1559 | current += 2 |
||
1560 | else: |
||
1561 | current += 1 |
||
1562 | continue |
||
1563 | |||
1564 | elif _get_at(current) == 'S': |
||
1565 | # special cases 'island', 'isle', 'carlisle', 'carlysle' |
||
1566 | if _string_at((current - 1), 3, {'ISL', 'YSL'}): |
||
1567 | current += 1 |
||
1568 | continue |
||
1569 | |||
1570 | # special case 'sugar-' |
||
1571 | elif (current == 0) and _string_at(current, 5, {'SUGAR'}): |
||
1572 | (primary, secondary) = _metaph_add('X', 'S') |
||
1573 | current += 1 |
||
1574 | continue |
||
1575 | |||
1576 | elif _string_at(current, 2, {'SH'}): |
||
1577 | # Germanic |
||
1578 | if _string_at((current + 1), 4, |
||
1579 | {'HEIM', 'HOEK', 'HOLM', 'HOLZ'}): |
||
1580 | (primary, secondary) = _metaph_add('S') |
||
1581 | else: |
||
1582 | (primary, secondary) = _metaph_add('X') |
||
1583 | current += 2 |
||
1584 | continue |
||
1585 | |||
1586 | # Italian & Armenian |
||
1587 | elif (_string_at(current, 3, {'SIO', 'SIA'}) or |
||
1588 | _string_at(current, 4, {'SIAN'})): |
||
1589 | if not _slavo_germanic(): |
||
1590 | (primary, secondary) = _metaph_add('S', 'X') |
||
1591 | else: |
||
1592 | (primary, secondary) = _metaph_add('S') |
||
1593 | current += 3 |
||
1594 | continue |
||
1595 | |||
1596 | # German & anglicisations, e.g. 'smith' match 'schmidt', |
||
1597 | # 'snider' match 'schneider' |
||
1598 | # also, -sz- in Slavic languages although in Hungarian it is |
||
1599 | # pronounced 's' |
||
1600 | elif (((current == 0) and |
||
1601 | _string_at((current + 1), 1, {'M', 'N', 'L', 'W'})) or |
||
1602 | _string_at((current + 1), 1, {'Z'})): |
||
1603 | (primary, secondary) = _metaph_add('S', 'X') |
||
1604 | if _string_at((current + 1), 1, {'Z'}): |
||
1605 | current += 2 |
||
1606 | else: |
||
1607 | current += 1 |
||
1608 | continue |
||
1609 | |||
1610 | elif _string_at(current, 2, {'SC'}): |
||
1611 | # Schlesinger's rule |
||
1612 | if _get_at(current + 2) == 'H': |
||
1613 | # Dutch origin, e.g. 'school', 'schooner' |
||
1614 | if _string_at((current + 3), 2, |
||
1615 | {'OO', 'ER', 'EN', 'UY', 'ED', 'EM'}): |
||
1616 | # 'schermerhorn', 'schenker' |
||
1617 | if _string_at((current + 3), 2, {'ER', 'EN'}): |
||
1618 | (primary, secondary) = _metaph_add('X', 'SK') |
||
1619 | else: |
||
1620 | (primary, secondary) = _metaph_add('SK') |
||
1621 | current += 3 |
||
1622 | continue |
||
1623 | else: |
||
1624 | if (((current == 0) and not _is_vowel(3) and |
||
1625 | (_get_at(3) != 'W'))): |
||
1626 | (primary, secondary) = _metaph_add('X', 'S') |
||
1627 | else: |
||
1628 | (primary, secondary) = _metaph_add('X') |
||
1629 | current += 3 |
||
1630 | continue |
||
1631 | |||
1632 | elif _string_at((current + 2), 1, {'I', 'E', 'Y'}): |
||
1633 | (primary, secondary) = _metaph_add('S') |
||
1634 | current += 3 |
||
1635 | continue |
||
1636 | |||
1637 | # else |
||
1638 | else: |
||
1639 | (primary, secondary) = _metaph_add('SK') |
||
1640 | current += 3 |
||
1641 | continue |
||
1642 | |||
1643 | else: |
||
1644 | # French e.g. 'resnais', 'artois' |
||
1645 | if (current == last) and _string_at((current - 2), 2, |
||
1646 | {'AI', 'OI'}): |
||
1647 | (primary, secondary) = _metaph_add('', 'S') |
||
1648 | else: |
||
1649 | (primary, secondary) = _metaph_add('S') |
||
1650 | |||
1651 | if _string_at((current + 1), 1, {'S', 'Z'}): |
||
1652 | current += 2 |
||
1653 | else: |
||
1654 | current += 1 |
||
1655 | continue |
||
1656 | |||
1657 | elif _get_at(current) == 'T': |
||
1658 | if _string_at(current, 4, {'TION'}): |
||
1659 | (primary, secondary) = _metaph_add('X') |
||
1660 | current += 3 |
||
1661 | continue |
||
1662 | |||
1663 | elif _string_at(current, 3, {'TIA', 'TCH'}): |
||
1664 | (primary, secondary) = _metaph_add('X') |
||
1665 | current += 3 |
||
1666 | continue |
||
1667 | |||
1668 | elif (_string_at(current, 2, {'TH'}) or |
||
1669 | _string_at(current, 3, {'TTH'})): |
||
1670 | # special case 'thomas', 'thames' or Germanic |
||
1671 | if ((_string_at((current + 2), 2, {'OM', 'AM'}) or |
||
1672 | _string_at(0, 4, {'VAN ', 'VON '}) or |
||
1673 | _string_at(0, 3, {'SCH'}))): |
||
1674 | (primary, secondary) = _metaph_add('T') |
||
1675 | else: |
||
1676 | (primary, secondary) = _metaph_add('0', 'T') |
||
1677 | current += 2 |
||
1678 | continue |
||
1679 | |||
1680 | elif _string_at((current + 1), 1, {'T', 'D'}): |
||
1681 | current += 2 |
||
1682 | else: |
||
1683 | current += 1 |
||
1684 | (primary, secondary) = _metaph_add('T') |
||
1685 | continue |
||
1686 | |||
1687 | elif _get_at(current) == 'V': |
||
1688 | if _get_at(current + 1) == 'V': |
||
1689 | current += 2 |
||
1690 | else: |
||
1691 | current += 1 |
||
1692 | (primary, secondary) = _metaph_add('F') |
||
1693 | continue |
||
1694 | |||
1695 | elif _get_at(current) == 'W': |
||
1696 | # can also be in middle of word |
||
1697 | if _string_at(current, 2, {'WR'}): |
||
1698 | (primary, secondary) = _metaph_add('R') |
||
1699 | current += 2 |
||
1700 | continue |
||
1701 | elif ((current == 0) and |
||
1702 | (_is_vowel(current + 1) or _string_at(current, 2, {'WH'}))): |
||
1703 | # Wasserman should match Vasserman |
||
1704 | if _is_vowel(current + 1): |
||
1705 | (primary, secondary) = _metaph_add('A', 'F') |
||
1706 | else: |
||
1707 | # need Uomo to match Womo |
||
1708 | (primary, secondary) = _metaph_add('A') |
||
1709 | |||
1710 | # Arnow should match Arnoff |
||
1711 | if ((((current == last) and _is_vowel(current - 1)) or |
||
1712 | _string_at((current - 1), 5, |
||
1713 | {'EWSKI', 'EWSKY', 'OWSKI', 'OWSKY'}) or |
||
1714 | _string_at(0, 3, {'SCH'}))): |
||
1715 | (primary, secondary) = _metaph_add('', 'F') |
||
1716 | current += 1 |
||
1717 | continue |
||
1718 | # Polish e.g. 'filipowicz' |
||
1719 | elif _string_at(current, 4, {'WICZ', 'WITZ'}): |
||
1720 | (primary, secondary) = _metaph_add('TS', 'FX') |
||
1721 | current += 4 |
||
1722 | continue |
||
1723 | # else skip it |
||
1724 | else: |
||
1725 | current += 1 |
||
1726 | continue |
||
1727 | |||
1728 | elif _get_at(current) == 'X': |
||
1729 | # French e.g. breaux |
||
1730 | if (not ((current == last) and |
||
1731 | (_string_at((current - 3), 3, {'IAU', 'EAU'}) or |
||
1732 | _string_at((current - 2), 2, {'AU', 'OU'})))): |
||
1733 | (primary, secondary) = _metaph_add('KS') |
||
1734 | |||
1735 | if _string_at((current + 1), 1, {'C', 'X'}): |
||
1736 | current += 2 |
||
1737 | else: |
||
1738 | current += 1 |
||
1739 | continue |
||
1740 | |||
1741 | elif _get_at(current) == 'Z': |
||
1742 | # Chinese Pinyin e.g. 'zhao' |
||
1743 | if _get_at(current + 1) == 'H': |
||
1744 | (primary, secondary) = _metaph_add('J') |
||
1745 | current += 2 |
||
1746 | continue |
||
1747 | elif (_string_at((current + 1), 2, {'ZO', 'ZI', 'ZA'}) or |
||
1748 | (_slavo_germanic() and ((current > 0) and |
||
1749 | _get_at(current - 1) != 'T'))): |
||
1750 | (primary, secondary) = _metaph_add('S', 'TS') |
||
1751 | else: |
||
1752 | (primary, secondary) = _metaph_add('S') |
||
1753 | |||
1754 | if _get_at(current + 1) == 'Z': |
||
1755 | current += 2 |
||
1756 | else: |
||
1757 | current += 1 |
||
1758 | continue |
||
1759 | |||
1760 | else: |
||
1761 | current += 1 |
||
1762 | |||
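| # Finally, truncate both codes to maxlength (when one is given) and |
||
| # return an empty secondary code if it is identical to the primary. |
||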
1763 | if maxlength and maxlength < _INFINITY: |
||
1764 | primary = primary[:maxlength] |
||
1765 | secondary = secondary[:maxlength] |
||
1766 | if primary == secondary: |
||
1767 | secondary = '' |
||
1768 | |||
1769 | return (primary, secondary) |
||
1770 | |||
1771 | |||
1772 | def caverphone(word, version=2): |
||
1773 | """Return the Caverphone code for a word. |
||
1774 | |||
1775 | A description of version 1 of the algorithm can be found in |
||
1776 | :cite:`Hood:2002`. |
||
1777 | |||
1778 | A description of version 2 of the algorithm can be found in |
||
1779 | :cite:`Hood:2004`. |
||
1780 | |||
1781 | :param str word: the word to transform |
||
1782 | :param int version: the version of Caverphone to employ for encoding |
||
1783 | (defaults to 2) |
||
1784 | :returns: the Caverphone value |
||
1785 | :rtype: str |
||
1786 | |||
1787 | >>> caverphone('Christopher') |
||
1788 | 'KRSTFA1111' |
||
1789 | >>> caverphone('Niall') |
||
1790 | 'NA11111111' |
||
1791 | >>> caverphone('Smith') |
||
1792 | 'SMT1111111' |
||
1793 | >>> caverphone('Schmidt') |
||
1794 | 'SKMT111111' |
||
1795 | |||
1796 | >>> caverphone('Christopher', 1) |
||
1797 | 'KRSTF1' |
||
1798 | >>> caverphone('Niall', 1) |
||
1799 | 'N11111' |
||
1800 | >>> caverphone('Smith', 1) |
||
1801 | 'SMT111' |
||
1802 | >>> caverphone('Schmidt', 1) |
||
1803 | 'SKMT11' |
||
1804 | """ |
||
1805 | _vowels = {'a', 'e', 'i', 'o', 'u'} |
||
1806 | |||
1807 | word = word.lower() |
||
1808 | word = ''.join(c for c in word if c in |
||
1809 | {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', |
||
1810 | 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', |
||
1811 | 'y', 'z'}) |
||
1812 | |||
1813 | def _squeeze_replace(word, char, new_char): |
||
1814 | """Convert strings of char in word to one instance of new_char.""" |
||
1815 | while char * 2 in word: |
||
1816 | word = word.replace(char * 2, char) |
||
1817 | return word.replace(char, new_char) |
||
1818 | |||
1819 | # the main replacement algorithm |
||
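| # In the rewrite rules below, digits act as markers: '2' flags letters |
||
| # to delete, '3' stands in for vowels, and uppercase letters are final |
||
| # code symbols that the later (lowercase) rules no longer touch. The |
||
| # '2's and '3's are stripped before the code is padded with '1's. |
||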
1820 | if version != 1 and word[-1:] == 'e': |
||
1821 | word = word[:-1] |
||
1822 | if word: |
||
1823 | if word[:5] == 'cough': |
||
1824 | word = 'cou2f'+word[5:] |
||
1825 | if word[:5] == 'rough': |
||
1826 | word = 'rou2f'+word[5:] |
||
1827 | if word[:5] == 'tough': |
||
1828 | word = 'tou2f'+word[5:] |
||
1829 | if word[:6] == 'enough': |
||
1830 | word = 'enou2f'+word[6:] |
||
1831 | if version != 1 and word[:6] == 'trough': |
||
1832 | word = 'trou2f'+word[6:] |
||
1833 | if word[:2] == 'gn': |
||
1834 | word = '2n'+word[2:] |
||
1835 | if word[-2:] == 'mb': |
||
1836 | word = word[:-1]+'2' |
||
1837 | word = word.replace('cq', '2q') |
||
1838 | word = word.replace('ci', 'si') |
||
1839 | word = word.replace('ce', 'se') |
||
1840 | word = word.replace('cy', 'sy') |
||
1841 | word = word.replace('tch', '2ch') |
||
1842 | word = word.replace('c', 'k') |
||
1843 | word = word.replace('q', 'k') |
||
1844 | word = word.replace('x', 'k') |
||
1845 | word = word.replace('v', 'f') |
||
1846 | word = word.replace('dg', '2g') |
||
1847 | word = word.replace('tio', 'sio') |
||
1848 | word = word.replace('tia', 'sia') |
||
1849 | word = word.replace('d', 't') |
||
1850 | word = word.replace('ph', 'fh') |
||
1851 | word = word.replace('b', 'p') |
||
1852 | word = word.replace('sh', 's2') |
||
1853 | word = word.replace('z', 's') |
||
1854 | if word[0] in _vowels: |
||
1855 | word = 'A'+word[1:] |
||
1856 | word = word.replace('a', '3') |
||
1857 | word = word.replace('e', '3') |
||
1858 | word = word.replace('i', '3') |
||
1859 | word = word.replace('o', '3') |
||
1860 | word = word.replace('u', '3') |
||
1861 | if version != 1: |
||
1862 | word = word.replace('j', 'y') |
||
1863 | if word[:2] == 'y3': |
||
1864 | word = 'Y3'+word[2:] |
||
1865 | if word[:1] == 'y': |
||
1866 | word = 'A'+word[1:] |
||
1867 | word = word.replace('y', '3') |
||
1868 | word = word.replace('3gh3', '3kh3') |
||
1869 | word = word.replace('gh', '22') |
||
1870 | word = word.replace('g', 'k') |
||
1871 | |||
1872 | word = _squeeze_replace(word, 's', 'S') |
||
1873 | word = _squeeze_replace(word, 't', 'T') |
||
1874 | word = _squeeze_replace(word, 'p', 'P') |
||
1875 | word = _squeeze_replace(word, 'k', 'K') |
||
1876 | word = _squeeze_replace(word, 'f', 'F') |
||
1877 | word = _squeeze_replace(word, 'm', 'M') |
||
1878 | word = _squeeze_replace(word, 'n', 'N') |
||
1879 | |||
1880 | word = word.replace('w3', 'W3') |
||
1881 | if version == 1: |
||
1882 | word = word.replace('wy', 'Wy') |
||
1883 | word = word.replace('wh3', 'Wh3') |
||
1884 | if version == 1: |
||
1885 | word = word.replace('why', 'Why') |
||
1886 | if version != 1 and word[-1:] == 'w': |
||
1887 | word = word[:-1]+'3' |
||
1888 | word = word.replace('w', '2') |
||
1889 | if word[:1] == 'h': |
||
1890 | word = 'A'+word[1:] |
||
1891 | word = word.replace('h', '2') |
||
1892 | word = word.replace('r3', 'R3') |
||
1893 | if version == 1: |
||
1894 | word = word.replace('ry', 'Ry') |
||
1895 | if version != 1 and word[-1:] == 'r': |
||
1896 | word = word[:-1]+'3' |
||
1897 | word = word.replace('r', '2') |
||
1898 | word = word.replace('l3', 'L3') |
||
1899 | if version == 1: |
||
1900 | word = word.replace('ly', 'Ly') |
||
1901 | if version != 1 and word[-1:] == 'l': |
||
1902 | word = word[:-1]+'3' |
||
1903 | word = word.replace('l', '2') |
||
1904 | if version == 1: |
||
1905 | word = word.replace('j', 'y') |
||
1906 | word = word.replace('y3', 'Y3') |
||
1907 | word = word.replace('y', '2') |
||
1908 | word = word.replace('2', '') |
||
1909 | if version != 1 and word[-1:] == '3': |
||
1910 | word = word[:-1]+'A' |
||
1911 | word = word.replace('3', '') |
||
1912 | |||
1913 | # pad with 1s, then extract the necessary length of code |
||
1914 | word = word+'1'*10 |
||
1915 | if version != 1: |
||
1916 | word = word[:10] |
||
1917 | else: |
||
1918 | word = word[:6] |
||
1919 | |||
1920 | return word |
||
1921 | |||
1922 | |||
1923 | def alpha_sis(word, maxlength=14): |
||
1924 | """Return the IBM Alpha Search Inquiry System code for a word. |
||
1925 | |||
1926 | The Alpha Search Inquiry System code is defined in :cite:`IBM:1973`. |
||
1927 | This implementation is based on the description in :cite:`Moore:1977`. |
||
1928 | |||
1929 | Because a single word can have multiple codes, a collection is |
||
1930 | returned. The collection is ordered: its first value is the primary |
||
1931 | coding. |
||
1932 | |||
1933 | :param str word: the word to transform |
||
1934 | :param int maxlength: the length of the code returned (defaults to 14) |
||
1935 | :returns: the Alpha SIS value |
||
1936 | :rtype: tuple |
||
1937 | |||
1938 | >>> alpha_sis('Christopher') |
||
1939 | ('06401840000000', '07040184000000', '04018400000000') |
||
1940 | >>> alpha_sis('Niall') |
||
1941 | ('02500000000000',) |
||
1942 | >>> alpha_sis('Smith') |
||
1943 | ('03100000000000',) |
||
1944 | >>> alpha_sis('Schmidt') |
||
1945 | ('06310000000000',) |
||
1946 | """ |
||
1947 | _alpha_sis_initials = {'GF': '08', 'GM': '03', 'GN': '02', 'KN': '02', |
||
1948 | 'PF': '08', 'PN': '02', 'PS': '00', 'WR': '04', |
||
1949 | 'A': '1', 'E': '1', 'H': '2', 'I': '1', 'J': '3', |
||
1950 | 'O': '1', 'U': '1', 'W': '4', 'Y': '5'} |
||
1951 | _alpha_sis_initials_order = ('GF', 'GM', 'GN', 'KN', 'PF', 'PN', 'PS', |
||
1952 | 'WR', 'A', 'E', 'H', 'I', 'J', 'O', 'U', 'W', |
||
1953 | 'Y') |
||
1954 | _alpha_sis_basic = {'SCH': '6', 'CZ': ('70', '6', '0'), |
||
1955 | 'CH': ('6', '70', '0'), 'CK': ('7', '6'), |
||
1956 | 'DS': ('0', '10'), 'DZ': ('0', '10'), |
||
1957 | 'TS': ('0', '10'), 'TZ': ('0', '10'), 'CI': '0', |
||
1958 | 'CY': '0', 'CE': '0', 'SH': '6', 'DG': '7', 'PH': '8', |
||
1959 | 'C': ('7', '6'), 'K': ('7', '6'), 'Z': '0', 'S': '0', |
||
1960 | 'D': '1', 'T': '1', 'N': '2', 'M': '3', 'R': '4', |
||
1961 | 'L': '5', 'J': '6', 'G': '7', 'Q': '7', 'X': '7', |
||
1962 | 'F': '8', 'V': '8', 'B': '9', 'P': '9'} |
||
1963 | _alpha_sis_basic_order = ('SCH', 'CZ', 'CH', 'CK', 'DS', 'DZ', 'TS', 'TZ', |
||
1964 | 'CI', 'CY', 'CE', 'SH', 'DG', 'PH', 'C', 'K', |
||
1965 | 'Z', 'S', 'D', 'T', 'N', 'M', 'R', 'L', 'J', 'C', |
||
1966 | 'G', 'K', 'Q', 'X', 'F', 'V', 'B', 'P') |
||
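| # Tuple-valued entries in _alpha_sis_basic are alternative codings; the |
||
| # encoding loop below branches on each of them, which is why a single |
||
| # word can yield several codes (returned with the primary coding first). |
||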
1967 | |||
1968 | alpha = [''] |
||
1969 | pos = 0 |
||
1970 | word = normalize('NFKD', text_type(word.upper())) |
||
1971 | word = word.replace('ß', 'SS') |
||
1972 | word = ''.join(c for c in word if c in |
||
1973 | {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
||
1974 | 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
||
1975 | 'Y', 'Z'}) |
||
1976 | |||
1977 | # Clamp maxlength to [4, 64] |
||
1978 | if maxlength is not None: |
||
1979 | maxlength = min(max(4, maxlength), 64) |
||
1980 | else: |
||
1981 | maxlength = 64 |
||
1982 | |||
1983 | # Do special processing for initial substrings |
||
1984 | for k in _alpha_sis_initials_order: |
||
1985 | if word.startswith(k): |
||
1986 | alpha[0] += _alpha_sis_initials[k] |
||
1987 | pos += len(k) |
||
1988 | break |
||
1989 | |||
1990 | # Add a '0' if alpha is still empty |
||
1991 | if not alpha[0]: |
||
1992 | alpha[0] += '0' |
||
1993 | |||
1994 | # Whether or not any special initial codes were encoded, iterate |
||
1995 | # through the length of the word in the main encoding loop |
||
1996 | while pos < len(word): |
||
1997 | origpos = pos |
||
1998 | for k in _alpha_sis_basic_order: |
||
1999 | if word[pos:].startswith(k): |
||
2000 | if isinstance(_alpha_sis_basic[k], tuple): |
||
2001 | newalpha = [] |
||
2002 | for i in range(len(_alpha_sis_basic[k])): |
||
2003 | newalpha += [_ + _alpha_sis_basic[k][i] for _ in alpha] |
||
2004 | alpha = newalpha |
||
2005 | else: |
||
2006 | alpha = [_ + _alpha_sis_basic[k] for _ in alpha] |
||
2007 | pos += len(k) |
||
2008 | break |
||
2009 | if pos == origpos: |
||
2010 | alpha = [_ + '_' for _ in alpha] |
||
2011 | pos += 1 |
||
2012 | |||
2013 | # Trim doublets and placeholders |
||
2014 | for i in range(len(alpha)): |
||
2015 | pos = 1 |
||
2016 | while pos < len(alpha[i]): |
||
2017 | if alpha[i][pos] == alpha[i][pos-1]: |
||
2018 | alpha[i] = alpha[i][:pos]+alpha[i][pos+1:] |
||
2019 | pos += 1 |
||
2020 | alpha = (_.replace('_', '') for _ in alpha) |
||
2021 | |||
2022 | # Trim codes and return tuple |
||
2023 | alpha = ((_ + ('0'*maxlength))[:maxlength] for _ in alpha) |
||
2024 | return tuple(alpha) |
||
2025 | |||
2026 | |||
2027 | def fuzzy_soundex(word, maxlength=5, zero_pad=True): |
||
2028 | """Return the Fuzzy Soundex code for a word. |
||
2029 | |||
2030 | Fuzzy Soundex is an algorithm derived from Soundex, defined in |
||
2031 | :cite:`Holmes:2002`. |
||
2032 | |||
2033 | :param str word: the word to transform |
||
2034 | :param int maxlength: the length of the code returned (defaults to 5) |
||
2035 | :param bool zero_pad: pad the end of the return value with 0s to achieve |
||
2036 | a maxlength string |
||
2037 | :returns: the Fuzzy Soundex value |
||
2038 | :rtype: str |
||
2039 | |||
2040 | >>> fuzzy_soundex('Christopher') |
||
2041 | 'K6931' |
||
2042 | >>> fuzzy_soundex('Niall') |
||
2043 | 'N4000' |
||
2044 | >>> fuzzy_soundex('Smith') |
||
2045 | 'S5300' |
||
2046 | >>> fuzzy_soundex('Schmidt') |
||
2047 | 'S5300' |
||
2048 | """ |
||
2049 | _fuzzy_soundex_translation = dict(zip((ord(_) for _ in |
||
2050 | 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'), |
||
2051 | '0193017-07745501769301-7-9')) |
||
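| # In the translation table, '-' marks H, W, and Y (stripped immediately |
||
| # after translation) and '0' marks vowels (stripped once the initial |
||
| # letter has been restored). |
||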
2052 | |||
2053 | word = normalize('NFKD', text_type(word.upper())) |
||
2054 | word = word.replace('ß', 'SS') |
||
2055 | |||
2056 | # Clamp maxlength to [4, 64] |
||
2057 | if maxlength is not None: |
||
2058 | maxlength = min(max(4, maxlength), 64) |
||
2059 | else: |
||
2060 | maxlength = 64 |
||
2061 | |||
2062 | if not word: |
||
2063 | if zero_pad: |
||
2064 | return '0' * maxlength |
||
2065 | return '0' |
||
2066 | |||
2067 | if word[:2] in {'CS', 'CZ', 'TS', 'TZ'}: |
||
2068 | word = 'SS' + word[2:] |
||
2069 | elif word[:2] == 'GN': |
||
2070 | word = 'NN' + word[2:] |
||
2071 | elif word[:2] in {'HR', 'WR'}: |
||
2072 | word = 'RR' + word[2:] |
||
2073 | elif word[:2] == 'HW': |
||
2074 | word = 'WW' + word[2:] |
||
2075 | elif word[:2] in {'KN', 'NG'}: |
||
2076 | word = 'NN' + word[2:] |
||
2077 | |||
2078 | if word[-2:] == 'CH': |
||
2079 | word = word[:-2] + 'KK' |
||
2080 | elif word[-2:] == 'NT': |
||
2081 | word = word[:-2] + 'TT' |
||
2082 | elif word[-2:] == 'RT': |
||
2083 | word = word[:-2] + 'RR' |
||
2084 | elif word[-3:] == 'RDT': |
||
2085 | word = word[:-3] + 'RR' |
||
2086 | |||
2087 | word = word.replace('CA', 'KA') |
||
2088 | word = word.replace('CC', 'KK') |
||
2089 | word = word.replace('CK', 'KK') |
||
2090 | word = word.replace('CE', 'SE') |
||
2091 | word = word.replace('CHL', 'KL') |
||
2092 | word = word.replace('CL', 'KL') |
||
2093 | word = word.replace('CHR', 'KR') |
||
2094 | word = word.replace('CR', 'KR') |
||
2095 | word = word.replace('CI', 'SI') |
||
2096 | word = word.replace('CO', 'KO') |
||
2097 | word = word.replace('CU', 'KU') |
||
2098 | word = word.replace('CY', 'SY') |
||
2099 | word = word.replace('DG', 'GG') |
||
2100 | word = word.replace('GH', 'HH') |
||
2101 | word = word.replace('MAC', 'MK') |
||
2102 | word = word.replace('MC', 'MK') |
||
2103 | word = word.replace('NST', 'NSS') |
||
2104 | word = word.replace('PF', 'FF') |
||
2105 | word = word.replace('PH', 'FF') |
||
2106 | word = word.replace('SCH', 'SSS') |
||
2107 | word = word.replace('TIO', 'SIO') |
||
2108 | word = word.replace('TIA', 'SIO') |
||
2109 | word = word.replace('TCH', 'CHH') |
||
2110 | |||
2111 | sdx = word.translate(_fuzzy_soundex_translation) |
||
2112 | sdx = sdx.replace('-', '') |
||
2113 | |||
2114 | # remove repeating characters |
||
2115 | sdx = _delete_consecutive_repeats(sdx) |
||
2116 | |||
2117 | if word[0] in {'H', 'W', 'Y'}: |
||
2118 | sdx = word[0] + sdx |
||
2119 | else: |
||
2120 | sdx = word[0] + sdx[1:] |
||
2121 | |||
2122 | sdx = sdx.replace('0', '') |
||
2123 | |||
2124 | if zero_pad: |
||
2125 | sdx += ('0'*maxlength) |
||
2126 | |||
2127 | return sdx[:maxlength] |
||
2128 | |||
2129 | |||
2130 | def phonex(word, maxlength=4, zero_pad=True): |
||
2131 | """Return the Phonex code for a word. |
||
2132 | |||
2133 | Phonex is an algorithm derived from Soundex, defined in :cite:`Lait:1996`. |
||
2134 | |||
2135 | :param str word: the word to transform |
||
2136 | :param int maxlength: the length of the code returned (defaults to 4) |
||
2137 | :param bool zero_pad: pad the end of the return value with 0s to achieve |
||
2138 | a maxlength string |
||
2139 | :returns: the Phonex value |
||
2140 | :rtype: str |
||
2141 | |||
2142 | >>> phonex('Christopher') |
||
2143 | 'C623' |
||
2144 | >>> phonex('Niall') |
||
2145 | 'N400' |
||
2146 | >>> phonex('Schmidt') |
||
2147 | 'S253' |
||
2148 | >>> phonex('Smith') |
||
2149 | 'S530' |
||
2150 | """ |
||
2151 | name = normalize('NFKD', text_type(word.upper())) |
||
2152 | name = name.replace('ß', 'SS') |
||
2153 | |||
2154 | # Clamp maxlength to [4, 64] |
||
2155 | if maxlength is not None: |
||
2156 | maxlength = min(max(4, maxlength), 64) |
||
2157 | else: |
||
2158 | maxlength = 64 |
||
2159 | |||
2160 | name_code = last = '' |
||
2161 | |||
2162 | # Deletions effected by replacing with next letter which |
||
2163 | # will be ignored due to duplicate handling of Soundex code. |
||
2164 | # This is faster than 'moving' all subsequent letters. |
||
2165 | |||
2166 | # Remove any trailing Ss |
||
2167 | while name[-1:] == 'S': |
||
2168 | name = name[:-1] |
||
2169 | |||
2170 | # Phonetic equivalents of first 2 characters |
||
2171 | # Works since duplicate letters are ignored |
||
2172 | if name[:2] == 'KN': |
||
2173 | name = 'N' + name[2:] # KN.. == N.. |
||
2174 | elif name[:2] == 'PH': |
||
2175 | name = 'F' + name[2:] # PH.. == F.. (H ignored anyway) |
||
2176 | elif name[:2] == 'WR': |
||
2177 | name = 'R' + name[2:] # WR.. == R.. |
||
2178 | |||
2179 | if name: |
||
2180 | # Special case, ignore H first letter (subsequent Hs ignored anyway) |
||
2181 | # Works since duplicate letters are ignored |
||
2182 | if name[0] == 'H': |
||
2183 | name = name[1:] |
||
2184 | |||
2185 | if name: |
||
2186 | # Phonetic equivalents of first character |
||
2187 | if name[0] in {'A', 'E', 'I', 'O', 'U', 'Y'}: |
||
2188 | name = 'A' + name[1:] |
||
2189 | elif name[0] in {'B', 'P'}: |
||
2190 | name = 'B' + name[1:] |
||
2191 | elif name[0] in {'V', 'F'}: |
||
2192 | name = 'F' + name[1:] |
||
2193 | elif name[0] in {'C', 'K', 'Q'}: |
||
2194 | name = 'C' + name[1:] |
||
2195 | elif name[0] in {'G', 'J'}: |
||
2196 | name = 'G' + name[1:] |
||
2197 | elif name[0] in {'S', 'Z'}: |
||
2198 | name = 'S' + name[1:] |
||
2199 | |||
2200 | name_code = last = name[0] |
||
2201 | |||
2202 | # MODIFIED SOUNDEX CODE |
||
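| # Letters map to Soundex-like digit classes, with context checks: D/T |
||
| # are not coded before 'C'; L and R are coded only before a vowel or |
||
| # 'Y' or at the end of the name; and a 'D' or 'G' after M/N is |
||
| # overwritten with the M/N so that it is later skipped as a duplicate. |
||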
2203 | for i in range(1, len(name)): |
||
2204 | code = '0' |
||
2205 | if name[i] in {'B', 'F', 'P', 'V'}: |
||
2206 | code = '1' |
||
2207 | elif name[i] in {'C', 'G', 'J', 'K', 'Q', 'S', 'X', 'Z'}: |
||
2208 | code = '2' |
||
2209 | elif name[i] in {'D', 'T'}: |
||
2210 | if name[i+1:i+2] != 'C': |
||
2211 | code = '3' |
||
2212 | elif name[i] == 'L': |
||
2213 | if (name[i+1:i+2] in {'A', 'E', 'I', 'O', 'U', 'Y'} or |
||
2214 | i+1 == len(name)): |
||
2215 | code = '4' |
||
2216 | elif name[i] in {'M', 'N'}: |
||
2217 | if name[i+1:i+2] in {'D', 'G'}: |
||
2218 | name = name[:i+1] + name[i] + name[i+2:] |
||
2219 | code = '5' |
||
2220 | elif name[i] == 'R': |
||
2221 | if (name[i+1:i+2] in {'A', 'E', 'I', 'O', 'U', 'Y'} or |
||
2222 | i+1 == len(name)): |
||
2223 | code = '6' |
||
2224 | |||
2225 | if code != last and code != '0' and i != 0: |
||
2226 | name_code += code |
||
2227 | |||
2228 | last = name_code[-1] |
||
2229 | |||
2230 | if zero_pad: |
||
2231 | name_code += '0' * maxlength |
||
2232 | if not name_code: |
||
2233 | name_code = '0' |
||
2234 | return name_code[:maxlength] |
||
2235 | |||
2236 | |||
2237 | def phonem(word): |
||
2238 | """Return the Phonem code for a word. |
||
2239 | |||
2240 | Phonem is defined in :cite:`Wilde:1988`. |
||
2241 | |||
2242 | This version is based on the Perl implementation documented at |
||
2243 | :cite:`Wilz:2005`. |
||
2244 | It includes some enhancements presented in the Java port at |
||
2245 | :cite:`dcm4che:2011`. |
||
2246 | |||
2247 | Phonem is intended chiefly for German names/words. |
||
2248 | |||
2249 | :param str word: the word to transform |
||
2250 | :returns: the Phonem value |
||
2251 | :rtype: str |
||
2252 | |||
2253 | >>> phonem('Christopher') |
||
2254 | 'CRYSDOVR' |
||
2255 | >>> phonem('Niall') |
||
2256 | 'NYAL' |
||
2257 | >>> phonem('Smith') |
||
2258 | 'SMYD' |
||
2259 | >>> phonem('Schmidt') |
||
2260 | 'CMYD' |
||
2261 | """ |
||
2262 | _phonem_substitutions = (('SC', 'C'), ('SZ', 'C'), ('CZ', 'C'), |
||
2263 | ('TZ', 'C'), ('TS', 'C'), ('KS', 'X'), |
||
2264 | ('PF', 'V'), ('QU', 'KW'), ('PH', 'V'), |
||
2265 | ('UE', 'Y'), ('AE', 'E'), ('OE', 'Ö'), |
||
2266 | ('EI', 'AY'), ('EY', 'AY'), ('EU', 'OY'), |
||
2267 | ('AU', 'A§'), ('OU', '§')) |
||
2268 | _phonem_translation = dict(zip((ord(_) for _ in |
||
2269 | 'ZKGQÇÑßFWPTÁÀÂÃÅÄÆÉÈÊËIJÌÍÎÏÜݧÚÙÛÔÒÓÕØ'), |
||
2270 | 'CCCCCNSVVBDAAAAAEEEEEEYYYYYYYYUUUUOOOOÖ')) |
||
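| # The digraph substitutions are applied first, in the order listed, |
||
| # then single characters are mapped through _phonem_translation; after |
||
| # collapsing consecutive repeats, only letters of the Phonem output |
||
| # alphabet are kept. |
||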
2271 | |||
2272 | word = normalize('NFC', text_type(word.upper())) |
||
2273 | for i, j in _phonem_substitutions: |
||
2274 | word = word.replace(i, j) |
||
2275 | word = word.translate(_phonem_translation) |
||
2276 | |||
2277 | return ''.join(c for c in _delete_consecutive_repeats(word) |
||
2278 | if c in {'A', 'B', 'C', 'D', 'L', 'M', 'N', 'O', 'R', 'S', |
||
2279 | 'U', 'V', 'W', 'X', 'Y', 'Ö'}) |
||
2280 | |||
2281 | |||
2282 | def phonix(word, maxlength=4, zero_pad=True): |
||
2283 | """Return the Phonix code for a word. |
||
2284 | |||
2285 | Phonix is a Soundex-like algorithm defined in :cite:`Gadd:1990`. |
||
2286 | |||
2287 | This implementation is based on: |
||
2288 | - :cite:`Pfeifer:2000` |
||
2289 | - :cite:`Christen:2011` |
||
2290 | - :cite:`Kollar:2007` |
||
2291 | |||
2292 | :param str word: the word to transform |
||
2293 | :param int maxlength: the length of the code returned (defaults to 4) |
||
2294 | :param bool zero_pad: pad the end of the return value with 0s to achieve |
||
2295 | a maxlength string |
||
2296 | :returns: the Phonix value |
||
2297 | :rtype: str |
||
2298 | |||
2299 | >>> phonix('Christopher') |
||
2300 | 'K683' |
||
2301 | >>> phonix('Niall') |
||
2302 | 'N400' |
||
2303 | >>> phonix('Smith') |
||
2304 | 'S530' |
||
2305 | >>> phonix('Schmidt') |
||
2306 | 'S530' |
||
2307 | """ |
||
2308 | def _start_repl(word, src, tar, post=None): |
||
2309 | r"""Replace src with tar at the start of word.""" |
||
2310 | if post: |
||
2311 | for i in post: |
||
2312 | if word.startswith(src+i): |
||
2313 | return tar + word[len(src):] |
||
2314 | elif word.startswith(src): |
||
2315 | return tar + word[len(src):] |
||
2316 | return word |
||
2317 | |||
2318 | def _end_repl(word, src, tar, pre=None): |
||
2319 | r"""Replace src with tar at the end of word.""" |
||
2320 | if pre: |
||
2321 | for i in pre: |
||
2322 | if word.endswith(i+src): |
||
2323 | return word[:-len(src)] + tar |
||
2324 | elif word.endswith(src): |
||
2325 | return word[:-len(src)] + tar |
||
2326 | return word |
||
2327 | |||
2328 | def _mid_repl(word, src, tar, pre=None, post=None): |
||
2329 | r"""Replace src with tar in the middle of word.""" |
||
2330 | if pre or post: |
||
2331 | if not pre: |
||
2332 | return word[0] + _all_repl(word[1:], src, tar, pre, post) |
||
2333 | elif not post: |
||
2334 | return _all_repl(word[:-1], src, tar, pre, post) + word[-1] |
||
2335 | return _all_repl(word, src, tar, pre, post) |
||
2336 | return (word[0] + _all_repl(word[1:-1], src, tar, pre, post) + |
||
2337 | word[-1]) |
||
2338 | |||
2339 | def _all_repl(word, src, tar, pre=None, post=None): |
||
2340 | r"""Replace src with tar anywhere in word.""" |
||
2341 | if pre or post: |
||
2342 | if post: |
||
2343 | post = post |
||
2344 | else: |
||
2345 | post = frozenset(('',)) |
||
2346 | if pre: |
||
2347 | pre = pre |
||
2348 | else: |
||
2349 | pre = frozenset(('',)) |
||
2350 | |||
2351 | for i, j in ((i, j) for i in pre for j in post): |
||
2352 | word = word.replace(i+src+j, i+tar+j) |
||
2353 | return word |
||
2354 | else: |
||
2355 | return word.replace(src, tar) |
||
2356 | |||
2357 | _vow = {'A', 'E', 'I', 'O', 'U'} |
||
2358 | _con = {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', |
||
2359 | 'R', 'S', 'T', 'V', 'W', 'X', 'Y', 'Z'} |
||
2360 | |||
2361 | _phonix_substitutions = ((_all_repl, 'DG', 'G'), |
||
2362 | (_all_repl, 'CO', 'KO'), |
||
2363 | (_all_repl, 'CA', 'KA'), |
||
2364 | (_all_repl, 'CU', 'KU'), |
||
2365 | (_all_repl, 'CY', 'SI'), |
||
2366 | (_all_repl, 'CI', 'SI'), |
||
2367 | (_all_repl, 'CE', 'SE'), |
||
2368 | (_start_repl, 'CL', 'KL', _vow), |
||
2369 | (_all_repl, 'CK', 'K'), |
||
2370 | (_end_repl, 'GC', 'K'), |
||
2371 | (_end_repl, 'JC', 'K'), |
||
2372 | (_start_repl, 'CHR', 'KR', _vow), |
||
2373 | (_start_repl, 'CR', 'KR', _vow), |
||
2374 | (_start_repl, 'WR', 'R'), |
||
2375 | (_all_repl, 'NC', 'NK'), |
||
2376 | (_all_repl, 'CT', 'KT'), |
||
2377 | (_all_repl, 'PH', 'F'), |
||
2378 | (_all_repl, 'AA', 'AR'), |
||
2379 | (_all_repl, 'SCH', 'SH'), |
||
2380 | (_all_repl, 'BTL', 'TL'), |
||
2381 | (_all_repl, 'GHT', 'T'), |
||
2382 | (_all_repl, 'AUGH', 'ARF'), |
||
2383 | (_mid_repl, 'LJ', 'LD', _vow, _vow), |
||
2384 | (_all_repl, 'LOUGH', 'LOW'), |
||
2385 | (_start_repl, 'Q', 'KW'), |
||
2386 | (_start_repl, 'KN', 'N'), |
||
2387 | (_end_repl, 'GN', 'N'), |
||
2388 | (_all_repl, 'GHN', 'N'), |
||
2389 | (_end_repl, 'GNE', 'N'), |
||
2390 | (_all_repl, 'GHNE', 'NE'), |
||
2391 | (_end_repl, 'GNES', 'NS'), |
||
2392 | (_start_repl, 'GN', 'N'), |
||
2393 | (_mid_repl, 'GN', 'N', None, _con), |
||
2394 | (_end_repl, 'GN', 'N'), |
||
2395 | (_start_repl, 'PS', 'S'), |
||
2396 | (_start_repl, 'PT', 'T'), |
||
2397 | (_start_repl, 'CZ', 'C'), |
||
2398 | (_mid_repl, 'WZ', 'Z', _vow), |
||
2399 | (_mid_repl, 'CZ', 'CH'), |
||
2400 | (_all_repl, 'LZ', 'LSH'), |
||
2401 | (_all_repl, 'RZ', 'RSH'), |
||
2402 | (_mid_repl, 'Z', 'S', None, _vow), |
||
2403 | (_all_repl, 'ZZ', 'TS'), |
||
2404 | (_mid_repl, 'Z', 'TS', _con), |
||
2405 | (_all_repl, 'HROUG', 'REW'), |
||
2406 | (_all_repl, 'OUGH', 'OF'), |
||
2407 | (_mid_repl, 'Q', 'KW', _vow, _vow), |
||
2408 | (_mid_repl, 'J', 'Y', _vow, _vow), |
||
2409 | (_start_repl, 'YJ', 'Y', _vow), |
||
2410 | (_start_repl, 'GH', 'G'), |
||
2411 | (_end_repl, 'GH', 'E', _vow), |
||
2412 | (_start_repl, 'CY', 'S'), |
||
2413 | (_all_repl, 'NX', 'NKS'), |
||
2414 | (_start_repl, 'PF', 'F'), |
||
2415 | (_end_repl, 'DT', 'T'), |
||
2416 | (_end_repl, 'TL', 'TIL'), |
||
2417 | (_end_repl, 'DL', 'DIL'), |
||
2418 | (_all_repl, 'YTH', 'ITH'), |
||
2419 | (_start_repl, 'TJ', 'CH', _vow), |
||
2420 | (_start_repl, 'TSJ', 'CH', _vow), |
||
2421 | (_start_repl, 'TS', 'T', _vow), |
||
2422 | (_all_repl, 'TCH', 'CH'), |
||
2423 | (_mid_repl, 'WSK', 'VSKIE', _vow), |
||
2424 | (_end_repl, 'WSK', 'VSKIE', _vow), |
||
2425 | (_start_repl, 'MN', 'N', _vow), |
||
2426 | (_start_repl, 'PN', 'N', _vow), |
||
2427 | (_mid_repl, 'STL', 'SL', _vow), |
||
2428 | (_end_repl, 'STL', 'SL', _vow), |
||
2429 | (_end_repl, 'TNT', 'ENT'), |
||
2430 | (_end_repl, 'EAUX', 'OH'), |
||
2431 | (_all_repl, 'EXCI', 'ECS'), |
||
2432 | (_all_repl, 'X', 'ECS'), |
||
2433 | (_end_repl, 'NED', 'ND'), |
||
2434 | (_all_repl, 'JR', 'DR'), |
||
2435 | (_end_repl, 'EE', 'EA'), |
||
2436 | (_all_repl, 'ZS', 'S'), |
||
2437 | (_mid_repl, 'R', 'AH', _vow, _con), |
||
2438 | (_end_repl, 'R', 'AH', _vow), |
||
2439 | (_mid_repl, 'HR', 'AH', _vow, _con), |
||
2440 | (_end_repl, 'HR', 'AH', _vow), |
||
2441 | (_end_repl, 'HR', 'AH', _vow), |
||
2442 | (_end_repl, 'RE', 'AR'), |
||
2443 | (_end_repl, 'R', 'AH', _vow), |
||
2444 | (_all_repl, 'LLE', 'LE'), |
||
2445 | (_end_repl, 'LE', 'ILE', _con), |
||
2446 | (_end_repl, 'LES', 'ILES', _con), |
||
2447 | (_end_repl, 'E', ''), |
||
2448 | (_end_repl, 'ES', 'S'), |
||
2449 | (_end_repl, 'SS', 'AS', _vow), |
||
2450 | (_end_repl, 'MB', 'M', _vow), |
||
2451 | (_all_repl, 'MPTS', 'MPS'), |
||
2452 | (_all_repl, 'MPS', 'MS'), |
||
2453 | (_all_repl, 'MPT', 'MT')) |
||
2454 | |||
2455 | _phonix_translation = dict(zip((ord(_) for _ in |
||
2456 | 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'), |
||
2457 | '01230720022455012683070808')) |
||
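| # After the substitution rules run, the first letter is kept (or coded |
||
| # 'v' if it is a vowel or 'Y') and the rest is mapped through |
||
| # _phonix_translation; code '0' (vowels plus H, W, and Y) is stripped |
||
| # once adjacent duplicates have been removed. |
||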
2458 | |||
2459 | sdx = '' |
||
2460 | |||
2461 | word = normalize('NFKD', text_type(word.upper())) |
||
2462 | word = word.replace('ß', 'SS') |
||
2463 | word = ''.join(c for c in word if c in |
||
2464 | {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
||
2465 | 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
||
2466 | 'Y', 'Z'}) |
||
2467 | if word: |
||
2468 | for trans in _phonix_substitutions: |
||
2469 | word = trans[0](word, *trans[1:]) |
||
2470 | if word[0] in {'A', 'E', 'I', 'O', 'U', 'Y'}: |
||
2471 | sdx = 'v' + word[1:].translate(_phonix_translation) |
||
2472 | else: |
||
2473 | sdx = word[0] + word[1:].translate(_phonix_translation) |
||
2474 | sdx = _delete_consecutive_repeats(sdx) |
||
2475 | sdx = sdx.replace('0', '') |
||
2476 | |||
2477 | # Clamp maxlength to [4, 64] |
||
2478 | if maxlength is not None: |
||
2479 | maxlength = min(max(4, maxlength), 64) |
||
2480 | else: |
||
2481 | maxlength = 64 |
||
2482 | |||
2483 | if zero_pad: |
||
2484 | sdx += '0' * maxlength |
||
2485 | if not sdx: |
||
2486 | sdx = '0' |
||
2487 | return sdx[:maxlength] |
||
2488 | |||
2489 | |||
2490 | def sfinxbis(word, maxlength=None): |
||
2491 | """Return the SfinxBis code for a word. |
||
2492 | |||
2493 | SfinxBis is a Soundex-like algorithm defined in :cite:`Axelsson:2009`. |
||
2494 | |||
2495 | This implementation follows the reference implementation: |
||
2496 | :cite:`Sjoo:2009`. |
||
2497 | |||
2498 | SfinxBis is intended chiefly for Swedish names. |
||
2499 | |||
2500 | :param str word: the word to transform |
||
2501 | :param int maxlength: the length of the code returned (defaults to |
||
2502 | unlimited) |
||
2503 | :returns: the SfinxBis value |
||
2504 | :rtype: tuple |
||
2505 | |||
2506 | >>> sfinxbis('Christopher') |
||
2507 | ('K68376',) |
||
2508 | >>> sfinxbis('Niall') |
||
2509 | ('N4',) |
||
2510 | >>> sfinxbis('Smith') |
||
2511 | ('S53',) |
||
2512 | >>> sfinxbis('Schmidt') |
||
2513 | ('S53',) |
||
2514 | |||
2515 | >>> sfinxbis('Johansson') |
||
2516 | ('J585',) |
||
2517 | >>> sfinxbis('Sjöberg') |
||
2518 | ('#162',) |
||
2519 | """ |
||
2520 | adelstitler = (' DE LA ', ' DE LAS ', ' DE LOS ', ' VAN DE ', ' VAN DEN ', |
||
2521 | ' VAN DER ', ' VON DEM ', ' VON DER ', |
||
2522 | ' AF ', ' AV ', ' DA ', ' DE ', ' DEL ', ' DEN ', ' DES ', |
||
2523 | ' DI ', ' DO ', ' DON ', ' DOS ', ' DU ', ' E ', ' IN ', |
||
2524 | ' LA ', ' LE ', ' MAC ', ' MC ', ' VAN ', ' VON ', ' Y ', |
||
2525 | ' S:T ') |
||
2526 | |||
2527 | _harde_vokaler = {'A', 'O', 'U', 'Å'} |
||
2528 | _mjuka_vokaler = {'E', 'I', 'Y', 'Ä', 'Ö'} |
||
2529 | _konsonanter = {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', |
||
2530 | 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Z'} |
||
2531 | _alfabet = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
||
2532 | 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
||
2533 | 'Y', 'Z', 'Ä', 'Å', 'Ö'} |
||
2534 | |||
2535 | _sfinxbis_translation = dict(zip((ord(_) for _ in |
||
2536 | 'BCDFGHJKLMNPQRSTVZAOUÅEIYÄÖ'), |
||
2537 | '123729224551268378999999999')) |
||
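| # In _sfinxbis_translation every vowel (and 'H') codes to '9', which |
||
| # step 11 below removes; the '$' and '#' markers set by |
||
| # _koda_foersta_ljudet survive as the first character of the result, |
||
| # e.g. '#162' for 'Sjöberg'. |
||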
2538 | |||
2539 | _sfinxbis_substitutions = dict(zip((ord(_) for _ in |
||
2540 | 'WZÀÁÂÃÆÇÈÉÊËÌÍÎÏÑÒÓÔÕØÙÚÛÜÝ'), |
||
2541 | 'VSAAAAÄCEEEEIIIINOOOOÖUUUYY')) |
||
2542 | |||
2543 | def _foersvensker(ordet): |
||
2544 | """Return the Swedish-ized form of the word.""" |
||
2545 | ordet = ordet.replace('STIERN', 'STJÄRN') |
||
2546 | ordet = ordet.replace('HIE', 'HJ') |
||
2547 | ordet = ordet.replace('SIÖ', 'SJÖ') |
||
2548 | ordet = ordet.replace('SCH', 'SH') |
||
2549 | ordet = ordet.replace('QU', 'KV') |
||
2550 | ordet = ordet.replace('IO', 'JO') |
||
2551 | ordet = ordet.replace('PH', 'F') |
||
2552 | |||
2553 | for i in _harde_vokaler: |
||
2554 | ordet = ordet.replace(i+'Ü', i+'J') |
||
2555 | ordet = ordet.replace(i+'Y', i+'J') |
||
2556 | ordet = ordet.replace(i+'I', i+'J') |
||
2557 | for i in _mjuka_vokaler: |
||
2558 | ordet = ordet.replace(i+'Ü', i+'J') |
||
2559 | ordet = ordet.replace(i+'Y', i+'J') |
||
2560 | ordet = ordet.replace(i+'I', i+'J') |
||
2561 | |||
2562 | if 'H' in ordet: |
||
2563 | for i in _konsonanter: |
||
2564 | ordet = ordet.replace('H'+i, i) |
||
2565 | |||
2566 | ordet = ordet.translate(_sfinxbis_substitutions) |
||
2567 | |||
2568 | ordet = ordet.replace('Ð', 'ETH') |
||
2569 | ordet = ordet.replace('Þ', 'TH') |
||
2570 | ordet = ordet.replace('ß', 'SS') |
||
2571 | |||
2572 | return ordet |
||
2573 | |||
2574 | def _koda_foersta_ljudet(ordet): |
||
2575 | """Return the word with the first sound coded.""" |
||
2576 | if ordet[0:1] in _mjuka_vokaler or ordet[0:1] in _harde_vokaler: |
||
2577 | ordet = '$' + ordet[1:] |
||
2578 | elif ordet[0:2] in ('DJ', 'GJ', 'HJ', 'LJ'): |
||
2579 | ordet = 'J' + ordet[2:] |
||
2580 | elif ordet[0:1] == 'G' and ordet[1:2] in _mjuka_vokaler: |
||
2581 | ordet = 'J' + ordet[1:] |
||
2582 | elif ordet[0:1] == 'Q': |
||
2583 | ordet = 'K' + ordet[1:] |
||
2584 | elif (ordet[0:2] == 'CH' and |
||
2585 | ordet[2:3] in frozenset(_mjuka_vokaler | _harde_vokaler)): |
||
2586 | ordet = '#' + ordet[2:] |
||
2587 | elif ordet[0:1] == 'C' and ordet[1:2] in _harde_vokaler: |
||
2588 | ordet = 'K' + ordet[1:] |
||
2589 | elif ordet[0:1] == 'C' and ordet[1:2] in _konsonanter: |
||
2590 | ordet = 'K' + ordet[1:] |
||
2591 | elif ordet[0:1] == 'X': |
||
2592 | ordet = 'S' + ordet[1:] |
||
2593 | elif ordet[0:1] == 'C' and ordet[1:2] in _mjuka_vokaler: |
||
2594 | ordet = 'S' + ordet[1:] |
||
2595 | elif ordet[0:3] in ('SKJ', 'STJ', 'SCH'): |
||
2596 | ordet = '#' + ordet[3:] |
||
2597 | elif ordet[0:2] in ('SH', 'KJ', 'TJ', 'SJ'): |
||
2598 | ordet = '#' + ordet[2:] |
||
2599 | elif ordet[0:2] == 'SK' and ordet[2:3] in _mjuka_vokaler: |
||
2600 | ordet = '#' + ordet[2:] |
||
2601 | elif ordet[0:1] == 'K' and ordet[1:2] in _mjuka_vokaler: |
||
2602 | ordet = '#' + ordet[1:] |
||
2603 | return ordet |
||
2604 | |||
2605 | # Step 1: Convert to uppercase |
||
2606 | word = normalize('NFC', text_type(word.upper())) |
||
2607 | word = word.replace('ß', 'SS') |
||
2608 | word = word.replace('-', ' ') |
||
2609 | |||
2610 | # Step 2: Remove noble prefixes |
||
2611 | for adelstitel in adelstitler: |
||
2612 | while adelstitel in word: |
||
2613 | word = word.replace(adelstitel, ' ') |
||
2614 | if word.startswith(adelstitel[1:]): |
||
2615 | word = word[len(adelstitel)-1:] |
||
2616 | |||
2617 | # Split word into tokens |
||
2618 | ordlista = word.split() |
||
2619 | |||
2620 | # Step 3: Remove doubled letters at the beginning of the name |
||
2621 | ordlista = [_delete_consecutive_repeats(ordet) for ordet in ordlista] |
||
2622 | if not ordlista: |
||
2623 | return ('',) |
||
2624 | |||
2625 | # Step 4: Swedish-ize the tokens |
||
2626 | ordlista = [_foersvensker(ordet) for ordet in ordlista] |
||
2627 | |||
2628 | # Step 5: Remove all characters that are not A-Ö (65-90,196,197,214) |
||
2629 | ordlista = [''.join(c for c in ordet if c in _alfabet) |
||
2630 | for ordet in ordlista] |
||
2631 | |||
2632 | # Step 6: Code the first sound |
||
2633 | ordlista = [_koda_foersta_ljudet(ordet) for ordet in ordlista] |
||
2634 | |||
2635 | # Step 7: Split the name into two parts |
||
2636 | rest = [ordet[1:] for ordet in ordlista] |
||
2637 | |||
2638 | # Step 8: Apply the phonetic transformations to the remainder |
||
2639 | rest = [ordet.replace('DT', 'T') for ordet in rest] |
||
2640 | rest = [ordet.replace('X', 'KS') for ordet in rest] |
||
2641 | |||
2642 | # Step 9: Code the remainder as a digit code |
||
2643 | for vokal in _mjuka_vokaler: |
||
2644 | rest = [ordet.replace('C'+vokal, '8'+vokal) for ordet in rest] |
||
2645 | rest = [ordet.translate(_sfinxbis_translation) for ordet in rest] |
||
2646 | |||
2647 | # Step 10: Remove adjacent duplicates |
||
2648 | rest = [_delete_consecutive_repeats(ordet) for ordet in rest] |
||
2649 | |||
2650 | # Step 11: Remove all "9"s |
||
2651 | rest = [ordet.replace('9', '') for ordet in rest] |
||
2652 | |||
2653 | # Step 12: Join the parts back together |
||
2654 | ordlista = [''.join(ordet) for ordet in |
||
2655 | zip((_[0:1] for _ in ordlista), rest)] |
||
2656 | |||
2657 | # truncate, if maxlength is set |
||
2658 | if maxlength and maxlength < _INFINITY: |
||
2659 | ordlista = [ordet[:maxlength] for ordet in ordlista] |
||
2660 | |||
2661 | return tuple(ordlista) |
||
2662 | |||
2663 | |||
2664 | def phonet(word, mode=1, lang='de'): |
||
2665 | """Return the phonet code for a word. |
||
2666 | |||
2667 | phonet ("Hannoveraner Phonetik") was developed by Jörg Michael and |
||
2668 | documented in :cite:`Michael:1999`. |
||
2669 | |||
2670 | This is a port of Jesper Zedlitz's code, which is licensed LGPL |
||
2671 | :cite:`Zedlitz:2015`. |
||
2672 | |||
2673 | That is, in turn, based on Michael's C code, which is also licensed LGPL |
||
2674 | :cite:`Michael:2007`. |
||
2675 | |||
2676 | :param str word: the word to transform |
||
2677 | :param int mode: the phonet variant to employ (1 or 2) |
||
2678 | :param str lang: 'de' (default) for German |
||
2679 | 'none' for no language-specific rules |
||
2680 | :returns: the phonet value |
||
2681 | :rtype: str |
||
2682 | |||
2683 | >>> phonet('Christopher') |
||
2684 | 'KRISTOFA' |
||
2685 | >>> phonet('Niall') |
||
2686 | 'NIAL' |
||
2687 | >>> phonet('Smith') |
||
2688 | 'SMIT' |
||
2689 | >>> phonet('Schmidt') |
||
2690 | 'SHMIT' |
||
2691 | |||
2692 | >>> phonet('Christopher', mode=2) |
||
2693 | 'KRIZTUFA' |
||
2694 | >>> phonet('Niall', mode=2) |
||
2695 | 'NIAL' |
||
2696 | >>> phonet('Smith', mode=2) |
||
2697 | 'ZNIT' |
||
2698 | >>> phonet('Schmidt', mode=2) |
||
2699 | 'ZNIT' |
||
2700 | |||
2701 | >>> phonet('Christopher', lang='none') |
||
2702 | 'CHRISTOPHER' |
||
2703 | >>> phonet('Niall', lang='none') |
||
2704 | 'NIAL' |
||
2705 | >>> phonet('Smith', lang='none') |
||
2706 | 'SMITH' |
||
2707 | >>> phonet('Schmidt', lang='none') |
||
2708 | 'SCHMIDT' |
||
2709 | """ |
||
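| # The rule tables below are flat sequences of triples: a match pattern |
||
| # followed by the replacements for phonet variants 1 and 2 (a None |
||
| # replacement appears to mark a rule not used in that variant). The |
||
| # pattern markup ('^', '$', '<', '-', digits, bracketed character |
||
| # classes) follows Michael's original phonet rule syntax. |
||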
2710 | _phonet_rules_no_lang = ( # separator chars |
||
2711 | '´', ' ', ' ', |
||
2712 | '"', ' ', ' ', |
||
2713 | '`$', '', '', |
||
2714 | '\'', ' ', ' ', |
||
2715 | ',', ',', ',', |
||
2716 | ';', ',', ',', |
||
2717 | '-', ' ', ' ', |
||
2718 | ' ', ' ', ' ', |
||
2719 | '.', '.', '.', |
||
2720 | ':', '.', '.', |
||
2721 | # German umlauts |
||
2722 | 'Ä', 'AE', 'AE', |
||
2723 | 'Ö', 'OE', 'OE', |
||
2724 | 'Ü', 'UE', 'UE', |
||
2725 | 'ß', 'S', 'S', |
||
2726 | # international umlauts |
||
2727 | 'À', 'A', 'A', |
||
2728 | 'Á', 'A', 'A', |
||
2729 | 'Â', 'A', 'A', |
||
2730 | 'Ã', 'A', 'A', |
||
2731 | 'Å', 'A', 'A', |
||
2732 | 'Æ', 'AE', 'AE', |
||
2733 | 'Ç', 'C', 'C', |
||
2734 | 'Ð', 'DJ', 'DJ', |
||
2735 | 'È', 'E', 'E', |
||
2736 | 'É', 'E', 'E', |
||
2737 | 'Ê', 'E', 'E', |
||
2738 | 'Ë', 'E', 'E', |
||
2739 | 'Ì', 'I', 'I', |
||
2740 | 'Í', 'I', 'I', |
||
2741 | 'Î', 'I', 'I', |
||
2742 | 'Ï', 'I', 'I', |
||
2743 | 'Ñ', 'NH', 'NH', |
||
2744 | 'Ò', 'O', 'O', |
||
2745 | 'Ó', 'O', 'O', |
||
2746 | 'Ô', 'O', 'O', |
||
2747 | 'Õ', 'O', 'O', |
||
2748 | 'Œ', 'OE', 'OE', |
||
2749 | 'Ø', 'OE', 'OE', |
||
2750 | 'Š', 'SH', 'SH', |
||
2751 | 'Þ', 'TH', 'TH', |
||
2752 | 'Ù', 'U', 'U', |
||
2753 | 'Ú', 'U', 'U', |
||
2754 | 'Û', 'U', 'U', |
||
2755 | 'Ý', 'Y', 'Y', |
||
2756 | 'Ÿ', 'Y', 'Y', |
||
2757 | # 'normal' letters (A-Z) |
||
2758 | 'MC^', 'MAC', 'MAC', |
||
2759 | 'MC^', 'MAC', 'MAC', |
||
2760 | 'M´^', 'MAC', 'MAC', |
||
2761 | 'M\'^', 'MAC', 'MAC', |
||
2762 | 'O´^', 'O', 'O', |
||
2763 | 'O\'^', 'O', 'O', |
||
2764 | 'VAN DEN ^', 'VANDEN', 'VANDEN', |
||
2765 | None, None, None) |
||
2766 | |||
2767 | _phonet_rules_german = ( # separator chars |
||
2768 | '´', ' ', ' ', |
||
2769 | '"', ' ', ' ', |
||
2770 | '`$', '', '', |
||
2771 | '\'', ' ', ' ', |
||
2772 | ',', ' ', ' ', |
||
2773 | ';', ' ', ' ', |
||
2774 | '-', ' ', ' ', |
||
2775 | ' ', ' ', ' ', |
||
2776 | '.', '.', '.', |
||
2777 | ':', '.', '.', |
||
2778 | # German umlauts |
||
2779 | 'ÄE', 'E', 'E', |
||
2780 | 'ÄU<', 'EU', 'EU', |
||
2781 | 'ÄV(AEOU)-<', 'EW', None, |
||
2782 | 'Ä$', 'Ä', None, |
||
2783 | 'Ä<', None, 'E', |
||
2784 | 'Ä', 'E', None, |
||
2785 | 'ÖE', 'Ö', 'Ö', |
||
2786 | 'ÖU', 'Ö', 'Ö', |
||
2787 | 'ÖVER--<', 'ÖW', None, |
||
2788 | 'ÖV(AOU)-', 'ÖW', None, |
||
2789 | 'ÜBEL(GNRW)-^^', 'ÜBL ', 'IBL ', |
||
2790 | 'ÜBER^^', 'ÜBA', 'IBA', |
||
2791 | 'ÜE', 'Ü', 'I', |
||
2792 | 'ÜVER--<', 'ÜW', None, |
||
2793 | 'ÜV(AOU)-', 'ÜW', None, |
||
2794 | 'Ü', None, 'I', |
||
2795 | 'ßCH<', None, 'Z', |
||
2796 | 'ß<', 'S', 'Z', |
||
2797 | # international umlauts |
||
2798 | 'À<', 'A', 'A', |
||
2799 | 'Á<', 'A', 'A', |
||
2800 | 'Â<', 'A', 'A', |
||
2801 | 'Ã<', 'A', 'A', |
||
2802 | 'Å<', 'A', 'A', |
||
2803 | 'ÆER-', 'E', 'E', |
||
2804 | 'ÆU<', 'EU', 'EU', |
||
2805 | 'ÆV(AEOU)-<', 'EW', None, |
||
2806 | 'Æ$', 'Ä', None, |
||
2807 | 'Æ<', None, 'E', |
||
2808 | 'Æ', 'E', None, |
||
2809 | 'Ç', 'Z', 'Z', |
||
2810 | 'ÐÐ-', '', '', |
||
2811 | 'Ð', 'DI', 'TI', |
||
2812 | 'È<', 'E', 'E', |
||
2813 | 'É<', 'E', 'E', |
||
2814 | 'Ê<', 'E', 'E', |
||
2815 | 'Ë', 'E', 'E', |
||
2816 | 'Ì<', 'I', 'I', |
||
2817 | 'Í<', 'I', 'I', |
||
2818 | 'Î<', 'I', 'I', |
||
2819 | 'Ï', 'I', 'I', |
||
2820 | 'ÑÑ-', '', '', |
||
2821 | 'Ñ', 'NI', 'NI', |
||
2822 | 'Ò<', 'O', 'U', |
||
2823 | 'Ó<', 'O', 'U', |
||
2824 | 'Ô<', 'O', 'U', |
||
2825 | 'Õ<', 'O', 'U', |
||
2826 | 'Œ<', 'Ö', 'Ö', |
||
2827 | 'Ø(IJY)-<', 'E', 'E', |
||
2828 | 'Ø<', 'Ö', 'Ö', |
||
2829 | 'Š', 'SH', 'Z', |
||
2830 | 'Þ', 'T', 'T', |
||
2831 | 'Ù<', 'U', 'U', |
||
2832 | 'Ú<', 'U', 'U', |
||
2833 | 'Û<', 'U', 'U', |
||
2834 | 'Ý<', 'I', 'I', |
||
2835 | 'Ÿ<', 'I', 'I', |
||
2836 | # 'normal' letters (A-Z) |
||
2837 | 'ABELLE$', 'ABL', 'ABL', |
||
2838 | 'ABELL$', 'ABL', 'ABL', |
||
2839 | 'ABIENNE$', 'ABIN', 'ABIN', |
||
2840 | 'ACHME---^', 'ACH', 'AK', |
||
2841 | 'ACEY$', 'AZI', 'AZI', |
||
2842 | 'ADV', 'ATW', None, |
||
2843 | 'AEGL-', 'EK', None, |
||
2844 | 'AEU<', 'EU', 'EU', |
||
2845 | 'AE2', 'E', 'E', |
||
2846 | 'AFTRAUBEN------', 'AFT ', 'AFT ', |
||
2847 | 'AGL-1', 'AK', None, |
||
2848 | 'AGNI-^', 'AKN', 'AKN', |
||
2849 | 'AGNIE-', 'ANI', 'ANI', |
||
2850 | 'AGN(AEOU)-$', 'ANI', 'ANI', |
||
2851 | 'AH(AIOÖUÜY)-', 'AH', None, |
||
2852 | 'AIA2', 'AIA', 'AIA', |
||
2853 | 'AIE$', 'E', 'E', |
||
2854 | 'AILL(EOU)-', 'ALI', 'ALI', |
||
2855 | 'AINE$', 'EN', 'EN', |
||
2856 | 'AIRE$', 'ER', 'ER', |
||
2857 | 'AIR-', 'E', 'E', |
||
2858 | 'AISE$', 'ES', 'EZ', |
||
2859 | 'AISSANCE$', 'ESANS', 'EZANZ', |
||
2860 | 'AISSE$', 'ES', 'EZ', |
||
2861 | 'AIX$', 'EX', 'EX', |
||
2862 | 'AJ(AÄEÈÉÊIOÖUÜ)--', 'A', 'A', |
||
2863 | 'AKTIE', 'AXIE', 'AXIE', |
||
2864 | 'AKTUEL', 'AKTUEL', None, |
||
2865 | 'ALOI^', 'ALOI', 'ALUI', # Don't merge these rules |
||
2866 | 'ALOY^', 'ALOI', 'ALUI', # needed by 'check_rules' |
||
2867 | 'AMATEU(RS)-', 'AMATÖ', 'ANATÖ', |
||
2868 | 'ANCH(OEI)-', 'ANSH', 'ANZ', |
||
2869 | 'ANDERGEGANG----', 'ANDA GE', 'ANTA KE', |
||
2870 | 'ANDERGEHE----', 'ANDA ', 'ANTA ', |
||
2871 | 'ANDERGESETZ----', 'ANDA GE', 'ANTA KE', |
||
2872 | 'ANDERGING----', 'ANDA ', 'ANTA ', |
||
2873 | 'ANDERSETZ(ET)-----', 'ANDA ', 'ANTA ', |
||
2874 | 'ANDERZUGEHE----', 'ANDA ZU ', 'ANTA ZU ', |
||
2875 | 'ANDERZUSETZE-----', 'ANDA ZU ', 'ANTA ZU ', |
||
2876 | 'ANER(BKO)---^^', 'AN', None, |
||
2877 | 'ANHAND---^$', 'AN H', 'AN ', |
||
2878 | 'ANH(AÄEIOÖUÜY)--^^', 'AN', None, |
||
2879 | 'ANIELLE$', 'ANIEL', 'ANIL', |
||
2880 | 'ANIEL', 'ANIEL', None, |
||
2881 | 'ANSTELLE----^$', 'AN ST', 'AN ZT', |
||
2882 | 'ANTI^^', 'ANTI', 'ANTI', |
||
2883 | 'ANVER^^', 'ANFA', 'ANFA', |
||
2884 | 'ATIA$', 'ATIA', 'ATIA', |
||
2885 | 'ATIA(NS)--', 'ATI', 'ATI', |
||
2886 | 'ATI(AÄOÖUÜ)-', 'AZI', 'AZI', |
||
2887 | 'AUAU--', '', '', |
||
2888 | 'AUERE$', 'AUERE', None, |
||
2889 | 'AUERE(NS)-$', 'AUERE', None, |
||
2890 | 'AUERE(AIOUY)--', 'AUER', None, |
||
2891 | 'AUER(AÄIOÖUÜY)-', 'AUER', None, |
||
2892 | 'AUER<', 'AUA', 'AUA', |
||
2893 | 'AUF^^', 'AUF', 'AUF', |
||
2894 | 'AULT$', 'O', 'U', |
||
2895 | 'AUR(BCDFGKLMNQSTVWZ)-', 'AUA', 'AUA', |
||
2896 | 'AUR$', 'AUA', 'AUA', |
||
2897 | 'AUSSE$', 'OS', 'UZ', |
||
2898 | 'AUS(ST)-^', 'AUS', 'AUS', |
||
2899 | 'AUS^^', 'AUS', 'AUS', |
||
2900 | 'AUTOFAHR----', 'AUTO ', 'AUTU ', |
||
2901 | 'AUTO^^', 'AUTO', 'AUTU', |
||
2902 | 'AUX(IY)-', 'AUX', 'AUX', |
||
2903 | 'AUX', 'O', 'U', |
||
2904 | 'AU', 'AU', 'AU', |
||
2905 | 'AVER--<', 'AW', None, |
||
2906 | 'AVIER$', 'AWIE', 'AFIE', |
||
2907 | 'AV(EÈÉÊI)-^', 'AW', None, |
||
2908 | 'AV(AOU)-', 'AW', None, |
||
2909 | 'AYRE$', 'EIRE', 'EIRE', |
||
2910 | 'AYRE(NS)-$', 'EIRE', 'EIRE', |
||
2911 | 'AYRE(AIOUY)--', 'EIR', 'EIR', |
||
2912 | 'AYR(AÄIOÖUÜY)-', 'EIR', 'EIR', |
||
2913 | 'AYR<', 'EIA', 'EIA', |
||
2914 | 'AYER--<', 'EI', 'EI', |
||
2915 | 'AY(AÄEIOÖUÜY)--', 'A', 'A', |
||
2916 | 'AË', 'E', 'E', |
||
2917 | 'A(IJY)<', 'EI', 'EI', |
||
2918 | 'BABY^$', 'BEBI', 'BEBI', |
||
2919 | 'BAB(IY)^', 'BEBI', 'BEBI', |
||
2920 | 'BEAU^$', 'BO', None, |
||
2921 | 'BEA(BCMNRU)-^', 'BEA', 'BEA', |
||
2922 | 'BEAT(AEIMORU)-^', 'BEAT', 'BEAT', |
||
2923 | 'BEE$', 'BI', 'BI', |
||
2924 | 'BEIGE^$', 'BESH', 'BEZ', |
||
2925 | 'BENOIT--', 'BENO', 'BENU', |
||
2926 | 'BER(DT)-', 'BER', None, |
||
2927 | 'BERN(DT)-', 'BERN', None, |
||
2928 | 'BE(LMNRST)-^', 'BE', 'BE', |
||
2929 | 'BETTE$', 'BET', 'BET', |
||
2930 | 'BEVOR^$', 'BEFOR', None, |
||
2931 | 'BIC$', 'BIZ', 'BIZ', |
||
2932 | 'BOWL(EI)-', 'BOL', 'BUL', |
||
2933 | 'BP(AÄEÈÉÊIÌÍÎOÖRUÜY)-', 'B', 'B', |
||
2934 | 'BRINGEND-----^', 'BRI', 'BRI', |
||
2935 | 'BRINGEND-----', ' BRI', ' BRI', |
||
2936 | 'BROW(NS)-', 'BRAU', 'BRAU', |
||
2937 | 'BUDGET7', 'BÜGE', 'BIKE', |
||
2938 | 'BUFFET7', 'BÜFE', 'BIFE', |
||
2939 | 'BYLLE$', 'BILE', 'BILE', |
||
2940 | 'BYLL$', 'BIL', 'BIL', |
||
2941 | 'BYPA--^', 'BEI', 'BEI', |
||
2942 | 'BYTE<', 'BEIT', 'BEIT', |
||
2943 | 'BY9^', 'BÜ', None, |
||
2944 | 'B(SßZ)$', 'BS', None, |
||
2945 | 'CACH(EI)-^', 'KESH', 'KEZ', |
||
2946 | 'CAE--', 'Z', 'Z', |
||
2947 | 'CA(IY)$', 'ZEI', 'ZEI', |
||
2948 | 'CE(EIJUY)--', 'Z', 'Z', |
||
2949 | 'CENT<', 'ZENT', 'ZENT', |
||
2950 | 'CERST(EI)----^', 'KE', 'KE', |
||
2951 | 'CER$', 'ZA', 'ZA', |
||
2952 | 'CE3', 'ZE', 'ZE', |
||
2953 | 'CH\'S$', 'X', 'X', |
||
2954 | 'CH´S$', 'X', 'X', |
||
2955 | 'CHAO(ST)-', 'KAO', 'KAU', |
||
2956 | 'CHAMPIO-^', 'SHEMPI', 'ZENBI', |
||
2957 | 'CHAR(AI)-^', 'KAR', 'KAR', |
||
2958 | 'CHAU(CDFSVWXZ)-', 'SHO', 'ZU', |
||
2959 | 'CHÄ(CF)-', 'SHE', 'ZE', |
||
2960 | 'CHE(CF)-', 'SHE', 'ZE', |
||
2961 | 'CHEM-^', 'KE', 'KE', # or: 'CHE', 'KE' |
||
2962 | 'CHEQUE<', 'SHEK', 'ZEK', |
||
2963 | 'CHI(CFGPVW)-', 'SHI', 'ZI', |
||
2964 | 'CH(AEUY)-<^', 'SH', 'Z', |
||
2965 | 'CHK-', '', '', |
||
2966 | 'CHO(CKPS)-^', 'SHO', 'ZU', |
||
2967 | 'CHRIS-', 'KRI', None, |
||
2968 | 'CHRO-', 'KR', None, |
||
2969 | 'CH(LOR)-<^', 'K', 'K', |
||
2970 | 'CHST-', 'X', 'X', |
||
2971 | 'CH(SßXZ)3', 'X', 'X', |
||
2972 | 'CHTNI-3', 'CHN', 'KN', |
||
2973 | 'CH^', 'K', 'K', # or: 'CH', 'K' |
||
2974 | 'CH', 'CH', 'K', |
||
2975 | 'CIC$', 'ZIZ', 'ZIZ', |
||
2976 | 'CIENCEFICT----', 'EIENS ', 'EIENZ ', |
||
2977 | 'CIENCE$', 'EIENS', 'EIENZ', |
||
2978 | 'CIER$', 'ZIE', 'ZIE', |
||
2979 | 'CYB-^', 'ZEI', 'ZEI', |
||
2980 | 'CY9^', 'ZÜ', 'ZI', |
||
2981 | 'C(IJY)-<3', 'Z', 'Z', |
||
2982 | 'CLOWN-', 'KLAU', 'KLAU', |
||
2983 | 'CCH', 'Z', 'Z', |
||
2984 | 'CCE-', 'X', 'X', |
||
2985 | 'C(CK)-', '', '', |
||
2986 | 'CLAUDET---', 'KLO', 'KLU', |
||
2987 | 'CLAUDINE^$', 'KLODIN', 'KLUTIN', |
||
2988 | 'COACH', 'KOSH', 'KUZ', |
||
2989 | 'COLE$', 'KOL', 'KUL', |
||
2990 | 'COUCH', 'KAUSH', 'KAUZ', |
||
2991 | 'COW', 'KAU', 'KAU', |
||
2992 | 'CQUES$', 'K', 'K', |
||
2993 | 'CQUE', 'K', 'K', |
||
2994 | 'CRASH--9', 'KRE', 'KRE', |
||
2995 | 'CREAT-^', 'KREA', 'KREA', |
||
2996 | 'CST', 'XT', 'XT', |
||
2997 | 'CS<^', 'Z', 'Z', |
||
2998 | 'C(SßX)', 'X', 'X', |
||
2999 | 'CT\'S$', 'X', 'X', |
||
3000 | 'CT(SßXZ)', 'X', 'X', |
||
3001 | 'CZ<', 'Z', 'Z', |
||
3002 | 'C(ÈÉÊÌÍÎÝ)3', 'Z', 'Z', |
||
3003 | 'C.^', 'C.', 'C.', |
||
3004 | 'CÄ-', 'Z', 'Z', |
||
3005 | 'CÜ$', 'ZÜ', 'ZI', |
||
3006 | 'C\'S$', 'X', 'X', |
||
3007 | 'C<', 'K', 'K', |
||
3008 | 'DAHER^$', 'DAHER', None, |
||
3009 | 'DARAUFFOLGE-----', 'DARAUF ', 'TARAUF ', |
||
3010 | 'DAVO(NR)-^$', 'DAFO', 'TAFU', |
||
3011 | 'DD(SZ)--<', '', '', |
||
3012 | 'DD9', 'D', None, |
||
3013 | 'DEPOT7', 'DEPO', 'TEBU', |
||
3014 | 'DESIGN', 'DISEIN', 'TIZEIN', |
||
3015 | 'DE(LMNRST)-3^', 'DE', 'TE', |
||
3016 | 'DETTE$', 'DET', 'TET', |
||
3017 | 'DH$', 'T', None, |
||
3018 | 'DIC$', 'DIZ', 'TIZ', |
||
3019 | 'DIDR-^', 'DIT', None, |
||
3020 | 'DIEDR-^', 'DIT', None, |
||
3021 | 'DJ(AEIOU)-^', 'I', 'I', |
||
3022 | 'DMITR-^', 'DIMIT', 'TINIT', |
||
3023 | 'DRY9^', 'DRÜ', None, |
||
3024 | 'DT-', '', '', |
||
3025 | 'DUIS-^', 'DÜ', 'TI', |
||
3026 | 'DURCH^^', 'DURCH', 'TURK', |
||
3027 | 'DVA$', 'TWA', None, |
||
3028 | 'DY9^', 'DÜ', None, |
||
3029 | 'DYS$', 'DIS', None, |
||
3030 | 'DS(CH)--<', 'T', 'T', |
||
3031 | 'DST', 'ZT', 'ZT', |
||
3032 | 'DZS(CH)--', 'T', 'T', |
||
3033 | 'D(SßZ)', 'Z', 'Z', |
||
3034 | 'D(AÄEIOÖRUÜY)-', 'D', None, |
||
3035 | 'D(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'D', None, |
||
3036 | 'D\'H^', 'D', 'T', |
||
3037 | 'D´H^', 'D', 'T', |
||
3038 | 'D`H^', 'D', 'T', |
||
3039 | 'D\'S3$', 'Z', 'Z', |
||
3040 | 'D´S3$', 'Z', 'Z', |
||
3041 | 'D^', 'D', None, |
||
3042 | 'D', 'T', 'T', |
||
3043 | 'EAULT$', 'O', 'U', |
||
3044 | 'EAUX$', 'O', 'U', |
||
3045 | 'EAU', 'O', 'U', |
||
3046 | 'EAV', 'IW', 'IF', |
||
3047 | 'EAS3$', 'EAS', None, |
||
3048 | 'EA(AÄEIOÖÜY)-3', 'EA', 'EA', |
||
3049 | 'EA3$', 'EA', 'EA', |
||
3050 | 'EA3', 'I', 'I', |
||
3051 | 'EBENSO^$', 'EBNSO', 'EBNZU', |
||
3052 | 'EBENSO^^', 'EBNSO ', 'EBNZU ', |
||
3053 | 'EBEN^^', 'EBN', 'EBN', |
||
3054 | 'EE9', 'E', 'E', |
||
3055 | 'EGL-1', 'EK', None, |
||
3056 | 'EHE(IUY)--1', 'EH', None, |
||
3057 | 'EHUNG---1', 'E', None, |
||
3058 | 'EH(AÄIOÖUÜY)-1', 'EH', None, |
||
3059 | 'EIEI--', '', '', |
||
3060 | 'EIERE^$', 'EIERE', None, |
||
3061 | 'EIERE$', 'EIERE', None, |
||
3062 | 'EIERE(NS)-$', 'EIERE', None, |
||
3063 | 'EIERE(AIOUY)--', 'EIER', None, |
||
3064 | 'EIER(AÄIOÖUÜY)-', 'EIER', None, |
||
3065 | 'EIER<', 'EIA', None, |
||
3066 | 'EIGL-1', 'EIK', None, |
||
3067 | 'EIGH$', 'EI', 'EI', |
||
3068 | 'EIH--', 'E', 'E', |
||
3069 | 'EILLE$', 'EI', 'EI', |
||
3070 | 'EIR(BCDFGKLMNQSTVWZ)-', 'EIA', 'EIA', |
||
3071 | 'EIR$', 'EIA', 'EIA', |
||
3072 | 'EITRAUBEN------', 'EIT ', 'EIT ', |
||
3073 | 'EI', 'EI', 'EI', |
||
3074 | 'EJ$', 'EI', 'EI', |
||
3075 | 'ELIZ^', 'ELIS', None, |
||
3076 | 'ELZ^', 'ELS', None, |
||
3077 | 'EL-^', 'E', 'E', |
||
3078 | 'ELANG----1', 'E', 'E', |
||
3079 | 'EL(DKL)--1', 'E', 'E', |
||
3080 | 'EL(MNT)--1$', 'E', 'E', |
||
3081 | 'ELYNE$', 'ELINE', 'ELINE', |
||
3082 | 'ELYN$', 'ELIN', 'ELIN', |
||
3083 | 'EL(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'EL', 'EL', |
||
3084 | 'EL-1', 'L', 'L', |
||
3085 | 'EM-^', None, 'E', |
||
3086 | 'EM(DFKMPQT)--1', None, 'E', |
||
3087 | 'EM(AÄEÈÉÊIÌÍÎOÖUÜY)--1', None, 'E', |
||
3088 | 'EM-1', None, 'N', |
||
3089 | 'ENGAG-^', 'ANGA', 'ANKA', |
||
3090 | 'EN-^', 'E', 'E', |
||
3091 | 'ENTUEL', 'ENTUEL', None, |
||
3092 | 'EN(CDGKQSTZ)--1', 'E', 'E', |
||
3093 | 'EN(AÄEÈÉÊIÌÍÎNOÖUÜY)-1', 'EN', 'EN', |
||
3094 | 'EN-1', '', '', |
||
3095 | 'ERH(AÄEIOÖUÜ)-^', 'ERH', 'ER', |
||
3096 | 'ER-^', 'E', 'E', |
||
3097 | 'ERREGEND-----', ' ER', ' ER', |
||
3098 | 'ERT1$', 'AT', None, |
||
3099 | 'ER(DGLKMNRQTZß)-1', 'ER', None, |
||
3100 | 'ER(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'ER', 'A', |
||
3101 | 'ER1$', 'A', 'A', |
||
3102 | 'ER<1', 'A', 'A', |
||
3103 | 'ETAT7', 'ETA', 'ETA', |
||
3104 | 'ETI(AÄOÖÜU)-', 'EZI', 'EZI', |
||
3105 | 'EUERE$', 'EUERE', None, |
||
3106 | 'EUERE(NS)-$', 'EUERE', None, |
||
3107 | 'EUERE(AIOUY)--', 'EUER', None, |
||
3108 | 'EUER(AÄIOÖUÜY)-', 'EUER', None, |
||
3109 | 'EUER<', 'EUA', None, |
||
3110 | 'EUEU--', '', '', |
||
3111 | 'EUILLE$', 'Ö', 'Ö', |
||
3112 | 'EUR$', 'ÖR', 'ÖR', |
||
3113 | 'EUX', 'Ö', 'Ö', |
||
3114 | 'EUSZ$', 'EUS', None, |
||
3115 | 'EUTZ$', 'EUS', None, |
||
3116 | 'EUYS$', 'EUS', 'EUZ', |
||
3117 | 'EUZ$', 'EUS', None, |
||
3118 | 'EU', 'EU', 'EU', |
||
3119 | 'EVER--<1', 'EW', None, |
||
3120 | 'EV(ÄOÖUÜ)-1', 'EW', None, |
||
3121 | 'EYER<', 'EIA', 'EIA', |
||
3122 | 'EY<', 'EI', 'EI', |
||
3123 | 'FACETTE', 'FASET', 'FAZET', |
||
3124 | 'FANS--^$', 'FE', 'FE', |
||
3125 | 'FAN-^$', 'FE', 'FE', |
||
3126 | 'FAULT-', 'FOL', 'FUL', |
||
3127 | 'FEE(DL)-', 'FI', 'FI', |
||
3128 | 'FEHLER', 'FELA', 'FELA', |
||
3129 | 'FE(LMNRST)-3^', 'FE', 'FE', |
||
3130 | 'FOERDERN---^', 'FÖRD', 'FÖRT', |
||
3131 | 'FOERDERN---', ' FÖRD', ' FÖRT', |
||
3132 | 'FOND7', 'FON', 'FUN', |
||
3133 | 'FRAIN$', 'FRA', 'FRA', |
||
3134 | 'FRISEU(RS)-', 'FRISÖ', 'FRIZÖ', |
||
3135 | 'FY9^', 'FÜ', None, |
||
3136 | 'FÖRDERN---^', 'FÖRD', 'FÖRT', |
||
3137 | 'FÖRDERN---', ' FÖRD', ' FÖRT', |
||
3138 | 'GAGS^$', 'GEX', 'KEX', |
||
3139 | 'GAG^$', 'GEK', 'KEK', |
||
3140 | 'GD', 'KT', 'KT', |
||
3141 | 'GEGEN^^', 'GEGN', 'KEKN', |
||
3142 | 'GEGENGEKOM-----', 'GEGN ', 'KEKN ', |
||
3143 | 'GEGENGESET-----', 'GEGN ', 'KEKN ', |
||
3144 | 'GEGENKOMME-----', 'GEGN ', 'KEKN ', |
||
3145 | 'GEGENZUKOM---', 'GEGN ZU ', 'KEKN ZU ', |
||
3146 | 'GENDETWAS-----$', 'GENT ', 'KENT ', |
||
3147 | 'GENRE', 'IORE', 'IURE', |
||
3148 | 'GE(LMNRST)-3^', 'GE', 'KE', |
||
3149 | 'GER(DKT)-', 'GER', None, |
||
3150 | 'GETTE$', 'GET', 'KET', |
||
3151 | 'GGF.', 'GF.', None, |
||
3152 | 'GG-', '', '', |
||
3153 | 'GH', 'G', None, |
||
3154 | 'GI(AOU)-^', 'I', 'I', |
||
3155 | 'GION-3', 'KIO', 'KIU', |
||
3156 | 'G(CK)-', '', '', |
||
3157 | 'GJ(AEIOU)-^', 'I', 'I', |
||
3158 | 'GMBH^$', 'GMBH', 'GMBH', |
||
3159 | 'GNAC$', 'NIAK', 'NIAK', |
||
3160 | 'GNON$', 'NION', 'NIUN', |
||
3161 | 'GN$', 'N', 'N', |
||
3162 | 'GONCAL-^', 'GONZA', 'KUNZA', |
||
3163 | 'GRY9^', 'GRÜ', None, |
||
3164 | 'G(SßXZ)-<', 'K', 'K', |
||
3165 | 'GUCK-', 'KU', 'KU', |
||
3166 | 'GUISEP-^', 'IUSE', 'IUZE', |
||
3167 | 'GUI-^', 'G', 'K', |
||
3168 | 'GUTAUSSEH------^', 'GUT ', 'KUT ', |
||
3169 | 'GUTGEHEND------^', 'GUT ', 'KUT ', |
||
3170 | 'GY9^', 'GÜ', None, |
||
3171 | 'G(AÄEILOÖRUÜY)-', 'G', None, |
||
3172 | 'G(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'G', None, |
||
3173 | 'G\'S$', 'X', 'X', |
||
3174 | 'G´S$', 'X', 'X', |
||
3175 | 'G^', 'G', None, |
||
3176 | 'G', 'K', 'K', |
||
3177 | 'HA(HIUY)--1', 'H', None, |
||
3178 | 'HANDVOL---^', 'HANT ', 'ANT ', |
||
3179 | 'HANNOVE-^', 'HANOF', None, |
||
3180 | 'HAVEN7$', 'HAFN', None, |
||
3181 | 'HEAD-', 'HE', 'E', |
||
3182 | 'HELIEGEN------', 'E ', 'E ', |
||
3183 | 'HESTEHEN------', 'E ', 'E ', |
||
3184 | 'HE(LMNRST)-3^', 'HE', 'E', |
||
3185 | 'HE(LMN)-1', 'E', 'E', |
||
3186 | 'HEUR1$', 'ÖR', 'ÖR', |
||
3187 | 'HE(HIUY)--1', 'H', None, |
||
3188 | 'HIH(AÄEIOÖUÜY)-1', 'IH', None, |
||
3189 | 'HLH(AÄEIOÖUÜY)-1', 'LH', None, |
||
3190 | 'HMH(AÄEIOÖUÜY)-1', 'MH', None, |
||
3191 | 'HNH(AÄEIOÖUÜY)-1', 'NH', None, |
||
3192 | 'HOBBY9^', 'HOBI', None, |
||
3193 | 'HOCHBEGAB-----^', 'HOCH ', 'UK ', |
||
3194 | 'HOCHTALEN-----^', 'HOCH ', 'UK ', |
||
3195 | 'HOCHZUFRI-----^', 'HOCH ', 'UK ', |
||
3196 | 'HO(HIY)--1', 'H', None, |
||
3197 | 'HRH(AÄEIOÖUÜY)-1', 'RH', None, |
||
3198 | 'HUH(AÄEIOÖUÜY)-1', 'UH', None, |
||
3199 | 'HUIS^^', 'HÜS', 'IZ', |
||
3200 | 'HUIS$', 'ÜS', 'IZ', |
||
3201 | 'HUI--1', 'H', None, |
||
3202 | 'HYGIEN^', 'HÜKIEN', None, |
||
3203 | 'HY9^', 'HÜ', None, |
||
3204 | 'HY(BDGMNPST)-', 'Ü', None, |
||
3205 | 'H.^', None, 'H.', |
||
3206 | 'HÄU--1', 'H', None, |
||
3207 | 'H^', 'H', '', |
||
3208 | 'H', '', '', |
||
3209 | 'ICHELL---', 'ISH', 'IZ', |
||
3210 | 'ICHI$', 'ISHI', 'IZI', |
||
3211 | 'IEC$', 'IZ', 'IZ', |
||
3212 | 'IEDENSTELLE------', 'IDN ', 'ITN ', |
||
3213 | 'IEI-3', '', '', |
||
3214 | 'IELL3', 'IEL', 'IEL', |
||
3215 | 'IENNE$', 'IN', 'IN', |
||
3216 | 'IERRE$', 'IER', 'IER', |
||
3217 | 'IERZULAN---', 'IR ZU ', 'IR ZU ', |
||
3218 | 'IETTE$', 'IT', 'IT', |
||
3219 | 'IEU', 'IÖ', 'IÖ', |
||
3220 | 'IE<4', 'I', 'I', |
||
3221 | 'IGL-1', 'IK', None, |
||
3222 | 'IGHT3$', 'EIT', 'EIT', |
||
3223 | 'IGNI(EO)-', 'INI', 'INI', |
||
3224 | 'IGN(AEOU)-$', 'INI', 'INI', |
||
3225 | 'IHER(DGLKRT)--1', 'IHE', None, |
||
3226 | 'IHE(IUY)--', 'IH', None, |
||
3227 | 'IH(AIOÖUÜY)-', 'IH', None, |
||
3228 | 'IJ(AOU)-', 'I', 'I', |
||
3229 | 'IJ$', 'I', 'I', |
||
3230 | 'IJ<', 'EI', 'EI', |
||
3231 | 'IKOLE$', 'IKOL', 'IKUL', |
||
3232 | 'ILLAN(STZ)--4', 'ILIA', 'ILIA', |
||
3233 | 'ILLAR(DT)--4', 'ILIA', 'ILIA', |
||
3234 | 'IMSTAN----^', 'IM ', 'IN ', |
||
3235 | 'INDELERREGE------', 'INDL ', 'INTL ', |
||
3236 | 'INFRAGE-----^$', 'IN ', 'IN ', |
||
3237 | 'INTERN(AOU)-^', 'INTAN', 'INTAN', |
||
3238 | 'INVER-', 'INWE', 'INFE', |
||
3239 | 'ITI(AÄIOÖUÜ)-', 'IZI', 'IZI', |
||
3240 | 'IUSZ$', 'IUS', None, |
||
3241 | 'IUTZ$', 'IUS', None, |
||
3242 | 'IUZ$', 'IUS', None, |
||
3243 | 'IVER--<', 'IW', None, |
||
3244 | 'IVIER$', 'IWIE', 'IFIE', |
||
3245 | 'IV(ÄOÖUÜ)-', 'IW', None, |
||
3246 | 'IV<3', 'IW', None, |
||
3247 | 'IY2', 'I', None, |
||
3248 | 'I(ÈÉÊ)<4', 'I', 'I', |
||
3249 | 'JAVIE---<^', 'ZA', 'ZA', |
||
3250 | 'JEANS^$', 'JINS', 'INZ', |
||
3251 | 'JEANNE^$', 'IAN', 'IAN', |
||
3252 | 'JEAN-^', 'IA', 'IA', |
||
3253 | 'JER-^', 'IE', 'IE', |
||
3254 | 'JE(LMNST)-', 'IE', 'IE', |
||
3255 | 'JI^', 'JI', None, |
||
3256 | 'JOR(GK)^$', 'IÖRK', 'IÖRK', |
||
3257 | 'J', 'I', 'I', |
||
3258 | 'KC(ÄEIJ)-', 'X', 'X', |
||
3259 | 'KD', 'KT', None, |
||
3260 | 'KE(LMNRST)-3^', 'KE', 'KE', |
||
3261 | 'KG(AÄEILOÖRUÜY)-', 'K', None, |
||
3262 | 'KH<^', 'K', 'K', |
||
3263 | 'KIC$', 'KIZ', 'KIZ', |
||
3264 | 'KLE(LMNRST)-3^', 'KLE', 'KLE', |
||
3265 | 'KOTELE-^', 'KOTL', 'KUTL', |
||
3266 | 'KREAT-^', 'KREA', 'KREA', |
||
3267 | 'KRÜS(TZ)--^', 'KRI', None, |
||
3268 | 'KRYS(TZ)--^', 'KRI', None, |
||
3269 | 'KRY9^', 'KRÜ', None, |
||
3270 | 'KSCH---', 'K', 'K', |
||
3271 | 'KSH--', 'K', 'K', |
||
3272 | 'K(SßXZ)7', 'X', 'X', # implies 'KST' -> 'XT' |
||
3273 | 'KT\'S$', 'X', 'X', |
||
3274 | 'KTI(AIOU)-3', 'XI', 'XI', |
||
3275 | 'KT(SßXZ)', 'X', 'X', |
||
3276 | 'KY9^', 'KÜ', None, |
||
3277 | 'K\'S$', 'X', 'X', |
||
3278 | 'K´S$', 'X', 'X', |
||
3279 | 'LANGES$', ' LANGES', ' LANKEZ', |
||
3280 | 'LANGE$', ' LANGE', ' LANKE', |
||
3281 | 'LANG$', ' LANK', ' LANK', |
||
3282 | 'LARVE-', 'LARF', 'LARF', |
||
3283 | 'LD(SßZ)$', 'LS', 'LZ', |
||
3284 | 'LD\'S$', 'LS', 'LZ', |
||
3285 | 'LD´S$', 'LS', 'LZ', |
||
3286 | 'LEAND-^', 'LEAN', 'LEAN', |
||
3287 | 'LEERSTEHE-----^', 'LER ', 'LER ', |
||
3288 | 'LEICHBLEIB-----', 'LEICH ', 'LEIK ', |
||
3289 | 'LEICHLAUTE-----', 'LEICH ', 'LEIK ', |
||
3290 | 'LEIDERREGE------', 'LEIT ', 'LEIT ', |
||
3291 | 'LEIDGEPR----^', 'LEIT ', 'LEIT ', |
||
3292 | 'LEINSTEHE-----', 'LEIN ', 'LEIN ', |
||
3293 | 'LEL-', 'LE', 'LE', |
||
3294 | 'LE(MNRST)-3^', 'LE', 'LE', |
||
3295 | 'LETTE$', 'LET', 'LET', |
||
3296 | 'LFGNAG-', 'LFGAN', 'LFKAN', |
||
3297 | 'LICHERWEIS----', 'LICHA ', 'LIKA ', |
||
3298 | 'LIC$', 'LIZ', 'LIZ', |
||
3299 | 'LIVE^$', 'LEIF', 'LEIF', |
||
3300 | 'LT(SßZ)$', 'LS', 'LZ', |
||
3301 | 'LT\'S$', 'LS', 'LZ', |
||
3302 | 'LT´S$', 'LS', 'LZ', |
||
3303 | 'LUI(GS)--', 'LU', 'LU', |
||
3304 | 'LV(AIO)-', 'LW', None, |
||
3305 | 'LY9^', 'LÜ', None, |
||
3306 | 'LSTS$', 'LS', 'LZ', |
||
3307 | 'LZ(BDFGKLMNPQRSTVWX)-', 'LS', None, |
||
3308 | 'L(SßZ)$', 'LS', None, |
||
3309 | 'MAIR-<', 'MEI', 'NEI', |
||
3310 | 'MANAG-', 'MENE', 'NENE', |
||
3311 | 'MANUEL', 'MANUEL', None, |
||
3312 | 'MASSEU(RS)-', 'MASÖ', 'NAZÖ', |
||
3313 | 'MATCH', 'MESH', 'NEZ', |
||
3314 | 'MAURICE', 'MORIS', 'NURIZ', |
||
3315 | 'MBH^$', 'MBH', 'MBH', |
||
3316 | 'MB(ßZ)$', 'MS', None, |
||
3317 | 'MB(SßTZ)-', 'M', 'N', |
||
3318 | 'MCG9^', 'MAK', 'NAK', |
||
3319 | 'MC9^', 'MAK', 'NAK', |
||
3320 | 'MEMOIR-^', 'MEMOA', 'NENUA', |
||
3321 | 'MERHAVEN$', 'MAHAFN', None, |
||
3322 | 'ME(LMNRST)-3^', 'ME', 'NE', |
||
3323 | 'MEN(STZ)--3', 'ME', None, |
||
3324 | 'MEN$', 'MEN', None, |
||
3325 | 'MIGUEL-', 'MIGE', 'NIKE', |
||
3326 | 'MIKE^$', 'MEIK', 'NEIK', |
||
3327 | 'MITHILFE----^$', 'MIT H', 'NIT ', |
||
3328 | 'MN$', 'M', None, |
||
3329 | 'MN', 'N', 'N', |
||
3330 | 'MPJUTE-', 'MPUT', 'NBUT', |
||
3331 | 'MP(ßZ)$', 'MS', None, |
||
3332 | 'MP(SßTZ)-', 'M', 'N', |
||
3333 | 'MP(BDJLMNPQVW)-', 'MB', 'NB', |
||
3334 | 'MY9^', 'MÜ', None, |
||
3335 | 'M(ßZ)$', 'MS', None, |
||
3336 | 'M´G7^', 'MAK', 'NAK', |
||
3337 | 'M\'G7^', 'MAK', 'NAK', |
||
3338 | 'M´^', 'MAK', 'NAK', |
||
3339 | 'M\'^', 'MAK', 'NAK', |
||
3340 | 'M', None, 'N', |
||
3341 | 'NACH^^', 'NACH', 'NAK', |
||
3342 | 'NADINE', 'NADIN', 'NATIN', |
||
3343 | 'NAIV--', 'NA', 'NA', |
||
3344 | 'NAISE$', 'NESE', 'NEZE', |
||
3345 | 'NAUGENOMM------', 'NAU ', 'NAU ', |
||
3346 | 'NAUSOGUT$', 'NAUSO GUT', 'NAUZU KUT', |
||
3347 | 'NCH$', 'NSH', 'NZ', |
||
3348 | 'NCOISE$', 'SOA', 'ZUA', |
||
3349 | 'NCOIS$', 'SOA', 'ZUA', |
||
3350 | 'NDAR$', 'NDA', 'NTA', |
||
3351 | 'NDERINGEN------', 'NDE ', 'NTE ', |
||
3352 | 'NDRO(CDKTZ)-', 'NTRO', None, |
||
3353 | 'ND(BFGJLMNPQVW)-', 'NT', None, |
||
3354 | 'ND(SßZ)$', 'NS', 'NZ', |
||
3355 | 'ND\'S$', 'NS', 'NZ', |
||
3356 | 'ND´S$', 'NS', 'NZ', |
||
3357 | 'NEBEN^^', 'NEBN', 'NEBN', |
||
3358 | 'NENGELERN------', 'NEN ', 'NEN ', |
||
3359 | 'NENLERN(ET)---', 'NEN LE', 'NEN LE', |
||
3360 | 'NENZULERNE---', 'NEN ZU LE', 'NEN ZU LE', |
||
3361 | 'NE(LMNRST)-3^', 'NE', 'NE', |
||
3362 | 'NEN-3', 'NE', 'NE', |
||
3363 | 'NETTE$', 'NET', 'NET', |
||
3364 | 'NGU^^', 'NU', 'NU', |
||
3365 | 'NG(BDFJLMNPQRTVW)-', 'NK', 'NK', |
||
3366 | 'NH(AUO)-$', 'NI', 'NI', |
||
3367 | 'NICHTSAHNEN-----', 'NIX ', 'NIX ', |
||
3368 | 'NICHTSSAGE----', 'NIX ', 'NIX ', |
||
3369 | 'NICHTS^^', 'NIX', 'NIX', |
||
3370 | 'NICHT^^', 'NICHT', 'NIKT', |
||
3371 | 'NINE$', 'NIN', 'NIN', |
||
3372 | 'NON^^', 'NON', 'NUN', |
||
3373 | 'NOTLEIDE-----^', 'NOT ', 'NUT ', |
||
3374 | 'NOT^^', 'NOT', 'NUT', |
||
3375 | 'NTI(AIOU)-3', 'NZI', 'NZI', |
||
3376 | 'NTIEL--3', 'NZI', 'NZI', |
||
3377 | 'NT(SßZ)$', 'NS', 'NZ', |
||
3378 | 'NT\'S$', 'NS', 'NZ', |
||
3379 | 'NT´S$', 'NS', 'NZ', |
||
3380 | 'NYLON', 'NEILON', 'NEILUN', |
||
3381 | 'NY9^', 'NÜ', None, |
||
3382 | 'NSTZUNEH---', 'NST ZU ', 'NZT ZU ', |
||
3383 | 'NSZ-', 'NS', None, |
||
3384 | 'NSTS$', 'NS', 'NZ', |
||
3385 | 'NZ(BDFGKLMNPQRSTVWX)-', 'NS', None, |
||
3386 | 'N(SßZ)$', 'NS', None, |
||
3387 | 'OBERE-', 'OBER', None, |
||
3388 | 'OBER^^', 'OBA', 'UBA', |
||
3389 | 'OEU2', 'Ö', 'Ö', |
||
3390 | 'OE<2', 'Ö', 'Ö', |
||
3391 | 'OGL-', 'OK', None, |
||
3392 | 'OGNIE-', 'ONI', 'UNI', |
||
3393 | 'OGN(AEOU)-$', 'ONI', 'UNI', |
||
3394 | 'OH(AIOÖUÜY)-', 'OH', None, |
||
3395 | 'OIE$', 'Ö', 'Ö', |
||
3396 | 'OIRE$', 'OA', 'UA', |
||
3397 | 'OIR$', 'OA', 'UA', |
||
3398 | 'OIX', 'OA', 'UA', |
||
3399 | 'OI<3', 'EU', 'EU', |
||
3400 | 'OKAY^$', 'OKE', 'UKE', |
||
3401 | 'OLYN$', 'OLIN', 'ULIN', |
||
3402 | 'OO(DLMZ)-', 'U', None, |
||
3403 | 'OO$', 'U', None, |
||
3404 | 'OO-', '', '', |
||
3405 | 'ORGINAL-----', 'ORI', 'URI', |
||
3406 | 'OTI(AÄOÖUÜ)-', 'OZI', 'UZI', |
||
3407 | 'OUI^', 'WI', 'FI', |
||
3408 | 'OUILLE$', 'ULIE', 'ULIE', |
||
3409 | 'OU(DT)-^', 'AU', 'AU', |
||
3410 | 'OUSE$', 'AUS', 'AUZ', |
||
3411 | 'OUT-', 'AU', 'AU', |
||
3412 | 'OU', 'U', 'U', |
||
3413 | 'O(FV)$', 'AU', 'AU', # due to 'OW$' -> 'AU' |
||
3414 | 'OVER--<', 'OW', None, |
||
3415 | 'OV(AOU)-', 'OW', None, |
||
3416 | 'OW$', 'AU', 'AU', |
||
3417 | 'OWS$', 'OS', 'UZ', |
||
3418 | 'OJ(AÄEIOÖUÜ)--', 'O', 'U', |
||
3419 | 'OYER', 'OIA', None, |
||
3420 | 'OY(AÄEIOÖUÜ)--', 'O', 'U', |
||
3421 | 'O(JY)<', 'EU', 'EU', |
||
3422 | 'OZ$', 'OS', None, |
||
3423 | 'O´^', 'O', 'U', |
||
3424 | 'O\'^', 'O', 'U', |
||
3425 | 'O', None, 'U', |
||
3426 | 'PATIEN--^', 'PAZI', 'PAZI', |
||
3427 | 'PENSIO-^', 'PANSI', 'PANZI', |
||
3428 | 'PE(LMNRST)-3^', 'PE', 'PE', |
||
3429 | 'PFER-^', 'FE', 'FE', |
||
3430 | 'P(FH)<', 'F', 'F', |
||
3431 | 'PIC^$', 'PIK', 'PIK', |
||
3432 | 'PIC$', 'PIZ', 'PIZ', |
||
3433 | 'PIPELINE', 'PEIBLEIN', 'PEIBLEIN', |
||
3434 | 'POLYP-', 'POLÜ', None, |
||
3435 | 'POLY^^', 'POLI', 'PULI', |
||
3436 | 'PORTRAIT7', 'PORTRE', 'PURTRE', |
||
3437 | 'POWER7', 'PAUA', 'PAUA', |
||
3438 | 'PP(FH)--<', 'B', 'B', |
||
3439 | 'PP-', '', '', |
||
3440 | 'PRODUZ-^', 'PRODU', 'BRUTU', |
||
3441 | 'PRODUZI--', ' PRODU', ' BRUTU', |
||
3442 | 'PRIX^$', 'PRI', 'PRI', |
||
3443 | 'PS-^^', 'P', None, |
||
3444 | 'P(SßZ)^', None, 'Z', |
||
3445 | 'P(SßZ)$', 'BS', None, |
||
3446 | 'PT-^', '', '', |
||
3447 | 'PTI(AÄOÖUÜ)-3', 'BZI', 'BZI', |
||
3448 | 'PY9^', 'PÜ', None, |
||
3449 | 'P(AÄEIOÖRUÜY)-', 'P', 'P', |
||
3450 | 'P(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'P', None, |
||
3451 | 'P.^', None, 'P.', |
||
3452 | 'P^', 'P', None, |
||
3453 | 'P', 'B', 'B', |
||
3454 | 'QI-', 'Z', 'Z', |
||
3455 | 'QUARANT--', 'KARA', 'KARA', |
||
3456 | 'QUE(LMNRST)-3', 'KWE', 'KFE', |
||
3457 | 'QUE$', 'K', 'K', |
||
3458 | 'QUI(NS)$', 'KI', 'KI', |
||
3459 | 'QUIZ7', 'KWIS', None, |
||
3460 | 'Q(UV)7', 'KW', 'KF', |
||
3461 | 'Q<', 'K', 'K', |
||
3462 | 'RADFAHR----', 'RAT ', 'RAT ', |
||
3463 | 'RAEFTEZEHRE-----', 'REFTE ', 'REFTE ', |
||
3464 | 'RCH', 'RCH', 'RK', |
||
3465 | 'REA(DU)---3^', 'R', None, |
||
3466 | 'REBSERZEUG------', 'REBS ', 'REBZ ', |
||
3467 | 'RECHERCH^', 'RESHASH', 'REZAZ', |
||
3468 | 'RECYCL--', 'RIZEI', 'RIZEI', |
||
3469 | 'RE(ALST)-3^', 'RE', None, |
||
3470 | 'REE$', 'RI', 'RI', |
||
3471 | 'RER$', 'RA', 'RA', |
||
3472 | 'RE(MNR)-4', 'RE', 'RE', |
||
3473 | 'RETTE$', 'RET', 'RET', |
||
3474 | 'REUZ$', 'REUZ', None, |
||
3475 | 'REW$', 'RU', 'RU', |
||
3476 | 'RH<^', 'R', 'R', |
||
3477 | 'RJA(MN)--', 'RI', 'RI', |
||
3478 | 'ROWD-^', 'RAU', 'RAU', |
||
3479 | 'RTEMONNAIE-', 'RTMON', 'RTNUN', |
||
3480 | 'RTI(AÄOÖUÜ)-3', 'RZI', 'RZI', |
||
3481 | 'RTIEL--3', 'RZI', 'RZI', |
||
3482 | 'RV(AEOU)-3', 'RW', None, |
||
3483 | 'RY(KN)-$', 'RI', 'RI', |
||
3484 | 'RY9^', 'RÜ', None, |
||
3485 | 'RÄFTEZEHRE-----', 'REFTE ', 'REFTE ', |
||
3486 | 'SAISO-^', 'SES', 'ZEZ', |
||
3487 | 'SAFE^$', 'SEIF', 'ZEIF', |
||
3488 | 'SAUCE-^', 'SOS', 'ZUZ', |
||
3489 | 'SCHLAGGEBEN-----<', 'SHLAK ', 'ZLAK ', |
||
3490 | 'SCHSCH---7', '', '', |
||
3491 | 'SCHTSCH', 'SH', 'Z', |
||
3492 | 'SC(HZ)<', 'SH', 'Z', |
||
3493 | 'SC', 'SK', 'ZK', |
||
3494 | 'SELBSTST--7^^', 'SELB', 'ZELB', |
||
3495 | 'SELBST7^^', 'SELBST', 'ZELBZT', |
||
3496 | 'SERVICE7^', 'SÖRWIS', 'ZÖRFIZ', |
||
3497 | 'SERVI-^', 'SERW', None, |
||
3498 | 'SE(LMNRST)-3^', 'SE', 'ZE', |
||
3499 | 'SETTE$', 'SET', 'ZET', |
||
3500 | 'SHP-^', 'S', 'Z', |
||
3501 | 'SHST', 'SHT', 'ZT', |
||
3502 | 'SHTSH', 'SH', 'Z', |
||
3503 | 'SHT', 'ST', 'Z', |
||
3504 | 'SHY9^', 'SHÜ', None, |
||
3505 | 'SH^^', 'SH', None, |
||
3506 | 'SH3', 'SH', 'Z', |
||
3507 | 'SICHERGEGAN-----^', 'SICHA ', 'ZIKA ', |
||
3508 | 'SICHERGEHE----^', 'SICHA ', 'ZIKA ', |
||
3509 | 'SICHERGESTEL------^', 'SICHA ', 'ZIKA ', |
||
3510 | 'SICHERSTELL-----^', 'SICHA ', 'ZIKA ', |
||
3511 | 'SICHERZU(GS)--^', 'SICHA ZU ', 'ZIKA ZU ', |
||
3512 | 'SIEGLI-^', 'SIKL', 'ZIKL', |
||
3513 | 'SIGLI-^', 'SIKL', 'ZIKL', |
||
3514 | 'SIGHT', 'SEIT', 'ZEIT', |
||
3515 | 'SIGN', 'SEIN', 'ZEIN', |
||
3516 | 'SKI(NPZ)-', 'SKI', 'ZKI', |
||
3517 | 'SKI<^', 'SHI', 'ZI', |
||
3518 | 'SODASS^$', 'SO DAS', 'ZU TAZ', |
||
3519 | 'SODAß^$', 'SO DAS', 'ZU TAZ', |
||
3520 | 'SOGENAN--^', 'SO GEN', 'ZU KEN', |
||
3521 | 'SOUND-', 'SAUN', 'ZAUN', |
||
3522 | 'STAATS^^', 'STAZ', 'ZTAZ', |
||
3523 | 'STADT^^', 'STAT', 'ZTAT', |
||
3524 | 'STANDE$', ' STANDE', ' ZTANTE', |
||
3525 | 'START^^', 'START', 'ZTART', |
||
3526 | 'STAURANT7', 'STORAN', 'ZTURAN', |
||
3527 | 'STEAK-', 'STE', 'ZTE', |
||
3528 | 'STEPHEN-^$', 'STEW', None, |
||
3529 | 'STERN', 'STERN', None, |
||
3530 | 'STRAF^^', 'STRAF', 'ZTRAF', |
||
3531 | 'ST\'S$', 'Z', 'Z', |
||
3532 | 'ST´S$', 'Z', 'Z', |
||
3533 | 'STST--', '', '', |
||
3534 | 'STS(ACEÈÉÊHIÌÍÎOUÄÜÖ)--', 'ST', 'ZT', |
||
3535 | 'ST(SZ)', 'Z', 'Z', |
||
3536 | 'SPAREN---^', 'SPA', 'ZPA', |
||
3537 | 'SPAREND----', ' SPA', ' ZPA', |
||
3538 | 'S(PTW)-^^', 'S', None, |
||
3539 | 'SP', 'SP', None, |
||
3540 | 'STYN(AE)-$', 'STIN', 'ZTIN', |
||
3541 | 'ST', 'ST', 'ZT', |
||
3542 | 'SUITE<', 'SIUT', 'ZIUT', |
||
3543 | 'SUKE--$', 'S', 'Z', |
||
3544 | 'SURF(EI)-', 'SÖRF', 'ZÖRF', |
||
3545 | 'SV(AEÈÉÊIÌÍÎOU)-<^', 'SW', None, |
||
3546 | 'SYB(IY)--^', 'SIB', None, |
||
3547 | 'SYL(KVW)--^', 'SI', None, |
||
3548 | 'SY9^', 'SÜ', None, |
||
3549 | 'SZE(NPT)-^', 'ZE', 'ZE', |
||
3550 | 'SZI(ELN)-^', 'ZI', 'ZI', |
||
3551 | 'SZCZ<', 'SH', 'Z', |
||
3552 | 'SZT<', 'ST', 'ZT', |
||
3553 | 'SZ<3', 'SH', 'Z', |
||
3554 | 'SÜL(KVW)--^', 'SI', None, |
||
3555 | 'S', None, 'Z', |
||
3556 | 'TCH', 'SH', 'Z', |
||
3557 | 'TD(AÄEIOÖRUÜY)-', 'T', None, |
||
3558 | 'TD(ÀÁÂÃÅÈÉÊËÌÍÎÏÒÓÔÕØÙÚÛÝŸ)-', 'T', None, |
||
3559 | 'TEAT-^', 'TEA', 'TEA', |
||
3560 | 'TERRAI7^', 'TERA', 'TERA', |
||
3561 | 'TE(LMNRST)-3^', 'TE', 'TE', |
||
3562 | 'TH<', 'T', 'T', |
||
3563 | 'TICHT-', 'TIK', 'TIK', |
||
3564 | 'TICH$', 'TIK', 'TIK', |
||
3565 | 'TIC$', 'TIZ', 'TIZ', |
||
3566 | 'TIGGESTELL-------', 'TIK ', 'TIK ', |
||
3567 | 'TIGSTELL-----', 'TIK ', 'TIK ', |
||
3568 | 'TOAS-^', 'TO', 'TU', |
||
3569 | 'TOILET-', 'TOLE', 'TULE', |
||
3570 | 'TOIN-', 'TOA', 'TUA', |
||
3571 | 'TRAECHTI-^', 'TRECHT', 'TREKT', |
||
3572 | 'TRAECHTIG--', ' TRECHT', ' TREKT', |
||
3573 | 'TRAINI-', 'TREN', 'TREN', |
||
3574 | 'TRÄCHTI-^', 'TRECHT', 'TREKT', |
||
3575 | 'TRÄCHTIG--', ' TRECHT', ' TREKT', |
||
3576 | 'TSCH', 'SH', 'Z', |
||
3577 | 'TSH', 'SH', 'Z', |
||
3578 | 'TST', 'ZT', 'ZT', |
||
3579 | 'T(Sß)', 'Z', 'Z', |
||
3580 | 'TT(SZ)--<', '', '', |
||
3581 | 'TT9', 'T', 'T', |
||
3582 | 'TV^$', 'TV', 'TV', |
||
3583 | 'TX(AEIOU)-3', 'SH', 'Z', |
||
3584 | 'TY9^', 'TÜ', None, |
||
3585 | 'TZ-', '', '', |
||
3586 | 'T\'S3$', 'Z', 'Z', |
||
3587 | 'T´S3$', 'Z', 'Z', |
||
3588 | 'UEBEL(GNRW)-^^', 'ÜBL ', 'IBL ', |
||
3589 | 'UEBER^^', 'ÜBA', 'IBA', |
||
3590 | 'UE2', 'Ü', 'I', |
||
3591 | 'UGL-', 'UK', None, |
||
3592 | 'UH(AOÖUÜY)-', 'UH', None, |
||
3593 | 'UIE$', 'Ü', 'I', |
||
3594 | 'UM^^', 'UM', 'UN', |
||
3595 | 'UNTERE--3', 'UNTE', 'UNTE', |
||
3596 | 'UNTER^^', 'UNTA', 'UNTA', |
||
3597 | 'UNVER^^', 'UNFA', 'UNFA', |
||
3598 | 'UN^^', 'UN', 'UN', |
||
3599 | 'UTI(AÄOÖUÜ)-', 'UZI', 'UZI', |
||
3600 | 'UVE-4', 'UW', None, |
||
3601 | 'UY2', 'UI', None, |
||
3602 | 'UZZ', 'AS', 'AZ', |
||
3603 | 'VACL-^', 'WAZ', 'FAZ', |
||
3604 | 'VAC$', 'WAZ', 'FAZ', |
||
3605 | 'VAN DEN ^', 'FANDN', 'FANTN', |
||
3606 | 'VANES-^', 'WANE', None, |
||
3607 | 'VATRO-', 'WATR', None, |
||
3608 | 'VA(DHJNT)--^', 'F', None, |
||
3609 | 'VEDD-^', 'FE', 'FE', |
||
3610 | 'VE(BEHIU)--^', 'F', None, |
||
3611 | 'VEL(BDLMNT)-^', 'FEL', None, |
||
3612 | 'VENTZ-^', 'FEN', None, |
||
3613 | 'VEN(NRSZ)-^', 'FEN', None, |
||
3614 | 'VER(AB)-^$', 'WER', None, |
||
3615 | 'VERBAL^$', 'WERBAL', None, |
||
3616 | 'VERBAL(EINS)-^', 'WERBAL', None, |
||
3617 | 'VERTEBR--', 'WERTE', None, |
||
3618 | 'VEREIN-----', 'F', None, |
||
3619 | 'VEREN(AEIOU)-^', 'WEREN', None, |
||
3620 | 'VERIFI', 'WERIFI', None, |
||
3621 | 'VERON(AEIOU)-^', 'WERON', None, |
||
3622 | 'VERSEN^', 'FERSN', 'FAZN', |
||
3623 | 'VERSIERT--^', 'WERSI', None, |
||
3624 | 'VERSIO--^', 'WERS', None, |
||
3625 | 'VERSUS', 'WERSUS', None, |
||
3626 | 'VERTI(GK)-', 'WERTI', None, |
||
3627 | 'VER^^', 'FER', 'FA', |
||
3628 | 'VERSPRECHE-------', ' FER', ' FA', |
||
3629 | 'VER$', 'WA', None, |
||
3630 | 'VER', 'FA', 'FA', |
||
3631 | 'VET(HT)-^', 'FET', 'FET', |
||
3632 | 'VETTE$', 'WET', 'FET', |
||
3633 | 'VE^', 'WE', None, |
||
3634 | 'VIC$', 'WIZ', 'FIZ', |
||
3635 | 'VIELSAGE----', 'FIL ', 'FIL ', |
||
3636 | 'VIEL', 'FIL', 'FIL', |
||
3637 | 'VIEW', 'WIU', 'FIU', |
||
3638 | 'VILL(AE)-', 'WIL', None, |
||
3639 | 'VIS(ACEIKUVWZ)-<^', 'WIS', None, |
||
3640 | 'VI(ELS)--^', 'F', None, |
||
3641 | 'VILLON--', 'WILI', 'FILI', |
||
3642 | 'VIZE^^', 'FIZE', 'FIZE', |
||
3643 | 'VLIE--^', 'FL', None, |
||
3644 | 'VL(AEIOU)--', 'W', None, |
||
3645 | 'VOKA-^', 'WOK', None, |
||
3646 | 'VOL(ATUVW)--^', 'WO', None, |
||
3647 | 'VOR^^', 'FOR', 'FUR', |
||
3648 | 'VR(AEIOU)--', 'W', None, |
||
3649 | 'VV9', 'W', None, |
||
3650 | 'VY9^', 'WÜ', 'FI', |
||
3651 | 'V(ÜY)-', 'W', None, |
||
3652 | 'V(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'W', None, |
||
3653 | 'V(AEIJLRU)-<', 'W', None, |
||
3654 | 'V.^', 'V.', None, |
||
3655 | 'V<', 'F', 'F', |
||
3656 | 'WEITERENTWI-----^', 'WEITA ', 'FEITA ', |
||
3657 | 'WEITREICH-----^', 'WEIT ', 'FEIT ', |
||
3658 | 'WEITVER^', 'WEIT FER', 'FEIT FA', |
||
3659 | 'WE(LMNRST)-3^', 'WE', 'FE', |
||
3660 | 'WER(DST)-', 'WER', None, |
||
3661 | 'WIC$', 'WIZ', 'FIZ', |
||
3662 | 'WIEDERU--', 'WIDE', 'FITE', |
||
3663 | 'WIEDER^$', 'WIDA', 'FITA', |
||
3664 | 'WIEDER^^', 'WIDA ', 'FITA ', |
||
3665 | 'WIEVIEL', 'WI FIL', 'FI FIL', |
||
3666 | 'WISUEL', 'WISUEL', None, |
||
3667 | 'WR-^', 'W', None, |
||
3668 | 'WY9^', 'WÜ', 'FI', |
||
3669 | 'W(BDFGJKLMNPQRSTZ)-', 'F', None, |
||
3670 | 'W$', 'F', None, |
||
3671 | 'W', None, 'F', |
||
3672 | 'X<^', 'Z', 'Z', |
||
3673 | 'XHAVEN$', 'XAFN', None, |
||
3674 | 'X(CSZ)', 'X', 'X', |
||
3675 | 'XTS(CH)--', 'XT', 'XT', |
||
3676 | 'XT(SZ)', 'Z', 'Z', |
||
3677 | 'YE(LMNRST)-3^', 'IE', 'IE', |
||
3678 | 'YE-3', 'I', 'I', |
||
3679 | 'YOR(GK)^$', 'IÖRK', 'IÖRK', |
||
3680 | 'Y(AOU)-<7', 'I', 'I', |
||
3681 | 'Y(BKLMNPRSTX)-1', 'Ü', None, |
||
3682 | 'YVES^$', 'IF', 'IF', |
||
3683 | 'YVONNE^$', 'IWON', 'IFUN', |
||
3684 | 'Y.^', 'Y.', None, |
||
3685 | 'Y', 'I', 'I', |
||
3686 | 'ZC(AOU)-', 'SK', 'ZK', |
||
3687 | 'ZE(LMNRST)-3^', 'ZE', 'ZE', |
||
3688 | 'ZIEJ$', 'ZI', 'ZI', |
||
3689 | 'ZIGERJA(HR)-3', 'ZIGA IA', 'ZIKA IA', |
||
3690 | 'ZL(AEIOU)-', 'SL', None, |
||
3691 | 'ZS(CHT)--', '', '', |
||
3692 | 'ZS', 'SH', 'Z', |
||
3693 | 'ZUERST', 'ZUERST', 'ZUERST', |
||
3694 | 'ZUGRUNDE^$', 'ZU GRUNDE', 'ZU KRUNTE', |
||
3695 | 'ZUGRUNDE', 'ZU GRUNDE ', 'ZU KRUNTE ', |
||
3696 | 'ZUGUNSTEN', 'ZU GUNSTN', 'ZU KUNZTN', |
||
3697 | 'ZUHAUSE-', 'ZU HAUS', 'ZU AUZ', |
||
3698 | 'ZULASTEN^$', 'ZU LASTN', 'ZU LAZTN', |
||
3699 | 'ZURUECK^^', 'ZURÜK', 'ZURIK', |
||
3700 | 'ZURZEIT', 'ZUR ZEIT', 'ZUR ZEIT', |
||
3701 | 'ZURÜCK^^', 'ZURÜK', 'ZURIK', |
||
3702 | 'ZUSTANDE', 'ZU STANDE', 'ZU ZTANTE', |
||
3703 | 'ZUTAGE', 'ZU TAGE', 'ZU TAKE', |
||
3704 | 'ZUVER^^', 'ZUFA', 'ZUFA', |
||
3705 | 'ZUVIEL', 'ZU FIL', 'ZU FIL', |
||
3706 | 'ZUWENIG', 'ZU WENIK', 'ZU FENIK', |
||
3707 | 'ZY9^', 'ZÜ', None, |
||
3708 | 'ZYK3$', 'ZIK', None, |
||
3709 | 'Z(VW)7^', 'SW', None, |
||
3710 | None, None, None) |
||
3711 | |||
3712 | phonet_hash = Counter() |
||
3713 | alpha_pos = Counter() |
||
3714 | |||
3715 | phonet_hash_1 = Counter() |
||
3716 | phonet_hash_2 = Counter() |
||
3717 | |||
3718 | _phonet_upper_translation = dict(zip((ord(_) for _ in |
||
3719 | 'abcdefghijklmnopqrstuvwxyzàáâãåäæ' + |
||
3720 | 'çðèéêëìíîïñòóôõöøœšßþùúûüýÿ'), |
||
3721 | 'ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÅÄÆ' + |
||
3722 | 'ÇÐÈÉÊËÌÍÎÏÑÒÓÔÕÖØŒŠßÞÙÚÛÜÝŸ')) |
||
3723 | |||
3724 | def _initialize_phonet(lang): |
||
3725 | """Initialize phonet variables.""" |
||
3726 | if lang == 'none': |
||
3727 | _phonet_rules = _phonet_rules_no_lang |
||
3728 | else: |
||
3729 | _phonet_rules = _phonet_rules_german |
||
3730 | |||
3731 | phonet_hash[''] = -1 |
||
3732 | |||
3733 | # German and international umlauts |
||
3734 | for j in {'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë', |
||
3735 | 'Ì', 'Í', 'Î', 'Ï', 'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', 'Ø', |
||
3736 | 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß', 'Œ', 'Š', 'Ÿ'}: |
||
3737 | alpha_pos[j] = 1 |
||
3738 | phonet_hash[j] = -1 |
||
3739 | |||
3740 | # "normal" letters ('A'-'Z') |
||
3741 | for i, j in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ'): |
||
3742 | alpha_pos[j] = i + 2 |
||
3743 | phonet_hash[j] = -1 |
||
3744 | |||
3745 | for i in range(26): |
||
3746 | for j in range(28): |
||
3747 | phonet_hash_1[i, j] = -1 |
||
3748 | phonet_hash_2[i, j] = -1 |
||
3749 | |||
3750 | # for each phonetic rule |
||
3751 | for i in range(len(_phonet_rules)): |
||
3752 | rule = _phonet_rules[i] |
||
3753 | |||
3754 | if rule and i % 3 == 0: |
||
3755 | # calculate first hash value |
||
3756 | k = _phonet_rules[i][0] |
||
3757 | |||
3758 | if phonet_hash[k] < 0 and (_phonet_rules[i+1] or |
||
3759 | _phonet_rules[i+2]): |
||
3760 | phonet_hash[k] = i |
||
3761 | |||
3762 | # calculate second hash values |
||
3763 | if k and alpha_pos[k] >= 2: |
||
3764 | k = alpha_pos[k] |
||
3765 | |||
3766 | j = k-2 |
||
3767 | rule = rule[1:] |
||
3768 | |||
3769 | if not rule: |
||
3770 | rule = ' ' |
||
3771 | elif rule[0] == '(': |
||
3772 | rule = rule[1:] |
||
3773 | else: |
||
3774 | rule = rule[0] |
||
3775 | |||
3776 | while rule and (rule[0] != ')'): |
||
3777 | k = alpha_pos[rule[0]] |
||
3778 | |||
3779 | if k > 0: |
||
3780 | # add hash value for this letter |
||
3781 | if phonet_hash_1[j, k] < 0: |
||
3782 | phonet_hash_1[j, k] = i |
||
3783 | phonet_hash_2[j, k] = i |
||
3784 | |||
3785 | if phonet_hash_2[j, k] >= (i-30): |
||
3786 | phonet_hash_2[j, k] = i |
||
3787 | else: |
||
3788 | k = -1 |
||
3789 | |||
3790 | if k <= 0: |
||
3791 | # add hash value for all letters |
||
3792 | if phonet_hash_1[j, 0] < 0: |
||
3793 | phonet_hash_1[j, 0] = i |
||
3794 | |||
3795 | phonet_hash_2[j, 0] = i |
||
3796 | |||
3797 | rule = rule[1:] |
||
3798 | |||
3799 | def _phonet(term, mode, lang): |
||
3800 | """Return the phonet coded form of a term.""" |
||
3801 | if lang == 'none': |
||
3802 | _phonet_rules = _phonet_rules_no_lang |
||
3803 | else: |
||
3804 | _phonet_rules = _phonet_rules_german |
||
3805 | |||
3806 | char0 = '' |
||
3807 | dest = term |
||
3808 | |||
3809 | if not term: |
||
3810 | return '' |
||
3811 | |||
3812 | term_length = len(term) |
||
3813 | |||
3814 | # convert input string to upper-case |
||
3815 | src = term.translate(_phonet_upper_translation) |
||
3816 | |||
3817 | # check "src" |
||
3818 | i = 0 |
||
3819 | j = 0 |
||
3820 | zeta = 0 |
||
3821 | |||
3822 | while i < len(src): |
||
3823 | char = src[i] |
||
3824 | |||
3825 | pos = alpha_pos[char] |
||
3826 | |||
3827 | if pos >= 2: |
||
3828 | xpos = pos-2 |
||
3829 | |||
3830 | if i+1 == len(src): |
||
3831 | pos = alpha_pos[''] |
||
3832 | else: |
||
3833 | pos = alpha_pos[src[i+1]] |
||
3834 | |||
3835 | start1 = phonet_hash_1[xpos, pos] |
||
3836 | start2 = phonet_hash_1[xpos, 0] |
||
3837 | end1 = phonet_hash_2[xpos, pos] |
||
3838 | end2 = phonet_hash_2[xpos, 0] |
||
3839 | |||
3840 | # preserve rule priorities |
||
3841 | if (start2 >= 0) and ((start1 < 0) or (start2 < start1)): |
||
3842 | pos = start1 |
||
3843 | start1 = start2 |
||
3844 | start2 = pos |
||
3845 | pos = end1 |
||
3846 | end1 = end2 |
||
3847 | end2 = pos |
||
3848 | |||
3849 | if (end1 >= start2) and (start2 >= 0): |
||
3850 | if end2 > end1: |
||
3851 | end1 = end2 |
||
3852 | |||
3853 | start2 = -1 |
||
3854 | end2 = -1 |
||
3855 | else: |
||
3856 | pos = phonet_hash[char] |
||
3857 | start1 = pos |
||
3858 | end1 = 10000 |
||
3859 | start2 = -1 |
||
3860 | end2 = -1 |
||
3861 | |||
3862 | pos = start1 |
||
3863 | zeta0 = 0 |
||
3864 | |||
3865 | if pos >= 0: |
||
3866 | # check rules for this char |
||
3867 | while ((_phonet_rules[pos] is None) or |
||
3868 | (_phonet_rules[pos][0] == char)): |
||
3869 | if pos > end1: |
||
3870 | if start2 > 0: |
||
3871 | pos = start2 |
||
3872 | start1 = start2 |
||
3873 | start2 = -1 |
||
3874 | end1 = end2 |
||
3875 | end2 = -1 |
||
3876 | continue |
||
3877 | |||
3878 | break |
||
3879 | |||
3880 | if (((_phonet_rules[pos] is None) or |
||
3881 | (_phonet_rules[pos + mode] is None))): |
||
3882 | # no conversion rule available |
||
3883 | pos += 3 |
||
3884 | continue |
||
3885 | |||
3886 | # check whole string |
||
3887 | matches = 1 # number of matching letters |
||
3888 | priority = 5 # default priority |
||
3889 | rule = _phonet_rules[pos] |
||
3890 | rule = rule[1:] |
||
3891 | |||
3892 | while (rule and |
||
3893 | (len(src) > (i + matches)) and |
||
3894 | (src[i + matches] == rule[0]) and |
||
3895 | not rule[0].isdigit() and |
||
3896 | (rule not in '(-<^$')): |
||
3897 | matches += 1 |
||
3898 | rule = rule[1:] |
||
3899 | |||
3900 | if rule and (rule[0] == '('): |
||
3901 | # check an array of letters |
||
3902 | if (((len(src) > (i + matches)) and |
||
3903 | src[i + matches].isalpha() and |
||
3904 | (src[i + matches] in rule[1:]))): |
||
3905 | matches += 1 |
||
3906 | |||
3907 | while rule and rule[0] != ')': |
||
3908 | rule = rule[1:] |
||
3909 | |||
3910 | # if rule[0] == ')': |
||
3911 | rule = rule[1:] |
||
3912 | |||
3913 | if rule: |
||
3914 | priority0 = ord(rule[0]) |
||
3915 | else: |
||
3916 | priority0 = 0 |
||
3917 | |||
3918 | matches0 = matches |
||
3919 | |||
3920 | while rule and rule[0] == '-' and matches > 1: |
||
3921 | matches -= 1 |
||
3922 | rule = rule[1:] |
||
3923 | |||
3924 | if rule and rule[0] == '<': |
||
3925 | rule = rule[1:] |
||
3926 | |||
3927 | if rule and rule[0].isdigit(): |
||
3928 | # read priority |
||
3929 | priority = int(rule[0]) |
||
3930 | rule = rule[1:] |
||
3931 | |||
3932 | if rule and rule[0:2] == '^^': |
||
3933 | rule = rule[1:] |
||
3934 | |||
3935 | if (not rule or |
||
3936 | ((rule[0] == '^') and |
||
3937 | ((i == 0) or not src[i-1].isalpha()) and |
||
3938 | ((rule[1:2] != '$') or |
||
3939 | (not (src[i+matches0:i+matches0+1].isalpha()) and |
||
3940 | (src[i+matches0:i+matches0+1] != '.')))) or |
||
3941 | ((rule[0] == '$') and (i > 0) and |
||
3942 | src[i-1].isalpha() and |
||
3943 | ((not src[i+matches0:i+matches0+1].isalpha()) and |
||
3944 | (src[i+matches0:i+matches0+1] != '.')))): |
||
3945 | # look for continuation, if: |
||
3946 | # matches > 1 and NO '-' in first string |
||
3947 | pos0 = -1 |
||
3948 | |||
3949 | start3 = 0 |
||
3950 | start4 = 0 |
||
3951 | end3 = 0 |
||
3952 | end4 = 0 |
||
3953 | |||
3954 | if (((matches > 1) and |
||
3955 | src[i+matches:i+matches+1] and |
||
3956 | (priority0 != ord('-')))): |
||
3957 | char0 = src[i+matches-1] |
||
3958 | pos0 = alpha_pos[char0] |
||
3959 | |||
3960 | if pos0 >= 2 and src[i+matches]: |
||
3961 | xpos = pos0 - 2 |
||
3962 | pos0 = alpha_pos[src[i+matches]] |
||
3963 | start3 = phonet_hash_1[xpos, pos0] |
||
3964 | start4 = phonet_hash_1[xpos, 0] |
||
3965 | end3 = phonet_hash_2[xpos, pos0] |
||
3966 | end4 = phonet_hash_2[xpos, 0] |
||
3967 | |||
3968 | # preserve rule priorities |
||
3969 | if (((start4 >= 0) and |
||
3970 | ((start3 < 0) or (start4 < start3)))): |
||
3971 | pos0 = start3 |
||
3972 | start3 = start4 |
||
3973 | start4 = pos0 |
||
3974 | pos0 = end3 |
||
3975 | end3 = end4 |
||
3976 | end4 = pos0 |
||
3977 | |||
3978 | if (end3 >= start4) and (start4 >= 0): |
||
3979 | if end4 > end3: |
||
3980 | end3 = end4 |
||
3981 | |||
3982 | start4 = -1 |
||
3983 | end4 = -1 |
||
3984 | else: |
||
3985 | pos0 = phonet_hash[char0] |
||
3986 | start3 = pos0 |
||
3987 | end3 = 10000 |
||
3988 | start4 = -1 |
||
3989 | end4 = -1 |
||
3990 | |||
3991 | pos0 = start3 |
||
3992 | |||
3993 | # check continuation rules for src[i+matches] |
||
3994 | if pos0 >= 0: |
||
3995 | while ((_phonet_rules[pos0] is None) or |
||
3996 | (_phonet_rules[pos0][0] == char0)): |
||
3997 | if pos0 > end3: |
||
3998 | if start4 > 0: |
||
3999 | pos0 = start4 |
||
4000 | start3 = start4 |
||
4001 | start4 = -1 |
||
4002 | end3 = end4 |
||
4003 | end4 = -1 |
||
4004 | continue |
||
4005 | |||
4006 | priority0 = -1 |
||
4007 | |||
4008 | # important |
||
4009 | break |
||
4010 | |||
4011 | if (((_phonet_rules[pos0] is None) or |
||
4012 | (_phonet_rules[pos0 + mode] is None))): |
||
4013 | # no conversion rule available |
||
4014 | pos0 += 3 |
||
4015 | continue |
||
4016 | |||
4017 | # check whole string |
||
4018 | matches0 = matches |
||
4019 | priority0 = 5 |
||
4020 | rule = _phonet_rules[pos0] |
||
4021 | rule = rule[1:] |
||
4022 | |||
4023 | while (rule and |
||
4024 | (src[i+matches0:i+matches0+1] == |
||
4025 | rule[0]) and |
||
4026 | (not rule[0].isdigit() or |
||
4027 | (rule in '(-<^$'))): |
||
4028 | matches0 += 1 |
||
4029 | rule = rule[1:] |
||
4030 | |||
4031 | if rule and rule[0] == '(': |
||
4032 | # check an array of letters |
||
4033 | if ((src[i+matches0:i+matches0+1] |
||
4034 | .isalpha() and |
||
4035 | (src[i+matches0] in rule[1:]))): |
||
4036 | matches0 += 1 |
||
4037 | |||
4038 | while rule and rule[0] != ')': |
||
4039 | rule = rule[1:] |
||
4040 | |||
4041 | # if rule[0] == ')': |
||
4042 | rule = rule[1:] |
||
4043 | |||
4044 | while rule and rule[0] == '-': |
||
4045 | # "matches0" is NOT decremented |
||
4046 | # because of "if (matches0 == matches)" |
||
4047 | rule = rule[1:] |
||
4048 | |||
4049 | if rule and rule[0] == '<': |
||
4050 | rule = rule[1:] |
||
4051 | |||
4052 | if rule and rule[0].isdigit(): |
||
4053 | priority0 = int(rule[0]) |
||
4054 | rule = rule[1:] |
||
4055 | |||
4056 | if (not rule or |
||
4057 | # rule == '^' is not possible here |
||
4058 | ((rule[0] == '$') and not |
||
4059 | src[i+matches0:i+matches0+1] |
||
4060 | .isalpha() and |
||
4061 | (src[i+matches0:i+matches0+1] |
||
4062 | != '.'))): |
||
4063 | if matches0 == matches: |
||
4064 | # this is only a partial string |
||
4065 | pos0 += 3 |
||
4066 | continue |
||
4067 | |||
4068 | if priority0 < priority: |
||
4069 | # priority is too low |
||
4070 | pos0 += 3 |
||
4071 | continue |
||
4072 | |||
4073 | # continuation rule found |
||
4074 | break |
||
4075 | |||
4076 | pos0 += 3 |
||
4077 | |||
4078 | # end of "while" |
||
4079 | if ((priority0 >= priority) and |
||
4080 | ((_phonet_rules[pos0] is not None) and |
||
4081 | (_phonet_rules[pos0][0] == char0))): |
||
4082 | |||
4083 | pos += 3 |
||
4084 | continue |
||
4085 | |||
4086 | # replace string |
||
4087 | if ((_phonet_rules[pos] and |
||
4088 | ('<' in _phonet_rules[pos][1:]))): |
||
4089 | priority0 = 1 |
||
4090 | else: |
||
4091 | priority0 = 0 |
||
4092 | |||
4093 | rule = _phonet_rules[pos + mode] |
||
4094 | |||
4095 | if (priority0 == 1) and (zeta == 0): |
||
4096 | # rule with '<' is applied |
||
4097 | if ((j > 0) and rule and |
||
4098 | ((dest[j-1] == char) or |
||
4099 | (dest[j-1] == rule[0]))): |
||
4100 | j -= 1 |
||
4101 | |||
4102 | zeta0 = 1 |
||
4103 | zeta += 1 |
||
4104 | matches0 = 0 |
||
4105 | |||
4106 | while rule and src[i+matches0]: |
||
4107 | src = (src[0:i+matches0] + rule[0] + |
||
4108 | src[i+matches0+1:]) |
||
4109 | matches0 += 1 |
||
4110 | rule = rule[1:] |
||
4111 | |||
4112 | if matches0 < matches: |
||
4113 | src = (src[0:i+matches0] + |
||
4114 | src[i+matches:]) |
||
4115 | |||
4116 | char = src[i] |
||
4117 | else: |
||
4118 | i = i + matches - 1 |
||
4119 | zeta = 0 |
||
4120 | |||
4121 | while len(rule) > 1: |
||
4122 | if (j == 0) or (dest[j - 1] != rule[0]): |
||
4123 | dest = (dest[0:j] + rule[0] + |
||
4124 | dest[min(len(dest), j+1):]) |
||
4125 | j += 1 |
||
4126 | |||
4127 | rule = rule[1:] |
||
4128 | |||
4129 | # new "current char" |
||
4130 | if not rule: |
||
4131 | rule = '' |
||
4132 | char = '' |
||
4133 | else: |
||
4134 | char = rule[0] |
||
4135 | |||
4136 | if ((_phonet_rules[pos] and |
||
4137 | '^^' in _phonet_rules[pos][1:])): |
||
4138 | if char: |
||
4139 | dest = (dest[0:j] + char + |
||
4140 | dest[min(len(dest), j + 1):]) |
||
4141 | j += 1 |
||
4142 | |||
4143 | src = src[i + 1:] |
||
4144 | i = 0 |
||
4145 | zeta0 = 1 |
||
4146 | |||
4147 | break |
||
4148 | |||
4149 | pos += 3 |
||
4150 | |||
4151 | if pos > end1 and start2 > 0: |
||
4152 | pos = start2 |
||
4153 | start1 = start2 |
||
4154 | end1 = end2 |
||
4155 | start2 = -1 |
||
4156 | end2 = -1 |
||
4157 | |||
4158 | if zeta0 == 0: |
||
4159 | if char and ((j == 0) or (dest[j-1] != char)): |
||
4160 | # delete multiple letters only |
||
4161 | dest = dest[0:j] + char + dest[min(j+1, term_length):] |
||
4162 | j += 1 |
||
4163 | |||
4164 | i += 1 |
||
4165 | zeta = 0 |
||
4166 | |||
4167 | dest = dest[0:j] |
||
4168 | |||
4169 | return dest |
||
4170 | |||
4171 | _initialize_phonet(lang) |
||
4172 | |||
4173 | word = normalize('NFKC', text_type(word)) |
||
4174 | return _phonet(word, mode, lang) |
||
4175 | |||
4176 | |||
4177 | def spfc(word): |
||
4178 | """Return the Standardized Phonetic Frequency Code (SPFC) of a word. |
||
4179 | |||
4180 | Standardized Phonetic Frequency Code is roughly Soundex-like. |
||
4181 | This implementation is based on page 19-21 of :cite:`Moore:1977`. |
||
4182 | |||
4183 | :param str word: the word to transform |
||
4184 | :returns: the SPFC value |
||
4185 | :rtype: str |
||
4186 | |||
4187 | >>> spfc('Christopher Smith') |
||
4188 | '01160' |
||
4189 | >>> spfc('Christopher Schmidt') |
||
4190 | '01160' |
||
4191 | >>> spfc('Niall Smith') |
||
4192 | '01660' |
||
4193 | >>> spfc('Niall Schmidt') |
||
4194 | '01660' |
||
4195 | |||
4196 | >>> spfc('L.Smith') |
||
4197 | '01960' |
||
4198 | >>> spfc('R.Miller') |
||
4199 | '65490' |
||
4200 | |||
4201 | >>> spfc(('L', 'Smith')) |
||
4202 | '01960' |
||
4203 | >>> spfc(('R', 'Miller')) |
||
4204 | '65490' |
||
4205 | """ |
||
4206 | _pf1 = dict(zip((ord(_) for _ in 'SZCKQVFPUWABLORDHIEMNXGJT'), |
||
4207 | '0011112222334445556666777')) |
||
4208 | _pf2 = dict(zip((ord(_) for _ in |
||
4209 | 'SZCKQFPXABORDHIMNGJTUVWEL'), |
||
4210 | '0011122233445556677788899')) |
||
4211 | _pf3 = dict(zip((ord(_) for _ in |
||
4212 | 'BCKQVDTFLPGJXMNRSZAEHIOUWY'), |
||
4213 | '00000112223334456677777777')) |
||
4214 | |||
4215 | _substitutions = (('DK', 'K'), ('DT', 'T'), ('SC', 'S'), ('KN', 'N'), |
||
4216 | ('MN', 'N')) |
||
4217 | |||
4218 | def _raise_word_ex(): |
||
4219 | """Raise an AttributeError.""" |
||
4220 | raise AttributeError('word attribute must be a string with a space ' + |
||
4221 | 'or period dividing the first and last names ' + |
||
4222 | 'or a tuple/list consisting of the first and ' + |
||
4223 | 'last names') |
||
4224 | |||
4225 | if not word: |
||
4226 | return '' |
||
4227 | |||
4228 | if isinstance(word, (str, text_type)): |
||
4229 | names = word.split('.', 1) |
||
4230 | if len(names) != 2: |
||
4231 | names = word.split(' ', 1) |
||
4232 | if len(names) != 2: |
||
4233 | _raise_word_ex() |
||
4234 | elif hasattr(word, '__iter__'): |
||
4235 | if len(word) != 2: |
||
4236 | _raise_word_ex() |
||
4237 | names = word |
||
4238 | else: |
||
4239 | _raise_word_ex() |
||
4240 | |||
4241 | names = [normalize('NFKD', text_type(_.strip() |
||
4242 | .replace('ß', 'SS') |
||
4243 | .upper())) |
||
4244 | for _ in names] |
||
4245 | code = '' |
||
4246 | |||
4247 | def steps_one_to_three(name): |
||
4248 | """Perform the first three steps of SPFC.""" |
||
4249 | # filter out non A-Z |
||
4250 | name = ''.join(_ for _ in name if _ in |
||
4251 | {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', |
||
4252 | 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', |
||
4253 | 'W', 'X', 'Y', 'Z'}) |
||
4254 | |||
4255 | # 1. In the field, convert DK to K, DT to T, SC to S, KN to N, |
||
4256 | # and MN to N |
||
4257 | for subst in _substitutions: |
||
4258 | name = name.replace(subst[0], subst[1]) |
||
4259 | |||
4260 | # 2. In the name field, replace multiple letters with a single letter |
||
4261 | name = _delete_consecutive_repeats(name) |
||
4262 | |||
4263 | # 3. Remove vowels, W, H, and Y, but keep the first letter in the name |
||
4264 | # field. |
||
4265 | if name: |
||
4266 | name = name[0] + ''.join(_ for _ in name[1:] if _ not in |
||
4267 | {'A', 'E', 'H', 'I', 'O', 'U', 'W', 'Y'}) |
||
4268 | return name |
||
4269 | |||
4270 | names = [steps_one_to_three(_) for _ in names] |
||
4271 | |||
4272 | # 4. The first digit of the code is obtained using PF1 and the first letter |
||
4273 | # of the name field. Remove this letter after coding. |
||
4274 | if names[1]: |
||
4275 | code += names[1][0].translate(_pf1) |
||
4276 | names[1] = names[1][1:] |
||
4277 | |||
4278 | # 5. Using the last letters of the name, use Table PF3 to obtain the |
||
4279 | # second digit of the code. Use as many letters as possible and remove |
||
4280 | # after coding. |
||
4281 | if names[1]: |
||
4282 | if names[1][-3:] == 'STN' or names[1][-3:] == 'PRS': |
||
4283 | code += '8' |
||
4284 | names[1] = names[1][:-3] |
||
4285 | elif names[1][-2:] == 'SN': |
||
4286 | code += '8' |
||
4287 | names[1] = names[1][:-2] |
||
4288 | elif names[1][-3:] == 'STR': |
||
4289 | code += '9' |
||
4290 | names[1] = names[1][:-3] |
||
4291 | elif names[1][-2:] in {'SR', 'TN', 'TD'}: |
||
4292 | code += '9' |
||
4293 | names[1] = names[1][:-2] |
||
4294 | elif names[1][-3:] == 'DRS': |
||
4295 | code += '7' |
||
4296 | names[1] = names[1][:-3] |
||
4297 | elif names[1][-2:] in {'TR', 'MN'}: |
||
4298 | code += '7' |
||
4299 | names[1] = names[1][:-2] |
||
4300 | else: |
||
4301 | code += names[1][-1].translate(_pf3) |
||
4302 | names[1] = names[1][:-1] |
||
4303 | |||
4304 | # 6. The third digit is found using Table PF2 and the first character of |
||
4305 | # the first name. Remove after coding. |
||
4306 | if names[0]: |
||
4307 | code += names[0][0].translate(_pf2) |
||
4308 | names[0] = names[0][1:] |
||
4309 | |||
4310 | # 7. The fourth digit is found using Table PF2 and the first character of |
||
4311 | # the name field. If no letters remain use zero. After coding remove the |
||
4312 | # letter. |
||
4313 | # 8. The fifth digit is found in the same manner as the fourth using the |
||
4314 | # remaining characters of the name field if any. |
||
4315 | for _ in range(2): |
||
4316 | if names[1]: |
||
4317 | code += names[1][0].translate(_pf2) |
||
4318 | names[1] = names[1][1:] |
||
4319 | else: |
||
4320 | code += '0' |
||
4321 | |||
4322 | return code |
||
4323 | |||
4324 | |||
4325 | def statistics_canada(word, maxlength=4): |
||
4326 | """Return the Statistics Canada code for a word. |
||
4327 | |||
4328 | The original description of this algorithm could not be located, and |
||
4329 | may only have been specified in an unpublished TR. The coding does not |
||
4330 | appear to be in use by Statistics Canada any longer. In its place, this is |
||
4331 | an implementation of the "Census modified Statistics Canada name coding |
||
4332 | procedure". |
||
4333 | |||
4334 | The modified version of this algorithm is described in Appendix B of |
||
4335 | :cite:`Moore:1977`. |
||
4336 | |||
4337 | :param str word: the word to transform |
||
4338 | :param int maxlength: the maximum length (default 6) of the code to return |
||
4339 | :param bool modified: indicates whether to use USDA modified algorithm |
||
4340 | :returns: the Statistics Canada name code value |
||
4341 | :rtype: str |
||
4342 | |||
4343 | >>> statistics_canada('Christopher') |
||
4344 | 'CHRS' |
||
4345 | >>> statistics_canada('Niall') |
||
4346 | 'NL' |
||
4347 | >>> statistics_canada('Smith') |
||
4348 | 'SMTH' |
||
4349 | >>> statistics_canada('Schmidt') |
||
4350 | 'SCHM' |
||
4351 | """ |
||
4352 | # uppercase, normalize, decompose, and filter non-A-Z out |
||
4353 | word = normalize('NFKD', text_type(word.upper())) |
||
4354 | word = word.replace('ß', 'SS') |
||
4355 | word = ''.join(c for c in word if c in |
||
4356 | {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
||
4357 | 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
||
4358 | 'Y', 'Z'}) |
||
4359 | if not word: |
||
4360 | return '' |
||
4361 | |||
4362 | code = word[1:] |
||
4363 | for vowel in {'A', 'E', 'I', 'O', 'U', 'Y'}: |
||
4364 | code = code.replace(vowel, '') |
||
4365 | code = word[0]+code |
||
4366 | code = _delete_consecutive_repeats(code) |
||
4367 | code = code.replace(' ', '') |
||
4368 | |||
4369 | return code[:maxlength] |
||
4370 | |||
4371 | |||
4372 | def lein(word, maxlength=4, zero_pad=True): |
||
4373 | """Return the Lein code for a word. |
||
4374 | |||
4375 | This is Lein name coding, described in :cite:`Moore:1977`. |
||
4376 | |||
4377 | :param str word: the word to transform |
||
4378 | :param int maxlength: the maximum length (default 4) of the code to return |
||
4379 | :param bool zero_pad: pad the end of the return value with 0s to achieve a |
||
4380 | maxlength string |
||
4381 | :returns: the Lein code |
||
4382 | :rtype: str |
||
4383 | |||
4384 | >>> lein('Christopher') |
||
4385 | 'C351' |
||
4386 | >>> lein('Niall') |
||
4387 | 'N300' |
||
4388 | >>> lein('Smith') |
||
4389 | 'S210' |
||
4390 | >>> lein('Schmidt') |
||
4391 | 'S521' |
||
4392 | """ |
||
4393 | _lein_translation = dict(zip((ord(_) for _ in |
||
4394 | 'BCDFGJKLMNPQRSTVXZ'), |
||
4395 | '451455532245351455')) |
||
4396 | |||
4397 | # uppercase, normalize, decompose, and filter non-A-Z out |
||
4398 | word = normalize('NFKD', text_type(word.upper())) |
||
4399 | word = word.replace('ß', 'SS') |
||
4400 | word = ''.join(c for c in word if c in |
||
4401 | {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
||
4402 | 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
||
4403 | 'Y', 'Z'}) |
||
4404 | |||
4405 | if not word: |
||
4406 | return '' |
||
4407 | |||
4408 | code = word[0] # Rule 1 |
||
4409 | word = word[1:].translate({32: None, 65: None, 69: None, 72: None, |
||
4410 | 73: None, 79: None, 85: None, 87: None, |
||
4411 | 89: None}) # Rule 2 |
||
4412 | word = _delete_consecutive_repeats(word) # Rule 3 |
||
4413 | code += word.translate(_lein_translation) # Rule 4 |
||
4414 | |||
4415 | if zero_pad: |
||
4416 | code += ('0'*maxlength) # Rule 4 |
||
4417 | |||
4418 | return code[:maxlength] |
||
4419 | |||
4420 | |||
4421 | def roger_root(word, maxlength=5, zero_pad=True): |
||
4422 | """Return the Roger Root code for a word. |
||
4423 | |||
4424 | This is Roger Root name coding, described in :cite:`Moore:1977`. |
||
4425 | |||
4426 | :param str word: the word to transform |
||
4427 | :param int maxlength: the maximum length (default 5) of the code to return |
||
4428 | :param bool zero_pad: pad the end of the return value with 0s to achieve a |
||
4429 | maxlength string |
||
4430 | :returns: the Roger Root code |
||
4431 | :rtype: str |
||
4432 | |||
4433 | >>> roger_root('Christopher') |
||
4434 | '06401' |
||
4435 | >>> roger_root('Niall') |
||
4436 | '02500' |
||
4437 | >>> roger_root('Smith') |
||
4438 | '00310' |
||
4439 | >>> roger_root('Schmidt') |
||
4440 | '06310' |
||
4441 | """ |
||
4442 | # uppercase, normalize, decompose, and filter non-A-Z out |
||
4443 | word = normalize('NFKD', text_type(word.upper())) |
||
4444 | word = word.replace('ß', 'SS') |
||
4445 | word = ''.join(c for c in word if c in |
||
4446 | {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
||
4447 | 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
||
4448 | 'Y', 'Z'}) |
||
4449 | |||
4450 | if not word: |
||
4451 | return '' |
||
4452 | |||
4453 | # '*' is used to prevent combining by _delete_consecutive_repeats() |
||
4454 | _init_patterns = {4: {'TSCH': '06'}, |
||
4455 | 3: {'TSH': '06', 'SCH': '06'}, |
||
4456 | 2: {'CE': '0*0', 'CH': '06', 'CI': '0*0', 'CY': '0*0', |
||
4457 | 'DG': '07', 'GF': '08', 'GM': '03', 'GN': '02', |
||
4458 | 'KN': '02', 'PF': '08', 'PH': '08', 'PN': '02', |
||
4459 | 'SH': '06', 'TS': '0*0', 'WR': '04'}, |
||
4460 | 1: {'A': '1', 'B': '09', 'C': '07', 'D': '01', 'E': '1', |
||
4461 | 'F': '08', 'G': '07', 'H': '2', 'I': '1', 'J': '3', |
||
4462 | 'K': '07', 'L': '05', 'M': '03', 'N': '02', 'O': '1', |
||
4463 | 'P': '09', 'Q': '07', 'R': '04', 'S': '0*0', |
||
4464 | 'T': '01', 'U': '1', 'V': '08', 'W': '4', 'X': '07', |
||
4465 | 'Y': '5', 'Z': '0*0'}} |
||
4466 | |||
4467 | _med_patterns = {4: {'TSCH': '6'}, |
||
4468 | 3: {'TSH': '6', 'SCH': '6'}, |
||
4469 | 2: {'CE': '0', 'CH': '6', 'CI': '0', 'CY': '0', 'DG': '7', |
||
4470 | 'PH': '8', 'SH': '6', 'TS': '0'}, |
||
4471 | 1: {'B': '9', 'C': '7', 'D': '1', 'F': '8', 'G': '7', |
||
4472 | 'J': '6', 'K': '7', 'L': '5', 'M': '3', 'N': '2', |
||
4473 | 'P': '9', 'Q': '7', 'R': '4', 'S': '0', 'T': '1', |
||
4474 | 'V': '8', 'X': '7', 'Z': '0', |
||
4475 | 'A': '*', 'E': '*', 'H': '*', 'I': '*', 'O': '*', |
||
4476 | 'U': '*', 'W': '*', 'Y': '*'}} |
||
4477 | |||
4478 | code = '' |
||
4479 | pos = 0 |
||
4480 | |||
4481 | # Do first digit(s) first |
||
4482 | for num in range(4, 0, -1): |
||
4483 | if word[:num] in _init_patterns[num]: |
||
4484 | code = _init_patterns[num][word[:num]] |
||
4485 | pos += num |
||
4486 | break |
||
4487 | else: |
||
4488 | pos += 1 # Advance if nothing is recognized |
||
4489 | |||
4490 | # Then code subsequent digits |
||
4491 | while pos < len(word): |
||
4492 | for num in range(4, 0, -1): |
||
4493 | if word[pos:pos+num] in _med_patterns[num]: |
||
4494 | code += _med_patterns[num][word[pos:pos+num]] |
||
4495 | pos += num |
||
4496 | break |
||
4497 | else: |
||
4498 | pos += 1 # Advance if nothing is recognized |
||
4499 | |||
4500 | code = _delete_consecutive_repeats(code) |
||
4501 | code = code.replace('*', '') |
||
4502 | |||
4503 | if zero_pad: |
||
4504 | code += '0'*maxlength |
||
4505 | |||
4506 | return code[:maxlength] |
||
4507 | |||
4508 | |||
4509 | def onca(word, maxlength=4, zero_pad=True): |
||
4510 | """Return the Oxford Name Compression Algorithm (ONCA) code for a word. |
||
4511 | |||
4512 | This is the Oxford Name Compression Algorithm, based on :cite:`Gill:1997`. |
||
4513 | |||
4514 | I can find no complete description of the "anglicised version of the NYSIIS |
||
4515 | method" identified as the first step in this algorithm, so this is likely |
||
4516 | not a precisely correct implementation, in that it employs the standard |
||
4517 | NYSIIS algorithm. |
||
4518 | |||
4519 | :param str word: the word to transform |
||
4520 | :param int maxlength: the maximum length (default 5) of the code to return |
||
4521 | :param bool zero_pad: pad the end of the return value with 0s to achieve a |
||
4522 | maxlength string |
||
4523 | :returns: the ONCA code |
||
4524 | :rtype: str |
||
4525 | |||
4526 | >>> onca('Christopher') |
||
4527 | 'C623' |
||
4528 | >>> onca('Niall') |
||
4529 | 'N400' |
||
4530 | >>> onca('Smith') |
||
4531 | 'S530' |
||
4532 | >>> onca('Schmidt') |
||
4533 | 'S530' |
||
4534 | """ |
||
4535 | # In the most extreme case, 3 characters of NYSIIS input can be compressed |
||
4536 | # to one character of output, so give it triple the maxlength. |
||
4537 | return soundex(nysiis(word, maxlength=maxlength*3), maxlength, |
||
4538 | zero_pad=zero_pad) |
||
4539 | |||
4540 | |||
4541 | def eudex(word, maxlength=8): |
||
4542 | """Return the eudex phonetic hash of a word. |
||
4543 | |||
4544 | This implementation of eudex phonetic hashing is based on the specification |
||
4545 | (not the reference implementation) at :cite:`Ticki:2016`. |
||
4546 | |||
4547 | Further details can be found at :cite:`Ticki:2016b`. |
||
4548 | |||
4549 | :param str word: the word to transform |
||
4550 | :param int maxlength: the length of the code returned (defaults to 8) |
||
4551 | :returns: the eudex hash |
||
4552 | :rtype: int |
||
4553 | """ |
||
4554 | _trailing_phones = { |
||
4555 | 'a': 0, # a |
||
4556 | 'b': 0b01001000, # b |
||
4557 | 'c': 0b00001100, # c |
||
4558 | 'd': 0b00011000, # d |
||
4559 | 'e': 0, # e |
||
4560 | 'f': 0b01000100, # f |
||
4561 | 'g': 0b00001000, # g |
||
4562 | 'h': 0b00000100, # h |
||
4563 | 'i': 1, # i |
||
4564 | 'j': 0b00000101, # j |
||
4565 | 'k': 0b00001001, # k |
||
4566 | 'l': 0b10100000, # l |
||
4567 | 'm': 0b00000010, # m |
||
4568 | 'n': 0b00010010, # n |
||
4569 | 'o': 0, # o |
||
4570 | 'p': 0b01001001, # p |
||
4571 | 'q': 0b10101000, # q |
||
4572 | 'r': 0b10100001, # r |
||
4573 | 's': 0b00010100, # s |
||
4574 | 't': 0b00011101, # t |
||
4575 | 'u': 1, # u |
||
4576 | 'v': 0b01000101, # v |
||
4577 | 'w': 0b00000000, # w |
||
4578 | 'x': 0b10000100, # x |
||
4579 | 'y': 1, # y |
||
4580 | 'z': 0b10010100, # z |
||
4581 | |||
4582 | 'ß': 0b00010101, # ß |
||
4583 | 'à': 0, # à |
||
4584 | 'á': 0, # á |
||
4585 | 'â': 0, # â |
||
4586 | 'ã': 0, # ã |
||
4587 | 'ä': 0, # ä[æ] |
||
4588 | 'å': 1, # å[oː] |
||
4589 | 'æ': 0, # æ[æ] |
||
4590 | 'ç': 0b10010101, # ç[t͡ʃ] |
||
4591 | 'è': 1, # è |
||
4592 | 'é': 1, # é |
||
4593 | 'ê': 1, # ê |
||
4594 | 'ë': 1, # ë |
||
4595 | 'ì': 1, # ì |
||
4596 | 'í': 1, # í |
||
4597 | 'î': 1, # î |
||
4598 | 'ï': 1, # ï |
||
4599 | 'ð': 0b00010101, # ð[ð̠](represented as a non-plosive T) |
||
4600 | 'ñ': 0b00010111, # ñ[nj](represented as a combination of n and j) |
||
4601 | 'ò': 0, # ò |
||
4602 | 'ó': 0, # ó |
||
4603 | 'ô': 0, # ô |
||
4604 | 'õ': 0, # õ |
||
4605 | 'ö': 1, # ö[ø] |
||
4606 | '÷': 0b11111111, # ÷ |
||
4607 | 'ø': 1, # ø[ø] |
||
4608 | 'ù': 1, # ù |
||
4609 | 'ú': 1, # ú |
||
4610 | 'û': 1, # û |
||
4611 | 'ü': 1, # ü |
||
4612 | 'ý': 1, # ý |
||
4613 | 'þ': 0b00010101, # þ[ð̠](represented as a non-plosive T) |
||
4614 | 'ÿ': 1, # ÿ |
||
4615 | } |
||
4616 | |||
4617 | _initial_phones = { |
||
4618 | 'a': 0b10000100, # a* |
||
4619 | 'b': 0b00100100, # b |
||
4620 | 'c': 0b00000110, # c |
||
4621 | 'd': 0b00001100, # d |
||
4622 | 'e': 0b11011000, # e* |
||
4623 | 'f': 0b00100010, # f |
||
4624 | 'g': 0b00000100, # g |
||
4625 | 'h': 0b00000010, # h |
||
4626 | 'i': 0b11111000, # i* |
||
4627 | 'j': 0b00000011, # j |
||
4628 | 'k': 0b00000101, # k |
||
4629 | 'l': 0b01010000, # l |
||
4630 | 'm': 0b00000001, # m |
||
4631 | 'n': 0b00001001, # n |
||
4632 | 'o': 0b10010100, # o* |
||
4633 | 'p': 0b00100101, # p |
||
4634 | 'q': 0b01010100, # q |
||
4635 | 'r': 0b01010001, # r |
||
4636 | 's': 0b00001010, # s |
||
4637 | 't': 0b00001110, # t |
||
4638 | 'u': 0b11100000, # u* |
||
4639 | 'v': 0b00100011, # v |
||
4640 | 'w': 0b00000000, # w |
||
4641 | 'x': 0b01000010, # x |
||
4642 | 'y': 0b11100100, # y* |
||
4643 | 'z': 0b01001010, # z |
||
4644 | |||
4645 | 'ß': 0b00001011, # ß |
||
4646 | 'à': 0b10000101, # à |
||
4647 | 'á': 0b10000101, # á |
||
4648 | 'â': 0b10000000, # â |
||
4649 | 'ã': 0b10000110, # ã |
||
4650 | 'ä': 0b10100110, # ä [æ] |
||
4651 | 'å': 0b11000010, # å [oː] |
||
4652 | 'æ': 0b10100111, # æ [æ] |
||
4653 | 'ç': 0b01010100, # ç [t͡ʃ] |
||
4654 | 'è': 0b11011001, # è |
||
4655 | 'é': 0b11011001, # é |
||
4656 | 'ê': 0b11011001, # ê |
||
4657 | 'ë': 0b11000110, # ë [ə] or [œ] |
||
4658 | 'ì': 0b11111001, # ì |
||
4659 | 'í': 0b11111001, # í |
||
4660 | 'î': 0b11111001, # î |
||
4661 | 'ï': 0b11111001, # ï |
||
4662 | 'ð': 0b00001011, # ð [ð̠] (represented as a non-plosive T) |
||
4663 | 'ñ': 0b00001011, # ñ [nj] (represented as a combination of n and j) |
||
4664 | 'ò': 0b10010101, # ò |
||
4665 | 'ó': 0b10010101, # ó |
||
4666 | 'ô': 0b10010101, # ô |
||
4667 | 'õ': 0b10010101, # õ |
||
4668 | 'ö': 0b11011100, # ö [œ] or [ø] |
||
4669 | '÷': 0b11111111, # ÷ |
||
4670 | 'ø': 0b11011101, # ø [œ] or [ø] |
||
4671 | 'ù': 0b11100001, # ù |
||
4672 | 'ú': 0b11100001, # ú |
||
4673 | 'û': 0b11100001, # û |
||
4674 | 'ü': 0b11100101, # ü |
||
4675 | 'ý': 0b11100101, # ý |
||
4676 | 'þ': 0b00001011, # þ [ð̠] (represented as a non-plosive T) |
||
4677 | 'ÿ': 0b11100101, # ÿ |
||
4678 | } |
||
4679 | # Lowercase input & filter unknown characters |
||
4680 | word = ''.join(char for char in word.lower() if char in _initial_phones) |
||
4681 | |||
4682 | if not word: |
||
4683 | word = '÷' |
||
4684 | |||
4685 | # Perform initial eudex coding of each character |
||
4686 | values = [_initial_phones[word[0]]] |
||
4687 | values += [_trailing_phones[char] for char in word[1:]] |
||
4688 | |||
4689 | # Right-shift by one to determine if second instance should be skipped |
||
4690 | shifted_values = [_ >> 1 for _ in values] |
||
4691 | condensed_values = [values[0]] |
||
4692 | for n in range(1, len(shifted_values)): |
||
4693 | if shifted_values[n] != shifted_values[n-1]: |
||
4694 | condensed_values.append(values[n]) |
||
4695 | |||
4696 | # Add padding after first character & trim beyond maxlength |
||
4697 | values = ([condensed_values[0]] + |
||
4698 | [0]*max(0, maxlength - len(condensed_values)) + |
||
4699 | condensed_values[1:maxlength]) |
||
4700 | |||
4701 | # Combine individual character values into eudex hash |
||
4702 | hash_value = 0 |
||
4703 | for val in values: |
||
4704 | hash_value = (hash_value << 8) | val |
||
4705 | |||
4706 | return hash_value |
||
4707 | |||
4708 | |||
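The final loop above packs one byte per retained character into a single integer (the eudex hash). A minimal illustration of that packing, using arbitrary byte values rather than real eudex phone codes:

    # Illustration only: arbitrary 8-bit values, not actual eudex phone codes.
    values = [0b10000100, 0b00100100, 0b10010100]
    packed = 0
    for v in values:
        packed = (packed << 8) | v   # shift existing bytes left, append v as the low byte
    # packed now holds one byte per value, with the first value in the highest byte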
4709 | def haase_phonetik(word, primary_only=False): |
||
4710 | """Return the Haase Phonetik (numeric output) code for a word. |
||
4711 | |||
4712 | Based on the algorithm described at :cite:`Prante:2015`. |
||
4713 | |||
4714 | Based on the original :cite:`Haase:2000`. |
||
4715 | |||
4716 | While the output code is numeric, it is nevertheless a str. |
||
4717 | |||
4718 | :param str word: the word to transform |
||
4719 | :returns: the Haase Phonetik value as a numeric string |
||
4720 | :rtype: str |
||
4721 | """ |
||
4722 | def _after(word, i, letters): |
||
4723 | """Return True if word[i] follows one of the supplied letters.""" |
||
4724 | if i > 0 and word[i-1] in letters: |
||
4725 | return True |
||
4726 | return False |
||
4727 | |||
4728 | def _before(word, i, letters): |
||
4729 | """Return True if word[i] precedes one of the supplied letters.""" |
||
4730 | if i+1 < len(word) and word[i+1] in letters: |
||
4731 | return True |
||
4732 | return False |
||
4733 | |||
4734 | _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'} |
||
4735 | |||
4736 | word = normalize('NFKD', text_type(word.upper())) |
||
4737 | word = word.replace('ß', 'SS') |
||
4738 | |||
4739 | word = word.replace('Ä', 'AE') |
||
4740 | word = word.replace('Ö', 'OE') |
||
4741 | word = word.replace('Ü', 'UE') |
||
4742 | word = ''.join(c for c in word if c in |
||
4743 | {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
||
4744 | 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
||
4745 | 'Y', 'Z'}) |
||
4746 | |||
4747 | # Nothing to convert, return base case |
||
4748 | if not word: |
||
4749 | return '' |
||
4750 | |||
4751 | variants = [] |
||
4752 | if primary_only: |
||
4753 | variants = [word] |
||
4754 | else: |
||
4755 | pos = 0 |
||
4756 | if word[:2] == 'CH': |
||
4757 | variants.append(('CH', 'SCH')) |
||
4758 | pos += 2 |
||
4759 | len_3_vars = {'OWN': 'AUN', 'WSK': 'RSK', 'SCH': 'CH', 'GLI': 'LI', |
||
4760 | 'AUX': 'O', 'EUX': 'O'} |
||
4761 | while pos < len(word): |
||
4762 | if word[pos:pos+4] == 'ILLE': |
||
4763 | variants.append(('ILLE', 'I')) |
||
4764 | pos += 4 |
||
4765 | elif word[pos:pos+3] in len_3_vars: |
||
4766 | variants.append((word[pos:pos+3], len_3_vars[word[pos:pos+3]])) |
||
4767 | pos += 3 |
||
4768 | elif word[pos:pos+2] == 'RB': |
||
4769 | variants.append(('RB', 'RW')) |
||
4770 | pos += 2 |
||
4771 | elif len(word[pos:]) == 3 and word[pos:] == 'EAU': |
||
4772 | variants.append(('EAU', 'O')) |
||
4773 | pos += 3 |
||
4774 | elif len(word[pos:]) == 1 and word[pos:] in {'A', 'O'}: |
||
4775 | if word[pos:] == 'O': |
||
4776 | variants.append(('O', 'OW')) |
||
4777 | else: |
||
4778 | variants.append(('A', 'AR')) |
||
4779 | pos += 1 |
||
4780 | else: |
||
4781 | variants.append((word[pos],)) |
||
4782 | pos += 1 |
||
4783 | |||
4784 | variants = [''.join(letters) for letters in product(*variants)] |
||
4785 | |||
4786 | def _haase_code(word): |
||
4787 | sdx = '' |
||
4788 | for i in range(len(word)): |
||
4789 | if word[i] in _vowels: |
||
4790 | sdx += '9' |
||
4791 | elif word[i] == 'B': |
||
4792 | sdx += '1' |
||
4793 | elif word[i] == 'P': |
||
4794 | if _before(word, i, {'H'}): |
||
4795 | sdx += '3' |
||
4796 | else: |
||
4797 | sdx += '1' |
||
4798 | elif word[i] in {'D', 'T'}: |
||
4799 | if _before(word, i, {'C', 'S', 'Z'}): |
||
4800 | sdx += '8' |
||
4801 | else: |
||
4802 | sdx += '2' |
||
4803 | elif word[i] in {'F', 'V', 'W'}: |
||
4804 | sdx += '3' |
||
4805 | elif word[i] in {'G', 'K', 'Q'}: |
||
4806 | sdx += '4' |
||
4807 | elif word[i] == 'C': |
||
4808 | if _after(word, i, {'S', 'Z'}): |
||
4809 | sdx += '8' |
||
4810 | elif i == 0: |
||
4811 | if _before(word, i, {'A', 'H', 'K', 'L', 'O', 'Q', 'R', |
||
4812 | 'U', 'X'}): |
||
4813 | sdx += '4' |
||
4814 | else: |
||
4815 | sdx += '8' |
||
4816 | elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}): |
||
4817 | sdx += '4' |
||
4818 | else: |
||
4819 | sdx += '8' |
||
4820 | elif word[i] == 'X': |
||
4821 | if _after(word, i, {'C', 'K', 'Q'}): |
||
4822 | sdx += '8' |
||
4823 | else: |
||
4824 | sdx += '48' |
||
4825 | elif word[i] == 'L': |
||
4826 | sdx += '5' |
||
4827 | elif word[i] in {'M', 'N'}: |
||
4828 | sdx += '6' |
||
4829 | elif word[i] == 'R': |
||
4830 | sdx += '7' |
||
4831 | elif word[i] in {'S', 'Z'}: |
||
4832 | sdx += '8' |
||
4833 | |||
4834 | sdx = _delete_consecutive_repeats(sdx) |
||
4835 | |||
4836 | # if sdx: |
||
4837 | # sdx = sdx[0] + sdx[1:].replace('9', '') |
||
4838 | |||
4839 | return sdx |
||
4840 | |||
4841 | return tuple(_haase_code(word) for word in variants) |
||
4842 | |||
4843 | |||
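A minimal usage sketch for haase_phonetik (return values not shown, since they depend on the variant expansion above): the default call returns a tuple with one code per generated spelling variant, while primary_only=True codes only the unmodified spelling.

    all_variants = haase_phonetik('Schweitzer')             # tuple of numeric-string codes
    primary_code = haase_phonetik('Schweitzer', True)[0]    # single code, no variant expansion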
4844 | def reth_schek_phonetik(word): |
||
4845 | """Return Reth-Schek Phonetik code for a word. |
||
4846 | |||
4847 | This algorithm is proposed in :cite:`Reth:1977`. |
||
4848 | |||
4849 | Since I couldn't secure a copy of that document (maybe I'll look for it |
||
4850 | next time I'm in Germany), this implementation is based on what I could |
||
4851 | glean from the implementations published by German Record Linkage |
||
4852 | Center (www.record-linkage.de): |
||
4853 | |||
4854 | - Privacy-preserving Record Linkage (PPRL) (in R) :cite:`Rukasz:2018` |
||
4855 | - Merge ToolBox (in Java) :cite:`Schnell:2004` |
||
4856 | |||
4857 | Rules that are unclear: |
||
4858 | |||
4859 | - Should 'C' become 'G' or 'Z'? (PPRL has both, 'Z' rule blocked) |
||
4860 | - Should 'CC' become 'G'? (PPRL has a blocked 'CK' rule that may be a typo) |
||
4861 | - Should 'TUI' -> 'ZUI' rule exist? (PPRL has rule, but I can't |
||
4862 | think of a German word with '-tui-' in it.) |
||
4863 | - Should we really change 'SCH' -> 'CH' and then 'CH' -> 'SCH'? |
||
4864 | |||
4865 | :param str word: the word to transform |
||
4866 | :return: the Reth-Schek Phonetik code |
||
4867 | """ |
||
4868 | replacements = {3: {'AEH': 'E', 'IEH': 'I', 'OEH': 'OE', 'UEH': 'UE', |
||
4869 | 'SCH': 'CH', 'ZIO': 'TIO', 'TIU': 'TIO', 'ZIU': 'TIO', |
||
4870 | 'CHS': 'X', 'CKS': 'X', 'AEU': 'OI'}, |
||
4871 | 2: {'LL': 'L', 'AA': 'A', 'AH': 'A', 'BB': 'B', 'PP': 'B', |
||
4872 | 'BP': 'B', 'PB': 'B', 'DD': 'D', 'DT': 'D', 'TT': 'D', |
||
4873 | 'TH': 'D', 'EE': 'E', 'EH': 'E', 'AE': 'E', 'FF': 'F', |
||
4874 | 'PH': 'F', 'KK': 'K', 'GG': 'G', 'GK': 'G', 'KG': 'G', |
||
4875 | 'CK': 'G', 'CC': 'C', 'IE': 'I', 'IH': 'I', 'MM': 'M', |
||
4876 | 'NN': 'N', 'OO': 'O', 'OH': 'O', 'SZ': 'S', 'UH': 'U', |
||
4877 | 'GS': 'X', 'KS': 'X', 'TZ': 'Z', 'AY': 'AI', |
||
4878 | 'EI': 'AI', 'EY': 'AI', 'EU': 'OI', 'RR': 'R', |
||
4879 | 'SS': 'S', 'KW': 'QU'}, |
||
4880 | 1: {'P': 'B', 'T': 'D', 'V': 'F', 'W': 'F', 'C': 'G', |
||
4881 | 'K': 'G', 'Y': 'I'}} |
||
4882 | |||
4883 | # Uppercase |
||
4884 | word = word.upper() |
||
4885 | |||
4886 | # Replace umlauts/eszett |
||
4887 | word = word.replace('Ä', 'AE') |
||
4888 | word = word.replace('Ö', 'OE') |
||
4889 | word = word.replace('Ü', 'UE') |
||
4890 | word = word.replace('ß', 'SS') |
||
4891 | |||
4892 | # Main loop, using above replacements table |
||
4893 | pos = 0 |
||
4894 | while pos < len(word): |
||
4895 | for num in range(3, 0, -1): |
||
4896 | if word[pos:pos+num] in replacements[num]: |
||
4897 | word = (word[:pos] + replacements[num][word[pos:pos+num]] |
||
4898 | + word[pos+num:]) |
||
4899 | pos += 1 |
||
4900 | break |
||
4901 | else: |
||
4902 | pos += 1 # Advance if nothing is recognized |
||
4903 | |||
4904 | # Change 'CH' back(?) to 'SCH' |
||
4905 | word = word.replace('CH', 'SCH') |
||
4906 | |||
4907 | # Replace final sequences |
||
4908 | if word[-2:] == 'ER': |
||
4909 | word = word[:-2]+'R' |
||
4910 | elif word[-2:] == 'EL': |
||
4911 | word = word[:-2]+'L' |
||
4912 | elif word[-1] == 'H': |
||
4913 | word = word[:-1] |
||
4914 | |||
4915 | return word |
||
4916 | |||
4917 | |||
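The main loop above tries 3-, then 2-, then 1-character substitutions at each position and advances by one character either way. A stripped-down sketch of that scan, using a toy subset of the table:

    replacements = {3: {'SCH': 'CH'}, 2: {'EI': 'AI'}, 1: {'P': 'B'}}   # toy subset only
    word, pos = 'SCHEIP', 0
    while pos < len(word):
        for num in (3, 2, 1):
            if word[pos:pos+num] in replacements[num]:
                word = word[:pos] + replacements[num][word[pos:pos+num]] + word[pos+num:]
                break
        pos += 1   # advance one position whether or not a rule fired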
4918 | def fonem(word): |
||
4919 | """Return the FONEM code of a word. |
||
4920 | |||
4921 | FONEM is a phonetic algorithm designed for French (particularly surnames in |
||
4922 | Saguenay, Canada), defined in :cite:`Bouchard:1981`. |
||
4923 | |||
4924 | Guillaume Plique's Javascript implementation :cite:`Plique:2018` at |
||
4925 | https://github.com/Yomguithereal/talisman/blob/master/src/phonetics/french/fonem.js |
||
4926 | was also consulted for this implementation. |
||
4927 | |||
4928 | :param str word: the word to transform |
||
4929 | :returns: the FONEM code |
||
4930 | :rtype: str |
||
4931 | """ |
||
4932 | # I don't see a sane way of doing this without regexps :( |
||
4933 | rule_table = { |
||
4934 | # Vowels & groups of vowels |
||
4935 | 'V-1': (re_compile('E?AU'), 'O'), |
||
4936 | 'V-2,5': (re_compile('(E?AU|O)L[TX]$'), 'O'), |
||
4937 | 'V-3,4': (re_compile('E?AU[TX]$'), 'O'), |
||
4938 | 'V-6': (re_compile('E?AUL?D$'), 'O'), |
||
4939 | 'V-7': (re_compile(r'(?<!G)AY$'), 'E'), |
||
4940 | 'V-8': (re_compile('EUX$'), 'EU'), |
||
4941 | 'V-9': (re_compile('EY(?=$|[BCDFGHJKLMNPQRSTVWXZ])'), 'E'), |
||
4942 | 'V-10': ('Y', 'I'), |
||
4943 | 'V-11': (re_compile('(?<=[AEIOUY])I(?=[AEIOUY])'), 'Y'), |
||
4944 | 'V-12': (re_compile('(?<=[AEIOUY])ILL'), 'Y'), |
||
4945 | 'V-13': (re_compile('OU(?=[AEOU]|I(?!LL))'), 'W'), |
||
4946 | 'V-14': (re_compile(r'([AEIOUY])(?=\1)'), ''), |
||
4947 | # Nasal vowels |
||
4948 | 'V-15': (re_compile('[AE]M(?=[BCDFGHJKLMPQRSTVWXZ])(?!$)'), 'EN'), |
||
4949 | 'V-16': (re_compile('OM(?=[BCDFGHJKLMPQRSTVWXZ])'), 'ON'), |
||
4950 | 'V-17': (re_compile('AN(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'EN'), |
||
4951 | 'V-18': (re_compile('(AI[MN]|EIN)(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), |
||
4952 | 'IN'), |
||
4953 | 'V-19': (re_compile('B(O|U|OU)RNE?$'), 'BURN'), |
||
4954 | 'V-20': (re_compile('(^IM|(?<=[BCDFGHJKLMNPQRSTVWXZ])' + |
||
4955 | 'IM(?=[BCDFGHJKLMPQRSTVWXZ]))'), 'IN'), |
||
4956 | # Consonants and groups of consonants |
||
4957 | 'C-1': ('BV', 'V'), |
||
4958 | 'C-2': (re_compile('(?<=[AEIOUY])C(?=[EIY])'), 'SS'), |
||
4959 | 'C-3': (re_compile('(?<=[BDFGHJKLMNPQRSTVWZ])C(?=[EIY])'), 'S'), |
||
4960 | 'C-4': (re_compile('^C(?=[EIY])'), 'S'), |
||
4961 | 'C-5': (re_compile('^C(?=[OUA])'), 'K'), |
||
4962 | 'C-6': (re_compile('(?<=[AEIOUY])C$'), 'K'), |
||
4963 | 'C-7': (re_compile('C(?=[BDFGJKLMNPQRSTVWXZ])'), 'K'), |
||
4964 | 'C-8': (re_compile('CC(?=[AOU])'), 'K'), |
||
4965 | 'C-9': (re_compile('CC(?=[EIY])'), 'X'), |
||
4966 | 'C-10': (re_compile('G(?=[EIY])'), 'J'), |
||
4967 | 'C-11': (re_compile('GA(?=I?[MN])'), 'G#'), |
||
4968 | 'C-12': (re_compile('GE(O|AU)'), 'JO'), |
||
4969 | 'C-13': (re_compile('GNI(?=[AEIOUY])'), 'GN'), |
||
4970 | 'C-14': (re_compile('(?<![PCS])H'), ''), |
||
4971 | 'C-15': ('JEA', 'JA'), |
||
4972 | 'C-16': (re_compile('^MAC(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'MA#'), |
||
4973 | 'C-17': (re_compile('^MC'), 'MA#'), |
||
4974 | 'C-18': ('PH', 'F'), |
||
4975 | 'C-19': ('QU', 'K'), |
||
4976 | 'C-20': (re_compile('^SC(?=[EIY])'), 'S'), |
||
4977 | 'C-21': (re_compile('(?<=.)SC(?=[EIY])'), 'SS'), |
||
4978 | 'C-22': (re_compile('(?<=.)SC(?=[AOU])'), 'SK'), |
||
4979 | 'C-23': ('SH', 'CH'), |
||
4980 | 'C-24': (re_compile('TIA$'), 'SSIA'), |
||
4981 | 'C-25': (re_compile('(?<=[AIOUY])W'), ''), |
||
4982 | 'C-26': (re_compile('X[CSZ]'), 'X'), |
||
4983 | 'C-27': (re_compile('(?<=[AEIOUY])Z|(?<=[BCDFGHJKLMNPQRSTVWXZ])' + |
||
4984 | 'Z(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'S'), |
||
4985 | 'C-28': (re_compile(r'([BDFGHJKMNPQRTVWXZ])\1'), r'\1'), |
||
4986 | 'C-28a': (re_compile('CC(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'C'), |
||
4987 | 'C-28b': (re_compile('((?<=[BCDFGHJKLMNPQRSTVWXZ])|^)SS'), 'S'), |
||
4988 | 'C-28bb': (re_compile('SS(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'S'), |
||
4989 | 'C-28c': (re_compile('((?<=[^I])|^)LL'), 'L'), |
||
4990 | 'C-28d': (re_compile('ILE$'), 'ILLE'), |
||
4991 | 'C-29': (re_compile('(ILS|[CS]H|[MN]P|R[CFKLNSX])$|([BCDFGHJKL' + |
||
4992 | 'MNPQRSTVWXZ])[BCDFGHJKLMNPQRSTVWXZ]$'), |
||
4993 | lambda m: (m.group(1) or '') + (m.group(2) or '')), |
||
4994 | 'C-30,32': (re_compile('^(SA?INT?|SEI[NM]|CINQ?|ST)(?!E)-?'), 'ST-'), |
||
4995 | 'C-31,33': (re_compile('^(SAINTE|STE)-?'), 'STE-'), |
||
4996 | # Rules to undo rule bleeding prevention in C-11, C-16, C-17 |
||
4997 | 'C-34': ('G#', 'GA'), |
||
4998 | 'C-35': ('MA#', 'MAC') |
||
4999 | } |
||
5000 | rule_order = [ |
||
5001 | 'V-14', 'C-28', 'C-28a', 'C-28b', 'C-28bb', 'C-28c', 'C-28d', |
||
5002 | 'C-12', |
||
5003 | 'C-8', 'C-9', 'C-10', |
||
5004 | 'C-16', 'C-17', 'C-2', 'C-3', 'C-7', |
||
5005 | 'V-2,5', 'V-3,4', 'V-6', |
||
5006 | 'V-1', 'C-14', |
||
5007 | 'C-31,33', 'C-30,32', |
||
5008 | 'C-11', 'V-15', 'V-17', 'V-18', |
||
5009 | 'V-7', 'V-8', 'V-9', 'V-10', 'V-11', 'V-12', 'V-13', 'V-16', |
||
5010 | 'V-19', 'V-20', |
||
5011 | 'C-1', 'C-4', 'C-5', 'C-6', 'C-13', 'C-15', |
||
5012 | 'C-18', 'C-19', 'C-20', 'C-21', 'C-22', 'C-23', 'C-24', |
||
5013 | 'C-25', 'C-26', 'C-27', |
||
5014 | 'C-29', |
||
5015 | 'V-14', 'C-28', 'C-28a', 'C-28b', 'C-28bb', 'C-28c', 'C-28d', |
||
5016 | 'C-34', 'C-35' |
||
5017 | ] |
||
5018 | |||
5019 | # normalize, upper-case, and filter non-French letters |
||
5020 | word = normalize('NFKD', text_type(word.upper())) |
||
5021 | word = word.translate({198: 'AE', 338: 'OE'}) |
||
5022 | word = ''.join(c for c in word if c in |
||
5023 | {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
||
5024 | 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
||
5025 | 'Y', 'Z', '-'}) |
||
5026 | |||
5027 | for rule in rule_order: |
||
5028 | regex, repl = rule_table[rule] |
||
5029 | if isinstance(regex, text_type): |
||
5030 | word = word.replace(regex, repl) |
||
5031 | else: |
||
5032 | word = regex.sub(repl, word) |
||
5033 | |||
5034 | return word |
||
5035 | |||
5036 | |||
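The FONEM rule table above mixes plain strings with compiled patterns, and the application loop dispatches on the type of the left-hand side (str.replace for strings, re.sub for patterns, which also allows the callable replacement in C-29). A self-contained sketch of that dispatch with a hypothetical two-rule subset:

    from re import compile as re_compile

    rules = [('PH', 'F'), (re_compile('EUX$'), 'EU')]   # hypothetical subset, applied in order
    word = 'PHEUX'
    for pattern, repl in rules:
        if isinstance(pattern, str):
            word = word.replace(pattern, repl)
        else:
            word = pattern.sub(repl, word)
    # word == 'FEU'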
5037 | def parmar_kumbharana(word): |
||
5038 | """Return the Parmar-Kumbharana encoding of a word. |
||
5039 | |||
5040 | This is based on the phonetic algorithm proposed in :cite:`Parmar:2014`. |
||
5041 | |||
5042 | :param str word: the word to transform |
||
5043 | :return: the Parmar-Kumbharana encoding |
||
5044 | """ |
||
5045 | rule_table = {4: {'OUGH': 'F'}, |
||
5046 | 3: {'DGE': 'J', |
||
5047 | 'OUL': 'U', |
||
5048 | 'GHT': 'T'}, |
||
5049 | 2: {'CE': 'S', 'CI': 'S', 'CY': 'S', |
||
5050 | 'GE': 'J', 'GI': 'J', 'GY': 'J', |
||
5051 | 'WR': 'R', |
||
5052 | 'GN': 'N', 'KN': 'N', 'PN': 'N', |
||
5053 | 'CK': 'K', |
||
5054 | 'SH': 'S'}} |
||
5055 | vowel_trans = {65: '', 69: '', 73: '', 79: '', 85: '', 89: ''} |
||
5056 | |||
5057 | word = word.upper() # Rule 3 |
||
5058 | word = _delete_consecutive_repeats(word) # Rule 4 |
||
5059 | |||
5060 | # Rule 5 |
||
5061 | i = 0 |
||
5062 | while i < len(word): |
||
5063 | for match_len in range(4, 1, -1): |
||
5064 | if word[i:i+match_len] in rule_table[match_len]: |
||
5065 | repl = rule_table[match_len][word[i:i+match_len]] |
||
5066 | word = (word[:i] + repl + word[i+match_len:]) |
||
5067 | i += len(repl) |
||
5068 | break |
||
5069 | else: |
||
5070 | i += 1 |
||
5071 | |||
5072 | word = word[0]+word[1:].translate(vowel_trans) # Rule 6 |
||
5073 | return word |
||
5074 | |||
5075 | |||
5076 | def davidson(lname, fname='.', omit_fname=False): |
||
5077 | """Return Davidson's Consonant Code. |
||
5078 | |||
5079 | This is based on the name compression system described in |
||
5080 | :cite:`Davidson:1962`. |
||
5081 | |||
5082 | :cite:`Dolby:1970` identifies this as having been the name compression |
||
5083 | algorithm used by SABRE. |
||
5084 | |||
5085 | :param str lname: Last name (or word) to be encoded |
||
5086 | :param str fname: First name (optional), of which the first character is |
||
5087 | included in the code. |
||
5088 | :param bool omit_fname: Set to True to completely omit the first character |
||
5089 | of the first name |
||
5090 | :return: Davidson's Consonant Code |
||
5091 | """ |
||
5092 | trans = {65: '', 69: '', 73: '', 79: '', 85: '', 72: '', 87: '', 89: ''} |
||
5093 | |||
5094 | lname = text_type(lname.upper()) |
||
5095 | code = _delete_consecutive_repeats(lname[:1] + lname[1:].translate(trans)) |
||
5096 | code = code[:4] + (4-len(code))*' ' |
||
5097 | |||
5098 | if not omit_fname: |
||
5099 | code += fname[:1].upper() |
||
5100 | |||
5101 | return code |
||
5102 | |||
5103 | |||
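The deletion table in davidson() maps code points to the empty string so that str.translate() drops vowels plus H, W, and Y after the first letter. An equivalent, more readable construction (a sketch of the mechanism only, since it omits the repeat-squeezing step):

    trans = {ord(c): '' for c in 'AEIOUHWY'}
    name = 'DAVIDSON'
    compressed = name[:1] + name[1:].translate(trans)   # 'DVDSN'
    code = (compressed[:4] + 4 * ' ')[:4]               # pad or trim to four characters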
5104 | def sound_d(word, maxlength=4): |
||
5105 | """Return the SoundD code. |
||
5106 | |||
5107 | SoundD is defined in :cite:`Varol:2012`. |
||
5108 | |||
5109 | :param str word: the word to transform |
||
5110 | :param int maxlength: the length of the code returned (defaults to 4) |
||
5111 | :return: the SoundD code |
||
5112 | """ |
||
5113 | _ref_soundd_translation = dict(zip((ord(_) for _ in |
||
5114 | 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'), |
||
5115 | '01230120022455012623010202')) |
||
5116 | |||
5117 | word = normalize('NFKD', text_type(word.upper())) |
||
5118 | word = word.replace('ß', 'SS') |
||
5119 | word = ''.join(c for c in word if c in |
||
5120 | {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
||
5121 | 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
||
5122 | 'Y', 'Z'}) |
||
5123 | |||
5124 | if word[:2] in {'KN', 'GN', 'PN', 'AC', 'WR'}: |
||
5125 | word = word[1:] |
||
5126 | elif word[:1] == 'X': |
||
5127 | word = 'S'+word[1:] |
||
5128 | elif word[:2] == 'WH': |
||
5129 | word = 'W'+word[2:] |
||
5130 | |||
5131 | word = word.replace('DGE', '20').replace('DGI', '20').replace('GH', '0') |
||
5132 | |||
5133 | word = word.translate(_ref_soundd_translation) |
||
5134 | word = _delete_consecutive_repeats(word) |
||
5135 | word = word.replace('0', '') |
||
5136 | |||
5137 | if maxlength is not None: |
||
5138 | if len(word) < maxlength: |
||
5139 | word += '0' * (maxlength-len(word)) |
||
5140 | else: |
||
5141 | word = word[:maxlength] |
||
5142 | |||
5143 | return word |
||
5144 | |||
5145 | |||
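The letter-to-digit table used here (and, with different digit strings, in several functions below) is built by zipping each letter's code point with the digit at the same index of the coding string, giving a str.translate() table. A small check of that construction with the SoundD digits:

    letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    digits = '01230120022455012623010202'
    table = dict(zip((ord(c) for c in letters), digits))
    'PORTER'.translate(table)   # -> '106306' (raw digits, before repeats and 0s are removed)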
5146 | def pshp_soundex_last(lname, maxlength=4, german=False): |
||
5147 | """Calculate the PSHP Soundex/Viewex Coding of a last name. |
||
5148 | |||
5149 | This coding is based on :cite:`Hershberg:1976`. |
||
5150 | |||
5151 | Reference was also made to the German version of the same: |
||
5152 | :cite:`Hershberg:1979`. |
||
5153 | |||
5154 | A separate function, pshp_soundex_first(), is used for first names. |
||
5155 | |||
5156 | :param lname: the last name to encode |
||
5157 | :param german: set to True if the name is German (different rules apply) |
||
5158 | :return: the PSHP Soundex/Viewex Coding of the last name |
||
5159 | """ |
||
5160 | lname = normalize('NFKD', text_type(lname.upper())) |
||
5161 | lname = lname.replace('ß', 'SS') |
||
5162 | lname = ''.join(c for c in lname if c in |
||
5163 | {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', |
||
5164 | 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', |
||
5165 | 'W', 'X', 'Y', 'Z'}) |
||
5166 | |||
5167 | # A. Prefix treatment |
||
5168 | if lname[:3] == 'VON' or lname[:3] == 'VAN': |
||
5169 | lname = lname[3:].strip() |
||
5170 | |||
5171 | # The rule implemented below says "MC, MAC become 1". I believe it meant to |
||
5172 | # say they become M except in German data (where superscripted 1 indicates |
||
5173 | # "except in German data"). It doesn't make sense for them to become 1 |
||
5174 | # (BPFV -> 1) or to apply outside German. Unfortunately, both articles have |
||
5175 | # this error(?). |
||
5176 | if not german: |
||
5177 | if lname[:3] == 'MAC': |
||
5178 | lname = 'M'+lname[3:] |
||
5179 | elif lname[:2] == 'MC': |
||
5180 | lname = 'M'+lname[2:] |
||
5181 | |||
5182 | # The non-German-only rule to strip ' is unnecessary due to filtering |
||
5183 | |||
5184 | if lname[:1] in {'E', 'I', 'O', 'U'}: |
||
5185 | lname = 'A' + lname[1:] |
||
5186 | elif lname[:2] in {'GE', 'GI', 'GY'}: |
||
5187 | lname = 'J' + lname[1:] |
||
5188 | elif lname[:2] in {'CE', 'CI', 'CY'}: |
||
5189 | lname = 'S' + lname[1:] |
||
5190 | elif lname[:3] == 'CHR': |
||
5191 | lname = 'K' + lname[1:] |
||
5192 | elif lname[:1] == 'C' and lname[:2] != 'CH': |
||
5193 | lname = 'K' + lname[1:] |
||
5194 | |||
5195 | if lname[:2] == 'KN': |
||
5196 | lname = 'N' + lname[1:] |
||
5197 | elif lname[:2] == 'PH': |
||
5198 | lname = 'F' + lname[1:] |
||
5199 | elif lname[:3] in {'WIE', 'WEI'}: |
||
5200 | lname = 'V' + lname[1:] |
||
5201 | |||
5202 | if german and lname[:1] in {'W', 'M', 'Y', 'Z'}: |
||
5203 | lname = {'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}[lname[0]]+lname[1:] |
||
5204 | |||
5205 | code = lname[:1] |
||
5206 | |||
5207 | # B. Postfix treatment |
||
5208 | if lname[-1:] == 'R': |
||
5209 | lname = lname[:-1] + 'N' |
||
5210 | elif lname[-2:] in {'SE', 'CE'}: |
||
5211 | lname = lname[:-2] |
||
5212 | if lname[-2:] == 'SS': |
||
5213 | lname = lname[:-2] |
||
5214 | elif lname[-1:] == 'S': |
||
5215 | lname = lname[:-1] |
||
5216 | |||
5217 | if not german: |
||
5218 | l5_repl = {'STOWN': 'SAWON', 'MPSON': 'MASON'} |
||
5219 | l4_repl = {'NSEN': 'ASEN', 'MSON': 'ASON', 'STEN': 'SAEN', |
||
5220 | 'STON': 'SAON'} |
||
5221 | if lname[-5:] in l5_repl: |
||
5222 | lname = lname[:-5] + l5_repl[lname[-5:]] |
||
5223 | elif lname[-4:] in l4_repl: |
||
5224 | lname = lname[:-4] + l4_repl[lname[-4:]] |
||
5225 | |||
5226 | if lname[-2:] in {'NG', 'ND'}: |
||
5227 | lname = lname[:-1] |
||
5228 | if not german and lname[-3:] in {'GAN', 'GEN'}: |
||
5229 | lname = lname[:-3]+'A'+lname[-2:] |
||
5230 | |||
5231 | if german: |
||
5232 | if lname[-3:] == 'TES': |
||
5233 | lname = lname[:-3] |
||
5234 | elif lname[-2:] == 'TS': |
||
5235 | lname = lname[:-2] |
||
5236 | if lname[-3:] == 'TZE': |
||
5237 | lname = lname[:-3] |
||
5238 | elif lname[-2:] == 'ZE': |
||
5239 | lname = lname[:-2] |
||
5240 | if lname[-1:] == 'Z': |
||
5241 | lname = lname[:-1] |
||
5242 | elif lname[-2:] == 'TE': |
||
5243 | lname = lname[:-2] |
||
5244 | |||
5245 | # C. Infix Treatment |
||
5246 | lname = lname.replace('CK', 'C') |
||
5247 | lname = lname.replace('SCH', 'S') |
||
5248 | lname = lname.replace('DT', 'T') |
||
5249 | lname = lname.replace('ND', 'N') |
||
5250 | lname = lname.replace('NG', 'N') |
||
5251 | lname = lname.replace('LM', 'M') |
||
5252 | lname = lname.replace('MN', 'M') |
||
5253 | lname = lname.replace('WIE', 'VIE') |
||
5254 | lname = lname.replace('WEI', 'VEI') |
||
5255 | |||
5256 | # D. Soundexing |
||
5257 | # code for X & Y are unspecified, but presumably are 2 & 0 |
||
5258 | _pshp_translation = dict(zip((ord(_) for _ in |
||
5259 | 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'), |
||
5260 | '01230120022455012523010202')) |
||
5261 | |||
5262 | lname = lname.translate(_pshp_translation) |
||
5263 | lname = _delete_consecutive_repeats(lname) |
||
5264 | |||
5265 | code += lname[1:] |
||
5266 | code = code.replace('0', '') # rule 1 |
||
5267 | |||
5268 | if maxlength is not None: |
||
5269 | if len(code) < maxlength: |
||
5270 | code += '0' * (maxlength-len(code)) |
||
5271 | else: |
||
5272 | code = code[:maxlength] |
||
5273 | |||
5274 | return code |
||
5275 | |||
5276 | |||
5277 | def pshp_soundex_first(fname, maxlength=4, german=False): |
||
5278 | """Calculate the PSHP Soundex/Viewex Coding of a first name. |
||
5279 | |||
5280 | This coding is based on :cite:`Hershberg:1976`. |
||
5281 | |||
5282 | Reference was also made to the German version of the same: |
||
5283 | :cite:`Hershberg:1979`. |
||
5284 | |||
5285 | A separate function, pshp_soundex_last(), is used for last names. |
||
5286 | |||
5287 | :param fname: the first name to encode |
||
5288 | :param german: set to True if the name is German (different rules apply) |
||
5289 | :return: the PSHP Soundex/Viewex Coding of the first name |
||
5290 | """ |
||
5291 | fname = normalize('NFKD', text_type(fname.upper())) |
||
5292 | fname = fname.replace('ß', 'SS') |
||
5293 | fname = ''.join(c for c in fname if c in |
||
5294 | {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', |
||
5295 | 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', |
||
5296 | 'W', 'X', 'Y', 'Z'}) |
||
5297 | |||
5298 | # special rules |
||
5299 | if fname == 'JAMES': |
||
5300 | code = 'J7' |
||
5301 | elif fname == 'PAT': |
||
5302 | code = 'P7' |
||
5303 | |||
5304 | else: |
||
5305 | # A. Prefix treatment |
||
5306 | if fname[:2] in {'GE', 'GI', 'GY'}: |
||
5307 | fname = 'J' + fname[1:] |
||
5308 | elif fname[:2] in {'CE', 'CI', 'CY'}: |
||
5309 | fname = 'S' + fname[1:] |
||
5310 | elif fname[:3] == 'CHR': |
||
5311 | fname = 'K' + fname[1:] |
||
5312 | elif fname[:1] == 'C' and fname[:2] != 'CH': |
||
5313 | fname = 'K' + fname[1:] |
||
5314 | |||
5315 | if fname[:2] == 'KN': |
||
5316 | fname = 'N' + fname[1:] |
||
5317 | elif fname[:2] == 'PH': |
||
5318 | fname = 'F' + fname[1:] |
||
5319 | elif fname[:3] in {'WIE', 'WEI'}: |
||
5320 | fname = 'V' + fname[1:] |
||
5321 | |||
5322 | if german and fname[:1] in {'W', 'M', 'Y', 'Z'}: |
||
5323 | fname = ({'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}[fname[0]] + |
||
5324 | fname[1:]) |
||
5325 | |||
5326 | code = fname[:1] |
||
5327 | |||
5328 | # B. Soundex coding |
||
5329 | # code for Y unspecified, but presumably is 0 |
||
5330 | _pshp_translation = dict(zip((ord(_) for _ in |
||
5331 | 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'), |
||
5332 | '01230120022455012523010202')) |
||
5333 | |||
5334 | fname = fname.translate(_pshp_translation) |
||
5335 | fname = _delete_consecutive_repeats(fname) |
||
5336 | |||
5337 | code += fname[1:] |
||
5338 | syl_ptr = code.find('0') |
||
5339 | syl2_ptr = code[syl_ptr + 1:].find('0') |
||
5340 | if syl_ptr != -1 and syl2_ptr != -1 and syl2_ptr - syl_ptr > -1: |
||
5341 | code = code[:syl_ptr + 2] |
||
5342 | |||
5343 | code = code.replace('0', '') # rule 1 |
||
5344 | |||
5345 | if maxlength is not None: |
||
5346 | if len(code) < maxlength: |
||
5347 | code += '0' * (maxlength-len(code)) |
||
5348 | else: |
||
5349 | code = code[:maxlength] |
||
5350 | |||
5351 | return code |
||
5352 | |||
5353 | |||
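A minimal usage sketch for the pair of PSHP functions, which the docstrings above describe as applying separately to last and first names (codes not shown):

    surname_code = pshp_soundex_last('Hershberg')
    forename_code = pshp_soundex_first('Adam')
    german_code = pshp_soundex_first('Jürgen', german=True)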
5354 | def henry_early(word, maxlength=3): |
||
5355 | """Calculate the early version of the Henry code for a word. |
||
5356 | |||
5357 | The early version of Henry coding is given in :cite:`Legare:1972`. This is |
||
5358 | different from the later version defined in :cite:`Henry:1976`. |
||
5359 | |||
5360 | :param str word: the word to transform |
||
5361 | :param int maxlength: the length of the code returned (defaults to 3) |
||
5362 | :return: the early Henry Code |
||
5363 | """ |
||
5364 | _cons = {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', |
||
5365 | 'R', 'S', 'T', 'V', 'W', 'X', 'Z'} |
||
5366 | _vows = {'A', 'E', 'I', 'O', 'U', 'Y'} |
||
5367 | _diph = {'AI': 'E', 'AY': 'E', 'EI': 'E', 'AU': 'O', 'OI': 'O', 'OU': 'O', |
||
5368 | 'EU': 'U'} |
||
5369 | _unaltered = {'B', 'D', 'F', 'J', 'K', 'L', 'M', 'N', 'R', 'T', 'V'} |
||
5370 | _simple = {'W': 'V', 'X': 'S', 'V': 'S'} |
||
5371 | |||
5372 | word = normalize('NFKD', text_type(word.upper())) |
||
5373 | word = ''.join(c for c in word if c in |
||
5374 | {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
||
5375 | 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
||
5376 | 'Y', 'Z'}) |
||
5377 | |||
5378 | if not word: |
||
5379 | return '' |
||
5380 | |||
5381 | # Rule Ia seems to be covered entirely in II |
||
5382 | |||
5383 | # Rule Ib |
||
5384 | if word[0] in _vows: |
||
5385 | # Ib1 |
||
5386 | if (((word[1:2] in _cons-{'M', 'N'} and word[2:3] in _cons) or |
||
5387 | (word[1:2] in _cons and word[2:3] not in _cons))): |
||
5388 | if word[0] == 'Y': |
||
5389 | word = 'I'+word[1:] |
||
5390 | # Ib2 |
||
5391 | elif word[1:2] in {'M', 'N'} and word[2:3] in _cons: |
||
5392 | if word[0] == 'E': |
||
5393 | word = 'A'+word[1:] |
||
5394 | elif word[0] in {'I', 'U', 'Y'}: |
||
5395 | word = 'E'+word[1:] |
||
5396 | # Ib3 |
||
5397 | elif word[:2] in _diph: |
||
5398 | word = _diph[word[:2]]+word[2:] |
||
5399 | # Ib4 |
||
5400 | elif word[1:2] in _vows and word[0] == 'Y': |
||
5401 | word = 'I' + word[1:] |
||
5402 | |||
5403 | code = '' |
||
5404 | skip = 0 |
||
5405 | |||
5406 | # Rule II |
||
5407 | for pos, char in enumerate(word): |
||
5408 | nxch = word[pos+1:pos+2] |
||
5409 | prev = word[pos-1:pos] |
||
5410 | |||
5411 | if skip: |
||
5412 | skip -= 1 |
||
5413 | elif char in _vows: |
||
5414 | code += char |
||
5415 | # IIc |
||
5416 | elif char == nxch: |
||
5417 | skip = 1 |
||
5418 | code += char |
||
5419 | elif word[pos:pos+2] in {'CQ', 'DT', 'SC'}: |
||
5420 | skip = 1 |
||
5421 | code += word[pos+1] |
||
5422 | # IId |
||
5423 | elif char == 'H' and prev in _cons: |
||
5424 | continue |
||
5425 | elif char == 'S' and nxch in _cons: |
||
5426 | continue |
||
5427 | elif char in _cons-{'L', 'R'} and nxch in _cons-{'L', 'R'}: |
||
5428 | continue |
||
5429 | elif char == 'L' and nxch in {'M', 'N'}: |
||
5430 | continue |
||
5431 | elif char in {'M', 'N'} and prev in _vows and nxch in _cons: |
||
5432 | continue |
||
5433 | # IIa |
||
5434 | elif char in _unaltered: |
||
5435 | code += char |
||
5436 | # IIb |
||
5437 | elif char in _simple: |
||
5438 | code += _simple[char] |
||
5439 | elif char in {'C', 'G', 'P', 'Q', 'S'}: |
||
5440 | if char == 'C': |
||
5441 | if nxch in {'A', 'O', 'U', 'L', 'R'}: |
||
5442 | code += 'K' |
||
5443 | elif nxch in {'E', 'I', 'Y'}: |
||
5444 | code += 'J' |
||
5445 | elif nxch == 'H': |
||
5446 | if word[pos+2:pos+3] in _vows: |
||
5447 | code += 'C' |
||
5448 | elif word[pos+2:pos+3] in {'R', 'L'}: |
||
5449 | code += 'K' |
||
5450 | elif char == 'G': |
||
5451 | if nxch in {'A', 'O', 'U', 'L', 'R'}: |
||
5452 | code += 'G' |
||
5453 | elif nxch in {'E', 'I', 'Y'}: |
||
5454 | code += 'J' |
||
5455 | elif nxch == 'N': |
||
5456 | code += 'N' |
||
5457 | elif char == 'P': |
||
5458 | if nxch != 'H': |
||
5459 | code += 'P' |
||
5460 | else: |
||
5461 | code += 'F' |
||
5462 | elif char == 'Q': |
||
5463 | if word[pos+1:pos+3] in {'UE', 'UI', 'UY'}: |
||
5464 | code += 'G' |
||
5465 | elif word[pos + 1:pos + 3] in {'UA', 'UO'}: |
||
5466 | code += 'K' |
||
5467 | elif char == 'S': |
||
5468 | if word[pos:pos+6] == 'SAINTE': |
||
5469 | code += 'X' |
||
5470 | skip = 5 |
||
5471 | elif word[pos:pos+5] == 'SAINT': |
||
5472 | code += 'X' |
||
5473 | skip = 4 |
||
5474 | elif word[pos:pos+3] == 'STE': |
||
5475 | code += 'X' |
||
5476 | skip = 2 |
||
5477 | elif word[pos:pos+2] == 'ST': |
||
5478 | code += 'X' |
||
5479 | skip = 1 |
||
5480 | else: |
||
5481 | code += 'S' |
||
5482 | else: # this should not be possible |
||
5483 | continue |
||
5484 | |||
5485 | # IIe1 |
||
5486 | if code[-4:] in {'AULT', 'EULT', 'OULT'}: |
||
5487 | code = code[:-2] |
||
5488 | elif code[-4:-3] in _vows and code[-3:] == 'MPS': |
||
5489 | code = code[:-3] |
||
5490 | elif code[-3:-2] in _vows and code[-2:] in {'MB', 'MP', 'ND', 'NS', 'NT'}: |
||
5491 | code = code[:-2] |
||
5492 | elif code[-2:-1] == 'R' and code[-1:] in _cons: |
||
5493 | code = code[:-1] |
||
5494 | # IIe2 |
||
5495 | elif code[-2:-1] in _vows and code[-1:] in {'D', 'M', 'N', 'S', 'T'}: |
||
5496 | code = code[:-1] |
||
5497 | elif code[-2:] == 'ER': |
||
5498 | code = code[:-1] |
||
5499 | |||
5500 | # Drop non-initial vowels |
||
5501 | code = code[:1]+code[1:].translate({65: '', 69: '', 73: '', 79: '', 85: '', |
||
5502 | 89: ''}) |
||
5503 | |||
5504 | if maxlength is not None: |
||
5505 | code = code[:maxlength] |
||
5506 | |||
5507 | return code |
||
5508 | |||
5509 | |||
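A minimal usage sketch for henry_early: the default truncates the code to three characters, and maxlength=None returns the untruncated code (outputs not shown).

    short_code = henry_early('Beausoleil')                  # at most 3 characters
    full_code = henry_early('Beausoleil', maxlength=None)   # no truncation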
5510 | def norphone(word): |
||
5511 | """Return the Norphone code. |
||
5512 | |||
5513 | The reference implementation by Lars Marius Garshol is available in |
||
5514 | :cite:`Garshol:2015`. |
||
5515 | |||
5516 | Norphone was designed for Norwegian, but this implementation has been |
||
5517 | extended to support Swedish vowels as well. This function incorporates |
||
5518 | the "not implemented" rules from the above file's rule set. |
||
5519 | |||
5520 | :param str word: the word to transform |
||
5521 | :return: the Norphone code |
||
5522 | """ |
||
5523 | _vowels = {'A', 'E', 'I', 'O', 'U', 'Y', 'Å', 'Æ', 'Ø', 'Ä', 'Ö'} |
||
5524 | |||
5525 | replacements = {4: {'SKEI': 'X'}, |
||
5526 | 3: {'SKJ': 'X', 'KEI': 'X'}, |
||
5527 | 2: {'CH': 'K', 'CK': 'K', 'GJ': 'J', 'GH': 'K', 'HG': 'K', |
||
5528 | 'HJ': 'J', 'HL': 'L', 'HR': 'R', 'KJ': 'X', 'KI': 'X', |
||
5529 | 'LD': 'L', 'ND': 'N', 'PH': 'F', 'TH': 'T', 'SJ': 'X'}, |
||
5530 | 1: {'W': 'V', 'X': 'KS', 'Z': 'S', 'D': 'T', 'G': 'K'}} |
||
5531 | |||
5532 | word = word.upper() |
||
5533 | |||
5534 | code = '' |
||
5535 | skip = 0 |
||
5536 | |||
5537 | if word[0:2] == 'AA': |
||
5538 | code = 'Å' |
||
5539 | skip = 2 |
||
5540 | elif word[0:2] == 'GI': |
||
5541 | code = 'J' |
||
5542 | skip = 2 |
||
5543 | elif word[0:3] == 'SKY': |
||
5544 | code = 'X' |
||
5545 | skip = 3 |
||
5546 | elif word[0:2] == 'EI': |
||
5547 | code = 'Æ' |
||
5548 | skip = 2 |
||
5549 | elif word[0:2] == 'KY': |
||
5550 | code = 'X' |
||
5551 | skip = 2 |
||
5552 | elif word[:1] == 'C': |
||
5553 | code = 'K' |
||
5554 | skip = 1 |
||
5555 | elif word[:1] == 'Ä': |
||
5556 | code = 'Æ' |
||
5557 | skip = 1 |
||
5558 | elif word[:1] == 'Ö': |
||
5559 | code = 'Ø' |
||
5560 | skip = 1 |
||
5561 | |||
5562 | if word[-2:] == 'DT': |
||
5563 | word = word[:-2]+'T' |
||
5564 | # Though the rules indicate this rule applies in all positions, the |
||
5565 | # reference implementation indicates it applies only in final position. |
||
5566 | elif word[-2:-1] in _vowels and word[-1:] == 'D': |
||
5567 | word = word[:-2] |
||
5568 | |||
5569 | for pos, char in enumerate(word): |
||
5570 | if skip: |
||
5571 | skip -= 1 |
||
5572 | else: |
||
5573 | for length in sorted(replacements, reverse=True): |
||
5574 | if word[pos:pos+length] in replacements[length]: |
||
5575 | code += replacements[length][word[pos:pos+length]] |
||
5576 | skip = length-1 |
||
5577 | break |
||
5578 | else: |
||
5579 | if not pos or char not in _vowels: |
||
5580 | code += char |
||
5581 | |||
5582 | code = _delete_consecutive_repeats(code) |
||
5583 | |||
5584 | return code |
||
5585 | |||
5586 | |||
5587 | def dolby(word, maxlength=None, keep_vowels=False, vowel_char='*'): |
||
5588 | r"""Return the Dolby Code of a name. |
||
5589 | |||
5590 | This follows "A Spelling Equivalent Abbreviation Algorithm For Personal |
||
5591 | Names" from :cite:`Dolby:1970` and :cite:`Cunningham:1969`. |
||
5592 | |||
5593 | :param word: the word to encode |
||
5594 | :param maxlength: maximum length of the returned Dolby code -- this also |
||
5595 | activates the fixed-length code mode |
||
5596 | :param keep_vowels: if True, retains all vowel markers |
||
5597 | :param vowel_char: the vowel marker character (defaults to \*) |
||
5598 | :return: the Dolby Code |
||
5599 | """ |
||
5600 | _vowels = {'A', 'E', 'I', 'O', 'U', 'Y'} |
||
5601 | |||
5602 | # uppercase, normalize, decompose, and filter non-A-Z out |
||
5603 | word = normalize('NFKD', text_type(word.upper())) |
||
5604 | word = word.replace('ß', 'SS') |
||
5605 | word = ''.join(c for c in word if c in |
||
5606 | {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
||
5607 | 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
||
5608 | 'Y', 'Z'}) |
||
5609 | |||
5610 | # Rule 1 (FL2) |
||
5611 | if word[:3] in {'MCG', 'MAG', 'MAC'}: |
||
5612 | word = 'MK'+word[3:] |
||
5613 | elif word[:2] == 'MC': |
||
5614 | word = 'MK'+word[2:] |
||
5615 | |||
5616 | # Rule 2 (FL3) |
||
5617 | pos = len(word)-2 |
||
5618 | while pos > -1: |
||
5619 | if word[pos:pos+2] in {'DT', 'LD', 'ND', 'NT', 'RC', 'RD', 'RT', 'SC', |
||
5620 | 'SK', 'ST'}: |
||
5621 | word = word[:pos+1]+word[pos+2:] |
||
5622 | pos += 1 |
||
5623 | pos -= 1 |
||
5624 | |||
5625 | # Rule 3 (FL4) |
||
5626 | # Although the rule indicates "after the first letter", the test cases make |
||
5627 | # it clear that these apply to the first letter also. |
||
5628 | word = word.replace('X', 'KS') |
||
5629 | word = word.replace('CE', 'SE') |
||
5630 | word = word.replace('CI', 'SI') |
||
5631 | word = word.replace('CY', 'SI') |
||
5632 | |||
5633 | # not in the rule set, but they seem to have intended it |
||
5634 | word = word.replace('TCH', 'CH') |
||
5635 | |||
5636 | pos = word.find('CH', 1) |
||
5637 | while pos != -1: |
||
5638 | if word[pos-1:pos] not in _vowels: |
||
5639 | word = word[:pos]+'S'+word[pos+1:] |
||
5640 | pos = word.find('CH', pos+1) |
||
5641 | |||
5642 | word = word.replace('C', 'K') |
||
5643 | word = word.replace('Z', 'S') |
||
5644 | |||
5645 | word = word.replace('WR', 'R') |
||
5646 | word = word.replace('DG', 'G') |
||
5647 | word = word.replace('QU', 'K') |
||
5648 | word = word.replace('T', 'D') |
||
5649 | word = word.replace('PH', 'F') |
||
5650 | |||
5651 | # Rule 4 (FL5) |
||
5652 | # Although the rule indicates "after the first letter", the test cases make |
||
5653 | # it clear that these apply to the first letter also. |
||
5654 | pos = word.find('K', 0) |
||
5655 | while pos != -1: |
||
5656 | if pos > 1 and word[pos-1:pos] not in _vowels | {'L', 'N', 'R'}: |
||
5657 | word = word[:pos-1]+word[pos:] |
||
5658 | pos -= 1 |
||
5659 | pos = word.find('K', pos+1) |
||
5660 | |||
5661 | # Rule FL6 |
||
5662 | if maxlength and word[-1:] == 'E': |
||
5663 | word = word[:-1] |
||
5664 | |||
5665 | # Rule 5 (FL7) |
||
5666 | word = _delete_consecutive_repeats(word) |
||
5667 | |||
5668 | # Rule 6 (FL8) |
||
5669 | if word[:2] == 'PF': |
||
5670 | word = word[1:] |
||
5671 | if word[-2:] == 'PF': |
||
5672 | word = word[:-1] |
||
5673 | elif word[-2:] == 'GH': |
||
5674 | if word[-3:-2] in _vowels: |
||
5675 | word = word[:-2]+'F' |
||
5676 | else: |
||
5677 | word = word[:-2]+'G' |
||
5678 | word = word.replace('GH', '') |
||
5679 | |||
5680 | # Rule FL9 |
||
5681 | if maxlength: |
||
5682 | word = word.replace('V', 'F') |
||
5683 | |||
5684 | # Rules 7-9 (FL10-FL12) |
||
5685 | first = 1 + (1 if maxlength else 0) |
||
5686 | code = '' |
||
5687 | for pos, char in enumerate(word): |
||
5688 | if char in _vowels: |
||
5689 | if first or keep_vowels: |
||
5690 | code += vowel_char |
||
5691 | first -= 1 |
||
5692 | else: |
||
5693 | continue |
||
5694 | elif pos > 0 and char in {'W', 'H'}: |
||
5695 | continue |
||
5696 | else: |
||
5697 | code += char |
||
5698 | |||
5699 | if maxlength: |
||
5700 | # Rule FL13 |
||
5701 | if len(code) > maxlength and code[-1:] == 'S': |
||
5702 | code = code[:-1] |
||
5703 | if keep_vowels: |
||
5704 | code = code[:maxlength] |
||
5705 | else: |
||
5706 | # Rule FL14 |
||
5707 | code = code[:maxlength + 2] |
||
5708 | # Rule FL15 |
||
5709 | while len(code) > maxlength: |
||
5710 | vowels = len(code) - maxlength |
||
5711 | excess = vowels - 1 |
||
5712 | word = code |
||
5713 | code = '' |
||
5714 | for char in word: |
||
5715 | if char == vowel_char: |
||
5716 | if vowels: |
||
5717 | code += char |
||
5718 | vowels -= 1 |
||
5719 | else: |
||
5720 | code += char |
||
5721 | code = code[:maxlength + excess] |
||
5722 | |||
5723 | # Rule FL16 |
||
5724 | code += ' ' * (maxlength - len(code)) |
||
5725 | |||
5726 | return code |
||
5727 | |||
5728 | |||
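A minimal usage sketch for dolby(): without maxlength the code is variable-length; supplying maxlength switches on the fixed-length rules (FL6, FL9, and FL13-FL16 above), and keep_vowels retains every vowel marker rather than only the first.

    variable_code = dolby('Cunningham')
    fixed_code = dolby('Cunningham', maxlength=6)
    voweled_code = dolby('Cunningham', keep_vowels=True)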
5729 | def phonetic_spanish(word, maxlength=None): |
||
5730 | """Return the PhoneticSpanish coding of word. |
||
5731 | |||
5732 | This follows the coding described in :cite:`Amon:2012` and |
||
5733 | :cite:`delPilarAngeles:2015`. |
||
5734 | |||
5735 | :param str word: the word to transform |
||
5736 | :return: the PhoneticSpanish code |
||
5737 | """ |
||
5738 | _es_soundex_translation = dict(zip((ord(_) for _ in |
||
5739 | 'BCDFGHJKLMNPQRSTVXYZ'), |
||
5740 | '14328287566079431454')) |
||
5741 | |||
5742 | # uppercase, normalize, and decompose, filter to A-Z minus vowels & W |
||
5743 | word = normalize('NFKD', text_type(word.upper())) |
||
5744 | word = ''.join(c for c in word if c in |
||
5745 | {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', |
||
5746 | 'P', 'Q', 'R', 'S', 'T', 'V', 'X', 'Y', 'Z'}) |
||
5747 | |||
5748 | # merge repeated Ls & Rs |
||
5749 | word = word.replace('LL', 'L') |
||
5750 | word = word.replace('RR', 'R') |
||
5751 | |||
5752 | # apply the Soundex algorithm |
||
5753 | sdx = word.translate(_es_soundex_translation) |
||
5754 | |||
5755 | if maxlength: |
||
5756 | sdx = sdx[:maxlength] |
||
5757 | |||
5758 | return sdx |
||
5759 | |||
5760 | |||
5761 | def spanish_metaphone(word, maxlength=6, modified=False): |
||
5762 | """Return the Spanish Metaphone of a word. |
||
5763 | |||
5764 | This is a quick rewrite of the Spanish Metaphone Algorithm, as presented at |
||
5765 | https://github.com/amsqr/Spanish-Metaphone and discussed in |
||
5766 | :cite:`Mosquera:2012`. |
||
5767 | |||
5768 | Modified version based on :cite:`delPilarAngeles:2016`. |
||
5769 | |||
5770 | :param str word: the word to transform |
||
5771 | :param int maxlength: the length of the code returned (defaults to 6) |
||
5772 | :param modified: Set to True to use del Pilar Angeles & Bailón-Miguel's |
||
5773 | modified version of the algorithm |
||
5774 | :return: the Spanish Metaphone code |
||
5775 | """ |
||
5776 | def _is_vowel(pos): |
||
5777 | """Return True if the character at word[pos] is a vowel.""" |
||
5778 | if pos < len(word) and word[pos] in {'A', 'E', 'I', 'O', 'U'}: |
||
5779 | return True |
||
5780 | return False |
||
5781 | |||
5782 | word = normalize('NFC', text_type(word.upper())) |
||
5783 | |||
5784 | meta_key = '' |
||
5785 | pos = 0 |
||
5786 | |||
5787 | # do some replacements for the modified version |
||
5788 | if modified: |
||
5789 | word = word.replace('MB', 'NB') |
||
5790 | word = word.replace('MP', 'NP') |
||
5791 | word = word.replace('BS', 'S') |
||
5792 | if word[:2] == 'PS': |
||
5793 | word = word[1:] |
||
5794 | |||
5795 | # simple replacements |
||
5796 | word = word.replace('Á', 'A') |
||
5797 | word = word.replace('CH', 'X') |
||
5798 | word = word.replace('Ç', 'S') |
||
5799 | word = word.replace('É', 'E') |
||
5800 | word = word.replace('Í', 'I') |
||
5801 | word = word.replace('Ó', 'O') |
||
5802 | word = word.replace('Ú', 'U') |
||
5803 | word = word.replace('Ñ', 'NY') |
||
5804 | word = word.replace('GÜ', 'W') |
||
5805 | word = word.replace('Ü', 'U') |
||
5806 | word = word.replace('B', 'V') |
||
5807 | word = word.replace('LL', 'Y') |
||
5808 | |||
5809 | while len(meta_key) < maxlength: |
||
5810 | if pos >= len(word): |
||
5811 | break |
||
5812 | |||
5813 | # get the next character |
||
5814 | current_char = word[pos] |
||
5815 | |||
5816 | # if a vowel in pos 0, add to key |
||
5817 | if _is_vowel(pos) and pos == 0: |
||
5818 | meta_key += current_char |
||
5819 | pos += 1 |
||
5820 | # otherwise, do consonant rules |
||
5821 | else: |
||
5822 | # simple consonants (unmutated) |
||
5823 | if current_char in {'D', 'F', 'J', 'K', 'M', 'N', 'P', 'T', 'V', |
||
5824 | 'L', 'Y'}: |
||
5825 | meta_key += current_char |
||
5826 | # skip doubled consonants |
||
5827 | if word[pos+1:pos+2] == current_char: |
||
5828 | pos += 2 |
||
5829 | else: |
||
5830 | pos += 1 |
||
5831 | else: |
||
5832 | if current_char == 'C': |
||
5833 | # special case 'acción', 'reacción', etc. |
||
5834 | if word[pos+1:pos+2] == 'C': |
||
5835 | meta_key += 'X' |
||
5836 | pos += 2 |
||
5837 | # special case 'cesar', 'cien', 'cid', 'conciencia' |
||
5838 | elif word[pos+1:pos+2] in {'E', 'I'}: |
||
5839 | meta_key += 'Z' |
||
5840 | pos += 2 |
||
5841 | # base case |
||
5842 | else: |
||
5843 | meta_key += 'K' |
||
5844 | pos += 1 |
||
5845 | elif current_char == 'G': |
||
5846 | # special case 'gente', 'ecologia', etc. |
||
5847 | if word[pos + 1:pos + 2] in {'E', 'I'}: |
||
5848 | meta_key += 'J' |
||
5849 | pos += 2 |
||
5850 | # base case |
||
5851 | else: |
||
5852 | meta_key += 'G' |
||
5853 | pos += 1 |
||
5854 | elif current_char == 'H': |
||
5855 | # since the letter 'H' is silent in Spanish, |
||
5856 | # set the meta key to the vowel after the letter 'H' |
||
5857 | if _is_vowel(pos+1): |
||
5858 | meta_key += word[pos+1] |
||
5859 | pos += 2 |
||
5860 | else: |
||
5861 | meta_key += 'H' |
||
5862 | pos += 1 |
||
5863 | elif current_char == 'Q': |
||
5864 | if word[pos+1:pos+2] == 'U': |
||
5865 | pos += 2 |
||
5866 | else: |
||
5867 | pos += 1 |
||
5868 | meta_key += 'K' |
||
5869 | elif current_char == 'W': |
||
5870 | meta_key += 'U' |
||
5871 | pos += 1 |
||
5872 | elif current_char == 'R': |
||
5873 | meta_key += 'R' |
||
5874 | pos += 1 |
||
5875 | elif current_char == 'S': |
||
5876 | if not _is_vowel(pos+1) and pos == 0: |
||
5877 | meta_key += 'ES' |
||
5878 | pos += 1 |
||
5879 | else: |
||
5880 | meta_key += 'S' |
||
5881 | pos += 1 |
||
5882 | elif current_char == 'Z': |
||
5883 | meta_key += 'Z' |
||
5884 | pos += 1 |
||
5885 | elif current_char == 'X': |
||
5886 | if len(word) > 1 and pos == 0 and not _is_vowel(pos+1): |
||
5887 | meta_key += 'EX' |
||
5888 | pos += 1 |
||
5889 | else: |
||
5890 | meta_key += 'X' |
||
5891 | pos += 1 |
||
5892 | else: |
||
5893 | pos += 1 |
||
5894 | |||
5895 | # Final change from S to Z in modified version |
||
5896 | if modified: |
||
5897 | meta_key = meta_key.replace('S', 'Z') |
||
5898 | |||
5899 | return meta_key |
||
5900 | |||
5901 | |||
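A minimal usage sketch for spanish_metaphone(): the modified flag applies the extra pre-substitutions (MB, MP, BS, initial PS) and replaces S with Z as a final step (outputs not shown).

    standard_code = spanish_metaphone('habitación')
    modified_code = spanish_metaphone('habitación', modified=True)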
5902 | def metasoundex(word, language='en'): |
||
5903 | """Return the MetaSoundex code for a word. |
||
5904 | |||
5905 | This is based on :cite:`Koneru:2017`. |
||
5906 | |||
5907 | :param str word: the word to transform |
||
5908 | :param language: either 'en' for English or 'es' for Spanish |
||
5909 | :return: the MetaSoundex code |
||
5910 | """ |
||
5911 | _metasoundex_translation = dict(zip((ord(_) for _ in |
||
5912 | 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'), |
||
5913 | '07430755015866075943077514')) |
||
5914 | |||
5915 | if language == 'es': |
||
5916 | return phonetic_spanish(spanish_metaphone(word)) |
||
5917 | |||
5918 | word = soundex(metaphone(word)) |
||
5919 | word = word[0].translate(_metasoundex_translation)+word[1:] |
||
5920 | |||
5921 | return word |
||
5922 | |||
5923 | |||
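A minimal usage sketch of the language switch in metasoundex(): 'en' (the default) runs Metaphone, then Soundex, then remaps the first character, while 'es' chains spanish_metaphone into phonetic_spanish.

    english_code = metasoundex('Smith')
    spanish_code = metasoundex('Herrera', language='es')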
5924 | def soundex_br(word, maxlength=4, zero_pad=True): |
||
5925 | """Return the SoundexBR encoding of a word. |
||
5926 | |||
5927 | This is based on :cite:`Marcelino:2015`. |
||
5928 | |||
5929 | :param str word: the word to transform |
||
5930 | :return: the SoundexBR code |
||
5931 | """ |
||
5932 | _soundex_br_translation = dict(zip((ord(_) for _ in |
||
5933 | 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'), |
||
5934 | '01230120022455012623010202')) |
||
5935 | |||
5936 | word = normalize('NFKD', text_type(word.upper())) |
||
5937 | word = ''.join(c for c in word if c in |
||
5938 | {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
||
5939 | 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
||
5940 | 'Y', 'Z'}) |
||
5941 | |||
5942 | if word[:2] == 'WA': |
||
5943 | first = 'V' |
||
5944 | elif word[:1] == 'K' and word[1:2] in {'A', 'O', 'U'}: |
||
5945 | first = 'C' |
||
5946 | elif word[:1] == 'C' and word[1:2] in {'I', 'E'}: |
||
5947 | first = 'S' |
||
5948 | elif word[:1] == 'G' and word[1:2] in {'E', 'I'}: |
||
5949 | first = 'J' |
||
5950 | elif word[:1] == 'Y': |
||
5951 | first = 'I' |
||
5952 | elif word[:1] == 'H': |
||
5953 | first = word[1:2] |
||
5954 | word = word[1:] |
||
5955 | else: |
||
5956 | first = word[:1] |
||
5957 | |||
5958 | sdx = first + word[1:].translate(_soundex_br_translation) |
||
5959 | sdx = _delete_consecutive_repeats(sdx) |
||
5960 | sdx = sdx.replace('0', '') |
||
5961 | |||
5962 | if zero_pad: |
||
5963 | sdx += ('0'*maxlength) |
||
5964 | |||
5965 | return sdx[:maxlength] |
||
5966 | |||
5967 | |||
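A minimal usage sketch for soundex_br(): with zero_pad (the default) short codes are padded with '0' up to maxlength; with zero_pad=False only the digits actually produced are returned.

    padded_code = soundex_br('Sa')                    # always maxlength characters
    unpadded_code = soundex_br('Sa', zero_pad=False)  # may be shorter than maxlength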
5968 | def nrl(word): |
||
5969 | """Return the Naval Research Laboratory phonetic encoding of a word. |
||
5970 | |||
5971 | This is defined by :cite:`Elovitz:1976`. |
||
5972 | |||
5973 | :param str word: the word to transform |
||
5974 | :return: the NRL phonetic encoding |
||
5975 | """ |
||
5976 | def to_regex(pattern, left=True): |
||
5977 | new_pattern = '' |
||
5978 | replacements = {'#': '[AEIOU]+', |
||
5979 | ':': '[BCDFGHJKLMNPQRSTVWXYZ]*', |
||
5980 | '^': '[BCDFGHJKLMNPQRSTVWXYZ]', |
||
5981 | '.': '[BDVGJLMNTWZ]', |
||
5982 | '%': '(ER|E|ES|ED|ING|ELY)', |
||
5983 | '+': '[EIY]', |
||
5984 | ' ': '^'} |
||
5985 | for char in pattern: |
||
5986 | new_pattern += (replacements[char] if char in replacements |
||
5987 | else char) |
||
5988 | |||
5989 | if left: |
||
5990 | new_pattern += '$' |
||
5991 | if '^' not in pattern: |
||
5992 | new_pattern = '^.*' + new_pattern |
||
5993 | else: |
||
5994 | new_pattern = '^' + new_pattern.replace('^', '$') |
||
5995 | if '$' not in new_pattern: |
||
5996 | new_pattern += '.*$' |
||
5997 | |||
5998 | return new_pattern |
||
5999 | |||
6000 | rules = {' ': (('', ' ', '', ' '), |
||
6001 | ('', '-', '', ''), |
||
6002 | ('.', '\'S', '', 'z'), |
||
6003 | ('#:.E', '\'S', '', 'z'), |
||
6004 | ('#', '\'S', '', 'z'), |
||
6005 | ('', '\'', '', ''), |
||
6006 | ('', ',', '', ' '), |
||
6007 | ('', '.', '', ' '), |
||
6008 | ('', '?', '', ' '), |
||
6009 | ('', '!', '', ' ')), |
||
6010 | 'A': (('', 'A', ' ', 'AX'), |
||
6011 | (' ', 'ARE', ' ', 'AAr'), |
||
6012 | (' ', 'AR', 'O', 'AXr'), |
||
6013 | ('', 'AR', '#', 'EHr'), |
||
6014 | ('^', 'AS', '#', 'EYs'), |
||
6015 | ('', 'A', 'WA', 'AX'), |
||
6016 | ('', 'AW', '', 'AO'), |
||
6017 | (' :', 'ANY', '', 'EHnIY'), |
||
6018 | ('', 'A', '^+#', 'EY'), |
||
6019 | ('#:', 'ALLY', '', 'AXlIY'), |
||
6020 | (' ', 'AL', '#', 'AXl'), |
||
6021 | ('', 'AGAIN', '', 'AXgEHn'), |
||
6022 | ('#:', 'AG', 'E', 'IHj'), |
||
6023 | ('', 'A', '^+:#', 'AE'), |
||
6024 | (' :', 'A', '^+ ', 'EY'), |
||
6025 | ('', 'A', '^%', 'EY'), |
||
6026 | (' ', 'ARR', '', 'AXr'), |
||
6027 | ('', 'ARR', '', 'AEr'), |
||
6028 | (' :', 'AR', ' ', 'AAr'), |
||
6029 | ('', 'AR', ' ', 'ER'), |
||
6030 | ('', 'AR', '', 'AAr'), |
||
6031 | ('', 'AIR', '', 'EHr'), |
||
6032 | ('', 'AI', '', 'EY'), |
||
6033 | ('', 'AY', '', 'EY'), |
||
6034 | ('', 'AU', '', 'AO'), |
||
6035 | ('#:', 'AL', ' ', 'AXl'), |
||
6036 | ('#:', 'ALS', ' ', 'AXlz'), |
||
6037 | ('', 'ALK', '', 'AOk'), |
||
6038 | ('', 'AL', '^', 'AOl'), |
||
6039 | (' :', 'ABLE', '', 'EYbAXl'), |
||
6040 | ('', 'ABLE', '', 'AXbAXl'), |
||
6041 | ('', 'ANG', '+', 'EYnj'), |
||
6042 | ('', 'A', '', 'AE')), |
||
6043 | 'B': ((' ', 'BE', '^#', 'bIH'), |
||
6044 | ('', 'BEING', '', 'bIYIHNG'), |
||
6045 | (' ', 'BOTH', ' ', 'bOWTH'), |
||
6046 | (' ', 'BUS', '#', 'bIHz'), |
||
6047 | ('', 'BUIL', '', 'bIHl'), |
||
6048 | ('', 'B', '', 'b')), |
||
6049 | 'C': ((' ', 'CH', '^', 'k'), |
||
6050 | ('^E', 'CH', '', 'k'), |
||
6051 | ('', 'CH', '', 'CH'), |
||
6052 | (' S', 'CI', '#', 'sAY'), |
||
6053 | ('', 'CI', 'A', 'SH'), |
||
6054 | ('', 'CI', 'O', 'SH'), |
||
6055 | ('', 'CI', 'EN', 'SH'), |
||
6056 | ('', 'C', '+', 's'), |
||
6057 | ('', 'CK', '', 'k'), |
||
6058 | ('', 'COM', '%', 'kAHm'), |
||
6059 | ('', 'C', '', 'k')), |
||
6060 | 'D': (('#:', 'DED', ' ', 'dIHd'), |
||
6061 | ('.E', 'D', ' ', 'd'), |
||
6062 | ('#:^E', 'D', ' ', 't'), |
||
6063 | (' ', 'DE', '^#', 'dIH'), |
||
6064 | (' ', 'DO', ' ', 'dUW'), |
||
6065 | (' ', 'DOES', '', 'dAHz'), |
||
6066 | (' ', 'DOING', '', 'dUWIHNG'), |
||
6067 | (' ', 'DOW', '', 'dAW'), |
||
6068 | ('', 'DU', 'A', 'jUW'), |
||
6069 | ('', 'D', '', 'd')), |
||
6070 | 'E': (('#:', 'E', ' ', ''), |
||
6071 | ('\':^', 'E', ' ', ''), |
||
6072 | (' :', 'E', ' ', 'IY'), |
||
6073 | ('#', 'ED', ' ', 'd'), |
||
6074 | ('#:', 'E', 'D ', ''), |
||
6075 | ('', 'EV', 'ER', 'EHv'), |
||
6076 | ('', 'E', '^%', 'IY'), |
||
6077 | ('', 'ERI', '#', 'IYrIY'), |
||
6078 | ('', 'ERI', '', 'EHrIH'), |
||
6079 | ('#:', 'ER', '#', 'ER'), |
||
6080 | ('', 'ER', '#', 'EHr'), |
||
6081 | ('', 'ER', '', 'ER'), |
||
6082 | (' ', 'EVEN', '', 'IYvEHn'), |
||
6083 | ('#:', 'E', 'W', ''), |
||
6084 | ('T', 'EW', '', 'UW'), |
||
6085 | ('S', 'EW', '', 'UW'), |
||
6086 | ('R', 'EW', '', 'UW'), |
||
6087 | ('D', 'EW', '', 'UW'), |
||
6088 | ('L', 'EW', '', 'UW'), |
||
6089 | ('Z', 'EW', '', 'UW'), |
||
6090 | ('N', 'EW', '', 'UW'), |
||
6091 | ('J', 'EW', '', 'UW'), |
||
6092 | ('TH', 'EW', '', 'UW'), |
||
6093 | ('CH', 'EW', '', 'UW'), |
||
6094 | ('SH', 'EW', '', 'UW'), |
||
6095 | ('', 'EW', '', 'yUW'), |
||
6096 | ('', 'E', 'O', 'IY'), |
||
6097 | ('#:S', 'ES', ' ', 'IHz'), |
||
6098 | ('#:C', 'ES', ' ', 'IHz'), |
||
6099 | ('#:G', 'ES', ' ', 'IHz'), |
||
6100 | ('#:Z', 'ES', ' ', 'IHz'), |
||
6101 | ('#:X', 'ES', ' ', 'IHz'), |
||
6102 | ('#:J', 'ES', ' ', 'IHz'), |
||
6103 | ('#:CH', 'ES', ' ', 'IHz'), |
||
6104 | ('#:SH', 'ES', ' ', 'IHz'), |
||
6105 | ('#:', 'E', 'S ', ''), |
||
6106 | ('#:', 'ELY', ' ', 'lIY'), |
||
6107 | ('#:', 'EMENT', '', 'mEHnt'), |
||
6108 | ('', 'EFUL', '', 'fUHl'), |
||
6109 | ('', 'EE', '', 'IY'), |
||
6110 | ('', 'EARN', '', 'ERn'), |
||
6111 | (' ', 'EAR', '^', 'ER'), |
||
6112 | ('', 'EAD', '', 'EHd'), |
||
6113 | ('#:', 'EA', ' ', 'IYAX'), |
||
6114 | ('', 'EA', 'SU', 'EH'), |
||
6115 | ('', 'EA', '', 'IY'), |
||
6116 | ('', 'EIGH', '', 'EY'), |
||
6117 | ('', 'EI', '', 'IY'), |
||
6118 | (' ', 'EYE', '', 'AY'), |
||
6119 | ('', 'EY', '', 'IY'), |
||
6120 | ('', 'EU', '', 'yUW'), |
||
6121 | ('', 'E', '', 'EH')), |
||
6122 | 'F': (('', 'FUL', '', 'fUHl'), |
||
6123 | ('', 'F', '', 'f')), |
||
6124 | 'G': (('', 'GIV', '', 'gIHv'), |
||
6125 | (' ', 'G', 'I^', 'g'), |
||
6126 | ('', 'GE', 'T', 'gEH'), |
||
6127 | ('SU', 'GGES', '', 'gjEHs'), |
||
6128 | ('', 'GG', '', 'g'), |
||
6129 | (' B#', 'G', '', 'g'), |
||
6130 | ('', 'G', '+', 'j'), |
||
6131 | ('', 'GREAT', '', 'grEYt'), |
||
6132 | ('#', 'GH', '', ''), |
||
6133 | ('', 'G', '', 'g')), |
||
6134 | 'H': ((' ', 'HAV', '', 'hAEv'), |
||
6135 | (' ', 'HERE', '', 'hIYr'), |
||
6136 | (' ', 'HOUR', '', 'AWER'), |
||
6137 | ('', 'HOW', '', 'hAW'), |
||
6138 | ('', 'H', '#', 'h'), |
||
6139 | ('', 'H', '', '')), |
||
6140 | 'I': ((' ', 'IN', '', 'IHn'), |
||
6141 | (' ', 'I', ' ', 'AY'), |
||
6142 | ('', 'IN', 'D', 'AYn'), |
||
6143 | ('', 'IER', '', 'IYER'), |
||
6144 | ('#:R', 'IED', '', 'IYd'), |
||
6145 | ('', 'IED', ' ', 'AYd'), |
||
6146 | ('', 'IEN', '', 'IYEHn'), |
||
6147 | ('', 'IE', 'T', 'AYEH'), |
||
6148 | (' :', 'I', '%', 'AY'), |
||
6149 | ('', 'I', '%', 'IY'), |
||
6150 | ('', 'IE', '', 'IY'), |
||
6151 | ('', 'I', '^+:#', 'IH'), |
||
6152 | ('', 'IR', '#', 'AYr'), |
||
6153 | ('', 'IZ', '%', 'AYz'), |
||
6154 | ('', 'IS', '%', 'AYz'), |
||
6155 | ('', 'I', 'D%', 'AY'), |
||
6156 | ('+^', 'I', '^+', 'IH'), |
||
6157 | ('', 'I', 'T%', 'AY'), |
||
6158 | ('#:^', 'I', '^+', 'IH'), |
||
6159 | ('', 'I', '^+', 'AY'), |
||
6160 | ('', 'IR', '', 'ER'), |
||
6161 | ('', 'IGH', '', 'AY'), |
||
6162 | ('', 'ILD', '', 'AYld'), |
||
6163 | ('', 'IGN', ' ', 'AYn'), |
||
6164 | ('', 'IGN', '^', 'AYn'), |
||
6165 | ('', 'IGN', '%', 'AYn'), |
||
6166 | ('', 'IQUE', '', 'IYk'), |
||
6167 | ('', 'I', '', 'IH')), |
||
6168 | 'J': (('', 'J', '', 'j'),), |
||
6169 | 'K': ((' ', 'K', 'N', ''), |
||
6170 | ('', 'K', '', 'k')), |
||
6171 | 'L': (('', 'LO', 'C#', 'lOW'), |
||
6172 | ('L', 'L', '', ''), |
||
6173 | ('#:^', 'L', '%', 'AXl'), |
||
6174 | ('', 'LEAD', '', 'lIYd'), |
||
6175 | ('', 'L', '', 'l')), |
||
6176 | 'M': (('', 'MOV', '', 'mUWv'), |
||
6177 | ('', 'M', '', 'm')), |
||
6178 | 'N': (('E', 'NG', '+', 'nj'), |
||
6179 | ('', 'NG', 'R', 'NGg'), |
||
6180 | ('', 'NG', '#', 'NGg'), |
||
6181 | ('', 'NGL', '%', 'NGgAXl'), |
||
6182 | ('', 'NG', '', 'NG'), |
||
6183 | ('', 'NK', '', 'NGk'), |
||
6184 | (' ', 'NOW', ' ', 'nAW'), |
||
6185 | ('', 'N', '', 'n')), |
||
6186 | 'O': (('', 'OF', ' ', 'AXv'), |
||
6187 | ('', 'OROUGH', '', 'EROW'), |
||
6188 | ('#:', 'OR', ' ', 'ER'), |
||
6189 | ('#:', 'ORS', ' ', 'ERz'), |
||
6190 | ('', 'OR', '', 'AOr'), |
||
6191 | (' ', 'ONE', '', 'wAHn'), |
||
6192 | ('', 'OW', '', 'OW'), |
||
6193 | (' ', 'OVER', '', 'OWvER'), |
||
6194 | ('', 'OV', '', 'AHv'), |
||
6195 | ('', 'O', '^%', 'OW'), |
||
6196 | ('', 'O', '^EN', 'OW'), |
||
6197 | ('', 'O', '^I#', 'OW'), |
||
6198 | ('', 'OL', 'D', 'OWl'), |
||
6199 | ('', 'OUGHT', '', 'AOt'), |
||
6200 | ('', 'OUGH', '', 'AHf'), |
||
6201 | (' ', 'OU', '', 'AW'), |
||
6202 | ('H', 'OU', 'S#', 'AW'), |
||
6203 | ('', 'OUS', '', 'AXs'), |
||
6204 | ('', 'OUR', '', 'AOr'), |
||
6205 | ('', 'OULD', '', 'UHd'), |
||
6206 | ('^', 'OU', '^L', 'AH'), |
||
6207 | ('', 'OUP', '', 'UWp'), |
||
6208 | ('', 'OU', '', 'AW'), |
||
6209 | ('', 'OY', '', 'OY'), |
||
6210 | ('', 'OING', '', 'OWIHNG'), |
||
6211 | ('', 'OI', '', 'OY'), |
||
6212 | ('', 'OOR', '', 'AOr'), |
||
6213 | ('', 'OOK', '', 'UHk'), |
||
6214 | ('', 'OOD', '', 'UHd'), |
||
6215 | ('', 'OO', '', 'UW'), |
||
6216 | ('', 'O', 'E', 'OW'), |
||
6217 | ('', 'O', ' ', 'OW'), |
||
6218 | ('', 'OA', '', 'OW'), |
||
6219 | (' ', 'ONLY', '', 'OWnlIY'), |
||
6220 | (' ', 'ONCE', '', 'wAHns'), |
||
6221 | ('', 'ON\'T', '', 'OWnt'), |
||
6222 | ('C', 'O', 'N', 'AA'), |
||
6223 | ('', 'O', 'NG', 'AO'), |
||
6224 | (' :^', 'O', 'N', 'AH'), |
||
6225 | ('I', 'ON', '', 'AXn'), |
||
6226 | ('#:', 'ON', ' ', 'AXn'), |
||
6227 | ('#^', 'ON', '', 'AXn'), |
||
6228 | ('', 'O', 'ST ', 'OW'), |
||
6229 | ('', 'OF', '^', 'AOf'), |
||
6230 | ('', 'OTHER', '', 'AHDHER'), |
||
6231 | ('', 'OSS', ' ', 'AOs'), |
||
6232 | ('#:^', 'OM', '', 'AHm'), |
||
6233 | ('', 'O', '', 'AA')), |
||
6234 | 'P': (('', 'PH', '', 'f'), |
||
6235 | ('', 'PEOP', '', 'pIYp'), |
||
6236 | ('', 'POW', '', 'pAW'), |
||
6237 | ('', 'PUT', ' ', 'pUHt'), |
||
6238 | ('', 'P', '', 'p')), |
||
6239 | 'Q': (('', 'QUAR', '', 'kwAOr'), |
||
6240 | ('', 'QU', '', 'kw'), |
||
6241 | ('', 'Q', '', 'k')), |
||
6242 | 'R': ((' ', 'RE', '^#', 'rIY'), |
||
6243 | ('', 'R', '', 'r')), |
||
6244 | 'S': (('', 'SH', '', 'SH'), |
||
6245 | ('#', 'SION', '', 'ZHAXn'), |
||
6246 | ('', 'SOME', '', 'sAHm'), |
||
6247 | ('#', 'SUR', '#', 'ZHER'), |
||
6248 | ('', 'SUR', '#', 'SHER'), |
||
6249 | ('#', 'SU', '#', 'ZHUW'), |
||
6250 | ('#', 'SSU', '#', 'SHUW'), |
||
6251 | ('#', 'SED', ' ', 'zd'), |
||
6252 | ('#', 'S', '#', 'z'), |
||
6253 | ('', 'SAID', '', 'sEHd'), |
||
6254 | ('^', 'SION', '', 'SHAXn'), |
||
6255 | ('', 'S', 'S', ''), |
||
6256 | ('.', 'S', ' ', 'z'), |
||
6257 | ('#:.E', 'S', ' ', 'z'), |
||
6258 | ('#:^##', 'S', ' ', 'z'), |
||
6259 | ('#:^#', 'S', ' ', 's'), |
||
6260 | ('U', 'S', ' ', 's'), |
||
6261 | (' :#', 'S', ' ', 'z'), |
||
6262 | (' ', 'SCH', '', 'sk'), |
||
6263 | ('', 'S', 'C+', ''), |
||
6264 | ('#', 'SM', '', 'zm'), |
||
6265 | ('#', 'SN', '\'', 'zAXn'), |
||
6266 | ('', 'S', '', 's')), |
||
6267 | 'T': ((' ', 'THE', ' ', 'DHAX'), |
||
6268 | ('', 'TO', ' ', 'tUW'), |
||
6269 | ('', 'THAT', ' ', 'DHAEt'), |
||
6270 | (' ', 'THIS', ' ', 'DHIHs'), |
||
6271 | (' ', 'THEY', '', 'DHEY'), |
||
6272 | (' ', 'THERE', '', 'DHEHr'), |
||
6273 | ('', 'THER', '', 'DHER'), |
||
6274 | ('', 'THEIR', '', 'DHEHr'), |
||
6275 | (' ', 'THAN', ' ', 'DHAEn'), |
||
6276 | (' ', 'THEM', ' ', 'DHEHm'), |
||
6277 | ('', 'THESE', ' ', 'DHIYz'), |
||
6278 | (' ', 'THEN', '', 'DHEHn'), |
||
6279 | ('', 'THROUGH', '', 'THrUW'), |
||
6280 | ('', 'THOSE', '', 'DHOWz'), |
||
6281 | ('', 'THOUGH', ' ', 'DHOW'), |
||
6282 | (' ', 'THUS', '', 'DHAHs'), |
||
6283 | ('', 'TH', '', 'TH'), |
||
6284 | ('#:', 'TED', ' ', 'tIHd'), |
||
6285 | ('S', 'TI', '#N', 'CH'), |
||
6286 | ('', 'TI', 'O', 'SH'), |
||
6287 | ('', 'TI', 'A', 'SH'), |
||
6288 | ('', 'TIEN', '', 'SHAXn'), |
||
6289 | ('', 'TUR', '#', 'CHER'), |
||
6290 | ('', 'TU', 'A', 'CHUW'), |
||
6291 | (' ', 'TWO', '', 'tUW'), |
||
6292 | ('', 'T', '', 't')), |
||
6293 | 'U': ((' ', 'UN', 'I', 'yUWn'), |
||
6294 | (' ', 'UN', '', 'AHn'), |
||
6295 | (' ', 'UPON', '', 'AXpAOn'), |
||
6296 | ('T', 'UR', '#', 'UHr'), |
||
6297 | ('S', 'UR', '#', 'UHr'), |
||
6298 | ('R', 'UR', '#', 'UHr'), |
||
6299 | ('D', 'UR', '#', 'UHr'), |
||
6300 | ('L', 'UR', '#', 'UHr'), |
||
6301 | ('Z', 'UR', '#', 'UHr'), |
||
6302 | ('N', 'UR', '#', 'UHr'), |
||
6303 | ('J', 'UR', '#', 'UHr'), |
||
6304 | ('TH', 'UR', '#', 'UHr'), |
||
6305 | ('CH', 'UR', '#', 'UHr'), |
||
6306 | ('SH', 'UR', '#', 'UHr'), |
||
6307 | ('', 'UR', '#', 'yUHr'), |
||
6308 | ('', 'UR', '', 'ER'), |
||
6309 | ('', 'U', '^ ', 'AH'), |
||
6310 | ('', 'U', '^^', 'AH'), |
||
6311 | ('', 'UY', '', 'AY'), |
||
6312 | (' G', 'U', '#', ''), |
||
6313 | ('G', 'U', '%', ''), |
||
6314 | ('G', 'U', '#', 'w'), |
||
6315 | ('#N', 'U', '', 'yUW'), |
||
6316 | ('T', 'U', '', 'UW'), |
||
6317 | ('S', 'U', '', 'UW'), |
||
6318 | ('R', 'U', '', 'UW'), |
||
6319 | ('D', 'U', '', 'UW'), |
||
6320 | ('L', 'U', '', 'UW'), |
||
6321 | ('Z', 'U', '', 'UW'), |
||
6322 | ('N', 'U', '', 'UW'), |
||
6323 | ('J', 'U', '', 'UW'), |
||
6324 | ('TH', 'U', '', 'UW'), |
||
6325 | ('CH', 'U', '', 'UW'), |
||
6326 | ('SH', 'U', '', 'UW'), |
||
6327 | ('', 'U', '', 'yUW')), |
||
6328 | 'V': (('', 'VIEW', '', 'vyUW'), |
||
6329 | ('', 'V', '', 'v')), |
||
6330 | 'W': ((' ', 'WERE', '', 'wER'), |
||
6331 | ('', 'WA', 'S', 'wAA'), |
||
6332 | ('', 'WA', 'T', 'wAA'), |
||
6333 | ('', 'WHERE', '', 'WHEHr'), |
||
6334 | ('', 'WHAT', '', 'WHAAt'), |
||
6335 | ('', 'WHOL', '', 'hOWl'), |
||
6336 | ('', 'WHO', '', 'hUW'), |
||
6337 | ('', 'WH', '', 'WH'), |
||
6338 | ('', 'WAR', '', 'wAOr'), |
||
6339 | ('', 'WOR', '^', 'wER'), |
||
6340 | ('', 'WR', '', 'r'), |
||
6341 | ('', 'W', '', 'w')), |
||
6342 | 'X': (('', 'X', '', 'ks'),), |
||
6343 | 'Y': (('', 'YOUNG', '', 'yAHNG'), |
||
6344 | (' ', 'YOU', '', 'yUW'), |
||
6345 | (' ', 'YES', '', 'yEHs'), |
||
6346 | (' ', 'Y', '', 'y'), |
||
6347 | ('#:^', 'Y', ' ', 'IY'), |
||
6348 | ('#:^', 'Y', 'I', 'IY'), |
||
6349 | (' :', 'Y', ' ', 'AY'), |
||
6350 | (' :', 'Y', '#', 'AY'), |
||
6351 | (' :', 'Y', '^+:#', 'IH'), |
||
6352 | (' :', 'Y', '^#', 'AY'), |
||
6353 | ('', 'Y', '', 'IH')), |
||
6354 | 'Z': (('', 'Z', '', 'z'),)} |
||
6355 | |||
6356 | word = word.upper() |
||
6357 | |||
6358 | pron = '' |
||
6359 | pos = 0 |
||
6360 | while pos < len(word): |
||
6361 | left_orig = word[:pos] |
||
6362 | right_orig = word[pos:] |
||
6363 | first = word[pos] if word[pos] in rules else ' ' |
||
6364 | for rule in rules[first]: |
||
6365 | left, match, right, out = rule |
||
6366 | if right_orig.startswith(match): |
||
6367 | if left: |
||
6368 | l_pattern = to_regex(left, left=True) |
||
6369 | if right: |
||
6370 | r_pattern = to_regex(right, left=False) |
||
6371 | if ((not left or re_match(l_pattern, left_orig)) and |
||
6372 | (not right or |
||
6373 | re_match(r_pattern, right_orig[len(match):]))): |
||
6374 | pron += out |
||
6375 | pos += len(match) |
||
6376 | break |
||
6377 | else: |
||
6378 | pron += word[pos] |
||
6379 | pos += 1 |
||
6380 | |||
6381 | return pron |
||
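The table and loop above form the NRL letter-to-sound pass: every rule is a 4-tuple of (left context, letters to match, right context, phonetic output), and at each position the first rule whose match is a prefix of the remaining word and whose contexts succeed emits its output and consumes len(match) characters; the loop's else branch copies any character no rule covers. The sketch below is a simplified illustration by the editor, not abydos code: contexts are plain literal strings here, whereas the real rules use symbols such as '#' and '^' that to_regex() expands into regular expressions before matching.

    # Toy illustration (hypothetical; mirrors the rule shape used above)
    TOY_RULES = {
        'P': ((' ', 'PH', '', 'f'),   # word-initial 'PH' -> /f/
              ('', 'P', '', 'p')),    # any other 'P' -> /p/
        'O': (('', 'O', '', 'AA'),),
        'N': (('', 'N', '', 'n'),),
        'E': (('', 'E', ' ', ''),),   # word-final 'E' is silent
    }

    def toy_transcribe(word):
        word = ' ' + word.upper() + ' '   # pad so ' ' contexts match literally
        pron = ''
        pos = 1
        while pos < len(word) - 1:
            for left, match, right, out in TOY_RULES.get(word[pos], ()):
                if (word[pos:pos + len(match)] == match
                        and word[:pos].endswith(left)
                        and word[pos + len(match):].startswith(right)):
                    pron += out
                    pos += len(match)
                    break
            else:
                pron += word[pos]
                pos += 1
        return pron

    # toy_transcribe('phone') -> 'fAAn'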
6382 | |||
6383 | |||
6384 | def bmpm(word, language_arg=0, name_mode='gen', match_mode='approx', |
||
6385 | concat=False, filter_langs=False): |
||
6386 | """Return the Beider-Morse Phonetic Matching algorithm code for a word. |
||
6387 | |||
6388 | The Beider-Morse Phonetic Matching algorithm is described in |
||
6389 | :cite:`Beider:2008`. |
||
6390 | The reference implementation is licensed under GPLv3. |
||
6391 | |||
6392 | :param str word: the word to transform |
||
6393 | :param language_arg: the language of the term (0, the default, leaves the |
||
6394 | language unspecified); supported values include: |
||
6395 | |||
6396 | - 'any' |
||
6397 | - 'arabic' |
||
6398 | - 'cyrillic' |
||
6399 | - 'czech' |
||
6400 | - 'dutch' |
||
6401 | - 'english' |
||
6402 | - 'french' |
||
6403 | - 'german' |
||
6404 | - 'greek' |
||
6405 | - 'greeklatin' |
||
6406 | - 'hebrew' |
||
6407 | - 'hungarian' |
||
6408 | - 'italian' |
||
6409 | - 'latvian' |
||
6410 | - 'polish' |
||
6411 | - 'portuguese' |
||
6412 | - 'romanian' |
||
6413 | - 'russian' |
||
6414 | - 'spanish' |
||
6415 | - 'turkish' |
||
6416 | |||
6417 | :param str name_mode: the name mode of the algorithm: |
||
6418 | |||
6419 | - 'gen' -- general (default) |
||
6420 | - 'ash' -- Ashkenazi |
||
6421 | - 'sep' -- Sephardic |
||
6422 | |||
6423 | :param str match_mode: matching mode: 'approx' or 'exact' |
||
6424 | :param bool concat: concatenation mode |
||
6425 | :param bool filter_langs: filter out incompatible languages |
||
6426 | :returns: the BMPM value(s), as a space-delimited string of codes |
||
6427 | :rtype: str |
||
6428 | |||
6429 | >>> bmpm('Christopher') |
||
6430 | 'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir |
||
6431 | xristYfir xristopi xritopir xritopi xristofi xritofir xritofi tzristopir |
||
6432 | tzristofir zristopir zristopi zritopir zritopi zristofir zristofi zritofir |
||
6433 | zritofi' |
||
6434 | >>> bmpm('Niall') |
||
6435 | 'nial niol' |
||
6436 | >>> bmpm('Smith') |
||
6437 | 'zmit' |
||
6438 | >>> bmpm('Schmidt') |
||
6439 | 'zmit stzmit' |
||
6440 | |||
6441 | >>> bmpm('Christopher', language_arg='German') |
||
6442 | 'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir |
||
6443 | xristYfir' |
||
6444 | >>> bmpm('Christopher', language_arg='English') |
||
6445 | 'tzristofir tzrQstofir tzristafir tzrQstafir xristofir xrQstofir xristafir |
||
6446 | xrQstafir' |
||
6447 | >>> bmpm('Christopher', language_arg='German', name_mode='ash') |
||
6448 | 'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir |
||
6449 | xristYfir' |
||
6450 | |||
6451 | >>> bmpm('Christopher', language_arg='German', match_mode='exact') |
||
6452 | 'xriStopher xriStofer xristopher xristofer' |
||
6453 | """ |
||
6454 | return _bmpm(word, language_arg, name_mode, match_mode, |
||
6455 | concat, filter_langs) |
||
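Because bmpm() returns its codes as one space-delimited string (as the doctests show), a common way to use the result is to count two names as a phonetic match when their code sets share at least one element. The helper below is only a usage sketch; bm_match is not part of abydos.

    # Usage sketch (hypothetical helper, not an abydos API):
    def bm_match(name1, name2, **kwargs):
        """Return True if two names share at least one BMPM code."""
        codes1 = set(bmpm(name1, **kwargs).split())
        codes2 = set(bmpm(name2, **kwargs).split())
        return bool(codes1 & codes2)

    # From the doctests above: bmpm('Smith') == 'zmit' and
    # bmpm('Schmidt') == 'zmit stzmit', so bm_match('Smith', 'Schmidt') is True.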
6456 | |||
6457 | |||
6458 | if __name__ == '__main__': |
||
6459 | import doctest |
||
6460 | doctest.testmod() |
||
6461 |