1
|
|
|
# -*- coding: utf-8 -*- |
|
|
|
|
2
|
|
|
|
3
|
|
|
# Copyright 2014-2018 by Christopher C. Little. |
4
|
|
|
# This file is part of Abydos. |
5
|
|
|
# |
6
|
|
|
# Abydos is free software: you can redistribute it and/or modify |
7
|
|
|
# it under the terms of the GNU General Public License as published by |
8
|
|
|
# the Free Software Foundation, either version 3 of the License, or |
9
|
|
|
# (at your option) any later version. |
10
|
|
|
# |
11
|
|
|
# Abydos is distributed in the hope that it will be useful, |
12
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
13
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14
|
|
|
# GNU General Public License for more details. |
15
|
|
|
# |
16
|
|
|
# You should have received a copy of the GNU General Public License |
17
|
|
|
# along with Abydos. If not, see <http://www.gnu.org/licenses/>. |
18
|
|
|
|
19
|
|
|
"""abydos.phonetic. |
20
|
|
|
|
21
|
|
|
The phonetic module implements phonetic algorithms including: |
22
|
|
|
|
23
|
|
|
- Robert C. Russell's Index |
24
|
|
|
- American Soundex |
25
|
|
|
- Refined Soundex |
26
|
|
|
- Daitch-Mokotoff Soundex |
27
|
|
|
- Kölner Phonetik |
28
|
|
|
- NYSIIS |
29
|
|
|
- Match Rating Algorithm |
30
|
|
|
- Metaphone |
31
|
|
|
- Double Metaphone |
32
|
|
|
- Caverphone |
33
|
|
|
- Alpha Search Inquiry System |
34
|
|
|
- Fuzzy Soundex |
35
|
|
|
- Phonex |
36
|
|
|
- Phonem |
37
|
|
|
- Phonix |
38
|
|
|
- SfinxBis |
39
|
|
|
- phonet |
40
|
|
|
- Standardized Phonetic Frequency Code |
41
|
|
|
- Statistics Canada |
42
|
|
|
- Lein |
43
|
|
|
- Roger Root |
44
|
|
|
- Oxford Name Compression Algorithm (ONCA) |
45
|
|
|
- Eudex phonetic hash |
46
|
|
|
- Haase Phonetik |
47
|
|
|
- Reth-Schek Phonetik |
48
|
|
|
- FONEM |
49
|
|
|
- Parmar-Kumbharana |
50
|
|
|
- Davidson's Consonant Code |
51
|
|
|
- SoundD |
52
|
|
|
- PSHP Soundex/Viewex Coding |
53
|
|
|
- an early version of Henry Code |
54
|
|
|
- Norphone |
55
|
|
|
- Dolby Code |
56
|
|
|
- Phonetic Spanish |
57
|
|
|
- Spanish Metaphone |
58
|
|
|
- MetaSoundex |
59
|
|
|
- SoundexBR |
60
|
|
|
- NRL English-to-phoneme |
61
|
|
|
- Beider-Morse Phonetic Matching |
62
|
|
|
""" |
63
|
|
|
|
64
|
|
|
from __future__ import division, unicode_literals |
65
|
|
|
|
66
|
|
|
from collections import Counter |
67
|
|
|
from itertools import groupby, product |
68
|
|
|
from re import compile as re_compile |
69
|
|
|
from re import match as re_match |
70
|
|
|
from unicodedata import normalize |
71
|
|
|
|
72
|
|
|
from six import text_type |
73
|
|
|
from six.moves import range |
74
|
|
|
|
75
|
|
|
from ._bm import _bmpm |
76
|
|
|
|
77
|
|
|
# Sentinel used throughout this module as the "no limit" value for the
# ``maxlength`` parameters (i.e. do not truncate the returned code).
_INFINITY = float('inf')

# Explicit public API of the phonetic module.
__all__ = ['alpha_sis', 'bmpm', 'caverphone', 'davidson', 'dm_soundex',
           'dolby', 'double_metaphone', 'eudex', 'fonem', 'fuzzy_soundex',
           'haase_phonetik', 'henry_early', 'koelner_phonetik',
           'koelner_phonetik_alpha', 'koelner_phonetik_num_to_alpha', 'lein',
           'metaphone', 'metasoundex', 'mra', 'norphone', 'nrl', 'nysiis',
           'onca', 'parmar_kumbharana', 'phonem', 'phonet', 'phonetic_spanish',
           'phonex', 'phonix', 'pshp_soundex_first', 'pshp_soundex_last',
           'refined_soundex', 'reth_schek_phonetik', 'roger_root',
           'russell_index', 'russell_index_alpha',
           'russell_index_num_to_alpha', 'sfinxbis', 'sound_d', 'soundex',
           'soundex_br', 'spanish_metaphone', 'spfc', 'statistics_canada']
90
|
|
|
|
91
|
|
|
|
92
|
|
|
def _delete_consecutive_repeats(word): |
93
|
|
|
"""Delete consecutive repeated characters in a word. |
94
|
|
|
|
95
|
|
|
:param str word: the word to transform |
96
|
|
|
:returns: word with consecutive repeating characters collapsed to |
97
|
|
|
a single instance |
98
|
|
|
:rtype: str |
99
|
|
|
""" |
100
|
|
|
return ''.join(char for char, _ in groupby(word)) |
101
|
|
|
|
102
|
|
|
|
103
|
|
|
def russell_index(word):
    """Return the Russell Index (integer output) of a word.

    This follows Robert C. Russell's Index algorithm, as described in
    :cite:`Russell:1917`.

    :param str word: the word to transform
    :returns: the Russell Index value
    :rtype: int

    >>> russell_index('Christopher')
    3813428
    >>> russell_index('Niall')
    715
    >>> russell_index('Smith')
    3614
    >>> russell_index('Schmidt')
    3614
    """
    # letters with a Russell code (H, J & W have none) and their codes
    _coded_letters = 'ABCDEFGIKLMNOPQRSTUVXYZ'
    _russell_translation = dict(zip((ord(ltr) for ltr in _coded_letters),
                                    '12341231356712383412313'))

    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = word.replace('GH', '')  # discard gh (rule 3)
    word = word.rstrip('SZ')  # discard /[sz]$/ (rule 3)

    # drop letters without a code, then translate the remainder
    word = ''.join(ltr for ltr in word if ltr in set(_coded_letters))
    sdx = word.translate(_russell_translation)

    # remove any 1s after the first occurrence
    first_one = sdx.find('1') + 1
    if first_one:
        sdx = sdx[:first_one] + sdx[first_one:].replace('1', '')

    # remove repeating characters
    sdx = _delete_consecutive_repeats(sdx)

    # return as an int (NaN when nothing was codable)
    return int(sdx) if sdx else float('NaN')
147
|
|
|
|
148
|
|
|
|
149
|
|
|
def russell_index_num_to_alpha(num):
    """Convert the Russell Index integer to an alphabetic string.

    This follows Robert C. Russell's Index algorithm, as described in
    :cite:`Russell:1917`.

    :param int num: a Russell Index integer value
    :returns: the Russell Index as an alphabetic string
    :rtype: str

    >>> russell_index_num_to_alpha(3813428)
    'CRACDBR'
    >>> russell_index_num_to_alpha(715)
    'NAL'
    >>> russell_index_num_to_alpha(3614)
    'CMAD'
    """
    _num_translation = dict(zip((ord(digit) for digit in '12345678'),
                                'ABCDLMNR'))
    # keep only the digits 1-8; everything else in the stringified input
    # (including any 0s or 9s) is discarded
    digits = ''.join(d for d in text_type(num) if d in set('12345678'))
    return digits.translate(_num_translation) if digits else ''
173
|
|
|
|
174
|
|
|
|
175
|
|
|
def russell_index_alpha(word):
    """Return the Russell Index (alphabetic output) for the word.

    This follows Robert C. Russell's Index algorithm, as described in
    :cite:`Russell:1917`.

    :param str word: the word to transform
    :returns: the Russell Index value as an alphabetic string
    :rtype: str

    >>> russell_index_alpha('Christopher')
    'CRACDBR'
    >>> russell_index_alpha('Niall')
    'NAL'
    >>> russell_index_alpha('Smith')
    'CMAD'
    >>> russell_index_alpha('Schmidt')
    'CMAD'
    """
    # guard clause: an empty word has an empty index
    if not word:
        return ''
    return russell_index_num_to_alpha(russell_index(word))
197
|
|
|
|
198
|
|
|
|
199
|
|
|
def soundex(word, maxlength=4, var='American', reverse=False, zero_pad=True):
    """Return the Soundex code for a word.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 4)
    :param str var: the variant of the algorithm to employ (defaults to
        'American'):

        - 'American' follows the American Soundex algorithm, as described at
          :cite:`US:2007` and in :cite:`Knuth:1998`; this is also called
          Miracode
        - 'special' follows the rules from the 1880-1910 US Census
          retrospective re-analysis, in which h & w are not treated as blocking
          consonants but as vowels. Cf. :cite:`Repici:2013`.
        - 'Census' follows the rules laid out in GIL 55 :cite:`US:1997` by the
          US Census, including coding prefixed and unprefixed versions of some
          names

    :param bool reverse: reverse the word before computing the selected Soundex
        (defaults to False); This results in "Reverse Soundex", which is useful
        for blocking in cases where the initial elements may be in error.
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Soundex value (a tuple of two values in 'Census' mode, when
        the name carries a recognized prefix)
    :rtype: str

    >>> soundex("Christopher")
    'C623'
    >>> soundex("Niall")
    'N400'
    >>> soundex('Smith')
    'S530'
    >>> soundex('Schmidt')
    'S530'

    >>> soundex('Christopher', maxlength=_INFINITY)
    'C623160000000000000000000000000000000000000000000000000000000000'
    >>> soundex('Christopher', maxlength=_INFINITY, zero_pad=False)
    'C62316'

    >>> soundex('Christopher', reverse=True)
    'R132'

    >>> soundex('Ashcroft')
    'A261'
    >>> soundex('Asicroft')
    'A226'
    >>> soundex('Ashcroft', var='special')
    'A226'
    >>> soundex('Asicroft', var='special')
    'A226'
    """
    # A-Z mapped to Soundex digits: '9' marks H & W (blocking consonants),
    # '0' marks vowels & Y; both are stripped in later steps.
    _soundex_translation = dict(zip((ord(_) for _ in
                                     'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                    '01230129022455012623019202'))

    # Require a maxlength of at least 4 and not more than 64
    if maxlength is not None:
        maxlength = min(max(4, maxlength), 64)
    else:
        maxlength = 64

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    if var == 'Census':
        # TODO: Should these prefixes be supplemented? (VANDE, DELA, VON)
        # In Census mode a prefixed name is coded twice: once whole, once
        # with the prefix stripped; both codes are returned as a tuple.
        if word[:3] in {'VAN', 'CON'} and len(word) > 4:
            return (soundex(word, maxlength, 'American', reverse, zero_pad),
                    soundex(word[3:], maxlength, 'American', reverse,
                            zero_pad))
        if word[:2] in {'DE', 'DI', 'LA', 'LE'} and len(word) > 3:
            return (soundex(word, maxlength, 'American', reverse, zero_pad),
                    soundex(word[2:], maxlength, 'American', reverse,
                            zero_pad))
        # Otherwise, proceed as usual (var='American' mode, ostensibly)

    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    # Nothing to convert, return base case
    if not word:
        if zero_pad:
            return '0'*maxlength
        return '0'

    # Reverse word if computing Reverse Soundex
    if reverse:
        word = word[::-1]

    # apply the Soundex algorithm
    sdx = word.translate(_soundex_translation)

    if var == 'special':
        sdx = sdx.replace('9', '0')  # special rule for 1880-1910 census
    else:
        sdx = sdx.replace('9', '')  # rule 1
    sdx = _delete_consecutive_repeats(sdx)  # rule 3

    # restore the initial letter: a leading H/W was coded '9' (and removed
    # or zeroed above), so it is prepended; any other letter replaces its
    # own leading code digit
    if word[0] in 'HW':
        sdx = word[0] + sdx
    else:
        sdx = word[0] + sdx[1:]
    sdx = sdx.replace('0', '')  # rule 1

    if zero_pad:
        sdx += ('0'*maxlength)  # rule 4

    return sdx[:maxlength]
312
|
|
|
|
313
|
|
|
|
314
|
|
|
def refined_soundex(word, maxlength=_INFINITY, zero_pad=False,
                    retain_vowels=False):
    """Return the Refined Soundex code for a word.

    This is Soundex, but with more character classes. It was defined at
    :cite:`Boyce:1998`.

    :param word: the word to transform
    :param maxlength: the length of the code returned (defaults to unlimited)
    :param zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :param retain_vowels: retain vowels (as 0) in the resulting code
    :returns: the Refined Soundex value
    :rtype: str

    >>> refined_soundex('Christopher')
    'C393619'
    >>> refined_soundex('Niall')
    'N87'
    >>> refined_soundex('Smith')
    'S386'
    >>> refined_soundex('Schmidt')
    'S386'
    """
    _uc_alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    _translation = dict(zip((ord(ltr) for ltr in _uc_alphabet),
                            '01360240043788015936020505'))

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = normalize('NFKD', text_type(word.upper())).replace('ß', 'SS')
    word = ''.join(ltr for ltr in word if ltr in set(_uc_alphabet))

    # the first letter is kept verbatim ahead of the fully coded word,
    # then runs of identical characters are collapsed
    sdx = _delete_consecutive_repeats(word[:1] +
                                      word.translate(_translation))
    if not retain_vowels:
        sdx = sdx.replace('0', '')  # delete vowels, H, W, Y

    if maxlength < _INFINITY:
        if zero_pad:
            sdx += '0' * maxlength
        sdx = sdx[:maxlength]

    return sdx
362
|
|
|
|
363
|
|
|
|
364
|
|
|
def dm_soundex(word, maxlength=6, zero_pad=True):
    """Return the Daitch-Mokotoff Soundex code for a word.

    Based on Daitch-Mokotoff Soundex :cite:`Mokotoff:1997`, this returns values
    of a word as a set. A collection is necessary since there can be multiple
    values for a single word.

    :param word: the word to transform
    :param maxlength: the length of the code returned (defaults to 6)
    :param zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Daitch-Mokotoff Soundex value
    :rtype: str

    >>> sorted(dm_soundex('Christopher'))
    ['494379', '594379']
    >>> dm_soundex('Niall')
    {'680000'}
    >>> dm_soundex('Smith')
    {'463000'}
    >>> dm_soundex('Schmidt')
    {'463000'}

    >>> sorted(dm_soundex('The quick brown fox', maxlength=20, zero_pad=False))
    ['35457976754', '3557976754']
    """
    # Each entry maps a substring to a triple of codes for the three
    # positional contexts used below: (word-initial, before a vowel,
    # elsewhere). '_' is a placeholder (no code emitted); a nested tuple
    # means the substring has two alternative codings, which causes the
    # result set to branch.
    _dms_table = {'STCH': (2, 4, 4), 'DRZ': (4, 4, 4), 'ZH': (4, 4, 4),
                  'ZHDZH': (2, 4, 4), 'DZH': (4, 4, 4), 'DRS': (4, 4, 4),
                  'DZS': (4, 4, 4), 'SCHTCH': (2, 4, 4), 'SHTSH': (2, 4, 4),
                  'SZCZ': (2, 4, 4), 'TZS': (4, 4, 4), 'SZCS': (2, 4, 4),
                  'STSH': (2, 4, 4), 'SHCH': (2, 4, 4), 'D': (3, 3, 3),
                  'H': (5, 5, '_'), 'TTSCH': (4, 4, 4), 'THS': (4, 4, 4),
                  'L': (8, 8, 8), 'P': (7, 7, 7), 'CHS': (5, 54, 54),
                  'T': (3, 3, 3), 'X': (5, 54, 54), 'OJ': (0, 1, '_'),
                  'OI': (0, 1, '_'), 'SCHTSH': (2, 4, 4), 'OY': (0, 1, '_'),
                  'Y': (1, '_', '_'), 'TSH': (4, 4, 4), 'ZDZ': (2, 4, 4),
                  'TSZ': (4, 4, 4), 'SHT': (2, 43, 43), 'SCHTSCH': (2, 4, 4),
                  'TTSZ': (4, 4, 4), 'TTZ': (4, 4, 4), 'SCH': (4, 4, 4),
                  'TTS': (4, 4, 4), 'SZD': (2, 43, 43), 'AI': (0, 1, '_'),
                  'PF': (7, 7, 7), 'TCH': (4, 4, 4), 'PH': (7, 7, 7),
                  'TTCH': (4, 4, 4), 'SZT': (2, 43, 43), 'ZDZH': (2, 4, 4),
                  'EI': (0, 1, '_'), 'G': (5, 5, 5), 'EJ': (0, 1, '_'),
                  'ZD': (2, 43, 43), 'IU': (1, '_', '_'), 'K': (5, 5, 5),
                  'O': (0, '_', '_'), 'SHTCH': (2, 4, 4), 'S': (4, 4, 4),
                  'TRZ': (4, 4, 4), 'SHD': (2, 43, 43), 'DSH': (4, 4, 4),
                  'CSZ': (4, 4, 4), 'EU': (1, 1, '_'), 'TRS': (4, 4, 4),
                  'ZS': (4, 4, 4), 'STRZ': (2, 4, 4), 'UY': (0, 1, '_'),
                  'STRS': (2, 4, 4), 'CZS': (4, 4, 4),
                  'MN': ('6_6', '6_6', '6_6'), 'UI': (0, 1, '_'),
                  'UJ': (0, 1, '_'), 'UE': (0, '_', '_'), 'EY': (0, 1, '_'),
                  'W': (7, 7, 7), 'IA': (1, '_', '_'), 'FB': (7, 7, 7),
                  'STSCH': (2, 4, 4), 'SCHT': (2, 43, 43),
                  'NM': ('6_6', '6_6', '6_6'), 'SCHD': (2, 43, 43),
                  'B': (7, 7, 7), 'DSZ': (4, 4, 4), 'F': (7, 7, 7),
                  'N': (6, 6, 6), 'CZ': (4, 4, 4), 'R': (9, 9, 9),
                  'U': (0, '_', '_'), 'V': (7, 7, 7), 'CS': (4, 4, 4),
                  'Z': (4, 4, 4), 'SZ': (4, 4, 4), 'TSCH': (4, 4, 4),
                  'KH': (5, 5, 5), 'ST': (2, 43, 43), 'KS': (5, 54, 54),
                  'SH': (4, 4, 4), 'SC': (2, 4, 4), 'SD': (2, 43, 43),
                  'DZ': (4, 4, 4), 'ZHD': (2, 43, 43), 'DT': (3, 3, 3),
                  'ZSH': (4, 4, 4), 'DS': (4, 4, 4), 'TZ': (4, 4, 4),
                  'TS': (4, 4, 4), 'TH': (3, 3, 3), 'TC': (4, 4, 4),
                  'A': (0, '_', '_'), 'E': (0, '_', '_'), 'I': (0, '_', '_'),
                  'AJ': (0, 1, '_'), 'M': (6, 6, 6), 'Q': (5, 5, 5),
                  'AU': (0, 7, '_'), 'IO': (1, '_', '_'), 'AY': (0, 1, '_'),
                  'IE': (1, '_', '_'), 'ZSCH': (4, 4, 4),
                  'CH': ((5, 4), (5, 4), (5, 4)),
                  'CK': ((5, 45), (5, 45), (5, 45)),
                  'C': ((5, 4), (5, 4), (5, 4)),
                  'J': ((1, 4), ('_', 4), ('_', 4)),
                  'RZ': ((94, 4), (94, 4), (94, 4)),
                  'RS': ((94, 4), (94, 4), (94, 4))}

    # Candidate substrings for each initial letter, tried longest-match
    # first. (Single-candidate values like ('B') are actually plain
    # strings, which still work below because iterating a one-character
    # string yields that character.)
    _dms_order = {'A': ('AI', 'AJ', 'AU', 'AY', 'A'),
                  'B': ('B'),
                  'C': ('CHS', 'CSZ', 'CZS', 'CH', 'CK', 'CS', 'CZ', 'C'),
                  'D': ('DRS', 'DRZ', 'DSH', 'DSZ', 'DZH', 'DZS', 'DS', 'DT',
                        'DZ', 'D'),
                  'E': ('EI', 'EJ', 'EU', 'EY', 'E'),
                  'F': ('FB', 'F'),
                  'G': ('G'),
                  'H': ('H'),
                  'I': ('IA', 'IE', 'IO', 'IU', 'I'),
                  'J': ('J'),
                  'K': ('KH', 'KS', 'K'),
                  'L': ('L'),
                  'M': ('MN', 'M'),
                  'N': ('NM', 'N'),
                  'O': ('OI', 'OJ', 'OY', 'O'),
                  'P': ('PF', 'PH', 'P'),
                  'Q': ('Q'),
                  'R': ('RS', 'RZ', 'R'),
                  'S': ('SCHTSCH', 'SCHTCH', 'SCHTSH', 'SHTCH', 'SHTSH',
                        'STSCH', 'SCHD', 'SCHT', 'SHCH', 'STCH', 'STRS',
                        'STRZ', 'STSH', 'SZCS', 'SZCZ', 'SCH', 'SHD', 'SHT',
                        'SZD', 'SZT', 'SC', 'SD', 'SH', 'ST', 'SZ', 'S'),
                  'T': ('TTSCH', 'TSCH', 'TTCH', 'TTSZ', 'TCH', 'THS', 'TRS',
                        'TRZ', 'TSH', 'TSZ', 'TTS', 'TTZ', 'TZS', 'TC', 'TH',
                        'TS', 'TZ', 'T'),
                  'U': ('UE', 'UI', 'UJ', 'UY', 'U'),
                  'V': ('V'),
                  'W': ('W'),
                  'X': ('X'),
                  'Y': ('Y'),
                  'Z': ('ZHDZH', 'ZDZH', 'ZSCH', 'ZDZ', 'ZHD', 'ZSH', 'ZD',
                        'ZH', 'ZS', 'Z')}

    _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}
    dms = ['']  # initialize empty code list

    # Require a maxlength of at least 6 and not more than 64
    if maxlength is not None:
        maxlength = min(max(6, maxlength), 64)
    else:
        maxlength = 64

    # uppercase, normalize, decompose, and filter non-A-Z
    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    # Nothing to convert, return base case
    if not word:
        if zero_pad:
            return {'0'*maxlength}
        return {'0'}

    pos = 0
    while pos < len(word):
        # Iterate through _dms_order, which specifies the possible substrings
        # for which codes exist in the Daitch-Mokotoff coding
        for sstr in _dms_order[word[pos]]:
            if word[pos:].startswith(sstr):
                # Having determined a valid substring start, retrieve the code
                dm_val = _dms_table[sstr]

                # Having retrieved the code (triple), determine the correct
                # positional variant (first, pre-vocalic, elsewhere)
                if pos == 0:
                    dm_val = dm_val[0]
                elif (pos+len(sstr) < len(word) and
                      word[pos+len(sstr)] in _vowels):
                    dm_val = dm_val[1]
                else:
                    dm_val = dm_val[2]

                # Build the code strings; a tuple (alternative codings)
                # doubles the number of in-progress codes
                if isinstance(dm_val, tuple):
                    dms = [_ + text_type(dm_val[0]) for _ in dms] \
                            + [_ + text_type(dm_val[1]) for _ in dms]
                else:
                    dms = [_ + text_type(dm_val) for _ in dms]
                pos += len(sstr)
                break

    # Filter out double letters and _ placeholders
    dms = (''.join(c for c in _delete_consecutive_repeats(_) if c != '_')
           for _ in dms)

    # Trim codes and return set
    if zero_pad:
        dms = ((_ + ('0'*maxlength))[:maxlength] for _ in dms)
    else:
        dms = (_[:maxlength] for _ in dms)
    return set(dms)
532
|
|
|
|
533
|
|
|
|
534
|
|
|
def koelner_phonetik(word):
    """Return the Kölner Phonetik (numeric output) code for a word.

    Based on the algorithm defined by :cite:`Postel:1969`.

    While the output code is numeric, it is still a str because 0s can lead
    the code.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as a numeric string
    :rtype: str

    >>> koelner_phonetik('Christopher')
    '478237'
    >>> koelner_phonetik('Niall')
    '65'
    >>> koelner_phonetik('Smith')
    '862'
    >>> koelner_phonetik('Schmidt')
    '862'
    >>> koelner_phonetik('Müller')
    '657'
    >>> koelner_phonetik('Zimmermann')
    '86766'
    """
    def _after(word, i, letters):
        """Return True if word[i] follows one of the supplied letters."""
        if i > 0 and word[i-1] in letters:
            return True
        return False

    def _before(word, i, letters):
        """Return True if word[i] precedes one of the supplied letters."""
        if i+1 < len(word) and word[i+1] in letters:
            return True
        return False

    _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}

    sdx = ''

    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    # transcribe umlauts as digraphs before filtering to A-Z
    word = word.replace('Ä', 'AE')
    word = word.replace('Ö', 'OE')
    word = word.replace('Ü', 'UE')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    # Nothing to convert, return base case
    if not word:
        return sdx

    # code each letter in context; several letters (e.g. C, D, T, P, X)
    # depend on their neighbors
    for i in range(len(word)):
        if word[i] in _vowels:
            sdx += '0'
        elif word[i] == 'B':
            sdx += '1'
        elif word[i] == 'P':
            if _before(word, i, {'H'}):
                sdx += '3'
            else:
                sdx += '1'
        elif word[i] in {'D', 'T'}:
            if _before(word, i, {'C', 'S', 'Z'}):
                sdx += '8'
            else:
                sdx += '2'
        elif word[i] in {'F', 'V', 'W'}:
            sdx += '3'
        elif word[i] in {'G', 'K', 'Q'}:
            sdx += '4'
        elif word[i] == 'C':
            if _after(word, i, {'S', 'Z'}):
                sdx += '8'
            elif i == 0:
                if _before(word, i, {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U',
                                     'X'}):
                    sdx += '4'
                else:
                    sdx += '8'
            elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}):
                sdx += '4'
            else:
                sdx += '8'
        elif word[i] == 'X':
            if _after(word, i, {'C', 'K', 'Q'}):
                sdx += '8'
            else:
                sdx += '48'
        elif word[i] == 'L':
            sdx += '5'
        elif word[i] in {'M', 'N'}:
            sdx += '6'
        elif word[i] == 'R':
            sdx += '7'
        elif word[i] in {'S', 'Z'}:
            sdx += '8'
        # NOTE: 'H' matches no branch above and therefore emits no code

    sdx = _delete_consecutive_repeats(sdx)

    if sdx:
        # a leading '0' (vowel) is retained; all later '0's are dropped
        sdx = sdx[:1] + sdx[1:].replace('0', '')

    return sdx
642
|
|
|
|
643
|
|
|
|
644
|
|
|
def koelner_phonetik_num_to_alpha(num):
    """Convert a Kölner Phonetik code from numeric to alphabetic.

    :param str num: a numeric Kölner Phonetik representation
    :returns: an alphabetic representation of the same word
    :rtype: str

    >>> koelner_phonetik_num_to_alpha(862)
    'SNT'
    >>> koelner_phonetik_num_to_alpha(657)
    'NLR'
    >>> koelner_phonetik_num_to_alpha(86766)
    'SNRNN'
    """
    _num_translation = dict(zip((ord(digit) for digit in '012345678'),
                                'APTFKLNRS'))
    # keep only the digits 0-8 from the stringified input
    digits = ''.join(d for d in text_type(num) if d in set('012345678'))
    return digits.translate(_num_translation)
663
|
|
|
|
664
|
|
|
|
665
|
|
|
def koelner_phonetik_alpha(word):
    """Return the Kölner Phonetik (alphabetic output) code for a word.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as an alphabetic string
    :rtype: str

    >>> koelner_phonetik_alpha('Smith')
    'SNT'
    >>> koelner_phonetik_alpha('Schmidt')
    'SNT'
    >>> koelner_phonetik_alpha('Müller')
    'NLR'
    >>> koelner_phonetik_alpha('Zimmermann')
    'SNRNN'
    """
    # compute the numeric code, then transliterate it
    numeric_code = koelner_phonetik(word)
    return koelner_phonetik_num_to_alpha(numeric_code)
682
|
|
|
|
683
|
|
|
|
684
|
|
|
def nysiis(word, maxlength=6, modified=False):
    """Return the NYSIIS code for a word.

    The New York State Identification and Intelligence System algorithm is
    defined in :cite:`Taft:1970`.

    The modified version of this algorithm is described in Appendix B of
    :cite:`Lynch:1977`.

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 6) of the code to return
    :param bool modified: indicates whether to use USDA modified NYSIIS
    :returns: the NYSIIS value
    :rtype: str

    >>> nysiis('Christopher')
    'CRASTA'
    >>> nysiis('Niall')
    'NAL'
    >>> nysiis('Smith')
    'SNAT'
    >>> nysiis('Schmidt')
    'SNAD'

    >>> nysiis('Christopher', maxlength=_INFINITY)
    'CRASTAFAR'

    >>> nysiis('Christopher', maxlength=8, modified=True)
    'CRASTAFA'
    >>> nysiis('Niall', maxlength=8, modified=True)
    'NAL'
    >>> nysiis('Smith', maxlength=8, modified=True)
    'SNAT'
    >>> nysiis('Schmidt', maxlength=8, modified=True)
    'SNAD'
    """
    # Require a maxlength of at least 6
    if maxlength:
        maxlength = max(6, maxlength)

    _vowels = {'A', 'E', 'I', 'O', 'U'}

    word = ''.join(c for c in word.upper() if c.isalpha())
    word = word.replace('ß', 'SS')

    # exit early if there are no alphas
    if not word:
        return ''

    if modified:
        # remembered so a leading vowel can be restored at the very end
        original_first_char = word[0]

    # transcode the word-initial letters; the modified variant adds extra
    # prefix rules as a fall-through of the same chain
    if word[:3] == 'MAC':
        word = 'MCC'+word[3:]
    elif word[:2] == 'KN':
        word = 'NN'+word[2:]
    elif word[:1] == 'K':
        word = 'C'+word[1:]
    elif word[:2] in {'PH', 'PF'}:
        word = 'FF'+word[2:]
    elif word[:3] == 'SCH':
        word = 'SSS'+word[3:]
    elif modified:
        if word[:2] == 'WR':
            word = 'RR'+word[2:]
        elif word[:2] == 'RH':
            word = 'RR'+word[2:]
        elif word[:2] == 'DG':
            word = 'GG'+word[2:]
        elif word[:1] in _vowels:
            word = 'A'+word[1:]

    # modified variant: strip a terminal S or Z before suffix transcoding
    if modified and word[-1:] in {'S', 'Z'}:
        word = word[:-1]

    # transcode the word-final letters
    if word[-2:] == 'EE' or word[-2:] == 'IE' or (modified and
                                                  word[-2:] == 'YE'):
        word = word[:-2]+'Y'
    elif word[-2:] in {'DT', 'RT', 'RD'}:
        word = word[:-2]+'D'
    elif word[-2:] in {'NT', 'ND'}:
        word = word[:-2]+('N' if modified else 'D')
    elif modified:
        if word[-2:] == 'IX':
            word = word[:-2]+'ICK'
        elif word[-2:] == 'EX':
            word = word[:-2]+'ECK'
        elif word[-2:] in {'JR', 'SR'}:
            return 'ERROR'

    key = word[:1]

    # ``skip`` counts positions already consumed by a multi-character
    # replacement; ``word`` is rewritten in place as we scan it
    skip = 0
    for i in range(1, len(word)):
        if i >= len(word):
            # guard: some replacements above shorten ``word`` mid-loop
            continue
        elif skip:
            skip -= 1
            continue
        elif word[i:i+2] == 'EV':
            word = word[:i] + 'AF' + word[i+2:]
            skip = 1
        elif word[i] in _vowels:
            word = word[:i] + 'A' + word[i+1:]
        elif modified and i != len(word)-1 and word[i] == 'Y':
            word = word[:i] + 'A' + word[i+1:]
        elif word[i] == 'Q':
            word = word[:i] + 'G' + word[i+1:]
        elif word[i] == 'Z':
            word = word[:i] + 'S' + word[i+1:]
        elif word[i] == 'M':
            word = word[:i] + 'N' + word[i+1:]
        elif word[i:i+2] == 'KN':
            word = word[:i] + 'N' + word[i+2:]
        elif word[i] == 'K':
            word = word[:i] + 'C' + word[i+1:]
        elif modified and i == len(word)-3 and word[i:i+3] == 'SCH':
            word = word[:i] + 'SSA'
            skip = 2
        elif word[i:i+3] == 'SCH':
            word = word[:i] + 'SSS' + word[i+3:]
            skip = 2
        elif modified and i == len(word)-2 and word[i:i+2] == 'SH':
            word = word[:i] + 'SA'
            skip = 1
        elif word[i:i+2] == 'SH':
            word = word[:i] + 'SS' + word[i+2:]
            skip = 1
        elif word[i:i+2] == 'PH':
            word = word[:i] + 'FF' + word[i+2:]
            skip = 1
        elif modified and word[i:i+3] == 'GHT':
            word = word[:i] + 'TTT' + word[i+3:]
            skip = 2
        elif modified and word[i:i+2] == 'DG':
            word = word[:i] + 'GG' + word[i+2:]
            skip = 1
        elif modified and word[i:i+2] == 'WR':
            word = word[:i] + 'RR' + word[i+2:]
            skip = 1
        elif word[i] == 'H' and (word[i-1] not in _vowels or
                                 word[i+1:i+2] not in _vowels):
            # H between/next to non-vowels is replaced by the prior letter
            word = word[:i] + word[i-1] + word[i+1:]
        elif word[i] == 'W' and word[i-1] in _vowels:
            word = word[:i] + word[i-1] + word[i+1:]

        # append the (possibly multi-character) transcoded chunk, unless
        # it merely repeats the last character of the key
        if word[i:i+skip+1] != key[-1:]:
            key += word[i:i+skip+1]

    key = _delete_consecutive_repeats(key)

    # transcode the tail of the key
    if key[-1:] == 'S':
        key = key[:-1]
    if key[-2:] == 'AY':
        key = key[:-2] + 'Y'
    if key[-1:] == 'A':
        key = key[:-1]
    if modified and key[:1] == 'A':
        # restore the word's original first letter over the placeholder 'A'
        key = original_first_char + key[1:]

    if maxlength and maxlength < _INFINITY:
        key = key[:maxlength]

    return key
848
|
|
|
|
849
|
|
|
|
850
|
|
|
def mra(word):
    """Return the MRA personal numeric identifier (PNI) for a word.

    A description of the Western Airlines Surname Match Rating Algorithm can
    be found on page 18 of :cite:`Moore:1977`.

    :param str word: the word to transform
    :returns: the MRA PNI
    :rtype: str

    >>> mra('Christopher')
    'CHRPHR'
    >>> mra('Niall')
    'NL'
    >>> mra('Smith')
    'SMTH'
    >>> mra('Schmidt')
    'SCHMDT'
    """
    if not word:
        return word

    # Normalize: uppercase and expand the German eszett.
    name = word.upper().replace('ß', 'SS')

    # Keep the initial letter; drop vowels from the remainder.
    vowels = {'A', 'E', 'I', 'O', 'U'}
    name = name[0] + ''.join(ch for ch in name[1:] if ch not in vowels)

    # Collapse runs of identical consecutive characters.
    name = _delete_consecutive_repeats(name)

    # Codes longer than six characters keep only the first and last three.
    if len(name) > 6:
        name = name[:3] + name[-3:]

    return name
879
|
|
|
|
880
|
|
|
|
881
|
|
|
def metaphone(word, maxlength=_INFINITY):
    """Return the Metaphone code for a word.

    Based on Lawrence Philips' Pick BASIC code from 1990 :cite:`Philips:1990`,
    as described in :cite:`Philips:1990b`.
    This incorporates some corrections to the above code, particularly
    some of those suggested by Michael Kuhn in :cite:`Kuhn:1995`.

    :param str word: the word to transform
    :param int maxlength: the maximum length of the returned Metaphone code
        (defaults to unlimited, but in Philips' original implementation
        this was 4)
    :returns: the Metaphone value
    :rtype: str


    >>> metaphone('Christopher')
    'KRSTFR'
    >>> metaphone('Niall')
    'NL'
    >>> metaphone('Smith')
    'SM0'
    >>> metaphone('Schmidt')
    'SKMTT'
    """
    _vowels = {'A', 'E', 'I', 'O', 'U'}
    # Front vowels: soften a preceding C or G (e.g. 'ce' -> 'S').
    _frontv = {'E', 'I', 'Y'}
    # As in variable sound--those modified by adding an "h"
    _varson = {'C', 'G', 'P', 'S', 'T'}

    # Require a maxlength of at least 4; None means effectively unlimited
    if maxlength is not None:
        maxlength = max(4, maxlength)
    else:
        maxlength = 64

    # Delete nonalphanumeric characters and make all caps
    ename = ''.join(c for c in word.upper() if c.isalnum())
    ename = ename.replace('ß', 'SS')

    if not ename:
        return ''
    # Initial-cluster exceptions: drop the silent first letter of
    # PN-/AE-/KN-/GN-/WR-, map initial X to S, and WH- to W-.
    if ename[0:2] in {'PN', 'AE', 'KN', 'GN', 'WR'}:
        ename = ename[1:]
    elif ename[0] == 'X':
        ename = 'S' + ename[1:]
    elif ename[0:2] == 'WH':
        ename = 'W' + ename[2:]

    # Convert to metaph
    elen = len(ename)-1  # index of the last character
    metaph = ''
    for i in range(len(ename)):
        if len(metaph) >= maxlength:
            break
        # Skip doubled letters, except G and T (which have lookahead rules).
        if ((ename[i] not in {'G', 'T'} and
             i > 0 and ename[i-1] == ename[i])):
            continue

        # Vowels are kept only when word-initial.
        if ename[i] in _vowels and i == 0:
            metaph = ename[i]

        elif ename[i] == 'B':
            # Drop B in a terminal -MB (e.g. 'dumb').
            if i != elen or ename[i-1] != 'M':
                metaph += ename[i]

        elif ename[i] == 'C':
            # C is silent in SCE/SCI/SCY; otherwise context decides.
            if not (i > 0 and ename[i-1] == 'S' and ename[i+1:i+2] in _frontv):
                if ename[i+1:i+3] == 'IA':
                    metaph += 'X'
                elif ename[i+1:i+2] in _frontv:
                    metaph += 'S'
                elif i > 0 and ename[i-1:i+2] == 'SCH':
                    metaph += 'K'
                elif ename[i+1:i+2] == 'H':
                    # Initial CH before a consonant sounds hard (e.g. 'chrome').
                    if i == 0 and i+1 < elen and ename[i+2:i+3] not in _vowels:
                        metaph += 'K'
                    else:
                        metaph += 'X'
                else:
                    metaph += 'K'

        elif ename[i] == 'D':
            # DGE/DGI/DGY sounds like J (e.g. 'edge').
            if ename[i+1:i+2] == 'G' and ename[i+2:i+3] in _frontv:
                metaph += 'J'
            else:
                metaph += 'T'

        elif ename[i] == 'G':
            # Silent in -GH- when the GH is not final and not before a vowel.
            if ename[i+1:i+2] == 'H' and not (i+1 == elen or
                                              ename[i+2:i+3] not in _vowels):
                continue
            # Silent in terminal -GN and -GNED.
            elif i > 0 and ((i+1 == elen and ename[i+1] == 'N') or
                            (i+3 == elen and ename[i+1:i+4] == 'NED')):
                continue
            # Silent in -DGE-/-DGI-/-DGY- (the D branch already emitted J).
            elif (i-1 > 0 and i+1 <= elen and ename[i-1] == 'D' and
                  ename[i+1] in _frontv):
                continue
            # First G of a doubled GG is skipped.
            elif ename[i+1:i+2] == 'G':
                continue
            # Soft G before a front vowel (e.g. 'germ'), hard otherwise.
            elif ename[i+1:i+2] in _frontv:
                if i == 0 or ename[i-1] != 'G':
                    metaph += 'J'
                else:
                    metaph += 'K'
            else:
                metaph += 'K'

        elif ename[i] == 'H':
            # H is silent after a vowel when not followed by a vowel,
            # and after the "variable sound" consonants (C, G, P, S, T).
            if ((i > 0 and ename[i-1] in _vowels and
                 ename[i+1:i+2] not in _vowels)):
                continue
            elif i > 0 and ename[i-1] in _varson:
                continue
            else:
                metaph += 'H'

        # These consonants map to themselves unconditionally.
        elif ename[i] in {'F', 'J', 'L', 'M', 'N', 'R'}:
            metaph += ename[i]

        elif ename[i] == 'K':
            # CK was already handled by the C branch.
            if i > 0 and ename[i-1] == 'C':
                continue
            else:
                metaph += 'K'

        elif ename[i] == 'P':
            # PH sounds like F.
            if ename[i+1:i+2] == 'H':
                metaph += 'F'
            else:
                metaph += 'P'

        elif ename[i] == 'Q':
            metaph += 'K'

        elif ename[i] == 'S':
            # -SIO-/-SIA- and SH both sound like 'sh' (coded X).
            if ((i > 0 and i+2 <= elen and ename[i+1] == 'I' and
                 ename[i+2] in 'OA')):
                metaph += 'X'
            elif ename[i+1:i+2] == 'H':
                metaph += 'X'
            else:
                metaph += 'S'

        elif ename[i] == 'T':
            # -TIA-/-TIO- sounds like 'sh' (X); TH is the theta sound ('0').
            if ((i > 0 and i+2 <= elen and ename[i+1] == 'I' and
                 ename[i+2] in {'A', 'O'})):
                metaph += 'X'
            elif ename[i+1:i+2] == 'H':
                metaph += '0'
            elif ename[i+1:i+3] != 'CH':
                # Avoid double-emitting T for a doubled TT.
                if ename[i-1:i] != 'T':
                    metaph += 'T'

        elif ename[i] == 'V':
            metaph += 'F'

        elif ename[i] in 'WY':
            # W and Y are kept only when followed by a vowel.
            if ename[i+1:i+2] in _vowels:
                metaph += ename[i]

        elif ename[i] == 'X':
            metaph += 'KS'

        elif ename[i] == 'Z':
            metaph += 'S'

    return metaph
1049
|
|
|
|
1050
|
|
|
|
1051
|
|
|
def double_metaphone(word, maxlength=_INFINITY):
    """Return the Double Metaphone code for a word.

    Based on Lawrence Philips' (Visual) C++ code from 1999
    :cite:`Philips:2000`.

    :param word: the word to transform
    :param maxlength: the maximum length of the returned Double Metaphone codes
        (defaults to unlimited, but in Philips' original implementation this
        was 4)
    :returns: the Double Metaphone value(s)
    :rtype: tuple

    >>> double_metaphone('Christopher')
    ('KRSTFR', '')
    >>> double_metaphone('Niall')
    ('NL', '')
    >>> double_metaphone('Smith')
    ('SM0', 'XMT')
    >>> double_metaphone('Schmidt')
    ('XMT', 'SMT')
    """
    # Require a maxlength of at least 4; None means effectively unlimited
    if maxlength is not None:
        maxlength = max(4, maxlength)
    else:
        maxlength = 64

    # The primary and secondary codes are accumulated by _metaph_add below.
    primary = ''
    secondary = ''

    def _slavo_germanic():
        """Return True if the word appears to be Slavic or Germanic."""
        if 'W' in word or 'K' in word or 'CZ' in word:
            return True
        return False

    def _metaph_add(pri, sec=''):
        """Return a new metaphone tuple with the supplied elements.

        A sec of '' (the default) appends pri to both codes; a sec of
        ' ' is a sentinel meaning "append pri to the primary code only".
        """
        newpri = primary
        newsec = secondary
        if pri:
            newpri += pri
        if sec:
            if sec != ' ':
                newsec += sec
        else:
            newsec += pri
        return (newpri, newsec)

    def _is_vowel(pos):
        """Return True if the character at word[pos] is a vowel."""
        if pos >= 0 and word[pos] in {'A', 'E', 'I', 'O', 'U', 'Y'}:
            return True
        return False

    def _get_at(pos):
        """Return the character at word[pos]."""
        return word[pos]

    def _string_at(pos, slen, substrings):
        """Return True if word[pos:pos+slen] is in substrings."""
        if pos < 0:
            return False
        return word[pos:pos+slen] in substrings

    current = 0
    length = len(word)
    if length < 1:
        return ('', '')
    last = length - 1  # index of the final character

    word = word.upper()
    word = word.replace('ß', 'SS')

    # Pad the original string so that we can index beyond the edge of the world
    word += '     '

    # Skip these when at start of word
    if word[0:2] in {'GN', 'KN', 'PN', 'WR', 'PS'}:
        current += 1

    # Initial 'X' is pronounced 'Z' e.g. 'Xavier'
    if _get_at(0) == 'X':
        (primary, secondary) = _metaph_add('S')  # 'Z' maps to 'S'
        current += 1

    # Main loop
    while True:
        if current >= length:
            break

        if _get_at(current) in {'A', 'E', 'I', 'O', 'U', 'Y'}:
            if current == 0:
                # All init vowels now map to 'A'
                (primary, secondary) = _metaph_add('A')
            current += 1
            continue

        elif _get_at(current) == 'B':
            # "-mb", e.g", "dumb", already skipped over...
            (primary, secondary) = _metaph_add('P')
            if _get_at(current + 1) == 'B':
                current += 2
            else:
                current += 1
            continue

        elif _get_at(current) == 'Ç':
            (primary, secondary) = _metaph_add('S')
            current += 1
            continue

        elif _get_at(current) == 'C':
            # Various Germanic
            if (current > 1 and not _is_vowel(current - 2) and
                    _string_at((current - 1), 3, {'ACH'}) and
                    ((_get_at(current + 2) != 'I') and
                     ((_get_at(current + 2) != 'E') or
                      _string_at((current - 2), 6,
                                 {'BACHER', 'MACHER'})))):
                (primary, secondary) = _metaph_add('K')
                current += 2
                continue

            # Special case 'caesar'
            elif current == 0 and _string_at(current, 6, {'CAESAR'}):
                (primary, secondary) = _metaph_add('S')
                current += 2
                continue

            # Italian 'chianti'
            elif _string_at(current, 4, {'CHIA'}):
                (primary, secondary) = _metaph_add('K')
                current += 2
                continue

            elif _string_at(current, 2, {'CH'}):
                # Find 'Michael'
                if current > 0 and _string_at(current, 4, {'CHAE'}):
                    (primary, secondary) = _metaph_add('K', 'X')
                    current += 2
                    continue

                # Greek roots e.g. 'chemistry', 'chorus'
                elif (current == 0 and
                      (_string_at((current + 1), 5,
                                  {'HARAC', 'HARIS'}) or
                       _string_at((current + 1), 3,
                                  {'HOR', 'HYM', 'HIA', 'HEM'})) and
                      not _string_at(0, 5, {'CHORE'})):
                    (primary, secondary) = _metaph_add('K')
                    current += 2
                    continue

                # Germanic, Greek, or otherwise 'ch' for 'kh' sound
                elif ((_string_at(0, 4, {'VAN ', 'VON '}) or
                       _string_at(0, 3, {'SCH'})) or
                      # 'architect but not 'arch', 'orchestra', 'orchid'
                      _string_at((current - 2), 6,
                                 {'ORCHES', 'ARCHIT', 'ORCHID'}) or
                      _string_at((current + 2), 1, {'T', 'S'}) or
                      ((_string_at((current - 1), 1,
                                   {'A', 'O', 'U', 'E'}) or
                        (current == 0)) and
                       # e.g., 'wachtler', 'wechsler', but not 'tichner'
                       _string_at((current + 2), 1,
                                  {'L', 'R', 'N', 'M', 'B', 'H', 'F', 'V', 'W',
                                   ' '}))):
                    (primary, secondary) = _metaph_add('K')

                else:
                    if current > 0:
                        if _string_at(0, 2, {'MC'}):
                            # e.g., "McHugh"
                            (primary, secondary) = _metaph_add('K')
                        else:
                            (primary, secondary) = _metaph_add('X', 'K')
                    else:
                        (primary, secondary) = _metaph_add('X')

                current += 2
                continue

            # e.g, 'czerny'
            elif (_string_at(current, 2, {'CZ'}) and
                  not _string_at((current - 2), 4, {'WICZ'})):
                (primary, secondary) = _metaph_add('S', 'X')
                current += 2
                continue

            # e.g., 'focaccia'
            elif _string_at((current + 1), 3, {'CIA'}):
                (primary, secondary) = _metaph_add('X')
                current += 3

            # double 'C', but not if e.g. 'McClellan'
            elif (_string_at(current, 2, {'CC'}) and
                  not ((current == 1) and (_get_at(0) == 'M'))):
                # 'bellocchio' but not 'bacchus'
                if ((_string_at((current + 2), 1,
                                {'I', 'E', 'H'}) and
                     not _string_at((current + 2), 2, ['HU']))):
                    # 'accident', 'accede' 'succeed'
                    if ((((current == 1) and _get_at(current - 1) == 'A') or
                         _string_at((current - 1), 5,
                                    {'UCCEE', 'UCCES'}))):
                        (primary, secondary) = _metaph_add('KS')
                    # 'bacci', 'bertucci', other italian
                    else:
                        (primary, secondary) = _metaph_add('X')
                    current += 3
                    continue
                else:  # Pierce's rule
                    (primary, secondary) = _metaph_add('K')
                    current += 2
                    continue

            elif _string_at(current, 2, {'CK', 'CG', 'CQ'}):
                (primary, secondary) = _metaph_add('K')
                current += 2
                continue

            elif _string_at(current, 2, {'CI', 'CE', 'CY'}):
                # Italian vs. English
                if _string_at(current, 3, {'CIO', 'CIE', 'CIA'}):
                    (primary, secondary) = _metaph_add('S', 'X')
                else:
                    (primary, secondary) = _metaph_add('S')
                current += 2
                continue

            # else
            else:
                (primary, secondary) = _metaph_add('K')

                # name sent in 'mac caffrey', 'mac gregor
                if _string_at((current + 1), 2, {' C', ' Q', ' G'}):
                    current += 3
                elif (_string_at((current + 1), 1,
                                 {'C', 'K', 'Q'}) and
                      not _string_at((current + 1), 2, {'CE', 'CI'})):
                    current += 2
                else:
                    current += 1
                continue

        elif _get_at(current) == 'D':
            if _string_at(current, 2, {'DG'}):
                if _string_at((current + 2), 1, {'I', 'E', 'Y'}):
                    # e.g. 'edge'
                    (primary, secondary) = _metaph_add('J')
                    current += 3
                    continue
                else:
                    # e.g. 'edgar'
                    (primary, secondary) = _metaph_add('TK')
                    current += 2
                    continue

            elif _string_at(current, 2, {'DT', 'DD'}):
                (primary, secondary) = _metaph_add('T')
                current += 2
                continue

            # else
            else:
                (primary, secondary) = _metaph_add('T')
                current += 1
                continue

        elif _get_at(current) == 'F':
            if _get_at(current + 1) == 'F':
                current += 2
            else:
                current += 1
            (primary, secondary) = _metaph_add('F')
            continue

        elif _get_at(current) == 'G':
            if _get_at(current + 1) == 'H':
                if (current > 0) and not _is_vowel(current - 1):
                    (primary, secondary) = _metaph_add('K')
                    current += 2
                    continue

                # 'ghislane', ghiradelli
                elif current == 0:
                    if _get_at(current + 2) == 'I':
                        (primary, secondary) = _metaph_add('J')
                    else:
                        (primary, secondary) = _metaph_add('K')
                    current += 2
                    continue

                # Parker's rule (with some further refinements) - e.g., 'hugh'
                elif (((current > 1) and
                       _string_at((current - 2), 1, {'B', 'H', 'D'})) or
                      # e.g., 'bough'
                      ((current > 2) and
                       _string_at((current - 3), 1, {'B', 'H', 'D'})) or
                      # e.g., 'broughton'
                      ((current > 3) and
                       _string_at((current - 4), 1, {'B', 'H'}))):
                    current += 2
                    continue
                else:
                    # e.g. 'laugh', 'McLaughlin', 'cough',
                    # 'gough', 'rough', 'tough'
                    if ((current > 2) and
                            (_get_at(current - 1) == 'U') and
                            (_string_at((current - 3), 1,
                                        {'C', 'G', 'L', 'R', 'T'}))):
                        (primary, secondary) = _metaph_add('F')
                    elif (current > 0) and _get_at(current - 1) != 'I':
                        (primary, secondary) = _metaph_add('K')
                    current += 2
                    continue

            elif _get_at(current + 1) == 'N':
                if (current == 1) and _is_vowel(0) and not _slavo_germanic():
                    (primary, secondary) = _metaph_add('KN', 'N')
                # not e.g. 'cagney'
                elif (not _string_at((current + 2), 2, {'EY'}) and
                      (_get_at(current + 1) != 'Y') and
                      not _slavo_germanic()):
                    (primary, secondary) = _metaph_add('N', 'KN')
                else:
                    (primary, secondary) = _metaph_add('KN')
                current += 2
                continue

            # 'tagliaro'
            elif (_string_at((current + 1), 2, {'LI'}) and
                  not _slavo_germanic()):
                (primary, secondary) = _metaph_add('KL', 'L')
                current += 2
                continue

            # -ges-, -gep-, -gel-, -gie- at beginning
            elif ((current == 0) and
                  ((_get_at(current + 1) == 'Y') or
                   _string_at((current + 1), 2, {'ES', 'EP', 'EB', 'EL', 'EY',
                                                 'IB', 'IL', 'IN', 'IE', 'EI',
                                                 'ER'}))):
                (primary, secondary) = _metaph_add('K', 'J')
                current += 2
                continue

            # -ger-, -gy-
            elif ((_string_at((current + 1), 2, {'ER'}) or
                   (_get_at(current + 1) == 'Y')) and not
                  _string_at(0, 6, {'DANGER', 'RANGER', 'MANGER'}) and not
                  _string_at((current - 1), 1, {'E', 'I'}) and not
                  _string_at((current - 1), 3, {'RGY', 'OGY'})):
                (primary, secondary) = _metaph_add('K', 'J')
                current += 2
                continue

            # italian e.g, 'biaggi'
            elif (_string_at((current + 1), 1, {'E', 'I', 'Y'}) or
                  _string_at((current - 1), 4, {'AGGI', 'OGGI'})):
                # obvious germanic
                if (((_string_at(0, 4, {'VAN ', 'VON '}) or
                      _string_at(0, 3, {'SCH'})) or
                     _string_at((current + 1), 2, {'ET'}))):
                    (primary, secondary) = _metaph_add('K')
                elif _string_at((current + 1), 4, {'IER '}):
                    (primary, secondary) = _metaph_add('J')
                else:
                    (primary, secondary) = _metaph_add('J', 'K')
                current += 2
                continue

            else:
                if _get_at(current + 1) == 'G':
                    current += 2
                else:
                    current += 1
                (primary, secondary) = _metaph_add('K')
                continue

        elif _get_at(current) == 'H':
            # only keep if first & before vowel or btw. 2 vowels
            if ((((current == 0) or _is_vowel(current - 1)) and
                 _is_vowel(current + 1))):
                (primary, secondary) = _metaph_add('H')
                current += 2
            else:  # also takes care of 'HH'
                current += 1
            continue

        elif _get_at(current) == 'J':
            # obvious spanish, 'jose', 'san jacinto'
            if _string_at(current, 4, ['JOSE']) or _string_at(0, 4, {'SAN '}):
                if ((((current == 0) and (_get_at(current + 4) == ' ')) or
                     _string_at(0, 4, ['SAN ']))):
                    (primary, secondary) = _metaph_add('H')
                else:
                    (primary, secondary) = _metaph_add('J', 'H')
                current += 1
                continue

            elif (current == 0) and not _string_at(current, 4, {'JOSE'}):
                # Yankelovich/Jankelowicz
                (primary, secondary) = _metaph_add('J', 'A')
            # Spanish pron. of e.g. 'bajador'
            elif (_is_vowel(current - 1) and
                  not _slavo_germanic() and
                  ((_get_at(current + 1) == 'A') or
                   (_get_at(current + 1) == 'O'))):
                (primary, secondary) = _metaph_add('J', 'H')
            elif current == last:
                (primary, secondary) = _metaph_add('J', ' ')
            elif (not _string_at((current + 1), 1,
                                 {'L', 'T', 'K', 'S', 'N', 'M', 'B', 'Z'}) and
                  not _string_at((current - 1), 1, {'S', 'K', 'L'})):
                (primary, secondary) = _metaph_add('J')

            if _get_at(current + 1) == 'J':  # it could happen!
                current += 2
            else:
                current += 1
            continue

        elif _get_at(current) == 'K':
            if _get_at(current + 1) == 'K':
                current += 2
            else:
                current += 1
            (primary, secondary) = _metaph_add('K')
            continue

        elif _get_at(current) == 'L':
            if _get_at(current + 1) == 'L':
                # Spanish e.g. 'cabrillo', 'gallegos'
                if (((current == (length - 3)) and
                     _string_at((current - 1), 4, {'ILLO', 'ILLA', 'ALLE'})) or
                    ((_string_at((last - 1), 2, {'AS', 'OS'}) or
                      _string_at(last, 1, {'A', 'O'})) and
                     _string_at((current - 1), 4, {'ALLE'}))):
                    # 'L' goes to the primary code only (' ' sentinel).
                    (primary, secondary) = _metaph_add('L', ' ')
                    current += 2
                    continue
                current += 2
            else:
                current += 1
            (primary, secondary) = _metaph_add('L')
            continue

        elif _get_at(current) == 'M':
            if (((_string_at((current - 1), 3, {'UMB'}) and
                  (((current + 1) == last) or
                   _string_at((current + 2), 2, {'ER'}))) or
                 # 'dumb', 'thumb'
                 (_get_at(current + 1) == 'M'))):
                current += 2
            else:
                current += 1
            (primary, secondary) = _metaph_add('M')
            continue

        elif _get_at(current) == 'N':
            if _get_at(current + 1) == 'N':
                current += 2
            else:
                current += 1
            (primary, secondary) = _metaph_add('N')
            continue

        elif _get_at(current) == 'Ñ':
            current += 1
            (primary, secondary) = _metaph_add('N')
            continue

        elif _get_at(current) == 'P':
            if _get_at(current + 1) == 'H':
                (primary, secondary) = _metaph_add('F')
                current += 2
                continue

            # also account for "campbell", "raspberry"
            elif _string_at((current + 1), 1, {'P', 'B'}):
                current += 2
            else:
                current += 1
            (primary, secondary) = _metaph_add('P')
            continue

        elif _get_at(current) == 'Q':
            if _get_at(current + 1) == 'Q':
                current += 2
            else:
                current += 1
            (primary, secondary) = _metaph_add('K')
            continue

        elif _get_at(current) == 'R':
            # french e.g. 'rogier', but exclude 'hochmeier'
            if (((current == last) and
                 not _slavo_germanic() and
                 _string_at((current - 2), 2, {'IE'}) and
                 not _string_at((current - 4), 2, {'ME', 'MA'}))):
                (primary, secondary) = _metaph_add('', 'R')
            else:
                (primary, secondary) = _metaph_add('R')

            if _get_at(current + 1) == 'R':
                current += 2
            else:
                current += 1
            continue

        elif _get_at(current) == 'S':
            # special cases 'island', 'isle', 'carlisle', 'carlysle'
            if _string_at((current - 1), 3, {'ISL', 'YSL'}):
                current += 1
                continue

            # special case 'sugar-'
            elif (current == 0) and _string_at(current, 5, {'SUGAR'}):
                (primary, secondary) = _metaph_add('X', 'S')
                current += 1
                continue

            elif _string_at(current, 2, {'SH'}):
                # Germanic
                if _string_at((current + 1), 4,
                              {'HEIM', 'HOEK', 'HOLM', 'HOLZ'}):
                    (primary, secondary) = _metaph_add('S')
                else:
                    (primary, secondary) = _metaph_add('X')
                current += 2
                continue

            # Italian & Armenian
            elif (_string_at(current, 3, {'SIO', 'SIA'}) or
                  _string_at(current, 4, {'SIAN'})):
                if not _slavo_germanic():
                    (primary, secondary) = _metaph_add('S', 'X')
                else:
                    (primary, secondary) = _metaph_add('S')
                current += 3
                continue

            # German & anglicisations, e.g. 'smith' match 'schmidt',
            # 'snider' match 'schneider'
            # also, -sz- in Slavic language although in Hungarian it is
            # pronounced 's'
            elif (((current == 0) and
                   _string_at((current + 1), 1, {'M', 'N', 'L', 'W'})) or
                  _string_at((current + 1), 1, {'Z'})):
                (primary, secondary) = _metaph_add('S', 'X')
                if _string_at((current + 1), 1, {'Z'}):
                    current += 2
                else:
                    current += 1
                continue

            elif _string_at(current, 2, {'SC'}):
                # Schlesinger's rule
                if _get_at(current + 2) == 'H':
                    # dutch origin, e.g. 'school', 'schooner'
                    if _string_at((current + 3), 2,
                                  {'OO', 'ER', 'EN', 'UY', 'ED', 'EM'}):
                        # 'schermerhorn', 'schenker'
                        if _string_at((current + 3), 2, {'ER', 'EN'}):
                            (primary, secondary) = _metaph_add('X', 'SK')
                        else:
                            (primary, secondary) = _metaph_add('SK')
                        current += 3
                        continue
                    else:
                        if (((current == 0) and not _is_vowel(3) and
                             (_get_at(3) != 'W'))):
                            (primary, secondary) = _metaph_add('X', 'S')
                        else:
                            (primary, secondary) = _metaph_add('X')
                        current += 3
                        continue

                elif _string_at((current + 2), 1, {'I', 'E', 'Y'}):
                    (primary, secondary) = _metaph_add('S')
                    current += 3
                    continue

                # else
                else:
                    (primary, secondary) = _metaph_add('SK')
                    current += 3
                    continue

            else:
                # french e.g. 'resnais', 'artois'
                if (current == last) and _string_at((current - 2), 2,
                                                    {'AI', 'OI'}):
                    (primary, secondary) = _metaph_add('', 'S')
                else:
                    (primary, secondary) = _metaph_add('S')

                if _string_at((current + 1), 1, {'S', 'Z'}):
                    current += 2
                else:
                    current += 1
                continue

        elif _get_at(current) == 'T':
            if _string_at(current, 4, {'TION'}):
                (primary, secondary) = _metaph_add('X')
                current += 3
                continue

            elif _string_at(current, 3, {'TIA', 'TCH'}):
                (primary, secondary) = _metaph_add('X')
                current += 3
                continue

            elif (_string_at(current, 2, {'TH'}) or
                  _string_at(current, 3, {'TTH'})):
                # special case 'thomas', 'thames' or germanic
                if ((_string_at((current + 2), 2, {'OM', 'AM'}) or
                     _string_at(0, 4, {'VAN ', 'VON '}) or
                     _string_at(0, 3, {'SCH'}))):
                    (primary, secondary) = _metaph_add('T')
                else:
                    # '0' encodes the theta sound.
                    (primary, secondary) = _metaph_add('0', 'T')
                current += 2
                continue

            elif _string_at((current + 1), 1, {'T', 'D'}):
                current += 2
            else:
                current += 1
            (primary, secondary) = _metaph_add('T')
            continue

        elif _get_at(current) == 'V':
            if _get_at(current + 1) == 'V':
                current += 2
            else:
                current += 1
            (primary, secondary) = _metaph_add('F')
            continue

        elif _get_at(current) == 'W':
            # can also be in middle of word
            if _string_at(current, 2, {'WR'}):
                (primary, secondary) = _metaph_add('R')
                current += 2
                continue
            elif ((current == 0) and
                  (_is_vowel(current + 1) or _string_at(current, 2, {'WH'}))):
                # Wasserman should match Vasserman
                if _is_vowel(current + 1):
                    (primary, secondary) = _metaph_add('A', 'F')
                else:
                    # need Uomo to match Womo
                    (primary, secondary) = _metaph_add('A')

            # Arnow should match Arnoff
            if ((((current == last) and _is_vowel(current - 1)) or
                 _string_at((current - 1), 5,
                            {'EWSKI', 'EWSKY', 'OWSKI', 'OWSKY'}) or
                 _string_at(0, 3, ['SCH']))):
                (primary, secondary) = _metaph_add('', 'F')
                current += 1
                continue
            # Polish e.g. 'filipowicz'
            elif _string_at(current, 4, {'WICZ', 'WITZ'}):
                (primary, secondary) = _metaph_add('TS', 'FX')
                current += 4
                continue
            # else skip it
            else:
                current += 1
                continue

        elif _get_at(current) == 'X':
            # French e.g. breaux
            if (not ((current == last) and
                     (_string_at((current - 3), 3, {'IAU', 'EAU'}) or
                      _string_at((current - 2), 2, {'AU', 'OU'})))):
                (primary, secondary) = _metaph_add('KS')

            if _string_at((current + 1), 1, {'C', 'X'}):
                current += 2
            else:
                current += 1
            continue

        elif _get_at(current) == 'Z':
            # Chinese Pinyin e.g. 'zhao'
            if _get_at(current + 1) == 'H':
                (primary, secondary) = _metaph_add('J')
                current += 2
                continue
            elif (_string_at((current + 1), 2, {'ZO', 'ZI', 'ZA'}) or
                  (_slavo_germanic() and ((current > 0) and
                                          _get_at(current - 1) != 'T'))):
                (primary, secondary) = _metaph_add('S', 'TS')
            else:
                (primary, secondary) = _metaph_add('S')

            if _get_at(current + 1) == 'Z':
                current += 2
            else:
                current += 1
            continue

        else:
            current += 1

    # Truncate both codes and drop the secondary when it adds no information.
    if maxlength and maxlength < _INFINITY:
        primary = primary[:maxlength]
        secondary = secondary[:maxlength]
    if primary == secondary:
        secondary = ''

    return (primary, secondary)
1770
|
|
|
|
1771
|
|
|
|
1772
|
|
|
def caverphone(word, version=2):
    """Return the Caverphone code for a word.

    Version 1 of the algorithm is described in :cite:`Hood:2002`;
    version 2 is described in :cite:`Hood:2004`.

    :param str word: the word to transform
    :param int version: the version of Caverphone to employ for encoding
        (defaults to 2)
    :returns: the Caverphone value
    :rtype: str

    >>> caverphone('Christopher')
    'KRSTFA1111'
    >>> caverphone('Niall')
    'NA11111111'
    >>> caverphone('Smith')
    'SMT1111111'
    >>> caverphone('Schmidt')
    'SKMT111111'

    >>> caverphone('Christopher', 1)
    'KRSTF1'
    >>> caverphone('Niall', 1)
    'N11111'
    >>> caverphone('Smith', 1)
    'SMT111'
    >>> caverphone('Schmidt', 1)
    'SKMT11'
    """
    _vowels = {'a', 'e', 'i', 'o', 'u'}

    # Lowercase and keep only ASCII letters a-z
    word = word.lower()
    word = ''.join(c for c in word if 'a' <= c <= 'z')

    def _collapse(text, char, marker):
        """Collapse runs of char in text to one char, then mark it as marker."""
        doubled = char * 2
        while doubled in text:
            text = text.replace(doubled, char)
        return text.replace(char, marker)

    # the main replacement algorithm
    if version != 1 and word[-1:] == 'e':
        word = word[:-1]
    if word:
        # word-initial -ough words: the gh becomes a removable '2'
        for prefix in ('cough', 'rough', 'tough'):
            if word[:5] == prefix:
                word = prefix[:3] + '2f' + word[5:]
        if word[:6] == 'enough':
            word = 'enou2f' + word[6:]
        if version != 1 and word[:6] == 'trough':
            word = 'trou2f' + word[6:]
        if word[:2] == 'gn':
            word = '2n' + word[2:]
        if word[-2:] == 'mb':
            word = word[:-1] + '2'
        # ordered unconditional substitutions ('2' marks a silent letter)
        for src, tar in (('cq', '2q'), ('ci', 'si'), ('ce', 'se'),
                         ('cy', 'sy'), ('tch', '2ch'), ('c', 'k'),
                         ('q', 'k'), ('x', 'k'), ('v', 'f'), ('dg', '2g'),
                         ('tio', 'sio'), ('tia', 'sia'), ('d', 't'),
                         ('ph', 'fh'), ('b', 'p'), ('sh', 's2'), ('z', 's')):
            word = word.replace(src, tar)
        # an initial vowel is preserved as 'A'; all other vowels become '3'
        if word[0] in _vowels:
            word = 'A' + word[1:]
        for vowel in 'aeiou':
            word = word.replace(vowel, '3')
        if version != 1:
            word = word.replace('j', 'y')
            if word[:2] == 'y3':
                word = 'Y3' + word[2:]
            if word[:1] == 'y':
                word = 'A' + word[1:]
            word = word.replace('y', '3')
        word = word.replace('3gh3', '3kh3')
        word = word.replace('gh', '22')
        word = word.replace('g', 'k')

        # collapse consonant runs and promote them to output (uppercase) form
        for ch in 'stpkfmn':
            word = _collapse(word, ch, ch.upper())

        # w/h/r/l/y survive (uppercased) only next to a vowel; otherwise
        # they are marked '2' and removed below
        word = word.replace('w3', 'W3')
        if version == 1:
            word = word.replace('wy', 'Wy')
        word = word.replace('wh3', 'Wh3')
        if version == 1:
            word = word.replace('why', 'Why')
        if version != 1 and word[-1:] == 'w':
            word = word[:-1] + '3'
        word = word.replace('w', '2')
        if word[:1] == 'h':
            word = 'A' + word[1:]
        word = word.replace('h', '2')
        word = word.replace('r3', 'R3')
        if version == 1:
            word = word.replace('ry', 'Ry')
        if version != 1 and word[-1:] == 'r':
            word = word[:-1] + '3'
        word = word.replace('r', '2')
        word = word.replace('l3', 'L3')
        if version == 1:
            word = word.replace('ly', 'Ly')
        if version != 1 and word[-1:] == 'l':
            word = word[:-1] + '3'
        word = word.replace('l', '2')
        if version == 1:
            word = word.replace('j', 'y')
        word = word.replace('y3', 'Y3')
        word = word.replace('y', '2')
        # drop silent-letter markers; a final vowel marker becomes 'A'
        word = word.replace('2', '')
        if version != 1 and word[-1:] == '3':
            word = word[:-1] + 'A'
        word = word.replace('3', '')

    # pad with 1s, then extract the necessary length of code
    word += '1' * 10
    return word[:10] if version != 1 else word[:6]
1921
|
|
|
|
1922
|
|
|
|
1923
|
|
|
def alpha_sis(word, maxlength=14):
    """Return the IBM Alpha Search Inquiry System code for a word.

    The Alpha Search Inquiry System code is defined in :cite:`IBM:1973`.
    This implementation is based on the description in :cite:`Moore:1977`.

    A collection is necessary since there can be multiple values for a
    single word. But the collection must be ordered since the first value
    is the primary coding.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 14;
        clamped to [4, 64], with None treated as 64)
    :returns: the Alpha SIS value
    :rtype: tuple

    >>> alpha_sis('Christopher')
    ('06401840000000', '07040184000000', '04018400000000')
    >>> alpha_sis('Niall')
    ('02500000000000',)
    >>> alpha_sis('Smith')
    ('03100000000000',)
    >>> alpha_sis('Schmidt')
    ('06310000000000',)
    """
    # Codes for special word-initial substrings; matched longest-first via
    # the companion order tuple below.
    _alpha_sis_initials = {'GF': '08', 'GM': '03', 'GN': '02', 'KN': '02',
                           'PF': '08', 'PN': '02', 'PS': '00', 'WR': '04',
                           'A': '1', 'E': '1', 'H': '2', 'I': '1', 'J': '3',
                           'O': '1', 'U': '1', 'W': '4', 'Y': '5'}
    _alpha_sis_initials_order = ('GF', 'GM', 'GN', 'KN', 'PF', 'PN', 'PS',
                                 'WR', 'A', 'E', 'H', 'I', 'J', 'O', 'U', 'W',
                                 'Y')
    # Codes for the body of the word; a tuple value means the encoding forks
    # into one candidate code per alternative.
    _alpha_sis_basic = {'SCH': '6', 'CZ': ('70', '6', '0'),
                        'CH': ('6', '70', '0'), 'CK': ('7', '6'),
                        'DS': ('0', '10'), 'DZ': ('0', '10'),
                        'TS': ('0', '10'), 'TZ': ('0', '10'), 'CI': '0',
                        'CY': '0', 'CE': '0', 'SH': '6', 'DG': '7', 'PH': '8',
                        'C': ('7', '6'), 'K': ('7', '6'), 'Z': '0', 'S': '0',
                        'D': '1', 'T': '1', 'N': '2', 'M': '3', 'R': '4',
                        'L': '5', 'J': '6', 'G': '7', 'Q': '7', 'X': '7',
                        'F': '8', 'V': '8', 'B': '9', 'P': '9'}
    # NOTE(review): 'C' and 'K' each appear twice in this order tuple; the
    # second occurrences are unreachable since the first match wins --
    # presumably harmless, but worth confirming against the reference.
    _alpha_sis_basic_order = ('SCH', 'CZ', 'CH', 'CK', 'DS', 'DZ', 'TS', 'TZ',
                              'CI', 'CY', 'CE', 'SH', 'DG', 'PH', 'C', 'K',
                              'Z', 'S', 'D', 'T', 'N', 'M', 'R', 'L', 'J', 'C',
                              'G', 'K', 'Q', 'X', 'F', 'V', 'B', 'P')

    alpha = ['']  # ordered candidate codes; grows when a tuple value forks
    pos = 0       # current read position in word
    # Uppercase, decompose diacritics, and strip to the basic Latin alphabet
    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    # Clamp maxlength to [4, 64]
    if maxlength is not None:
        maxlength = min(max(4, maxlength), 64)
    else:
        maxlength = 64

    # Do special processing for initial substrings
    for k in _alpha_sis_initials_order:
        if word.startswith(k):
            alpha[0] += _alpha_sis_initials[k]
            pos += len(k)
            break

    # Add a '0' if alpha is still empty
    if not alpha[0]:
        alpha[0] += '0'

    # Whether or not any special initial codes were encoded, iterate
    # through the length of the word in the main encoding loop
    while pos < len(word):
        origpos = pos
        for k in _alpha_sis_basic_order:
            if word[pos:].startswith(k):
                if isinstance(_alpha_sis_basic[k], tuple):
                    # Fork: extend every current candidate once per
                    # alternative coding of this substring
                    newalpha = []
                    for i in range(len(_alpha_sis_basic[k])):
                        newalpha += [_ + _alpha_sis_basic[k][i] for _ in alpha]
                    alpha = newalpha
                else:
                    alpha = [_ + _alpha_sis_basic[k] for _ in alpha]
                pos += len(k)
                break
        if pos == origpos:
            # No table entry matched (e.g. a vowel): emit a '_' placeholder
            # so adjacent identical codes are not merged across it
            alpha = [_ + '_' for _ in alpha]
            pos += 1

    # Trim doublets and placeholders
    for i in range(len(alpha)):
        pos = 1
        while pos < len(alpha[i]):
            if alpha[i][pos] == alpha[i][pos-1]:
                alpha[i] = alpha[i][:pos]+alpha[i][pos+1:]
            pos += 1
    alpha = (_.replace('_', '') for _ in alpha)

    # Trim codes and return tuple
    alpha = ((_ + ('0'*maxlength))[:maxlength] for _ in alpha)
    return tuple(alpha)
2025
|
|
|
|
2026
|
|
|
|
2027
|
|
|
def fuzzy_soundex(word, maxlength=5, zero_pad=True):
    """Return the Fuzzy Soundex code for a word.

    Fuzzy Soundex is an algorithm derived from Soundex, defined in
    :cite:`Holmes:2002`.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 5;
        clamped to [4, 64], with None treated as 64)
    :param bool zero_pad: pad the end of the return value with 0s to achieve
        a maxlength string
    :returns: the Fuzzy Soundex value
    :rtype: str

    >>> fuzzy_soundex('Christopher')
    'K6931'
    >>> fuzzy_soundex('Niall')
    'N4000'
    >>> fuzzy_soundex('Smith')
    'S5300'
    >>> fuzzy_soundex('Smith')
    'S5300'
    """
    # Letter-to-digit codes; '-' marks letters dropped outside position 0
    _translation = dict(zip((ord(_) for _ in
                             'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                            '0193017-07745501769301-7-9'))

    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    # Clamp maxlength to [4, 64]; None means unlimited (64)
    maxlength = 64 if maxlength is None else min(max(4, maxlength), 64)

    if not word:
        return '0' * maxlength if zero_pad else '0'

    # Normalize word-initial digraphs
    _initial = {'CS': 'SS', 'CZ': 'SS', 'TS': 'SS', 'TZ': 'SS',
                'GN': 'NN', 'HR': 'RR', 'WR': 'RR', 'HW': 'WW',
                'KN': 'NN', 'NG': 'NN'}
    head = word[:2]
    if head in _initial:
        word = _initial[head] + word[2:]

    # Normalize word-final clusters
    if word.endswith('CH'):
        word = word[:-2] + 'KK'
    elif word.endswith('NT'):
        word = word[:-2] + 'TT'
    elif word.endswith('RT'):
        word = word[:-2] + 'RR'
    elif word.endswith('RDT'):
        word = word[:-3] + 'RR'

    # In-word substitutions, applied in this order
    for src, tar in (('CA', 'KA'), ('CC', 'KK'), ('CK', 'KK'), ('CE', 'SE'),
                     ('CHL', 'KL'), ('CL', 'KL'), ('CHR', 'KR'), ('CR', 'KR'),
                     ('CI', 'SI'), ('CO', 'KO'), ('CU', 'KU'), ('CY', 'SY'),
                     ('DG', 'GG'), ('GH', 'HH'), ('MAC', 'MK'), ('MC', 'MK'),
                     ('NST', 'NSS'), ('PF', 'FF'), ('PH', 'FF'),
                     ('SCH', 'SSS'), ('TIO', 'SIO'), ('TIA', 'SIO'),
                     ('TCH', 'CHH')):
        word = word.replace(src, tar)

    sdx = word.translate(_translation).replace('-', '')

    # remove repeating characters
    sdx = _delete_consecutive_repeats(sdx)

    # Retain the first letter of the word: H/W/Y are prepended to the code,
    # any other letter replaces its own code digit
    if word[0] in {'H', 'W', 'Y'}:
        sdx = word[0] + sdx
    else:
        sdx = word[0] + sdx[1:]

    sdx = sdx.replace('0', '')

    if zero_pad:
        sdx += '0' * maxlength

    return sdx[:maxlength]
2128
|
|
|
|
2129
|
|
|
|
2130
|
|
|
def phonex(word, maxlength=4, zero_pad=True):
    """Return the Phonex code for a word.

    Phonex is an algorithm derived from Soundex, defined in :cite:`Lait:1996`.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 4;
        clamped to [4, 64], with None treated as 64)
    :param bool zero_pad: pad the end of the return value with 0s to achieve
        a maxlength string
    :returns: the Phonex value
    :rtype: str

    >>> phonex('Christopher')
    'C623'
    >>> phonex('Niall')
    'N400'
    >>> phonex('Schmidt')
    'S253'
    >>> phonex('Smith')
    'S530'
    """
    name = normalize('NFKD', text_type(word.upper()))
    name = name.replace('ß', 'SS')

    # Clamp maxlength to [4, 64]
    if maxlength is not None:
        maxlength = min(max(4, maxlength), 64)
    else:
        maxlength = 64

    name_code = last = ''

    # Deletions effected by replacing with next letter which
    # will be ignored due to duplicate handling of Soundex code.
    # This is faster than 'moving' all subsequent letters.

    # Remove any trailing Ss
    while name[-1:] == 'S':
        name = name[:-1]

    # Phonetic equivalents of first 2 characters
    # Works since duplicate letters are ignored
    if name[:2] == 'KN':
        name = 'N' + name[2:]  # KN.. == N..
    elif name[:2] == 'PH':
        name = 'F' + name[2:]  # PH.. == F.. (H ignored anyway)
    elif name[:2] == 'WR':
        name = 'R' + name[2:]  # WR.. == R..

    if name:
        # Special case, ignore H first letter (subsequent Hs ignored anyway)
        # Works since duplicate letters are ignored
        if name[0] == 'H':
            name = name[1:]

    if name:
        # Phonetic equivalents of first character
        if name[0] in {'A', 'E', 'I', 'O', 'U', 'Y'}:
            name = 'A' + name[1:]
        elif name[0] in {'B', 'P'}:
            name = 'B' + name[1:]
        elif name[0] in {'V', 'F'}:
            name = 'F' + name[1:]
        elif name[0] in {'C', 'K', 'Q'}:
            name = 'C' + name[1:]
        elif name[0] in {'G', 'J'}:
            name = 'G' + name[1:]
        elif name[0] in {'S', 'Z'}:
            name = 'S' + name[1:]

        # The code opens with the (normalized) first letter itself
        name_code = last = name[0]

        # MODIFIED SOUNDEX CODE
        for i in range(1, len(name)):
            code = '0'  # '0' = no code (vowels, H, W, Y, and skipped cases)
            if name[i] in {'B', 'F', 'P', 'V'}:
                code = '1'
            elif name[i] in {'C', 'G', 'J', 'K', 'Q', 'S', 'X', 'Z'}:
                code = '2'
            elif name[i] in {'D', 'T'}:
                # D/T receives no code when followed by C
                if name[i+1:i+2] != 'C':
                    code = '3'
            elif name[i] == 'L':
                # L is coded only before a vowel/Y or at the end of the name
                if (name[i+1:i+2] in {'A', 'E', 'I', 'O', 'U', 'Y'} or
                        i+1 == len(name)):
                    code = '4'
            elif name[i] in {'M', 'N'}:
                # Overwrite a following D or G with the nasal itself so it
                # is later dropped as a duplicate
                if name[i+1:i+2] in {'D', 'G'}:
                    name = name[:i+1] + name[i] + name[i+2:]
                code = '5'
            elif name[i] == 'R':
                # R is coded only before a vowel/Y or at the end of the name
                if (name[i+1:i+2] in {'A', 'E', 'I', 'O', 'U', 'Y'} or
                        i+1 == len(name)):
                    code = '6'

            # NOTE(review): `i != 0` is always true here since range starts
            # at 1; retained to match the reference implementation
            if code != last and code != '0' and i != 0:
                name_code += code

            # Track the last emitted character to suppress adjacent
            # duplicate codes
            last = name_code[-1]

    if zero_pad:
        name_code += '0' * maxlength
    if not name_code:
        name_code = '0'
    return name_code[:maxlength]
2235
|
|
|
|
2236
|
|
|
|
2237
|
|
|
def phonem(word):
    """Return the Phonem code for a word.

    Phonem is defined in :cite:`Wilde:1988`.

    This version is based on the Perl implementation documented at
    :cite:`Wilz:2005`.
    It includes some enhancements presented in the Java port at
    :cite:`dcm4che:2011`.

    Phonem is intended chiefly for German names/words.

    :param str word: the word to transform
    :returns: the Phonem value
    :rtype: str

    >>> phonem('Christopher')
    'CRYSDOVR'
    >>> phonem('Niall')
    'NYAL'
    >>> phonem('Smith')
    'SMYD'
    >>> phonem('Schmidt')
    'CMYD'
    """
    # Ordered digraph rewrites, applied before single-character folding
    _digraph_rules = (('SC', 'C'), ('SZ', 'C'), ('CZ', 'C'),
                      ('TZ', 'C'), ('TS', 'C'), ('KS', 'X'),
                      ('PF', 'V'), ('QU', 'KW'), ('PH', 'V'),
                      ('UE', 'Y'), ('AE', 'E'), ('OE', 'Ö'),
                      ('EI', 'AY'), ('EY', 'AY'), ('EU', 'OY'),
                      ('AU', 'A§'), ('OU', '§'))
    # Single-character folding table
    _char_map = dict(zip((ord(_) for _ in
                          'ZKGQÇÑßFWPTÁÀÂÃÅÄÆÉÈÊËIJÌÍÎÏÜݧÚÙÛÔÒÓÕØ'),
                         'CCCCCNSVVBDAAAAAEEEEEEYYYYYYYYUUUUOOOOÖ'))
    # Only these characters may appear in the final code
    _allowed = {'A', 'B', 'C', 'D', 'L', 'M', 'N', 'O', 'R', 'S',
                'U', 'V', 'W', 'X', 'Y', 'Ö'}

    code = normalize('NFC', text_type(word.upper()))
    for src, tar in _digraph_rules:
        code = code.replace(src, tar)
    code = code.translate(_char_map)
    code = _delete_consecutive_repeats(code)

    return ''.join(c for c in code if c in _allowed)
2280
|
|
|
|
2281
|
|
|
|
2282
|
|
|
def phonix(word, maxlength=4, zero_pad=True):
    """Return the Phonix code for a word.

    Phonix is a Soundex-like algorithm defined in :cite:`Gadd:1990`.

    This implementation is based on:
    - :cite:`Pfeifer:2000`
    - :cite:`Christen:2011`
    - :cite:`Kollar:2007`

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 4;
        clamped to [4, 64], with None treated as 64)
    :param bool zero_pad: pad the end of the return value with 0s to achieve
        a maxlength string
    :returns: the Phonix value
    :rtype: str

    >>> phonix('Christopher')
    'K683'
    >>> phonix('Niall')
    'N400'
    >>> phonix('Smith')
    'S530'
    >>> phonix('Schmidt')
    'S530'
    """
    def _start_repl(word, src, tar, post=None):
        r"""Replace src with tar at the start of word.

        If post is given, the match also requires one of the post characters
        to immediately follow src.
        """
        if post:
            for i in post:
                if word.startswith(src+i):
                    return tar + word[len(src):]
        elif word.startswith(src):
            return tar + word[len(src):]
        return word

    def _end_repl(word, src, tar, pre=None):
        r"""Replace src with tar at the end of word.

        If pre is given, the match also requires one of the pre characters
        to immediately precede src.
        """
        if pre:
            for i in pre:
                if word.endswith(i+src):
                    return word[:-len(src)] + tar
        elif word.endswith(src):
            return word[:-len(src)] + tar
        return word

    def _mid_repl(word, src, tar, pre=None, post=None):
        r"""Replace src with tar in the middle of word.

        Delegates to _all_repl after shielding the first and/or last
        character so word-initial/word-final occurrences are untouched.
        """
        if pre or post:
            if not pre:
                return word[0] + _all_repl(word[1:], src, tar, pre, post)
            elif not post:
                return _all_repl(word[:-1], src, tar, pre, post) + word[-1]
            return _all_repl(word, src, tar, pre, post)
        return (word[0] + _all_repl(word[1:-1], src, tar, pre, post) +
                word[-1])

    def _all_repl(word, src, tar, pre=None, post=None):
        r"""Replace src with tar anywhere in word.

        pre/post, when given, restrict replacement to occurrences bounded
        by one of the listed context characters (the context is preserved).
        """
        if pre or post:
            # NOTE(review): the `post = post` / `pre = pre` branches are
            # no-ops retained from the reference implementation
            if post:
                post = post
            else:
                post = frozenset(('',))
            if pre:
                pre = pre
            else:
                pre = frozenset(('',))

            for i, j in ((i, j) for i in pre for j in post):
                word = word.replace(i+src+j, i+tar+j)
            return word
        else:
            return word.replace(src, tar)

    _vow = {'A', 'E', 'I', 'O', 'U'}
    _con = {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q',
            'R', 'S', 'T', 'V', 'W', 'X', 'Y', 'Z'}

    # Ordered, context-sensitive rewrite rules applied before coding.
    # NOTE(review): a few entries (e.g. (_end_repl, 'HR', 'AH', _vow) and
    # (_end_repl, 'R', 'AH', _vow)) appear more than once; the repeated
    # applications are no-ops.
    _phonix_substitutions = ((_all_repl, 'DG', 'G'),
                             (_all_repl, 'CO', 'KO'),
                             (_all_repl, 'CA', 'KA'),
                             (_all_repl, 'CU', 'KU'),
                             (_all_repl, 'CY', 'SI'),
                             (_all_repl, 'CI', 'SI'),
                             (_all_repl, 'CE', 'SE'),
                             (_start_repl, 'CL', 'KL', _vow),
                             (_all_repl, 'CK', 'K'),
                             (_end_repl, 'GC', 'K'),
                             (_end_repl, 'JC', 'K'),
                             (_start_repl, 'CHR', 'KR', _vow),
                             (_start_repl, 'CR', 'KR', _vow),
                             (_start_repl, 'WR', 'R'),
                             (_all_repl, 'NC', 'NK'),
                             (_all_repl, 'CT', 'KT'),
                             (_all_repl, 'PH', 'F'),
                             (_all_repl, 'AA', 'AR'),
                             (_all_repl, 'SCH', 'SH'),
                             (_all_repl, 'BTL', 'TL'),
                             (_all_repl, 'GHT', 'T'),
                             (_all_repl, 'AUGH', 'ARF'),
                             (_mid_repl, 'LJ', 'LD', _vow, _vow),
                             (_all_repl, 'LOUGH', 'LOW'),
                             (_start_repl, 'Q', 'KW'),
                             (_start_repl, 'KN', 'N'),
                             (_end_repl, 'GN', 'N'),
                             (_all_repl, 'GHN', 'N'),
                             (_end_repl, 'GNE', 'N'),
                             (_all_repl, 'GHNE', 'NE'),
                             (_end_repl, 'GNES', 'NS'),
                             (_start_repl, 'GN', 'N'),
                             (_mid_repl, 'GN', 'N', None, _con),
                             (_end_repl, 'GN', 'N'),
                             (_start_repl, 'PS', 'S'),
                             (_start_repl, 'PT', 'T'),
                             (_start_repl, 'CZ', 'C'),
                             (_mid_repl, 'WZ', 'Z', _vow),
                             (_mid_repl, 'CZ', 'CH'),
                             (_all_repl, 'LZ', 'LSH'),
                             (_all_repl, 'RZ', 'RSH'),
                             (_mid_repl, 'Z', 'S', None, _vow),
                             (_all_repl, 'ZZ', 'TS'),
                             (_mid_repl, 'Z', 'TS', _con),
                             (_all_repl, 'HROUG', 'REW'),
                             (_all_repl, 'OUGH', 'OF'),
                             (_mid_repl, 'Q', 'KW', _vow, _vow),
                             (_mid_repl, 'J', 'Y', _vow, _vow),
                             (_start_repl, 'YJ', 'Y', _vow),
                             (_start_repl, 'GH', 'G'),
                             (_end_repl, 'GH', 'E', _vow),
                             (_start_repl, 'CY', 'S'),
                             (_all_repl, 'NX', 'NKS'),
                             (_start_repl, 'PF', 'F'),
                             (_end_repl, 'DT', 'T'),
                             (_end_repl, 'TL', 'TIL'),
                             (_end_repl, 'DL', 'DIL'),
                             (_all_repl, 'YTH', 'ITH'),
                             (_start_repl, 'TJ', 'CH', _vow),
                             (_start_repl, 'TSJ', 'CH', _vow),
                             (_start_repl, 'TS', 'T', _vow),
                             (_all_repl, 'TCH', 'CH'),
                             (_mid_repl, 'WSK', 'VSKIE', _vow),
                             (_end_repl, 'WSK', 'VSKIE', _vow),
                             (_start_repl, 'MN', 'N', _vow),
                             (_start_repl, 'PN', 'N', _vow),
                             (_mid_repl, 'STL', 'SL', _vow),
                             (_end_repl, 'STL', 'SL', _vow),
                             (_end_repl, 'TNT', 'ENT'),
                             (_end_repl, 'EAUX', 'OH'),
                             (_all_repl, 'EXCI', 'ECS'),
                             (_all_repl, 'X', 'ECS'),
                             (_end_repl, 'NED', 'ND'),
                             (_all_repl, 'JR', 'DR'),
                             (_end_repl, 'EE', 'EA'),
                             (_all_repl, 'ZS', 'S'),
                             (_mid_repl, 'R', 'AH', _vow, _con),
                             (_end_repl, 'R', 'AH', _vow),
                             (_mid_repl, 'HR', 'AH', _vow, _con),
                             (_end_repl, 'HR', 'AH', _vow),
                             (_end_repl, 'HR', 'AH', _vow),
                             (_end_repl, 'RE', 'AR'),
                             (_end_repl, 'R', 'AH', _vow),
                             (_all_repl, 'LLE', 'LE'),
                             (_end_repl, 'LE', 'ILE', _con),
                             (_end_repl, 'LES', 'ILES', _con),
                             (_end_repl, 'E', ''),
                             (_end_repl, 'ES', 'S'),
                             (_end_repl, 'SS', 'AS', _vow),
                             (_end_repl, 'MB', 'M', _vow),
                             (_all_repl, 'MPTS', 'MPS'),
                             (_all_repl, 'MPS', 'MS'),
                             (_all_repl, 'MPT', 'MT'))

    # Soundex-style digit codes; '0' marks letters that are dropped
    _phonix_translation = dict(zip((ord(_) for _ in
                                    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                   '01230720022455012683070808'))

    sdx = ''

    # Uppercase, decompose diacritics, and strip to the basic Latin alphabet
    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})
    if word:
        # Apply every rewrite rule, in table order
        for trans in _phonix_substitutions:
            word = trans[0](word, *trans[1:])
        # An initial vowel/Y is represented by the marker 'v'
        if word[0] in {'A', 'E', 'I', 'O', 'U', 'Y'}:
            sdx = 'v' + word[1:].translate(_phonix_translation)
        else:
            sdx = word[0] + word[1:].translate(_phonix_translation)
        sdx = _delete_consecutive_repeats(sdx)
        sdx = sdx.replace('0', '')

    # Clamp maxlength to [4, 64]
    if maxlength is not None:
        maxlength = min(max(4, maxlength), 64)
    else:
        maxlength = 64

    if zero_pad:
        sdx += '0' * maxlength
    if not sdx:
        sdx = '0'
    return sdx[:maxlength]
2488
|
|
|
|
2489
|
|
|
|
2490
|
|
|
def sfinxbis(word, maxlength=None): |
2491
|
|
|
"""Return the SfinxBis code for a word. |
2492
|
|
|
|
2493
|
|
|
SfinxBis is a Soundex-like algorithm defined in :cite:`Axelsson:2009`. |
2494
|
|
|
|
2495
|
|
|
This implementation follows the reference implementation: |
2496
|
|
|
:cite:`Sjoo:2009`. |
2497
|
|
|
|
2498
|
|
|
SfinxBis is intended chiefly for Swedish names. |
2499
|
|
|
|
2500
|
|
|
:param str word: the word to transform |
2501
|
|
|
:param int maxlength: the length of the code returned (defaults to |
2502
|
|
|
unlimited) |
2503
|
|
|
:returns: the SfinxBis value |
2504
|
|
|
:rtype: tuple |
2505
|
|
|
|
2506
|
|
|
>>> sfinxbis('Christopher') |
2507
|
|
|
('K68376',) |
2508
|
|
|
>>> sfinxbis('Niall') |
2509
|
|
|
('N4',) |
2510
|
|
|
>>> sfinxbis('Smith') |
2511
|
|
|
('S53',) |
2512
|
|
|
>>> sfinxbis('Schmidt') |
2513
|
|
|
('S53',) |
2514
|
|
|
|
2515
|
|
|
>>> sfinxbis('Johansson') |
2516
|
|
|
('J585',) |
2517
|
|
|
>>> sfinxbis('Sjöberg') |
2518
|
|
|
('#162',) |
2519
|
|
|
""" |
2520
|
|
|
adelstitler = (' DE LA ', ' DE LAS ', ' DE LOS ', ' VAN DE ', ' VAN DEN ', |
2521
|
|
|
' VAN DER ', ' VON DEM ', ' VON DER ', |
2522
|
|
|
' AF ', ' AV ', ' DA ', ' DE ', ' DEL ', ' DEN ', ' DES ', |
2523
|
|
|
' DI ', ' DO ', ' DON ', ' DOS ', ' DU ', ' E ', ' IN ', |
2524
|
|
|
' LA ', ' LE ', ' MAC ', ' MC ', ' VAN ', ' VON ', ' Y ', |
2525
|
|
|
' S:T ') |
2526
|
|
|
|
2527
|
|
|
_harde_vokaler = {'A', 'O', 'U', 'Å'} |
2528
|
|
|
_mjuka_vokaler = {'E', 'I', 'Y', 'Ä', 'Ö'} |
2529
|
|
|
_konsonanter = {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', |
2530
|
|
|
'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Z'} |
2531
|
|
|
_alfabet = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
2532
|
|
|
'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
2533
|
|
|
'Y', 'Z', 'Ä', 'Å', 'Ö'} |
2534
|
|
|
|
2535
|
|
|
_sfinxbis_translation = dict(zip((ord(_) for _ in |
|
|
|
|
2536
|
|
|
'BCDFGHJKLMNPQRSTVZAOUÅEIYÄÖ'), |
2537
|
|
|
'123729224551268378999999999')) |
2538
|
|
|
|
2539
|
|
|
_sfinxbis_substitutions = dict(zip((ord(_) for _ in |
2540
|
|
|
'WZÀÁÂÃÆÇÈÉÊËÌÍÎÏÑÒÓÔÕØÙÚÛÜÝ'), |
2541
|
|
|
'VSAAAAÄCEEEEIIIINOOOOÖUUUYY')) |
2542
|
|
|
|
2543
|
|
|
def _foersvensker(ordet): |
2544
|
|
|
"""Return the Swedish-ized form of the word.""" |
2545
|
|
|
ordet = ordet.replace('STIERN', 'STJÄRN') |
2546
|
|
|
ordet = ordet.replace('HIE', 'HJ') |
2547
|
|
|
ordet = ordet.replace('SIÖ', 'SJÖ') |
2548
|
|
|
ordet = ordet.replace('SCH', 'SH') |
2549
|
|
|
ordet = ordet.replace('QU', 'KV') |
2550
|
|
|
ordet = ordet.replace('IO', 'JO') |
2551
|
|
|
ordet = ordet.replace('PH', 'F') |
2552
|
|
|
|
2553
|
|
|
for i in _harde_vokaler: |
2554
|
|
|
ordet = ordet.replace(i+'Ü', i+'J') |
2555
|
|
|
ordet = ordet.replace(i+'Y', i+'J') |
2556
|
|
|
ordet = ordet.replace(i+'I', i+'J') |
2557
|
|
|
for i in _mjuka_vokaler: |
2558
|
|
|
ordet = ordet.replace(i+'Ü', i+'J') |
2559
|
|
|
ordet = ordet.replace(i+'Y', i+'J') |
2560
|
|
|
ordet = ordet.replace(i+'I', i+'J') |
2561
|
|
|
|
2562
|
|
|
if 'H' in ordet: |
2563
|
|
|
for i in _konsonanter: |
2564
|
|
|
ordet = ordet.replace('H'+i, i) |
2565
|
|
|
|
2566
|
|
|
ordet = ordet.translate(_sfinxbis_substitutions) |
2567
|
|
|
|
2568
|
|
|
ordet = ordet.replace('Ð', 'ETH') |
2569
|
|
|
ordet = ordet.replace('Þ', 'TH') |
2570
|
|
|
ordet = ordet.replace('ß', 'SS') |
2571
|
|
|
|
2572
|
|
|
return ordet |
2573
|
|
|
|
2574
|
|
|
def _koda_foersta_ljudet(ordet):
    """Return the word with its first sound coded.

    The leading sound is replaced by a single code character:
    '$' for an initial vowel, '#' for sje-/tje-like sounds, or a
    normalized consonant (J, K, S).  The rules below are checked in
    priority order; only the first matching rule is applied.

    :param str ordet: the (normalized, uppercased) word
    :returns: the word with its initial sound coded
    :rtype: str
    """
    # Cache the short prefixes the rules inspect; slicing keeps this
    # safe for words shorter than the prefix (empty string results).
    first = ordet[0:1]
    second = ordet[1:2]

    if first in _mjuka_vokaler or first in _harde_vokaler:
        # Any initial vowel is coded as '$'.
        return '$' + ordet[1:]
    if ordet[0:2] in ('DJ', 'GJ', 'HJ', 'LJ'):
        return 'J' + ordet[2:]
    if first == 'G' and second in _mjuka_vokaler:
        return 'J' + ordet[1:]
    if first == 'Q':
        return 'K' + ordet[1:]
    if ordet[0:2] == 'CH' and ordet[2:3] in (_mjuka_vokaler |
                                             _harde_vokaler):
        return '#' + ordet[2:]
    if first == 'C' and second in _harde_vokaler:
        return 'K' + ordet[1:]
    if first == 'C' and second in _konsonanter:
        return 'K' + ordet[1:]
    if first == 'X':
        return 'S' + ordet[1:]
    if first == 'C' and second in _mjuka_vokaler:
        return 'S' + ordet[1:]
    if ordet[0:3] in ('SKJ', 'STJ', 'SCH'):
        return '#' + ordet[3:]
    if ordet[0:2] in ('SH', 'KJ', 'TJ', 'SJ'):
        return '#' + ordet[2:]
    if ordet[0:2] == 'SK' and ordet[2:3] in _mjuka_vokaler:
        return '#' + ordet[2:]
    if first == 'K' and second in _mjuka_vokaler:
        return '#' + ordet[1:]
    # No rule matched: the word is returned unchanged.
    return ordet
2604
|
|
|
|
2605
|
|
|
# Steg 1, Versaler |
2606
|
|
|
word = normalize('NFC', text_type(word.upper())) |
2607
|
|
|
word = word.replace('ß', 'SS') |
2608
|
|
|
word = word.replace('-', ' ') |
2609
|
|
|
|
2610
|
|
|
# Steg 2, Ta bort adelsprefix |
2611
|
|
|
for adelstitel in adelstitler: |
2612
|
|
|
while adelstitel in word: |
2613
|
|
|
word = word.replace(adelstitel, ' ') |
2614
|
|
|
if word.startswith(adelstitel[1:]): |
2615
|
|
|
word = word[len(adelstitel)-1:] |
2616
|
|
|
|
2617
|
|
|
# Split word into tokens |
2618
|
|
|
ordlista = word.split() |
2619
|
|
|
|
2620
|
|
|
# Steg 3, Ta bort dubbelteckning i början på namnet |
2621
|
|
|
ordlista = [_delete_consecutive_repeats(ordet) for ordet in ordlista] |
2622
|
|
|
if not ordlista: |
2623
|
|
|
return ('',) |
2624
|
|
|
|
2625
|
|
|
# Steg 4, Försvenskning |
2626
|
|
|
ordlista = [_foersvensker(ordet) for ordet in ordlista] |
2627
|
|
|
|
2628
|
|
|
# Steg 5, Ta bort alla tecken som inte är A-Ö (65-90,196,197,214) |
2629
|
|
|
ordlista = [''.join(c for c in ordet if c in _alfabet) |
2630
|
|
|
for ordet in ordlista] |
2631
|
|
|
|
2632
|
|
|
# Steg 6, Koda första ljudet |
2633
|
|
|
ordlista = [_koda_foersta_ljudet(ordet) for ordet in ordlista] |
2634
|
|
|
|
2635
|
|
|
# Steg 7, Dela upp namnet i två delar |
2636
|
|
|
rest = [ordet[1:] for ordet in ordlista] |
2637
|
|
|
|
2638
|
|
|
# Steg 8, Utför fonetisk transformation i resten |
2639
|
|
|
rest = [ordet.replace('DT', 'T') for ordet in rest] |
2640
|
|
|
rest = [ordet.replace('X', 'KS') for ordet in rest] |
2641
|
|
|
|
2642
|
|
|
# Steg 9, Koda resten till en sifferkod |
2643
|
|
|
for vokal in _mjuka_vokaler: |
2644
|
|
|
rest = [ordet.replace('C'+vokal, '8'+vokal) for ordet in rest] |
2645
|
|
|
rest = [ordet.translate(_sfinxbis_translation) for ordet in rest] |
2646
|
|
|
|
2647
|
|
|
# Steg 10, Ta bort intilliggande dubbletter |
2648
|
|
|
rest = [_delete_consecutive_repeats(ordet) for ordet in rest] |
2649
|
|
|
|
2650
|
|
|
# Steg 11, Ta bort alla "9" |
2651
|
|
|
rest = [ordet.replace('9', '') for ordet in rest] |
2652
|
|
|
|
2653
|
|
|
# Steg 12, Sätt ihop delarna igen |
2654
|
|
|
ordlista = [''.join(ordet) for ordet in |
2655
|
|
|
zip((_[0:1] for _ in ordlista), rest)] |
2656
|
|
|
|
2657
|
|
|
# truncate, if maxlength is set |
2658
|
|
|
if maxlength and maxlength < _INFINITY: |
2659
|
|
|
ordlista = [ordet[:maxlength] for ordet in ordlista] |
2660
|
|
|
|
2661
|
|
|
return tuple(ordlista) |
2662
|
|
|
|
2663
|
|
|
|
2664
|
|
|
def phonet(word, mode=1, lang='de'): |
2665
|
|
|
"""Return the phonet code for a word. |
2666
|
|
|
|
2667
|
|
|
phonet ("Hannoveraner Phonetik") was developed by Jörg Michael and |
2668
|
|
|
documented in :cite:`Michael:1999`. |
2669
|
|
|
|
2670
|
|
|
This is a port of Jesper Zedlitz's code, which is licensed LGPL |
2671
|
|
|
:cite:`Zedlitz:2015`. |
2672
|
|
|
|
2673
|
|
|
That is, in turn, based on Michael's C code, which is also licensed LGPL |
2674
|
|
|
:cite:`Michael:2007`. |
2675
|
|
|
|
2676
|
|
|
:param str word: the word to transform |
2677
|
|
|
:param int mode: the ponet variant to employ (1 or 2) |
2678
|
|
|
:param str lang: 'de' (default) for German |
2679
|
|
|
'none' for no language |
2680
|
|
|
:returns: the phonet value |
2681
|
|
|
:rtype: str |
2682
|
|
|
|
2683
|
|
|
>>> phonet('Christopher') |
2684
|
|
|
'KRISTOFA' |
2685
|
|
|
>>> phonet('Niall') |
2686
|
|
|
'NIAL' |
2687
|
|
|
>>> phonet('Smith') |
2688
|
|
|
'SMIT' |
2689
|
|
|
>>> phonet('Schmidt') |
2690
|
|
|
'SHMIT' |
2691
|
|
|
|
2692
|
|
|
>>> phonet('Christopher', mode=2) |
2693
|
|
|
'KRIZTUFA' |
2694
|
|
|
>>> phonet('Niall', mode=2) |
2695
|
|
|
'NIAL' |
2696
|
|
|
>>> phonet('Smith', mode=2) |
2697
|
|
|
'ZNIT' |
2698
|
|
|
>>> phonet('Schmidt', mode=2) |
2699
|
|
|
'ZNIT' |
2700
|
|
|
|
2701
|
|
|
>>> phonet('Christopher', lang='none') |
2702
|
|
|
'CHRISTOPHER' |
2703
|
|
|
>>> phonet('Niall', lang='none') |
2704
|
|
|
'NIAL' |
2705
|
|
|
>>> phonet('Smith', lang='none') |
2706
|
|
|
'SMITH' |
2707
|
|
|
>>> phonet('Schmidt', lang='none') |
2708
|
|
|
'SCHMIDT' |
2709
|
|
|
""" |
2710
|
|
|
_phonet_rules_no_lang = ( # separator chars |
2711
|
|
|
'´', ' ', ' ', |
2712
|
|
|
'"', ' ', ' ', |
2713
|
|
|
'`$', '', '', |
2714
|
|
|
'\'', ' ', ' ', |
2715
|
|
|
',', ',', ',', |
2716
|
|
|
';', ',', ',', |
2717
|
|
|
'-', ' ', ' ', |
2718
|
|
|
' ', ' ', ' ', |
2719
|
|
|
'.', '.', '.', |
2720
|
|
|
':', '.', '.', |
2721
|
|
|
# German umlauts |
2722
|
|
|
'Ä', 'AE', 'AE', |
2723
|
|
|
'Ö', 'OE', 'OE', |
2724
|
|
|
'Ü', 'UE', 'UE', |
2725
|
|
|
'ß', 'S', 'S', |
2726
|
|
|
# international umlauts |
2727
|
|
|
'À', 'A', 'A', |
2728
|
|
|
'Á', 'A', 'A', |
2729
|
|
|
'Â', 'A', 'A', |
2730
|
|
|
'Ã', 'A', 'A', |
2731
|
|
|
'Å', 'A', 'A', |
2732
|
|
|
'Æ', 'AE', 'AE', |
2733
|
|
|
'Ç', 'C', 'C', |
2734
|
|
|
'Ð', 'DJ', 'DJ', |
2735
|
|
|
'È', 'E', 'E', |
2736
|
|
|
'É', 'E', 'E', |
2737
|
|
|
'Ê', 'E', 'E', |
2738
|
|
|
'Ë', 'E', 'E', |
2739
|
|
|
'Ì', 'I', 'I', |
2740
|
|
|
'Í', 'I', 'I', |
2741
|
|
|
'Î', 'I', 'I', |
2742
|
|
|
'Ï', 'I', 'I', |
2743
|
|
|
'Ñ', 'NH', 'NH', |
2744
|
|
|
'Ò', 'O', 'O', |
2745
|
|
|
'Ó', 'O', 'O', |
2746
|
|
|
'Ô', 'O', 'O', |
2747
|
|
|
'Õ', 'O', 'O', |
2748
|
|
|
'Œ', 'OE', 'OE', |
2749
|
|
|
'Ø', 'OE', 'OE', |
2750
|
|
|
'Š', 'SH', 'SH', |
2751
|
|
|
'Þ', 'TH', 'TH', |
2752
|
|
|
'Ù', 'U', 'U', |
2753
|
|
|
'Ú', 'U', 'U', |
2754
|
|
|
'Û', 'U', 'U', |
2755
|
|
|
'Ý', 'Y', 'Y', |
2756
|
|
|
'Ÿ', 'Y', 'Y', |
2757
|
|
|
# 'normal' letters (A-Z) |
2758
|
|
|
'MC^', 'MAC', 'MAC', |
2759
|
|
|
'MC^', 'MAC', 'MAC', |
2760
|
|
|
'M´^', 'MAC', 'MAC', |
2761
|
|
|
'M\'^', 'MAC', 'MAC', |
2762
|
|
|
'O´^', 'O', 'O', |
2763
|
|
|
'O\'^', 'O', 'O', |
2764
|
|
|
'VAN DEN ^', 'VANDEN', 'VANDEN', |
2765
|
|
|
None, None, None) |
2766
|
|
|
|
2767
|
|
|
_phonet_rules_german = ( # separator chars |
2768
|
|
|
'´', ' ', ' ', |
2769
|
|
|
'"', ' ', ' ', |
2770
|
|
|
'`$', '', '', |
2771
|
|
|
'\'', ' ', ' ', |
2772
|
|
|
',', ' ', ' ', |
2773
|
|
|
';', ' ', ' ', |
2774
|
|
|
'-', ' ', ' ', |
2775
|
|
|
' ', ' ', ' ', |
2776
|
|
|
'.', '.', '.', |
2777
|
|
|
':', '.', '.', |
2778
|
|
|
# German umlauts |
2779
|
|
|
'ÄE', 'E', 'E', |
2780
|
|
|
'ÄU<', 'EU', 'EU', |
2781
|
|
|
'ÄV(AEOU)-<', 'EW', None, |
2782
|
|
|
'Ä$', 'Ä', None, |
2783
|
|
|
'Ä<', None, 'E', |
2784
|
|
|
'Ä', 'E', None, |
2785
|
|
|
'ÖE', 'Ö', 'Ö', |
2786
|
|
|
'ÖU', 'Ö', 'Ö', |
2787
|
|
|
'ÖVER--<', 'ÖW', None, |
2788
|
|
|
'ÖV(AOU)-', 'ÖW', None, |
2789
|
|
|
'ÜBEL(GNRW)-^^', 'ÜBL ', 'IBL ', |
2790
|
|
|
'ÜBER^^', 'ÜBA', 'IBA', |
2791
|
|
|
'ÜE', 'Ü', 'I', |
2792
|
|
|
'ÜVER--<', 'ÜW', None, |
2793
|
|
|
'ÜV(AOU)-', 'ÜW', None, |
2794
|
|
|
'Ü', None, 'I', |
2795
|
|
|
'ßCH<', None, 'Z', |
2796
|
|
|
'ß<', 'S', 'Z', |
2797
|
|
|
# international umlauts |
2798
|
|
|
'À<', 'A', 'A', |
2799
|
|
|
'Á<', 'A', 'A', |
2800
|
|
|
'Â<', 'A', 'A', |
2801
|
|
|
'Ã<', 'A', 'A', |
2802
|
|
|
'Å<', 'A', 'A', |
2803
|
|
|
'ÆER-', 'E', 'E', |
2804
|
|
|
'ÆU<', 'EU', 'EU', |
2805
|
|
|
'ÆV(AEOU)-<', 'EW', None, |
2806
|
|
|
'Æ$', 'Ä', None, |
2807
|
|
|
'Æ<', None, 'E', |
2808
|
|
|
'Æ', 'E', None, |
2809
|
|
|
'Ç', 'Z', 'Z', |
2810
|
|
|
'ÐÐ-', '', '', |
2811
|
|
|
'Ð', 'DI', 'TI', |
2812
|
|
|
'È<', 'E', 'E', |
2813
|
|
|
'É<', 'E', 'E', |
2814
|
|
|
'Ê<', 'E', 'E', |
2815
|
|
|
'Ë', 'E', 'E', |
2816
|
|
|
'Ì<', 'I', 'I', |
2817
|
|
|
'Í<', 'I', 'I', |
2818
|
|
|
'Î<', 'I', 'I', |
2819
|
|
|
'Ï', 'I', 'I', |
2820
|
|
|
'ÑÑ-', '', '', |
2821
|
|
|
'Ñ', 'NI', 'NI', |
2822
|
|
|
'Ò<', 'O', 'U', |
2823
|
|
|
'Ó<', 'O', 'U', |
2824
|
|
|
'Ô<', 'O', 'U', |
2825
|
|
|
'Õ<', 'O', 'U', |
2826
|
|
|
'Œ<', 'Ö', 'Ö', |
2827
|
|
|
'Ø(IJY)-<', 'E', 'E', |
2828
|
|
|
'Ø<', 'Ö', 'Ö', |
2829
|
|
|
'Š', 'SH', 'Z', |
2830
|
|
|
'Þ', 'T', 'T', |
2831
|
|
|
'Ù<', 'U', 'U', |
2832
|
|
|
'Ú<', 'U', 'U', |
2833
|
|
|
'Û<', 'U', 'U', |
2834
|
|
|
'Ý<', 'I', 'I', |
2835
|
|
|
'Ÿ<', 'I', 'I', |
2836
|
|
|
# 'normal' letters (A-Z) |
2837
|
|
|
'ABELLE$', 'ABL', 'ABL', |
2838
|
|
|
'ABELL$', 'ABL', 'ABL', |
2839
|
|
|
'ABIENNE$', 'ABIN', 'ABIN', |
2840
|
|
|
'ACHME---^', 'ACH', 'AK', |
2841
|
|
|
'ACEY$', 'AZI', 'AZI', |
2842
|
|
|
'ADV', 'ATW', None, |
2843
|
|
|
'AEGL-', 'EK', None, |
2844
|
|
|
'AEU<', 'EU', 'EU', |
2845
|
|
|
'AE2', 'E', 'E', |
2846
|
|
|
'AFTRAUBEN------', 'AFT ', 'AFT ', |
2847
|
|
|
'AGL-1', 'AK', None, |
2848
|
|
|
'AGNI-^', 'AKN', 'AKN', |
2849
|
|
|
'AGNIE-', 'ANI', 'ANI', |
2850
|
|
|
'AGN(AEOU)-$', 'ANI', 'ANI', |
2851
|
|
|
'AH(AIOÖUÜY)-', 'AH', None, |
2852
|
|
|
'AIA2', 'AIA', 'AIA', |
2853
|
|
|
'AIE$', 'E', 'E', |
2854
|
|
|
'AILL(EOU)-', 'ALI', 'ALI', |
2855
|
|
|
'AINE$', 'EN', 'EN', |
2856
|
|
|
'AIRE$', 'ER', 'ER', |
2857
|
|
|
'AIR-', 'E', 'E', |
2858
|
|
|
'AISE$', 'ES', 'EZ', |
2859
|
|
|
'AISSANCE$', 'ESANS', 'EZANZ', |
2860
|
|
|
'AISSE$', 'ES', 'EZ', |
2861
|
|
|
'AIX$', 'EX', 'EX', |
2862
|
|
|
'AJ(AÄEÈÉÊIOÖUÜ)--', 'A', 'A', |
2863
|
|
|
'AKTIE', 'AXIE', 'AXIE', |
2864
|
|
|
'AKTUEL', 'AKTUEL', None, |
2865
|
|
|
'ALOI^', 'ALOI', 'ALUI', # Don't merge these rules |
2866
|
|
|
'ALOY^', 'ALOI', 'ALUI', # needed by 'check_rules' |
2867
|
|
|
'AMATEU(RS)-', 'AMATÖ', 'ANATÖ', |
2868
|
|
|
'ANCH(OEI)-', 'ANSH', 'ANZ', |
2869
|
|
|
'ANDERGEGANG----', 'ANDA GE', 'ANTA KE', |
2870
|
|
|
'ANDERGEHE----', 'ANDA ', 'ANTA ', |
2871
|
|
|
'ANDERGESETZ----', 'ANDA GE', 'ANTA KE', |
2872
|
|
|
'ANDERGING----', 'ANDA ', 'ANTA ', |
2873
|
|
|
'ANDERSETZ(ET)-----', 'ANDA ', 'ANTA ', |
2874
|
|
|
'ANDERZUGEHE----', 'ANDA ZU ', 'ANTA ZU ', |
2875
|
|
|
'ANDERZUSETZE-----', 'ANDA ZU ', 'ANTA ZU ', |
2876
|
|
|
'ANER(BKO)---^^', 'AN', None, |
2877
|
|
|
'ANHAND---^$', 'AN H', 'AN ', |
2878
|
|
|
'ANH(AÄEIOÖUÜY)--^^', 'AN', None, |
2879
|
|
|
'ANIELLE$', 'ANIEL', 'ANIL', |
2880
|
|
|
'ANIEL', 'ANIEL', None, |
2881
|
|
|
'ANSTELLE----^$', 'AN ST', 'AN ZT', |
2882
|
|
|
'ANTI^^', 'ANTI', 'ANTI', |
2883
|
|
|
'ANVER^^', 'ANFA', 'ANFA', |
2884
|
|
|
'ATIA$', 'ATIA', 'ATIA', |
2885
|
|
|
'ATIA(NS)--', 'ATI', 'ATI', |
2886
|
|
|
'ATI(AÄOÖUÜ)-', 'AZI', 'AZI', |
2887
|
|
|
'AUAU--', '', '', |
2888
|
|
|
'AUERE$', 'AUERE', None, |
2889
|
|
|
'AUERE(NS)-$', 'AUERE', None, |
2890
|
|
|
'AUERE(AIOUY)--', 'AUER', None, |
2891
|
|
|
'AUER(AÄIOÖUÜY)-', 'AUER', None, |
2892
|
|
|
'AUER<', 'AUA', 'AUA', |
2893
|
|
|
'AUF^^', 'AUF', 'AUF', |
2894
|
|
|
'AULT$', 'O', 'U', |
2895
|
|
|
'AUR(BCDFGKLMNQSTVWZ)-', 'AUA', 'AUA', |
2896
|
|
|
'AUR$', 'AUA', 'AUA', |
2897
|
|
|
'AUSSE$', 'OS', 'UZ', |
2898
|
|
|
'AUS(ST)-^', 'AUS', 'AUS', |
2899
|
|
|
'AUS^^', 'AUS', 'AUS', |
2900
|
|
|
'AUTOFAHR----', 'AUTO ', 'AUTU ', |
2901
|
|
|
'AUTO^^', 'AUTO', 'AUTU', |
2902
|
|
|
'AUX(IY)-', 'AUX', 'AUX', |
2903
|
|
|
'AUX', 'O', 'U', |
2904
|
|
|
'AU', 'AU', 'AU', |
2905
|
|
|
'AVER--<', 'AW', None, |
2906
|
|
|
'AVIER$', 'AWIE', 'AFIE', |
2907
|
|
|
'AV(EÈÉÊI)-^', 'AW', None, |
2908
|
|
|
'AV(AOU)-', 'AW', None, |
2909
|
|
|
'AYRE$', 'EIRE', 'EIRE', |
2910
|
|
|
'AYRE(NS)-$', 'EIRE', 'EIRE', |
2911
|
|
|
'AYRE(AIOUY)--', 'EIR', 'EIR', |
2912
|
|
|
'AYR(AÄIOÖUÜY)-', 'EIR', 'EIR', |
2913
|
|
|
'AYR<', 'EIA', 'EIA', |
2914
|
|
|
'AYER--<', 'EI', 'EI', |
2915
|
|
|
'AY(AÄEIOÖUÜY)--', 'A', 'A', |
2916
|
|
|
'AË', 'E', 'E', |
2917
|
|
|
'A(IJY)<', 'EI', 'EI', |
2918
|
|
|
'BABY^$', 'BEBI', 'BEBI', |
2919
|
|
|
'BAB(IY)^', 'BEBI', 'BEBI', |
2920
|
|
|
'BEAU^$', 'BO', None, |
2921
|
|
|
'BEA(BCMNRU)-^', 'BEA', 'BEA', |
2922
|
|
|
'BEAT(AEIMORU)-^', 'BEAT', 'BEAT', |
2923
|
|
|
'BEE$', 'BI', 'BI', |
2924
|
|
|
'BEIGE^$', 'BESH', 'BEZ', |
2925
|
|
|
'BENOIT--', 'BENO', 'BENU', |
2926
|
|
|
'BER(DT)-', 'BER', None, |
2927
|
|
|
'BERN(DT)-', 'BERN', None, |
2928
|
|
|
'BE(LMNRST)-^', 'BE', 'BE', |
2929
|
|
|
'BETTE$', 'BET', 'BET', |
2930
|
|
|
'BEVOR^$', 'BEFOR', None, |
2931
|
|
|
'BIC$', 'BIZ', 'BIZ', |
2932
|
|
|
'BOWL(EI)-', 'BOL', 'BUL', |
2933
|
|
|
'BP(AÄEÈÉÊIÌÍÎOÖRUÜY)-', 'B', 'B', |
2934
|
|
|
'BRINGEND-----^', 'BRI', 'BRI', |
2935
|
|
|
'BRINGEND-----', ' BRI', ' BRI', |
2936
|
|
|
'BROW(NS)-', 'BRAU', 'BRAU', |
2937
|
|
|
'BUDGET7', 'BÜGE', 'BIKE', |
2938
|
|
|
'BUFFET7', 'BÜFE', 'BIFE', |
2939
|
|
|
'BYLLE$', 'BILE', 'BILE', |
2940
|
|
|
'BYLL$', 'BIL', 'BIL', |
2941
|
|
|
'BYPA--^', 'BEI', 'BEI', |
2942
|
|
|
'BYTE<', 'BEIT', 'BEIT', |
2943
|
|
|
'BY9^', 'BÜ', None, |
2944
|
|
|
'B(SßZ)$', 'BS', None, |
2945
|
|
|
'CACH(EI)-^', 'KESH', 'KEZ', |
2946
|
|
|
'CAE--', 'Z', 'Z', |
2947
|
|
|
'CA(IY)$', 'ZEI', 'ZEI', |
2948
|
|
|
'CE(EIJUY)--', 'Z', 'Z', |
2949
|
|
|
'CENT<', 'ZENT', 'ZENT', |
2950
|
|
|
'CERST(EI)----^', 'KE', 'KE', |
2951
|
|
|
'CER$', 'ZA', 'ZA', |
2952
|
|
|
'CE3', 'ZE', 'ZE', |
2953
|
|
|
'CH\'S$', 'X', 'X', |
2954
|
|
|
'CH´S$', 'X', 'X', |
2955
|
|
|
'CHAO(ST)-', 'KAO', 'KAU', |
2956
|
|
|
'CHAMPIO-^', 'SHEMPI', 'ZENBI', |
2957
|
|
|
'CHAR(AI)-^', 'KAR', 'KAR', |
2958
|
|
|
'CHAU(CDFSVWXZ)-', 'SHO', 'ZU', |
2959
|
|
|
'CHÄ(CF)-', 'SHE', 'ZE', |
2960
|
|
|
'CHE(CF)-', 'SHE', 'ZE', |
2961
|
|
|
'CHEM-^', 'KE', 'KE', # or: 'CHE', 'KE' |
2962
|
|
|
'CHEQUE<', 'SHEK', 'ZEK', |
2963
|
|
|
'CHI(CFGPVW)-', 'SHI', 'ZI', |
2964
|
|
|
'CH(AEUY)-<^', 'SH', 'Z', |
2965
|
|
|
'CHK-', '', '', |
2966
|
|
|
'CHO(CKPS)-^', 'SHO', 'ZU', |
2967
|
|
|
'CHRIS-', 'KRI', None, |
2968
|
|
|
'CHRO-', 'KR', None, |
2969
|
|
|
'CH(LOR)-<^', 'K', 'K', |
2970
|
|
|
'CHST-', 'X', 'X', |
2971
|
|
|
'CH(SßXZ)3', 'X', 'X', |
2972
|
|
|
'CHTNI-3', 'CHN', 'KN', |
2973
|
|
|
'CH^', 'K', 'K', # or: 'CH', 'K' |
2974
|
|
|
'CH', 'CH', 'K', |
2975
|
|
|
'CIC$', 'ZIZ', 'ZIZ', |
2976
|
|
|
'CIENCEFICT----', 'EIENS ', 'EIENZ ', |
2977
|
|
|
'CIENCE$', 'EIENS', 'EIENZ', |
2978
|
|
|
'CIER$', 'ZIE', 'ZIE', |
2979
|
|
|
'CYB-^', 'ZEI', 'ZEI', |
2980
|
|
|
'CY9^', 'ZÜ', 'ZI', |
2981
|
|
|
'C(IJY)-<3', 'Z', 'Z', |
2982
|
|
|
'CLOWN-', 'KLAU', 'KLAU', |
2983
|
|
|
'CCH', 'Z', 'Z', |
2984
|
|
|
'CCE-', 'X', 'X', |
2985
|
|
|
'C(CK)-', '', '', |
2986
|
|
|
'CLAUDET---', 'KLO', 'KLU', |
2987
|
|
|
'CLAUDINE^$', 'KLODIN', 'KLUTIN', |
2988
|
|
|
'COACH', 'KOSH', 'KUZ', |
2989
|
|
|
'COLE$', 'KOL', 'KUL', |
2990
|
|
|
'COUCH', 'KAUSH', 'KAUZ', |
2991
|
|
|
'COW', 'KAU', 'KAU', |
2992
|
|
|
'CQUES$', 'K', 'K', |
2993
|
|
|
'CQUE', 'K', 'K', |
2994
|
|
|
'CRASH--9', 'KRE', 'KRE', |
2995
|
|
|
'CREAT-^', 'KREA', 'KREA', |
2996
|
|
|
'CST', 'XT', 'XT', |
2997
|
|
|
'CS<^', 'Z', 'Z', |
2998
|
|
|
'C(SßX)', 'X', 'X', |
2999
|
|
|
'CT\'S$', 'X', 'X', |
3000
|
|
|
'CT(SßXZ)', 'X', 'X', |
3001
|
|
|
'CZ<', 'Z', 'Z', |
3002
|
|
|
'C(ÈÉÊÌÍÎÝ)3', 'Z', 'Z', |
3003
|
|
|
'C.^', 'C.', 'C.', |
3004
|
|
|
'CÄ-', 'Z', 'Z', |
3005
|
|
|
'CÜ$', 'ZÜ', 'ZI', |
3006
|
|
|
'C\'S$', 'X', 'X', |
3007
|
|
|
'C<', 'K', 'K', |
3008
|
|
|
'DAHER^$', 'DAHER', None, |
3009
|
|
|
'DARAUFFOLGE-----', 'DARAUF ', 'TARAUF ', |
3010
|
|
|
'DAVO(NR)-^$', 'DAFO', 'TAFU', |
3011
|
|
|
'DD(SZ)--<', '', '', |
3012
|
|
|
'DD9', 'D', None, |
3013
|
|
|
'DEPOT7', 'DEPO', 'TEBU', |
3014
|
|
|
'DESIGN', 'DISEIN', 'TIZEIN', |
3015
|
|
|
'DE(LMNRST)-3^', 'DE', 'TE', |
3016
|
|
|
'DETTE$', 'DET', 'TET', |
3017
|
|
|
'DH$', 'T', None, |
3018
|
|
|
'DIC$', 'DIZ', 'TIZ', |
3019
|
|
|
'DIDR-^', 'DIT', None, |
3020
|
|
|
'DIEDR-^', 'DIT', None, |
3021
|
|
|
'DJ(AEIOU)-^', 'I', 'I', |
3022
|
|
|
'DMITR-^', 'DIMIT', 'TINIT', |
3023
|
|
|
'DRY9^', 'DRÜ', None, |
3024
|
|
|
'DT-', '', '', |
3025
|
|
|
'DUIS-^', 'DÜ', 'TI', |
3026
|
|
|
'DURCH^^', 'DURCH', 'TURK', |
3027
|
|
|
'DVA$', 'TWA', None, |
3028
|
|
|
'DY9^', 'DÜ', None, |
3029
|
|
|
'DYS$', 'DIS', None, |
3030
|
|
|
'DS(CH)--<', 'T', 'T', |
3031
|
|
|
'DST', 'ZT', 'ZT', |
3032
|
|
|
'DZS(CH)--', 'T', 'T', |
3033
|
|
|
'D(SßZ)', 'Z', 'Z', |
3034
|
|
|
'D(AÄEIOÖRUÜY)-', 'D', None, |
3035
|
|
|
'D(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'D', None, |
3036
|
|
|
'D\'H^', 'D', 'T', |
3037
|
|
|
'D´H^', 'D', 'T', |
3038
|
|
|
'D`H^', 'D', 'T', |
3039
|
|
|
'D\'S3$', 'Z', 'Z', |
3040
|
|
|
'D´S3$', 'Z', 'Z', |
3041
|
|
|
'D^', 'D', None, |
3042
|
|
|
'D', 'T', 'T', |
3043
|
|
|
'EAULT$', 'O', 'U', |
3044
|
|
|
'EAUX$', 'O', 'U', |
3045
|
|
|
'EAU', 'O', 'U', |
3046
|
|
|
'EAV', 'IW', 'IF', |
3047
|
|
|
'EAS3$', 'EAS', None, |
3048
|
|
|
'EA(AÄEIOÖÜY)-3', 'EA', 'EA', |
3049
|
|
|
'EA3$', 'EA', 'EA', |
3050
|
|
|
'EA3', 'I', 'I', |
3051
|
|
|
'EBENSO^$', 'EBNSO', 'EBNZU', |
3052
|
|
|
'EBENSO^^', 'EBNSO ', 'EBNZU ', |
3053
|
|
|
'EBEN^^', 'EBN', 'EBN', |
3054
|
|
|
'EE9', 'E', 'E', |
3055
|
|
|
'EGL-1', 'EK', None, |
3056
|
|
|
'EHE(IUY)--1', 'EH', None, |
3057
|
|
|
'EHUNG---1', 'E', None, |
3058
|
|
|
'EH(AÄIOÖUÜY)-1', 'EH', None, |
3059
|
|
|
'EIEI--', '', '', |
3060
|
|
|
'EIERE^$', 'EIERE', None, |
3061
|
|
|
'EIERE$', 'EIERE', None, |
3062
|
|
|
'EIERE(NS)-$', 'EIERE', None, |
3063
|
|
|
'EIERE(AIOUY)--', 'EIER', None, |
3064
|
|
|
'EIER(AÄIOÖUÜY)-', 'EIER', None, |
3065
|
|
|
'EIER<', 'EIA', None, |
3066
|
|
|
'EIGL-1', 'EIK', None, |
3067
|
|
|
'EIGH$', 'EI', 'EI', |
3068
|
|
|
'EIH--', 'E', 'E', |
3069
|
|
|
'EILLE$', 'EI', 'EI', |
3070
|
|
|
'EIR(BCDFGKLMNQSTVWZ)-', 'EIA', 'EIA', |
3071
|
|
|
'EIR$', 'EIA', 'EIA', |
3072
|
|
|
'EITRAUBEN------', 'EIT ', 'EIT ', |
3073
|
|
|
'EI', 'EI', 'EI', |
3074
|
|
|
'EJ$', 'EI', 'EI', |
3075
|
|
|
'ELIZ^', 'ELIS', None, |
3076
|
|
|
'ELZ^', 'ELS', None, |
3077
|
|
|
'EL-^', 'E', 'E', |
3078
|
|
|
'ELANG----1', 'E', 'E', |
3079
|
|
|
'EL(DKL)--1', 'E', 'E', |
3080
|
|
|
'EL(MNT)--1$', 'E', 'E', |
3081
|
|
|
'ELYNE$', 'ELINE', 'ELINE', |
3082
|
|
|
'ELYN$', 'ELIN', 'ELIN', |
3083
|
|
|
'EL(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'EL', 'EL', |
3084
|
|
|
'EL-1', 'L', 'L', |
3085
|
|
|
'EM-^', None, 'E', |
3086
|
|
|
'EM(DFKMPQT)--1', None, 'E', |
3087
|
|
|
'EM(AÄEÈÉÊIÌÍÎOÖUÜY)--1', None, 'E', |
3088
|
|
|
'EM-1', None, 'N', |
3089
|
|
|
'ENGAG-^', 'ANGA', 'ANKA', |
3090
|
|
|
'EN-^', 'E', 'E', |
3091
|
|
|
'ENTUEL', 'ENTUEL', None, |
3092
|
|
|
'EN(CDGKQSTZ)--1', 'E', 'E', |
3093
|
|
|
'EN(AÄEÈÉÊIÌÍÎNOÖUÜY)-1', 'EN', 'EN', |
3094
|
|
|
'EN-1', '', '', |
3095
|
|
|
'ERH(AÄEIOÖUÜ)-^', 'ERH', 'ER', |
3096
|
|
|
'ER-^', 'E', 'E', |
3097
|
|
|
'ERREGEND-----', ' ER', ' ER', |
3098
|
|
|
'ERT1$', 'AT', None, |
3099
|
|
|
'ER(DGLKMNRQTZß)-1', 'ER', None, |
3100
|
|
|
'ER(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'ER', 'A', |
3101
|
|
|
'ER1$', 'A', 'A', |
3102
|
|
|
'ER<1', 'A', 'A', |
3103
|
|
|
'ETAT7', 'ETA', 'ETA', |
3104
|
|
|
'ETI(AÄOÖÜU)-', 'EZI', 'EZI', |
3105
|
|
|
'EUERE$', 'EUERE', None, |
3106
|
|
|
'EUERE(NS)-$', 'EUERE', None, |
3107
|
|
|
'EUERE(AIOUY)--', 'EUER', None, |
3108
|
|
|
'EUER(AÄIOÖUÜY)-', 'EUER', None, |
3109
|
|
|
'EUER<', 'EUA', None, |
3110
|
|
|
'EUEU--', '', '', |
3111
|
|
|
'EUILLE$', 'Ö', 'Ö', |
3112
|
|
|
'EUR$', 'ÖR', 'ÖR', |
3113
|
|
|
'EUX', 'Ö', 'Ö', |
3114
|
|
|
'EUSZ$', 'EUS', None, |
3115
|
|
|
'EUTZ$', 'EUS', None, |
3116
|
|
|
'EUYS$', 'EUS', 'EUZ', |
3117
|
|
|
'EUZ$', 'EUS', None, |
3118
|
|
|
'EU', 'EU', 'EU', |
3119
|
|
|
'EVER--<1', 'EW', None, |
3120
|
|
|
'EV(ÄOÖUÜ)-1', 'EW', None, |
3121
|
|
|
'EYER<', 'EIA', 'EIA', |
3122
|
|
|
'EY<', 'EI', 'EI', |
3123
|
|
|
'FACETTE', 'FASET', 'FAZET', |
3124
|
|
|
'FANS--^$', 'FE', 'FE', |
3125
|
|
|
'FAN-^$', 'FE', 'FE', |
3126
|
|
|
'FAULT-', 'FOL', 'FUL', |
3127
|
|
|
'FEE(DL)-', 'FI', 'FI', |
3128
|
|
|
'FEHLER', 'FELA', 'FELA', |
3129
|
|
|
'FE(LMNRST)-3^', 'FE', 'FE', |
3130
|
|
|
'FOERDERN---^', 'FÖRD', 'FÖRT', |
3131
|
|
|
'FOERDERN---', ' FÖRD', ' FÖRT', |
3132
|
|
|
'FOND7', 'FON', 'FUN', |
3133
|
|
|
'FRAIN$', 'FRA', 'FRA', |
3134
|
|
|
'FRISEU(RS)-', 'FRISÖ', 'FRIZÖ', |
3135
|
|
|
'FY9^', 'FÜ', None, |
3136
|
|
|
'FÖRDERN---^', 'FÖRD', 'FÖRT', |
3137
|
|
|
'FÖRDERN---', ' FÖRD', ' FÖRT', |
3138
|
|
|
'GAGS^$', 'GEX', 'KEX', |
3139
|
|
|
'GAG^$', 'GEK', 'KEK', |
3140
|
|
|
'GD', 'KT', 'KT', |
3141
|
|
|
'GEGEN^^', 'GEGN', 'KEKN', |
3142
|
|
|
'GEGENGEKOM-----', 'GEGN ', 'KEKN ', |
3143
|
|
|
'GEGENGESET-----', 'GEGN ', 'KEKN ', |
3144
|
|
|
'GEGENKOMME-----', 'GEGN ', 'KEKN ', |
3145
|
|
|
'GEGENZUKOM---', 'GEGN ZU ', 'KEKN ZU ', |
3146
|
|
|
'GENDETWAS-----$', 'GENT ', 'KENT ', |
3147
|
|
|
'GENRE', 'IORE', 'IURE', |
3148
|
|
|
'GE(LMNRST)-3^', 'GE', 'KE', |
3149
|
|
|
'GER(DKT)-', 'GER', None, |
3150
|
|
|
'GETTE$', 'GET', 'KET', |
3151
|
|
|
'GGF.', 'GF.', None, |
3152
|
|
|
'GG-', '', '', |
3153
|
|
|
'GH', 'G', None, |
3154
|
|
|
'GI(AOU)-^', 'I', 'I', |
3155
|
|
|
'GION-3', 'KIO', 'KIU', |
3156
|
|
|
'G(CK)-', '', '', |
3157
|
|
|
'GJ(AEIOU)-^', 'I', 'I', |
3158
|
|
|
'GMBH^$', 'GMBH', 'GMBH', |
3159
|
|
|
'GNAC$', 'NIAK', 'NIAK', |
3160
|
|
|
'GNON$', 'NION', 'NIUN', |
3161
|
|
|
'GN$', 'N', 'N', |
3162
|
|
|
'GONCAL-^', 'GONZA', 'KUNZA', |
3163
|
|
|
'GRY9^', 'GRÜ', None, |
3164
|
|
|
'G(SßXZ)-<', 'K', 'K', |
3165
|
|
|
'GUCK-', 'KU', 'KU', |
3166
|
|
|
'GUISEP-^', 'IUSE', 'IUZE', |
3167
|
|
|
'GUI-^', 'G', 'K', |
3168
|
|
|
'GUTAUSSEH------^', 'GUT ', 'KUT ', |
3169
|
|
|
'GUTGEHEND------^', 'GUT ', 'KUT ', |
3170
|
|
|
'GY9^', 'GÜ', None, |
3171
|
|
|
'G(AÄEILOÖRUÜY)-', 'G', None, |
3172
|
|
|
'G(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'G', None, |
3173
|
|
|
'G\'S$', 'X', 'X', |
3174
|
|
|
'G´S$', 'X', 'X', |
3175
|
|
|
'G^', 'G', None, |
3176
|
|
|
'G', 'K', 'K', |
3177
|
|
|
'HA(HIUY)--1', 'H', None, |
3178
|
|
|
'HANDVOL---^', 'HANT ', 'ANT ', |
3179
|
|
|
'HANNOVE-^', 'HANOF', None, |
3180
|
|
|
'HAVEN7$', 'HAFN', None, |
3181
|
|
|
'HEAD-', 'HE', 'E', |
3182
|
|
|
'HELIEGEN------', 'E ', 'E ', |
3183
|
|
|
'HESTEHEN------', 'E ', 'E ', |
3184
|
|
|
'HE(LMNRST)-3^', 'HE', 'E', |
3185
|
|
|
'HE(LMN)-1', 'E', 'E', |
3186
|
|
|
'HEUR1$', 'ÖR', 'ÖR', |
3187
|
|
|
'HE(HIUY)--1', 'H', None, |
3188
|
|
|
'HIH(AÄEIOÖUÜY)-1', 'IH', None, |
3189
|
|
|
'HLH(AÄEIOÖUÜY)-1', 'LH', None, |
3190
|
|
|
'HMH(AÄEIOÖUÜY)-1', 'MH', None, |
3191
|
|
|
'HNH(AÄEIOÖUÜY)-1', 'NH', None, |
3192
|
|
|
'HOBBY9^', 'HOBI', None, |
3193
|
|
|
'HOCHBEGAB-----^', 'HOCH ', 'UK ', |
3194
|
|
|
'HOCHTALEN-----^', 'HOCH ', 'UK ', |
3195
|
|
|
'HOCHZUFRI-----^', 'HOCH ', 'UK ', |
3196
|
|
|
'HO(HIY)--1', 'H', None, |
3197
|
|
|
'HRH(AÄEIOÖUÜY)-1', 'RH', None, |
3198
|
|
|
'HUH(AÄEIOÖUÜY)-1', 'UH', None, |
3199
|
|
|
'HUIS^^', 'HÜS', 'IZ', |
3200
|
|
|
'HUIS$', 'ÜS', 'IZ', |
3201
|
|
|
'HUI--1', 'H', None, |
3202
|
|
|
'HYGIEN^', 'HÜKIEN', None, |
3203
|
|
|
'HY9^', 'HÜ', None, |
3204
|
|
|
'HY(BDGMNPST)-', 'Ü', None, |
3205
|
|
|
'H.^', None, 'H.', |
3206
|
|
|
'HÄU--1', 'H', None, |
3207
|
|
|
'H^', 'H', '', |
3208
|
|
|
'H', '', '', |
3209
|
|
|
'ICHELL---', 'ISH', 'IZ', |
3210
|
|
|
'ICHI$', 'ISHI', 'IZI', |
3211
|
|
|
'IEC$', 'IZ', 'IZ', |
3212
|
|
|
'IEDENSTELLE------', 'IDN ', 'ITN ', |
3213
|
|
|
'IEI-3', '', '', |
3214
|
|
|
'IELL3', 'IEL', 'IEL', |
3215
|
|
|
'IENNE$', 'IN', 'IN', |
3216
|
|
|
'IERRE$', 'IER', 'IER', |
3217
|
|
|
'IERZULAN---', 'IR ZU ', 'IR ZU ', |
3218
|
|
|
'IETTE$', 'IT', 'IT', |
3219
|
|
|
'IEU', 'IÖ', 'IÖ', |
3220
|
|
|
'IE<4', 'I', 'I', |
3221
|
|
|
'IGL-1', 'IK', None, |
3222
|
|
|
'IGHT3$', 'EIT', 'EIT', |
3223
|
|
|
'IGNI(EO)-', 'INI', 'INI', |
3224
|
|
|
'IGN(AEOU)-$', 'INI', 'INI', |
3225
|
|
|
'IHER(DGLKRT)--1', 'IHE', None, |
3226
|
|
|
'IHE(IUY)--', 'IH', None, |
3227
|
|
|
'IH(AIOÖUÜY)-', 'IH', None, |
3228
|
|
|
'IJ(AOU)-', 'I', 'I', |
3229
|
|
|
'IJ$', 'I', 'I', |
3230
|
|
|
'IJ<', 'EI', 'EI', |
3231
|
|
|
'IKOLE$', 'IKOL', 'IKUL', |
3232
|
|
|
'ILLAN(STZ)--4', 'ILIA', 'ILIA', |
3233
|
|
|
'ILLAR(DT)--4', 'ILIA', 'ILIA', |
3234
|
|
|
'IMSTAN----^', 'IM ', 'IN ', |
3235
|
|
|
'INDELERREGE------', 'INDL ', 'INTL ', |
3236
|
|
|
'INFRAGE-----^$', 'IN ', 'IN ', |
3237
|
|
|
'INTERN(AOU)-^', 'INTAN', 'INTAN', |
3238
|
|
|
'INVER-', 'INWE', 'INFE', |
3239
|
|
|
'ITI(AÄIOÖUÜ)-', 'IZI', 'IZI', |
3240
|
|
|
'IUSZ$', 'IUS', None, |
3241
|
|
|
'IUTZ$', 'IUS', None, |
3242
|
|
|
'IUZ$', 'IUS', None, |
3243
|
|
|
'IVER--<', 'IW', None, |
3244
|
|
|
'IVIER$', 'IWIE', 'IFIE', |
3245
|
|
|
'IV(ÄOÖUÜ)-', 'IW', None, |
3246
|
|
|
'IV<3', 'IW', None, |
3247
|
|
|
'IY2', 'I', None, |
3248
|
|
|
'I(ÈÉÊ)<4', 'I', 'I', |
3249
|
|
|
'JAVIE---<^', 'ZA', 'ZA', |
3250
|
|
|
'JEANS^$', 'JINS', 'INZ', |
3251
|
|
|
'JEANNE^$', 'IAN', 'IAN', |
3252
|
|
|
'JEAN-^', 'IA', 'IA', |
3253
|
|
|
'JER-^', 'IE', 'IE', |
3254
|
|
|
'JE(LMNST)-', 'IE', 'IE', |
3255
|
|
|
'JI^', 'JI', None, |
3256
|
|
|
'JOR(GK)^$', 'IÖRK', 'IÖRK', |
3257
|
|
|
'J', 'I', 'I', |
3258
|
|
|
'KC(ÄEIJ)-', 'X', 'X', |
3259
|
|
|
'KD', 'KT', None, |
3260
|
|
|
'KE(LMNRST)-3^', 'KE', 'KE', |
3261
|
|
|
'KG(AÄEILOÖRUÜY)-', 'K', None, |
3262
|
|
|
'KH<^', 'K', 'K', |
3263
|
|
|
'KIC$', 'KIZ', 'KIZ', |
3264
|
|
|
'KLE(LMNRST)-3^', 'KLE', 'KLE', |
3265
|
|
|
'KOTELE-^', 'KOTL', 'KUTL', |
3266
|
|
|
'KREAT-^', 'KREA', 'KREA', |
3267
|
|
|
'KRÜS(TZ)--^', 'KRI', None, |
3268
|
|
|
'KRYS(TZ)--^', 'KRI', None, |
3269
|
|
|
'KRY9^', 'KRÜ', None, |
3270
|
|
|
'KSCH---', 'K', 'K', |
3271
|
|
|
'KSH--', 'K', 'K', |
3272
|
|
|
'K(SßXZ)7', 'X', 'X', # implies 'KST' -> 'XT' |
3273
|
|
|
'KT\'S$', 'X', 'X', |
3274
|
|
|
'KTI(AIOU)-3', 'XI', 'XI', |
3275
|
|
|
'KT(SßXZ)', 'X', 'X', |
3276
|
|
|
'KY9^', 'KÜ', None, |
3277
|
|
|
'K\'S$', 'X', 'X', |
3278
|
|
|
'K´S$', 'X', 'X', |
3279
|
|
|
'LANGES$', ' LANGES', ' LANKEZ', |
3280
|
|
|
'LANGE$', ' LANGE', ' LANKE', |
3281
|
|
|
'LANG$', ' LANK', ' LANK', |
3282
|
|
|
'LARVE-', 'LARF', 'LARF', |
3283
|
|
|
'LD(SßZ)$', 'LS', 'LZ', |
3284
|
|
|
'LD\'S$', 'LS', 'LZ', |
3285
|
|
|
'LD´S$', 'LS', 'LZ', |
3286
|
|
|
'LEAND-^', 'LEAN', 'LEAN', |
3287
|
|
|
'LEERSTEHE-----^', 'LER ', 'LER ', |
3288
|
|
|
'LEICHBLEIB-----', 'LEICH ', 'LEIK ', |
3289
|
|
|
'LEICHLAUTE-----', 'LEICH ', 'LEIK ', |
3290
|
|
|
'LEIDERREGE------', 'LEIT ', 'LEIT ', |
3291
|
|
|
'LEIDGEPR----^', 'LEIT ', 'LEIT ', |
3292
|
|
|
'LEINSTEHE-----', 'LEIN ', 'LEIN ', |
3293
|
|
|
'LEL-', 'LE', 'LE', |
3294
|
|
|
'LE(MNRST)-3^', 'LE', 'LE', |
3295
|
|
|
'LETTE$', 'LET', 'LET', |
3296
|
|
|
'LFGNAG-', 'LFGAN', 'LFKAN', |
3297
|
|
|
'LICHERWEIS----', 'LICHA ', 'LIKA ', |
3298
|
|
|
'LIC$', 'LIZ', 'LIZ', |
3299
|
|
|
'LIVE^$', 'LEIF', 'LEIF', |
3300
|
|
|
'LT(SßZ)$', 'LS', 'LZ', |
3301
|
|
|
'LT\'S$', 'LS', 'LZ', |
3302
|
|
|
'LT´S$', 'LS', 'LZ', |
3303
|
|
|
'LUI(GS)--', 'LU', 'LU', |
3304
|
|
|
'LV(AIO)-', 'LW', None, |
3305
|
|
|
'LY9^', 'LÜ', None, |
3306
|
|
|
'LSTS$', 'LS', 'LZ', |
3307
|
|
|
'LZ(BDFGKLMNPQRSTVWX)-', 'LS', None, |
3308
|
|
|
'L(SßZ)$', 'LS', None, |
3309
|
|
|
'MAIR-<', 'MEI', 'NEI', |
3310
|
|
|
'MANAG-', 'MENE', 'NENE', |
3311
|
|
|
'MANUEL', 'MANUEL', None, |
3312
|
|
|
'MASSEU(RS)-', 'MASÖ', 'NAZÖ', |
3313
|
|
|
'MATCH', 'MESH', 'NEZ', |
3314
|
|
|
'MAURICE', 'MORIS', 'NURIZ', |
3315
|
|
|
'MBH^$', 'MBH', 'MBH', |
3316
|
|
|
'MB(ßZ)$', 'MS', None, |
3317
|
|
|
'MB(SßTZ)-', 'M', 'N', |
3318
|
|
|
'MCG9^', 'MAK', 'NAK', |
3319
|
|
|
'MC9^', 'MAK', 'NAK', |
3320
|
|
|
'MEMOIR-^', 'MEMOA', 'NENUA', |
3321
|
|
|
'MERHAVEN$', 'MAHAFN', None, |
3322
|
|
|
'ME(LMNRST)-3^', 'ME', 'NE', |
3323
|
|
|
'MEN(STZ)--3', 'ME', None, |
3324
|
|
|
'MEN$', 'MEN', None, |
3325
|
|
|
'MIGUEL-', 'MIGE', 'NIKE', |
3326
|
|
|
'MIKE^$', 'MEIK', 'NEIK', |
3327
|
|
|
'MITHILFE----^$', 'MIT H', 'NIT ', |
3328
|
|
|
'MN$', 'M', None, |
3329
|
|
|
'MN', 'N', 'N', |
3330
|
|
|
'MPJUTE-', 'MPUT', 'NBUT', |
3331
|
|
|
'MP(ßZ)$', 'MS', None, |
3332
|
|
|
'MP(SßTZ)-', 'M', 'N', |
3333
|
|
|
'MP(BDJLMNPQVW)-', 'MB', 'NB', |
3334
|
|
|
'MY9^', 'MÜ', None, |
3335
|
|
|
'M(ßZ)$', 'MS', None, |
3336
|
|
|
'M´G7^', 'MAK', 'NAK', |
3337
|
|
|
'M\'G7^', 'MAK', 'NAK', |
3338
|
|
|
'M´^', 'MAK', 'NAK', |
3339
|
|
|
'M\'^', 'MAK', 'NAK', |
3340
|
|
|
'M', None, 'N', |
3341
|
|
|
'NACH^^', 'NACH', 'NAK', |
3342
|
|
|
'NADINE', 'NADIN', 'NATIN', |
3343
|
|
|
'NAIV--', 'NA', 'NA', |
3344
|
|
|
'NAISE$', 'NESE', 'NEZE', |
3345
|
|
|
'NAUGENOMM------', 'NAU ', 'NAU ', |
3346
|
|
|
'NAUSOGUT$', 'NAUSO GUT', 'NAUZU KUT', |
3347
|
|
|
'NCH$', 'NSH', 'NZ', |
3348
|
|
|
'NCOISE$', 'SOA', 'ZUA', |
3349
|
|
|
'NCOIS$', 'SOA', 'ZUA', |
3350
|
|
|
'NDAR$', 'NDA', 'NTA', |
3351
|
|
|
'NDERINGEN------', 'NDE ', 'NTE ', |
3352
|
|
|
'NDRO(CDKTZ)-', 'NTRO', None, |
3353
|
|
|
'ND(BFGJLMNPQVW)-', 'NT', None, |
3354
|
|
|
'ND(SßZ)$', 'NS', 'NZ', |
3355
|
|
|
'ND\'S$', 'NS', 'NZ', |
3356
|
|
|
'ND´S$', 'NS', 'NZ', |
3357
|
|
|
'NEBEN^^', 'NEBN', 'NEBN', |
3358
|
|
|
'NENGELERN------', 'NEN ', 'NEN ', |
3359
|
|
|
'NENLERN(ET)---', 'NEN LE', 'NEN LE', |
3360
|
|
|
'NENZULERNE---', 'NEN ZU LE', 'NEN ZU LE', |
3361
|
|
|
'NE(LMNRST)-3^', 'NE', 'NE', |
3362
|
|
|
'NEN-3', 'NE', 'NE', |
3363
|
|
|
'NETTE$', 'NET', 'NET', |
3364
|
|
|
'NGU^^', 'NU', 'NU', |
3365
|
|
|
'NG(BDFJLMNPQRTVW)-', 'NK', 'NK', |
3366
|
|
|
'NH(AUO)-$', 'NI', 'NI', |
3367
|
|
|
'NICHTSAHNEN-----', 'NIX ', 'NIX ', |
3368
|
|
|
'NICHTSSAGE----', 'NIX ', 'NIX ', |
3369
|
|
|
'NICHTS^^', 'NIX', 'NIX', |
3370
|
|
|
'NICHT^^', 'NICHT', 'NIKT', |
3371
|
|
|
'NINE$', 'NIN', 'NIN', |
3372
|
|
|
'NON^^', 'NON', 'NUN', |
3373
|
|
|
'NOTLEIDE-----^', 'NOT ', 'NUT ', |
3374
|
|
|
'NOT^^', 'NOT', 'NUT', |
3375
|
|
|
'NTI(AIOU)-3', 'NZI', 'NZI', |
3376
|
|
|
'NTIEL--3', 'NZI', 'NZI', |
3377
|
|
|
'NT(SßZ)$', 'NS', 'NZ', |
3378
|
|
|
'NT\'S$', 'NS', 'NZ', |
3379
|
|
|
'NT´S$', 'NS', 'NZ', |
3380
|
|
|
'NYLON', 'NEILON', 'NEILUN', |
3381
|
|
|
'NY9^', 'NÜ', None, |
3382
|
|
|
'NSTZUNEH---', 'NST ZU ', 'NZT ZU ', |
3383
|
|
|
'NSZ-', 'NS', None, |
3384
|
|
|
'NSTS$', 'NS', 'NZ', |
3385
|
|
|
'NZ(BDFGKLMNPQRSTVWX)-', 'NS', None, |
3386
|
|
|
'N(SßZ)$', 'NS', None, |
3387
|
|
|
'OBERE-', 'OBER', None, |
3388
|
|
|
'OBER^^', 'OBA', 'UBA', |
3389
|
|
|
'OEU2', 'Ö', 'Ö', |
3390
|
|
|
'OE<2', 'Ö', 'Ö', |
3391
|
|
|
'OGL-', 'OK', None, |
3392
|
|
|
'OGNIE-', 'ONI', 'UNI', |
3393
|
|
|
'OGN(AEOU)-$', 'ONI', 'UNI', |
3394
|
|
|
'OH(AIOÖUÜY)-', 'OH', None, |
3395
|
|
|
'OIE$', 'Ö', 'Ö', |
3396
|
|
|
'OIRE$', 'OA', 'UA', |
3397
|
|
|
'OIR$', 'OA', 'UA', |
3398
|
|
|
'OIX', 'OA', 'UA', |
3399
|
|
|
'OI<3', 'EU', 'EU', |
3400
|
|
|
'OKAY^$', 'OKE', 'UKE', |
3401
|
|
|
'OLYN$', 'OLIN', 'ULIN', |
3402
|
|
|
'OO(DLMZ)-', 'U', None, |
3403
|
|
|
'OO$', 'U', None, |
3404
|
|
|
'OO-', '', '', |
3405
|
|
|
'ORGINAL-----', 'ORI', 'URI', |
3406
|
|
|
'OTI(AÄOÖUÜ)-', 'OZI', 'UZI', |
3407
|
|
|
'OUI^', 'WI', 'FI', |
3408
|
|
|
'OUILLE$', 'ULIE', 'ULIE', |
3409
|
|
|
'OU(DT)-^', 'AU', 'AU', |
3410
|
|
|
'OUSE$', 'AUS', 'AUZ', |
3411
|
|
|
'OUT-', 'AU', 'AU', |
3412
|
|
|
'OU', 'U', 'U', |
3413
|
|
|
'O(FV)$', 'AU', 'AU', # due to 'OW$' -> 'AU' |
3414
|
|
|
'OVER--<', 'OW', None, |
3415
|
|
|
'OV(AOU)-', 'OW', None, |
3416
|
|
|
'OW$', 'AU', 'AU', |
3417
|
|
|
'OWS$', 'OS', 'UZ', |
3418
|
|
|
'OJ(AÄEIOÖUÜ)--', 'O', 'U', |
3419
|
|
|
'OYER', 'OIA', None, |
3420
|
|
|
'OY(AÄEIOÖUÜ)--', 'O', 'U', |
3421
|
|
|
'O(JY)<', 'EU', 'EU', |
3422
|
|
|
'OZ$', 'OS', None, |
3423
|
|
|
'O´^', 'O', 'U', |
3424
|
|
|
'O\'^', 'O', 'U', |
3425
|
|
|
'O', None, 'U', |
3426
|
|
|
'PATIEN--^', 'PAZI', 'PAZI', |
3427
|
|
|
'PENSIO-^', 'PANSI', 'PANZI', |
3428
|
|
|
'PE(LMNRST)-3^', 'PE', 'PE', |
3429
|
|
|
'PFER-^', 'FE', 'FE', |
3430
|
|
|
'P(FH)<', 'F', 'F', |
3431
|
|
|
'PIC^$', 'PIK', 'PIK', |
3432
|
|
|
'PIC$', 'PIZ', 'PIZ', |
3433
|
|
|
'PIPELINE', 'PEIBLEIN', 'PEIBLEIN', |
3434
|
|
|
'POLYP-', 'POLÜ', None, |
3435
|
|
|
'POLY^^', 'POLI', 'PULI', |
3436
|
|
|
'PORTRAIT7', 'PORTRE', 'PURTRE', |
3437
|
|
|
'POWER7', 'PAUA', 'PAUA', |
3438
|
|
|
'PP(FH)--<', 'B', 'B', |
3439
|
|
|
'PP-', '', '', |
3440
|
|
|
'PRODUZ-^', 'PRODU', 'BRUTU', |
3441
|
|
|
'PRODUZI--', ' PRODU', ' BRUTU', |
3442
|
|
|
'PRIX^$', 'PRI', 'PRI', |
3443
|
|
|
'PS-^^', 'P', None, |
3444
|
|
|
'P(SßZ)^', None, 'Z', |
3445
|
|
|
'P(SßZ)$', 'BS', None, |
3446
|
|
|
'PT-^', '', '', |
3447
|
|
|
'PTI(AÄOÖUÜ)-3', 'BZI', 'BZI', |
3448
|
|
|
'PY9^', 'PÜ', None, |
3449
|
|
|
'P(AÄEIOÖRUÜY)-', 'P', 'P', |
3450
|
|
|
'P(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'P', None, |
3451
|
|
|
'P.^', None, 'P.', |
3452
|
|
|
'P^', 'P', None, |
3453
|
|
|
'P', 'B', 'B', |
3454
|
|
|
'QI-', 'Z', 'Z', |
3455
|
|
|
'QUARANT--', 'KARA', 'KARA', |
3456
|
|
|
'QUE(LMNRST)-3', 'KWE', 'KFE', |
3457
|
|
|
'QUE$', 'K', 'K', |
3458
|
|
|
'QUI(NS)$', 'KI', 'KI', |
3459
|
|
|
'QUIZ7', 'KWIS', None, |
3460
|
|
|
'Q(UV)7', 'KW', 'KF', |
3461
|
|
|
'Q<', 'K', 'K', |
3462
|
|
|
'RADFAHR----', 'RAT ', 'RAT ', |
3463
|
|
|
'RAEFTEZEHRE-----', 'REFTE ', 'REFTE ', |
3464
|
|
|
'RCH', 'RCH', 'RK', |
3465
|
|
|
'REA(DU)---3^', 'R', None, |
3466
|
|
|
'REBSERZEUG------', 'REBS ', 'REBZ ', |
3467
|
|
|
'RECHERCH^', 'RESHASH', 'REZAZ', |
3468
|
|
|
'RECYCL--', 'RIZEI', 'RIZEI', |
3469
|
|
|
'RE(ALST)-3^', 'RE', None, |
3470
|
|
|
'REE$', 'RI', 'RI', |
3471
|
|
|
'RER$', 'RA', 'RA', |
3472
|
|
|
'RE(MNR)-4', 'RE', 'RE', |
3473
|
|
|
'RETTE$', 'RET', 'RET', |
3474
|
|
|
'REUZ$', 'REUZ', None, |
3475
|
|
|
'REW$', 'RU', 'RU', |
3476
|
|
|
'RH<^', 'R', 'R', |
3477
|
|
|
'RJA(MN)--', 'RI', 'RI', |
3478
|
|
|
'ROWD-^', 'RAU', 'RAU', |
3479
|
|
|
'RTEMONNAIE-', 'RTMON', 'RTNUN', |
3480
|
|
|
'RTI(AÄOÖUÜ)-3', 'RZI', 'RZI', |
3481
|
|
|
'RTIEL--3', 'RZI', 'RZI', |
3482
|
|
|
'RV(AEOU)-3', 'RW', None, |
3483
|
|
|
'RY(KN)-$', 'RI', 'RI', |
3484
|
|
|
'RY9^', 'RÜ', None, |
3485
|
|
|
'RÄFTEZEHRE-----', 'REFTE ', 'REFTE ', |
3486
|
|
|
'SAISO-^', 'SES', 'ZEZ', |
3487
|
|
|
'SAFE^$', 'SEIF', 'ZEIF', |
3488
|
|
|
'SAUCE-^', 'SOS', 'ZUZ', |
3489
|
|
|
'SCHLAGGEBEN-----<', 'SHLAK ', 'ZLAK ', |
3490
|
|
|
'SCHSCH---7', '', '', |
3491
|
|
|
'SCHTSCH', 'SH', 'Z', |
3492
|
|
|
'SC(HZ)<', 'SH', 'Z', |
3493
|
|
|
'SC', 'SK', 'ZK', |
3494
|
|
|
'SELBSTST--7^^', 'SELB', 'ZELB', |
3495
|
|
|
'SELBST7^^', 'SELBST', 'ZELBZT', |
3496
|
|
|
'SERVICE7^', 'SÖRWIS', 'ZÖRFIZ', |
3497
|
|
|
'SERVI-^', 'SERW', None, |
3498
|
|
|
'SE(LMNRST)-3^', 'SE', 'ZE', |
3499
|
|
|
'SETTE$', 'SET', 'ZET', |
3500
|
|
|
'SHP-^', 'S', 'Z', |
3501
|
|
|
'SHST', 'SHT', 'ZT', |
3502
|
|
|
'SHTSH', 'SH', 'Z', |
3503
|
|
|
'SHT', 'ST', 'Z', |
3504
|
|
|
'SHY9^', 'SHÜ', None, |
3505
|
|
|
'SH^^', 'SH', None, |
3506
|
|
|
'SH3', 'SH', 'Z', |
3507
|
|
|
'SICHERGEGAN-----^', 'SICHA ', 'ZIKA ', |
3508
|
|
|
'SICHERGEHE----^', 'SICHA ', 'ZIKA ', |
3509
|
|
|
'SICHERGESTEL------^', 'SICHA ', 'ZIKA ', |
3510
|
|
|
'SICHERSTELL-----^', 'SICHA ', 'ZIKA ', |
3511
|
|
|
'SICHERZU(GS)--^', 'SICHA ZU ', 'ZIKA ZU ', |
3512
|
|
|
'SIEGLI-^', 'SIKL', 'ZIKL', |
3513
|
|
|
'SIGLI-^', 'SIKL', 'ZIKL', |
3514
|
|
|
'SIGHT', 'SEIT', 'ZEIT', |
3515
|
|
|
'SIGN', 'SEIN', 'ZEIN', |
3516
|
|
|
'SKI(NPZ)-', 'SKI', 'ZKI', |
3517
|
|
|
'SKI<^', 'SHI', 'ZI', |
3518
|
|
|
'SODASS^$', 'SO DAS', 'ZU TAZ', |
3519
|
|
|
'SODAß^$', 'SO DAS', 'ZU TAZ', |
3520
|
|
|
'SOGENAN--^', 'SO GEN', 'ZU KEN', |
3521
|
|
|
'SOUND-', 'SAUN', 'ZAUN', |
3522
|
|
|
'STAATS^^', 'STAZ', 'ZTAZ', |
3523
|
|
|
'STADT^^', 'STAT', 'ZTAT', |
3524
|
|
|
'STANDE$', ' STANDE', ' ZTANTE', |
3525
|
|
|
'START^^', 'START', 'ZTART', |
3526
|
|
|
'STAURANT7', 'STORAN', 'ZTURAN', |
3527
|
|
|
'STEAK-', 'STE', 'ZTE', |
3528
|
|
|
'STEPHEN-^$', 'STEW', None, |
3529
|
|
|
'STERN', 'STERN', None, |
3530
|
|
|
'STRAF^^', 'STRAF', 'ZTRAF', |
3531
|
|
|
'ST\'S$', 'Z', 'Z', |
3532
|
|
|
'ST´S$', 'Z', 'Z', |
3533
|
|
|
'STST--', '', '', |
3534
|
|
|
'STS(ACEÈÉÊHIÌÍÎOUÄÜÖ)--', 'ST', 'ZT', |
3535
|
|
|
'ST(SZ)', 'Z', 'Z', |
3536
|
|
|
'SPAREN---^', 'SPA', 'ZPA', |
3537
|
|
|
'SPAREND----', ' SPA', ' ZPA', |
3538
|
|
|
'S(PTW)-^^', 'S', None, |
3539
|
|
|
'SP', 'SP', None, |
3540
|
|
|
'STYN(AE)-$', 'STIN', 'ZTIN', |
3541
|
|
|
'ST', 'ST', 'ZT', |
3542
|
|
|
'SUITE<', 'SIUT', 'ZIUT', |
3543
|
|
|
'SUKE--$', 'S', 'Z', |
3544
|
|
|
'SURF(EI)-', 'SÖRF', 'ZÖRF', |
3545
|
|
|
'SV(AEÈÉÊIÌÍÎOU)-<^', 'SW', None, |
3546
|
|
|
'SYB(IY)--^', 'SIB', None, |
3547
|
|
|
'SYL(KVW)--^', 'SI', None, |
3548
|
|
|
'SY9^', 'SÜ', None, |
3549
|
|
|
'SZE(NPT)-^', 'ZE', 'ZE', |
3550
|
|
|
'SZI(ELN)-^', 'ZI', 'ZI', |
3551
|
|
|
'SZCZ<', 'SH', 'Z', |
3552
|
|
|
'SZT<', 'ST', 'ZT', |
3553
|
|
|
'SZ<3', 'SH', 'Z', |
3554
|
|
|
'SÜL(KVW)--^', 'SI', None, |
3555
|
|
|
'S', None, 'Z', |
3556
|
|
|
'TCH', 'SH', 'Z', |
3557
|
|
|
'TD(AÄEIOÖRUÜY)-', 'T', None, |
3558
|
|
|
'TD(ÀÁÂÃÅÈÉÊËÌÍÎÏÒÓÔÕØÙÚÛÝŸ)-', 'T', None, |
3559
|
|
|
'TEAT-^', 'TEA', 'TEA', |
3560
|
|
|
'TERRAI7^', 'TERA', 'TERA', |
3561
|
|
|
'TE(LMNRST)-3^', 'TE', 'TE', |
3562
|
|
|
'TH<', 'T', 'T', |
3563
|
|
|
'TICHT-', 'TIK', 'TIK', |
3564
|
|
|
'TICH$', 'TIK', 'TIK', |
3565
|
|
|
'TIC$', 'TIZ', 'TIZ', |
3566
|
|
|
'TIGGESTELL-------', 'TIK ', 'TIK ', |
3567
|
|
|
'TIGSTELL-----', 'TIK ', 'TIK ', |
3568
|
|
|
'TOAS-^', 'TO', 'TU', |
3569
|
|
|
'TOILET-', 'TOLE', 'TULE', |
3570
|
|
|
'TOIN-', 'TOA', 'TUA', |
3571
|
|
|
'TRAECHTI-^', 'TRECHT', 'TREKT', |
3572
|
|
|
'TRAECHTIG--', ' TRECHT', ' TREKT', |
3573
|
|
|
'TRAINI-', 'TREN', 'TREN', |
3574
|
|
|
'TRÄCHTI-^', 'TRECHT', 'TREKT', |
3575
|
|
|
'TRÄCHTIG--', ' TRECHT', ' TREKT', |
3576
|
|
|
'TSCH', 'SH', 'Z', |
3577
|
|
|
'TSH', 'SH', 'Z', |
3578
|
|
|
'TST', 'ZT', 'ZT', |
3579
|
|
|
'T(Sß)', 'Z', 'Z', |
3580
|
|
|
'TT(SZ)--<', '', '', |
3581
|
|
|
'TT9', 'T', 'T', |
3582
|
|
|
'TV^$', 'TV', 'TV', |
3583
|
|
|
'TX(AEIOU)-3', 'SH', 'Z', |
3584
|
|
|
'TY9^', 'TÜ', None, |
3585
|
|
|
'TZ-', '', '', |
3586
|
|
|
'T\'S3$', 'Z', 'Z', |
3587
|
|
|
'T´S3$', 'Z', 'Z', |
3588
|
|
|
'UEBEL(GNRW)-^^', 'ÜBL ', 'IBL ', |
3589
|
|
|
'UEBER^^', 'ÜBA', 'IBA', |
3590
|
|
|
'UE2', 'Ü', 'I', |
3591
|
|
|
'UGL-', 'UK', None, |
3592
|
|
|
'UH(AOÖUÜY)-', 'UH', None, |
3593
|
|
|
'UIE$', 'Ü', 'I', |
3594
|
|
|
'UM^^', 'UM', 'UN', |
3595
|
|
|
'UNTERE--3', 'UNTE', 'UNTE', |
3596
|
|
|
'UNTER^^', 'UNTA', 'UNTA', |
3597
|
|
|
'UNVER^^', 'UNFA', 'UNFA', |
3598
|
|
|
'UN^^', 'UN', 'UN', |
3599
|
|
|
'UTI(AÄOÖUÜ)-', 'UZI', 'UZI', |
3600
|
|
|
'UVE-4', 'UW', None, |
3601
|
|
|
'UY2', 'UI', None, |
3602
|
|
|
'UZZ', 'AS', 'AZ', |
3603
|
|
|
'VACL-^', 'WAZ', 'FAZ', |
3604
|
|
|
'VAC$', 'WAZ', 'FAZ', |
3605
|
|
|
'VAN DEN ^', 'FANDN', 'FANTN', |
3606
|
|
|
'VANES-^', 'WANE', None, |
3607
|
|
|
'VATRO-', 'WATR', None, |
3608
|
|
|
'VA(DHJNT)--^', 'F', None, |
3609
|
|
|
'VEDD-^', 'FE', 'FE', |
3610
|
|
|
'VE(BEHIU)--^', 'F', None, |
3611
|
|
|
'VEL(BDLMNT)-^', 'FEL', None, |
3612
|
|
|
'VENTZ-^', 'FEN', None, |
3613
|
|
|
'VEN(NRSZ)-^', 'FEN', None, |
3614
|
|
|
'VER(AB)-^$', 'WER', None, |
3615
|
|
|
'VERBAL^$', 'WERBAL', None, |
3616
|
|
|
'VERBAL(EINS)-^', 'WERBAL', None, |
3617
|
|
|
'VERTEBR--', 'WERTE', None, |
3618
|
|
|
'VEREIN-----', 'F', None, |
3619
|
|
|
'VEREN(AEIOU)-^', 'WEREN', None, |
3620
|
|
|
'VERIFI', 'WERIFI', None, |
3621
|
|
|
'VERON(AEIOU)-^', 'WERON', None, |
3622
|
|
|
'VERSEN^', 'FERSN', 'FAZN', |
3623
|
|
|
'VERSIERT--^', 'WERSI', None, |
3624
|
|
|
'VERSIO--^', 'WERS', None, |
3625
|
|
|
'VERSUS', 'WERSUS', None, |
3626
|
|
|
'VERTI(GK)-', 'WERTI', None, |
3627
|
|
|
'VER^^', 'FER', 'FA', |
3628
|
|
|
'VERSPRECHE-------', ' FER', ' FA', |
3629
|
|
|
'VER$', 'WA', None, |
3630
|
|
|
'VER', 'FA', 'FA', |
3631
|
|
|
'VET(HT)-^', 'FET', 'FET', |
3632
|
|
|
'VETTE$', 'WET', 'FET', |
3633
|
|
|
'VE^', 'WE', None, |
3634
|
|
|
'VIC$', 'WIZ', 'FIZ', |
3635
|
|
|
'VIELSAGE----', 'FIL ', 'FIL ', |
3636
|
|
|
'VIEL', 'FIL', 'FIL', |
3637
|
|
|
'VIEW', 'WIU', 'FIU', |
3638
|
|
|
'VILL(AE)-', 'WIL', None, |
3639
|
|
|
'VIS(ACEIKUVWZ)-<^', 'WIS', None, |
3640
|
|
|
'VI(ELS)--^', 'F', None, |
3641
|
|
|
'VILLON--', 'WILI', 'FILI', |
3642
|
|
|
'VIZE^^', 'FIZE', 'FIZE', |
3643
|
|
|
'VLIE--^', 'FL', None, |
3644
|
|
|
'VL(AEIOU)--', 'W', None, |
3645
|
|
|
'VOKA-^', 'WOK', None, |
3646
|
|
|
'VOL(ATUVW)--^', 'WO', None, |
3647
|
|
|
'VOR^^', 'FOR', 'FUR', |
3648
|
|
|
'VR(AEIOU)--', 'W', None, |
3649
|
|
|
'VV9', 'W', None, |
3650
|
|
|
'VY9^', 'WÜ', 'FI', |
3651
|
|
|
'V(ÜY)-', 'W', None, |
3652
|
|
|
'V(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'W', None, |
3653
|
|
|
'V(AEIJLRU)-<', 'W', None, |
3654
|
|
|
'V.^', 'V.', None, |
3655
|
|
|
'V<', 'F', 'F', |
3656
|
|
|
'WEITERENTWI-----^', 'WEITA ', 'FEITA ', |
3657
|
|
|
'WEITREICH-----^', 'WEIT ', 'FEIT ', |
3658
|
|
|
'WEITVER^', 'WEIT FER', 'FEIT FA', |
3659
|
|
|
'WE(LMNRST)-3^', 'WE', 'FE', |
3660
|
|
|
'WER(DST)-', 'WER', None, |
3661
|
|
|
'WIC$', 'WIZ', 'FIZ', |
3662
|
|
|
'WIEDERU--', 'WIDE', 'FITE', |
3663
|
|
|
'WIEDER^$', 'WIDA', 'FITA', |
3664
|
|
|
'WIEDER^^', 'WIDA ', 'FITA ', |
3665
|
|
|
'WIEVIEL', 'WI FIL', 'FI FIL', |
3666
|
|
|
'WISUEL', 'WISUEL', None, |
3667
|
|
|
'WR-^', 'W', None, |
3668
|
|
|
'WY9^', 'WÜ', 'FI', |
3669
|
|
|
'W(BDFGJKLMNPQRSTZ)-', 'F', None, |
3670
|
|
|
'W$', 'F', None, |
3671
|
|
|
'W', None, 'F', |
3672
|
|
|
'X<^', 'Z', 'Z', |
3673
|
|
|
'XHAVEN$', 'XAFN', None, |
3674
|
|
|
'X(CSZ)', 'X', 'X', |
3675
|
|
|
'XTS(CH)--', 'XT', 'XT', |
3676
|
|
|
'XT(SZ)', 'Z', 'Z', |
3677
|
|
|
'YE(LMNRST)-3^', 'IE', 'IE', |
3678
|
|
|
'YE-3', 'I', 'I', |
3679
|
|
|
'YOR(GK)^$', 'IÖRK', 'IÖRK', |
3680
|
|
|
'Y(AOU)-<7', 'I', 'I', |
3681
|
|
|
'Y(BKLMNPRSTX)-1', 'Ü', None, |
3682
|
|
|
'YVES^$', 'IF', 'IF', |
3683
|
|
|
'YVONNE^$', 'IWON', 'IFUN', |
3684
|
|
|
'Y.^', 'Y.', None, |
3685
|
|
|
'Y', 'I', 'I', |
3686
|
|
|
'ZC(AOU)-', 'SK', 'ZK', |
3687
|
|
|
'ZE(LMNRST)-3^', 'ZE', 'ZE', |
3688
|
|
|
'ZIEJ$', 'ZI', 'ZI', |
3689
|
|
|
'ZIGERJA(HR)-3', 'ZIGA IA', 'ZIKA IA', |
3690
|
|
|
'ZL(AEIOU)-', 'SL', None, |
3691
|
|
|
'ZS(CHT)--', '', '', |
3692
|
|
|
'ZS', 'SH', 'Z', |
3693
|
|
|
'ZUERST', 'ZUERST', 'ZUERST', |
3694
|
|
|
'ZUGRUNDE^$', 'ZU GRUNDE', 'ZU KRUNTE', |
3695
|
|
|
'ZUGRUNDE', 'ZU GRUNDE ', 'ZU KRUNTE ', |
3696
|
|
|
'ZUGUNSTEN', 'ZU GUNSTN', 'ZU KUNZTN', |
3697
|
|
|
'ZUHAUSE-', 'ZU HAUS', 'ZU AUZ', |
3698
|
|
|
'ZULASTEN^$', 'ZU LASTN', 'ZU LAZTN', |
3699
|
|
|
'ZURUECK^^', 'ZURÜK', 'ZURIK', |
3700
|
|
|
'ZURZEIT', 'ZUR ZEIT', 'ZUR ZEIT', |
3701
|
|
|
'ZURÜCK^^', 'ZURÜK', 'ZURIK', |
3702
|
|
|
'ZUSTANDE', 'ZU STANDE', 'ZU ZTANTE', |
3703
|
|
|
'ZUTAGE', 'ZU TAGE', 'ZU TAKE', |
3704
|
|
|
'ZUVER^^', 'ZUFA', 'ZUFA', |
3705
|
|
|
'ZUVIEL', 'ZU FIL', 'ZU FIL', |
3706
|
|
|
'ZUWENIG', 'ZU WENIK', 'ZU FENIK', |
3707
|
|
|
'ZY9^', 'ZÜ', None, |
3708
|
|
|
'ZYK3$', 'ZIK', None, |
3709
|
|
|
'Z(VW)7^', 'SW', None, |
3710
|
|
|
None, None, None) |
3711
|
|
|
|
3712
|
|
|
# Rule-lookup tables shared (via closure) by _initialize_phonet and
# _phonet below. Counter is used as an int-valued defaultdict here:
# missing keys read as 0 rather than raising KeyError.
phonet_hash = Counter()
alpha_pos = Counter()

phonet_hash_1 = Counter()
phonet_hash_2 = Counter()
3717
|
|
|
|
3718
|
|
|
# Translation table (for str.translate) mapping lowercase Latin letters,
# including accented forms, to their uppercase counterparts; 'ß' maps to
# itself.
_phonet_upper_translation = dict(zip((ord(_) for _ in
                                      'abcdefghijklmnopqrstuvwxyzàáâãåäæ' +
                                      'çðèéêëìíîïñòóôõöøœšßþùúûüýÿ'),
                                     'ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÅÄÆ' +
                                     'ÇÐÈÉÊËÌÍÎÏÑÒÓÔÕÖØŒŠßÞÙÚÛÜÝŸ'))
3723
|
|
|
|
3724
|
|
|
def _initialize_phonet(lang):
    """Initialize the phonet rule-lookup tables for the given language.

    Populates the enclosing scope's ``alpha_pos``, ``phonet_hash``,
    ``phonet_hash_1``, and ``phonet_hash_2`` tables so that ``_phonet``
    can jump directly to the first/last candidate rule for a given
    letter (and following letter) instead of scanning every rule.

    :param str lang: 'none' selects the language-independent rule set;
        any other value selects the German rule set
    """
    if lang == 'none':
        _phonet_rules = _phonet_rules_no_lang
    else:
        _phonet_rules = _phonet_rules_german

    phonet_hash[''] = -1

    # German and international umlauts
    for j in {'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë',
              'Ì', 'Í', 'Î', 'Ï', 'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', 'Ø',
              'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß', 'Œ', 'Š', 'Ÿ'}:
        alpha_pos[j] = 1
        phonet_hash[j] = -1

    # "normal" letters ('A'-'Z'); alpha_pos values 2..27
    for i, j in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ'):
        alpha_pos[j] = i + 2
        phonet_hash[j] = -1

    # initialize the two-level hash tables to "no rule" (-1)
    for i in range(26):
        for j in range(28):
            phonet_hash_1[i, j] = -1
            phonet_hash_2[i, j] = -1

    # for each phonetic rule (rules are stored as triplets, so only every
    # third index starts a rule pattern)
    for i in range(len(_phonet_rules)):
        rule = _phonet_rules[i]

        if rule and i % 3 == 0:
            # calculate first hash value
            k = _phonet_rules[i][0]

            if phonet_hash[k] < 0 and (_phonet_rules[i+1] or
                                       _phonet_rules[i+2]):
                phonet_hash[k] = i

            # calculate second hash values, keyed on the rule's first
            # letter and the letter(s) that may follow it
            if k and alpha_pos[k] >= 2:
                k = alpha_pos[k]

                j = k-2
                rule = rule[1:]

                if not rule:
                    rule = ' '
                elif rule[0] == '(':
                    rule = rule[1:]
                else:
                    rule = rule[0]

                while rule and (rule[0] != ')'):
                    k = alpha_pos[rule[0]]

                    if k > 0:
                        # add hash value for this letter
                        if phonet_hash_1[j, k] < 0:
                            phonet_hash_1[j, k] = i
                            phonet_hash_2[j, k] = i

                        if phonet_hash_2[j, k] >= (i-30):
                            phonet_hash_2[j, k] = i
                        else:
                            k = -1

                    if k <= 0:
                        # add hash value for all letters
                        if phonet_hash_1[j, 0] < 0:
                            phonet_hash_1[j, 0] = i

                        phonet_hash_2[j, 0] = i

                    rule = rule[1:]
3798
|
|
|
|
3799
|
|
|
def _phonet(term, mode, lang):
    """Return the phonet coded form of a term.

    Scans ``term`` left to right; at each position the hash tables built
    by ``_initialize_phonet`` give the first/last candidate rules in
    ``_phonet_rules`` (stored as triplets: pattern, first-mode
    replacement, second-mode replacement), and the best applicable rule
    is applied. ``src`` and ``dest`` are edited in place via slicing,
    mirroring C-style index bookkeeping.

    :param str term: the word to encode
    :param int mode: which replacement column of the rule triplets to
        use (1 or 2)
    :param str lang: 'none' for the language-independent rules;
        any other value selects the German rules
    """
    if lang == 'none':
        _phonet_rules = _phonet_rules_no_lang
    else:
        _phonet_rules = _phonet_rules_german

    char0 = ''
    dest = term

    if not term:
        return ''

    term_length = len(term)

    # convert input string to upper-case
    src = term.translate(_phonet_upper_translation)

    # check "src"
    i = 0
    j = 0
    zeta = 0

    while i < len(src):
        char = src[i]

        pos = alpha_pos[char]

        if pos >= 2:
            # char is A-Z: use the two-level hash keyed on this char and
            # the following one
            xpos = pos-2

            if i+1 == len(src):
                pos = alpha_pos['']
            else:
                pos = alpha_pos[src[i+1]]

            start1 = phonet_hash_1[xpos, pos]
            start2 = phonet_hash_1[xpos, 0]
            end1 = phonet_hash_2[xpos, pos]
            end2 = phonet_hash_2[xpos, 0]

            # preserve rule priorities
            if (start2 >= 0) and ((start1 < 0) or (start2 < start1)):
                pos = start1
                start1 = start2
                start2 = pos
                pos = end1
                end1 = end2
                end2 = pos

            if (end1 >= start2) and (start2 >= 0):
                if end2 > end1:
                    end1 = end2

                start2 = -1
                end2 = -1
        else:
            # char is not A-Z: fall back to the single-level hash
            pos = phonet_hash[char]
            start1 = pos
            end1 = 10000
            start2 = -1
            end2 = -1

        pos = start1
        zeta0 = 0

        if pos >= 0:
            # check rules for this char
            while ((_phonet_rules[pos] is None) or
                   (_phonet_rules[pos][0] == char)):
                if pos > end1:
                    if start2 > 0:
                        pos = start2
                        start1 = start2
                        start2 = -1
                        end1 = end2
                        end2 = -1
                        continue

                    break

                if (((_phonet_rules[pos] is None) or
                     (_phonet_rules[pos + mode] is None))):
                    # no conversion rule available
                    pos += 3
                    continue

                # check whole string
                matches = 1  # number of matching letters
                priority = 5  # default priority
                rule = _phonet_rules[pos]
                rule = rule[1:]

                while (rule and
                       (len(src) > (i + matches)) and
                       (src[i + matches] == rule[0]) and
                       not rule[0].isdigit() and
                       (rule not in '(-<^$')):
                    matches += 1
                    rule = rule[1:]

                if rule and (rule[0] == '('):
                    # check an array of letters
                    if (((len(src) > (i + matches)) and
                         src[i + matches].isalpha() and
                         (src[i + matches] in rule[1:]))):
                        matches += 1

                    while rule and rule[0] != ')':
                        rule = rule[1:]

                    # if rule[0] == ')':
                    rule = rule[1:]

                if rule:
                    priority0 = ord(rule[0])
                else:
                    priority0 = 0

                matches0 = matches

                while rule and rule[0] == '-' and matches > 1:
                    matches -= 1
                    rule = rule[1:]

                if rule and rule[0] == '<':
                    rule = rule[1:]

                if rule and rule[0].isdigit():
                    # read priority
                    priority = int(rule[0])
                    rule = rule[1:]

                if rule and rule[0:2] == '^^':
                    rule = rule[1:]

                # rule applies if nothing remains, or the '^' (word
                # start) / '$' (word end) anchors are satisfied
                if (not rule or
                        ((rule[0] == '^') and
                         ((i == 0) or not src[i-1].isalpha()) and
                         ((rule[1:2] != '$') or
                          (not (src[i+matches0:i+matches0+1].isalpha()) and
                           (src[i+matches0:i+matches0+1] != '.')))) or
                        ((rule[0] == '$') and (i > 0) and
                         src[i-1].isalpha() and
                         ((not src[i+matches0:i+matches0+1].isalpha()) and
                          (src[i+matches0:i+matches0+1] != '.')))):
                    # look for continuation, if:
                    # matches > 1 und NO '-' in first string */
                    pos0 = -1

                    start3 = 0
                    start4 = 0
                    end3 = 0
                    end4 = 0

                    if (((matches > 1) and
                         src[i+matches:i+matches+1] and
                         (priority0 != ord('-')))):
                        char0 = src[i+matches-1]
                        pos0 = alpha_pos[char0]

                        if pos0 >= 2 and src[i+matches]:
                            xpos = pos0 - 2
                            pos0 = alpha_pos[src[i+matches]]
                            start3 = phonet_hash_1[xpos, pos0]
                            start4 = phonet_hash_1[xpos, 0]
                            end3 = phonet_hash_2[xpos, pos0]
                            end4 = phonet_hash_2[xpos, 0]

                            # preserve rule priorities
                            if (((start4 >= 0) and
                                 ((start3 < 0) or (start4 < start3)))):
                                pos0 = start3
                                start3 = start4
                                start4 = pos0
                                pos0 = end3
                                end3 = end4
                                end4 = pos0

                            if (end3 >= start4) and (start4 >= 0):
                                if end4 > end3:
                                    end3 = end4

                                start4 = -1
                                end4 = -1
                        else:
                            pos0 = phonet_hash[char0]
                            start3 = pos0
                            end3 = 10000
                            start4 = -1
                            end4 = -1

                        pos0 = start3

                    # check continuation rules for src[i+matches]
                    if pos0 >= 0:
                        while ((_phonet_rules[pos0] is None) or
                               (_phonet_rules[pos0][0] == char0)):
                            if pos0 > end3:
                                if start4 > 0:
                                    pos0 = start4
                                    start3 = start4
                                    start4 = -1
                                    end3 = end4
                                    end4 = -1
                                    continue

                                priority0 = -1

                                # important
                                break

                            if (((_phonet_rules[pos0] is None) or
                                 (_phonet_rules[pos0 + mode] is None))):
                                # no conversion rule available
                                pos0 += 3
                                continue

                            # check whole string
                            matches0 = matches
                            priority0 = 5
                            rule = _phonet_rules[pos0]
                            rule = rule[1:]

                            while (rule and
                                   (src[i+matches0:i+matches0+1] ==
                                    rule[0]) and
                                   (not rule[0].isdigit() or
                                    (rule in '(-<^$'))):
                                matches0 += 1
                                rule = rule[1:]

                            if rule and rule[0] == '(':
                                # check an array of letters
                                if ((src[i+matches0:i+matches0+1]
                                     .isalpha() and
                                     (src[i+matches0] in rule[1:]))):
                                    matches0 += 1

                                while rule and rule[0] != ')':
                                    rule = rule[1:]

                                # if rule[0] == ')':
                                rule = rule[1:]

                            while rule and rule[0] == '-':
                                # "matches0" is NOT decremented
                                # because of "if (matches0 == matches)"
                                rule = rule[1:]

                            if rule and rule[0] == '<':
                                rule = rule[1:]

                            if rule and rule[0].isdigit():
                                priority0 = int(rule[0])
                                rule = rule[1:]

                            if (not rule or
                                    # rule == '^' is not possible here
                                    ((rule[0] == '$') and not
                                     src[i+matches0:i+matches0+1]
                                     .isalpha() and
                                     (src[i+matches0:i+matches0+1]
                                      != '.'))):
                                if matches0 == matches:
                                    # this is only a partial string
                                    pos0 += 3
                                    continue

                                if priority0 < priority:
                                    # priority is too low
                                    pos0 += 3
                                    continue

                                # continuation rule found
                                break

                            pos0 += 3

                        # end of "while"
                        if ((priority0 >= priority) and
                                ((_phonet_rules[pos0] is not None) and
                                 (_phonet_rules[pos0][0] == char0))):

                            pos += 3
                            continue

                    # replace string
                    if ((_phonet_rules[pos] and
                         ('<' in _phonet_rules[pos][1:]))):
                        priority0 = 1
                    else:
                        priority0 = 0

                    rule = _phonet_rules[pos + mode]

                    if (priority0 == 1) and (zeta == 0):
                        # rule with '<' is applied: rewrite src in place
                        if ((j > 0) and rule and
                                ((dest[j-1] == char) or
                                 (dest[j-1] == rule[0]))):
                            j -= 1

                        zeta0 = 1
                        zeta += 1
                        matches0 = 0

                        while rule and src[i+matches0]:
                            src = (src[0:i+matches0] + rule[0] +
                                   src[i+matches0+1:])
                            matches0 += 1
                            rule = rule[1:]

                        if matches0 < matches:
                            src = (src[0:i+matches0] +
                                   src[i+matches:])

                        char = src[i]
                    else:
                        # ordinary rule: write the replacement to dest
                        i = i + matches - 1
                        zeta = 0

                        while len(rule) > 1:
                            if (j == 0) or (dest[j - 1] != rule[0]):
                                dest = (dest[0:j] + rule[0] +
                                        dest[min(len(dest), j+1):])
                                j += 1

                            rule = rule[1:]

                        # new "current char"
                        if not rule:
                            rule = ''
                            char = ''
                        else:
                            char = rule[0]

                        if ((_phonet_rules[pos] and
                             '^^' in _phonet_rules[pos][1:])):
                            if char:
                                dest = (dest[0:j] + char +
                                        dest[min(len(dest), j + 1):])
                                j += 1

                            src = src[i + 1:]
                            i = 0
                            zeta0 = 1

                    break

                pos += 3

                if pos > end1 and start2 > 0:
                    pos = start2
                    start1 = start2
                    end1 = end2
                    start2 = -1
                    end2 = -1

        if zeta0 == 0:
            if char and ((j == 0) or (dest[j-1] != char)):
                # delete multiple letters only
                dest = dest[0:j] + char + dest[min(j+1, term_length):]
                j += 1

            i += 1
            zeta = 0

    dest = dest[0:j]

    return dest
4170
|
|
|
|
4171
|
|
|
# build the rule-lookup tables, then encode the NFKC-normalized word
_initialize_phonet(lang)

word = normalize('NFKC', text_type(word))
return _phonet(word, mode, lang)
4175
|
|
|
|
4176
|
|
|
|
4177
|
|
|
def spfc(word):
    """Return the Standardized Phonetic Frequency Code (SPFC) of a word.

    Standardized Phonetic Frequency Code is roughly Soundex-like.
    This implementation is based on page 19-21 of :cite:`Moore:1977`.

    :param str word: the word to transform
    :returns: the SPFC value
    :rtype: str

    >>> spfc('Christopher Smith')
    '01160'
    >>> spfc('Christopher Schmidt')
    '01160'
    >>> spfc('Niall Smith')
    '01660'
    >>> spfc('Niall Schmidt')
    '01660'

    >>> spfc('L.Smith')
    '01960'
    >>> spfc('R.Miller')
    '65490'

    >>> spfc(('L', 'Smith'))
    '01960'
    >>> spfc(('R', 'Miller'))
    '65490'
    """
    def _table(letters, digits):
        """Build an ord(letter) -> digit-character translation table."""
        return {ord(letter): digit for letter, digit in zip(letters, digits)}

    # coding tables PF1, PF2, and PF3 from Moore:1977
    _pf1 = _table('SZCKQVFPUWABLORDHIEMNXGJT', '0011112222334445556666777')
    _pf2 = _table('SZCKQFPXABORDHIMNGJTUVWEL', '0011122233445556677788899')
    _pf3 = _table('BCKQVDTFLPGJXMNRSZAEHIOUWY', '00000112223334456677777777')

    _substitutions = (('DK', 'K'), ('DT', 'T'), ('SC', 'S'), ('KN', 'N'),
                      ('MN', 'N'))

    def _invalid_word():
        """Raise an AttributeError for an unusable ``word`` argument."""
        raise AttributeError('word attribute must be a string with a space ' +
                             'or period dividing the first and last names ' +
                             'or a tuple/list consisting of the first and ' +
                             'last names')

    if not word:
        return ''

    # split into (first name, surname): a period takes precedence over a
    # space as separator; otherwise a 2-element iterable is accepted
    if isinstance(word, (str, text_type)):
        names = word.split('.', 1)
        if len(names) != 2:
            names = word.split(' ', 1)
            if len(names) != 2:
                _invalid_word()
    elif hasattr(word, '__iter__'):
        if len(word) != 2:
            _invalid_word()
        names = word
    else:
        _invalid_word()

    names = [normalize('NFKD', text_type(name.strip()
                                         .replace('ß', 'SS')
                                         .upper()))
             for name in names]

    code = ''

    def _condense(name):
        """Apply SPFC steps 1-3 to a single name."""
        # keep only the letters A-Z
        name = ''.join(ch for ch in name if 'A' <= ch <= 'Z')

        # 1. In the field, convert DK to K, DT to T, SC to S, KN to N,
        # and MN to N
        for pattern, replacement in _substitutions:
            name = name.replace(pattern, replacement)

        # 2. In the name field, replace multiple letters with a single letter
        name = _delete_consecutive_repeats(name)

        # 3. Remove vowels, W, H, and Y, but keep the first letter in the
        # name field.
        if name:
            name = name[0] + ''.join(ch for ch in name[1:]
                                     if ch not in 'AEHIOUWY')
        return name

    names = [_condense(name) for name in names]

    # 4. The first digit of the code is obtained using PF1 and the first
    # letter of the name field. Remove this letter after coding.
    if names[1]:
        code += names[1][0].translate(_pf1)
        names[1] = names[1][1:]

    # 5. Using the last letters of the name, use Table PF3 to obtain the
    # second digit of the code. Use as many letters as possible and remove
    # after coding. (Checked in order of decreasing specificity, matching
    # the ordering in Moore:1977.)
    if names[1]:
        for suffix, digit in (('STN', '8'), ('PRS', '8'), ('SN', '8'),
                              ('STR', '9'), ('SR', '9'), ('TN', '9'),
                              ('TD', '9'), ('DRS', '7'), ('TR', '7'),
                              ('MN', '7')):
            if names[1].endswith(suffix):
                code += digit
                names[1] = names[1][:-len(suffix)]
                break
        else:
            code += names[1][-1].translate(_pf3)
            names[1] = names[1][:-1]

    # 6. The third digit is found using Table PF2 and the first character of
    # the first name. Remove after coding.
    if names[0]:
        code += names[0][0].translate(_pf2)
        names[0] = names[0][1:]

    # 7. The fourth digit is found using Table PF2 and the first character of
    # the name field. If no letters remain use zero. After coding remove the
    # letter.
    # 8. The fifth digit is found in the same manner as the fourth using the
    # remaining characters of the name field if any.
    for _ in range(2):
        if names[1]:
            code += names[1][0].translate(_pf2)
            names[1] = names[1][1:]
        else:
            code += '0'

    return code
4323
|
|
|
|
4324
|
|
|
|
4325
|
|
|
def statistics_canada(word, maxlength=4):
    """Return the Statistics Canada code for a word.

    The original description of this algorithm could not be located, and
    may only have been specified in an unpublished TR. The coding does not
    appear to be in use by Statistics Canada any longer. In its place, this is
    an implementation of the "Census modified Statistics Canada name coding
    procedure".

    The modified version of this algorithm is described in Appendix B of
    :cite:`Moore:1977`.

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 4) of the code to return
    :returns: the Statistics Canada name code value
    :rtype: str

    >>> statistics_canada('Christopher')
    'CHRS'
    >>> statistics_canada('Niall')
    'NL'
    >>> statistics_canada('Smith')
    'SMTH'
    >>> statistics_canada('Schmidt')
    'SCHM'
    """
    # uppercase, normalize, decompose, and filter non-A-Z out
    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})
    if not word:
        return ''

    # Delete vowels (and Y) everywhere except in the first position
    code = word[1:]
    for vowel in {'A', 'E', 'I', 'O', 'U', 'Y'}:
        code = code.replace(vowel, '')
    code = word[0]+code
    # Collapse runs of repeated letters, then drop any residual spaces
    code = _delete_consecutive_repeats(code)
    code = code.replace(' ', '')

    return code[:maxlength]
4370
|
|
|
|
4371
|
|
|
|
4372
|
|
|
def lein(word, maxlength=4, zero_pad=True):
    """Return the Lein code for a word.

    This is Lein name coding, described in :cite:`Moore:1977`.

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 4) of the code to return
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Lein code
    :rtype: str

    >>> lein('Christopher')
    'C351'
    >>> lein('Niall')
    'N300'
    >>> lein('Smith')
    'S210'
    >>> lein('Schmidt')
    'S521'
    """
    # Consonant -> digit mapping used by Rule 4
    digit_for = {ord(letter): digit
                 for letter, digit in zip('BCDFGJKLMNPQRSTVXZ',
                                          '451455532245351455')}
    # Characters removed after the initial (Rule 2): space, vowels, H, W, Y
    dropped = dict.fromkeys((ord(ch) for ch in ' AEHIOUWY'), None)

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(ch for ch in word if ch in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    if not word:
        return ''

    # Rule 1: keep the first letter verbatim
    code = word[0]
    # Rule 2: delete vowels, H, W, Y (and spaces) from the remainder
    remainder = word[1:].translate(dropped)
    # Rule 3: collapse runs of identical letters
    remainder = _delete_consecutive_repeats(remainder)
    # Rule 4: translate the surviving consonants to digits
    code += remainder.translate(digit_for)

    if zero_pad:
        # Rule 4 (cont.): right-pad with zeros up to maxlength
        code += '0' * maxlength

    return code[:maxlength]
4419
|
|
|
|
4420
|
|
|
|
4421
|
|
|
def roger_root(word, maxlength=5, zero_pad=True):
    """Return the Roger Root code for a word.

    This is Roger Root name coding, described in :cite:`Moore:1977`.

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 5) of the code to return
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Roger Root code
    :rtype: str

    >>> roger_root('Christopher')
    '06401'
    >>> roger_root('Niall')
    '02500'
    >>> roger_root('Smith')
    '00310'
    >>> roger_root('Schmidt')
    '06310'
    """
    # uppercase, normalize, decompose, and filter non-A-Z out
    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    if not word:
        return ''

    # Both tables below are keyed on pattern length and are consulted
    # longest-pattern-first, so e.g. 'TSCH' wins over 'TS' and 'T'.
    # '*' is used to prevent combining by _delete_consecutive_repeats()
    _init_patterns = {4: {'TSCH': '06'},
                      3: {'TSH': '06', 'SCH': '06'},
                      2: {'CE': '0*0', 'CH': '06', 'CI': '0*0', 'CY': '0*0',
                          'DG': '07', 'GF': '08', 'GM': '03', 'GN': '02',
                          'KN': '02', 'PF': '08', 'PH': '08', 'PN': '02',
                          'SH': '06', 'TS': '0*0', 'WR': '04'},
                      1: {'A': '1', 'B': '09', 'C': '07', 'D': '01', 'E': '1',
                          'F': '08', 'G': '07', 'H': '2', 'I': '1', 'J': '3',
                          'K': '07', 'L': '05', 'M': '03', 'N': '02', 'O': '1',
                          'P': '09', 'Q': '07', 'R': '04', 'S': '0*0',
                          'T': '01', 'U': '1', 'V': '08', 'W': '4', 'X': '07',
                          'Y': '5', 'Z': '0*0'}}

    # Medial/final patterns: applied to everything after the initial match
    _med_patterns = {4: {'TSCH': '6'},
                     3: {'TSH': '6', 'SCH': '6'},
                     2: {'CE': '0', 'CH': '6', 'CI': '0', 'CY': '0', 'DG': '7',
                         'PH': '8', 'SH': '6', 'TS': '0'},
                     1: {'B': '9', 'C': '7', 'D': '1', 'F': '8', 'G': '7',
                         'J': '6', 'K': '7', 'L': '5', 'M': '3', 'N': '2',
                         'P': '9', 'Q': '7', 'R': '4', 'S': '0', 'T': '1',
                         'V': '8', 'X': '7', 'Z': '0',
                         'A': '*', 'E': '*', 'H': '*', 'I': '*', 'O': '*',
                         'U': '*', 'W': '*', 'Y': '*'}}

    code = ''
    pos = 0

    # Do first digit(s) first
    for num in range(4, 0, -1):
        if word[:num] in _init_patterns[num]:
            code = _init_patterns[num][word[:num]]
            pos += num
            break
    else:
        pos += 1  # Advance if nothing is recognized

    # Then code subsequent digits
    while pos < len(word):
        for num in range(4, 0, -1):
            if word[pos:pos+num] in _med_patterns[num]:
                code += _med_patterns[num][word[pos:pos+num]]
                pos += num
                break
        else:
            pos += 1  # Advance if nothing is recognized

    # Collapse repeats, then strip the '*' separators inserted above
    code = _delete_consecutive_repeats(code)
    code = code.replace('*', '')

    if zero_pad:
        code += '0'*maxlength

    return code[:maxlength]
4507
|
|
|
|
4508
|
|
|
|
4509
|
|
|
def onca(word, maxlength=4, zero_pad=True):
    """Return the Oxford Name Compression Algorithm (ONCA) code for a word.

    This is the Oxford Name Compression Algorithm, based on :cite:`Gill:1997`.

    I can find no complete description of the "anglicised version of the NYSIIS
    method" identified as the first step in this algorithm, so this is likely
    not a precisely correct implementation, in that it employs the standard
    NYSIIS algorithm.

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 4) of the code to return
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the ONCA code
    :rtype: str

    >>> onca('Christopher')
    'C623'
    >>> onca('Niall')
    'N400'
    >>> onca('Smith')
    'S530'
    >>> onca('Schmidt')
    'S530'
    """
    # In the most extreme case, 3 characters of NYSIIS input can be compressed
    # to one character of output, so request triple the maxlength from the
    # first (NYSIIS) stage before handing the result to soundex.
    nysiis_code = nysiis(word, maxlength=maxlength*3)
    return soundex(nysiis_code, maxlength, zero_pad=zero_pad)
4539
|
|
|
|
4540
|
|
|
|
4541
|
|
|
def eudex(word, maxlength=8):
    """Return the eudex phonetic hash of a word.

    This implementation of eudex phonetic hashing is based on the specification
    (not the reference implementation) at :cite:`Ticki:2016`.

    Further details can be found at :cite:`Ticki:2016b`.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 8)
    :returns: the eudex hash, one byte per (condensed) character
    :rtype: int
    """
    # Phone codes for characters in non-initial position
    _trailing_phones = {
        'a': 0,  # a
        'b': 0b01001000,  # b
        'c': 0b00001100,  # c
        'd': 0b00011000,  # d
        'e': 0,  # e
        'f': 0b01000100,  # f
        'g': 0b00001000,  # g
        'h': 0b00000100,  # h
        'i': 1,  # i
        'j': 0b00000101,  # j
        'k': 0b00001001,  # k
        'l': 0b10100000,  # l
        'm': 0b00000010,  # m
        'n': 0b00010010,  # n
        'o': 0,  # o
        'p': 0b01001001,  # p
        'q': 0b10101000,  # q
        'r': 0b10100001,  # r
        's': 0b00010100,  # s
        't': 0b00011101,  # t
        'u': 1,  # u
        'v': 0b01000101,  # v
        'w': 0b00000000,  # w
        'x': 0b10000100,  # x
        'y': 1,  # y
        'z': 0b10010100,  # z

        'ß': 0b00010101,  # ß
        'à': 0,  # à
        'á': 0,  # á
        'â': 0,  # â
        'ã': 0,  # ã
        'ä': 0,  # ä[æ]
        'å': 1,  # å[oː]
        'æ': 0,  # æ[æ]
        'ç': 0b10010101,  # ç[t͡ʃ]
        'è': 1,  # è
        'é': 1,  # é
        'ê': 1,  # ê
        'ë': 1,  # ë
        'ì': 1,  # ì
        'í': 1,  # í
        'î': 1,  # î
        'ï': 1,  # ï
        'ð': 0b00010101,  # ð[ð̠](represented as a non-plosive T)
        'ñ': 0b00010111,  # ñ[nj](represented as a combination of n and j)
        'ò': 0,  # ò
        'ó': 0,  # ó
        'ô': 0,  # ô
        'õ': 0,  # õ
        'ö': 1,  # ö[ø]
        '÷': 0b11111111,  # ÷
        'ø': 1,  # ø[ø]
        'ù': 1,  # ù
        'ú': 1,  # ú
        'û': 1,  # û
        'ü': 1,  # ü
        'ý': 1,  # ý
        'þ': 0b00010101,  # þ[ð̠](represented as a non-plosive T)
        'ÿ': 1,  # ÿ
        }

    # Phone codes for the word-initial character
    _initial_phones = {
        'a': 0b10000100,  # a*
        'b': 0b00100100,  # b
        'c': 0b00000110,  # c
        'd': 0b00001100,  # d
        'e': 0b11011000,  # e*
        'f': 0b00100010,  # f
        'g': 0b00000100,  # g
        'h': 0b00000010,  # h
        'i': 0b11111000,  # i*
        'j': 0b00000011,  # j
        'k': 0b00000101,  # k
        'l': 0b01010000,  # l
        'm': 0b00000001,  # m
        'n': 0b00001001,  # n
        'o': 0b10010100,  # o*
        'p': 0b00100101,  # p
        'q': 0b01010100,  # q
        'r': 0b01010001,  # r
        's': 0b00001010,  # s
        't': 0b00001110,  # t
        'u': 0b11100000,  # u*
        'v': 0b00100011,  # v
        'w': 0b00000000,  # w
        'x': 0b01000010,  # x
        'y': 0b11100100,  # y*
        'z': 0b01001010,  # z

        'ß': 0b00001011,  # ß
        'à': 0b10000101,  # à
        'á': 0b10000101,  # á
        'â': 0b10000000,  # â
        'ã': 0b10000110,  # ã
        'ä': 0b10100110,  # ä [æ]
        'å': 0b11000010,  # å [oː]
        'æ': 0b10100111,  # æ [æ]
        'ç': 0b01010100,  # ç [t͡ʃ]
        'è': 0b11011001,  # è
        'é': 0b11011001,  # é
        'ê': 0b11011001,  # ê
        'ë': 0b11000110,  # ë [ə] or [œ]
        'ì': 0b11111001,  # ì
        'í': 0b11111001,  # í
        'î': 0b11111001,  # î
        'ï': 0b11111001,  # ï
        'ð': 0b00001011,  # ð [ð̠] (represented as a non-plosive T)
        'ñ': 0b00001011,  # ñ [nj] (represented as a combination of n and j)
        'ò': 0b10010101,  # ò
        'ó': 0b10010101,  # ó
        'ô': 0b10010101,  # ô
        'õ': 0b10010101,  # õ
        'ö': 0b11011100,  # ö [œ] or [ø]
        '÷': 0b11111111,  # ÷
        'ø': 0b11011101,  # ø [œ] or [ø]
        'ù': 0b11100001,  # ù
        'ú': 0b11100001,  # ú
        'û': 0b11100001,  # û
        'ü': 0b11100101,  # ü
        'ý': 0b11100101,  # ý
        'þ': 0b00001011,  # þ [ð̠] (represented as a non-plosive T)
        'ÿ': 0b11100101,  # ÿ
        }
    # Lowercase input & filter unknown characters
    word = ''.join(char for char in word.lower() if char in _initial_phones)

    # An empty (post-filter) word hashes as the sentinel '÷' character
    if not word:
        word = '÷'

    # Perform initial eudex coding of each character
    values = [_initial_phones[word[0]]]
    values += [_trailing_phones[char] for char in word[1:]]

    # Right-shift by one to determine if second instance should be skipped
    shifted_values = [_ >> 1 for _ in values]
    condensed_values = [values[0]]
    for n in range(1, len(shifted_values)):
        if shifted_values[n] != shifted_values[n-1]:
            condensed_values.append(values[n])

    # Add padding after first character & trim beyond maxlength
    values = ([condensed_values[0]] +
              [0]*max(0, maxlength - len(condensed_values)) +
              condensed_values[1:maxlength])

    # Combine individual character values into eudex hash
    hash_value = 0
    for val in values:
        hash_value = (hash_value << 8) | val

    return hash_value
4707
|
|
|
|
4708
|
|
|
|
4709
|
|
|
def haase_phonetik(word, primary_only=False):
    """Return the Haase Phonetik (numeric output) code for a word.

    Based on the algorithm described at :cite:`Prante:2015`.

    Based on the original :cite:`Haase:2000`.

    While each output code is numeric, it is nevertheless a str; all
    spelling-variant codes are returned together as a tuple.

    :param str word: the word to transform
    :param bool primary_only: if True, encode only the word itself, skipping
        the generation of alternate spelling variants
    :returns: the Haase Phonetik codes of the word's variants
    :rtype: tuple
    """
    def _after(word, i, letters):
        """Return True if word[i] follows one of the supplied letters."""
        if i > 0 and word[i-1] in letters:
            return True
        return False

    def _before(word, i, letters):
        """Return True if word[i] precedes one of the supplied letters."""
        if i+1 < len(word) and word[i+1] in letters:
            return True
        return False

    _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}

    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    word = word.replace('Ä', 'AE')
    word = word.replace('Ö', 'OE')
    word = word.replace('Ü', 'UE')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    # Nothing to convert, return base case
    if not word:
        return ''

    # variants is a list of tuples of alternatives; the cartesian product
    # below expands them into every spelling variant of the word.
    variants = []
    if primary_only:
        # BUGFIX: the word must be wrapped in a tuple so that product()
        # treats it as a single alternative rather than iterating its
        # letters (which previously coded each letter as its own variant).
        variants = [(word,)]
    else:
        pos = 0
        if word[:2] == 'CH':
            variants.append(('CH', 'SCH'))
            pos += 2
        len_3_vars = {'OWN': 'AUN', 'WSK': 'RSK', 'SCH': 'CH', 'GLI': 'LI',
                      'AUX': 'O', 'EUX': 'O'}
        while pos < len(word):
            if word[pos:pos+4] == 'ILLE':
                variants.append(('ILLE', 'I'))
                pos += 4
            elif word[pos:pos+3] in len_3_vars:
                variants.append((word[pos:pos+3], len_3_vars[word[pos:pos+3]]))
                pos += 3
            elif word[pos:pos+2] == 'RB':
                variants.append(('RB', 'RW'))
                pos += 2
            elif len(word[pos:]) == 3 and word[pos:] == 'EAU':
                variants.append(('EAU', 'O'))
                pos += 3
            elif len(word[pos:]) == 1 and word[pos:] in {'A', 'O'}:
                if word[pos:] == 'O':
                    variants.append(('O', 'OW'))
                else:
                    variants.append(('A', 'AR'))
                pos += 1
            else:
                variants.append((word[pos],))
                pos += 1

    variants = [''.join(letters) for letters in product(*variants)]

    def _haase_code(word):
        """Convert a single spelling variant to its numeric Haase code."""
        sdx = ''
        for i in range(len(word)):
            if word[i] in _vowels:
                sdx += '9'
            elif word[i] == 'B':
                sdx += '1'
            elif word[i] == 'P':
                if _before(word, i, {'H'}):
                    sdx += '3'
                else:
                    sdx += '1'
            elif word[i] in {'D', 'T'}:
                if _before(word, i, {'C', 'S', 'Z'}):
                    sdx += '8'
                else:
                    sdx += '2'
            elif word[i] in {'F', 'V', 'W'}:
                sdx += '3'
            elif word[i] in {'G', 'K', 'Q'}:
                sdx += '4'
            elif word[i] == 'C':
                # C's code depends on position and neighboring letters
                if _after(word, i, {'S', 'Z'}):
                    sdx += '8'
                elif i == 0:
                    if _before(word, i, {'A', 'H', 'K', 'L', 'O', 'Q', 'R',
                                         'U', 'X'}):
                        sdx += '4'
                    else:
                        sdx += '8'
                elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}):
                    sdx += '4'
                else:
                    sdx += '8'
            elif word[i] == 'X':
                if _after(word, i, {'C', 'K', 'Q'}):
                    sdx += '8'
                else:
                    sdx += '48'
            elif word[i] == 'L':
                sdx += '5'
            elif word[i] in {'M', 'N'}:
                sdx += '6'
            elif word[i] == 'R':
                sdx += '7'
            elif word[i] in {'S', 'Z'}:
                sdx += '8'

        sdx = _delete_consecutive_repeats(sdx)

        # NOTE: unlike Kölner Phonetik, trailing '9' (vowel) codes are
        # deliberately retained here; the stripping step is disabled:
        # if sdx:
        #     sdx = sdx[0] + sdx[1:].replace('9', '')

        return sdx

    return tuple(_haase_code(word) for word in variants)
4842
|
|
|
|
4843
|
|
|
|
4844
|
|
|
def reth_schek_phonetik(word):
    """Return Reth-Schek Phonetik code for a word.

    This algorithm is proposed in :cite:`Reth:1977`.

    Since I couldn't secure a copy of that document (maybe I'll look for it
    next time I'm in Germany), this implementation is based on what I could
    glean from the implementations published by German Record Linkage
    Center (www.record-linkage.de):

    - Privacy-preserving Record Linkage (PPRL) (in R) :cite:`Rukasz:2018`
    - Merge ToolBox (in Java) :cite:`Schnell:2004`

    Rules that are unclear:

    - Should 'C' become 'G' or 'Z'? (PPRL has both, 'Z' rule blocked)
    - Should 'CC' become 'G'? (PPRL has blocked 'CK' that may be typo)
    - Should 'TUI' -> 'ZUI' rule exist? (PPRL has rule, but I can't
      think of a German word with '-tui-' in it.)
    - Should we really change 'SCH' -> 'CH' and then 'CH' -> 'SCH'?

    :param str word: the word to transform
    :returns: the Reth-Schek Phonetik code
    :rtype: str
    """
    # Substitution tables keyed by pattern length; tried longest-first
    replacements = {3: {'AEH': 'E', 'IEH': 'I', 'OEH': 'OE', 'UEH': 'UE',
                        'SCH': 'CH', 'ZIO': 'TIO', 'TIU': 'TIO', 'ZIU': 'TIO',
                        'CHS': 'X', 'CKS': 'X', 'AEU': 'OI'},
                    2: {'LL': 'L', 'AA': 'A', 'AH': 'A', 'BB': 'B', 'PP': 'B',
                        'BP': 'B', 'PB': 'B', 'DD': 'D', 'DT': 'D', 'TT': 'D',
                        'TH': 'D', 'EE': 'E', 'EH': 'E', 'AE': 'E', 'FF': 'F',
                        'PH': 'F', 'KK': 'K', 'GG': 'G', 'GK': 'G', 'KG': 'G',
                        'CK': 'G', 'CC': 'C', 'IE': 'I', 'IH': 'I', 'MM': 'M',
                        'NN': 'N', 'OO': 'O', 'OH': 'O', 'SZ': 'S', 'UH': 'U',
                        'GS': 'X', 'KS': 'X', 'TZ': 'Z', 'AY': 'AI',
                        'EI': 'AI', 'EY': 'AI', 'EU': 'OI', 'RR': 'R',
                        'SS': 'S', 'KW': 'QU'},
                    1: {'P': 'B', 'T': 'D', 'V': 'F', 'W': 'F', 'C': 'G',
                        'K': 'G', 'Y': 'I'}}

    # Uppercase
    word = word.upper()

    # Replace umlauts/eszett
    word = word.replace('Ä', 'AE')
    word = word.replace('Ö', 'OE')
    word = word.replace('Ü', 'UE')
    word = word.replace('ß', 'SS')

    # Main loop, using above replacements table; the cursor advances by
    # only one position after a replacement, so substitutions may cascade
    pos = 0
    while pos < len(word):
        for num in range(3, 0, -1):
            if word[pos:pos+num] in replacements[num]:
                word = (word[:pos] + replacements[num][word[pos:pos+num]]
                        + word[pos+num:])
                pos += 1
                break
        else:
            pos += 1  # Advance if nothing is recognized

    # Change 'CH' back(?) to 'SCH'
    word = word.replace('CH', 'SCH')

    # Replace final sequences; empty-safe slices guard against '' input,
    # which previously raised IndexError on word[-1]
    if word[-2:] == 'ER':
        word = word[:-2]+'R'
    elif word[-2:] == 'EL':
        word = word[:-2]+'L'
    elif word[-1:] == 'H':
        word = word[:-1]

    return word
4916
|
|
|
|
4917
|
|
|
|
4918
|
|
|
def fonem(word):
    """Return the FONEM code of a word.

    FONEM is a phonetic algorithm designed for French (particularly surnames in
    Saguenay, Canada), defined in :cite:`Bouchard:1981`.

    Guillaume Plique's Javascript implementation :cite:`Plique:2018` at
    https://github.com/Yomguithereal/talisman/blob/master/src/phonetics/french/fonem.js
    was also consulted for this implementation.

    :param str word: the word to transform
    :returns: the FONEM code
    :rtype: str
    """
    # I don't see a sane way of doing this without regexps :(
    # Each entry maps a rule name to (pattern, replacement); a plain-string
    # pattern is applied with str.replace, a compiled regex with regex.sub.
    rule_table = {
        # Vowels & groups of vowels
        'V-1': (re_compile('E?AU'), 'O'),
        'V-2,5': (re_compile('(E?AU|O)L[TX]$'), 'O'),
        'V-3,4': (re_compile('E?AU[TX]$'), 'O'),
        'V-6': (re_compile('E?AUL?D$'), 'O'),
        'V-7': (re_compile(r'(?<!G)AY$'), 'E'),
        'V-8': (re_compile('EUX$'), 'EU'),
        'V-9': (re_compile('EY(?=$|[BCDFGHJKLMNPQRSTVWXZ])'), 'E'),
        'V-10': ('Y', 'I'),
        'V-11': (re_compile('(?<=[AEIOUY])I(?=[AEIOUY])'), 'Y'),
        'V-12': (re_compile('(?<=[AEIOUY])ILL'), 'Y'),
        'V-13': (re_compile('OU(?=[AEOU]|I(?!LL))'), 'W'),
        'V-14': (re_compile(r'([AEIOUY])(?=\1)'), ''),
        # Nasal vowels
        'V-15': (re_compile('[AE]M(?=[BCDFGHJKLMPQRSTVWXZ])(?!$)'), 'EN'),
        'V-16': (re_compile('OM(?=[BCDFGHJKLMPQRSTVWXZ])'), 'ON'),
        'V-17': (re_compile('AN(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'EN'),
        'V-18': (re_compile('(AI[MN]|EIN)(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'),
                 'IN'),
        'V-19': (re_compile('B(O|U|OU)RNE?$'), 'BURN'),
        'V-20': (re_compile('(^IM|(?<=[BCDFGHJKLMNPQRSTVWXZ])' +
                            'IM(?=[BCDFGHJKLMPQRSTVWXZ]))'), 'IN'),
        # Consonants and groups of consonants
        'C-1': ('BV', 'V'),
        'C-2': (re_compile('(?<=[AEIOUY])C(?=[EIY])'), 'SS'),
        'C-3': (re_compile('(?<=[BDFGHJKLMNPQRSTVWZ])C(?=[EIY])'), 'S'),
        'C-4': (re_compile('^C(?=[EIY])'), 'S'),
        'C-5': (re_compile('^C(?=[OUA])'), 'K'),
        'C-6': (re_compile('(?<=[AEIOUY])C$'), 'K'),
        'C-7': (re_compile('C(?=[BDFGJKLMNPQRSTVWXZ])'), 'K'),
        'C-8': (re_compile('CC(?=[AOU])'), 'K'),
        'C-9': (re_compile('CC(?=[EIY])'), 'X'),
        'C-10': (re_compile('G(?=[EIY])'), 'J'),
        'C-11': (re_compile('GA(?=I?[MN])'), 'G#'),
        'C-12': (re_compile('GE(O|AU)'), 'JO'),
        'C-13': (re_compile('GNI(?=[AEIOUY])'), 'GN'),
        'C-14': (re_compile('(?<![PCS])H'), ''),
        'C-15': ('JEA', 'JA'),
        'C-16': (re_compile('^MAC(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'MA#'),
        'C-17': (re_compile('^MC'), 'MA#'),
        'C-18': ('PH', 'F'),
        'C-19': ('QU', 'K'),
        'C-20': (re_compile('^SC(?=[EIY])'), 'S'),
        'C-21': (re_compile('(?<=.)SC(?=[EIY])'), 'SS'),
        'C-22': (re_compile('(?<=.)SC(?=[AOU])'), 'SK'),
        'C-23': ('SH', 'CH'),
        'C-24': (re_compile('TIA$'), 'SSIA'),
        'C-25': (re_compile('(?<=[AIOUY])W'), ''),
        'C-26': (re_compile('X[CSZ]'), 'X'),
        'C-27': (re_compile('(?<=[AEIOUY])Z|(?<=[BCDFGHJKLMNPQRSTVWXZ])' +
                            'Z(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'S'),
        'C-28': (re_compile(r'([BDFGHJKMNPQRTVWXZ])\1'), r'\1'),
        'C-28a': (re_compile('CC(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'C'),
        'C-28b': (re_compile('((?<=[BCDFGHJKLMNPQRSTVWXZ])|^)SS'), 'S'),
        'C-28bb': (re_compile('SS(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'S'),
        'C-28c': (re_compile('((?<=[^I])|^)LL'), 'L'),
        'C-28d': (re_compile('ILE$'), 'ILLE'),
        'C-29': (re_compile('(ILS|[CS]H|[MN]P|R[CFKLNSX])$|([BCDFGHJKL' +
                            'MNPQRSTVWXZ])[BCDFGHJKLMNPQRSTVWXZ]$'),
                 lambda m: (m.group(1) or '') + (m.group(2) or '')),
        'C-30,32': (re_compile('^(SA?INT?|SEI[NM]|CINQ?|ST)(?!E)-?'), 'ST-'),
        'C-31,33': (re_compile('^(SAINTE|STE)-?'), 'STE-'),
        # Rules to undo rule bleeding prevention in C-11, C-16, C-17
        'C-34': ('G#', 'GA'),
        'C-35': ('MA#', 'MAC')
        }
    # Application order is significant: the de-duplication rules run both
    # first and last, and some rules exist only to feed or undo others.
    rule_order = [
        'V-14', 'C-28', 'C-28a', 'C-28b', 'C-28bb', 'C-28c', 'C-28d',
        'C-12',
        'C-8', 'C-9', 'C-10',
        'C-16', 'C-17', 'C-2', 'C-3', 'C-7',
        'V-2,5', 'V-3,4', 'V-6',
        'V-1', 'C-14',
        'C-31,33', 'C-30,32',
        'C-11', 'V-15', 'V-17', 'V-18',
        'V-7', 'V-8', 'V-9', 'V-10', 'V-11', 'V-12', 'V-13', 'V-16',
        'V-19', 'V-20',
        'C-1', 'C-4', 'C-5', 'C-6', 'C-13', 'C-15',
        'C-18', 'C-19', 'C-20', 'C-21', 'C-22', 'C-23', 'C-24',
        'C-25', 'C-26', 'C-27',
        'C-29',
        'V-14', 'C-28', 'C-28a', 'C-28b', 'C-28bb', 'C-28c', 'C-28d',
        'C-34', 'C-35'
        ]

    # normalize, upper-case, and filter non-French letters
    word = normalize('NFKD', text_type(word.upper()))
    word = word.translate({198: 'AE', 338: 'OE'})
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z', '-'})

    for rule in rule_order:
        regex, repl = rule_table[rule]
        if isinstance(regex, text_type):
            # Plain-string rules are simple substring replacements
            word = word.replace(regex, repl)
        else:
            word = regex.sub(repl, word)

    return word
5035
|
|
|
|
5036
|
|
|
|
5037
|
|
|
def parmar_kumbharana(word):
    """Return the Parmar-Kumbharana encoding of a word.

    This is based on the phonetic algorithm proposed in :cite:`Parmar:2014`.

    :param str word: the word to transform
    :returns: the Parmar-Kumbharana encoding
    :rtype: str
    """
    # Multi-letter substitution rules (Rule 5), keyed by pattern length
    rule_table = {4: {'OUGH': 'F'},
                  3: {'DGE': 'J',
                      'OUL': 'U',
                      'GHT': 'T'},
                  2: {'CE': 'S', 'CI': 'S', 'CY': 'S',
                      'GE': 'J', 'GI': 'J', 'GY': 'J',
                      'WR': 'R',
                      'GN': 'N', 'KN': 'N', 'PN': 'N',
                      'CK': 'K',
                      'SH': 'S'}}
    # Vowels are deleted after the first letter (Rule 6)
    vowel_trans = {65: '', 69: '', 73: '', 79: '', 85: '', 89: ''}

    word = word.upper()  # Rule 3
    word = _delete_consecutive_repeats(word)  # Rule 4

    # Rule 5: apply the substitution table, longest match first
    i = 0
    while i < len(word):
        for match_len in range(4, 1, -1):
            if word[i:i+match_len] in rule_table[match_len]:
                repl = rule_table[match_len][word[i:i+match_len]]
                word = (word[:i] + repl + word[i+match_len:])
                i += len(repl)
                break
        else:
            i += 1

    if word:  # guard: empty input previously raised IndexError on word[0]
        word = word[0]+word[1:].translate(vowel_trans)  # Rule 6
    return word
5074
|
|
|
|
5075
|
|
|
|
5076
|
|
|
def davidson(lname, fname='.', omit_fname=False):
    """Return Davidson's Consonant Code.

    This is based on the name compression system described in
    :cite:`Davidson:1962`.

    :cite:`Dolby:1970` identifies this as having been the name compression
    algorithm used by SABRE.

    :param str lname: Last name (or word) to be encoded
    :param str fname: First name (optional), of which the first character is
        included in the code.
    :param bool omit_fname: Set to True to completely omit the first character
        of the first name
    :return: Davidson's Consonant Code
    :rtype: str
    """
    # Deletes A, E, I, O, U, H, W, Y (by Unicode ordinal) via str.translate
    trans = {65: '', 69: '', 73: '', 79: '', 85: '', 72: '', 87: '', 89: ''}

    lname = text_type(lname.upper())
    # Keep the initial letter, strip vowels/H/W/Y from the remainder, then
    # collapse runs of repeated letters
    code = _delete_consecutive_repeats(lname[:1] + lname[1:].translate(trans))
    # Truncate or right-pad with spaces to exactly four characters
    code = code[:4] + (4-len(code))*' '

    if not omit_fname:
        # Append the first-name initial (the default fname '.' appends '.')
        code += fname[:1].upper()

    return code
5102
|
|
|
|
5103
|
|
|
|
5104
|
|
|
def sound_d(word, maxlength=4):
    """Return the SoundD code.

    SoundD is defined in :cite:`Varol:2012`.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 4)
    :return: the SoundD code
    """
    # Soundex-style digit class assigned to each letter
    _translation = {ord(letter): digit for letter, digit in
                    zip('ABCDEFGHIJKLMNOPQRSTUVWXYZ',
                        '01230120022455012623010202')}

    # uppercase, decompose, and strip everything but A-Z
    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(ch for ch in word
                   if ch in set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'))

    # initial-cluster adjustments
    if word[:2] in {'KN', 'GN', 'PN', 'AC', 'WR'}:
        word = word[1:]
    elif word[:1] == 'X':
        word = 'S' + word[1:]
    elif word[:2] == 'WH':
        word = 'W' + word[2:]

    # context-sensitive digraph handling before the letter-by-letter coding
    word = word.replace('DGE', '20').replace('DGI', '20').replace('GH', '0')

    # code the word, collapse runs of identical digits, drop the zero class
    word = word.translate(_translation)
    word = _delete_consecutive_repeats(word)
    word = word.replace('0', '')

    # pad or truncate to the requested length
    if maxlength is not None:
        word = word.ljust(maxlength, '0')[:maxlength]

    return word
5144
|
|
|
|
5145
|
|
|
|
5146
|
|
|
def pshp_soundex_last(lname, maxlength=4, german=False):
    """Calculate the PSHP Soundex/Viewex Coding of a last name.

    This coding is based on :cite:`Hershberg:1976`.

    Reference was also made to the German version of the same:
    :cite:`Hershberg:1979`.

    A separate function, pshp_soundex_first() is used for first names.

    :param str lname: the last name to encode
    :param int maxlength: the length of the code returned (defaults to 4)
    :param bool german: set to True if the name is German (different rules
        apply)
    :return: the PSHP Soundex/Viewex last-name code
    """
    # uppercase, decompose, and keep only A-Z
    lname = normalize('NFKD', text_type(lname.upper()))
    lname = lname.replace('ß', 'SS')
    lname = ''.join(c for c in lname if c in
                    {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
                     'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
                     'W', 'X', 'Y', 'Z'})

    # A. Prefix treatment
    if lname[:3] == 'VON' or lname[:3] == 'VAN':
        lname = lname[3:].strip()

    # The rule implemented below says "MC, MAC become 1". I believe it meant to
    # say they become M except in German data (where superscripted 1 indicates
    # "except in German data"). It doesn't make sense for them to become 1
    # (BPFV -> 1) or to apply outside German. Unfortunately, both articles have
    # this error(?).
    if not german:
        if lname[:3] == 'MAC':
            lname = 'M'+lname[3:]
        elif lname[:2] == 'MC':
            lname = 'M'+lname[2:]

    # The non-German-only rule to strip ' is unnecessary due to filtering

    # first-letter/-cluster substitutions
    if lname[:1] in {'E', 'I', 'O', 'U'}:
        lname = 'A' + lname[1:]
    elif lname[:2] in {'GE', 'GI', 'GY'}:
        lname = 'J' + lname[1:]
    elif lname[:2] in {'CE', 'CI', 'CY'}:
        lname = 'S' + lname[1:]
    elif lname[:3] == 'CHR':
        lname = 'K' + lname[1:]
    elif lname[:1] == 'C' and lname[:2] != 'CH':
        lname = 'K' + lname[1:]

    if lname[:2] == 'KN':
        lname = 'N' + lname[1:]
    elif lname[:2] == 'PH':
        lname = 'F' + lname[1:]
    elif lname[:3] in {'WIE', 'WEI'}:
        lname = 'V' + lname[1:]

    # German-only initial-letter remappings
    if german and lname[:1] in {'W', 'M', 'Y', 'Z'}:
        lname = {'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}[lname[0]]+lname[1:]

    # the (possibly substituted) first letter is kept verbatim in the code
    code = lname[:1]

    # B. Postfix treatment
    if lname[-1:] == 'R':
        lname = lname[:-1] + 'N'
    elif lname[-2:] in {'SE', 'CE'}:
        lname = lname[:-2]
    if lname[-2:] == 'SS':
        lname = lname[:-2]
    elif lname[-1:] == 'S':
        lname = lname[:-1]

    if not german:
        # non-German suffix respellings (applied longest-first)
        l5_repl = {'STOWN': 'SAWON', 'MPSON': 'MASON'}
        l4_repl = {'NSEN': 'ASEN', 'MSON': 'ASON', 'STEN': 'SAEN',
                   'STON': 'SAON'}
        if lname[-5:] in l5_repl:
            lname = lname[:-5] + l5_repl[lname[-5:]]
        elif lname[-4:] in l4_repl:
            lname = lname[:-4] + l4_repl[lname[-4:]]

    if lname[-2:] in {'NG', 'ND'}:
        lname = lname[:-1]
    if not german and lname[-3:] in {'GAN', 'GEN'}:
        lname = lname[:-3]+'A'+lname[-2:]

    # German-only suffix stripping
    if german:
        if lname[-3:] == 'TES':
            lname = lname[:-3]
        elif lname[-2:] == 'TS':
            lname = lname[:-2]
        if lname[-3:] == 'TZE':
            lname = lname[:-3]
        elif lname[-2:] == 'ZE':
            lname = lname[:-2]
        if lname[-1:] == 'Z':
            lname = lname[:-1]
        elif lname[-2:] == 'TE':
            lname = lname[:-2]

    # C. Infix Treatment
    lname = lname.replace('CK', 'C')
    lname = lname.replace('SCH', 'S')
    lname = lname.replace('DT', 'T')
    lname = lname.replace('ND', 'N')
    lname = lname.replace('NG', 'N')
    lname = lname.replace('LM', 'M')
    lname = lname.replace('MN', 'M')
    lname = lname.replace('WIE', 'VIE')
    lname = lname.replace('WEI', 'VEI')

    # D. Soundexing
    # code for X & Y are unspecified, but presumably are 2 & 0
    _pshp_translation = dict(zip((ord(_) for _ in
                                  'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                 '01230120022455012523010202'))

    lname = lname.translate(_pshp_translation)
    lname = _delete_consecutive_repeats(lname)

    # everything after the retained first letter is the digit string
    code += lname[1:]
    code = code.replace('0', '')  # rule 1

    # pad or truncate to the requested length
    if maxlength is not None:
        if len(code) < maxlength:
            code += '0' * (maxlength-len(code))
        else:
            code = code[:maxlength]

    return code
5275
|
|
|
|
5276
|
|
|
|
5277
|
|
|
def pshp_soundex_first(fname, maxlength=4, german=False):
    """Calculate the PSHP Soundex/Viewex Coding of a first name.

    This coding is based on :cite:`Hershberg:1976`.

    Reference was also made to the German version of the same:
    :cite:`Hershberg:1979`.

    A separate function, pshp_soundex_last() is used for last names.

    :param str fname: the first name to encode
    :param int maxlength: the length of the code returned (defaults to 4)
    :param bool german: set to True if the name is German (different rules
        apply)
    :return: the PSHP Soundex/Viewex first-name code
    """
    # uppercase, decompose, and keep only A-Z
    fname = normalize('NFKD', text_type(fname.upper()))
    fname = fname.replace('ß', 'SS')
    fname = ''.join(c for c in fname if c in
                    {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
                     'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
                     'W', 'X', 'Y', 'Z'})

    # special rules
    if fname == 'JAMES':
        code = 'J7'
    elif fname == 'PAT':
        code = 'P7'

    else:
        # A. Prefix treatment
        if fname[:2] in {'GE', 'GI', 'GY'}:
            fname = 'J' + fname[1:]
        elif fname[:2] in {'CE', 'CI', 'CY'}:
            fname = 'S' + fname[1:]
        elif fname[:3] == 'CHR':
            fname = 'K' + fname[1:]
        elif fname[:1] == 'C' and fname[:2] != 'CH':
            fname = 'K' + fname[1:]

        if fname[:2] == 'KN':
            fname = 'N' + fname[1:]
        elif fname[:2] == 'PH':
            fname = 'F' + fname[1:]
        elif fname[:3] in {'WIE', 'WEI'}:
            fname = 'V' + fname[1:]

        # German-only initial-letter remappings
        if german and fname[:1] in {'W', 'M', 'Y', 'Z'}:
            fname = ({'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}[fname[0]] +
                     fname[1:])

        # the (possibly substituted) first letter is kept verbatim
        code = fname[:1]

        # B. Soundex coding
        # code for Y unspecified, but presumably is 0
        _pshp_translation = dict(zip((ord(_) for _ in
                                      'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                     '01230120022455012523010202'))

        fname = fname.translate(_pshp_translation)
        fname = _delete_consecutive_repeats(fname)

        code += fname[1:]
        # truncate after the second syllable marker (the '0' vowel class)
        # NOTE(review): syl2_ptr is an offset within code[syl_ptr+1:], not an
        # absolute index, so the comparison below mixes two frames of
        # reference -- confirm against :cite:`Hershberg:1976` before changing.
        syl_ptr = code.find('0')
        syl2_ptr = code[syl_ptr + 1:].find('0')
        if syl_ptr != -1 and syl2_ptr != -1 and syl2_ptr - syl_ptr > -1:
            code = code[:syl_ptr + 2]

        code = code.replace('0', '')  # rule 1

    # pad or truncate to the requested length
    if maxlength is not None:
        if len(code) < maxlength:
            code += '0' * (maxlength-len(code))
        else:
            code = code[:maxlength]

    return code
5352
|
|
|
|
5353
|
|
|
|
5354
|
|
|
def henry_early(word, maxlength=3):
    """Calculate the early version of the Henry code for a word.

    The early version of Henry coding is given in :cite:`Legare:1972`. This is
    different from the later version defined in :cite:`Henry:1976`.

    :param str word: the word to encode
    :param int maxlength: the length of the code returned (defaults to 3)
    :return: the early Henry code
    """
    _cons = {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q',
             'R', 'S', 'T', 'V', 'W', 'X', 'Z'}
    _vows = {'A', 'E', 'I', 'O', 'U', 'Y'}
    _diph = {'AI': 'E', 'AY': 'E', 'EI': 'E', 'AU': 'O', 'OI': 'O', 'OU': 'O',
             'EU': 'U'}
    # consonants copied to the code unchanged (Rule IIa)
    _unaltered = {'B', 'D', 'F', 'J', 'K', 'L', 'M', 'N', 'R', 'T', 'V'}
    # single-letter substitutions (Rule IIb)
    _simple = {'W': 'V', 'X': 'S', 'V': 'S'}

    # uppercase, decompose, and keep only A-Z
    word = normalize('NFKD', text_type(word.upper()))
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    if not word:
        return ''

    # Rule Ia seems to be covered entirely in II

    # Rule Ib: adjustments to an initial vowel
    if word[0] in _vows:
        # Ib1
        if (((word[1:2] in _cons-{'M', 'N'} and word[2:3] in _cons) or
             (word[1:2] in _cons and word[2:3] not in _cons))):
            if word[0] == 'Y':
                word = 'I'+word[1:]
        # Ib2
        elif word[1:2] in {'M', 'N'} and word[2:3] in _cons:
            if word[0] == 'E':
                word = 'A'+word[1:]
            elif word[0] in {'I', 'U', 'Y'}:
                word = 'E'+word[1:]
        # Ib3
        elif word[:2] in _diph:
            word = _diph[word[:2]]+word[2:]
        # Ib4
        elif word[1:2] in _vows and word[0] == 'Y':
            word = 'I' + word[1:]

    code = ''
    skip = 0

    # Rule II
    for pos, char in enumerate(word):
        # BUG FIX: the neighboring characters must be sliced from word, not
        # from the single-character char (char[pos+1:pos+2] was almost always
        # '', which silently disabled rules IIc and IId).
        nxch = word[pos+1:pos+2]
        prev = word[pos-1:pos]

        if skip:
            skip -= 1
        elif char in _vows:
            code += char
        # IIc
        elif char == nxch:
            skip = 1
            code += char
        elif word[pos:pos+2] in {'CQ', 'DT', 'SC'}:
            skip = 1
            code += word[pos+1]
        # IId
        elif char == 'H' and prev in _cons:
            continue
        elif char == 'S' and nxch in _cons:
            continue
        elif char in _cons-{'L', 'R'} and nxch in _cons-{'L', 'R'}:
            continue
        elif char == 'L' and nxch in {'M', 'N'}:
            continue
        elif char in {'M', 'N'} and prev in _vows and nxch in _cons:
            continue
        # IIa
        elif char in _unaltered:
            code += char
        # IIb
        elif char in _simple:
            code += _simple[char]
        elif char in {'C', 'G', 'P', 'Q', 'S'}:
            if char == 'C':
                if nxch in {'A', 'O', 'U', 'L', 'R'}:
                    code += 'K'
                elif nxch in {'E', 'I', 'Y'}:
                    code += 'J'
                elif nxch == 'H':
                    if word[pos+2:pos+3] in _vows:
                        code += 'C'
                    elif word[pos+2:pos+3] in {'R', 'L'}:
                        code += 'K'
            elif char == 'G':
                if nxch in {'A', 'O', 'U', 'L', 'R'}:
                    code += 'G'
                elif nxch in {'E', 'I', 'Y'}:
                    code += 'J'
                elif nxch == 'N':
                    code += 'N'
            elif char == 'P':
                if nxch != 'H':
                    code += 'P'
                else:
                    code += 'F'
            elif char == 'Q':
                # BUG FIX: these tests compared a one-character slice against
                # two-character strings (never true) and appended to the loop
                # variable char instead of to code.
                if word[pos+1:pos+3] in {'UE', 'UI', 'UY'}:
                    code += 'G'
                elif word[pos+1:pos+3] in {'UA', 'UO'}:
                    code += 'K'
            elif char == 'S':
                if word[pos:pos+6] == 'SAINTE':
                    code += 'X'
                    skip = 5
                elif word[pos:pos+5] == 'SAINT':
                    code += 'X'
                    skip = 4
                elif word[pos:pos+3] == 'STE':
                    code += 'X'
                    skip = 2
                elif word[pos:pos+2] == 'ST':
                    code += 'X'
                    skip = 1
                else:
                    code += 'S'
        else:  # this should not be possible
            continue

    # IIe1: final-cluster reductions, longest patterns first
    if code[-4:] in {'AULT', 'EULT', 'OULT'}:
        code = code[:-2]
    elif code[-4:-3] in _vows and code[-3:] == 'MPS':
        code = code[:-3]
    elif code[-3:-2] in _vows and code[-2:] in {'MB', 'MP', 'ND', 'NS', 'NT'}:
        code = code[:-2]
    elif code[-2:-1] == 'R' and code[-1:] in _cons:
        code = code[:-1]
    # IIe2
    elif code[-2:-1] in _vows and code[-1:] in {'D', 'M', 'N', 'S', 'T'}:
        code = code[:-1]
    elif code[-2:] == 'ER':
        code = code[:-1]

    # Drop non-initial vowels
    code = code[:1]+code[1:].translate({65: '', 69: '', 73: '', 79: '', 85: '',
                                        89: ''})

    if maxlength is not None:
        code = code[:maxlength]

    return code
5508
|
|
|
|
5509
|
|
|
|
5510
|
|
|
def norphone(word):
    """Return the Norphone code.

    The reference implementation by Lars Marius Garshol is available in
    :cite:`Garshol:2015`.

    Norphone was designed for Norwegian, but this implementation has been
    extended to support Swedish vowels as well. This function incorporates
    the "not implemented" rules from the above file's rule set.

    :param str word: the word to encode
    :return: the Norphone code
    """
    _vowels = {'A', 'E', 'I', 'O', 'U', 'Y', 'Å', 'Æ', 'Ø', 'Ä', 'Ö'}

    # multi-character substitutions, keyed by pattern length so that longer
    # patterns can be tried before shorter ones
    replacements = {4: {'SKEI': 'X'},
                    3: {'SKJ': 'X', 'KEI': 'X'},
                    2: {'CH': 'K', 'CK': 'K', 'GJ': 'J', 'GH': 'K', 'HG': 'K',
                        'HJ': 'J', 'HL': 'L', 'HR': 'R', 'KJ': 'X', 'KI': 'X',
                        'LD': 'L', 'ND': 'N', 'PH': 'F', 'TH': 'T', 'SJ': 'X'},
                    1: {'W': 'V', 'X': 'KS', 'Z': 'S', 'D': 'T', 'G': 'K'}}

    word = word.upper()

    code = ''
    skip = 0

    # word-initial substitutions; skip records how many input characters the
    # substitution consumed so the main loop does not re-process them
    if word[0:2] == 'AA':
        code = 'Å'
        skip = 2
    elif word[0:2] == 'GI':
        code = 'J'
        skip = 2
    elif word[0:3] == 'SKY':
        code = 'X'
        skip = 3
    elif word[0:2] == 'EI':
        code = 'Æ'
        skip = 2
    elif word[0:2] == 'KY':
        code = 'X'
        skip = 2
    elif word[:1] == 'C':
        code = 'K'
        skip = 1
    elif word[:1] == 'Ä':
        code = 'Æ'
        skip = 1
    elif word[:1] == 'Ö':
        code = 'Ø'
        skip = 1

    # word-final substitutions
    if word[-2:] == 'DT':
        word = word[:-2]+'T'
    # Though the rules indicate this rule applies in all positions, the
    # reference implementation indicates it applies only in final position.
    elif word[-2:-1] in _vowels and word[-1:] == 'D':
        word = word[:-2]

    for pos, char in enumerate(word):
        if skip:
            skip -= 1
        else:
            # apply the longest replacement that matches at this position
            for length in sorted(replacements, reverse=True):
                if word[pos:pos+length] in replacements[length]:
                    code += replacements[length][word[pos:pos+length]]
                    skip = length-1
                    break
            else:
                # otherwise keep consonants everywhere, vowels only initially
                if not pos or char not in _vowels:
                    code += char

    code = _delete_consecutive_repeats(code)

    return code
5585
|
|
|
|
5586
|
|
|
|
5587
|
|
|
def dolby(word, maxlength=None, keep_vowels=False, vowel_char='*'):
    r"""Return the Dolby Code of a name.

    This follows "A Spelling Equivalent Abbreviation Algorithm For Personal
    Names" from :cite:`Dolby:1970` and :cite:`Cunningham:1969`.

    :param str word: the word to encode
    :param int maxlength: maximum length of the returned Dolby code -- this
        also activates the fixed-length code mode
    :param bool keep_vowels: if True, retains all vowel markers
    :param str vowel_char: the vowel marker character (default to \*)
    :return: the Dolby Code
    """
    _vowels = {'A', 'E', 'I', 'O', 'U', 'Y'}

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    # Rule 1 (FL2): normalize Mc/Mac prefixes
    if word[:3] in {'MCG', 'MAG', 'MAC'}:
        word = 'MK'+word[3:]
    elif word[:2] == 'MC':
        word = 'MK'+word[2:]

    # Rule 2 (FL3): drop the second letter of these clusters, right to left
    pos = len(word)-2
    while pos > -1:
        if word[pos:pos+2] in {'DT', 'LD', 'ND', 'NT', 'RC', 'RD', 'RT', 'SC',
                               'SK', 'ST'}:
            word = word[:pos+1]+word[pos+2:]
            pos += 1
        pos -= 1

    # Rule 3 (FL4)
    # Although the rule indicates "after the first letter", the test cases make
    # it clear that these apply to the first letter also.
    word = word.replace('X', 'KS')
    word = word.replace('CE', 'SE')
    word = word.replace('CI', 'SI')
    word = word.replace('CY', 'SI')

    # not in the rule set, but they seem to have intended it
    word = word.replace('TCH', 'CH')

    # non-initial CH not preceded by a vowel becomes S
    pos = word.find('CH', 1)
    while pos != -1:
        if word[pos-1:pos] not in _vowels:
            word = word[:pos]+'S'+word[pos+1:]
        pos = word.find('CH', pos+1)

    word = word.replace('C', 'K')
    word = word.replace('Z', 'S')

    word = word.replace('WR', 'R')
    word = word.replace('DG', 'G')
    word = word.replace('QU', 'K')
    word = word.replace('T', 'D')
    word = word.replace('PH', 'F')

    # Rule 4 (FL5): drop the letter before K unless it is a vowel, L, N, or R
    # Although the rule indicates "after the first letter", the test cases make
    # it clear that these apply to the first letter also.
    pos = word.find('K', 0)
    while pos != -1:
        if pos > 1 and word[pos-1:pos] not in _vowels | {'L', 'N', 'R'}:
            word = word[:pos-1]+word[pos:]
            pos -= 1
        pos = word.find('K', pos+1)

    # Rule FL6: in fixed-length mode, drop a trailing E
    if maxlength and word[-1:] == 'E':
        word = word[:-1]

    # Rule 5 (FL7)
    word = _delete_consecutive_repeats(word)

    # Rule 6 (FL8)
    if word[:2] == 'PF':
        word = word[1:]
    if word[-2:] == 'PF':
        word = word[:-1]
    elif word[-2:] == 'GH':
        if word[-3:-2] in _vowels:
            word = word[:-2]+'F'
        else:
            word = word[:-2]+'G'
    word = word.replace('GH', '')

    # Rule FL9: in fixed-length mode, V merges with F
    if maxlength:
        word = word.replace('V', 'F')

    # Rules 7-9 (FL10-FL12): keep the first vowel (first two in fixed-length
    # mode) as the marker character; drop the rest unless keep_vowels is set
    first = 1 + (1 if maxlength else 0)
    code = ''
    for pos, char in enumerate(word):
        if char in _vowels:
            if first or keep_vowels:
                code += vowel_char
                first -= 1
            else:
                continue
        elif pos > 0 and char in {'W', 'H'}:
            continue
        else:
            code += char

    if maxlength:
        # Rule FL13: drop a trailing S from an over-long code
        if len(code) > maxlength and code[-1:] == 'S':
            code = code[:-1]
        if keep_vowels:
            code = code[:maxlength]
        else:
            # Rule FL14
            code = code[:maxlength + 2]
            # Rule FL15: repeatedly drop the last vowel marker until the code
            # fits the requested length
            while len(code) > maxlength:
                vowels = len(code) - maxlength
                excess = vowels - 1
                word = code
                code = ''
                for char in word:
                    if char == vowel_char:
                        if vowels:
                            code += char
                            vowels -= 1
                    else:
                        code += char
                code = code[:maxlength + excess]

        # Rule FL16: space-pad short codes to the fixed length
        code += ' ' * (maxlength - len(code))

    return code
5727
|
|
|
|
5728
|
|
|
|
5729
|
|
|
def phonetic_spanish(word, maxlength=None):
    """Return the PhoneticSpanish coding of word.

    This follows the coding described in :cite:`Amon:2012` and
    :cite:`delPilarAngeles:2015`.

    :param str word: the word to encode
    :param int maxlength: the maximum length of the returned code (unlimited
        by default)
    :return: the PhoneticSpanish code
    """
    _es_soundex_translation = dict(zip((ord(_) for _ in
                                        'BCDFGHJKLMNPQRSTVXYZ'),
                                       '14328287566079431454'))

    # uppercase, normalize, and decompose, filter to A-Z minus vowels & W
    word = normalize('NFKD', text_type(word.upper()))
    word = ''.join(c for c in word if c in
                   {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N',
                    'P', 'Q', 'R', 'S', 'T', 'V', 'X', 'Y', 'Z'})

    # merge repeated Ls & Rs
    word = word.replace('LL', 'L')
    # BUG FIX: this was word.replace('R', 'R'), a no-op, so doubled Rs were
    # never merged as the comment above and the cited coding intend
    word = word.replace('RR', 'R')

    # apply the Soundex algorithm
    sdx = word.translate(_es_soundex_translation)

    if maxlength:
        sdx = sdx[:maxlength]

    return sdx
5759
|
|
|
|
5760
|
|
|
|
5761
|
|
|
def spanish_metaphone(word, maxlength=6, modified=False):
    """Return the Spanish Metaphone of a word.

    This is a quick rewrite of the Spanish Metaphone Algorithm, as presented at
    https://github.com/amsqr/Spanish-Metaphone and discussed in
    :cite:`Mosquera:2012`.

    Modified version based on :cite:`delPilarAngeles:2016`.

    :param str word: the word to encode
    :param int maxlength: the length of the code returned (defaults to 6)
    :param bool modified: Set to True to use del Pilar Angeles &
        Bailón-Miguel's modified version of the algorithm
    :return: the Spanish Metaphone code
    """
    def _is_vowel(pos):
        """Return True if the character at word[pos] is a vowel."""
        if pos < len(word) and word[pos] in {'A', 'E', 'I', 'O', 'U'}:
            return True
        return False

    # NFC (not NFKD) so that accented letters stay composed for the
    # single-character replacements below
    word = normalize('NFC', text_type(word.upper()))

    meta_key = ''
    pos = 0

    # do some replacements for the modified version
    if modified:
        word = word.replace('MB', 'NB')
        word = word.replace('MP', 'NP')
        word = word.replace('BS', 'S')
        if word[:2] == 'PS':
            word = word[1:]

    # simple replacements
    word = word.replace('Á', 'A')
    word = word.replace('CH', 'X')
    word = word.replace('Ç', 'S')
    word = word.replace('É', 'E')
    word = word.replace('Í', 'I')
    word = word.replace('Ó', 'O')
    word = word.replace('Ú', 'U')
    word = word.replace('Ñ', 'NY')
    word = word.replace('GÜ', 'W')
    word = word.replace('Ü', 'U')
    word = word.replace('B', 'V')
    word = word.replace('LL', 'Y')

    # scan the word left to right, emitting at most maxlength key characters
    while len(meta_key) < maxlength:
        if pos >= len(word):
            break

        # get the next character
        current_char = word[pos]

        # if a vowel in pos 0, add to key
        if _is_vowel(pos) and pos == 0:
            meta_key += current_char
            pos += 1
        # otherwise, do consonant rules
        else:
            # simple consonants (unmutated)
            if current_char in {'D', 'F', 'J', 'K', 'M', 'N', 'P', 'T', 'V',
                                'L', 'Y'}:
                meta_key += current_char
                # skip doubled consonants
                if word[pos+1:pos+2] == current_char:
                    pos += 2
                else:
                    pos += 1
            else:
                if current_char == 'C':
                    # special case 'acción', 'reacción',etc.
                    if word[pos+1:pos+2] == 'C':
                        meta_key += 'X'
                        pos += 2
                    # special case 'cesar', 'cien', 'cid', 'conciencia'
                    elif word[pos+1:pos+2] in {'E', 'I'}:
                        meta_key += 'Z'
                        pos += 2
                    # base case
                    else:
                        meta_key += 'K'
                        pos += 1
                elif current_char == 'G':
                    # special case 'gente', 'ecologia',etc
                    if word[pos + 1:pos + 2] in {'E', 'I'}:
                        meta_key += 'J'
                        pos += 2
                    # base case
                    else:
                        meta_key += 'G'
                        pos += 1
                elif current_char == 'H':
                    # since the letter 'H' is silent in Spanish,
                    # set the meta key to the vowel after the letter 'H'
                    if _is_vowel(pos+1):
                        meta_key += word[pos+1]
                        pos += 2
                    else:
                        meta_key += 'H'
                        pos += 1
                elif current_char == 'Q':
                    # QU codes as a single K
                    if word[pos+1:pos+2] == 'U':
                        pos += 2
                    else:
                        pos += 1
                    meta_key += 'K'
                elif current_char == 'W':
                    meta_key += 'U'
                    pos += 1
                elif current_char == 'R':
                    meta_key += 'R'
                    pos += 1
                elif current_char == 'S':
                    # initial S before a consonant gains a prosthetic E
                    if not _is_vowel(pos+1) and pos == 0:
                        meta_key += 'ES'
                        pos += 1
                    else:
                        meta_key += 'S'
                        pos += 1
                elif current_char == 'Z':
                    meta_key += 'Z'
                    pos += 1
                elif current_char == 'X':
                    # initial X before a consonant gains a prosthetic E
                    if len(word) > 1 and pos == 0 and not _is_vowel(pos+1):
                        meta_key += 'EX'
                        pos += 1
                    else:
                        meta_key += 'X'
                        pos += 1
                else:
                    # anything unrecognized (incl. non-initial vowels) is
                    # skipped
                    pos += 1

    # Final change from S to Z in modified version
    if modified:
        meta_key = meta_key.replace('S', 'Z')

    return meta_key
5900
|
|
|
|
5901
|
|
|
|
5902
|
|
|
def metasoundex(word, language='en'):
    """Return the MetaSoundex code for a word.

    This is based on :cite:`Koneru:2017`.

    :param str word: the word to encode
    :param str language: either 'en' for English or 'es' for Spanish
    :return: the MetaSoundex code
    """
    # Spanish mode: Spanish Metaphone feeds into PhoneticSpanish.
    if language == 'es':
        return phonetic_spanish(spanish_metaphone(word))

    # English mode: Metaphone feeds into Soundex, after which the leading
    # letter is remapped to a digit class of its own.
    _initial_map = {ord(letter): digit for letter, digit in
                    zip('ABCDEFGHIJKLMNOPQRSTUVWXYZ',
                        '07430755015866075943077514')}

    coded = soundex(metaphone(word))
    return coded[0].translate(_initial_map) + coded[1:]
5922
|
|
|
|
5923
|
|
|
|
5924
|
|
|
def soundex_br(word, maxlength=4, zero_pad=True):
    """Return the SoundexBR encoding of a word.

    This is based on :cite:`Marcelino:2015`.

    :param str word: the word to encode
    :param int maxlength: the length of the code returned (defaults to 4)
    :param bool zero_pad: pad the end of the return value with 0s to achieve
        a maxlength string
    :return: the SoundexBR code
    """
    # Soundex digit class for each letter
    _translation = {ord(letter): digit for letter, digit in
                    zip('ABCDEFGHIJKLMNOPQRSTUVWXYZ',
                        '01230120022455012623010202')}

    # uppercase, decompose, and keep only A-Z
    word = normalize('NFKD', text_type(word.upper()))
    word = ''.join(ch for ch in word
                   if ch in set('ABCDEFGHIJKLMNOPQRSTUVWXYZ'))

    # Portuguese-specific handling of the leading letter(s)
    head, second = word[:1], word[1:2]
    if word[:2] == 'WA':
        first = 'V'
    elif head == 'K' and second in {'A', 'O', 'U'}:
        first = 'C'
    elif head == 'C' and second in {'I', 'E'}:
        first = 'S'
    elif head == 'G' and second in {'E', 'I'}:
        first = 'J'
    elif head == 'Y':
        first = 'I'
    elif head == 'H':
        # an initial H is silent: drop it and start from the next letter
        word = word[1:]
        first = word[:1]
    else:
        first = head

    # code the remainder, collapse runs of digits, and drop the zero class
    sdx = first + word[1:].translate(_translation)
    sdx = _delete_consecutive_repeats(sdx)
    sdx = sdx.replace('0', '')

    if zero_pad:
        sdx += '0' * maxlength

    return sdx[:maxlength]
5966
|
|
|
|
5967
|
|
|
|
5968
|
|
|
def nrl(word):
    """Return the Naval Research Laboratory phonetic encoding of a word.

    This is defined by :cite:`Elovitz:1976`.

    :param str word: the word to transform
    :returns: the NRL phonetic encoding
    :rtype: str
    """
    def to_regex(pattern, left=True):
        # Translate an NRL context pattern into a Python regex.
        # The NRL notation uses one-character classes:
        #   '#' one or more vowels, ':' zero or more consonants,
        #   '^' one consonant, '.' a voiced consonant,
        #   '%' a suffix (ER/E/ES/ED/ING/ELY), '+' a front vowel (E/I/Y),
        #   ' ' a word boundary.
        new_pattern = ''
        replacements = {'#': '[AEIOU]+',
                        ':': '[BCDFGHJKLMNPQRSTVWXYZ]*',
                        '^': '[BCDFGHJKLMNPQRSTVWXYZ]',
                        '.': '[BDVGJLMNTWZ]',
                        '%': '(ER|E|ES|ED|ING|ELY)',
                        '+': '[EIY]',
                        ' ': '^'}
        for char in pattern:
            new_pattern += (replacements[char] if char in replacements
                            else char)

        # Left contexts must match up to the END of the text left of the
        # cursor; right contexts must match from its START.  NOTE: ' '
        # was mapped to '^' above, so for a right context it is flipped
        # to '$' here (end-of-string = word boundary on the right).
        if left:
            new_pattern += '$'
            if '^' not in pattern:
                new_pattern = '^.*' + new_pattern
        else:
            new_pattern = '^' + new_pattern.replace('^', '$')
            if '$' not in new_pattern:
                new_pattern += '.*$'

        return new_pattern

    # Rule table from Elovitz et al. (1976), keyed by the first letter of
    # the matched text (' ' for punctuation/unknown characters).  Each rule
    # is (left context, match, right context, output phonemes) in the NRL
    # context notation decoded by to_regex above.  Within each key, rules
    # are ordered most-specific first and the FIRST matching rule wins, so
    # the ordering is semantically significant.
    rules = {' ': (('', ' ', '', ' '),
                   ('', '-', '', ''),
                   ('.', '\'S', '', 'z'),
                   ('#:.E', '\'S', '', 'z'),
                   ('#', '\'S', '', 'z'),
                   ('', '\'', '', ''),
                   ('', ',', '', ' '),
                   ('', '.', '', ' '),
                   ('', '?', '', ' '),
                   ('', '!', '', ' ')),
             'A': (('', 'A', ' ', 'AX'),
                   (' ', 'ARE', ' ', 'AAr'),
                   (' ', 'AR', 'O', 'AXr'),
                   ('', 'AR', '#', 'EHr'),
                   ('^', 'AS', '#', 'EYs'),
                   ('', 'A', 'WA', 'AX'),
                   ('', 'AW', '', 'AO'),
                   (' :', 'ANY', '', 'EHnIY'),
                   ('', 'A', '^+#', 'EY'),
                   ('#:', 'ALLY', '', 'AXlIY'),
                   (' ', 'AL', '#', 'AXl'),
                   ('', 'AGAIN', '', 'AXgEHn'),
                   ('#:', 'AG', 'E', 'IHj'),
                   ('', 'A', '^+:#', 'AE'),
                   (' :', 'A', '^+ ', 'EY'),
                   ('', 'A', '^%', 'EY'),
                   (' ', 'ARR', '', 'AXr'),
                   ('', 'ARR', '', 'AEr'),
                   (' :', 'AR', ' ', 'AAr'),
                   ('', 'AR', ' ', 'ER'),
                   ('', 'AR', '', 'AAr'),
                   ('', 'AIR', '', 'EHr'),
                   ('', 'AI', '', 'EY'),
                   ('', 'AY', '', 'EY'),
                   ('', 'AU', '', 'AO'),
                   ('#:', 'AL', ' ', 'AXl'),
                   ('#:', 'ALS', ' ', 'AXlz'),
                   ('', 'ALK', '', 'AOk'),
                   ('', 'AL', '^', 'AOl'),
                   (' :', 'ABLE', '', 'EYbAXl'),
                   ('', 'ABLE', '', 'AXbAXl'),
                   ('', 'ANG', '+', 'EYnj'),
                   ('', 'A', '', 'AE')),
             'B': ((' ', 'BE', '^#', 'bIH'),
                   ('', 'BEING', '', 'bIYIHNG'),
                   (' ', 'BOTH', ' ', 'bOWTH'),
                   (' ', 'BUS', '#', 'bIHz'),
                   ('', 'BUIL', '', 'bIHl'),
                   ('', 'B', '', 'b')),
             'C': ((' ', 'CH', '^', 'k'),
                   ('^E', 'CH', '', 'k'),
                   ('', 'CH', '', 'CH'),
                   (' S', 'CI', '#', 'sAY'),
                   ('', 'CI', 'A', 'SH'),
                   ('', 'CI', 'O', 'SH'),
                   ('', 'CI', 'EN', 'SH'),
                   ('', 'C', '+', 's'),
                   ('', 'CK', '', 'k'),
                   ('', 'COM', '%', 'kAHm'),
                   ('', 'C', '', 'k')),
             'D': (('#:', 'DED', ' ', 'dIHd'),
                   ('.E', 'D', ' ', 'd'),
                   ('#:^E', 'D', ' ', 't'),
                   (' ', 'DE', '^#', 'dIH'),
                   (' ', 'DO', ' ', 'dUW'),
                   (' ', 'DOES', '', 'dAHz'),
                   (' ', 'DOING', '', 'dUWIHNG'),
                   (' ', 'DOW', '', 'dAW'),
                   ('', 'DU', 'A', 'jUW'),
                   ('', 'D', '', 'd')),
             'E': (('#:', 'E', ' ', ''),
                   ('\':^', 'E', ' ', ''),
                   (' :', 'E', ' ', 'IY'),
                   ('#', 'ED', ' ', 'd'),
                   ('#:', 'E', 'D ', ''),
                   ('', 'EV', 'ER', 'EHv'),
                   ('', 'E', '^%', 'IY'),
                   ('', 'ERI', '#', 'IYrIY'),
                   ('', 'ERI', '', 'EHrIH'),
                   ('#:', 'ER', '#', 'ER'),
                   ('', 'ER', '#', 'EHr'),
                   ('', 'ER', '', 'ER'),
                   (' ', 'EVEN', '', 'IYvEHn'),
                   ('#:', 'E', 'W', ''),
                   ('T', 'EW', '', 'UW'),
                   ('S', 'EW', '', 'UW'),
                   ('R', 'EW', '', 'UW'),
                   ('D', 'EW', '', 'UW'),
                   ('L', 'EW', '', 'UW'),
                   ('Z', 'EW', '', 'UW'),
                   ('N', 'EW', '', 'UW'),
                   ('J', 'EW', '', 'UW'),
                   ('TH', 'EW', '', 'UW'),
                   ('CH', 'EW', '', 'UW'),
                   ('SH', 'EW', '', 'UW'),
                   ('', 'EW', '', 'yUW'),
                   ('', 'E', 'O', 'IY'),
                   ('#:S', 'ES', ' ', 'IHz'),
                   ('#:C', 'ES', ' ', 'IHz'),
                   ('#:G', 'ES', ' ', 'IHz'),
                   ('#:Z', 'ES', ' ', 'IHz'),
                   ('#:X', 'ES', ' ', 'IHz'),
                   ('#:J', 'ES', ' ', 'IHz'),
                   ('#:CH', 'ES', ' ', 'IHz'),
                   ('#:SH', 'ES', ' ', 'IHz'),
                   ('#:', 'E', 'S ', ''),
                   ('#:', 'ELY', ' ', 'lIY'),
                   ('#:', 'EMENT', '', 'mEHnt'),
                   ('', 'EFUL', '', 'fUHl'),
                   ('', 'EE', '', 'IY'),
                   ('', 'EARN', '', 'ERn'),
                   (' ', 'EAR', '^', 'ER'),
                   ('', 'EAD', '', 'EHd'),
                   ('#:', 'EA', ' ', 'IYAX'),
                   ('', 'EA', 'SU', 'EH'),
                   ('', 'EA', '', 'IY'),
                   ('', 'EIGH', '', 'EY'),
                   ('', 'EI', '', 'IY'),
                   (' ', 'EYE', '', 'AY'),
                   ('', 'EY', '', 'IY'),
                   ('', 'EU', '', 'yUW'),
                   ('', 'E', '', 'EH')),
             'F': (('', 'FUL', '', 'fUHl'),
                   ('', 'F', '', 'f')),
             'G': (('', 'GIV', '', 'gIHv'),
                   (' ', 'G', 'I^', 'g'),
                   ('', 'GE', 'T', 'gEH'),
                   ('SU', 'GGES', '', 'gjEHs'),
                   ('', 'GG', '', 'g'),
                   (' B#', 'G', '', 'g'),
                   ('', 'G', '+', 'j'),
                   ('', 'GREAT', '', 'grEYt'),
                   ('#', 'GH', '', ''),
                   ('', 'G', '', 'g')),
             'H': ((' ', 'HAV', '', 'hAEv'),
                   (' ', 'HERE', '', 'hIYr'),
                   (' ', 'HOUR', '', 'AWER'),
                   ('', 'HOW', '', 'hAW'),
                   ('', 'H', '#', 'h'),
                   ('', 'H', '', '')),
             'I': ((' ', 'IN', '', 'IHn'),
                   (' ', 'I', ' ', 'AY'),
                   ('', 'IN', 'D', 'AYn'),
                   ('', 'IER', '', 'IYER'),
                   ('#:R', 'IED', '', 'IYd'),
                   ('', 'IED', ' ', 'AYd'),
                   ('', 'IEN', '', 'IYEHn'),
                   ('', 'IE', 'T', 'AYEH'),
                   (' :', 'I', '%', 'AY'),
                   ('', 'I', '%', 'IY'),
                   ('', 'IE', '', 'IY'),
                   ('', 'I', '^+:#', 'IH'),
                   ('', 'IR', '#', 'AYr'),
                   ('', 'IZ', '%', 'AYz'),
                   ('', 'IS', '%', 'AYz'),
                   ('', 'I', 'D%', 'AY'),
                   ('+^', 'I', '^+', 'IH'),
                   ('', 'I', 'T%', 'AY'),
                   ('#:^', 'I', '^+', 'IH'),
                   ('', 'I', '^+', 'AY'),
                   ('', 'IR', '', 'ER'),
                   ('', 'IGH', '', 'AY'),
                   ('', 'ILD', '', 'AYld'),
                   ('', 'IGN', ' ', 'AYn'),
                   ('', 'IGN', '^', 'AYn'),
                   ('', 'IGN', '%', 'AYn'),
                   ('', 'IQUE', '', 'IYk'),
                   ('', 'I', '', 'IH')),
             'J': (('', 'J', '', 'j'),),
             'K': ((' ', 'K', 'N', ''),
                   ('', 'K', '', 'k')),
             'L': (('', 'LO', 'C#', 'lOW'),
                   ('L', 'L', '', ''),
                   ('#:^', 'L', '%', 'AXl'),
                   ('', 'LEAD', '', 'lIYd'),
                   ('', 'L', '', 'l')),
             'M': (('', 'MOV', '', 'mUWv'),
                   ('', 'M', '', 'm')),
             'N': (('E', 'NG', '+', 'nj'),
                   ('', 'NG', 'R', 'NGg'),
                   ('', 'NG', '#', 'NGg'),
                   ('', 'NGL', '%', 'NGgAXl'),
                   ('', 'NG', '', 'NG'),
                   ('', 'NK', '', 'NGk'),
                   (' ', 'NOW', ' ', 'nAW'),
                   ('', 'N', '', 'n')),
             'O': (('', 'OF', ' ', 'AXv'),
                   ('', 'OROUGH', '', 'EROW'),
                   ('#:', 'OR', ' ', 'ER'),
                   ('#:', 'ORS', ' ', 'ERz'),
                   ('', 'OR', '', 'AOr'),
                   (' ', 'ONE', '', 'wAHn'),
                   ('', 'OW', '', 'OW'),
                   (' ', 'OVER', '', 'OWvER'),
                   ('', 'OV', '', 'AHv'),
                   ('', 'O', '^%', 'OW'),
                   ('', 'O', '^EN', 'OW'),
                   ('', 'O', '^I#', 'OW'),
                   ('', 'OL', 'D', 'OWl'),
                   ('', 'OUGHT', '', 'AOt'),
                   ('', 'OUGH', '', 'AHf'),
                   (' ', 'OU', '', 'AW'),
                   ('H', 'OU', 'S#', 'AW'),
                   ('', 'OUS', '', 'AXs'),
                   ('', 'OUR', '', 'AOr'),
                   ('', 'OULD', '', 'UHd'),
                   ('^', 'OU', '^L', 'AH'),
                   ('', 'OUP', '', 'UWp'),
                   ('', 'OU', '', 'AW'),
                   ('', 'OY', '', 'OY'),
                   ('', 'OING', '', 'OWIHNG'),
                   ('', 'OI', '', 'OY'),
                   ('', 'OOR', '', 'AOr'),
                   ('', 'OOK', '', 'UHk'),
                   ('', 'OOD', '', 'UHd'),
                   ('', 'OO', '', 'UW'),
                   ('', 'O', 'E', 'OW'),
                   ('', 'O', ' ', 'OW'),
                   ('', 'OA', '', 'OW'),
                   (' ', 'ONLY', '', 'OWnlIY'),
                   (' ', 'ONCE', '', 'wAHns'),
                   ('', 'ON\'T', '', 'OWnt'),
                   ('C', 'O', 'N', 'AA'),
                   ('', 'O', 'NG', 'AO'),
                   (' :^', 'O', 'N', 'AH'),
                   ('I', 'ON', '', 'AXn'),
                   ('#:', 'ON', ' ', 'AXn'),
                   ('#^', 'ON', '', 'AXn'),
                   ('', 'O', 'ST ', 'OW'),
                   ('', 'OF', '^', 'AOf'),
                   ('', 'OTHER', '', 'AHDHER'),
                   ('', 'OSS', ' ', 'AOs'),
                   ('#:^', 'OM', '', 'AHm'),
                   ('', 'O', '', 'AA')),
             'P': (('', 'PH', '', 'f'),
                   ('', 'PEOP', '', 'pIYp'),
                   ('', 'POW', '', 'pAW'),
                   ('', 'PUT', ' ', 'pUHt'),
                   ('', 'P', '', 'p')),
             'Q': (('', 'QUAR', '', 'kwAOr'),
                   ('', 'QU', '', 'kw'),
                   ('', 'Q', '', 'k')),
             'R': ((' ', 'RE', '^#', 'rIY'),
                   ('', 'R', '', 'r')),
             'S': (('', 'SH', '', 'SH'),
                   ('#', 'SION', '', 'ZHAXn'),
                   ('', 'SOME', '', 'sAHm'),
                   ('#', 'SUR', '#', 'ZHER'),
                   ('', 'SUR', '#', 'SHER'),
                   ('#', 'SU', '#', 'ZHUW'),
                   ('#', 'SSU', '#', 'SHUW'),
                   ('#', 'SED', ' ', 'zd'),
                   ('#', 'S', '#', 'z'),
                   ('', 'SAID', '', 'sEHd'),
                   ('^', 'SION', '', 'SHAXn'),
                   ('', 'S', 'S', ''),
                   ('.', 'S', ' ', 'z'),
                   ('#:.E', 'S', ' ', 'z'),
                   ('#:^##', 'S', ' ', 'z'),
                   ('#:^#', 'S', ' ', 's'),
                   ('U', 'S', ' ', 's'),
                   (' :#', 'S', ' ', 'z'),
                   (' ', 'SCH', '', 'sk'),
                   ('', 'S', 'C+', ''),
                   ('#', 'SM', '', 'zm'),
                   ('#', 'SN', '\'', 'zAXn'),
                   ('', 'S', '', 's')),
             'T': ((' ', 'THE', ' ', 'DHAX'),
                   ('', 'TO', ' ', 'tUW'),
                   ('', 'THAT', ' ', 'DHAEt'),
                   (' ', 'THIS', ' ', 'DHIHs'),
                   (' ', 'THEY', '', 'DHEY'),
                   (' ', 'THERE', '', 'DHEHr'),
                   ('', 'THER', '', 'DHER'),
                   ('', 'THEIR', '', 'DHEHr'),
                   (' ', 'THAN', ' ', 'DHAEn'),
                   (' ', 'THEM', ' ', 'DHEHm'),
                   ('', 'THESE', ' ', 'DHIYz'),
                   (' ', 'THEN', '', 'DHEHn'),
                   ('', 'THROUGH', '', 'THrUW'),
                   ('', 'THOSE', '', 'DHOWz'),
                   ('', 'THOUGH', ' ', 'DHOW'),
                   (' ', 'THUS', '', 'DHAHs'),
                   ('', 'TH', '', 'TH'),
                   ('#:', 'TED', ' ', 'tIHd'),
                   ('S', 'TI', '#N', 'CH'),
                   ('', 'TI', 'O', 'SH'),
                   ('', 'TI', 'A', 'SH'),
                   ('', 'TIEN', '', 'SHAXn'),
                   ('', 'TUR', '#', 'CHER'),
                   ('', 'TU', 'A', 'CHUW'),
                   (' ', 'TWO', '', 'tUW'),
                   ('', 'T', '', 't')),
             'U': ((' ', 'UN', 'I', 'yUWn'),
                   (' ', 'UN', '', 'AHn'),
                   (' ', 'UPON', '', 'AXpAOn'),
                   ('T', 'UR', '#', 'UHr'),
                   ('S', 'UR', '#', 'UHr'),
                   ('R', 'UR', '#', 'UHr'),
                   ('D', 'UR', '#', 'UHr'),
                   ('L', 'UR', '#', 'UHr'),
                   ('Z', 'UR', '#', 'UHr'),
                   ('N', 'UR', '#', 'UHr'),
                   ('J', 'UR', '#', 'UHr'),
                   ('TH', 'UR', '#', 'UHr'),
                   ('CH', 'UR', '#', 'UHr'),
                   ('SH', 'UR', '#', 'UHr'),
                   ('', 'UR', '#', 'yUHr'),
                   ('', 'UR', '', 'ER'),
                   ('', 'U', '^ ', 'AH'),
                   ('', 'U', '^^', 'AH'),
                   ('', 'UY', '', 'AY'),
                   (' G', 'U', '#', ''),
                   ('G', 'U', '%', ''),
                   ('G', 'U', '#', 'w'),
                   ('#N', 'U', '', 'yUW'),
                   ('T', 'U', '', 'UW'),
                   ('S', 'U', '', 'UW'),
                   ('R', 'U', '', 'UW'),
                   ('D', 'U', '', 'UW'),
                   ('L', 'U', '', 'UW'),
                   ('Z', 'U', '', 'UW'),
                   ('N', 'U', '', 'UW'),
                   ('J', 'U', '', 'UW'),
                   ('TH', 'U', '', 'UW'),
                   ('CH', 'U', '', 'UW'),
                   ('SH', 'U', '', 'UW'),
                   ('', 'U', '', 'yUW')),
             'V': (('', 'VIEW', '', 'vyUW'),
                   ('', 'V', '', 'v')),
             'W': ((' ', 'WERE', '', 'wER'),
                   ('', 'WA', 'S', 'wAA'),
                   ('', 'WA', 'T', 'wAA'),
                   ('', 'WHERE', '', 'WHEHr'),
                   ('', 'WHAT', '', 'WHAAt'),
                   ('', 'WHOL', '', 'hOWl'),
                   ('', 'WHO', '', 'hUW'),
                   ('', 'WH', '', 'WH'),
                   ('', 'WAR', '', 'wAOr'),
                   ('', 'WOR', '^', 'wER'),
                   ('', 'WR', '', 'r'),
                   ('', 'W', '', 'w')),
             'X': (('', 'X', '', 'ks'),),
             'Y': (('', 'YOUNG', '', 'yAHNG'),
                   (' ', 'YOU', '', 'yUW'),
                   (' ', 'YES', '', 'yEHs'),
                   (' ', 'Y', '', 'y'),
                   ('#:^', 'Y', ' ', 'IY'),
                   ('#:^', 'Y', 'I', 'IY'),
                   (' :', 'Y', ' ', 'AY'),
                   (' :', 'Y', '#', 'AY'),
                   (' :', 'Y', '^+:#', 'IH'),
                   (' :', 'Y', '^#', 'AY'),
                   ('', 'Y', '', 'IH')),
             'Z': (('', 'Z', '', 'z'),)}

    word = word.upper()

    # Scan the word left to right.  At each position, try the rules for
    # the current character in table order; the first rule whose match
    # text and both contexts fit emits its phonemes and advances the
    # cursor past the matched text.
    pron = ''
    pos = 0
    while pos < len(word):
        left_orig = word[:pos]
        right_orig = word[pos:]
        # Characters with no rule table (digits, punctuation beyond the
        # ' ' entries) fall through to the ' ' rules.
        first = word[pos] if word[pos] in rules else ' '
        for rule in rules[first]:
            left, match, right, out = rule
            if right_orig.startswith(match):
                if left:
                    l_pattern = to_regex(left, left=True)
                if right:
                    r_pattern = to_regex(right, left=False)
                # An empty context always matches, so l_pattern/r_pattern
                # are only consulted when they were actually built above.
                if ((not left or re_match(l_pattern, left_orig)) and
                        (not right or
                         re_match(r_pattern, right_orig[len(match):]))):
                    pron += out
                    pos += len(match)
                    break
        else:
            # for-else: no rule matched; copy the character unchanged.
            pron += word[pos]
            pos += 1

    return pron
6382
|
|
|
|
6383
|
|
|
|
6384
|
|
|
def bmpm(word, language_arg=0, name_mode='gen', match_mode='approx',
         concat=False, filter_langs=False):
    """Return the Beider-Morse Phonetic Matching algorithm code for a word.

    The Beider-Morse Phonetic Matching algorithm is described in
    :cite:`Beider:2008`.
    The reference implementation is licensed under GPLv3.

    This is a thin public wrapper; all of the work is delegated to _bmpm.

    :param str word: the word to transform
    :param str language_arg: the language of the term; supported values
        include:

            - 'any'
            - 'arabic'
            - 'cyrillic'
            - 'czech'
            - 'dutch'
            - 'english'
            - 'french'
            - 'german'
            - 'greek'
            - 'greeklatin'
            - 'hebrew'
            - 'hungarian'
            - 'italian'
            - 'latvian'
            - 'polish'
            - 'portuguese'
            - 'romanian'
            - 'russian'
            - 'spanish'
            - 'turkish'

    :param str name_mode: the name mode of the algorithm:

            - 'gen' -- general (default)
            - 'ash' -- Ashkenazi
            - 'sep' -- Sephardic

    :param str match_mode: matching mode: 'approx' or 'exact'
    :param bool concat: concatenation mode
    :param bool filter_langs: filter out incompatible languages
    :returns: the BMPM value(s), space-joined
    :rtype: str

    >>> bmpm('Christopher')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir xristopi xritopir xritopi xristofi xritofir xritofi tzristopir
    tzristofir zristopir zristopi zritopir zritopi zristofir zristofi zritofir
    zritofi'
    >>> bmpm('Niall')
    'nial niol'
    >>> bmpm('Smith')
    'zmit'
    >>> bmpm('Schmidt')
    'zmit stzmit'

    >>> bmpm('Christopher', language_arg='German')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir'
    >>> bmpm('Christopher', language_arg='English')
    'tzristofir tzrQstofir tzristafir tzrQstafir xristofir xrQstofir xristafir
    xrQstafir'
    >>> bmpm('Christopher', language_arg='German', name_mode='ash')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir'

    >>> bmpm('Christopher', language_arg='German', match_mode='exact')
    'xriStopher xriStofer xristopher xristofer'
    """
    return _bmpm(word, language_arg, name_mode, match_mode,
                 concat, filter_langs)
6456
|
|
|
|
6457
|
|
|
|
6458
|
|
|
if __name__ == '__main__':
    # Run this module's doctests when the file is executed directly.
    import doctest
    doctest.testmod()
6461
|
|
|
|