# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic.

The phonetic module implements phonetic algorithms including:

    - Robert C. Russell's Index
    - American Soundex
    - Refined Soundex
    - Daitch-Mokotoff Soundex
    - Kölner Phonetik
    - NYSIIS
    - Match Rating Algorithm
    - Metaphone
    - Double Metaphone
    - Caverphone
    - Alpha Search Inquiry System
    - Fuzzy Soundex
    - Phonex
    - Phonem
    - Phonix
    - SfinxBis
    - phonet
    - Standardized Phonetic Frequency Code
    - Statistics Canada
    - Lein
    - Roger Root
    - Oxford Name Compression Algorithm (ONCA)
    - Beider-Morse Phonetic Matching
"""

from __future__ import division, unicode_literals

import re
import unicodedata
from collections import Counter
from itertools import groupby

from six import text_type
from six.moves import range

from ._bm import _bmpm

_INFINITY = float('inf')


def _delete_consecutive_repeats(word):
    """Delete consecutive repeated characters in a word.

    :param str word: the word to transform
    :returns: word with consecutive repeating characters collapsed to
        a single instance
    :rtype: str
    """
    return ''.join(char for char, _ in groupby(word))
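# As a rough illustration of the helper above: groupby() yields one
# (character, run) pair per run of equal characters, so a call like
# _delete_consecutive_repeats('aabbbc') should collapse to 'abc'.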
def russell_index(word):
    """Return the Russell Index (integer output) of a word.

    This follows Robert C. Russell's Index algorithm, as described in
    US Patent 1,261,167 (1917)

    :param str word: the word to transform
    :returns: the Russell Index value
    :rtype: int

    >>> russell_index('Christopher')
    3813428
    >>> russell_index('Niall')
    715
    >>> russell_index('Smith')
    3614
    >>> russell_index('Schmidt')
    3614
    """
    _russell_translation = dict(zip((ord(_) for _ in
                                     'ABCDEFGIKLMNOPQRSTUVXYZ'),
                                    '12341231356712383412313'))

    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = word.replace('GH', '')  # discard gh (rule 3)
    word = word.rstrip('SZ')  # discard /[sz]$/ (rule 3)

    # translate according to Russell's mapping
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'I', 'K', 'L', 'M', 'N',
                    'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'X', 'Y', 'Z'})
    sdx = word.translate(_russell_translation)

    # remove any 1s after the first occurrence
    one = sdx.find('1')+1
    if one:
        sdx = sdx[:one] + ''.join(c for c in sdx[one:] if c != '1')

    # remove repeating characters
    sdx = _delete_consecutive_repeats(sdx)

    # return as an int
    return int(sdx) if sdx else float('NaN')
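# Rough worked example for russell_index (matching the doctest above):
# 'Christopher' uppercases to 'CHRISTOPHER', the H's are dropped by the letter
# filter, and 'CRISTOPER' translates to '381341218'; removing 1s after the
# first occurrence gives '3813428', no consecutive repeats remain, and the
# result is returned as the integer 3813428.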
def russell_index_num_to_alpha(num):
    """Convert the Russell Index integer to an alphabetic string.

    This follows Robert C. Russell's Index algorithm, as described in
    US Patent 1,261,167 (1917)

    :param int num: a Russell Index integer value
    :returns: the Russell Index as an alphabetic string
    :rtype: str

    >>> russell_index_num_to_alpha(3813428)
    'CRACDBR'
    >>> russell_index_num_to_alpha(715)
    'NAL'
    >>> russell_index_num_to_alpha(3614)
    'CMAD'
    """
    _russell_num_translation = dict(zip((ord(_) for _ in '12345678'),
                                        'ABCDLMNR'))
    num = ''.join(c for c in text_type(num) if c in {'1', '2', '3', '4', '5',
                                                     '6', '7', '8'})
    if num:
        return num.translate(_russell_num_translation)
    return ''
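# The digit-to-letter map above is simply 1-8 -> 'ABCDLMNR'; for instance,
# 3813428 should come back as 'CRACDBR' (3->C, 8->R, 1->A, 3->C, 4->D, 2->B,
# 8->R), as in the doctest.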
def russell_index_alpha(word):
    """Return the Russell Index (alphabetic output) for the word.

    This follows Robert C. Russell's Index algorithm, as described in
    US Patent 1,261,167 (1917)

    :param str word: the word to transform
    :returns: the Russell Index value as an alphabetic string
    :rtype: str

    >>> russell_index_alpha('Christopher')
    'CRACDBR'
    >>> russell_index_alpha('Niall')
    'NAL'
    >>> russell_index_alpha('Smith')
    'CMAD'
    >>> russell_index_alpha('Schmidt')
    'CMAD'
    """
    if word:
        return russell_index_num_to_alpha(russell_index(word))
    return ''
def soundex(word, maxlength=4, var='American', reverse=False, zero_pad=True):
    """Return the Soundex code for a word.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 4)
    :param str var: the variant of the algorithm to employ (defaults to
        'American'):

        - 'American' follows the American Soundex algorithm, as described at
          http://www.archives.gov/publications/general-info-leaflets/55-census.html
          and in Knuth(1998:394); this is also called Miracode
        - 'special' follows the rules from the 1880-1910 US Census
          retrospective re-analysis, in which h & w are not treated as blocking
          consonants but as vowels.
          Cf. http://creativyst.com/Doc/Articles/SoundEx1/SoundEx1.htm
        - 'dm' computes the Daitch-Mokotoff Soundex

    :param bool reverse: reverse the word before computing the selected Soundex
        (defaults to False); This results in "Reverse Soundex"
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Soundex value
    :rtype: str

    >>> soundex("Christopher")
    'C623'
    >>> soundex("Niall")
    'N400'
    >>> soundex('Smith')
    'S530'
    >>> soundex('Schmidt')
    'S530'

    >>> soundex('Christopher', maxlength=_INFINITY)
    'C623160000000000000000000000000000000000000000000000000000000000'
    >>> soundex('Christopher', maxlength=_INFINITY, zero_pad=False)
    'C62316'

    >>> soundex('Christopher', reverse=True)
    'R132'

    >>> soundex('Ashcroft')
    'A261'
    >>> soundex('Asicroft')
    'A226'
    >>> soundex('Ashcroft', var='special')
    'A226'
    >>> soundex('Asicroft', var='special')
    'A226'

    >>> soundex('Christopher', var='dm')
    {'494379', '594379'}
    >>> soundex('Niall', var='dm')
    {'680000'}
    >>> soundex('Smith', var='dm')
    {'463000'}
    >>> soundex('Schmidt', var='dm')
    {'463000'}
    """
    _soundex_translation = dict(zip((ord(_) for _ in
                                     'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                    '01230129022455012623019202'))

    # Call the D-M Soundex function itself if requested
    if var == 'dm':
        return dm_soundex(word, maxlength, reverse, zero_pad)
    elif var == 'refined':
        return refined_soundex(word, maxlength, reverse, zero_pad)

    # Require a maxlength of at least 4 and not more than 64
    if maxlength is not None:
        maxlength = min(max(4, maxlength), 64)
    else:
        maxlength = 64

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    # Nothing to convert, return base case
    if not word:
        if zero_pad:
            return '0'*maxlength
        return '0'

    # Reverse word if computing Reverse Soundex
    if reverse:
        word = word[::-1]

    # apply the Soundex algorithm
    sdx = word.translate(_soundex_translation)

    if var == 'special':
        sdx = sdx.replace('9', '0')  # special rule for 1880-1910 census
    else:
        sdx = sdx.replace('9', '')  # rule 1
    sdx = _delete_consecutive_repeats(sdx)  # rule 3

    if word[0] in 'HW':
        sdx = word[0] + sdx
    else:
        sdx = word[0] + sdx[1:]
    sdx = sdx.replace('0', '')  # rule 1

    if zero_pad:
        sdx += ('0'*maxlength)  # rule 4

    return sdx[:maxlength]
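# Rough illustration of the 'special' variant: in 'Ashcroft' the S and C are
# separated only by an H. Under 'American' the H's 9 is discarded before
# duplicate codes are collapsed, so S and C merge and the code is 'A261';
# under 'special' the H becomes a vowel-like 0 that keeps the two 2s apart,
# giving 'A226' (cf. the doctests above).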
def refined_soundex(word, maxlength=_INFINITY, reverse=False, zero_pad=False):
    """Return the Refined Soundex code for a word.

    This is Soundex, but with more character classes. It appears to have been
    defined by the Apache Commons:
    https://commons.apache.org/proper/commons-codec/apidocs/src-html/org/apache/commons/codec/language/RefinedSoundex.html

    :param word: the word to transform
    :param maxlength: the length of the code returned (defaults to unlimited)
    :param reverse: reverse the word before computing the selected Soundex
        (defaults to False); This results in "Reverse Soundex"
    :param zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Refined Soundex value
    :rtype: str

    >>> refined_soundex('Christopher')
    'C3090360109'
    >>> refined_soundex('Niall')
    'N807'
    >>> refined_soundex('Smith')
    'S38060'
    >>> refined_soundex('Schmidt')
    'S30806'
    """
    _ref_soundex_translation = dict(zip((ord(_) for _ in
                                         'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                        '01360240043788015936020505'))

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    # Reverse word if computing Reverse Soundex
    if reverse:
        word = word[::-1]

    # apply the Soundex algorithm
    sdx = word[0] + word.translate(_ref_soundex_translation)
    sdx = _delete_consecutive_repeats(sdx)
    if maxlength and maxlength < _INFINITY:
        if zero_pad:
            sdx += ('0' * maxlength)  # rule 4
        sdx = sdx[:maxlength]

    return sdx
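# Unlike plain Soundex, the refined variant keeps vowels as 0s, prepends the
# first letter rather than substituting it, and does not strip 0s. 'Niall',
# for example, translates to '80077'; prepending 'N' and collapsing repeats
# should give the doctest value 'N807'.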
def dm_soundex(word, maxlength=6, reverse=False, zero_pad=True):
    """Return the Daitch-Mokotoff Soundex code for a word.

    Returns values of a word as a set. A collection is necessary since there
    can be multiple values for a single word.

    :param word: the word to transform
    :param maxlength: the length of the code returned (defaults to 6)
    :param reverse: reverse the word before computing the selected Soundex
        (defaults to False); This results in "Reverse Soundex"
    :param zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Daitch-Mokotoff Soundex value
    :rtype: set

    >>> dm_soundex('Christopher')
    {'494379', '594379'}
    >>> dm_soundex('Niall')
    {'680000'}
    >>> dm_soundex('Smith')
    {'463000'}
    >>> dm_soundex('Schmidt')
    {'463000'}

    >>> dm_soundex('The quick brown fox', maxlength=20, zero_pad=False)
    {'35457976754', '3557976754'}
    """
    _dms_table = {'STCH': (2, 4, 4), 'DRZ': (4, 4, 4), 'ZH': (4, 4, 4),
                  'ZHDZH': (2, 4, 4), 'DZH': (4, 4, 4), 'DRS': (4, 4, 4),
                  'DZS': (4, 4, 4), 'SCHTCH': (2, 4, 4), 'SHTSH': (2, 4, 4),
                  'SZCZ': (2, 4, 4), 'TZS': (4, 4, 4), 'SZCS': (2, 4, 4),
                  'STSH': (2, 4, 4), 'SHCH': (2, 4, 4), 'D': (3, 3, 3),
                  'H': (5, 5, '_'), 'TTSCH': (4, 4, 4), 'THS': (4, 4, 4),
                  'L': (8, 8, 8), 'P': (7, 7, 7), 'CHS': (5, 54, 54),
                  'T': (3, 3, 3), 'X': (5, 54, 54), 'OJ': (0, 1, '_'),
                  'OI': (0, 1, '_'), 'SCHTSH': (2, 4, 4), 'OY': (0, 1, '_'),
                  'Y': (1, '_', '_'), 'TSH': (4, 4, 4), 'ZDZ': (2, 4, 4),
                  'TSZ': (4, 4, 4), 'SHT': (2, 43, 43), 'SCHTSCH': (2, 4, 4),
                  'TTSZ': (4, 4, 4), 'TTZ': (4, 4, 4), 'SCH': (4, 4, 4),
                  'TTS': (4, 4, 4), 'SZD': (2, 43, 43), 'AI': (0, 1, '_'),
                  'PF': (7, 7, 7), 'TCH': (4, 4, 4), 'PH': (7, 7, 7),
                  'TTCH': (4, 4, 4), 'SZT': (2, 43, 43), 'ZDZH': (2, 4, 4),
                  'EI': (0, 1, '_'), 'G': (5, 5, 5), 'EJ': (0, 1, '_'),
                  'ZD': (2, 43, 43), 'IU': (1, '_', '_'), 'K': (5, 5, 5),
                  'O': (0, '_', '_'), 'SHTCH': (2, 4, 4), 'S': (4, 4, 4),
                  'TRZ': (4, 4, 4), 'SHD': (2, 43, 43), 'DSH': (4, 4, 4),
                  'CSZ': (4, 4, 4), 'EU': (1, 1, '_'), 'TRS': (4, 4, 4),
                  'ZS': (4, 4, 4), 'STRZ': (2, 4, 4), 'UY': (0, 1, '_'),
                  'STRS': (2, 4, 4), 'CZS': (4, 4, 4),
                  'MN': ('6_6', '6_6', '6_6'), 'UI': (0, 1, '_'),
                  'UJ': (0, 1, '_'), 'UE': (0, '_', '_'), 'EY': (0, 1, '_'),
                  'W': (7, 7, 7), 'IA': (1, '_', '_'), 'FB': (7, 7, 7),
                  'STSCH': (2, 4, 4), 'SCHT': (2, 43, 43),
                  'NM': ('6_6', '6_6', '6_6'), 'SCHD': (2, 43, 43),
                  'B': (7, 7, 7), 'DSZ': (4, 4, 4), 'F': (7, 7, 7),
                  'N': (6, 6, 6), 'CZ': (4, 4, 4), 'R': (9, 9, 9),
                  'U': (0, '_', '_'), 'V': (7, 7, 7), 'CS': (4, 4, 4),
                  'Z': (4, 4, 4), 'SZ': (4, 4, 4), 'TSCH': (4, 4, 4),
                  'KH': (5, 5, 5), 'ST': (2, 43, 43), 'KS': (5, 54, 54),
                  'SH': (4, 4, 4), 'SC': (2, 4, 4), 'SD': (2, 43, 43),
                  'DZ': (4, 4, 4), 'ZHD': (2, 43, 43), 'DT': (3, 3, 3),
                  'ZSH': (4, 4, 4), 'DS': (4, 4, 4), 'TZ': (4, 4, 4),
                  'TS': (4, 4, 4), 'TH': (3, 3, 3), 'TC': (4, 4, 4),
                  'A': (0, '_', '_'), 'E': (0, '_', '_'), 'I': (0, '_', '_'),
                  'AJ': (0, 1, '_'), 'M': (6, 6, 6), 'Q': (5, 5, 5),
                  'AU': (0, 7, '_'), 'IO': (1, '_', '_'), 'AY': (0, 1, '_'),
                  'IE': (1, '_', '_'), 'ZSCH': (4, 4, 4),
                  'CH': ((5, 4), (5, 4), (5, 4)),
                  'CK': ((5, 45), (5, 45), (5, 45)),
                  'C': ((5, 4), (5, 4), (5, 4)),
                  'J': ((1, 4), ('_', 4), ('_', 4)),
                  'RZ': ((94, 4), (94, 4), (94, 4)),
                  'RS': ((94, 4), (94, 4), (94, 4))}

    _dms_order = {'A': ('AI', 'AJ', 'AU', 'AY', 'A'),
                  'B': ('B'),
                  'C': ('CHS', 'CSZ', 'CZS', 'CH', 'CK', 'CS', 'CZ', 'C'),
                  'D': ('DRS', 'DRZ', 'DSH', 'DSZ', 'DZH', 'DZS', 'DS', 'DT',
                        'DZ', 'D'),
                  'E': ('EI', 'EJ', 'EU', 'EY', 'E'),
                  'F': ('FB', 'F'),
                  'G': ('G'),
                  'H': ('H'),
                  'I': ('IA', 'IE', 'IO', 'IU', 'I'),
                  'J': ('J'),
                  'K': ('KH', 'KS', 'K'),
                  'L': ('L'),
                  'M': ('MN', 'M'),
                  'N': ('NM', 'N'),
                  'O': ('OI', 'OJ', 'OY', 'O'),
                  'P': ('PF', 'PH', 'P'),
                  'Q': ('Q'),
                  'R': ('RS', 'RZ', 'R'),
                  'S': ('SCHTSCH', 'SCHTCH', 'SCHTSH', 'SHTCH', 'SHTSH',
                        'STSCH', 'SCHD', 'SCHT', 'SHCH', 'STCH', 'STRS',
                        'STRZ', 'STSH', 'SZCS', 'SZCZ', 'SCH', 'SHD', 'SHT',
                        'SZD', 'SZT', 'SC', 'SD', 'SH', 'ST', 'SZ', 'S'),
                  'T': ('TTSCH', 'TSCH', 'TTCH', 'TTSZ', 'TCH', 'THS', 'TRS',
                        'TRZ', 'TSH', 'TSZ', 'TTS', 'TTZ', 'TZS', 'TC', 'TH',
                        'TS', 'TZ', 'T'),
                  'U': ('UE', 'UI', 'UJ', 'UY', 'U'),
                  'V': ('V'),
                  'W': ('W'),
                  'X': ('X'),
                  'Y': ('Y'),
                  'Z': ('ZHDZH', 'ZDZH', 'ZSCH', 'ZDZ', 'ZHD', 'ZSH', 'ZD',
                        'ZH', 'ZS', 'Z')}

    _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}
    dms = ['']  # initialize empty code list

    # Require a maxlength of at least 6 and not more than 64
    if maxlength is not None:
        maxlength = min(max(6, maxlength), 64)
    else:
        maxlength = 64

    # uppercase, normalize, decompose, and filter non-A-Z
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    # Nothing to convert, return base case
    if not word:
        if zero_pad:
            return {'0'*maxlength}
        return {'0'}

    # Reverse word if computing Reverse Soundex
    if reverse:
        word = word[::-1]

    pos = 0
    while pos < len(word):
        # Iterate through _dms_order, which specifies the possible substrings
        # for which codes exist in the Daitch-Mokotoff coding
        for sstr in _dms_order[word[pos]]:  # pragma: no branch
            if word[pos:].startswith(sstr):
                # Having determined a valid substring start, retrieve the code
                dm_val = _dms_table[sstr]
                # Having retrieved the code (triple), determine the correct
                # positional variant (first, pre-vocalic, elsewhere)
                if pos == 0:
                    dm_val = dm_val[0]
                elif (pos+len(sstr) < len(word) and
                      word[pos+len(sstr)] in _vowels):
                    dm_val = dm_val[1]
                else:
                    dm_val = dm_val[2]

                # Build the code strings
                if isinstance(dm_val, tuple):
                    dms = [_ + text_type(dm_val[0]) for _ in dms] \
                            + [_ + text_type(dm_val[1]) for _ in dms]
                else:
                    dms = [_ + text_type(dm_val) for _ in dms]
                pos += len(sstr)
                break

    # Filter out double letters and _ placeholders
    dms = (''.join(c for c in _delete_consecutive_repeats(_) if c != '_')
           for _ in dms)

    # Trim codes and return set
    if zero_pad:
        dms = ((_ + ('0'*maxlength))[:maxlength] for _ in dms)
    else:
        dms = (_[:maxlength] for _ in dms)
    return set(dms)
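# Rough trace of the 'Schmidt' doctest above: 'SCH' is coded 4 (word-initial
# variant), 'M' becomes 6, 'I' maps to the '_' placeholder (it does not
# precede a vowel), and the final 'DT' becomes 3; dropping placeholders
# leaves '463', which is zero-padded to '463000'.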
def koelner_phonetik(word):
    """Return the Kölner Phonetik (numeric output) code for a word.

    Based on the algorithm described at
    https://de.wikipedia.org/wiki/Kölner_Phonetik

    While the output code is numeric, it is still a str because 0s can lead
    the code.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as a numeric string
    :rtype: str

    >>> koelner_phonetik('Christopher')
    '478237'
    >>> koelner_phonetik('Niall')
    '65'
    >>> koelner_phonetik('Smith')
    '862'
    >>> koelner_phonetik('Schmidt')
    '862'
    >>> koelner_phonetik('Müller')
    '657'
    >>> koelner_phonetik('Zimmermann')
    '86766'
    """
    # pylint: disable=too-many-branches
    def _after(word, i, letters):
        """Return True if word[i] follows one of the supplied letters."""
        if i > 0 and word[i-1] in letters:
            return True
        return False

    def _before(word, i, letters):
        """Return True if word[i] precedes one of the supplied letters."""
        if i+1 < len(word) and word[i+1] in letters:
            return True
        return False

    _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}

    sdx = ''

    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    word = word.replace('Ä', 'AE')
    word = word.replace('Ö', 'OE')
    word = word.replace('Ü', 'UE')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    # Nothing to convert, return base case
    if not word:
        return sdx

    for i in range(len(word)):
        if word[i] in _vowels:
            sdx += '0'
        elif word[i] == 'B':
            sdx += '1'
        elif word[i] == 'P':
            if _before(word, i, {'H'}):
                sdx += '3'
            else:
                sdx += '1'
        elif word[i] in {'D', 'T'}:
            if _before(word, i, {'C', 'S', 'Z'}):
                sdx += '8'
            else:
                sdx += '2'
        elif word[i] in {'F', 'V', 'W'}:
            sdx += '3'
        elif word[i] in {'G', 'K', 'Q'}:
            sdx += '4'
        elif word[i] == 'C':
            if _after(word, i, {'S', 'Z'}):
                sdx += '8'
            elif i == 0:
                if _before(word, i, {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U',
                                     'X'}):
                    sdx += '4'
                else:
                    sdx += '8'
            elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}):
                sdx += '4'
            else:
                sdx += '8'
        elif word[i] == 'X':
            if _after(word, i, {'C', 'K', 'Q'}):
                sdx += '8'
            else:
                sdx += '48'
        elif word[i] == 'L':
            sdx += '5'
        elif word[i] in {'M', 'N'}:
            sdx += '6'
        elif word[i] == 'R':
            sdx += '7'
        elif word[i] in {'S', 'Z'}:
            sdx += '8'

    sdx = _delete_consecutive_repeats(sdx)

    if sdx:
        sdx = sdx[0] + sdx[1:].replace('0', '')

    return sdx
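# Rough trace of the 'Müller' doctest above: Ü expands to UE, giving
# 'MUELLER', which codes to '6005507'; consecutive repeats collapse to
# '60507', and 0s after the first position are removed, leaving '657'.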
def koelner_phonetik_num_to_alpha(num):
    """Convert a Kölner Phonetik code from numeric to alphabetic.

    :param str num: a numeric Kölner Phonetik representation
    :returns: an alphabetic representation of the same word
    :rtype: str

    >>> koelner_phonetik_num_to_alpha(862)
    'SNT'
    >>> koelner_phonetik_num_to_alpha(657)
    'NLR'
    >>> koelner_phonetik_num_to_alpha(86766)
    'SNRNN'
    """
    _koelner_num_translation = dict(zip((ord(_) for _ in '012345678'),
                                        'APTFKLNRS'))
    num = ''.join(c for c in text_type(num) if c in {'0', '1', '2', '3', '4',
                                                     '5', '6', '7', '8'})
    return num.translate(_koelner_num_translation)
def koelner_phonetik_alpha(word):
    """Return the Kölner Phonetik (alphabetic output) code for a word.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as an alphabetic string
    :rtype: str

    >>> koelner_phonetik_alpha('Smith')
    'SNT'
    >>> koelner_phonetik_alpha('Schmidt')
    'SNT'
    >>> koelner_phonetik_alpha('Müller')
    'NLR'
    >>> koelner_phonetik_alpha('Zimmermann')
    'SNRNN'
    """
    return koelner_phonetik_num_to_alpha(koelner_phonetik(word))
def nysiis(word, maxlength=6, modified=False):
    """Return the NYSIIS code for a word.

    A description of the New York State Identification and Intelligence System
    algorithm can be found at
    https://en.wikipedia.org/wiki/New_York_State_Identification_and_Intelligence_System

    The modified version of this algorithm is described in Appendix B of
    Lynch, Billy T. and William L. Arends. `Selection of a Surname Coding
    Procedure for the SRS Record Linkage System.` Statistical Reporting
    Service, U.S. Department of Agriculture, Washington, D.C. February 1977.
    https://naldc.nal.usda.gov/download/27833/PDF

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 6) of the code to return
    :param bool modified: indicates whether to use USDA modified NYSIIS
    :returns: the NYSIIS value
    :rtype: str

    >>> nysiis('Christopher')
    'CRASTA'
    >>> nysiis('Niall')
    'NAL'
    >>> nysiis('Smith')
    'SNAT'
    >>> nysiis('Schmidt')
    'SNAD'

    >>> nysiis('Christopher', maxlength=_INFINITY)
    'CRASTAFAR'

    >>> nysiis('Christopher', maxlength=8, modified=True)
    'CRASTAFA'
    >>> nysiis('Niall', maxlength=8, modified=True)
    'NAL'
    >>> nysiis('Smith', maxlength=8, modified=True)
    'SNAT'
    >>> nysiis('Schmidt', maxlength=8, modified=True)
    'SNAD'
    """
    # Require a maxlength of at least 6
    if maxlength:
        maxlength = max(6, maxlength)

    _vowels = {'A', 'E', 'I', 'O', 'U'}

    word = ''.join(c for c in word.upper() if c.isalpha())
    word = word.replace('ß', 'SS')

    # exit early if there are no alphas
    if not word:
        return ''

    if modified:
        original_first_char = word[0]

    if word[:3] == 'MAC':
        word = 'MCC'+word[3:]
    elif word[:2] == 'KN':
        word = 'NN'+word[2:]
    elif word[:1] == 'K':
        word = 'C'+word[1:]
    elif word[:2] in {'PH', 'PF'}:
        word = 'FF'+word[2:]
    elif word[:3] == 'SCH':
        word = 'SSS'+word[3:]
    elif modified:
        if word[:2] == 'WR':
            word = 'RR'+word[2:]
        elif word[:2] == 'RH':
            word = 'RR'+word[2:]
        elif word[:2] == 'DG':
            word = 'GG'+word[2:]
        elif word[:1] in _vowels:
            word = 'A'+word[1:]

    if modified and word[-1] in {'S', 'Z'}:
        word = word[:-1]

    if word[-2:] == 'EE' or word[-2:] == 'IE' or (modified and
                                                  word[-2:] == 'YE'):
        word = word[:-2]+'Y'
    elif word[-2:] in {'DT', 'RT', 'RD'}:
        word = word[:-2]+'D'
    elif word[-2:] in {'NT', 'ND'}:
        word = word[:-2]+('N' if modified else 'D')
    elif modified:
        if word[-2:] == 'IX':
            word = word[:-2]+'ICK'
        elif word[-2:] == 'EX':
            word = word[:-2]+'ECK'
        elif word[-2:] in {'JR', 'SR'}:
            return 'ERROR'  # TODO: decide how best to return an error

    key = word[0]

    skip = 0
    for i in range(1, len(word)):
        if i >= len(word):
            continue
        elif skip:
            skip -= 1
            continue
        elif word[i:i+2] == 'EV':
            word = word[:i] + 'AF' + word[i+2:]
            skip = 1
        elif word[i] in _vowels:
            word = word[:i] + 'A' + word[i+1:]
        elif modified and i != len(word)-1 and word[i] == 'Y':
            word = word[:i] + 'A' + word[i+1:]
        elif word[i] == 'Q':
            word = word[:i] + 'G' + word[i+1:]
        elif word[i] == 'Z':
            word = word[:i] + 'S' + word[i+1:]
        elif word[i] == 'M':
            word = word[:i] + 'N' + word[i+1:]
        elif word[i:i+2] == 'KN':
            word = word[:i] + 'N' + word[i+2:]
        elif word[i] == 'K':
            word = word[:i] + 'C' + word[i+1:]
        elif modified and i == len(word)-3 and word[i:i+3] == 'SCH':
            word = word[:i] + 'SSA'
            skip = 2
        elif word[i:i+3] == 'SCH':
            word = word[:i] + 'SSS' + word[i+3:]
            skip = 2
        elif modified and i == len(word)-2 and word[i:i+2] == 'SH':
            word = word[:i] + 'SA'
            skip = 1
        elif word[i:i+2] == 'SH':
            word = word[:i] + 'SS' + word[i+2:]
            skip = 1
        elif word[i:i+2] == 'PH':
            word = word[:i] + 'FF' + word[i+2:]
            skip = 1
        elif modified and word[i:i+3] == 'GHT':
            word = word[:i] + 'TTT' + word[i+3:]
            skip = 2
        elif modified and word[i:i+2] == 'DG':
            word = word[:i] + 'GG' + word[i+2:]
            skip = 1
        elif modified and word[i:i+2] == 'WR':
            word = word[:i] + 'RR' + word[i+2:]
            skip = 1
        elif word[i] == 'H' and (word[i-1] not in _vowels or
                                 word[i+1:i+2] not in _vowels):
            word = word[:i] + word[i-1] + word[i+1:]
        elif word[i] == 'W' and word[i-1] in _vowels:
            word = word[:i] + word[i-1] + word[i+1:]

        if word[i:i+skip+1] != key[-1:]:
            key += word[i:i+skip+1]

    key = _delete_consecutive_repeats(key)

    if key[-1] == 'S':
        key = key[:-1]
    if key[-2:] == 'AY':
        key = key[:-2] + 'Y'
    if key[-1:] == 'A':
        key = key[:-1]
    if modified and key[0] == 'A':
        key = original_first_char + key[1:]

    if maxlength and maxlength < _INFINITY:
        key = key[:maxlength]

    return key
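# Rough trace of the 'Schmidt' doctest above: the head 'SCH' is rewritten to
# 'SSS' and the tail 'DT' to 'D', giving 'SSSMID'; in the main loop M becomes
# N and I becomes A, duplicate letters collapse, and the key comes out as
# 'SNAD'.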
def mra(word):
    """Return the MRA personal numeric identifier (PNI) for a word.

    A description of the Western Airlines Surname Match Rating Algorithm can
    be found on page 18 of
    https://archive.org/details/accessingindivid00moor

    :param str word: the word to transform
    :returns: the MRA PNI
    :rtype: str

    >>> mra('Christopher')
    'CHRPHR'
    >>> mra('Niall')
    'NL'
    >>> mra('Smith')
    'SMTH'
    >>> mra('Schmidt')
    'SCHMDT'
    """
    if not word:
        return word
    word = word.upper()
    word = word.replace('ß', 'SS')
    word = word[0]+''.join(c for c in word[1:] if
                           c not in {'A', 'E', 'I', 'O', 'U'})
    word = _delete_consecutive_repeats(word)
    if len(word) > 6:
        word = word[:3]+word[-3:]
    return word
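# For example, 'Christopher' keeps its first letter, loses the remaining
# vowels ('CHRSTPHR'), and, being longer than six characters, is reduced to
# its first and last three letters: 'CHRPHR', as in the doctest above.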
def metaphone(word, maxlength=_INFINITY):
    """Return the Metaphone code for a word.

    Based on Lawrence Philips' Pick BASIC code from 1990:
    http://aspell.net/metaphone/metaphone.basic
    This incorporates some corrections to the above code, particularly
    some of those suggested by Michael Kuhn in:
    http://aspell.net/metaphone/metaphone-kuhn.txt

    :param str word: the word to transform
    :param int maxlength: the maximum length of the returned Metaphone code
        (defaults to unlimited, but in Philips' original implementation
        this was 4)
    :returns: the Metaphone value
    :rtype: str

    >>> metaphone('Christopher')
    'KRSTFR'
    >>> metaphone('Niall')
    'NL'
    >>> metaphone('Smith')
    'SM0'
    >>> metaphone('Schmidt')
    'SKMTT'
    """
    # pylint: disable=too-many-branches
    _vowels = {'A', 'E', 'I', 'O', 'U'}
    _frontv = {'E', 'I', 'Y'}
    _varson = {'C', 'G', 'P', 'S', 'T'}

    # Require a maxlength of at least 4
    if maxlength is not None:
        maxlength = max(4, maxlength)
    else:
        maxlength = 64

    # As in variable sound--those modified by adding an "h"
    ename = ''.join(c for c in word.upper() if c.isalnum())
    ename = ename.replace('ß', 'SS')

    # Delete nonalphanumeric characters and make all caps
    if not ename:
        return ''
    if ename[0:2] in {'PN', 'AE', 'KN', 'GN', 'WR'}:
        ename = ename[1:]
    elif ename[0] == 'X':
        ename = 'S' + ename[1:]
    elif ename[0:2] == 'WH':
        ename = 'W' + ename[2:]

    # Convert to metaph
    elen = len(ename)-1
    metaph = ''
    for i in range(len(ename)):
        if len(metaph) >= maxlength:
            break
        if ((ename[i] not in {'G', 'T'} and
             i > 0 and ename[i-1] == ename[i])):
            continue

        if ename[i] in _vowels and i == 0:
            metaph = ename[i]

        elif ename[i] == 'B':
            if i != elen or ename[i-1] != 'M':
                metaph += ename[i]

        elif ename[i] == 'C':
            if not (i > 0 and ename[i-1] == 'S' and ename[i+1:i+2] in _frontv):
                if ename[i+1:i+3] == 'IA':
                    metaph += 'X'
                elif ename[i+1:i+2] in _frontv:
                    metaph += 'S'
                elif i > 0 and ename[i-1:i+2] == 'SCH':
                    metaph += 'K'
                elif ename[i+1:i+2] == 'H':
                    if i == 0 and i+1 < elen and ename[i+2:i+3] not in _vowels:
                        metaph += 'K'
                    else:
                        metaph += 'X'
                else:
                    metaph += 'K'

        elif ename[i] == 'D':
            if ename[i+1:i+2] == 'G' and ename[i+2:i+3] in _frontv:
                metaph += 'J'
            else:
                metaph += 'T'

        elif ename[i] == 'G':
            if ename[i+1:i+2] == 'H' and not (i+1 == elen or
                                              ename[i+2:i+3] not in _vowels):
                continue
            elif i > 0 and ((i+1 == elen and ename[i+1] == 'N') or
                            (i+3 == elen and ename[i+1:i+4] == 'NED')):
                continue
            elif (i-1 > 0 and i+1 <= elen and ename[i-1] == 'D' and
                  ename[i+1] in _frontv):
                continue
            elif ename[i+1:i+2] == 'G':
                continue
            elif ename[i+1:i+2] in _frontv:
                if i == 0 or ename[i-1] != 'G':
                    metaph += 'J'
                else:
                    metaph += 'K'
            else:
                metaph += 'K'

        elif ename[i] == 'H':
            if ((i > 0 and ename[i-1] in _vowels and
                 ename[i+1:i+2] not in _vowels)):
                continue
            elif i > 0 and ename[i-1] in _varson:
                continue
            else:
                metaph += 'H'

        elif ename[i] in {'F', 'J', 'L', 'M', 'N', 'R'}:
            metaph += ename[i]

        elif ename[i] == 'K':
            if i > 0 and ename[i-1] == 'C':
                continue
            else:
                metaph += 'K'

        elif ename[i] == 'P':
            if ename[i+1:i+2] == 'H':
                metaph += 'F'
            else:
                metaph += 'P'

        elif ename[i] == 'Q':
            metaph += 'K'

        elif ename[i] == 'S':
            if ((i > 0 and i+2 <= elen and ename[i+1] == 'I' and
                 ename[i+2] in 'OA')):
                metaph += 'X'
            elif ename[i+1:i+2] == 'H':
                metaph += 'X'
            else:
                metaph += 'S'

        elif ename[i] == 'T':
            if ((i > 0 and i+2 <= elen and ename[i+1] == 'I' and
                 ename[i+2] in {'A', 'O'})):
                metaph += 'X'
            elif ename[i+1:i+2] == 'H':
                metaph += '0'
            elif ename[i+1:i+3] != 'CH':
                if ename[i-1:i] != 'T':
                    metaph += 'T'

        elif ename[i] == 'V':
            metaph += 'F'

        elif ename[i] in 'WY':
            if ename[i+1:i+2] in _vowels:
                metaph += ename[i]

        elif ename[i] == 'X':
            metaph += 'KS'

        elif ename[i] == 'Z':
            metaph += 'S'

    return metaph
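# Rough trace of the 'Schmidt' doctest above: the initial 'SCH' yields 'SK'
# (S is kept and the C in an 'SCH' cluster is hard), the H is then dropped
# because it follows a varson letter, M is kept, D becomes T, and the final T
# is kept as well, giving 'SKMTT'.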
def double_metaphone(word, maxlength=_INFINITY):
    """Return the Double Metaphone code for a word.

    Based on Lawrence Philips' (Visual) C++ code from 1999:
    http://aspell.net/metaphone/dmetaph.cpp

    :param word: the word to transform
    :param maxlength: the maximum length of the returned Double Metaphone codes
        (defaults to unlimited, but in Philips' original implementation this
        was 4)
    :returns: the Double Metaphone value(s)
    :rtype: tuple

    >>> double_metaphone('Christopher')
    ('KRSTFR', '')
    >>> double_metaphone('Niall')
    ('NL', '')
    >>> double_metaphone('Smith')
    ('SM0', 'XMT')
    >>> double_metaphone('Schmidt')
    ('XMT', 'SMT')
    """
    # pylint: disable=too-many-branches
    # Require a maxlength of at least 4
    if maxlength is not None:
        maxlength = max(4, maxlength)
    else:
        maxlength = 64

    primary = ''
    secondary = ''

    def _slavo_germanic():
        """Return True if the word appears to be Slavic or Germanic."""
        if 'W' in word or 'K' in word or 'CZ' in word:
            return True
        return False

    def _metaph_add(pri, sec=''):
        """Return a new metaphone tuple with the supplied elements."""
        newpri = primary
        newsec = secondary
        if pri:
            newpri += pri
        if sec:
            if sec != ' ':
                newsec += sec
        else:
            newsec += pri
        return (newpri, newsec)

    def _is_vowel(pos):
        """Return True if the character at word[pos] is a vowel."""
        if pos >= 0 and word[pos] in {'A', 'E', 'I', 'O', 'U', 'Y'}:
            return True
        return False

    def _get_at(pos):
        """Return the character at word[pos]."""
        return word[pos]

    def _string_at(pos, slen, substrings):
        """Return True if word[pos:pos+slen] is in substrings."""
        if pos < 0:
            return False
        return word[pos:pos+slen] in substrings

    current = 0
    length = len(word)
    if length < 1:
        return ('', '')
    last = length - 1

    word = word.upper()
    word = word.replace('ß', 'SS')

    # Pad the original string so that we can index beyond the edge of the world
    word += '     '

    # Skip these when at start of word
    if word[0:2] in {'GN', 'KN', 'PN', 'WR', 'PS'}:
        current += 1

    # Initial 'X' is pronounced 'Z' e.g. 'Xavier'
    if _get_at(0) == 'X':
        (primary, secondary) = _metaph_add('S')  # 'Z' maps to 'S'
        current += 1

    # Main loop
    while True:
        if current >= length:
            break

        if _get_at(current) in {'A', 'E', 'I', 'O', 'U', 'Y'}:
            if current == 0:
                # All init vowels now map to 'A'
                (primary, secondary) = _metaph_add('A')
            current += 1
            continue

        elif _get_at(current) == 'B':
            # "-mb", e.g", "dumb", already skipped over...
            (primary, secondary) = _metaph_add('P')
            if _get_at(current + 1) == 'B':
                current += 2
            else:
                current += 1
            continue

        elif _get_at(current) == 'Ç':
            (primary, secondary) = _metaph_add('S')
            current += 1
            continue

        elif _get_at(current) == 'C':
            # Various Germanic
            if (current > 1 and not _is_vowel(current - 2) and
                    _string_at((current - 1), 3, {'ACH'}) and
                    ((_get_at(current + 2) != 'I') and
                     ((_get_at(current + 2) != 'E') or
                      _string_at((current - 2), 6,
                                 {'BACHER', 'MACHER'})))):
                (primary, secondary) = _metaph_add('K')
                current += 2
                continue

            # Special case 'caesar'
            elif current == 0 and _string_at(current, 6, {'CAESAR'}):
                (primary, secondary) = _metaph_add('S')
                current += 2
                continue

            # Italian 'chianti'
            elif _string_at(current, 4, {'CHIA'}):
                (primary, secondary) = _metaph_add('K')
                current += 2
                continue

            elif _string_at(current, 2, {'CH'}):
                # Find 'Michael'
                if current > 0 and _string_at(current, 4, {'CHAE'}):
                    (primary, secondary) = _metaph_add('K', 'X')
                    current += 2
                    continue

                # Greek roots e.g. 'chemistry', 'chorus'
                elif (current == 0 and
                      (_string_at((current + 1), 5,
                                  {'HARAC', 'HARIS'}) or
                       _string_at((current + 1), 3,
                                  {'HOR', 'HYM', 'HIA', 'HEM'})) and
                      not _string_at(0, 5, {'CHORE'})):
                    (primary, secondary) = _metaph_add('K')
                    current += 2
                    continue

                # Germanic, Greek, or otherwise 'ch' for 'kh' sound
                elif ((_string_at(0, 4, {'VAN ', 'VON '}) or
                       _string_at(0, 3, {'SCH'})) or
                      # 'architect but not 'arch', 'orchestra', 'orchid'
                      _string_at((current - 2), 6,
                                 {'ORCHES', 'ARCHIT', 'ORCHID'}) or
                      _string_at((current + 2), 1, {'T', 'S'}) or
                      ((_string_at((current - 1), 1,
                                   {'A', 'O', 'U', 'E'}) or
                        (current == 0)) and
                       # e.g., 'wachtler', 'wechsler', but not 'tichner'
                       _string_at((current + 2), 1,
                                  {'L', 'R', 'N', 'M', 'B', 'H', 'F', 'V', 'W',
                                   ' '}))):
                    (primary, secondary) = _metaph_add('K')

                else:
                    if current > 0:
                        if _string_at(0, 2, {'MC'}):
                            # e.g., "McHugh"
                            (primary, secondary) = _metaph_add('K')
                        else:
                            (primary, secondary) = _metaph_add('X', 'K')
                    else:
                        (primary, secondary) = _metaph_add('X')

                current += 2
                continue

            # e.g, 'czerny'
            elif (_string_at(current, 2, {'CZ'}) and
                  not _string_at((current - 2), 4, {'WICZ'})):
                (primary, secondary) = _metaph_add('S', 'X')
                current += 2
                continue

            # e.g., 'focaccia'
            elif _string_at((current + 1), 3, {'CIA'}):
                (primary, secondary) = _metaph_add('X')
                current += 3

            # double 'C', but not if e.g. 'McClellan'
            elif (_string_at(current, 2, {'CC'}) and
                  not ((current == 1) and (_get_at(0) == 'M'))):
                # 'bellocchio' but not 'bacchus'
                if ((_string_at((current + 2), 1,
                                {'I', 'E', 'H'}) and
                     not _string_at((current + 2), 2, ['HU']))):
                    # 'accident', 'accede' 'succeed'
                    if ((((current == 1) and _get_at(current - 1) == 'A') or
                         _string_at((current - 1), 5,
                                    {'UCCEE', 'UCCES'}))):
                        (primary, secondary) = _metaph_add('KS')
                    # 'bacci', 'bertucci', other italian
                    else:
                        (primary, secondary) = _metaph_add('X')
                    current += 3
                    continue
                else:  # Pierce's rule
                    (primary, secondary) = _metaph_add('K')
                    current += 2
                    continue

            elif _string_at(current, 2, {'CK', 'CG', 'CQ'}):
                (primary, secondary) = _metaph_add('K')
                current += 2
                continue

            elif _string_at(current, 2, {'CI', 'CE', 'CY'}):
                # Italian vs. English
                if _string_at(current, 3, {'CIO', 'CIE', 'CIA'}):
                    (primary, secondary) = _metaph_add('S', 'X')
                else:
                    (primary, secondary) = _metaph_add('S')
                current += 2
                continue

            # else
            else:
                (primary, secondary) = _metaph_add('K')

                # name sent in 'mac caffrey', 'mac gregor
                if _string_at((current + 1), 2, {' C', ' Q', ' G'}):
                    current += 3
                elif (_string_at((current + 1), 1,
                                 {'C', 'K', 'Q'}) and
                      not _string_at((current + 1), 2, {'CE', 'CI'})):
                    current += 2
                else:
                    current += 1
                continue

        elif _get_at(current) == 'D':
            if _string_at(current, 2, {'DG'}):
                if _string_at((current + 2), 1, {'I', 'E', 'Y'}):
                    # e.g. 'edge'
                    (primary, secondary) = _metaph_add('J')
                    current += 3
                    continue
                else:
                    # e.g. 'edgar'
                    (primary, secondary) = _metaph_add('TK')
                    current += 2
                    continue

            elif _string_at(current, 2, {'DT', 'DD'}):
                (primary, secondary) = _metaph_add('T')
                current += 2
                continue

            # else
            else:
                (primary, secondary) = _metaph_add('T')
                current += 1
                continue

        elif _get_at(current) == 'F':
            if _get_at(current + 1) == 'F':
                current += 2
            else:
                current += 1
            (primary, secondary) = _metaph_add('F')
            continue

        elif _get_at(current) == 'G':
            if _get_at(current + 1) == 'H':
                if (current > 0) and not _is_vowel(current - 1):
                    (primary, secondary) = _metaph_add('K')
                    current += 2
                    continue

                # 'ghislane', ghiradelli
                elif current == 0:
                    if _get_at(current + 2) == 'I':
                        (primary, secondary) = _metaph_add('J')
                    else:
                        (primary, secondary) = _metaph_add('K')
                    current += 2
                    continue

                # Parker's rule (with some further refinements) - e.g., 'hugh'
                elif (((current > 1) and
                       _string_at((current - 2), 1, {'B', 'H', 'D'})) or
                      # e.g., 'bough'
                      ((current > 2) and
                       _string_at((current - 3), 1, {'B', 'H', 'D'})) or
                      # e.g., 'broughton'
                      ((current > 3) and
                       _string_at((current - 4), 1, {'B', 'H'}))):
                    current += 2
                    continue
                else:
                    # e.g. 'laugh', 'McLaughlin', 'cough',
                    # 'gough', 'rough', 'tough'
                    if ((current > 2) and
                            (_get_at(current - 1) == 'U') and
                            (_string_at((current - 3), 1,
                                        {'C', 'G', 'L', 'R', 'T'}))):
                        (primary, secondary) = _metaph_add('F')
                    elif (current > 0) and _get_at(current - 1) != 'I':
                        (primary, secondary) = _metaph_add('K')
                    current += 2
                    continue

            elif _get_at(current + 1) == 'N':
                if (current == 1) and _is_vowel(0) and not _slavo_germanic():
                    (primary, secondary) = _metaph_add('KN', 'N')
                # not e.g. 'cagney'
                elif (not _string_at((current + 2), 2, {'EY'}) and
                      (_get_at(current + 1) != 'Y') and
                      not _slavo_germanic()):
                    (primary, secondary) = _metaph_add('N', 'KN')
                else:
                    (primary, secondary) = _metaph_add('KN')
                current += 2
                continue

            # 'tagliaro'
            elif (_string_at((current + 1), 2, {'LI'}) and
                  not _slavo_germanic()):
                (primary, secondary) = _metaph_add('KL', 'L')
                current += 2
                continue

            # -ges-, -gep-, -gel-, -gie- at beginning
            elif ((current == 0) and
                  ((_get_at(current + 1) == 'Y') or
                   _string_at((current + 1), 2, {'ES', 'EP', 'EB', 'EL', 'EY',
                                                 'IB', 'IL', 'IN', 'IE', 'EI',
                                                 'ER'}))):
                (primary, secondary) = _metaph_add('K', 'J')
                current += 2
                continue

            # -ger-, -gy-
            elif ((_string_at((current + 1), 2, {'ER'}) or
                   (_get_at(current + 1) == 'Y')) and not
                  _string_at(0, 6, {'DANGER', 'RANGER', 'MANGER'}) and not
                  _string_at((current - 1), 1, {'E', 'I'}) and not
                  _string_at((current - 1), 3, {'RGY', 'OGY'})):
                (primary, secondary) = _metaph_add('K', 'J')
                current += 2
                continue

            # italian e.g, 'biaggi'
            elif (_string_at((current + 1), 1, {'E', 'I', 'Y'}) or
                  _string_at((current - 1), 4, {'AGGI', 'OGGI'})):
                # obvious germanic
                if (((_string_at(0, 4, {'VAN ', 'VON '}) or
                      _string_at(0, 3, {'SCH'})) or
                     _string_at((current + 1), 2, {'ET'}))):
                    (primary, secondary) = _metaph_add('K')
                elif _string_at((current + 1), 4, {'IER '}):
                    (primary, secondary) = _metaph_add('J')
                else:
                    (primary, secondary) = _metaph_add('J', 'K')
                current += 2
                continue

            else:
                if _get_at(current + 1) == 'G':
                    current += 2
                else:
                    current += 1
                (primary, secondary) = _metaph_add('K')
                continue

        elif _get_at(current) == 'H':
            # only keep if first & before vowel or btw. 2 vowels
            if ((((current == 0) or _is_vowel(current - 1)) and
                 _is_vowel(current + 1))):
                (primary, secondary) = _metaph_add('H')
                current += 2
            else:  # also takes care of 'HH'
                current += 1
            continue

        elif _get_at(current) == 'J':
            # obvious spanish, 'jose', 'san jacinto'
            if _string_at(current, 4, ['JOSE']) or _string_at(0, 4, {'SAN '}):
                if ((((current == 0) and (_get_at(current + 4) == ' ')) or
                     _string_at(0, 4, ['SAN ']))):
                    (primary, secondary) = _metaph_add('H')
                else:
                    (primary, secondary) = _metaph_add('J', 'H')
                current += 1
                continue

            elif (current == 0) and not _string_at(current, 4, {'JOSE'}):
                # Yankelovich/Jankelowicz
                (primary, secondary) = _metaph_add('J', 'A')
            # Spanish pron. of e.g. 'bajador'
            elif (_is_vowel(current - 1) and
                  not _slavo_germanic() and
                  ((_get_at(current + 1) == 'A') or
                   (_get_at(current + 1) == 'O'))):
                (primary, secondary) = _metaph_add('J', 'H')
            elif current == last:
                (primary, secondary) = _metaph_add('J', ' ')
            elif (not _string_at((current + 1), 1,
                                 {'L', 'T', 'K', 'S', 'N', 'M', 'B', 'Z'}) and
                  not _string_at((current - 1), 1, {'S', 'K', 'L'})):
                (primary, secondary) = _metaph_add('J')

            if _get_at(current + 1) == 'J':  # it could happen!
                current += 2
            else:
                current += 1
            continue

        elif _get_at(current) == 'K':
            if _get_at(current + 1) == 'K':
                current += 2
            else:
                current += 1
            (primary, secondary) = _metaph_add('K')
            continue

        elif _get_at(current) == 'L':
            if _get_at(current + 1) == 'L':
                # Spanish e.g. 'cabrillo', 'gallegos'
                if (((current == (length - 3)) and
                     _string_at((current - 1), 4, {'ILLO', 'ILLA', 'ALLE'})) or
                        ((_string_at((last - 1), 2, {'AS', 'OS'}) or
                          _string_at(last, 1, {'A', 'O'})) and
                         _string_at((current - 1), 4, {'ALLE'}))):
                    (primary, secondary) = _metaph_add('L', ' ')
                    current += 2
                    continue
                current += 2
            else:
                current += 1
            (primary, secondary) = _metaph_add('L')
            continue

        elif _get_at(current) == 'M':
            if (((_string_at((current - 1), 3, {'UMB'}) and
                  (((current + 1) == last) or
                   _string_at((current + 2), 2, {'ER'}))) or
                 # 'dumb', 'thumb'
                 (_get_at(current + 1) == 'M'))):
                current += 2
            else:
                current += 1
            (primary, secondary) = _metaph_add('M')
            continue

        elif _get_at(current) == 'N':
            if _get_at(current + 1) == 'N':
                current += 2
            else:
                current += 1
            (primary, secondary) = _metaph_add('N')
            continue

        elif _get_at(current) == 'Ñ':
            current += 1
            (primary, secondary) = _metaph_add('N')
            continue

        elif _get_at(current) == 'P':
            if _get_at(current + 1) == 'H':
                (primary, secondary) = _metaph_add('F')
                current += 2
                continue

            # also account for "campbell", "raspberry"
            elif _string_at((current + 1), 1, {'P', 'B'}):
                current += 2
            else:
                current += 1
            (primary, secondary) = _metaph_add('P')
            continue

        elif _get_at(current) == 'Q':
            if _get_at(current + 1) == 'Q':
                current += 2
            else:
                current += 1
            (primary, secondary) = _metaph_add('K')
            continue

        elif _get_at(current) == 'R':
            # french e.g. 'rogier', but exclude 'hochmeier'
            if (((current == last) and
                 not _slavo_germanic() and
                 _string_at((current - 2), 2, {'IE'}) and
                 not _string_at((current - 4), 2, {'ME', 'MA'}))):
                (primary, secondary) = _metaph_add('', 'R')
            else:
                (primary, secondary) = _metaph_add('R')

            if _get_at(current + 1) == 'R':
                current += 2
            else:
                current += 1
            continue

        elif _get_at(current) == 'S':
            # special cases 'island', 'isle', 'carlisle', 'carlysle'
            if _string_at((current - 1), 3, {'ISL', 'YSL'}):
                current += 1
                continue

            # special case 'sugar-'
            elif (current == 0) and _string_at(current, 5, {'SUGAR'}):
                (primary, secondary) = _metaph_add('X', 'S')
                current += 1
                continue

            elif _string_at(current, 2, {'SH'}):
                # Germanic
                if _string_at((current + 1), 4,
                              {'HEIM', 'HOEK', 'HOLM', 'HOLZ'}):
                    (primary, secondary) = _metaph_add('S')
                else:
                    (primary, secondary) = _metaph_add('X')
                current += 2
                continue

            # Italian & Armenian
            elif (_string_at(current, 3, {'SIO', 'SIA'}) or
                  _string_at(current, 4, {'SIAN'})):
                if not _slavo_germanic():
                    (primary, secondary) = _metaph_add('S', 'X')
                else:
                    (primary, secondary) = _metaph_add('S')
                current += 3
                continue

            # German & anglicisations, e.g. 'smith' match 'schmidt',
            # 'snider' match 'schneider'
            # also, -sz- in Slavic language although in Hungarian it is
            # pronounced 's'
            elif (((current == 0) and
                   _string_at((current + 1), 1, {'M', 'N', 'L', 'W'})) or
                  _string_at((current + 1), 1, {'Z'})):
                (primary, secondary) = _metaph_add('S', 'X')
                if _string_at((current + 1), 1, {'Z'}):
                    current += 2
                else:
                    current += 1
                continue

            elif _string_at(current, 2, {'SC'}):
|
# Schlesinger's rule |
1600
|
|
|
if _get_at(current + 2) == 'H': |
1601
|
|
|
# dutch origin, e.g. 'school', 'schooner' |
1602
|
|
|
if _string_at((current + 3), 2, |
1603
|
|
|
{'OO', 'ER', 'EN', 'UY', 'ED', 'EM'}): |
1604
|
|
|
# 'schermerhorn', 'schenker' |
1605
|
|
|
if _string_at((current + 3), 2, {'ER', 'EN'}): |
1606
|
|
|
(primary, secondary) = _metaph_add('X', 'SK') |
1607
|
|
|
else: |
1608
|
|
|
(primary, secondary) = _metaph_add('SK') |
1609
|
|
|
current += 3 |
1610
|
|
|
continue |
1611
|
|
|
else: |
1612
|
|
|
if (((current == 0) and not _is_vowel(3) and |
1613
|
|
|
(_get_at(3) != 'W'))): |
1614
|
|
|
(primary, secondary) = _metaph_add('X', 'S') |
1615
|
|
|
else: |
1616
|
|
|
(primary, secondary) = _metaph_add('X') |
1617
|
|
|
current += 3 |
1618
|
|
|
continue |
1619
|
|
|
|
1620
|
|
|
elif _string_at((current + 2), 1, {'I', 'E', 'Y'}): |
1621
|
|
|
(primary, secondary) = _metaph_add('S') |
1622
|
|
|
current += 3 |
1623
|
|
|
continue |
1624
|
|
|
|
1625
|
|
|
# else |
1626
|
|
|
else: |
1627
|
|
|
(primary, secondary) = _metaph_add('SK') |
1628
|
|
|
current += 3 |
1629
|
|
|
continue |
1630
|
|
|
|
1631
|
|
|
else: |
1632
|
|
|
# french e.g. 'resnais', 'artois' |
1633
|
|
|
if (current == last) and _string_at((current - 2), 2, |
1634
|
|
|
{'AI', 'OI'}): |
1635
|
|
|
(primary, secondary) = _metaph_add('', 'S') |
1636
|
|
|
else: |
1637
|
|
|
(primary, secondary) = _metaph_add('S') |
1638
|
|
|
|
1639
|
|
|
if _string_at((current + 1), 1, {'S', 'Z'}): |
1640
|
|
|
current += 2 |
1641
|
|
|
else: |
1642
|
|
|
current += 1 |
1643
|
|
|
continue |
1644
|
|
|
|
1645
|
|
|
elif _get_at(current) == 'T': |
1646
|
|
|
if _string_at(current, 4, {'TION'}): |
1647
|
|
|
(primary, secondary) = _metaph_add('X') |
1648
|
|
|
current += 3 |
1649
|
|
|
continue |
1650
|
|
|
|
1651
|
|
|
elif _string_at(current, 3, {'TIA', 'TCH'}): |
1652
|
|
|
(primary, secondary) = _metaph_add('X') |
1653
|
|
|
current += 3 |
1654
|
|
|
continue |
1655
|
|
|
|
1656
|
|
|
elif (_string_at(current, 2, {'TH'}) or |
1657
|
|
|
_string_at(current, 3, {'TTH'})): |
1658
|
|
|
# special case 'thomas', 'thames' or germanic |
1659
|
|
|
if ((_string_at((current + 2), 2, {'OM', 'AM'}) or |
1660
|
|
|
_string_at(0, 4, {'VAN ', 'VON '}) or |
1661
|
|
|
_string_at(0, 3, {'SCH'}))): |
1662
|
|
|
(primary, secondary) = _metaph_add('T') |
1663
|
|
|
else: |
1664
|
|
|
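                    # '0' is Double Metaphone's symbol for the 'th' sound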
(primary, secondary) = _metaph_add('0', 'T') |
1665
|
|
|
current += 2 |
1666
|
|
|
continue |
1667
|
|
|
|
1668
|
|
|
elif _string_at((current + 1), 1, {'T', 'D'}): |
1669
|
|
|
current += 2 |
1670
|
|
|
else: |
1671
|
|
|
current += 1 |
1672
|
|
|
(primary, secondary) = _metaph_add('T') |
1673
|
|
|
continue |
1674
|
|
|
|
1675
|
|
|
elif _get_at(current) == 'V': |
1676
|
|
|
if _get_at(current + 1) == 'V': |
1677
|
|
|
current += 2 |
1678
|
|
|
else: |
1679
|
|
|
current += 1 |
1680
|
|
|
(primary, secondary) = _metaph_add('F') |
1681
|
|
|
continue |
1682
|
|
|
|
1683
|
|
|
elif _get_at(current) == 'W': |
1684
|
|
|
# can also be in middle of word |
1685
|
|
|
if _string_at(current, 2, {'WR'}): |
1686
|
|
|
(primary, secondary) = _metaph_add('R') |
1687
|
|
|
current += 2 |
1688
|
|
|
continue |
1689
|
|
|
elif ((current == 0) and |
1690
|
|
|
(_is_vowel(current + 1) or _string_at(current, 2, {'WH'}))): |
1691
|
|
|
# Wasserman should match Vasserman |
1692
|
|
|
if _is_vowel(current + 1): |
1693
|
|
|
(primary, secondary) = _metaph_add('A', 'F') |
1694
|
|
|
else: |
1695
|
|
|
# need Uomo to match Womo |
1696
|
|
|
(primary, secondary) = _metaph_add('A') |
1697
|
|
|
|
1698
|
|
|
# Arnow should match Arnoff |
1699
|
|
|
if ((((current == last) and _is_vowel(current - 1)) or |
1700
|
|
|
_string_at((current - 1), 5, |
1701
|
|
|
{'EWSKI', 'EWSKY', 'OWSKI', 'OWSKY'}) or |
1702
|
|
|
                    _string_at(0, 3, {'SCH'}))):
1703
|
|
|
(primary, secondary) = _metaph_add('', 'F') |
1704
|
|
|
current += 1 |
1705
|
|
|
continue |
1706
|
|
|
# Polish e.g. 'filipowicz' |
1707
|
|
|
elif _string_at(current, 4, {'WICZ', 'WITZ'}): |
1708
|
|
|
(primary, secondary) = _metaph_add('TS', 'FX') |
1709
|
|
|
current += 4 |
1710
|
|
|
continue |
1711
|
|
|
# else skip it |
1712
|
|
|
else: |
1713
|
|
|
current += 1 |
1714
|
|
|
continue |
1715
|
|
|
|
1716
|
|
|
elif _get_at(current) == 'X': |
1717
|
|
|
# French e.g. breaux |
1718
|
|
|
if (not ((current == last) and |
1719
|
|
|
(_string_at((current - 3), 3, {'IAU', 'EAU'}) or |
1720
|
|
|
_string_at((current - 2), 2, {'AU', 'OU'})))): |
1721
|
|
|
(primary, secondary) = _metaph_add('KS') |
1722
|
|
|
|
1723
|
|
|
if _string_at((current + 1), 1, {'C', 'X'}): |
1724
|
|
|
current += 2 |
1725
|
|
|
else: |
1726
|
|
|
current += 1 |
1727
|
|
|
continue |
1728
|
|
|
|
1729
|
|
|
elif _get_at(current) == 'Z': |
1730
|
|
|
# Chinese Pinyin e.g. 'zhao' |
1731
|
|
|
if _get_at(current + 1) == 'H': |
1732
|
|
|
(primary, secondary) = _metaph_add('J') |
1733
|
|
|
current += 2 |
1734
|
|
|
continue |
1735
|
|
|
elif (_string_at((current + 1), 2, {'ZO', 'ZI', 'ZA'}) or |
1736
|
|
|
(_slavo_germanic() and ((current > 0) and |
1737
|
|
|
_get_at(current - 1) != 'T'))): |
1738
|
|
|
(primary, secondary) = _metaph_add('S', 'TS') |
1739
|
|
|
else: |
1740
|
|
|
(primary, secondary) = _metaph_add('S') |
1741
|
|
|
|
1742
|
|
|
if _get_at(current + 1) == 'Z': |
1743
|
|
|
current += 2 |
1744
|
|
|
else: |
1745
|
|
|
current += 1 |
1746
|
|
|
continue |
1747
|
|
|
|
1748
|
|
|
else: |
1749
|
|
|
current += 1 |
1750
|
|
|
|
1751
|
|
|
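    # Truncate both codes when a finite maxlength is given; if the secondary
    # code ends up identical to the primary, it is returned as an empty
    # string so callers can tell there is no distinct alternative.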
if maxlength and maxlength < _INFINITY: |
1752
|
|
|
primary = primary[:maxlength] |
1753
|
|
|
secondary = secondary[:maxlength] |
1754
|
|
|
if primary == secondary: |
1755
|
|
|
secondary = '' |
1756
|
|
|
|
1757
|
|
|
return (primary, secondary) |
1758
|
|
|
|
1759
|
|
|
|
1760
|
|
|
def caverphone(word, version=2): |
1761
|
|
|
"""Return the Caverphone code for a word. |
1762
|
|
|
|
1763
|
|
|
A description of version 1 of the algorithm can be found at: |
1764
|
|
|
http://caversham.otago.ac.nz/files/working/ctp060902.pdf |
1765
|
|
|
|
1766
|
|
|
A description of version 2 of the algorithm can be found at: |
1767
|
|
|
http://caversham.otago.ac.nz/files/working/ctp150804.pdf |
1768
|
|
|
|
1769
|
|
|
:param str word: the word to transform |
1770
|
|
|
:param int version: the version of Caverphone to employ for encoding |
1771
|
|
|
(defaults to 2) |
1772
|
|
|
:returns: the Caverphone value |
1773
|
|
|
:rtype: str |
1774
|
|
|
|
1775
|
|
|
>>> caverphone('Christopher') |
1776
|
|
|
'KRSTFA1111' |
1777
|
|
|
>>> caverphone('Niall') |
1778
|
|
|
'NA11111111' |
1779
|
|
|
>>> caverphone('Smith') |
1780
|
|
|
'SMT1111111' |
1781
|
|
|
>>> caverphone('Schmidt') |
1782
|
|
|
'SKMT111111' |
1783
|
|
|
|
1784
|
|
|
>>> caverphone('Christopher', 1) |
1785
|
|
|
'KRSTF1' |
1786
|
|
|
>>> caverphone('Niall', 1) |
1787
|
|
|
'N11111' |
1788
|
|
|
>>> caverphone('Smith', 1) |
1789
|
|
|
'SMT111' |
1790
|
|
|
>>> caverphone('Schmidt', 1) |
1791
|
|
|
'SKMT11' |
1792
|
|
|
""" |
1793
|
|
|
_vowels = {'a', 'e', 'i', 'o', 'u'} |
1794
|
|
|
|
1795
|
|
|
word = word.lower() |
1796
|
|
|
word = ''.join(c for c in word if c in |
1797
|
|
|
{'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', |
1798
|
|
|
'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', |
1799
|
|
|
'y', 'z'}) |
1800
|
|
|
    # the main replacement algorithm
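    # In the intermediate form below, '2' marks a letter that will be
    # deleted, '3' marks a retained vowel position, and trailing '1's pad
    # the final code out to its fixed length.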
1801
|
|
|
if version != 1 and word[-1:] == 'e': |
1802
|
|
|
word = word[:-1] |
1803
|
|
|
if word: |
1804
|
|
|
if word[:5] == 'cough': |
1805
|
|
|
word = 'cou2f'+word[5:] |
1806
|
|
|
if word[:5] == 'rough': |
1807
|
|
|
word = 'rou2f'+word[5:] |
1808
|
|
|
if word[:5] == 'tough': |
1809
|
|
|
word = 'tou2f'+word[5:] |
1810
|
|
|
if word[:6] == 'enough': |
1811
|
|
|
word = 'enou2f'+word[6:] |
1812
|
|
|
if version != 1 and word[:6] == 'trough': |
1813
|
|
|
word = 'trou2f'+word[6:] |
1814
|
|
|
if word[:2] == 'gn': |
1815
|
|
|
word = '2n'+word[2:] |
1816
|
|
|
if word[-2:] == 'mb': |
1817
|
|
|
word = word[:-1]+'2' |
1818
|
|
|
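        # The order of these replacements matters: digraph and trigraph
        # rules (e.g. 'tch', 'dg') run before the single-letter rules that
        # would otherwise consume them.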
word = word.replace('cq', '2q') |
1819
|
|
|
word = word.replace('ci', 'si') |
1820
|
|
|
word = word.replace('ce', 'se') |
1821
|
|
|
word = word.replace('cy', 'sy') |
1822
|
|
|
word = word.replace('tch', '2ch') |
1823
|
|
|
word = word.replace('c', 'k') |
1824
|
|
|
word = word.replace('q', 'k') |
1825
|
|
|
word = word.replace('x', 'k') |
1826
|
|
|
word = word.replace('v', 'f') |
1827
|
|
|
word = word.replace('dg', '2g') |
1828
|
|
|
word = word.replace('tio', 'sio') |
1829
|
|
|
word = word.replace('tia', 'sia') |
1830
|
|
|
word = word.replace('d', 't') |
1831
|
|
|
word = word.replace('ph', 'fh') |
1832
|
|
|
word = word.replace('b', 'p') |
1833
|
|
|
word = word.replace('sh', 's2') |
1834
|
|
|
word = word.replace('z', 's') |
1835
|
|
|
if word[0] in _vowels: |
1836
|
|
|
word = 'A'+word[1:] |
1837
|
|
|
word = word.replace('a', '3') |
1838
|
|
|
word = word.replace('e', '3') |
1839
|
|
|
word = word.replace('i', '3') |
1840
|
|
|
word = word.replace('o', '3') |
1841
|
|
|
word = word.replace('u', '3') |
1842
|
|
|
if version != 1: |
1843
|
|
|
word = word.replace('j', 'y') |
1844
|
|
|
if word[:2] == 'y3': |
1845
|
|
|
word = 'Y3'+word[2:] |
1846
|
|
|
if word[:1] == 'y': |
1847
|
|
|
word = 'A'+word[1:] |
1848
|
|
|
word = word.replace('y', '3') |
1849
|
|
|
word = word.replace('3gh3', '3kh3') |
1850
|
|
|
word = word.replace('gh', '22') |
1851
|
|
|
word = word.replace('g', 'k') |
1852
|
|
|
word = re.sub(r's+', r'S', word) # TODO: implement w/o re? |
1853
|
|
|
word = re.sub(r't+', r'T', word) |
1854
|
|
|
word = re.sub(r'p+', r'P', word) |
1855
|
|
|
word = re.sub(r'k+', r'K', word) |
1856
|
|
|
word = re.sub(r'f+', r'F', word) |
1857
|
|
|
word = re.sub(r'm+', r'M', word) |
1858
|
|
|
word = re.sub(r'n+', r'N', word) |
1859
|
|
|
word = word.replace('w3', 'W3') |
1860
|
|
|
if version == 1: |
1861
|
|
|
word = word.replace('wy', 'Wy') |
1862
|
|
|
word = word.replace('wh3', 'Wh3') |
1863
|
|
|
if version == 1: |
1864
|
|
|
word = word.replace('why', 'Why') |
1865
|
|
|
if version != 1 and word[-1:] == 'w': |
1866
|
|
|
word = word[:-1]+'3' |
1867
|
|
|
word = word.replace('w', '2') |
1868
|
|
|
if word[:1] == 'h': |
1869
|
|
|
word = 'A'+word[1:] |
1870
|
|
|
word = word.replace('h', '2') |
1871
|
|
|
word = word.replace('r3', 'R3') |
1872
|
|
|
if version == 1: |
1873
|
|
|
word = word.replace('ry', 'Ry') |
1874
|
|
|
if version != 1 and word[-1:] == 'r': |
1875
|
|
|
word = word[:-1]+'3' |
1876
|
|
|
word = word.replace('r', '2') |
1877
|
|
|
word = word.replace('l3', 'L3') |
1878
|
|
|
if version == 1: |
1879
|
|
|
word = word.replace('ly', 'Ly') |
1880
|
|
|
if version != 1 and word[-1:] == 'l': |
1881
|
|
|
word = word[:-1]+'3' |
1882
|
|
|
word = word.replace('l', '2') |
1883
|
|
|
if version == 1: |
1884
|
|
|
word = word.replace('j', 'y') |
1885
|
|
|
word = word.replace('y3', 'Y3') |
1886
|
|
|
word = word.replace('y', '2') |
1887
|
|
|
word = word.replace('2', '') |
1888
|
|
|
if version != 1 and word[-1:] == '3': |
1889
|
|
|
word = word[:-1]+'A' |
1890
|
|
|
word = word.replace('3', '') |
1891
|
|
|
|
1892
|
|
|
# pad with 1s, then extract the necessary length of code |
1893
|
|
|
word = word+'1'*10 |
1894
|
|
|
if version != 1: |
1895
|
|
|
word = word[:10] |
1896
|
|
|
else: |
1897
|
|
|
word = word[:6] |
1898
|
|
|
|
1899
|
|
|
return word |
1900
|
|
|
|
1901
|
|
|
|
1902
|
|
|
def alpha_sis(word, maxlength=14): |
1903
|
|
|
"""Return the IBM Alpha Search Inquiry System code for a word. |
1904
|
|
|
|
1905
|
|
|
Based on the algorithm described in "Accessing individual records from |
1906
|
|
|
personal data files using non-unique identifiers" / Gwendolyn B. Moore, |
1907
|
|
|
et al.; prepared for the Institute for Computer Sciences and Technology, |
1908
|
|
|
    National Bureau of Standards, Washington, D.C. (1977):
1909
|
|
|
https://archive.org/stream/accessingindivid00moor#page/15/mode/1up |
1910
|
|
|
|
1911
|
|
|
A collection is necessary since there can be multiple values for a |
1912
|
|
|
single word. But the collection must be ordered since the first value |
1913
|
|
|
is the primary coding. |
1914
|
|
|
|
1915
|
|
|
:param str word: the word to transform |
1916
|
|
|
:param int maxlength: the length of the code returned (defaults to 14) |
1917
|
|
|
:returns: the Alpha SIS value |
1918
|
|
|
:rtype: tuple |
1919
|
|
|
|
1920
|
|
|
>>> alpha_sis('Christopher') |
1921
|
|
|
('06401840000000', '07040184000000', '04018400000000') |
1922
|
|
|
>>> alpha_sis('Niall') |
1923
|
|
|
('02500000000000',) |
1924
|
|
|
>>> alpha_sis('Smith') |
1925
|
|
|
('03100000000000',) |
1926
|
|
|
>>> alpha_sis('Schmidt') |
1927
|
|
|
('06310000000000',) |
1928
|
|
|
""" |
1929
|
|
|
_alpha_sis_initials = {'GF': '08', 'GM': '03', 'GN': '02', 'KN': '02', |
1930
|
|
|
'PF': '08', 'PN': '02', 'PS': '00', 'WR': '04', |
1931
|
|
|
'A': '1', 'E': '1', 'H': '2', 'I': '1', 'J': '3', |
1932
|
|
|
'O': '1', 'U': '1', 'W': '4', 'Y': '5'} |
1933
|
|
|
_alpha_sis_initials_order = ('GF', 'GM', 'GN', 'KN', 'PF', 'PN', 'PS', |
1934
|
|
|
'WR', 'A', 'E', 'H', 'I', 'J', 'O', 'U', 'W', |
1935
|
|
|
'Y') |
1936
|
|
|
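    # Tuple values list alternative encodings; each alternative forks the
    # working list of codes in the main loop below, which is why alpha_sis
    # returns a tuple of codes rather than a single string.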
_alpha_sis_basic = {'SCH': '6', 'CZ': ('70', '6', '0'), |
1937
|
|
|
'CH': ('6', '70', '0'), 'CK': ('7', '6'), |
1938
|
|
|
'DS': ('0', '10'), 'DZ': ('0', '10'), |
1939
|
|
|
'TS': ('0', '10'), 'TZ': ('0', '10'), 'CI': '0', |
1940
|
|
|
'CY': '0', 'CE': '0', 'SH': '6', 'DG': '7', 'PH': '8', |
1941
|
|
|
'C': ('7', '6'), 'K': ('7', '6'), 'Z': '0', 'S': '0', |
1942
|
|
|
'D': '1', 'T': '1', 'N': '2', 'M': '3', 'R': '4', |
1943
|
|
|
'L': '5', 'J': '6', 'G': '7', 'Q': '7', 'X': '7', |
1944
|
|
|
'F': '8', 'V': '8', 'B': '9', 'P': '9'} |
1945
|
|
|
_alpha_sis_basic_order = ('SCH', 'CZ', 'CH', 'CK', 'DS', 'DZ', 'TS', 'TZ', |
1946
|
|
|
'CI', 'CY', 'CE', 'SH', 'DG', 'PH', 'C', 'K', |
1947
|
|
|
'Z', 'S', 'D', 'T', 'N', 'M', 'R', 'L', 'J', 'C', |
1948
|
|
|
'G', 'K', 'Q', 'X', 'F', 'V', 'B', 'P') |
1949
|
|
|
|
1950
|
|
|
alpha = [''] |
1951
|
|
|
pos = 0 |
1952
|
|
|
word = unicodedata.normalize('NFKD', text_type(word.upper())) |
1953
|
|
|
word = word.replace('ß', 'SS') |
1954
|
|
|
word = ''.join(c for c in word if c in |
1955
|
|
|
{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
1956
|
|
|
'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
1957
|
|
|
'Y', 'Z'}) |
1958
|
|
|
|
1959
|
|
|
# Clamp maxlength to [4, 64] |
1960
|
|
|
if maxlength is not None: |
1961
|
|
|
maxlength = min(max(4, maxlength), 64) |
1962
|
|
|
else: |
1963
|
|
|
maxlength = 64 |
1964
|
|
|
|
1965
|
|
|
# Do special processing for initial substrings |
1966
|
|
|
for k in _alpha_sis_initials_order: |
1967
|
|
|
if word.startswith(k): |
1968
|
|
|
alpha[0] += _alpha_sis_initials[k] |
1969
|
|
|
pos += len(k) |
1970
|
|
|
break |
1971
|
|
|
|
1972
|
|
|
# Add a '0' if alpha is still empty |
1973
|
|
|
if not alpha[0]: |
1974
|
|
|
alpha[0] += '0' |
1975
|
|
|
|
1976
|
|
|
# Whether or not any special initial codes were encoded, iterate |
1977
|
|
|
# through the length of the word in the main encoding loop |
1978
|
|
|
while pos < len(word): |
1979
|
|
|
origpos = pos |
1980
|
|
|
for k in _alpha_sis_basic_order: |
1981
|
|
|
if word[pos:].startswith(k): |
1982
|
|
|
if isinstance(_alpha_sis_basic[k], tuple): |
1983
|
|
|
newalpha = [] |
1984
|
|
|
for i in range(len(_alpha_sis_basic[k])): |
1985
|
|
|
newalpha += [_ + _alpha_sis_basic[k][i] for _ in alpha] |
1986
|
|
|
alpha = newalpha |
1987
|
|
|
else: |
1988
|
|
|
alpha = [_ + _alpha_sis_basic[k] for _ in alpha] |
1989
|
|
|
pos += len(k) |
1990
|
|
|
break |
1991
|
|
|
if pos == origpos: |
1992
|
|
|
alpha = [_ + '_' for _ in alpha] |
1993
|
|
|
pos += 1 |
1994
|
|
|
|
1995
|
|
|
# Trim doublets and placeholders |
1996
|
|
|
for i in range(len(alpha)): |
1997
|
|
|
pos = 1 |
1998
|
|
|
while pos < len(alpha[i]): |
1999
|
|
|
if alpha[i][pos] == alpha[i][pos-1]: |
2000
|
|
|
alpha[i] = alpha[i][:pos]+alpha[i][pos+1:] |
2001
|
|
|
pos += 1 |
2002
|
|
|
alpha = (_.replace('_', '') for _ in alpha) |
2003
|
|
|
|
2004
|
|
|
# Trim codes and return tuple |
2005
|
|
|
alpha = ((_ + ('0'*maxlength))[:maxlength] for _ in alpha) |
2006
|
|
|
return tuple(alpha) |
2007
|
|
|
|
2008
|
|
|
|
2009
|
|
|
def fuzzy_soundex(word, maxlength=5, zero_pad=True): |
2010
|
|
|
"""Return the Fuzzy Soundex code for a word. |
2011
|
|
|
|
2012
|
|
|
Fuzzy Soundex is an algorithm derived from Soundex, defined in: |
2013
|
|
|
Holmes, David and M. Catherine McCabe. "Improving Precision and Recall for |
2014
|
|
|
Soundex Retrieval." |
2015
|
|
|
http://wayback.archive.org/web/20100629121128/http://www.ir.iit.edu/publications/downloads/IEEESoundexV5.pdf |
2016
|
|
|
|
2017
|
|
|
:param str word: the word to transform |
2018
|
|
|
    :param int maxlength: the length of the code returned (defaults to 5)
2019
|
|
|
:param bool zero_pad: pad the end of the return value with 0s to achieve |
2020
|
|
|
a maxlength string |
2021
|
|
|
:returns: the Fuzzy Soundex value |
2022
|
|
|
:rtype: str |
2023
|
|
|
|
2024
|
|
|
>>> fuzzy_soundex('Christopher') |
2025
|
|
|
'K6931' |
2026
|
|
|
>>> fuzzy_soundex('Niall') |
2027
|
|
|
'N4000' |
2028
|
|
|
>>> fuzzy_soundex('Smith') |
2029
|
|
|
'S5300' |
2030
|
|
|
    >>> fuzzy_soundex('Schmidt')
2031
|
|
|
'S5300' |
2032
|
|
|
""" |
2033
|
|
|
_fuzzy_soundex_translation = dict(zip((ord(_) for _ in |
2034
|
|
|
'ABCDEFGHIJKLMNOPQRSTUVWXYZ'), |
2035
|
|
|
'0193017-07745501769301-7-9')) |
2036
|
|
|
|
2037
|
|
|
word = unicodedata.normalize('NFKD', text_type(word.upper())) |
2038
|
|
|
word = word.replace('ß', 'SS') |
2039
|
|
|
|
2040
|
|
|
# Clamp maxlength to [4, 64] |
2041
|
|
|
if maxlength is not None: |
2042
|
|
|
maxlength = min(max(4, maxlength), 64) |
2043
|
|
|
else: |
2044
|
|
|
maxlength = 64 |
2045
|
|
|
|
2046
|
|
|
if not word: |
2047
|
|
|
if zero_pad: |
2048
|
|
|
return '0' * maxlength |
2049
|
|
|
return '0' |
2050
|
|
|
|
2051
|
|
|
if word[:2] in {'CS', 'CZ', 'TS', 'TZ'}: |
2052
|
|
|
word = 'SS' + word[2:] |
2053
|
|
|
elif word[:2] == 'GN': |
2054
|
|
|
word = 'NN' + word[2:] |
2055
|
|
|
elif word[:2] in {'HR', 'WR'}: |
2056
|
|
|
word = 'RR' + word[2:] |
2057
|
|
|
elif word[:2] == 'HW': |
2058
|
|
|
word = 'WW' + word[2:] |
2059
|
|
|
elif word[:2] in {'KN', 'NG'}: |
2060
|
|
|
word = 'NN' + word[2:] |
2061
|
|
|
|
2062
|
|
|
if word[-2:] == 'CH': |
2063
|
|
|
word = word[:-2] + 'KK' |
2064
|
|
|
elif word[-2:] == 'NT': |
2065
|
|
|
word = word[:-2] + 'TT' |
2066
|
|
|
elif word[-2:] == 'RT': |
2067
|
|
|
word = word[:-2] + 'RR' |
2068
|
|
|
elif word[-3:] == 'RDT': |
2069
|
|
|
word = word[:-3] + 'RR' |
2070
|
|
|
|
2071
|
|
|
word = word.replace('CA', 'KA') |
2072
|
|
|
word = word.replace('CC', 'KK') |
2073
|
|
|
word = word.replace('CK', 'KK') |
2074
|
|
|
word = word.replace('CE', 'SE') |
2075
|
|
|
word = word.replace('CHL', 'KL') |
2076
|
|
|
word = word.replace('CL', 'KL') |
2077
|
|
|
word = word.replace('CHR', 'KR') |
2078
|
|
|
word = word.replace('CR', 'KR') |
2079
|
|
|
word = word.replace('CI', 'SI') |
2080
|
|
|
word = word.replace('CO', 'KO') |
2081
|
|
|
word = word.replace('CU', 'KU') |
2082
|
|
|
word = word.replace('CY', 'SY') |
2083
|
|
|
word = word.replace('DG', 'GG') |
2084
|
|
|
word = word.replace('GH', 'HH') |
2085
|
|
|
word = word.replace('MAC', 'MK') |
2086
|
|
|
word = word.replace('MC', 'MK') |
2087
|
|
|
word = word.replace('NST', 'NSS') |
2088
|
|
|
word = word.replace('PF', 'FF') |
2089
|
|
|
word = word.replace('PH', 'FF') |
2090
|
|
|
word = word.replace('SCH', 'SSS') |
2091
|
|
|
word = word.replace('TIO', 'SIO') |
2092
|
|
|
word = word.replace('TIA', 'SIO') |
2093
|
|
|
word = word.replace('TCH', 'CHH') |
2094
|
|
|
|
2095
|
|
|
sdx = word.translate(_fuzzy_soundex_translation) |
2096
|
|
|
sdx = sdx.replace('-', '') |
2097
|
|
|
|
2098
|
|
|
# remove repeating characters |
2099
|
|
|
sdx = _delete_consecutive_repeats(sdx) |
2100
|
|
|
|
2101
|
|
|
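    # The code always begins with the original first letter. Initial H, W,
    # and Y translate to '-' (already stripped above), so the whole encoded
    # remainder is kept for them; otherwise the first digit is replaced by
    # the letter itself.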
if word[0] in {'H', 'W', 'Y'}: |
2102
|
|
|
sdx = word[0] + sdx |
2103
|
|
|
else: |
2104
|
|
|
sdx = word[0] + sdx[1:] |
2105
|
|
|
|
2106
|
|
|
sdx = sdx.replace('0', '') |
2107
|
|
|
|
2108
|
|
|
if zero_pad: |
2109
|
|
|
sdx += ('0'*maxlength) |
2110
|
|
|
|
2111
|
|
|
return sdx[:maxlength] |
2112
|
|
|
|
2113
|
|
|
|
2114
|
|
|
def phonex(word, maxlength=4, zero_pad=True): |
2115
|
|
|
"""Return the Phonex code for a word. |
2116
|
|
|
|
2117
|
|
|
Phonex is an algorithm derived from Soundex, defined in: |
2118
|
|
|
Lait, A. J. and B. Randell. "An Assessment of Name Matching Algorithms". |
2119
|
|
|
http://homepages.cs.ncl.ac.uk/brian.randell/Genealogy/NameMatching.pdf |
2120
|
|
|
|
2121
|
|
|
:param str word: the word to transform |
2122
|
|
|
:param int maxlength: the length of the code returned (defaults to 4) |
2123
|
|
|
:param bool zero_pad: pad the end of the return value with 0s to achieve |
2124
|
|
|
a maxlength string |
2125
|
|
|
:returns: the Phonex value |
2126
|
|
|
:rtype: str |
2127
|
|
|
|
2128
|
|
|
>>> phonex('Christopher') |
2129
|
|
|
'C623' |
2130
|
|
|
>>> phonex('Niall') |
2131
|
|
|
'N400' |
2132
|
|
|
>>> phonex('Schmidt') |
2133
|
|
|
'S253' |
2134
|
|
|
>>> phonex('Smith') |
2135
|
|
|
'S530' |
2136
|
|
|
""" |
2137
|
|
|
name = unicodedata.normalize('NFKD', text_type(word.upper())) |
2138
|
|
|
name = name.replace('ß', 'SS') |
2139
|
|
|
|
2140
|
|
|
# Clamp maxlength to [4, 64] |
2141
|
|
|
if maxlength is not None: |
2142
|
|
|
maxlength = min(max(4, maxlength), 64) |
2143
|
|
|
else: |
2144
|
|
|
maxlength = 64 |
2145
|
|
|
|
2146
|
|
|
name_code = last = '' |
2147
|
|
|
|
2148
|
|
|
# Deletions effected by replacing with next letter which |
2149
|
|
|
# will be ignored due to duplicate handling of Soundex code. |
2150
|
|
|
# This is faster than 'moving' all subsequent letters. |
2151
|
|
|
|
2152
|
|
|
# Remove any trailing Ss |
2153
|
|
|
while name[-1:] == 'S': |
2154
|
|
|
name = name[:-1] |
2155
|
|
|
|
2156
|
|
|
# Phonetic equivalents of first 2 characters |
2157
|
|
|
# Works since duplicate letters are ignored |
2158
|
|
|
if name[:2] == 'KN': |
2159
|
|
|
name = 'N' + name[2:] # KN.. == N.. |
2160
|
|
|
elif name[:2] == 'PH': |
2161
|
|
|
name = 'F' + name[2:] # PH.. == F.. (H ignored anyway) |
2162
|
|
|
elif name[:2] == 'WR': |
2163
|
|
|
name = 'R' + name[2:] # WR.. == R.. |
2164
|
|
|
|
2165
|
|
|
if name: |
2166
|
|
|
# Special case, ignore H first letter (subsequent Hs ignored anyway) |
2167
|
|
|
# Works since duplicate letters are ignored |
2168
|
|
|
if name[0] == 'H': |
2169
|
|
|
name = name[1:] |
2170
|
|
|
|
2171
|
|
|
if name: |
2172
|
|
|
# Phonetic equivalents of first character |
2173
|
|
|
if name[0] in {'A', 'E', 'I', 'O', 'U', 'Y'}: |
2174
|
|
|
name = 'A' + name[1:] |
2175
|
|
|
elif name[0] in {'B', 'P'}: |
2176
|
|
|
name = 'B' + name[1:] |
2177
|
|
|
elif name[0] in {'V', 'F'}: |
2178
|
|
|
name = 'F' + name[1:] |
2179
|
|
|
elif name[0] in {'C', 'K', 'Q'}: |
2180
|
|
|
name = 'C' + name[1:] |
2181
|
|
|
elif name[0] in {'G', 'J'}: |
2182
|
|
|
name = 'G' + name[1:] |
2183
|
|
|
elif name[0] in {'S', 'Z'}: |
2184
|
|
|
name = 'S' + name[1:] |
2185
|
|
|
|
2186
|
|
|
name_code = last = name[0] |
2187
|
|
|
|
2188
|
|
|
# MODIFIED SOUNDEX CODE |
2189
|
|
|
for i in range(1, len(name)): |
2190
|
|
|
code = '0' |
2191
|
|
|
if name[i] in {'B', 'F', 'P', 'V'}: |
2192
|
|
|
code = '1' |
2193
|
|
|
elif name[i] in {'C', 'G', 'J', 'K', 'Q', 'S', 'X', 'Z'}: |
2194
|
|
|
code = '2' |
2195
|
|
|
elif name[i] in {'D', 'T'}: |
2196
|
|
|
if name[i+1:i+2] != 'C': |
2197
|
|
|
code = '3' |
2198
|
|
|
elif name[i] == 'L': |
2199
|
|
|
if (name[i+1:i+2] in {'A', 'E', 'I', 'O', 'U', 'Y'} or |
2200
|
|
|
i+1 == len(name)): |
2201
|
|
|
code = '4' |
2202
|
|
|
elif name[i] in {'M', 'N'}: |
2203
|
|
|
if name[i+1:i+2] in {'D', 'G'}: |
2204
|
|
|
name = name[:i+1] + name[i] + name[i+2:] |
2205
|
|
|
code = '5' |
2206
|
|
|
elif name[i] == 'R': |
2207
|
|
|
if (name[i+1:i+2] in {'A', 'E', 'I', 'O', 'U', 'Y'} or |
2208
|
|
|
i+1 == len(name)): |
2209
|
|
|
code = '6' |
2210
|
|
|
|
2211
|
|
|
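            # append a code digit only if it differs from the previous one;
            # '0' (assigned to vowels and otherwise-ignored letters) is
            # never appended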
if code != last and code != '0' and i != 0: |
2212
|
|
|
name_code += code |
2213
|
|
|
|
2214
|
|
|
last = name_code[-1] |
2215
|
|
|
|
2216
|
|
|
if zero_pad: |
2217
|
|
|
name_code += '0' * maxlength |
2218
|
|
|
if not name_code: |
2219
|
|
|
name_code = '0' |
2220
|
|
|
return name_code[:maxlength] |
2221
|
|
|
|
2222
|
|
|
|
2223
|
|
|
def phonem(word): |
2224
|
|
|
"""Return the Phonem code for a word. |
2225
|
|
|
|
2226
|
|
|
Phonem is defined in Wilde, Georg and Carsten Meyer. 1999. "Doppelgaenger |
2227
|
|
|
gesucht - Ein Programm fuer kontextsensitive phonetische Textumwandlung." |
2228
|
|
|
    c't Magazin fuer Computer & Technik 25/1999.
2229
|
|
|
|
2230
|
|
|
This version is based on the Perl implementation documented at: |
2231
|
|
|
http://phonetik.phil-fak.uni-koeln.de/fileadmin/home/ritters/Allgemeine_Dateien/Martin_Wilz.pdf |
2232
|
|
|
It includes some enhancements presented in the Java port at: |
2233
|
|
|
https://github.com/dcm4che/dcm4che/blob/master/dcm4che-soundex/src/main/java/org/dcm4che3/soundex/Phonem.java |
2234
|
|
|
|
2235
|
|
|
Phonem is intended chiefly for German names/words. |
2236
|
|
|
|
2237
|
|
|
:param str word: the word to transform |
2238
|
|
|
:returns: the Phonem value |
2239
|
|
|
:rtype: str |
2240
|
|
|
|
2241
|
|
|
>>> phonem('Christopher') |
2242
|
|
|
'CRYSDOVR' |
2243
|
|
|
>>> phonem('Niall') |
2244
|
|
|
'NYAL' |
2245
|
|
|
>>> phonem('Smith') |
2246
|
|
|
'SMYD' |
2247
|
|
|
>>> phonem('Schmidt') |
2248
|
|
|
'CMYD' |
2249
|
|
|
""" |
2250
|
|
|
_phonem_substitutions = (('SC', 'C'), ('SZ', 'C'), ('CZ', 'C'), |
2251
|
|
|
('TZ', 'C'), ('TS', 'C'), ('KS', 'X'), |
2252
|
|
|
('PF', 'V'), ('QU', 'KW'), ('PH', 'V'), |
2253
|
|
|
('UE', 'Y'), ('AE', 'E'), ('OE', 'Ö'), |
2254
|
|
|
('EI', 'AY'), ('EY', 'AY'), ('EU', 'OY'), |
2255
|
|
|
('AU', 'A§'), ('OU', '§')) |
2256
|
|
|
_phonem_translation = dict(zip((ord(_) for _ in |
2257
|
|
|
'ZKGQÇÑßFWPTÁÀÂÃÅÄÆÉÈÊËIJÌÍÎÏÜݧÚÙÛÔÒÓÕØ'), |
2258
|
|
|
'CCCCCNSVVBDAAAAAEEEEEEYYYYYYYYUUUUOOOOÖ')) |
2259
|
|
|
|
2260
|
|
|
word = unicodedata.normalize('NFC', text_type(word.upper())) |
2261
|
|
|
for i, j in _phonem_substitutions: |
2262
|
|
|
word = word.replace(i, j) |
2263
|
|
|
word = word.translate(_phonem_translation) |
2264
|
|
|
|
2265
|
|
|
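    # Collapse repeated characters and keep only letters of the Phonem
    # output alphabet; anything else is silently dropped.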
return ''.join(c for c in _delete_consecutive_repeats(word) |
2266
|
|
|
if c in {'A', 'B', 'C', 'D', 'L', 'M', 'N', 'O', 'R', 'S', |
2267
|
|
|
'U', 'V', 'W', 'X', 'Y', 'Ö'}) |
2268
|
|
|
|
2269
|
|
|
|
2270
|
|
|
def phonix(word, maxlength=4, zero_pad=True): |
2271
|
|
|
"""Return the Phonix code for a word. |
2272
|
|
|
|
2273
|
|
|
Phonix is a Soundex-like algorithm defined in: |
2274
|
|
|
T.N. Gadd: PHONIX --- The Algorithm, Program 24/4, 1990, p.363-366. |
2275
|
|
|
|
2276
|
|
|
This implementation is based on |
2277
|
|
|
http://cpansearch.perl.org/src/ULPFR/WAIT-1.800/soundex.c |
2278
|
|
|
http://cs.anu.edu.au/people/Peter.Christen/Febrl/febrl-0.4.01/encode.py |
2279
|
|
|
and |
2280
|
|
|
https://metacpan.org/pod/Text::Phonetic::Phonix |
2281
|
|
|
|
2282
|
|
|
:param str word: the word to transform |
2283
|
|
|
:param int maxlength: the length of the code returned (defaults to 4) |
2284
|
|
|
:param bool zero_pad: pad the end of the return value with 0s to achieve |
2285
|
|
|
a maxlength string |
2286
|
|
|
:returns: the Phonix value |
2287
|
|
|
:rtype: str |
2288
|
|
|
|
2289
|
|
|
>>> phonix('Christopher') |
2290
|
|
|
'K683' |
2291
|
|
|
>>> phonix('Niall') |
2292
|
|
|
'N400' |
2293
|
|
|
>>> phonix('Smith') |
2294
|
|
|
'S530' |
2295
|
|
|
>>> phonix('Schmidt') |
2296
|
|
|
'S530' |
2297
|
|
|
""" |
2298
|
|
|
# pylint: disable=too-many-branches |
2299
|
|
|
def _start_repl(word, src, tar, post=None): |
2300
|
|
|
r"""Replace src with tar at the start of word.""" |
2301
|
|
|
if post: |
2302
|
|
|
for i in post: |
2303
|
|
|
if word.startswith(src+i): |
2304
|
|
|
return tar + word[len(src):] |
2305
|
|
|
elif word.startswith(src): |
2306
|
|
|
return tar + word[len(src):] |
2307
|
|
|
return word |
2308
|
|
|
|
2309
|
|
|
def _end_repl(word, src, tar, pre=None): |
2310
|
|
|
r"""Replace src with tar at the end of word.""" |
2311
|
|
|
if pre: |
2312
|
|
|
for i in pre: |
2313
|
|
|
if word.endswith(i+src): |
2314
|
|
|
return word[:-len(src)] + tar |
2315
|
|
|
elif word.endswith(src): |
2316
|
|
|
return word[:-len(src)] + tar |
2317
|
|
|
return word |
2318
|
|
|
|
2319
|
|
|
def _mid_repl(word, src, tar, pre=None, post=None): |
2320
|
|
|
r"""Replace src with tar in the middle of word.""" |
2321
|
|
|
if pre or post: |
2322
|
|
|
if not pre: |
2323
|
|
|
return word[0] + _all_repl(word[1:], src, tar, pre, post) |
2324
|
|
|
elif not post: |
2325
|
|
|
return _all_repl(word[:-1], src, tar, pre, post) + word[-1] |
2326
|
|
|
return _all_repl(word, src, tar, pre, post) |
2327
|
|
|
return (word[0] + _all_repl(word[1:-1], src, tar, pre, post) + |
2328
|
|
|
word[-1]) |
2329
|
|
|
|
2330
|
|
|
def _all_repl(word, src, tar, pre=None, post=None): |
2331
|
|
|
r"""Replace src with tar anywhere in word.""" |
2332
|
|
|
if pre or post: |
2333
|
|
|
if post: |
2334
|
|
|
post = post |
2335
|
|
|
else: |
2336
|
|
|
post = frozenset(('',)) |
2337
|
|
|
if pre: |
2338
|
|
|
pre = pre |
2339
|
|
|
else: |
2340
|
|
|
pre = frozenset(('',)) |
2341
|
|
|
|
2342
|
|
|
for i, j in ((i, j) for i in pre for j in post): |
2343
|
|
|
word = word.replace(i+src+j, i+tar+j) |
2344
|
|
|
return word |
2345
|
|
|
else: |
2346
|
|
|
return word.replace(src, tar) |
2347
|
|
|
|
2348
|
|
|
_vow = {'A', 'E', 'I', 'O', 'U'} |
2349
|
|
|
_con = {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', |
2350
|
|
|
'R', 'S', 'T', 'V', 'W', 'X', 'Y', 'Z'} |
2351
|
|
|
|
2352
|
|
|
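    # Each rule below is (replacement function, source, target), optionally
    # followed by context sets of letters that must precede and/or follow
    # the source for the rule to apply (see the _*_repl helpers above for
    # the exact argument order).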
_phonix_substitutions = ((_all_repl, 'DG', 'G'), |
2353
|
|
|
(_all_repl, 'CO', 'KO'), |
2354
|
|
|
(_all_repl, 'CA', 'KA'), |
2355
|
|
|
(_all_repl, 'CU', 'KU'), |
2356
|
|
|
(_all_repl, 'CY', 'SI'), |
2357
|
|
|
(_all_repl, 'CI', 'SI'), |
2358
|
|
|
(_all_repl, 'CE', 'SE'), |
2359
|
|
|
(_start_repl, 'CL', 'KL', _vow), |
2360
|
|
|
(_all_repl, 'CK', 'K'), |
2361
|
|
|
(_end_repl, 'GC', 'K'), |
2362
|
|
|
(_end_repl, 'JC', 'K'), |
2363
|
|
|
(_start_repl, 'CHR', 'KR', _vow), |
2364
|
|
|
(_start_repl, 'CR', 'KR', _vow), |
2365
|
|
|
(_start_repl, 'WR', 'R'), |
2366
|
|
|
(_all_repl, 'NC', 'NK'), |
2367
|
|
|
(_all_repl, 'CT', 'KT'), |
2368
|
|
|
(_all_repl, 'PH', 'F'), |
2369
|
|
|
(_all_repl, 'AA', 'AR'), |
2370
|
|
|
(_all_repl, 'SCH', 'SH'), |
2371
|
|
|
(_all_repl, 'BTL', 'TL'), |
2372
|
|
|
(_all_repl, 'GHT', 'T'), |
2373
|
|
|
(_all_repl, 'AUGH', 'ARF'), |
2374
|
|
|
(_mid_repl, 'LJ', 'LD', _vow, _vow), |
2375
|
|
|
(_all_repl, 'LOUGH', 'LOW'), |
2376
|
|
|
(_start_repl, 'Q', 'KW'), |
2377
|
|
|
(_start_repl, 'KN', 'N'), |
2378
|
|
|
(_end_repl, 'GN', 'N'), |
2379
|
|
|
(_all_repl, 'GHN', 'N'), |
2380
|
|
|
(_end_repl, 'GNE', 'N'), |
2381
|
|
|
(_all_repl, 'GHNE', 'NE'), |
2382
|
|
|
(_end_repl, 'GNES', 'NS'), |
2383
|
|
|
(_start_repl, 'GN', 'N'), |
2384
|
|
|
(_mid_repl, 'GN', 'N', None, _con), |
2385
|
|
|
(_end_repl, 'GN', 'N'), |
2386
|
|
|
(_start_repl, 'PS', 'S'), |
2387
|
|
|
(_start_repl, 'PT', 'T'), |
2388
|
|
|
(_start_repl, 'CZ', 'C'), |
2389
|
|
|
(_mid_repl, 'WZ', 'Z', _vow), |
2390
|
|
|
(_mid_repl, 'CZ', 'CH'), |
2391
|
|
|
(_all_repl, 'LZ', 'LSH'), |
2392
|
|
|
(_all_repl, 'RZ', 'RSH'), |
2393
|
|
|
(_mid_repl, 'Z', 'S', None, _vow), |
2394
|
|
|
(_all_repl, 'ZZ', 'TS'), |
2395
|
|
|
(_mid_repl, 'Z', 'TS', _con), |
2396
|
|
|
(_all_repl, 'HROUG', 'REW'), |
2397
|
|
|
(_all_repl, 'OUGH', 'OF'), |
2398
|
|
|
(_mid_repl, 'Q', 'KW', _vow, _vow), |
2399
|
|
|
(_mid_repl, 'J', 'Y', _vow, _vow), |
2400
|
|
|
(_start_repl, 'YJ', 'Y', _vow), |
2401
|
|
|
(_start_repl, 'GH', 'G'), |
2402
|
|
|
(_end_repl, 'GH', 'E', _vow), |
2403
|
|
|
(_start_repl, 'CY', 'S'), |
2404
|
|
|
(_all_repl, 'NX', 'NKS'), |
2405
|
|
|
(_start_repl, 'PF', 'F'), |
2406
|
|
|
(_end_repl, 'DT', 'T'), |
2407
|
|
|
(_end_repl, 'TL', 'TIL'), |
2408
|
|
|
(_end_repl, 'DL', 'DIL'), |
2409
|
|
|
(_all_repl, 'YTH', 'ITH'), |
2410
|
|
|
(_start_repl, 'TJ', 'CH', _vow), |
2411
|
|
|
(_start_repl, 'TSJ', 'CH', _vow), |
2412
|
|
|
(_start_repl, 'TS', 'T', _vow), |
2413
|
|
|
(_all_repl, 'TCH', 'CH'), |
2414
|
|
|
(_mid_repl, 'WSK', 'VSKIE', _vow), |
2415
|
|
|
(_end_repl, 'WSK', 'VSKIE', _vow), |
2416
|
|
|
(_start_repl, 'MN', 'N', _vow), |
2417
|
|
|
(_start_repl, 'PN', 'N', _vow), |
2418
|
|
|
(_mid_repl, 'STL', 'SL', _vow), |
2419
|
|
|
(_end_repl, 'STL', 'SL', _vow), |
2420
|
|
|
(_end_repl, 'TNT', 'ENT'), |
2421
|
|
|
(_end_repl, 'EAUX', 'OH'), |
2422
|
|
|
(_all_repl, 'EXCI', 'ECS'), |
2423
|
|
|
(_all_repl, 'X', 'ECS'), |
2424
|
|
|
(_end_repl, 'NED', 'ND'), |
2425
|
|
|
(_all_repl, 'JR', 'DR'), |
2426
|
|
|
(_end_repl, 'EE', 'EA'), |
2427
|
|
|
(_all_repl, 'ZS', 'S'), |
2428
|
|
|
(_mid_repl, 'R', 'AH', _vow, _con), |
2429
|
|
|
(_end_repl, 'R', 'AH', _vow), |
2430
|
|
|
(_mid_repl, 'HR', 'AH', _vow, _con), |
2431
|
|
|
(_end_repl, 'HR', 'AH', _vow), |
2432
|
|
|
(_end_repl, 'HR', 'AH', _vow), |
2433
|
|
|
(_end_repl, 'RE', 'AR'), |
2434
|
|
|
(_end_repl, 'R', 'AH', _vow), |
2435
|
|
|
(_all_repl, 'LLE', 'LE'), |
2436
|
|
|
(_end_repl, 'LE', 'ILE', _con), |
2437
|
|
|
(_end_repl, 'LES', 'ILES', _con), |
2438
|
|
|
(_end_repl, 'E', ''), |
2439
|
|
|
(_end_repl, 'ES', 'S'), |
2440
|
|
|
(_end_repl, 'SS', 'AS', _vow), |
2441
|
|
|
(_end_repl, 'MB', 'M', _vow), |
2442
|
|
|
(_all_repl, 'MPTS', 'MPS'), |
2443
|
|
|
(_all_repl, 'MPS', 'MS'), |
2444
|
|
|
(_all_repl, 'MPT', 'MT')) |
2445
|
|
|
|
2446
|
|
|
_phonix_translation = dict(zip((ord(_) for _ in |
2447
|
|
|
'ABCDEFGHIJKLMNOPQRSTUVWXYZ'), |
2448
|
|
|
'01230720022455012683070808')) |
2449
|
|
|
|
2450
|
|
|
sdx = '' |
2451
|
|
|
|
2452
|
|
|
word = unicodedata.normalize('NFKD', text_type(word.upper())) |
2453
|
|
|
word = word.replace('ß', 'SS') |
2454
|
|
|
word = ''.join(c for c in word if c in |
2455
|
|
|
{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
2456
|
|
|
'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
2457
|
|
|
'Y', 'Z'}) |
2458
|
|
|
if word: |
2459
|
|
|
for trans in _phonix_substitutions: |
2460
|
|
|
word = trans[0](word, *trans[1:]) |
2461
|
|
|
if word[0] in {'A', 'E', 'I', 'O', 'U', 'Y'}: |
2462
|
|
|
sdx = 'v' + word[1:].translate(_phonix_translation) |
2463
|
|
|
else: |
2464
|
|
|
sdx = word[0] + word[1:].translate(_phonix_translation) |
2465
|
|
|
sdx = _delete_consecutive_repeats(sdx) |
2466
|
|
|
sdx = sdx.replace('0', '') |
2467
|
|
|
|
2468
|
|
|
# Clamp maxlength to [4, 64] |
2469
|
|
|
if maxlength is not None: |
2470
|
|
|
maxlength = min(max(4, maxlength), 64) |
2471
|
|
|
else: |
2472
|
|
|
maxlength = 64 |
2473
|
|
|
|
2474
|
|
|
if zero_pad: |
2475
|
|
|
sdx += '0' * maxlength |
2476
|
|
|
if not sdx: |
2477
|
|
|
sdx = '0' |
2478
|
|
|
return sdx[:maxlength] |
2479
|
|
|
|
2480
|
|
|
|
2481
|
|
|
def sfinxbis(word, maxlength=None): |
2482
|
|
|
"""Return the SfinxBis code for a word. |
2483
|
|
|
|
2484
|
|
|
SfinxBis is a Soundex-like algorithm defined in: |
2485
|
|
|
http://www.swami.se/download/18.248ad5af12aa8136533800091/SfinxBis.pdf |
2486
|
|
|
|
2487
|
|
|
This implementation follows the reference implementation: |
2488
|
|
|
http://www.swami.se/download/18.248ad5af12aa8136533800093/swamiSfinxBis.java.txt |
2489
|
|
|
|
2490
|
|
|
SfinxBis is intended chiefly for Swedish names. |
2491
|
|
|
|
2492
|
|
|
:param str word: the word to transform |
2493
|
|
|
:param int maxlength: the length of the code returned (defaults to |
2494
|
|
|
unlimited) |
2495
|
|
|
:returns: the SfinxBis value |
2496
|
|
|
:rtype: tuple |
2497
|
|
|
|
2498
|
|
|
>>> sfinxbis('Christopher') |
2499
|
|
|
('K68376',) |
2500
|
|
|
>>> sfinxbis('Niall') |
2501
|
|
|
('N4',) |
2502
|
|
|
>>> sfinxbis('Smith') |
2503
|
|
|
('S53',) |
2504
|
|
|
>>> sfinxbis('Schmidt') |
2505
|
|
|
('S53',) |
2506
|
|
|
|
2507
|
|
|
>>> sfinxbis('Johansson') |
2508
|
|
|
('J585',) |
2509
|
|
|
>>> sfinxbis('Sjöberg') |
2510
|
|
|
('#162',) |
2511
|
|
|
""" |
2512
|
|
|
adelstitler = (' DE LA ', ' DE LAS ', ' DE LOS ', ' VAN DE ', ' VAN DEN ', |
2513
|
|
|
' VAN DER ', ' VON DEM ', ' VON DER ', |
2514
|
|
|
' AF ', ' AV ', ' DA ', ' DE ', ' DEL ', ' DEN ', ' DES ', |
2515
|
|
|
' DI ', ' DO ', ' DON ', ' DOS ', ' DU ', ' E ', ' IN ', |
2516
|
|
|
' LA ', ' LE ', ' MAC ', ' MC ', ' VAN ', ' VON ', ' Y ', |
2517
|
|
|
' S:T ') |
2518
|
|
|
|
2519
|
|
|
_harde_vokaler = {'A', 'O', 'U', 'Å'} |
2520
|
|
|
_mjuka_vokaler = {'E', 'I', 'Y', 'Ä', 'Ö'} |
2521
|
|
|
_konsonanter = {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', |
2522
|
|
|
'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Z'} |
2523
|
|
|
_alfabet = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
2524
|
|
|
'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
2525
|
|
|
'Y', 'Z', 'Ä', 'Å', 'Ö'} |
2526
|
|
|
|
2527
|
|
|
_sfinxbis_translation = dict(zip((ord(_) for _ in |
2528
|
|
|
'BCDFGHJKLMNPQRSTVZAOUÅEIYÄÖ'), |
2529
|
|
|
'123729224551268378999999999')) |
2530
|
|
|
|
2531
|
|
|
_sfinxbis_substitutions = dict(zip((ord(_) for _ in |
2532
|
|
|
'WZÀÁÂÃÆÇÈÉÊËÌÍÎÏÑÒÓÔÕØÙÚÛÜÝ'), |
2533
|
|
|
'VSAAAAÄCEEEEIIIINOOOOÖUUUYY')) |
2534
|
|
|
|
2535
|
|
|
def _foersvensker(ordet): |
2536
|
|
|
"""Return the Swedish-ized form of the word.""" |
2537
|
|
|
ordet = ordet.replace('STIERN', 'STJÄRN') |
2538
|
|
|
ordet = ordet.replace('HIE', 'HJ') |
2539
|
|
|
ordet = ordet.replace('SIÖ', 'SJÖ') |
2540
|
|
|
ordet = ordet.replace('SCH', 'SH') |
2541
|
|
|
ordet = ordet.replace('QU', 'KV') |
2542
|
|
|
ordet = ordet.replace('IO', 'JO') |
2543
|
|
|
ordet = ordet.replace('PH', 'F') |
2544
|
|
|
|
2545
|
|
|
for i in _harde_vokaler: |
2546
|
|
|
ordet = ordet.replace(i+'Ü', i+'J') |
2547
|
|
|
ordet = ordet.replace(i+'Y', i+'J') |
2548
|
|
|
ordet = ordet.replace(i+'I', i+'J') |
2549
|
|
|
for i in _mjuka_vokaler: |
2550
|
|
|
ordet = ordet.replace(i+'Ü', i+'J') |
2551
|
|
|
ordet = ordet.replace(i+'Y', i+'J') |
2552
|
|
|
ordet = ordet.replace(i+'I', i+'J') |
2553
|
|
|
|
2554
|
|
|
if 'H' in ordet: |
2555
|
|
|
for i in _konsonanter: |
2556
|
|
|
ordet = ordet.replace('H'+i, i) |
2557
|
|
|
|
2558
|
|
|
ordet = ordet.translate(_sfinxbis_substitutions) |
2559
|
|
|
|
2560
|
|
|
ordet = ordet.replace('Ð', 'ETH') |
2561
|
|
|
ordet = ordet.replace('Þ', 'TH') |
2562
|
|
|
ordet = ordet.replace('ß', 'SS') |
2563
|
|
|
|
2564
|
|
|
return ordet |
2565
|
|
|
|
2566
|
|
|
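    # Encodes the first sound of the name: '$' stands for an initial vowel
    # and '#' for (roughly) the Swedish sje- and tje-sounds (SJ, SKJ, STJ,
    # SCH, SH, KJ, TJ, ...), so either symbol may appear as the first
    # character of the returned code, e.g. 'Sjöberg' yields '#162'.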
def _koda_foersta_ljudet(ordet): |
2567
|
|
|
"""Return the word with the first sound coded.""" |
2568
|
|
|
if ordet[0:1] in _mjuka_vokaler or ordet[0:1] in _harde_vokaler: |
2569
|
|
|
ordet = '$' + ordet[1:] |
2570
|
|
|
elif ordet[0:2] in ('DJ', 'GJ', 'HJ', 'LJ'): |
2571
|
|
|
ordet = 'J' + ordet[2:] |
2572
|
|
|
elif ordet[0:1] == 'G' and ordet[1:2] in _mjuka_vokaler: |
2573
|
|
|
ordet = 'J' + ordet[1:] |
2574
|
|
|
elif ordet[0:1] == 'Q': |
2575
|
|
|
ordet = 'K' + ordet[1:] |
2576
|
|
|
elif (ordet[0:2] == 'CH' and |
2577
|
|
|
ordet[2:3] in frozenset(_mjuka_vokaler | _harde_vokaler)): |
2578
|
|
|
ordet = '#' + ordet[2:] |
2579
|
|
|
elif ordet[0:1] == 'C' and ordet[1:2] in _harde_vokaler: |
2580
|
|
|
ordet = 'K' + ordet[1:] |
2581
|
|
|
elif ordet[0:1] == 'C' and ordet[1:2] in _konsonanter: |
2582
|
|
|
ordet = 'K' + ordet[1:] |
2583
|
|
|
elif ordet[0:1] == 'X': |
2584
|
|
|
ordet = 'S' + ordet[1:] |
2585
|
|
|
elif ordet[0:1] == 'C' and ordet[1:2] in _mjuka_vokaler: |
2586
|
|
|
ordet = 'S' + ordet[1:] |
2587
|
|
|
elif ordet[0:3] in ('SKJ', 'STJ', 'SCH'): |
2588
|
|
|
ordet = '#' + ordet[3:] |
2589
|
|
|
elif ordet[0:2] in ('SH', 'KJ', 'TJ', 'SJ'): |
2590
|
|
|
ordet = '#' + ordet[2:] |
2591
|
|
|
elif ordet[0:2] == 'SK' and ordet[2:3] in _mjuka_vokaler: |
2592
|
|
|
ordet = '#' + ordet[2:] |
2593
|
|
|
elif ordet[0:1] == 'K' and ordet[1:2] in _mjuka_vokaler: |
2594
|
|
|
ordet = '#' + ordet[1:] |
2595
|
|
|
return ordet |
2596
|
|
|
|
2597
|
|
|
    # Step 1: convert to uppercase
2598
|
|
|
word = unicodedata.normalize('NFC', text_type(word.upper())) |
2599
|
|
|
word = word.replace('ß', 'SS') |
2600
|
|
|
word = word.replace('-', ' ') |
2601
|
|
|
|
2602
|
|
|
    # Step 2: remove noble prefixes
2603
|
|
|
for adelstitel in adelstitler: |
2604
|
|
|
while adelstitel in word: |
2605
|
|
|
word = word.replace(adelstitel, ' ') |
2606
|
|
|
if word.startswith(adelstitel[1:]): |
2607
|
|
|
word = word[len(adelstitel)-1:] |
2608
|
|
|
|
2609
|
|
|
# Split word into tokens |
2610
|
|
|
ordlista = word.split() |
2611
|
|
|
|
2612
|
|
|
    # Step 3: remove doubled letters at the beginning of the name
2613
|
|
|
ordlista = [_delete_consecutive_repeats(ordet) for ordet in ordlista] |
2614
|
|
|
if not ordlista: |
2615
|
|
|
return ('',) |
2616
|
|
|
|
2617
|
|
|
    # Step 4: Swedish-ize the name
2618
|
|
|
ordlista = [_foersvensker(ordet) for ordet in ordlista] |
2619
|
|
|
|
2620
|
|
|
    # Step 5: remove all characters that are not A-Ö (65-90,196,197,214)
2621
|
|
|
ordlista = [''.join(c for c in ordet if c in _alfabet) |
2622
|
|
|
for ordet in ordlista] |
2623
|
|
|
|
2624
|
|
|
    # Step 6: encode the first sound
2625
|
|
|
ordlista = [_koda_foersta_ljudet(ordet) for ordet in ordlista] |
2626
|
|
|
|
2627
|
|
|
    # Step 7: split the name into two parts
2628
|
|
|
rest = [ordet[1:] for ordet in ordlista] |
2629
|
|
|
|
2630
|
|
|
    # Step 8: apply phonetic transformations to the remainder
2631
|
|
|
rest = [ordet.replace('DT', 'T') for ordet in rest] |
2632
|
|
|
rest = [ordet.replace('X', 'KS') for ordet in rest] |
2633
|
|
|
|
2634
|
|
|
    # Step 9: encode the remainder as a numeric code
2635
|
|
|
for vokal in _mjuka_vokaler: |
2636
|
|
|
rest = [ordet.replace('C'+vokal, '8'+vokal) for ordet in rest] |
2637
|
|
|
rest = [ordet.translate(_sfinxbis_translation) for ordet in rest] |
2638
|
|
|
|
2639
|
|
|
    # Step 10: remove adjacent duplicates
2640
|
|
|
rest = [_delete_consecutive_repeats(ordet) for ordet in rest] |
2641
|
|
|
|
2642
|
|
|
    # Step 11: remove all "9"s
2643
|
|
|
rest = [ordet.replace('9', '') for ordet in rest] |
2644
|
|
|
|
2645
|
|
|
    # Step 12: reassemble the parts
2646
|
|
|
ordlista = [''.join(ordet) for ordet in |
2647
|
|
|
zip((_[0:1] for _ in ordlista), rest)] |
2648
|
|
|
|
2649
|
|
|
# truncate, if maxlength is set |
2650
|
|
|
if maxlength and maxlength < _INFINITY: |
2651
|
|
|
ordlista = [ordet[:maxlength] for ordet in ordlista] |
2652
|
|
|
|
2653
|
|
|
return tuple(ordlista) |
2654
|
|
|
|
2655
|
|
|
|
2656
|
|
|
def phonet(word, mode=1, lang='de', trace=False): |
2657
|
|
|
"""Return the phonet code for a word. |
2658
|
|
|
|
2659
|
|
|
phonet was developed by Jörg Michael and documented in c't magazine |
2660
|
|
|
vol. 25/1999, p. 252. It is a phonetic algorithm designed primarily for |
2661
|
|
|
German. |
2662
|
|
|
Cf. http://www.heise.de/ct/ftp/99/25/252/ |
2663
|
|
|
|
2664
|
|
|
This is a port of Jesper Zedlitz's code, which is licensed LGPL: |
2665
|
|
|
https://code.google.com/p/phonet4java/source/browse/trunk/src/main/java/com/googlecode/phonet4java/Phonet.java |
2666
|
|
|
|
2667
|
|
|
That is, in turn, based on Michael's C code, which is also licensed LGPL: |
2668
|
|
|
ftp://ftp.heise.de/pub/ct/listings/phonet.zip |
2669
|
|
|
|
2670
|
|
|
:param str word: the word to transform |
2671
|
|
|
    :param int mode: the phonet variant to employ (1 or 2)
2672
|
|
|
:param str lang: 'de' (default) for German |
2673
|
|
|
'none' for no language |
2674
|
|
|
:param bool trace: prints debugging info if True |
2675
|
|
|
:returns: the phonet value |
2676
|
|
|
:rtype: str |
2677
|
|
|
|
2678
|
|
|
>>> phonet('Christopher') |
2679
|
|
|
'KRISTOFA' |
2680
|
|
|
>>> phonet('Niall') |
2681
|
|
|
'NIAL' |
2682
|
|
|
>>> phonet('Smith') |
2683
|
|
|
'SMIT' |
2684
|
|
|
>>> phonet('Schmidt') |
2685
|
|
|
'SHMIT' |
2686
|
|
|
|
2687
|
|
|
>>> phonet('Christopher', mode=2) |
2688
|
|
|
'KRIZTUFA' |
2689
|
|
|
>>> phonet('Niall', mode=2) |
2690
|
|
|
'NIAL' |
2691
|
|
|
>>> phonet('Smith', mode=2) |
2692
|
|
|
'ZNIT' |
2693
|
|
|
>>> phonet('Schmidt', mode=2) |
2694
|
|
|
'ZNIT' |
2695
|
|
|
|
2696
|
|
|
>>> phonet('Christopher', lang='none') |
2697
|
|
|
'CHRISTOPHER' |
2698
|
|
|
>>> phonet('Niall', lang='none') |
2699
|
|
|
'NIAL' |
2700
|
|
|
>>> phonet('Smith', lang='none') |
2701
|
|
|
'SMITH' |
2702
|
|
|
>>> phonet('Schmidt', lang='none') |
2703
|
|
|
'SCHMIDT' |
2704
|
|
|
""" |
2705
|
|
|
# pylint: disable=too-many-branches |
2706
|
|
|
|
2707
|
|
|
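    # The rule tables below are flat sequences of triples: (search pattern,
    # replacement for phonet variant 1, replacement for variant 2).  A None
    # replacement means the rule is not defined for that variant, and a
    # closing None, None, None entry marks the end of a table.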
_phonet_rules_no_lang = ( # separator chars |
2708
|
|
|
'´', ' ', ' ', |
2709
|
|
|
'"', ' ', ' ', |
2710
|
|
|
'`$', '', '', |
2711
|
|
|
'\'', ' ', ' ', |
2712
|
|
|
',', ',', ',', |
2713
|
|
|
';', ',', ',', |
2714
|
|
|
'-', ' ', ' ', |
2715
|
|
|
' ', ' ', ' ', |
2716
|
|
|
'.', '.', '.', |
2717
|
|
|
':', '.', '.', |
2718
|
|
|
# German umlauts |
2719
|
|
|
'Ä', 'AE', 'AE', |
2720
|
|
|
'Ö', 'OE', 'OE', |
2721
|
|
|
'Ü', 'UE', 'UE', |
2722
|
|
|
'ß', 'S', 'S', |
2723
|
|
|
# international umlauts |
2724
|
|
|
'À', 'A', 'A', |
2725
|
|
|
'Á', 'A', 'A', |
2726
|
|
|
'Â', 'A', 'A', |
2727
|
|
|
'Ã', 'A', 'A', |
2728
|
|
|
'Å', 'A', 'A', |
2729
|
|
|
'Æ', 'AE', 'AE', |
2730
|
|
|
'Ç', 'C', 'C', |
2731
|
|
|
'Ð', 'DJ', 'DJ', |
2732
|
|
|
'È', 'E', 'E', |
2733
|
|
|
'É', 'E', 'E', |
2734
|
|
|
'Ê', 'E', 'E', |
2735
|
|
|
'Ë', 'E', 'E', |
2736
|
|
|
'Ì', 'I', 'I', |
2737
|
|
|
'Í', 'I', 'I', |
2738
|
|
|
'Î', 'I', 'I', |
2739
|
|
|
'Ï', 'I', 'I', |
2740
|
|
|
'Ñ', 'NH', 'NH', |
2741
|
|
|
'Ò', 'O', 'O', |
2742
|
|
|
'Ó', 'O', 'O', |
2743
|
|
|
'Ô', 'O', 'O', |
2744
|
|
|
'Õ', 'O', 'O', |
2745
|
|
|
'Œ', 'OE', 'OE', |
2746
|
|
|
'Ø', 'OE', 'OE', |
2747
|
|
|
'Š', 'SH', 'SH', |
2748
|
|
|
'Þ', 'TH', 'TH', |
2749
|
|
|
'Ù', 'U', 'U', |
2750
|
|
|
'Ú', 'U', 'U', |
2751
|
|
|
'Û', 'U', 'U', |
2752
|
|
|
'Ý', 'Y', 'Y', |
2753
|
|
|
'Ÿ', 'Y', 'Y', |
2754
|
|
|
# 'normal' letters (A-Z) |
2755
|
|
|
'MC^', 'MAC', 'MAC', |
2756
|
|
|
'MC^', 'MAC', 'MAC', |
2757
|
|
|
'M´^', 'MAC', 'MAC', |
2758
|
|
|
'M\'^', 'MAC', 'MAC', |
2759
|
|
|
'O´^', 'O', 'O', |
2760
|
|
|
'O\'^', 'O', 'O', |
2761
|
|
|
'VAN DEN ^', 'VANDEN', 'VANDEN', |
2762
|
|
|
None, None, None) |
2763
|
|
|
|
2764
|
|
|
_phonet_rules_german = ( # separator chars |
2765
|
|
|
'´', ' ', ' ', |
2766
|
|
|
'"', ' ', ' ', |
2767
|
|
|
'`$', '', '', |
2768
|
|
|
'\'', ' ', ' ', |
2769
|
|
|
',', ' ', ' ', |
2770
|
|
|
';', ' ', ' ', |
2771
|
|
|
'-', ' ', ' ', |
2772
|
|
|
' ', ' ', ' ', |
2773
|
|
|
'.', '.', '.', |
2774
|
|
|
':', '.', '.', |
2775
|
|
|
# German umlauts |
2776
|
|
|
'ÄE', 'E', 'E', |
2777
|
|
|
'ÄU<', 'EU', 'EU', |
2778
|
|
|
'ÄV(AEOU)-<', 'EW', None, |
2779
|
|
|
'Ä$', 'Ä', None, |
2780
|
|
|
'Ä<', None, 'E', |
2781
|
|
|
'Ä', 'E', None, |
2782
|
|
|
'ÖE', 'Ö', 'Ö', |
2783
|
|
|
'ÖU', 'Ö', 'Ö', |
2784
|
|
|
'ÖVER--<', 'ÖW', None, |
2785
|
|
|
'ÖV(AOU)-', 'ÖW', None, |
2786
|
|
|
'ÜBEL(GNRW)-^^', 'ÜBL ', 'IBL ', |
2787
|
|
|
'ÜBER^^', 'ÜBA', 'IBA', |
2788
|
|
|
'ÜE', 'Ü', 'I', |
2789
|
|
|
'ÜVER--<', 'ÜW', None, |
2790
|
|
|
'ÜV(AOU)-', 'ÜW', None, |
2791
|
|
|
'Ü', None, 'I', |
2792
|
|
|
'ßCH<', None, 'Z', |
2793
|
|
|
'ß<', 'S', 'Z', |
2794
|
|
|
# international umlauts |
2795
|
|
|
'À<', 'A', 'A', |
2796
|
|
|
'Á<', 'A', 'A', |
2797
|
|
|
'Â<', 'A', 'A', |
2798
|
|
|
'Ã<', 'A', 'A', |
2799
|
|
|
'Å<', 'A', 'A', |
2800
|
|
|
'ÆER-', 'E', 'E', |
2801
|
|
|
'ÆU<', 'EU', 'EU', |
2802
|
|
|
'ÆV(AEOU)-<', 'EW', None, |
2803
|
|
|
'Æ$', 'Ä', None, |
2804
|
|
|
'Æ<', None, 'E', |
2805
|
|
|
'Æ', 'E', None, |
2806
|
|
|
'Ç', 'Z', 'Z', |
2807
|
|
|
'ÐÐ-', '', '', |
2808
|
|
|
'Ð', 'DI', 'TI', |
2809
|
|
|
'È<', 'E', 'E', |
2810
|
|
|
'É<', 'E', 'E', |
2811
|
|
|
'Ê<', 'E', 'E', |
2812
|
|
|
'Ë', 'E', 'E', |
2813
|
|
|
'Ì<', 'I', 'I', |
2814
|
|
|
'Í<', 'I', 'I', |
2815
|
|
|
'Î<', 'I', 'I', |
2816
|
|
|
'Ï', 'I', 'I', |
2817
|
|
|
'ÑÑ-', '', '', |
2818
|
|
|
'Ñ', 'NI', 'NI', |
2819
|
|
|
'Ò<', 'O', 'U', |
2820
|
|
|
'Ó<', 'O', 'U', |
2821
|
|
|
'Ô<', 'O', 'U', |
2822
|
|
|
'Õ<', 'O', 'U', |
2823
|
|
|
'Œ<', 'Ö', 'Ö', |
2824
|
|
|
'Ø(IJY)-<', 'E', 'E', |
2825
|
|
|
'Ø<', 'Ö', 'Ö', |
2826
|
|
|
'Š', 'SH', 'Z', |
2827
|
|
|
'Þ', 'T', 'T', |
2828
|
|
|
'Ù<', 'U', 'U', |
2829
|
|
|
'Ú<', 'U', 'U', |
2830
|
|
|
'Û<', 'U', 'U', |
2831
|
|
|
'Ý<', 'I', 'I', |
2832
|
|
|
'Ÿ<', 'I', 'I', |
2833
|
|
|
# 'normal' letters (A-Z) |
2834
|
|
|
'ABELLE$', 'ABL', 'ABL', |
2835
|
|
|
'ABELL$', 'ABL', 'ABL', |
2836
|
|
|
'ABIENNE$', 'ABIN', 'ABIN', |
2837
|
|
|
'ACHME---^', 'ACH', 'AK', |
2838
|
|
|
'ACEY$', 'AZI', 'AZI', |
2839
|
|
|
'ADV', 'ATW', None, |
2840
|
|
|
'AEGL-', 'EK', None, |
2841
|
|
|
'AEU<', 'EU', 'EU', |
2842
|
|
|
'AE2', 'E', 'E', |
2843
|
|
|
'AFTRAUBEN------', 'AFT ', 'AFT ', |
2844
|
|
|
'AGL-1', 'AK', None, |
2845
|
|
|
'AGNI-^', 'AKN', 'AKN', |
2846
|
|
|
'AGNIE-', 'ANI', 'ANI', |
2847
|
|
|
'AGN(AEOU)-$', 'ANI', 'ANI', |
2848
|
|
|
'AH(AIOÖUÜY)-', 'AH', None, |
2849
|
|
|
'AIA2', 'AIA', 'AIA', |
2850
|
|
|
'AIE$', 'E', 'E', |
2851
|
|
|
'AILL(EOU)-', 'ALI', 'ALI', |
2852
|
|
|
'AINE$', 'EN', 'EN', |
2853
|
|
|
'AIRE$', 'ER', 'ER', |
2854
|
|
|
'AIR-', 'E', 'E', |
2855
|
|
|
'AISE$', 'ES', 'EZ', |
2856
|
|
|
'AISSANCE$', 'ESANS', 'EZANZ', |
2857
|
|
|
'AISSE$', 'ES', 'EZ', |
2858
|
|
|
'AIX$', 'EX', 'EX', |
2859
|
|
|
'AJ(AÄEÈÉÊIOÖUÜ)--', 'A', 'A', |
2860
|
|
|
'AKTIE', 'AXIE', 'AXIE', |
2861
|
|
|
'AKTUEL', 'AKTUEL', None, |
2862
|
|
|
'ALOI^', 'ALOI', 'ALUI', # Don't merge these rules |
2863
|
|
|
'ALOY^', 'ALOI', 'ALUI', # needed by 'check_rules' |
2864
|
|
|
'AMATEU(RS)-', 'AMATÖ', 'ANATÖ', |
2865
|
|
|
'ANCH(OEI)-', 'ANSH', 'ANZ', |
2866
|
|
|
'ANDERGEGANG----', 'ANDA GE', 'ANTA KE', |
2867
|
|
|
'ANDERGEHE----', 'ANDA ', 'ANTA ', |
2868
|
|
|
'ANDERGESETZ----', 'ANDA GE', 'ANTA KE', |
2869
|
|
|
'ANDERGING----', 'ANDA ', 'ANTA ', |
2870
|
|
|
'ANDERSETZ(ET)-----', 'ANDA ', 'ANTA ', |
2871
|
|
|
'ANDERZUGEHE----', 'ANDA ZU ', 'ANTA ZU ', |
2872
|
|
|
'ANDERZUSETZE-----', 'ANDA ZU ', 'ANTA ZU ', |
2873
|
|
|
'ANER(BKO)---^^', 'AN', None, |
2874
|
|
|
'ANHAND---^$', 'AN H', 'AN ', |
2875
|
|
|
'ANH(AÄEIOÖUÜY)--^^', 'AN', None, |
2876
|
|
|
'ANIELLE$', 'ANIEL', 'ANIL', |
2877
|
|
|
'ANIEL', 'ANIEL', None, |
2878
|
|
|
'ANSTELLE----^$', 'AN ST', 'AN ZT', |
2879
|
|
|
'ANTI^^', 'ANTI', 'ANTI', |
2880
|
|
|
'ANVER^^', 'ANFA', 'ANFA', |
2881
|
|
|
'ATIA$', 'ATIA', 'ATIA', |
2882
|
|
|
'ATIA(NS)--', 'ATI', 'ATI', |
2883
|
|
|
'ATI(AÄOÖUÜ)-', 'AZI', 'AZI', |
2884
|
|
|
'AUAU--', '', '', |
2885
|
|
|
'AUERE$', 'AUERE', None, |
2886
|
|
|
'AUERE(NS)-$', 'AUERE', None, |
2887
|
|
|
'AUERE(AIOUY)--', 'AUER', None, |
2888
|
|
|
'AUER(AÄIOÖUÜY)-', 'AUER', None, |
2889
|
|
|
'AUER<', 'AUA', 'AUA', |
2890
|
|
|
'AUF^^', 'AUF', 'AUF', |
2891
|
|
|
'AULT$', 'O', 'U', |
2892
|
|
|
'AUR(BCDFGKLMNQSTVWZ)-', 'AUA', 'AUA', |
2893
|
|
|
'AUR$', 'AUA', 'AUA', |
2894
|
|
|
'AUSSE$', 'OS', 'UZ', |
2895
|
|
|
'AUS(ST)-^', 'AUS', 'AUS', |
2896
|
|
|
'AUS^^', 'AUS', 'AUS', |
2897
|
|
|
'AUTOFAHR----', 'AUTO ', 'AUTU ', |
2898
|
|
|
'AUTO^^', 'AUTO', 'AUTU', |
2899
|
|
|
'AUX(IY)-', 'AUX', 'AUX', |
2900
|
|
|
'AUX', 'O', 'U', |
2901
|
|
|
'AU', 'AU', 'AU', |
2902
|
|
|
'AVER--<', 'AW', None, |
2903
|
|
|
'AVIER$', 'AWIE', 'AFIE', |
2904
|
|
|
'AV(EÈÉÊI)-^', 'AW', None, |
2905
|
|
|
'AV(AOU)-', 'AW', None, |
2906
|
|
|
'AYRE$', 'EIRE', 'EIRE', |
2907
|
|
|
'AYRE(NS)-$', 'EIRE', 'EIRE', |
2908
|
|
|
'AYRE(AIOUY)--', 'EIR', 'EIR', |
2909
|
|
|
'AYR(AÄIOÖUÜY)-', 'EIR', 'EIR', |
2910
|
|
|
'AYR<', 'EIA', 'EIA', |
2911
|
|
|
'AYER--<', 'EI', 'EI', |
2912
|
|
|
'AY(AÄEIOÖUÜY)--', 'A', 'A', |
2913
|
|
|
'AË', 'E', 'E', |
2914
|
|
|
'A(IJY)<', 'EI', 'EI', |
2915
|
|
|
'BABY^$', 'BEBI', 'BEBI', |
2916
|
|
|
'BAB(IY)^', 'BEBI', 'BEBI', |
2917
|
|
|
'BEAU^$', 'BO', None, |
2918
|
|
|
'BEA(BCMNRU)-^', 'BEA', 'BEA', |
2919
|
|
|
'BEAT(AEIMORU)-^', 'BEAT', 'BEAT', |
2920
|
|
|
'BEE$', 'BI', 'BI', |
2921
|
|
|
'BEIGE^$', 'BESH', 'BEZ', |
2922
|
|
|
'BENOIT--', 'BENO', 'BENU', |
2923
|
|
|
'BER(DT)-', 'BER', None, |
2924
|
|
|
'BERN(DT)-', 'BERN', None, |
2925
|
|
|
'BE(LMNRST)-^', 'BE', 'BE', |
2926
|
|
|
'BETTE$', 'BET', 'BET', |
2927
|
|
|
'BEVOR^$', 'BEFOR', None, |
2928
|
|
|
'BIC$', 'BIZ', 'BIZ', |
2929
|
|
|
'BOWL(EI)-', 'BOL', 'BUL', |
2930
|
|
|
'BP(AÄEÈÉÊIÌÍÎOÖRUÜY)-', 'B', 'B', |
2931
|
|
|
'BRINGEND-----^', 'BRI', 'BRI', |
2932
|
|
|
'BRINGEND-----', ' BRI', ' BRI', |
2933
|
|
|
'BROW(NS)-', 'BRAU', 'BRAU', |
2934
|
|
|
'BUDGET7', 'BÜGE', 'BIKE', |
2935
|
|
|
'BUFFET7', 'BÜFE', 'BIFE', |
2936
|
|
|
'BYLLE$', 'BILE', 'BILE', |
2937
|
|
|
'BYLL$', 'BIL', 'BIL', |
2938
|
|
|
'BYPA--^', 'BEI', 'BEI', |
2939
|
|
|
'BYTE<', 'BEIT', 'BEIT', |
2940
|
|
|
'BY9^', 'BÜ', None, |
2941
|
|
|
'B(SßZ)$', 'BS', None, |
2942
|
|
|
'CACH(EI)-^', 'KESH', 'KEZ', |
2943
|
|
|
'CAE--', 'Z', 'Z', |
2944
|
|
|
'CA(IY)$', 'ZEI', 'ZEI', |
2945
|
|
|
'CE(EIJUY)--', 'Z', 'Z', |
2946
|
|
|
'CENT<', 'ZENT', 'ZENT', |
2947
|
|
|
'CERST(EI)----^', 'KE', 'KE', |
2948
|
|
|
'CER$', 'ZA', 'ZA', |
2949
|
|
|
'CE3', 'ZE', 'ZE', |
2950
|
|
|
'CH\'S$', 'X', 'X', |
2951
|
|
|
'CH´S$', 'X', 'X', |
2952
|
|
|
'CHAO(ST)-', 'KAO', 'KAU', |
2953
|
|
|
'CHAMPIO-^', 'SHEMPI', 'ZENBI', |
2954
|
|
|
'CHAR(AI)-^', 'KAR', 'KAR', |
2955
|
|
|
'CHAU(CDFSVWXZ)-', 'SHO', 'ZU', |
2956
|
|
|
'CHÄ(CF)-', 'SHE', 'ZE', |
2957
|
|
|
'CHE(CF)-', 'SHE', 'ZE', |
2958
|
|
|
'CHEM-^', 'KE', 'KE', # or: 'CHE', 'KE' |
2959
|
|
|
'CHEQUE<', 'SHEK', 'ZEK', |
2960
|
|
|
'CHI(CFGPVW)-', 'SHI', 'ZI', |
2961
|
|
|
'CH(AEUY)-<^', 'SH', 'Z', |
2962
|
|
|
'CHK-', '', '', |
2963
|
|
|
'CHO(CKPS)-^', 'SHO', 'ZU', |
2964
|
|
|
'CHRIS-', 'KRI', None, |
2965
|
|
|
'CHRO-', 'KR', None, |
2966
|
|
|
'CH(LOR)-<^', 'K', 'K', |
2967
|
|
|
'CHST-', 'X', 'X', |
2968
|
|
|
'CH(SßXZ)3', 'X', 'X', |
2969
|
|
|
'CHTNI-3', 'CHN', 'KN', |
2970
|
|
|
'CH^', 'K', 'K', # or: 'CH', 'K' |
2971
|
|
|
'CH', 'CH', 'K', |
2972
|
|
|
'CIC$', 'ZIZ', 'ZIZ', |
2973
|
|
|
'CIENCEFICT----', 'EIENS ', 'EIENZ ', |
2974
|
|
|
'CIENCE$', 'EIENS', 'EIENZ', |
2975
|
|
|
'CIER$', 'ZIE', 'ZIE', |
2976
|
|
|
'CYB-^', 'ZEI', 'ZEI', |
2977
|
|
|
'CY9^', 'ZÜ', 'ZI', |
2978
|
|
|
'C(IJY)-<3', 'Z', 'Z', |
2979
|
|
|
'CLOWN-', 'KLAU', 'KLAU', |
2980
|
|
|
'CCH', 'Z', 'Z', |
2981
|
|
|
'CCE-', 'X', 'X', |
2982
|
|
|
'C(CK)-', '', '', |
2983
|
|
|
'CLAUDET---', 'KLO', 'KLU', |
2984
|
|
|
'CLAUDINE^$', 'KLODIN', 'KLUTIN', |
2985
|
|
|
'COACH', 'KOSH', 'KUZ', |
2986
|
|
|
'COLE$', 'KOL', 'KUL', |
2987
|
|
|
'COUCH', 'KAUSH', 'KAUZ', |
2988
|
|
|
'COW', 'KAU', 'KAU', |
2989
|
|
|
'CQUES$', 'K', 'K', |
2990
|
|
|
'CQUE', 'K', 'K', |
2991
|
|
|
'CRASH--9', 'KRE', 'KRE', |
2992
|
|
|
'CREAT-^', 'KREA', 'KREA', |
2993
|
|
|
'CST', 'XT', 'XT', |
2994
|
|
|
'CS<^', 'Z', 'Z', |
2995
|
|
|
'C(SßX)', 'X', 'X', |
2996
|
|
|
'CT\'S$', 'X', 'X', |
2997
|
|
|
'CT(SßXZ)', 'X', 'X', |
2998
|
|
|
'CZ<', 'Z', 'Z', |
2999
|
|
|
'C(ÈÉÊÌÍÎÝ)3', 'Z', 'Z', |
3000
|
|
|
'C.^', 'C.', 'C.', |
3001
|
|
|
'CÄ-', 'Z', 'Z', |
3002
|
|
|
'CÜ$', 'ZÜ', 'ZI', |
3003
|
|
|
'C\'S$', 'X', 'X', |
3004
|
|
|
'C<', 'K', 'K', |
3005
|
|
|
'DAHER^$', 'DAHER', None, |
3006
|
|
|
'DARAUFFOLGE-----', 'DARAUF ', 'TARAUF ', |
3007
|
|
|
'DAVO(NR)-^$', 'DAFO', 'TAFU', |
3008
|
|
|
'DD(SZ)--<', '', '', |
3009
|
|
|
'DD9', 'D', None, |
3010
|
|
|
'DEPOT7', 'DEPO', 'TEBU', |
3011
|
|
|
'DESIGN', 'DISEIN', 'TIZEIN', |
3012
|
|
|
'DE(LMNRST)-3^', 'DE', 'TE', |
3013
|
|
|
'DETTE$', 'DET', 'TET', |
3014
|
|
|
'DH$', 'T', None, |
3015
|
|
|
'DIC$', 'DIZ', 'TIZ', |
3016
|
|
|
'DIDR-^', 'DIT', None, |
3017
|
|
|
'DIEDR-^', 'DIT', None, |
3018
|
|
|
'DJ(AEIOU)-^', 'I', 'I', |
3019
|
|
|
'DMITR-^', 'DIMIT', 'TINIT', |
3020
|
|
|
'DRY9^', 'DRÜ', None, |
3021
|
|
|
'DT-', '', '', |
3022
|
|
|
'DUIS-^', 'DÜ', 'TI', |
3023
|
|
|
'DURCH^^', 'DURCH', 'TURK', |
3024
|
|
|
'DVA$', 'TWA', None, |
3025
|
|
|
'DY9^', 'DÜ', None, |
3026
|
|
|
'DYS$', 'DIS', None, |
3027
|
|
|
'DS(CH)--<', 'T', 'T', |
3028
|
|
|
'DST', 'ZT', 'ZT', |
3029
|
|
|
'DZS(CH)--', 'T', 'T', |
3030
|
|
|
'D(SßZ)', 'Z', 'Z', |
3031
|
|
|
'D(AÄEIOÖRUÜY)-', 'D', None, |
3032
|
|
|
'D(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'D', None, |
3033
|
|
|
'D\'H^', 'D', 'T', |
3034
|
|
|
'D´H^', 'D', 'T', |
3035
|
|
|
'D`H^', 'D', 'T', |
3036
|
|
|
'D\'S3$', 'Z', 'Z', |
3037
|
|
|
'D´S3$', 'Z', 'Z', |
3038
|
|
|
'D^', 'D', None, |
3039
|
|
|
'D', 'T', 'T', |
3040
|
|
|
'EAULT$', 'O', 'U', |
3041
|
|
|
'EAUX$', 'O', 'U', |
3042
|
|
|
'EAU', 'O', 'U', |
3043
|
|
|
'EAV', 'IW', 'IF', |
3044
|
|
|
'EAS3$', 'EAS', None, |
3045
|
|
|
'EA(AÄEIOÖÜY)-3', 'EA', 'EA', |
3046
|
|
|
'EA3$', 'EA', 'EA', |
3047
|
|
|
'EA3', 'I', 'I', |
3048
|
|
|
'EBENSO^$', 'EBNSO', 'EBNZU', |
3049
|
|
|
'EBENSO^^', 'EBNSO ', 'EBNZU ', |
3050
|
|
|
'EBEN^^', 'EBN', 'EBN', |
3051
|
|
|
'EE9', 'E', 'E', |
3052
|
|
|
'EGL-1', 'EK', None, |
3053
|
|
|
'EHE(IUY)--1', 'EH', None, |
3054
|
|
|
'EHUNG---1', 'E', None, |
3055
|
|
|
'EH(AÄIOÖUÜY)-1', 'EH', None, |
3056
|
|
|
'EIEI--', '', '', |
3057
|
|
|
'EIERE^$', 'EIERE', None, |
3058
|
|
|
'EIERE$', 'EIERE', None, |
3059
|
|
|
'EIERE(NS)-$', 'EIERE', None, |
3060
|
|
|
'EIERE(AIOUY)--', 'EIER', None, |
3061
|
|
|
'EIER(AÄIOÖUÜY)-', 'EIER', None, |
3062
|
|
|
'EIER<', 'EIA', None, |
3063
|
|
|
'EIGL-1', 'EIK', None, |
3064
|
|
|
'EIGH$', 'EI', 'EI', |
3065
|
|
|
'EIH--', 'E', 'E', |
3066
|
|
|
'EILLE$', 'EI', 'EI', |
3067
|
|
|
'EIR(BCDFGKLMNQSTVWZ)-', 'EIA', 'EIA', |
3068
|
|
|
'EIR$', 'EIA', 'EIA', |
3069
|
|
|
'EITRAUBEN------', 'EIT ', 'EIT ', |
3070
|
|
|
'EI', 'EI', 'EI', |
3071
|
|
|
'EJ$', 'EI', 'EI', |
3072
|
|
|
'ELIZ^', 'ELIS', None, |
3073
|
|
|
'ELZ^', 'ELS', None, |
3074
|
|
|
'EL-^', 'E', 'E', |
3075
|
|
|
'ELANG----1', 'E', 'E', |
3076
|
|
|
'EL(DKL)--1', 'E', 'E', |
3077
|
|
|
'EL(MNT)--1$', 'E', 'E', |
3078
|
|
|
'ELYNE$', 'ELINE', 'ELINE', |
3079
|
|
|
'ELYN$', 'ELIN', 'ELIN', |
3080
|
|
|
'EL(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'EL', 'EL', |
3081
|
|
|
'EL-1', 'L', 'L', |
3082
|
|
|
'EM-^', None, 'E', |
3083
|
|
|
'EM(DFKMPQT)--1', None, 'E', |
3084
|
|
|
'EM(AÄEÈÉÊIÌÍÎOÖUÜY)--1', None, 'E', |
3085
|
|
|
'EM-1', None, 'N', |
3086
|
|
|
'ENGAG-^', 'ANGA', 'ANKA', |
3087
|
|
|
'EN-^', 'E', 'E', |
3088
|
|
|
'ENTUEL', 'ENTUEL', None, |
3089
|
|
|
'EN(CDGKQSTZ)--1', 'E', 'E', |
3090
|
|
|
'EN(AÄEÈÉÊIÌÍÎNOÖUÜY)-1', 'EN', 'EN', |
3091
|
|
|
'EN-1', '', '', |
3092
|
|
|
'ERH(AÄEIOÖUÜ)-^', 'ERH', 'ER', |
3093
|
|
|
'ER-^', 'E', 'E', |
3094
|
|
|
'ERREGEND-----', ' ER', ' ER', |
3095
|
|
|
'ERT1$', 'AT', None, |
3096
|
|
|
'ER(DGLKMNRQTZß)-1', 'ER', None, |
3097
|
|
|
'ER(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'ER', 'A', |
3098
|
|
|
'ER1$', 'A', 'A', |
3099
|
|
|
'ER<1', 'A', 'A', |
3100
|
|
|
'ETAT7', 'ETA', 'ETA', |
3101
|
|
|
'ETI(AÄOÖÜU)-', 'EZI', 'EZI', |
3102
|
|
|
'EUERE$', 'EUERE', None, |
3103
|
|
|
'EUERE(NS)-$', 'EUERE', None, |
3104
|
|
|
'EUERE(AIOUY)--', 'EUER', None, |
3105
|
|
|
'EUER(AÄIOÖUÜY)-', 'EUER', None, |
3106
|
|
|
'EUER<', 'EUA', None, |
3107
|
|
|
'EUEU--', '', '', |
3108
|
|
|
'EUILLE$', 'Ö', 'Ö', |
3109
|
|
|
'EUR$', 'ÖR', 'ÖR', |
3110
|
|
|
'EUX', 'Ö', 'Ö', |
3111
|
|
|
'EUSZ$', 'EUS', None, |
3112
|
|
|
'EUTZ$', 'EUS', None, |
3113
|
|
|
'EUYS$', 'EUS', 'EUZ', |
3114
|
|
|
'EUZ$', 'EUS', None, |
3115
|
|
|
'EU', 'EU', 'EU', |
3116
|
|
|
'EVER--<1', 'EW', None, |
3117
|
|
|
'EV(ÄOÖUÜ)-1', 'EW', None, |
3118
|
|
|
'EYER<', 'EIA', 'EIA', |
3119
|
|
|
'EY<', 'EI', 'EI', |
3120
|
|
|
'FACETTE', 'FASET', 'FAZET', |
3121
|
|
|
'FANS--^$', 'FE', 'FE', |
3122
|
|
|
'FAN-^$', 'FE', 'FE', |
3123
|
|
|
'FAULT-', 'FOL', 'FUL', |
3124
|
|
|
'FEE(DL)-', 'FI', 'FI', |
3125
|
|
|
'FEHLER', 'FELA', 'FELA', |
3126
|
|
|
'FE(LMNRST)-3^', 'FE', 'FE', |
3127
|
|
|
'FOERDERN---^', 'FÖRD', 'FÖRT', |
3128
|
|
|
'FOERDERN---', ' FÖRD', ' FÖRT', |
3129
|
|
|
'FOND7', 'FON', 'FUN', |
3130
|
|
|
'FRAIN$', 'FRA', 'FRA', |
3131
|
|
|
'FRISEU(RS)-', 'FRISÖ', 'FRIZÖ', |
3132
|
|
|
'FY9^', 'FÜ', None, |
3133
|
|
|
'FÖRDERN---^', 'FÖRD', 'FÖRT', |
3134
|
|
|
'FÖRDERN---', ' FÖRD', ' FÖRT', |
3135
|
|
|
'GAGS^$', 'GEX', 'KEX', |
3136
|
|
|
'GAG^$', 'GEK', 'KEK', |
3137
|
|
|
'GD', 'KT', 'KT', |
3138
|
|
|
'GEGEN^^', 'GEGN', 'KEKN', |
3139
|
|
|
'GEGENGEKOM-----', 'GEGN ', 'KEKN ', |
3140
|
|
|
'GEGENGESET-----', 'GEGN ', 'KEKN ', |
3141
|
|
|
'GEGENKOMME-----', 'GEGN ', 'KEKN ', |
3142
|
|
|
'GEGENZUKOM---', 'GEGN ZU ', 'KEKN ZU ', |
3143
|
|
|
'GENDETWAS-----$', 'GENT ', 'KENT ', |
3144
|
|
|
'GENRE', 'IORE', 'IURE', |
3145
|
|
|
'GE(LMNRST)-3^', 'GE', 'KE', |
3146
|
|
|
'GER(DKT)-', 'GER', None, |
3147
|
|
|
'GETTE$', 'GET', 'KET', |
3148
|
|
|
'GGF.', 'GF.', None, |
3149
|
|
|
'GG-', '', '', |
3150
|
|
|
'GH', 'G', None, |
3151
|
|
|
'GI(AOU)-^', 'I', 'I', |
3152
|
|
|
'GION-3', 'KIO', 'KIU', |
3153
|
|
|
'G(CK)-', '', '', |
3154
|
|
|
'GJ(AEIOU)-^', 'I', 'I', |
3155
|
|
|
'GMBH^$', 'GMBH', 'GMBH', |
3156
|
|
|
'GNAC$', 'NIAK', 'NIAK', |
3157
|
|
|
'GNON$', 'NION', 'NIUN', |
3158
|
|
|
'GN$', 'N', 'N', |
3159
|
|
|
'GONCAL-^', 'GONZA', 'KUNZA', |
3160
|
|
|
'GRY9^', 'GRÜ', None, |
3161
|
|
|
'G(SßXZ)-<', 'K', 'K', |
3162
|
|
|
'GUCK-', 'KU', 'KU', |
3163
|
|
|
'GUISEP-^', 'IUSE', 'IUZE', |
3164
|
|
|
'GUI-^', 'G', 'K', |
3165
|
|
|
'GUTAUSSEH------^', 'GUT ', 'KUT ', |
3166
|
|
|
'GUTGEHEND------^', 'GUT ', 'KUT ', |
3167
|
|
|
'GY9^', 'GÜ', None, |
3168
|
|
|
'G(AÄEILOÖRUÜY)-', 'G', None, |
3169
|
|
|
'G(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'G', None, |
3170
|
|
|
'G\'S$', 'X', 'X', |
3171
|
|
|
'G´S$', 'X', 'X', |
3172
|
|
|
'G^', 'G', None, |
3173
|
|
|
'G', 'K', 'K', |
3174
|
|
|
'HA(HIUY)--1', 'H', None, |
3175
|
|
|
'HANDVOL---^', 'HANT ', 'ANT ', |
3176
|
|
|
'HANNOVE-^', 'HANOF', None, |
3177
|
|
|
'HAVEN7$', 'HAFN', None, |
3178
|
|
|
'HEAD-', 'HE', 'E', |
3179
|
|
|
'HELIEGEN------', 'E ', 'E ', |
3180
|
|
|
'HESTEHEN------', 'E ', 'E ', |
3181
|
|
|
'HE(LMNRST)-3^', 'HE', 'E', |
3182
|
|
|
'HE(LMN)-1', 'E', 'E', |
3183
|
|
|
'HEUR1$', 'ÖR', 'ÖR', |
3184
|
|
|
'HE(HIUY)--1', 'H', None, |
3185
|
|
|
'HIH(AÄEIOÖUÜY)-1', 'IH', None, |
3186
|
|
|
'HLH(AÄEIOÖUÜY)-1', 'LH', None, |
3187
|
|
|
'HMH(AÄEIOÖUÜY)-1', 'MH', None, |
3188
|
|
|
'HNH(AÄEIOÖUÜY)-1', 'NH', None, |
3189
|
|
|
'HOBBY9^', 'HOBI', None, |
3190
|
|
|
'HOCHBEGAB-----^', 'HOCH ', 'UK ', |
3191
|
|
|
'HOCHTALEN-----^', 'HOCH ', 'UK ', |
3192
|
|
|
'HOCHZUFRI-----^', 'HOCH ', 'UK ', |
3193
|
|
|
'HO(HIY)--1', 'H', None, |
3194
|
|
|
'HRH(AÄEIOÖUÜY)-1', 'RH', None, |
3195
|
|
|
'HUH(AÄEIOÖUÜY)-1', 'UH', None, |
3196
|
|
|
'HUIS^^', 'HÜS', 'IZ', |
3197
|
|
|
'HUIS$', 'ÜS', 'IZ', |
3198
|
|
|
'HUI--1', 'H', None, |
3199
|
|
|
'HYGIEN^', 'HÜKIEN', None, |
3200
|
|
|
'HY9^', 'HÜ', None, |
3201
|
|
|
'HY(BDGMNPST)-', 'Ü', None, |
3202
|
|
|
'H.^', None, 'H.', |
3203
|
|
|
'HÄU--1', 'H', None, |
3204
|
|
|
'H^', 'H', '', |
3205
|
|
|
'H', '', '', |
3206
|
|
|
'ICHELL---', 'ISH', 'IZ', |
3207
|
|
|
'ICHI$', 'ISHI', 'IZI', |
3208
|
|
|
'IEC$', 'IZ', 'IZ', |
3209
|
|
|
'IEDENSTELLE------', 'IDN ', 'ITN ', |
3210
|
|
|
'IEI-3', '', '', |
3211
|
|
|
'IELL3', 'IEL', 'IEL', |
3212
|
|
|
'IENNE$', 'IN', 'IN', |
3213
|
|
|
'IERRE$', 'IER', 'IER', |
3214
|
|
|
'IERZULAN---', 'IR ZU ', 'IR ZU ', |
3215
|
|
|
'IETTE$', 'IT', 'IT', |
3216
|
|
|
'IEU', 'IÖ', 'IÖ', |
3217
|
|
|
'IE<4', 'I', 'I', |
3218
|
|
|
'IGL-1', 'IK', None, |
3219
|
|
|
'IGHT3$', 'EIT', 'EIT', |
3220
|
|
|
'IGNI(EO)-', 'INI', 'INI', |
3221
|
|
|
'IGN(AEOU)-$', 'INI', 'INI', |
3222
|
|
|
'IHER(DGLKRT)--1', 'IHE', None, |
3223
|
|
|
'IHE(IUY)--', 'IH', None, |
3224
|
|
|
'IH(AIOÖUÜY)-', 'IH', None, |
3225
|
|
|
'IJ(AOU)-', 'I', 'I', |
3226
|
|
|
'IJ$', 'I', 'I', |
3227
|
|
|
'IJ<', 'EI', 'EI', |
3228
|
|
|
'IKOLE$', 'IKOL', 'IKUL', |
3229
|
|
|
'ILLAN(STZ)--4', 'ILIA', 'ILIA', |
3230
|
|
|
'ILLAR(DT)--4', 'ILIA', 'ILIA', |
3231
|
|
|
'IMSTAN----^', 'IM ', 'IN ', |
3232
|
|
|
'INDELERREGE------', 'INDL ', 'INTL ', |
3233
|
|
|
'INFRAGE-----^$', 'IN ', 'IN ', |
3234
|
|
|
'INTERN(AOU)-^', 'INTAN', 'INTAN', |
3235
|
|
|
'INVER-', 'INWE', 'INFE', |
3236
|
|
|
'ITI(AÄIOÖUÜ)-', 'IZI', 'IZI', |
3237
|
|
|
'IUSZ$', 'IUS', None, |
3238
|
|
|
'IUTZ$', 'IUS', None, |
3239
|
|
|
'IUZ$', 'IUS', None, |
3240
|
|
|
'IVER--<', 'IW', None, |
3241
|
|
|
'IVIER$', 'IWIE', 'IFIE', |
3242
|
|
|
'IV(ÄOÖUÜ)-', 'IW', None, |
3243
|
|
|
'IV<3', 'IW', None, |
3244
|
|
|
'IY2', 'I', None, |
3245
|
|
|
'I(ÈÉÊ)<4', 'I', 'I', |
3246
|
|
|
'JAVIE---<^', 'ZA', 'ZA', |
3247
|
|
|
'JEANS^$', 'JINS', 'INZ', |
3248
|
|
|
'JEANNE^$', 'IAN', 'IAN', |
3249
|
|
|
'JEAN-^', 'IA', 'IA', |
3250
|
|
|
'JER-^', 'IE', 'IE', |
3251
|
|
|
'JE(LMNST)-', 'IE', 'IE', |
3252
|
|
|
'JI^', 'JI', None, |
3253
|
|
|
'JOR(GK)^$', 'IÖRK', 'IÖRK', |
3254
|
|
|
'J', 'I', 'I', |
3255
|
|
|
'KC(ÄEIJ)-', 'X', 'X', |
3256
|
|
|
'KD', 'KT', None, |
3257
|
|
|
'KE(LMNRST)-3^', 'KE', 'KE', |
3258
|
|
|
'KG(AÄEILOÖRUÜY)-', 'K', None, |
3259
|
|
|
'KH<^', 'K', 'K', |
3260
|
|
|
'KIC$', 'KIZ', 'KIZ', |
3261
|
|
|
'KLE(LMNRST)-3^', 'KLE', 'KLE', |
3262
|
|
|
'KOTELE-^', 'KOTL', 'KUTL', |
3263
|
|
|
'KREAT-^', 'KREA', 'KREA', |
3264
|
|
|
'KRÜS(TZ)--^', 'KRI', None, |
3265
|
|
|
'KRYS(TZ)--^', 'KRI', None, |
3266
|
|
|
'KRY9^', 'KRÜ', None, |
3267
|
|
|
'KSCH---', 'K', 'K', |
3268
|
|
|
'KSH--', 'K', 'K', |
3269
|
|
|
'K(SßXZ)7', 'X', 'X', # implies 'KST' -> 'XT' |
3270
|
|
|
'KT\'S$', 'X', 'X', |
3271
|
|
|
'KTI(AIOU)-3', 'XI', 'XI', |
3272
|
|
|
'KT(SßXZ)', 'X', 'X', |
3273
|
|
|
'KY9^', 'KÜ', None, |
3274
|
|
|
'K\'S$', 'X', 'X', |
3275
|
|
|
'K´S$', 'X', 'X', |
3276
|
|
|
'LANGES$', ' LANGES', ' LANKEZ', |
3277
|
|
|
'LANGE$', ' LANGE', ' LANKE', |
3278
|
|
|
'LANG$', ' LANK', ' LANK', |
3279
|
|
|
'LARVE-', 'LARF', 'LARF', |
3280
|
|
|
'LD(SßZ)$', 'LS', 'LZ', |
3281
|
|
|
'LD\'S$', 'LS', 'LZ', |
3282
|
|
|
'LD´S$', 'LS', 'LZ', |
3283
|
|
|
'LEAND-^', 'LEAN', 'LEAN', |
3284
|
|
|
'LEERSTEHE-----^', 'LER ', 'LER ', |
3285
|
|
|
'LEICHBLEIB-----', 'LEICH ', 'LEIK ', |
3286
|
|
|
'LEICHLAUTE-----', 'LEICH ', 'LEIK ', |
3287
|
|
|
'LEIDERREGE------', 'LEIT ', 'LEIT ', |
3288
|
|
|
'LEIDGEPR----^', 'LEIT ', 'LEIT ', |
3289
|
|
|
'LEINSTEHE-----', 'LEIN ', 'LEIN ', |
3290
|
|
|
'LEL-', 'LE', 'LE', |
3291
|
|
|
'LE(MNRST)-3^', 'LE', 'LE', |
3292
|
|
|
'LETTE$', 'LET', 'LET', |
3293
|
|
|
'LFGNAG-', 'LFGAN', 'LFKAN', |
3294
|
|
|
'LICHERWEIS----', 'LICHA ', 'LIKA ', |
3295
|
|
|
'LIC$', 'LIZ', 'LIZ', |
3296
|
|
|
'LIVE^$', 'LEIF', 'LEIF', |
3297
|
|
|
'LT(SßZ)$', 'LS', 'LZ', |
3298
|
|
|
'LT\'S$', 'LS', 'LZ', |
3299
|
|
|
'LT´S$', 'LS', 'LZ', |
3300
|
|
|
'LUI(GS)--', 'LU', 'LU', |
3301
|
|
|
'LV(AIO)-', 'LW', None, |
3302
|
|
|
'LY9^', 'LÜ', None, |
3303
|
|
|
'LSTS$', 'LS', 'LZ', |
3304
|
|
|
'LZ(BDFGKLMNPQRSTVWX)-', 'LS', None, |
3305
|
|
|
'L(SßZ)$', 'LS', None, |
3306
|
|
|
'MAIR-<', 'MEI', 'NEI', |
3307
|
|
|
'MANAG-', 'MENE', 'NENE', |
3308
|
|
|
'MANUEL', 'MANUEL', None, |
3309
|
|
|
'MASSEU(RS)-', 'MASÖ', 'NAZÖ', |
3310
|
|
|
'MATCH', 'MESH', 'NEZ', |
3311
|
|
|
'MAURICE', 'MORIS', 'NURIZ', |
3312
|
|
|
'MBH^$', 'MBH', 'MBH', |
3313
|
|
|
'MB(ßZ)$', 'MS', None, |
3314
|
|
|
'MB(SßTZ)-', 'M', 'N', |
3315
|
|
|
'MCG9^', 'MAK', 'NAK', |
3316
|
|
|
'MC9^', 'MAK', 'NAK', |
3317
|
|
|
'MEMOIR-^', 'MEMOA', 'NENUA', |
3318
|
|
|
'MERHAVEN$', 'MAHAFN', None, |
3319
|
|
|
'ME(LMNRST)-3^', 'ME', 'NE', |
3320
|
|
|
'MEN(STZ)--3', 'ME', None, |
3321
|
|
|
'MEN$', 'MEN', None, |
3322
|
|
|
'MIGUEL-', 'MIGE', 'NIKE', |
3323
|
|
|
'MIKE^$', 'MEIK', 'NEIK', |
3324
|
|
|
'MITHILFE----^$', 'MIT H', 'NIT ', |
3325
|
|
|
'MN$', 'M', None, |
3326
|
|
|
'MN', 'N', 'N', |
3327
|
|
|
'MPJUTE-', 'MPUT', 'NBUT', |
3328
|
|
|
'MP(ßZ)$', 'MS', None, |
3329
|
|
|
'MP(SßTZ)-', 'M', 'N', |
3330
|
|
|
'MP(BDJLMNPQVW)-', 'MB', 'NB', |
3331
|
|
|
'MY9^', 'MÜ', None, |
3332
|
|
|
'M(ßZ)$', 'MS', None, |
3333
|
|
|
'M´G7^', 'MAK', 'NAK', |
3334
|
|
|
'M\'G7^', 'MAK', 'NAK', |
3335
|
|
|
'M´^', 'MAK', 'NAK', |
3336
|
|
|
'M\'^', 'MAK', 'NAK', |
3337
|
|
|
'M', None, 'N', |
3338
|
|
|
'NACH^^', 'NACH', 'NAK', |
3339
|
|
|
'NADINE', 'NADIN', 'NATIN', |
3340
|
|
|
'NAIV--', 'NA', 'NA', |
3341
|
|
|
'NAISE$', 'NESE', 'NEZE', |
3342
|
|
|
'NAUGENOMM------', 'NAU ', 'NAU ', |
3343
|
|
|
'NAUSOGUT$', 'NAUSO GUT', 'NAUZU KUT', |
3344
|
|
|
'NCH$', 'NSH', 'NZ', |
3345
|
|
|
'NCOISE$', 'SOA', 'ZUA', |
3346
|
|
|
'NCOIS$', 'SOA', 'ZUA', |
3347
|
|
|
'NDAR$', 'NDA', 'NTA', |
3348
|
|
|
'NDERINGEN------', 'NDE ', 'NTE ', |
3349
|
|
|
'NDRO(CDKTZ)-', 'NTRO', None, |
3350
|
|
|
'ND(BFGJLMNPQVW)-', 'NT', None, |
3351
|
|
|
'ND(SßZ)$', 'NS', 'NZ', |
3352
|
|
|
'ND\'S$', 'NS', 'NZ', |
3353
|
|
|
'ND´S$', 'NS', 'NZ', |
3354
|
|
|
'NEBEN^^', 'NEBN', 'NEBN', |
3355
|
|
|
'NENGELERN------', 'NEN ', 'NEN ', |
3356
|
|
|
'NENLERN(ET)---', 'NEN LE', 'NEN LE', |
3357
|
|
|
'NENZULERNE---', 'NEN ZU LE', 'NEN ZU LE', |
3358
|
|
|
'NE(LMNRST)-3^', 'NE', 'NE', |
3359
|
|
|
'NEN-3', 'NE', 'NE', |
3360
|
|
|
'NETTE$', 'NET', 'NET', |
3361
|
|
|
'NGU^^', 'NU', 'NU', |
3362
|
|
|
'NG(BDFJLMNPQRTVW)-', 'NK', 'NK', |
3363
|
|
|
'NH(AUO)-$', 'NI', 'NI', |
3364
|
|
|
'NICHTSAHNEN-----', 'NIX ', 'NIX ', |
3365
|
|
|
'NICHTSSAGE----', 'NIX ', 'NIX ', |
3366
|
|
|
'NICHTS^^', 'NIX', 'NIX', |
3367
|
|
|
'NICHT^^', 'NICHT', 'NIKT', |
3368
|
|
|
'NINE$', 'NIN', 'NIN', |
3369
|
|
|
'NON^^', 'NON', 'NUN', |
3370
|
|
|
'NOTLEIDE-----^', 'NOT ', 'NUT ', |
3371
|
|
|
'NOT^^', 'NOT', 'NUT', |
3372
|
|
|
'NTI(AIOU)-3', 'NZI', 'NZI', |
3373
|
|
|
'NTIEL--3', 'NZI', 'NZI', |
3374
|
|
|
'NT(SßZ)$', 'NS', 'NZ', |
3375
|
|
|
'NT\'S$', 'NS', 'NZ', |
3376
|
|
|
'NT´S$', 'NS', 'NZ', |
3377
|
|
|
'NYLON', 'NEILON', 'NEILUN', |
3378
|
|
|
'NY9^', 'NÜ', None, |
3379
|
|
|
'NSTZUNEH---', 'NST ZU ', 'NZT ZU ', |
3380
|
|
|
'NSZ-', 'NS', None, |
3381
|
|
|
'NSTS$', 'NS', 'NZ', |
3382
|
|
|
'NZ(BDFGKLMNPQRSTVWX)-', 'NS', None, |
3383
|
|
|
'N(SßZ)$', 'NS', None, |
3384
|
|
|
'OBERE-', 'OBER', None, |
3385
|
|
|
'OBER^^', 'OBA', 'UBA', |
3386
|
|
|
'OEU2', 'Ö', 'Ö', |
3387
|
|
|
'OE<2', 'Ö', 'Ö', |
3388
|
|
|
'OGL-', 'OK', None, |
3389
|
|
|
'OGNIE-', 'ONI', 'UNI', |
3390
|
|
|
'OGN(AEOU)-$', 'ONI', 'UNI', |
3391
|
|
|
'OH(AIOÖUÜY)-', 'OH', None, |
3392
|
|
|
'OIE$', 'Ö', 'Ö', |
3393
|
|
|
'OIRE$', 'OA', 'UA', |
3394
|
|
|
'OIR$', 'OA', 'UA', |
3395
|
|
|
'OIX', 'OA', 'UA', |
3396
|
|
|
'OI<3', 'EU', 'EU', |
3397
|
|
|
'OKAY^$', 'OKE', 'UKE', |
3398
|
|
|
'OLYN$', 'OLIN', 'ULIN', |
3399
|
|
|
'OO(DLMZ)-', 'U', None, |
3400
|
|
|
'OO$', 'U', None, |
3401
|
|
|
'OO-', '', '', |
3402
|
|
|
'ORGINAL-----', 'ORI', 'URI', |
3403
|
|
|
'OTI(AÄOÖUÜ)-', 'OZI', 'UZI', |
3404
|
|
|
'OUI^', 'WI', 'FI', |
3405
|
|
|
'OUILLE$', 'ULIE', 'ULIE', |
3406
|
|
|
'OU(DT)-^', 'AU', 'AU', |
3407
|
|
|
'OUSE$', 'AUS', 'AUZ', |
3408
|
|
|
'OUT-', 'AU', 'AU', |
3409
|
|
|
'OU', 'U', 'U', |
3410
|
|
|
'O(FV)$', 'AU', 'AU', # due to 'OW$' -> 'AU' |
3411
|
|
|
'OVER--<', 'OW', None, |
3412
|
|
|
'OV(AOU)-', 'OW', None, |
3413
|
|
|
'OW$', 'AU', 'AU', |
3414
|
|
|
'OWS$', 'OS', 'UZ', |
3415
|
|
|
'OJ(AÄEIOÖUÜ)--', 'O', 'U', |
3416
|
|
|
'OYER', 'OIA', None, |
3417
|
|
|
'OY(AÄEIOÖUÜ)--', 'O', 'U', |
3418
|
|
|
'O(JY)<', 'EU', 'EU', |
3419
|
|
|
'OZ$', 'OS', None, |
3420
|
|
|
'O´^', 'O', 'U', |
3421
|
|
|
'O\'^', 'O', 'U', |
3422
|
|
|
'O', None, 'U', |
3423
|
|
|
'PATIEN--^', 'PAZI', 'PAZI', |
3424
|
|
|
'PENSIO-^', 'PANSI', 'PANZI', |
3425
|
|
|
'PE(LMNRST)-3^', 'PE', 'PE', |
3426
|
|
|
'PFER-^', 'FE', 'FE', |
3427
|
|
|
'P(FH)<', 'F', 'F', |
3428
|
|
|
'PIC^$', 'PIK', 'PIK', |
3429
|
|
|
'PIC$', 'PIZ', 'PIZ', |
3430
|
|
|
'PIPELINE', 'PEIBLEIN', 'PEIBLEIN', |
3431
|
|
|
'POLYP-', 'POLÜ', None, |
3432
|
|
|
'POLY^^', 'POLI', 'PULI', |
3433
|
|
|
'PORTRAIT7', 'PORTRE', 'PURTRE', |
3434
|
|
|
'POWER7', 'PAUA', 'PAUA', |
3435
|
|
|
'PP(FH)--<', 'B', 'B', |
3436
|
|
|
'PP-', '', '', |
3437
|
|
|
'PRODUZ-^', 'PRODU', 'BRUTU', |
3438
|
|
|
'PRODUZI--', ' PRODU', ' BRUTU', |
3439
|
|
|
'PRIX^$', 'PRI', 'PRI', |
3440
|
|
|
'PS-^^', 'P', None, |
3441
|
|
|
'P(SßZ)^', None, 'Z', |
3442
|
|
|
'P(SßZ)$', 'BS', None, |
3443
|
|
|
'PT-^', '', '', |
3444
|
|
|
'PTI(AÄOÖUÜ)-3', 'BZI', 'BZI', |
3445
|
|
|
'PY9^', 'PÜ', None, |
3446
|
|
|
'P(AÄEIOÖRUÜY)-', 'P', 'P', |
3447
|
|
|
'P(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'P', None, |
3448
|
|
|
'P.^', None, 'P.', |
3449
|
|
|
'P^', 'P', None, |
3450
|
|
|
'P', 'B', 'B', |
3451
|
|
|
'QI-', 'Z', 'Z', |
3452
|
|
|
'QUARANT--', 'KARA', 'KARA', |
3453
|
|
|
'QUE(LMNRST)-3', 'KWE', 'KFE', |
3454
|
|
|
'QUE$', 'K', 'K', |
3455
|
|
|
'QUI(NS)$', 'KI', 'KI', |
3456
|
|
|
'QUIZ7', 'KWIS', None, |
3457
|
|
|
'Q(UV)7', 'KW', 'KF', |
3458
|
|
|
'Q<', 'K', 'K', |
3459
|
|
|
'RADFAHR----', 'RAT ', 'RAT ', |
3460
|
|
|
'RAEFTEZEHRE-----', 'REFTE ', 'REFTE ', |
3461
|
|
|
'RCH', 'RCH', 'RK', |
3462
|
|
|
'REA(DU)---3^', 'R', None, |
3463
|
|
|
'REBSERZEUG------', 'REBS ', 'REBZ ', |
3464
|
|
|
'RECHERCH^', 'RESHASH', 'REZAZ', |
3465
|
|
|
'RECYCL--', 'RIZEI', 'RIZEI', |
3466
|
|
|
'RE(ALST)-3^', 'RE', None, |
3467
|
|
|
'REE$', 'RI', 'RI', |
3468
|
|
|
'RER$', 'RA', 'RA', |
3469
|
|
|
'RE(MNR)-4', 'RE', 'RE', |
3470
|
|
|
'RETTE$', 'RET', 'RET', |
3471
|
|
|
'REUZ$', 'REUZ', None, |
3472
|
|
|
'REW$', 'RU', 'RU', |
3473
|
|
|
'RH<^', 'R', 'R', |
3474
|
|
|
'RJA(MN)--', 'RI', 'RI', |
3475
|
|
|
'ROWD-^', 'RAU', 'RAU', |
3476
|
|
|
'RTEMONNAIE-', 'RTMON', 'RTNUN', |
3477
|
|
|
'RTI(AÄOÖUÜ)-3', 'RZI', 'RZI', |
3478
|
|
|
'RTIEL--3', 'RZI', 'RZI', |
3479
|
|
|
'RV(AEOU)-3', 'RW', None, |
3480
|
|
|
'RY(KN)-$', 'RI', 'RI', |
3481
|
|
|
'RY9^', 'RÜ', None, |
3482
|
|
|
'RÄFTEZEHRE-----', 'REFTE ', 'REFTE ', |
3483
|
|
|
'SAISO-^', 'SES', 'ZEZ', |
3484
|
|
|
'SAFE^$', 'SEIF', 'ZEIF', |
3485
|
|
|
'SAUCE-^', 'SOS', 'ZUZ', |
3486
|
|
|
'SCHLAGGEBEN-----<', 'SHLAK ', 'ZLAK ', |
3487
|
|
|
'SCHSCH---7', '', '', |
3488
|
|
|
'SCHTSCH', 'SH', 'Z', |
3489
|
|
|
'SC(HZ)<', 'SH', 'Z', |
3490
|
|
|
'SC', 'SK', 'ZK', |
3491
|
|
|
'SELBSTST--7^^', 'SELB', 'ZELB', |
3492
|
|
|
'SELBST7^^', 'SELBST', 'ZELBZT', |
3493
|
|
|
'SERVICE7^', 'SÖRWIS', 'ZÖRFIZ', |
3494
|
|
|
'SERVI-^', 'SERW', None, |
3495
|
|
|
'SE(LMNRST)-3^', 'SE', 'ZE', |
3496
|
|
|
'SETTE$', 'SET', 'ZET', |
3497
|
|
|
'SHP-^', 'S', 'Z', |
3498
|
|
|
'SHST', 'SHT', 'ZT', |
3499
|
|
|
'SHTSH', 'SH', 'Z', |
3500
|
|
|
'SHT', 'ST', 'Z', |
3501
|
|
|
'SHY9^', 'SHÜ', None, |
3502
|
|
|
'SH^^', 'SH', None, |
3503
|
|
|
'SH3', 'SH', 'Z', |
3504
|
|
|
'SICHERGEGAN-----^', 'SICHA ', 'ZIKA ', |
3505
|
|
|
'SICHERGEHE----^', 'SICHA ', 'ZIKA ', |
3506
|
|
|
'SICHERGESTEL------^', 'SICHA ', 'ZIKA ', |
3507
|
|
|
'SICHERSTELL-----^', 'SICHA ', 'ZIKA ', |
3508
|
|
|
'SICHERZU(GS)--^', 'SICHA ZU ', 'ZIKA ZU ', |
3509
|
|
|
'SIEGLI-^', 'SIKL', 'ZIKL', |
3510
|
|
|
'SIGLI-^', 'SIKL', 'ZIKL', |
3511
|
|
|
'SIGHT', 'SEIT', 'ZEIT', |
3512
|
|
|
'SIGN', 'SEIN', 'ZEIN', |
3513
|
|
|
'SKI(NPZ)-', 'SKI', 'ZKI', |
3514
|
|
|
'SKI<^', 'SHI', 'ZI', |
3515
|
|
|
'SODASS^$', 'SO DAS', 'ZU TAZ', |
3516
|
|
|
'SODAß^$', 'SO DAS', 'ZU TAZ', |
3517
|
|
|
'SOGENAN--^', 'SO GEN', 'ZU KEN', |
3518
|
|
|
'SOUND-', 'SAUN', 'ZAUN', |
3519
|
|
|
'STAATS^^', 'STAZ', 'ZTAZ', |
3520
|
|
|
'STADT^^', 'STAT', 'ZTAT', |
3521
|
|
|
'STANDE$', ' STANDE', ' ZTANTE', |
3522
|
|
|
'START^^', 'START', 'ZTART', |
3523
|
|
|
'STAURANT7', 'STORAN', 'ZTURAN', |
3524
|
|
|
'STEAK-', 'STE', 'ZTE', |
3525
|
|
|
'STEPHEN-^$', 'STEW', None, |
3526
|
|
|
'STERN', 'STERN', None, |
3527
|
|
|
'STRAF^^', 'STRAF', 'ZTRAF', |
3528
|
|
|
'ST\'S$', 'Z', 'Z', |
3529
|
|
|
'ST´S$', 'Z', 'Z', |
3530
|
|
|
'STST--', '', '', |
3531
|
|
|
'STS(ACEÈÉÊHIÌÍÎOUÄÜÖ)--', 'ST', 'ZT', |
3532
|
|
|
'ST(SZ)', 'Z', 'Z', |
3533
|
|
|
'SPAREN---^', 'SPA', 'ZPA', |
3534
|
|
|
'SPAREND----', ' SPA', ' ZPA', |
3535
|
|
|
'S(PTW)-^^', 'S', None, |
3536
|
|
|
'SP', 'SP', None, |
3537
|
|
|
'STYN(AE)-$', 'STIN', 'ZTIN', |
3538
|
|
|
'ST', 'ST', 'ZT', |
3539
|
|
|
'SUITE<', 'SIUT', 'ZIUT', |
3540
|
|
|
'SUKE--$', 'S', 'Z', |
3541
|
|
|
'SURF(EI)-', 'SÖRF', 'ZÖRF', |
3542
|
|
|
'SV(AEÈÉÊIÌÍÎOU)-<^', 'SW', None, |
3543
|
|
|
'SYB(IY)--^', 'SIB', None, |
3544
|
|
|
'SYL(KVW)--^', 'SI', None, |
3545
|
|
|
'SY9^', 'SÜ', None, |
3546
|
|
|
'SZE(NPT)-^', 'ZE', 'ZE', |
3547
|
|
|
'SZI(ELN)-^', 'ZI', 'ZI', |
3548
|
|
|
'SZCZ<', 'SH', 'Z', |
3549
|
|
|
'SZT<', 'ST', 'ZT', |
3550
|
|
|
'SZ<3', 'SH', 'Z', |
3551
|
|
|
'SÜL(KVW)--^', 'SI', None, |
3552
|
|
|
'S', None, 'Z', |
3553
|
|
|
'TCH', 'SH', 'Z', |
3554
|
|
|
'TD(AÄEIOÖRUÜY)-', 'T', None, |
3555
|
|
|
'TD(ÀÁÂÃÅÈÉÊËÌÍÎÏÒÓÔÕØÙÚÛÝŸ)-', 'T', None, |
3556
|
|
|
'TEAT-^', 'TEA', 'TEA', |
3557
|
|
|
'TERRAI7^', 'TERA', 'TERA', |
3558
|
|
|
'TE(LMNRST)-3^', 'TE', 'TE', |
3559
|
|
|
'TH<', 'T', 'T', |
3560
|
|
|
'TICHT-', 'TIK', 'TIK', |
3561
|
|
|
'TICH$', 'TIK', 'TIK', |
3562
|
|
|
'TIC$', 'TIZ', 'TIZ', |
3563
|
|
|
'TIGGESTELL-------', 'TIK ', 'TIK ', |
3564
|
|
|
'TIGSTELL-----', 'TIK ', 'TIK ', |
3565
|
|
|
'TOAS-^', 'TO', 'TU', |
3566
|
|
|
'TOILET-', 'TOLE', 'TULE', |
3567
|
|
|
'TOIN-', 'TOA', 'TUA', |
3568
|
|
|
'TRAECHTI-^', 'TRECHT', 'TREKT', |
3569
|
|
|
'TRAECHTIG--', ' TRECHT', ' TREKT', |
3570
|
|
|
'TRAINI-', 'TREN', 'TREN', |
3571
|
|
|
'TRÄCHTI-^', 'TRECHT', 'TREKT', |
3572
|
|
|
'TRÄCHTIG--', ' TRECHT', ' TREKT', |
3573
|
|
|
'TSCH', 'SH', 'Z', |
3574
|
|
|
'TSH', 'SH', 'Z', |
3575
|
|
|
'TST', 'ZT', 'ZT', |
3576
|
|
|
'T(Sß)', 'Z', 'Z', |
3577
|
|
|
'TT(SZ)--<', '', '', |
3578
|
|
|
'TT9', 'T', 'T', |
3579
|
|
|
'TV^$', 'TV', 'TV', |
3580
|
|
|
'TX(AEIOU)-3', 'SH', 'Z', |
3581
|
|
|
'TY9^', 'TÜ', None, |
3582
|
|
|
'TZ-', '', '', |
3583
|
|
|
'T\'S3$', 'Z', 'Z', |
3584
|
|
|
'T´S3$', 'Z', 'Z', |
3585
|
|
|
'UEBEL(GNRW)-^^', 'ÜBL ', 'IBL ', |
3586
|
|
|
'UEBER^^', 'ÜBA', 'IBA', |
3587
|
|
|
'UE2', 'Ü', 'I', |
3588
|
|
|
'UGL-', 'UK', None, |
3589
|
|
|
'UH(AOÖUÜY)-', 'UH', None, |
3590
|
|
|
'UIE$', 'Ü', 'I', |
3591
|
|
|
'UM^^', 'UM', 'UN', |
3592
|
|
|
'UNTERE--3', 'UNTE', 'UNTE', |
3593
|
|
|
'UNTER^^', 'UNTA', 'UNTA', |
3594
|
|
|
'UNVER^^', 'UNFA', 'UNFA', |
3595
|
|
|
'UN^^', 'UN', 'UN', |
3596
|
|
|
'UTI(AÄOÖUÜ)-', 'UZI', 'UZI', |
3597
|
|
|
'UVE-4', 'UW', None, |
3598
|
|
|
'UY2', 'UI', None, |
3599
|
|
|
'UZZ', 'AS', 'AZ', |
3600
|
|
|
'VACL-^', 'WAZ', 'FAZ', |
3601
|
|
|
'VAC$', 'WAZ', 'FAZ', |
3602
|
|
|
'VAN DEN ^', 'FANDN', 'FANTN', |
3603
|
|
|
'VANES-^', 'WANE', None, |
3604
|
|
|
'VATRO-', 'WATR', None, |
3605
|
|
|
'VA(DHJNT)--^', 'F', None, |
3606
|
|
|
'VEDD-^', 'FE', 'FE', |
3607
|
|
|
'VE(BEHIU)--^', 'F', None, |
3608
|
|
|
'VEL(BDLMNT)-^', 'FEL', None, |
3609
|
|
|
'VENTZ-^', 'FEN', None, |
3610
|
|
|
'VEN(NRSZ)-^', 'FEN', None, |
3611
|
|
|
'VER(AB)-^$', 'WER', None, |
3612
|
|
|
'VERBAL^$', 'WERBAL', None, |
3613
|
|
|
'VERBAL(EINS)-^', 'WERBAL', None, |
3614
|
|
|
'VERTEBR--', 'WERTE', None, |
3615
|
|
|
'VEREIN-----', 'F', None, |
3616
|
|
|
'VEREN(AEIOU)-^', 'WEREN', None, |
3617
|
|
|
'VERIFI', 'WERIFI', None, |
3618
|
|
|
'VERON(AEIOU)-^', 'WERON', None, |
3619
|
|
|
'VERSEN^', 'FERSN', 'FAZN', |
3620
|
|
|
'VERSIERT--^', 'WERSI', None, |
3621
|
|
|
'VERSIO--^', 'WERS', None, |
3622
|
|
|
'VERSUS', 'WERSUS', None, |
3623
|
|
|
'VERTI(GK)-', 'WERTI', None, |
3624
|
|
|
'VER^^', 'FER', 'FA', |
3625
|
|
|
'VERSPRECHE-------', ' FER', ' FA', |
3626
|
|
|
'VER$', 'WA', None, |
3627
|
|
|
'VER', 'FA', 'FA', |
3628
|
|
|
'VET(HT)-^', 'FET', 'FET', |
3629
|
|
|
'VETTE$', 'WET', 'FET', |
3630
|
|
|
'VE^', 'WE', None, |
3631
|
|
|
'VIC$', 'WIZ', 'FIZ', |
3632
|
|
|
'VIELSAGE----', 'FIL ', 'FIL ', |
3633
|
|
|
'VIEL', 'FIL', 'FIL', |
3634
|
|
|
'VIEW', 'WIU', 'FIU', |
3635
|
|
|
'VILL(AE)-', 'WIL', None, |
3636
|
|
|
'VIS(ACEIKUVWZ)-<^', 'WIS', None, |
3637
|
|
|
'VI(ELS)--^', 'F', None, |
3638
|
|
|
'VILLON--', 'WILI', 'FILI', |
3639
|
|
|
'VIZE^^', 'FIZE', 'FIZE', |
3640
|
|
|
'VLIE--^', 'FL', None, |
3641
|
|
|
'VL(AEIOU)--', 'W', None, |
3642
|
|
|
'VOKA-^', 'WOK', None, |
3643
|
|
|
'VOL(ATUVW)--^', 'WO', None, |
3644
|
|
|
'VOR^^', 'FOR', 'FUR', |
3645
|
|
|
'VR(AEIOU)--', 'W', None, |
3646
|
|
|
'VV9', 'W', None, |
3647
|
|
|
'VY9^', 'WÜ', 'FI', |
3648
|
|
|
'V(ÜY)-', 'W', None, |
3649
|
|
|
'V(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'W', None, |
3650
|
|
|
'V(AEIJLRU)-<', 'W', None, |
3651
|
|
|
'V.^', 'V.', None, |
3652
|
|
|
'V<', 'F', 'F', |
3653
|
|
|
'WEITERENTWI-----^', 'WEITA ', 'FEITA ', |
3654
|
|
|
'WEITREICH-----^', 'WEIT ', 'FEIT ', |
3655
|
|
|
'WEITVER^', 'WEIT FER', 'FEIT FA', |
3656
|
|
|
'WE(LMNRST)-3^', 'WE', 'FE', |
3657
|
|
|
'WER(DST)-', 'WER', None, |
3658
|
|
|
'WIC$', 'WIZ', 'FIZ', |
3659
|
|
|
'WIEDERU--', 'WIDE', 'FITE', |
3660
|
|
|
'WIEDER^$', 'WIDA', 'FITA', |
3661
|
|
|
'WIEDER^^', 'WIDA ', 'FITA ', |
3662
|
|
|
'WIEVIEL', 'WI FIL', 'FI FIL', |
3663
|
|
|
'WISUEL', 'WISUEL', None, |
3664
|
|
|
'WR-^', 'W', None, |
3665
|
|
|
'WY9^', 'WÜ', 'FI', |
3666
|
|
|
'W(BDFGJKLMNPQRSTZ)-', 'F', None, |
3667
|
|
|
'W$', 'F', None, |
3668
|
|
|
'W', None, 'F', |
3669
|
|
|
'X<^', 'Z', 'Z', |
3670
|
|
|
'XHAVEN$', 'XAFN', None, |
3671
|
|
|
'X(CSZ)', 'X', 'X', |
3672
|
|
|
'XTS(CH)--', 'XT', 'XT', |
3673
|
|
|
'XT(SZ)', 'Z', 'Z', |
3674
|
|
|
'YE(LMNRST)-3^', 'IE', 'IE', |
3675
|
|
|
'YE-3', 'I', 'I', |
3676
|
|
|
'YOR(GK)^$', 'IÖRK', 'IÖRK', |
3677
|
|
|
'Y(AOU)-<7', 'I', 'I', |
3678
|
|
|
'Y(BKLMNPRSTX)-1', 'Ü', None, |
3679
|
|
|
'YVES^$', 'IF', 'IF', |
3680
|
|
|
'YVONNE^$', 'IWON', 'IFUN', |
3681
|
|
|
'Y.^', 'Y.', None, |
3682
|
|
|
'Y', 'I', 'I', |
3683
|
|
|
'ZC(AOU)-', 'SK', 'ZK', |
3684
|
|
|
'ZE(LMNRST)-3^', 'ZE', 'ZE', |
3685
|
|
|
'ZIEJ$', 'ZI', 'ZI', |
3686
|
|
|
'ZIGERJA(HR)-3', 'ZIGA IA', 'ZIKA IA', |
3687
|
|
|
'ZL(AEIOU)-', 'SL', None, |
3688
|
|
|
'ZS(CHT)--', '', '', |
3689
|
|
|
'ZS', 'SH', 'Z', |
3690
|
|
|
'ZUERST', 'ZUERST', 'ZUERST', |
3691
|
|
|
'ZUGRUNDE^$', 'ZU GRUNDE', 'ZU KRUNTE', |
3692
|
|
|
'ZUGRUNDE', 'ZU GRUNDE ', 'ZU KRUNTE ', |
3693
|
|
|
'ZUGUNSTEN', 'ZU GUNSTN', 'ZU KUNZTN', |
3694
|
|
|
'ZUHAUSE-', 'ZU HAUS', 'ZU AUZ', |
3695
|
|
|
'ZULASTEN^$', 'ZU LASTN', 'ZU LAZTN', |
3696
|
|
|
'ZURUECK^^', 'ZURÜK', 'ZURIK', |
3697
|
|
|
'ZURZEIT', 'ZUR ZEIT', 'ZUR ZEIT', |
3698
|
|
|
'ZURÜCK^^', 'ZURÜK', 'ZURIK', |
3699
|
|
|
'ZUSTANDE', 'ZU STANDE', 'ZU ZTANTE', |
3700
|
|
|
'ZUTAGE', 'ZU TAGE', 'ZU TAKE', |
3701
|
|
|
'ZUVER^^', 'ZUFA', 'ZUFA', |
3702
|
|
|
'ZUVIEL', 'ZU FIL', 'ZU FIL', |
3703
|
|
|
'ZUWENIG', 'ZU WENIK', 'ZU FENIK', |
3704
|
|
|
'ZY9^', 'ZÜ', None, |
3705
|
|
|
'ZYK3$', 'ZIK', None, |
3706
|
|
|
'Z(VW)7^', 'SW', None, |
3707
|
|
|
None, None, None) |
3708
|
|
|
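# Editor's note, inferred from how _phonet() below consumes this data (it is
# not part of the original phonet documentation): the table above is a flat
# sequence of triples -- (search pattern, replacement for coding mode 1,
# replacement for coding mode 2) -- terminated by the (None, None, None)
# sentinel.  Within a pattern, '^' and '$' appear to act as word-boundary
# anchors, '(...)' lists alternative letters, each '-' reduces how many
# matched characters are consumed, '<' marks a rule whose output is written
# back into the input for further matching, and a trailing digit is read as a
# rule priority.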
|
3709
|
|
|
phonet_hash = Counter() |
3710
|
|
|
alpha_pos = Counter() |
3711
|
|
|
|
3712
|
|
|
phonet_hash_1 = Counter() |
3713
|
|
|
phonet_hash_2 = Counter() |
3714
|
|
|
|
3715
|
|
|
_phonet_upper_translation = dict(zip((ord(_) for _ in |
|
|
|
|
3716
|
|
|
'abcdefghijklmnopqrstuvwxyzàáâãåäæ' + |
3717
|
|
|
'çðèéêëìíîïñòóôõöøœšßþùúûüýÿ'), |
3718
|
|
|
'ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÅÄÆ' + |
3719
|
|
|
'ÇÐÈÉÊËÌÍÎÏÑÒÓÔÕÖØŒŠßÞÙÚÛÜÝŸ')) |
3720
|
|
|
|
3721
|
|
|
def _trinfo(text, rule, err_text, lang): |
3722
|
|
|
"""Output debug information.""" |
3723
|
|
|
if lang == 'none': |
3724
|
|
|
_phonet_rules = _phonet_rules_no_lang |
3725
|
|
|
else: |
3726
|
|
|
_phonet_rules = _phonet_rules_german |
3727
|
|
|
|
3728
|
|
|
from_rule = ('(NULL)' if _phonet_rules[rule] is None else |
3729
|
|
|
_phonet_rules[rule]) |
3730
|
|
|
to_rule1 = ('(NULL)' if (_phonet_rules[rule + 1] is None) else |
3731
|
|
|
_phonet_rules[rule + 1]) |
3732
|
|
|
to_rule2 = ('(NULL)' if (_phonet_rules[rule + 2] is None) else |
3733
|
|
|
_phonet_rules[rule + 2]) |
3734
|
|
|
print('"{} {}: "{}"{}"{}" {}'.format(text, ((rule // 3) + 1), |
3735
|
|
|
from_rule, to_rule1, to_rule2, |
3736
|
|
|
err_text)) |
3737
|
|
|
|
3738
|
|
|
def _initialize_phonet(lang): |
3739
|
|
|
"""Initialize phonet variables.""" |
3740
|
|
|
if lang == 'none': |
3741
|
|
|
_phonet_rules = _phonet_rules_no_lang |
3742
|
|
|
else: |
3743
|
|
|
_phonet_rules = _phonet_rules_german |
3744
|
|
|
|
3745
|
|
|
phonet_hash[''] = -1 |
3746
|
|
|
|
3747
|
|
|
# German and international umlauts |
3748
|
|
|
for j in {'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë', |
3749
|
|
|
'Ì', 'Í', 'Î', 'Ï', 'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', 'Ø', |
3750
|
|
|
'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß', 'Œ', 'Š', 'Ÿ'}: |
3751
|
|
|
alpha_pos[j] = 1 |
3752
|
|
|
phonet_hash[j] = -1 |
3753
|
|
|
|
3754
|
|
|
# "normal" letters ('A'-'Z') |
3755
|
|
|
for i, j in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ'): |
3756
|
|
|
alpha_pos[j] = i + 2 |
3757
|
|
|
phonet_hash[j] = -1 |
3758
|
|
|
|
3759
|
|
|
for i in range(26): |
3760
|
|
|
for j in range(28): |
3761
|
|
|
phonet_hash_1[i, j] = -1 |
3762
|
|
|
phonet_hash_2[i, j] = -1 |
3763
|
|
|
|
3764
|
|
|
# for each phonetic rule |
3765
|
|
|
for i in range(len(_phonet_rules)): |
|
|
|
|
3766
|
|
|
rule = _phonet_rules[i] |
3767
|
|
|
|
3768
|
|
|
if rule and i % 3 == 0: |
3769
|
|
|
# calculate first hash value |
3770
|
|
|
k = _phonet_rules[i][0] |
3771
|
|
|
|
3772
|
|
|
if phonet_hash[k] < 0 and (_phonet_rules[i+1] or |
|
|
|
|
3773
|
|
|
_phonet_rules[i+2]): |
3774
|
|
|
phonet_hash[k] = i |
3775
|
|
|
|
3776
|
|
|
# calculate second hash values |
3777
|
|
|
if k and alpha_pos[k] >= 2: |
|
|
|
|
3778
|
|
|
k = alpha_pos[k] |
3779
|
|
|
|
3780
|
|
|
j = k-2 |
3781
|
|
|
rule = rule[1:] |
3782
|
|
|
|
3783
|
|
|
if not rule: |
3784
|
|
|
rule = ' ' |
3785
|
|
|
elif rule[0] == '(': |
3786
|
|
|
rule = rule[1:] |
3787
|
|
|
else: |
3788
|
|
|
rule = rule[0] |
3789
|
|
|
|
3790
|
|
|
while rule and (rule[0] != ')'): |
3791
|
|
|
k = alpha_pos[rule[0]] |
3792
|
|
|
|
3793
|
|
|
if k > 0: |
3794
|
|
|
# add hash value for this letter |
3795
|
|
|
if phonet_hash_1[j, k] < 0: |
|
|
|
|
3796
|
|
|
phonet_hash_1[j, k] = i |
3797
|
|
|
phonet_hash_2[j, k] = i |
3798
|
|
|
|
3799
|
|
|
if phonet_hash_2[j, k] >= (i-30): |
|
|
|
|
3800
|
|
|
phonet_hash_2[j, k] = i |
3801
|
|
|
else: |
3802
|
|
|
k = -1 |
3803
|
|
|
|
3804
|
|
|
if k <= 0: |
3805
|
|
|
# add hash value for all letters |
3806
|
|
|
if phonet_hash_1[j, 0] < 0: |
3807
|
|
|
phonet_hash_1[j, 0] = i |
3808
|
|
|
|
3809
|
|
|
phonet_hash_2[j, 0] = i |
3810
|
|
|
|
3811
|
|
|
rule = rule[1:] |
3812
|
|
|
|
3813
|
|
|
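# Editor's note, inferred from _initialize_phonet() above and _phonet() below
# (not from original documentation): phonet_hash[c] records the index of the
# first rule whose pattern starts with letter c, while phonet_hash_1[(a, b)]
# and phonet_hash_2[(a, b)] bracket the first and last rules whose patterns
# start with letter a followed by letter b (column 0 serving as the bucket
# for "any following character").  _phonet() consults these brackets so that,
# at each input position, only the plausible slice of rules is scanned rather
# than the whole table.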
def _phonet(term, mode, lang, trace): |
|
|
|
|
3814
|
|
|
"""Return the phonet coded form of a term.""" |
3815
|
|
|
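# Editor's note, inferred from the use of _phonet_rules[pos + mode] further
# down: mode selects which replacement column of each rule triple is applied,
# 1 for the primary (finer-grained) phonet coding and 2 for the coarser
# second coding, so both variants share a single rule table.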
if lang == 'none': |
3816
|
|
|
_phonet_rules = _phonet_rules_no_lang |
3817
|
|
|
else: |
3818
|
|
|
_phonet_rules = _phonet_rules_german |
3819
|
|
|
|
3820
|
|
|
char0 = '' |
3821
|
|
|
dest = term |
3822
|
|
|
|
3823
|
|
|
if not term: |
3824
|
|
|
return '' |
3825
|
|
|
|
3826
|
|
|
term_length = len(term) |
3827
|
|
|
|
3828
|
|
|
# convert input string to upper-case |
3829
|
|
|
src = term.translate(_phonet_upper_translation) |
3830
|
|
|
|
3831
|
|
|
# check "src" |
3832
|
|
|
i = 0 |
3833
|
|
|
j = 0 |
3834
|
|
|
zeta = 0 |
3835
|
|
|
|
3836
|
|
|
while i < len(src): |
|
|
|
|
3837
|
|
|
char = src[i] |
3838
|
|
|
|
3839
|
|
|
if trace: |
3840
|
|
|
print('\ncheck position {}: src = "{}", dest = "{}"'.format |
3841
|
|
|
(j, src[i:], dest[:j])) |
3842
|
|
|
|
3843
|
|
|
pos = alpha_pos[char] |
3844
|
|
|
|
3845
|
|
|
if pos >= 2: |
3846
|
|
|
xpos = pos-2 |
3847
|
|
|
|
3848
|
|
|
if i+1 == len(src): |
3849
|
|
|
pos = alpha_pos[''] |
3850
|
|
|
else: |
3851
|
|
|
pos = alpha_pos[src[i+1]] |
3852
|
|
|
|
3853
|
|
|
start1 = phonet_hash_1[xpos, pos] |
3854
|
|
|
start2 = phonet_hash_1[xpos, 0] |
3855
|
|
|
end1 = phonet_hash_2[xpos, pos] |
3856
|
|
|
end2 = phonet_hash_2[xpos, 0] |
3857
|
|
|
|
3858
|
|
|
# preserve rule priorities |
3859
|
|
|
if (start2 >= 0) and ((start1 < 0) or (start2 < start1)): |
3860
|
|
|
pos = start1 |
|
|
|
|
3861
|
|
|
start1 = start2 |
3862
|
|
|
start2 = pos |
3863
|
|
|
pos = end1 |
|
|
|
|
3864
|
|
|
end1 = end2 |
3865
|
|
|
end2 = pos |
3866
|
|
|
|
3867
|
|
|
if (end1 >= start2) and (start2 >= 0): |
|
|
|
|
3868
|
|
|
if end2 > end1: |
3869
|
|
|
end1 = end2 |
3870
|
|
|
|
3871
|
|
|
start2 = -1 |
3872
|
|
|
end2 = -1 |
3873
|
|
|
else: |
3874
|
|
|
pos = phonet_hash[char] |
3875
|
|
|
start1 = pos |
3876
|
|
|
end1 = 10000 |
3877
|
|
|
start2 = -1 |
3878
|
|
|
end2 = -1 |
3879
|
|
|
|
3880
|
|
|
pos = start1 |
3881
|
|
|
zeta0 = 0 |
3882
|
|
|
|
3883
|
|
|
if pos >= 0: |
3884
|
|
|
# check rules for this char |
3885
|
|
|
while ((_phonet_rules[pos] is None) or |
3886
|
|
|
(_phonet_rules[pos][0] == char)): |
3887
|
|
|
if pos > end1: |
3888
|
|
|
if start2 > 0: |
3889
|
|
|
pos = start2 |
3890
|
|
|
start1 = start2 |
3891
|
|
|
start2 = -1 |
3892
|
|
|
end1 = end2 |
3893
|
|
|
end2 = -1 |
3894
|
|
|
continue |
3895
|
|
|
|
3896
|
|
|
break |
3897
|
|
|
|
3898
|
|
|
if (((_phonet_rules[pos] is None) or |
3899
|
|
|
(_phonet_rules[pos + mode] is None))): |
3900
|
|
|
# no conversion rule available |
3901
|
|
|
pos += 3 |
3902
|
|
|
continue |
3903
|
|
|
|
3904
|
|
|
if trace: |
3905
|
|
|
_trinfo('> rule no.', pos, 'is being checked', lang) |
3906
|
|
|
|
3907
|
|
|
# check whole string |
3908
|
|
|
matches = 1 # number of matching letters |
3909
|
|
|
priority = 5 # default priority |
3910
|
|
|
rule = _phonet_rules[pos] |
3911
|
|
|
rule = rule[1:] |
3912
|
|
|
|
3913
|
|
|
while (rule and |
3914
|
|
|
(len(src) > (i + matches)) and |
3915
|
|
|
(src[i + matches] == rule[0]) and |
3916
|
|
|
not rule[0].isdigit() and |
3917
|
|
|
(rule not in '(-<^$')): |
3918
|
|
|
matches += 1 |
3919
|
|
|
rule = rule[1:] |
3920
|
|
|
|
3921
|
|
|
if rule and (rule[0] == '('): |
3922
|
|
|
# check an array of letters |
3923
|
|
|
if (((len(src) > (i + matches)) and |
3924
|
|
|
src[i + matches].isalpha() and |
3925
|
|
|
(src[i + matches] in rule[1:]))): |
3926
|
|
|
matches += 1 |
3927
|
|
|
|
3928
|
|
|
while rule and rule[0] != ')': |
3929
|
|
|
rule = rule[1:] |
3930
|
|
|
|
3931
|
|
|
# if rule[0] == ')': |
3932
|
|
|
rule = rule[1:] |
3933
|
|
|
|
3934
|
|
|
if rule: |
3935
|
|
|
priority0 = ord(rule[0]) |
3936
|
|
|
else: |
3937
|
|
|
priority0 = 0 |
3938
|
|
|
|
3939
|
|
|
matches0 = matches |
3940
|
|
|
|
3941
|
|
|
while rule and rule[0] == '-' and matches > 1: |
3942
|
|
|
matches -= 1 |
3943
|
|
|
rule = rule[1:] |
3944
|
|
|
|
3945
|
|
|
if rule and rule[0] == '<': |
3946
|
|
|
rule = rule[1:] |
3947
|
|
|
|
3948
|
|
|
if rule and rule[0].isdigit(): |
3949
|
|
|
# read priority |
3950
|
|
|
priority = int(rule[0]) |
3951
|
|
|
rule = rule[1:] |
3952
|
|
|
|
3953
|
|
|
if rule and rule[0:2] == '^^': |
3954
|
|
|
rule = rule[1:] |
3955
|
|
|
|
3956
|
|
|
if (not rule or |
|
|
|
|
3957
|
|
|
((rule[0] == '^') and |
3958
|
|
|
((i == 0) or not src[i-1].isalpha()) and |
3959
|
|
|
((rule[1:2] != '$') or |
3960
|
|
|
(not (src[i+matches0:i+matches0+1].isalpha()) and |
3961
|
|
|
(src[i+matches0:i+matches0+1] != '.')))) or |
3962
|
|
|
((rule[0] == '$') and (i > 0) and |
3963
|
|
|
src[i-1].isalpha() and |
3964
|
|
|
((not src[i+matches0:i+matches0+1].isalpha()) and |
3965
|
|
|
(src[i+matches0:i+matches0+1] != '.')))): |
3966
|
|
|
# look for continuation, if: |
3967
|
|
|
# matches > 1 and NO '-' in first string |
3968
|
|
|
pos0 = -1 |
3969
|
|
|
|
3970
|
|
|
start3 = 0 |
3971
|
|
|
start4 = 0 |
3972
|
|
|
end3 = 0 |
3973
|
|
|
end4 = 0 |
3974
|
|
|
|
3975
|
|
|
if (((matches > 1) and |
3976
|
|
|
src[i+matches:i+matches+1] and |
3977
|
|
|
(priority0 != ord('-')))): |
3978
|
|
|
char0 = src[i+matches-1] |
3979
|
|
|
pos0 = alpha_pos[char0] |
3980
|
|
|
|
3981
|
|
|
if pos0 >= 2 and src[i+matches]: |
3982
|
|
|
xpos = pos0 - 2 |
3983
|
|
|
pos0 = alpha_pos[src[i+matches]] |
3984
|
|
|
start3 = phonet_hash_1[xpos, pos0] |
3985
|
|
|
start4 = phonet_hash_1[xpos, 0] |
3986
|
|
|
end3 = phonet_hash_2[xpos, pos0] |
3987
|
|
|
end4 = phonet_hash_2[xpos, 0] |
3988
|
|
|
|
3989
|
|
|
# preserve rule priorities |
3990
|
|
|
if (((start4 >= 0) and |
3991
|
|
|
((start3 < 0) or (start4 < start3)))): |
3992
|
|
|
pos0 = start3 |
|
|
|
|
3993
|
|
|
start3 = start4 |
3994
|
|
|
start4 = pos0 |
3995
|
|
|
pos0 = end3 |
|
|
|
|
3996
|
|
|
end3 = end4 |
3997
|
|
|
end4 = pos0 |
3998
|
|
|
|
3999
|
|
|
if (end3 >= start4) and (start4 >= 0): |
|
|
|
|
4000
|
|
|
if end4 > end3: |
4001
|
|
|
end3 = end4 |
4002
|
|
|
|
4003
|
|
|
start4 = -1 |
4004
|
|
|
end4 = -1 |
4005
|
|
|
else: |
4006
|
|
|
pos0 = phonet_hash[char0] |
4007
|
|
|
start3 = pos0 |
4008
|
|
|
end3 = 10000 |
4009
|
|
|
start4 = -1 |
4010
|
|
|
end4 = -1 |
4011
|
|
|
|
4012
|
|
|
pos0 = start3 |
4013
|
|
|
|
4014
|
|
|
# check continuation rules for src[i+matches] |
4015
|
|
|
if pos0 >= 0: |
4016
|
|
|
while ((_phonet_rules[pos0] is None) or |
4017
|
|
|
(_phonet_rules[pos0][0] == char0)): |
|
|
|
|
4018
|
|
|
if pos0 > end3: |
4019
|
|
|
if start4 > 0: |
4020
|
|
|
pos0 = start4 |
4021
|
|
|
start3 = start4 |
4022
|
|
|
start4 = -1 |
4023
|
|
|
end3 = end4 |
4024
|
|
|
end4 = -1 |
4025
|
|
|
continue |
4026
|
|
|
|
4027
|
|
|
priority0 = -1 |
4028
|
|
|
|
4029
|
|
|
# important |
4030
|
|
|
break |
4031
|
|
|
|
4032
|
|
|
if (((_phonet_rules[pos0] is None) or |
4033
|
|
|
(_phonet_rules[pos0 + mode] is None))): |
4034
|
|
|
# no conversion rule available |
4035
|
|
|
pos0 += 3 |
4036
|
|
|
continue |
4037
|
|
|
|
4038
|
|
|
if trace: |
4039
|
|
|
_trinfo('> > continuation rule no.', pos0, |
4040
|
|
|
'is being checked', lang) |
4041
|
|
|
|
4042
|
|
|
# check whole string |
4043
|
|
|
matches0 = matches |
4044
|
|
|
priority0 = 5 |
4045
|
|
|
rule = _phonet_rules[pos0] |
4046
|
|
|
rule = rule[1:] |
|
|
|
|
4047
|
|
|
|
4048
|
|
|
while (rule and |
4049
|
|
|
(src[i+matches0:i+matches0+1] == |
4050
|
|
|
rule[0]) and |
4051
|
|
|
(not rule[0].isdigit() or |
4052
|
|
|
(rule in '(-<^$'))): |
4053
|
|
|
matches0 += 1 |
4054
|
|
|
rule = rule[1:] |
4055
|
|
|
|
4056
|
|
|
if rule and rule[0] == '(': |
4057
|
|
|
# check an array of letters |
4058
|
|
|
if ((src[i+matches0:i+matches0+1] |
4059
|
|
|
.isalpha() and |
4060
|
|
|
(src[i+matches0] in rule[1:]))): |
4061
|
|
|
matches0 += 1 |
4062
|
|
|
|
4063
|
|
|
while rule and rule[0] != ')': |
4064
|
|
|
rule = rule[1:] |
4065
|
|
|
|
4066
|
|
|
# if rule[0] == ')': |
4067
|
|
|
rule = rule[1:] |
4068
|
|
|
|
4069
|
|
|
while rule and rule[0] == '-': |
4070
|
|
|
# "matches0" is NOT decremented |
4071
|
|
|
# because of "if (matches0 == matches)" |
4072
|
|
|
rule = rule[1:] |
4073
|
|
|
|
4074
|
|
|
if rule and rule[0] == '<': |
4075
|
|
|
rule = rule[1:] |
4076
|
|
|
|
4077
|
|
|
if rule and rule[0].isdigit(): |
4078
|
|
|
priority0 = int(rule[0]) |
4079
|
|
|
rule = rule[1:] |
4080
|
|
|
|
4081
|
|
|
if (not rule or |
4082
|
|
|
# rule == '^' is not possible here |
4083
|
|
|
((rule[0] == '$') and not |
4084
|
|
|
src[i+matches0:i+matches0+1] |
4085
|
|
|
.isalpha() and |
4086
|
|
|
(src[i+matches0:i+matches0+1] |
4087
|
|
|
!= '.'))): |
4088
|
|
|
if matches0 == matches: |
4089
|
|
|
# this is only a partial string |
4090
|
|
|
if trace: |
4091
|
|
|
_trinfo('> > continuation ' + |
4092
|
|
|
'rule no.', |
4093
|
|
|
pos0, |
4094
|
|
|
'not used (too short)', |
4095
|
|
|
lang) |
4096
|
|
|
|
4097
|
|
|
pos0 += 3 |
4098
|
|
|
continue |
4099
|
|
|
|
4100
|
|
|
if priority0 < priority: |
4101
|
|
|
# priority is too low |
4102
|
|
|
if trace: |
4103
|
|
|
_trinfo('> > continuation ' + |
4104
|
|
|
'rule no.', |
4105
|
|
|
pos0, |
4106
|
|
|
'not used (priority)', |
4107
|
|
|
lang) |
4108
|
|
|
|
4109
|
|
|
pos0 += 3 |
4110
|
|
|
continue |
4111
|
|
|
|
4112
|
|
|
# continuation rule found |
4113
|
|
|
break |
4114
|
|
|
|
4115
|
|
|
if trace: |
4116
|
|
|
_trinfo('> > continuation rule no.', pos0, |
4117
|
|
|
'not used', lang) |
4118
|
|
|
|
4119
|
|
|
pos0 += 3 |
4120
|
|
|
|
4121
|
|
|
# end of "while" |
4122
|
|
|
if ((priority0 >= priority) and |
4123
|
|
|
((_phonet_rules[pos0] is not None) and |
4124
|
|
|
(_phonet_rules[pos0][0] == char0))): |
|
|
|
|
4125
|
|
|
|
4126
|
|
|
if trace: |
4127
|
|
|
_trinfo('> rule no.', pos, '', lang) |
4128
|
|
|
_trinfo('> not used because of ' + |
4129
|
|
|
'continuation', pos0, '', lang) |
4130
|
|
|
|
4131
|
|
|
pos += 3 |
4132
|
|
|
continue |
4133
|
|
|
|
4134
|
|
|
# replace string |
4135
|
|
|
if trace: |
4136
|
|
|
_trinfo('Rule no.', pos, 'is applied', lang) |
4137
|
|
|
|
4138
|
|
|
if ((_phonet_rules[pos] and |
4139
|
|
|
('<' in _phonet_rules[pos][1:]))): |
4140
|
|
|
priority0 = 1 |
4141
|
|
|
else: |
4142
|
|
|
priority0 = 0 |
4143
|
|
|
|
4144
|
|
|
rule = _phonet_rules[pos + mode] |
4145
|
|
|
|
4146
|
|
|
if (priority0 == 1) and (zeta == 0): |
4147
|
|
|
# rule with '<' is applied |
4148
|
|
|
if ((j > 0) and rule and |
4149
|
|
|
((dest[j-1] == char) or |
4150
|
|
|
(dest[j-1] == rule[0]))): |
4151
|
|
|
j -= 1 |
4152
|
|
|
|
4153
|
|
|
zeta0 = 1 |
4154
|
|
|
zeta += 1 |
4155
|
|
|
matches0 = 0 |
4156
|
|
|
|
4157
|
|
|
while rule and src[i+matches0]: |
4158
|
|
|
src = (src[0:i+matches0] + rule[0] + |
4159
|
|
|
src[i+matches0+1:]) |
4160
|
|
|
matches0 += 1 |
4161
|
|
|
rule = rule[1:] |
4162
|
|
|
|
4163
|
|
|
if matches0 < matches: |
4164
|
|
|
src = (src[0:i+matches0] + |
4165
|
|
|
src[i+matches:]) |
4166
|
|
|
|
4167
|
|
|
char = src[i] |
4168
|
|
|
else: |
4169
|
|
|
i = i + matches - 1 |
4170
|
|
|
zeta = 0 |
4171
|
|
|
|
4172
|
|
|
while len(rule) > 1: |
4173
|
|
|
if (j == 0) or (dest[j - 1] != rule[0]): |
4174
|
|
|
dest = (dest[0:j] + rule[0] + |
4175
|
|
|
dest[min(len(dest), j+1):]) |
4176
|
|
|
j += 1 |
4177
|
|
|
|
4178
|
|
|
rule = rule[1:] |
4179
|
|
|
|
4180
|
|
|
# new "current char" |
4181
|
|
|
if not rule: |
4182
|
|
|
rule = '' |
4183
|
|
|
char = '' |
4184
|
|
|
else: |
4185
|
|
|
char = rule[0] |
4186
|
|
|
|
4187
|
|
|
if ((_phonet_rules[pos] and |
4188
|
|
|
'^^' in _phonet_rules[pos][1:])): |
4189
|
|
|
if char: # pragma: no branch |
4190
|
|
|
dest = (dest[0:j] + char + |
4191
|
|
|
dest[min(len(dest), j + 1):]) |
4192
|
|
|
j += 1 |
4193
|
|
|
|
4194
|
|
|
src = src[i + 1:] |
4195
|
|
|
i = 0 |
4196
|
|
|
zeta0 = 1 |
4197
|
|
|
|
4198
|
|
|
break |
4199
|
|
|
|
4200
|
|
|
pos += 3 |
4201
|
|
|
|
4202
|
|
|
if pos > end1 and start2 > 0: |
4203
|
|
|
pos = start2 |
4204
|
|
|
start1 = start2 |
4205
|
|
|
end1 = end2 |
4206
|
|
|
start2 = -1 |
4207
|
|
|
end2 = -1 |
4208
|
|
|
|
4209
|
|
|
if zeta0 == 0: |
4210
|
|
|
if char and ((j == 0) or (dest[j-1] != char)): |
4211
|
|
|
# delete multiple letters only |
4212
|
|
|
dest = dest[0:j] + char + dest[min(j+1, term_length):] |
4213
|
|
|
j += 1 |
4214
|
|
|
|
4215
|
|
|
i += 1 |
4216
|
|
|
zeta = 0 |
4217
|
|
|
|
4218
|
|
|
dest = dest[0:j] |
4219
|
|
|
|
4220
|
|
|
return dest |
4221
|
|
|
|
4222
|
|
|
_initialize_phonet(lang) |
4223
|
|
|
|
4224
|
|
|
word = unicodedata.normalize('NFKC', text_type(word)) |
4225
|
|
|
return _phonet(word, mode, lang, trace) |
4226
|
|
|
|
4227
|
|
|
|
4228
|
|
|
def spfc(word): |
4229
|
|
|
"""Return the Standardized Phonetic Frequency Code (SPFC) of a word. |
4230
|
|
|
|
4231
|
|
|
Standardized Phonetic Frequency Code is roughly Soundex-like. |
4232
|
|
|
This implementation is based on pages 19-21 of |
4233
|
|
|
https://archive.org/stream/accessingindivid00moor#page/19/mode/1up |
4234
|
|
|
|
4235
|
|
|
:param str word: the word to transform |
4236
|
|
|
:returns: the SPFC value |
4237
|
|
|
:rtype: str |
4238
|
|
|
|
4239
|
|
|
>>> spfc('Christopher Smith') |
4240
|
|
|
'01160' |
4241
|
|
|
>>> spfc('Christopher Schmidt') |
4242
|
|
|
'01160' |
4243
|
|
|
>>> spfc('Niall Smith') |
4244
|
|
|
'01660' |
4245
|
|
|
>>> spfc('Niall Schmidt') |
4246
|
|
|
'01660' |
4247
|
|
|
>>> spfc('L.Smith') |
4248
|
|
|
'01960' |
4249
|
|
|
>>> spfc('R.Miller') |
4250
|
|
|
'65490' |
4251
|
|
|
|
4252
|
|
|
>>> spfc(('L', 'Smith')) |
4253
|
|
|
'01960' |
4254
|
|
|
>>> spfc(('R', 'Miller')) |
4255
|
|
|
'65490' |
4256
|
|
|
""" |
4257
|
|
|
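# Worked example (an editor's trace of the steps implemented below, not taken
# from the original reference): 'Niall Schmidt' -> names ['NIALL', 'SCHMIDT']
# -> after steps 1-3 ['NL', 'SMT'] -> PF1('S') = '0', PF3('T') = '1',
# PF2('N') = '6', PF2('M') = '6', then a padding '0', giving '01660'.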
_pf1 = dict(zip((ord(_) for _ in 'SZCKQVFPUWABLORDHIEMNXGJT'), |
|
|
|
|
4258
|
|
|
'0011112222334445556666777')) |
4259
|
|
|
_pf2 = dict(zip((ord(_) for _ in |
4260
|
|
|
'SZCKQFPXABORDHIMNGJTUVWEL'), |
4261
|
|
|
'0011122233445556677788899')) |
4262
|
|
|
_pf3 = dict(zip((ord(_) for _ in |
4263
|
|
|
'BCKQVDTFLPGJXMNRSZAEHIOUWY'), |
4264
|
|
|
'00000112223334456677777777')) |
4265
|
|
|
|
4266
|
|
|
_substitutions = (('DK', 'K'), ('DT', 'T'), ('SC', 'S'), ('KN', 'N'), |
4267
|
|
|
('MN', 'N')) |
4268
|
|
|
|
4269
|
|
|
def _raise_word_ex(): |
4270
|
|
|
"""Raise an AttributeError.""" |
4271
|
|
|
raise AttributeError('word attribute must be a string with a space ' + |
4272
|
|
|
'or period dividing the first and last names ' + |
4273
|
|
|
'or a tuple/list consisting of the first and ' + |
4274
|
|
|
'last names') |
4275
|
|
|
|
4276
|
|
|
if not word: |
4277
|
|
|
return '' |
4278
|
|
|
|
4279
|
|
|
if isinstance(word, (str, text_type)): |
4280
|
|
|
names = word.split('.', 1) |
4281
|
|
|
if len(names) != 2: |
4282
|
|
|
names = word.split(' ', 1) |
4283
|
|
|
if len(names) != 2: |
4284
|
|
|
_raise_word_ex() |
4285
|
|
|
elif hasattr(word, '__iter__'): |
4286
|
|
|
if len(word) != 2: |
4287
|
|
|
_raise_word_ex() |
4288
|
|
|
names = word |
4289
|
|
|
else: |
4290
|
|
|
_raise_word_ex() |
4291
|
|
|
|
4292
|
|
|
names = [unicodedata.normalize('NFKD', text_type(_.strip() |
4293
|
|
|
.replace('ß', 'SS') |
4294
|
|
|
.upper())) |
4295
|
|
|
for _ in names] |
|
|
|
|
4296
|
|
|
code = '' |
4297
|
|
|
|
4298
|
|
|
def steps_one_to_three(name): |
4299
|
|
|
"""Perform the first three steps of SPFC.""" |
4300
|
|
|
# filter out non A-Z |
4301
|
|
|
name = ''.join(_ for _ in name if _ in |
4302
|
|
|
{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', |
4303
|
|
|
'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', |
4304
|
|
|
'W', 'X', 'Y', 'Z'}) |
4305
|
|
|
|
4306
|
|
|
# 1. In the field, convert DK to K, DT to T, SC to S, KN to N, |
4307
|
|
|
# and MN to N |
4308
|
|
|
for subst in _substitutions: |
4309
|
|
|
name = name.replace(subst[0], subst[1]) |
4310
|
|
|
|
4311
|
|
|
# 2. In the name field, replace multiple letters with a single letter |
4312
|
|
|
name = _delete_consecutive_repeats(name) |
4313
|
|
|
|
4314
|
|
|
# 3. Remove vowels, W, H, and Y, but keep the first letter in the name |
4315
|
|
|
# field. |
4316
|
|
|
if name: |
4317
|
|
|
name = name[0] + ''.join(_ for _ in name[1:] if _ not in |
4318
|
|
|
{'A', 'E', 'H', 'I', 'O', 'U', 'W', 'Y'}) |
4319
|
|
|
return name |
4320
|
|
|
|
4321
|
|
|
names = [steps_one_to_three(_) for _ in names] |
4322
|
|
|
|
4323
|
|
|
# 4. The first digit of the code is obtained using PF1 and the first letter |
4324
|
|
|
# of the name field. Remove this letter after coding. |
4325
|
|
|
if names[1]: |
4326
|
|
|
code += names[1][0].translate(_pf1) |
4327
|
|
|
names[1] = names[1][1:] |
4328
|
|
|
|
4329
|
|
|
# 5. Using the last letters of the name, use Table PF3 to obtain the |
4330
|
|
|
# second digit of the code. Use as many letters as possible and remove |
4331
|
|
|
# after coding. |
4332
|
|
|
if names[1]: |
4333
|
|
|
if names[1][-3:] == 'STN' or names[1][-3:] == 'PRS': |
4334
|
|
|
code += '8' |
4335
|
|
|
names[1] = names[1][:-3] |
4336
|
|
|
elif names[1][-2:] == 'SN': |
4337
|
|
|
code += '8' |
4338
|
|
|
names[1] = names[1][:-2] |
4339
|
|
|
elif names[1][-3:] == 'STR': |
4340
|
|
|
code += '9' |
4341
|
|
|
names[1] = names[1][:-3] |
4342
|
|
|
elif names[1][-2:] in {'SR', 'TN', 'TD'}: |
4343
|
|
|
code += '9' |
4344
|
|
|
names[1] = names[1][:-2] |
4345
|
|
|
elif names[1][-3:] == 'DRS': |
4346
|
|
|
code += '7' |
4347
|
|
|
names[1] = names[1][:-3] |
4348
|
|
|
elif names[1][-2:] in {'TR', 'MN'}: |
4349
|
|
|
code += '7' |
4350
|
|
|
names[1] = names[1][:-2] |
4351
|
|
|
else: |
4352
|
|
|
code += names[1][-1].translate(_pf3) |
4353
|
|
|
names[1] = names[1][:-1] |
4354
|
|
|
|
4355
|
|
|
# 6. The third digit is found using Table PF2 and the first character of |
4356
|
|
|
# the first name. Remove after coding. |
4357
|
|
|
if names[0]: |
4358
|
|
|
code += names[0][0].translate(_pf2) |
4359
|
|
|
names[0] = names[0][1:] |
4360
|
|
|
|
4361
|
|
|
# 7. The fourth digit is found using Table PF2 and the first character of |
4362
|
|
|
# the name field. If no letters remain use zero. After coding remove the |
4363
|
|
|
# letter. |
4364
|
|
|
# 8. The fifth digit is found in the same manner as the fourth using the |
4365
|
|
|
# remaining characters of the name field if any. |
4366
|
|
|
for _ in range(2): |
4367
|
|
|
if names[1]: |
4368
|
|
|
code += names[1][0].translate(_pf2) |
4369
|
|
|
names[1] = names[1][1:] |
4370
|
|
|
else: |
4371
|
|
|
code += '0' |
4372
|
|
|
|
4373
|
|
|
return code |
4374
|
|
|
|
4375
|
|
|
|
4376
|
|
|
def statistics_canada(word, maxlength=4): |
4377
|
|
|
"""Return the Statistics Canada code for a word. |
4378
|
|
|
|
4379
|
|
|
The original description of this algorithm could not be located, and |
4380
|
|
|
may only have been specified in an unpublished TR. The coding does not |
4381
|
|
|
appear to be in use by Statistics Canada any longer. In its place, this is |
4382
|
|
|
an implementation of the "Census modified Statistics Canada name coding |
4383
|
|
|
procedure". |
4384
|
|
|
|
4385
|
|
|
The modified version of this algorithm is described in Appendix B of |
4386
|
|
|
Lynch, Billy T. and William L. Arends. `Selection of a Surname Coding |
4387
|
|
|
Procedure for the SRS Record Linkage System.` Statistical Reporting |
4388
|
|
|
Service, U.S. Department of Agriculture, Washington, D.C. February 1977. |
4389
|
|
|
https://naldc.nal.usda.gov/download/27833/PDF |
4390
|
|
|
|
4391
|
|
|
:param str word: the word to transform |
4392
|
|
|
:param int maxlength: the maximum length (default 4) of the code to return |
4393
|
|
|
4394
|
|
|
:returns: the Statistics Canada name code value |
4395
|
|
|
:rtype: str |
4396
|
|
|
|
4397
|
|
|
>>> statistics_canada('Christopher') |
4398
|
|
|
'CHRS' |
4399
|
|
|
>>> statistics_canada('Niall') |
4400
|
|
|
'NL' |
4401
|
|
|
>>> statistics_canada('Smith') |
4402
|
|
|
'SMTH' |
4403
|
|
|
>>> statistics_canada('Schmidt') |
4404
|
|
|
'SCHM' |
4405
|
|
|
""" |
4406
|
|
|
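# Worked example (editor's trace of the code below, not from the original
# reference): 'Schmidt' -> 'SCHMIDT' -> keep the initial 'S', drop vowels and
# Y from the remainder to get 'CHMDT', collapse repeats (none here), and
# truncate to maxlength, giving 'SCHM' as in the doctest above.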
# uppercase, normalize, decompose, and filter non-A-Z out |
4407
|
|
|
word = unicodedata.normalize('NFKD', text_type(word.upper())) |
4408
|
|
|
word = word.replace('ß', 'SS') |
4409
|
|
|
word = ''.join(c for c in word if c in |
4410
|
|
|
{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
4411
|
|
|
'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
4412
|
|
|
'Y', 'Z'}) |
4413
|
|
|
if not word: |
4414
|
|
|
return '' |
4415
|
|
|
|
4416
|
|
|
code = word[1:] |
4417
|
|
|
for vowel in {'A', 'E', 'I', 'O', 'U', 'Y'}: |
4418
|
|
|
code = code.replace(vowel, '') |
4419
|
|
|
code = word[0]+code |
4420
|
|
|
code = _delete_consecutive_repeats(code) |
4421
|
|
|
code = code.replace(' ', '') |
4422
|
|
|
|
4423
|
|
|
return code[:maxlength] |
4424
|
|
|
|
4425
|
|
|
|
4426
|
|
|
def lein(word, maxlength=4, zero_pad=True): |
4427
|
|
|
"""Return the Lein code for a word. |
4428
|
|
|
|
4429
|
|
|
This is Lein name coding, based on |
4430
|
|
|
https://naldc-legacy.nal.usda.gov/naldc/download.xhtml?id=27833&content=PDF |
4431
|
|
|
|
4432
|
|
|
:param str word: the word to transform |
4433
|
|
|
:param int maxlength: the maximum length (default 4) of the code to return |
4434
|
|
|
:param bool zero_pad: pad the end of the return value with 0s to achieve a |
4435
|
|
|
maxlength string |
4436
|
|
|
:returns: the Lein code |
4437
|
|
|
:rtype: str |
4438
|
|
|
|
4439
|
|
|
>>> lein('Christopher') |
4440
|
|
|
'C351' |
4441
|
|
|
>>> lein('Niall') |
4442
|
|
|
'N300' |
4443
|
|
|
>>> lein('Smith') |
4444
|
|
|
'S210' |
4445
|
|
|
>>> lein('Schmidt') |
4446
|
|
|
'S521' |
4447
|
|
|
""" |
4448
|
|
|
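# Worked example (editor's trace of the rules below, not from the original
# reference): 'Schmidt' -> 'SCHMIDT' -> keep 'S' (rule 1), drop vowels, W, H
# and Y from the remainder leaving 'CMDT' (rule 2), nothing to collapse
# (rule 3), translate C, M, D, T -> 5, 2, 1, 1 (rule 4), truncate: 'S521'.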
_lein_translation = dict(zip((ord(_) for _ in |
|
|
|
|
4449
|
|
|
'BCDFGJKLMNPQRSTVXZ'), |
4450
|
|
|
'451455532245351455')) |
4451
|
|
|
|
4452
|
|
|
# uppercase, normalize, decompose, and filter non-A-Z out |
4453
|
|
|
word = unicodedata.normalize('NFKD', text_type(word.upper())) |
4454
|
|
|
word = word.replace('ß', 'SS') |
4455
|
|
|
word = ''.join(c for c in word if c in |
4456
|
|
|
{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
4457
|
|
|
'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
4458
|
|
|
'Y', 'Z'}) |
4459
|
|
|
|
4460
|
|
|
if not word: |
4461
|
|
|
return '' |
4462
|
|
|
|
4463
|
|
|
code = word[0] # Rule 1 |
4464
|
|
|
word = word[1:].translate(str.maketrans('', '', 'AEIOUYWH ')) # Rule 2 |
4465
|
|
|
word = _delete_consecutive_repeats(word) # Rule 3 |
4466
|
|
|
code += word.translate(_lein_translation) # Rule 4 |
4467
|
|
|
|
4468
|
|
|
if zero_pad: |
4469
|
|
|
code += ('0'*maxlength) # Rule 4 |
4470
|
|
|
|
4471
|
|
|
return code[:maxlength] |
4472
|
|
|
|
4473
|
|
|
|
4474
|
|
|
def roger_root(word, maxlength=5, zero_pad=True): |
4475
|
|
|
"""Return the Roger Root code for a word. |
4476
|
|
|
|
4477
|
|
|
This is Roger Root name coding, based on |
4478
|
|
|
https://naldc-legacy.nal.usda.gov/naldc/download.xhtml?id=27833&content=PDF |
4479
|
|
|
|
4480
|
|
|
:param str word: the word to transform |
4481
|
|
|
:param int maxlength: the maximum length (default 5) of the code to return |
4482
|
|
|
:param bool zero_pad: pad the end of the return value with 0s to achieve a |
4483
|
|
|
maxlength string |
4484
|
|
|
:returns: the Roger Root code |
4485
|
|
|
:rtype: str |
4486
|
|
|
|
4487
|
|
|
>>> roger_root('Christopher') |
4488
|
|
|
'06401' |
4489
|
|
|
>>> roger_root('Niall') |
4490
|
|
|
'02500' |
4491
|
|
|
>>> roger_root('Smith') |
4492
|
|
|
'00310' |
4493
|
|
|
>>> roger_root('Schmidt') |
4494
|
|
|
'06310' |
4495
|
|
|
""" |
4496
|
|
|
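# Worked example (editor's trace of the tables below, not from the original
# reference): 'Schmidt' -> 'SCHMIDT' -> initial 'SCH' codes to '06', then
# M -> '3', I -> '*', D -> '1', T -> '1', giving '063*11'; collapsing repeats
# and dropping '*' leaves '0631', zero-padded to '06310' as in the doctest.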
# uppercase, normalize, decompose, and filter non-A-Z out |
4497
|
|
|
word = unicodedata.normalize('NFKD', text_type(word.upper())) |
4498
|
|
|
word = word.replace('ß', 'SS') |
4499
|
|
|
word = ''.join(c for c in word if c in |
4500
|
|
|
{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
4501
|
|
|
'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
4502
|
|
|
'Y', 'Z'}) |
4503
|
|
|
|
4504
|
|
|
if not word: |
4505
|
|
|
return '' |
4506
|
|
|
|
4507
|
|
|
# '*' is used to prevent combining by _delete_consecutive_repeats() |
    _init_patterns = {4: {'TSCH': '06'},
                      3: {'TSH': '06', 'SCH': '06'},
                      2: {'CE': '0*0', 'CH': '06', 'CI': '0*0', 'CY': '0*0',
                          'DG': '07', 'GF': '08', 'GM': '03', 'GN': '02',
                          'KN': '02', 'PF': '08', 'PH': '08', 'PN': '02',
                          'SH': '06', 'TS': '0*0', 'WR': '04'},
                      1: {'A': '1', 'B': '09', 'C': '07', 'D': '01', 'E': '1',
                          'F': '08', 'G': '07', 'H': '2', 'I': '1', 'J': '3',
                          'K': '07', 'L': '05', 'M': '03', 'N': '02', 'O': '1',
                          'P': '09', 'Q': '07', 'R': '04', 'S': '0*0',
                          'T': '01', 'U': '1', 'V': '08', 'W': '4', 'X': '07',
                          'Y': '5', 'Z': '0*0'}}

    _med_patterns = {4: {'TSCH': '6'},
                     3: {'TSH': '6', 'SCH': '6'},
                     2: {'CE': '0', 'CH': '6', 'CI': '0', 'CY': '0', 'DG': '7',
                         'PH': '8', 'SH': '6', 'TS': '0'},
                     1: {'B': '9', 'C': '7', 'D': '1', 'F': '8', 'G': '7',
                         'J': '6', 'K': '7', 'L': '5', 'M': '3', 'N': '2',
                         'P': '9', 'Q': '7', 'R': '4', 'S': '0', 'T': '1',
                         'V': '8', 'X': '7', 'Z': '0',
                         'A': '*', 'E': '*', 'H': '*', 'I': '*', 'O': '*',
                         'U': '*', 'W': '*', 'Y': '*'}}

    code = ''
    pos = 0

    # Do first digit(s) first
    for n in range(4, 0, -1):
        if word[:n] in _init_patterns[n]:
            code = _init_patterns[n][word[:n]]
            pos += n
            break
    else:
        pos += 1  # Advance if nothing is recognized

    # Then code subsequent digits
    while pos < len(word):
        for n in range(4, 0, -1):
            if word[pos:pos+n] in _med_patterns[n]:
                code += _med_patterns[n][word[pos:pos+n]]
                pos += n
                break
        else:
            pos += 1  # Advance if nothing is recognized

    code = _delete_consecutive_repeats(code)
    code = code.replace('*', '')

    if zero_pad:
        code += '0'*maxlength

    return code[:maxlength]


def onca(word, maxlength=4, zero_pad=True):
    """Return the Oxford Name Compression Algorithm (ONCA) code for a word.

    This is the Oxford Name Compression Algorithm, based on:
    Gill, Leicester E. 1997. "OX-LINK: The Oxford Medical Record Linkage
    System." In ``Record Linkage Techniques -- 1997``. Arlington, VA. March
    20--21, 1997.
    https://nces.ed.gov/FCSM/pdf/RLT97.pdf

    I can find no complete description of the "anglicised version of the
    NYSIIS method" identified as the first step in this algorithm, so this
    is likely not a correct implementation, in that it employs the standard
    NYSIIS algorithm.

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 4) of the code to return
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the ONCA code
    :rtype: str

    >>> onca('Christopher')
    'C623'
    >>> onca('Niall')
    'N400'
    >>> onca('Smith')
    'S530'
    >>> onca('Schmidt')
    'S530'
    """
    # In the most extreme case, 3 characters of NYSIIS input can be compressed
    # to one character of output, so give it triple the maxlength.
    return soundex(nysiis(word, maxlength=maxlength*3), maxlength,
                   zero_pad=zero_pad)


def eudex(word):
    """Return the eudex hash of a word.

    :param str word: the word to transform
    :returns: the eudex hash
    :rtype: int
    """
    _trailing_phones = {
        'a': 0,  # a
        'b': 0b01001000,  # b
        'c': 0b00001100,  # c
        'd': 0b00011000,  # d
        'e': 0,  # e
        'f': 0b01000100,  # f
        'g': 0b00001000,  # g
        'h': 0b00000100,  # h
        'i': 1,  # i
        'j': 0b00000101,  # j
        'k': 0b00001001,  # k
        'l': 0b10100000,  # l
        'm': 0b00000010,  # m
        'n': 0b00010010,  # n
        'o': 0,  # o
        'p': 0b01001001,  # p
        'q': 0b10101000,  # q
        'r': 0b10100001,  # r
        's': 0b00010100,  # s
        't': 0b00011101,  # t
        'u': 1,  # u
        'v': 0b01000101,  # v
        'w': 0b00000000,  # w
        'x': 0b10000100,  # x
        'y': 1,  # y
        'z': 0b10010100,  # z

        'ß': 0b00010101,  # ß
        'à': 0,  # à
        'á': 0,  # á
        'â': 0,  # â
        'ã': 0,  # ã
        'ä': 0,  # ä[æ]
        'å': 1,  # å[oː]
        'æ': 0,  # æ[æ]
        'ç': 0b10010101,  # ç[t͡ʃ]
        'è': 1,  # è
        'é': 1,  # é
        'ê': 1,  # ê
        'ë': 1,  # ë
        'ì': 1,  # ì
        'í': 1,  # í
        'î': 1,  # î
        'ï': 1,  # ï
        'ð': 0b00010101,  # ð[ð̠](represented as a non-plosive T)
        'ñ': 0b00010111,  # ñ[nj](represented as a combination of n and j)
        'ò': 0,  # ò
        'ó': 0,  # ó
        'ô': 0,  # ô
        'õ': 0,  # õ
        'ö': 1,  # ö[ø]
        '÷': 0b11111111,  # ÷
        'ø': 1,  # ø[ø]
        'ù': 1,  # ù
        'ú': 1,  # ú
        'û': 1,  # û
        'ü': 1,  # ü
        'ý': 1,  # ý
        'þ': 0b00010101,  # þ[ð̠](represented as a non-plosive T)
        'ÿ': 1,  # ÿ
    }

    _initial_phones = {
        'a': 0b10000100,  # a*
        'b': 0b00100100,  # b
        'c': 0b00000110,  # c
        'd': 0b00001100,  # d
        'e': 0b11011000,  # e*
        'f': 0b00100010,  # f
        'g': 0b00000100,  # g
        'h': 0b00000010,  # h
        'i': 0b11111000,  # i*
        'j': 0b00000011,  # j
        'k': 0b00000101,  # k
        'l': 0b01010000,  # l
        'm': 0b00000001,  # m
        'n': 0b00001001,  # n
        'o': 0b10010100,  # o*
        'p': 0b00100101,  # p
        'q': 0b01010100,  # q
        'r': 0b01010001,  # r
        's': 0b00001010,  # s
        't': 0b00001110,  # t
        'u': 0b11100000,  # u*
        'v': 0b00100011,  # v
        'w': 0b00000000,  # w
        'x': 0b01000010,  # x
        'y': 0b11100100,  # y*
        'z': 0b01001010,  # z

        'ß': 0b00001011,  # ß
        'à': 0b10000101,  # à
        'á': 0b10000101,  # á
        'â': 0b10000000,  # â
        'ã': 0b10000110,  # ã
        'ä': 0b10100110,  # ä [æ]
        'å': 0b11000010,  # å [oː]
        'æ': 0b10100111,  # æ [æ]
        'ç': 0b01010100,  # ç [t͡ʃ]
        'è': 0b11011001,  # è
        'é': 0b11011001,  # é
        'ê': 0b11011001,  # ê
        'ë': 0b11000110,  # ë [ə] or [œ]
        'ì': 0b11111001,  # ì
        'í': 0b11111001,  # í
        'î': 0b11111001,  # î
        'ï': 0b11111001,  # ï
        'ð': 0b00001011,  # ð [ð̠] (represented as a non-plosive T)
        'ñ': 0b00001011,  # ñ [nj] (represented as a combination of n and j)
        'ò': 0b10010101,  # ò
        'ó': 0b10010101,  # ó
        'ô': 0b10010101,  # ô
        'õ': 0b10010101,  # õ
        'ö': 0b11011100,  # ö [œ] or [ø]
        '÷': 0b11111111,  # ÷
        'ø': 0b11011101,  # ø [œ] or [ø]
        'ù': 0b11100001,  # ù
        'ú': 0b11100001,  # ú
        'û': 0b11100001,  # û
        'ü': 0b11100101,  # ü
        'ý': 0b11100101,  # ý
        'þ': 0b00001011,  # þ [ð̠] (represented as a non-plosive T)
        'ÿ': 0b11100101,  # ÿ
    }

    # Lowercase input & filter unknown characters
    word = ''.join(char for char in word.lower() if char in _initial_phones)

    # Perform initial eudex coding of each character
    values = [_initial_phones[word[0]]]
    values += [_trailing_phones[char] for char in word[1:]]

    # Right-shift by one to determine if second instance should be skipped
    shifted_values = [_ >> 1 for _ in values]
    condensed_values = [values[0]]
    for n in range(1, len(shifted_values)):
        if shifted_values[n] != shifted_values[n-1]:
            condensed_values.append(values[n])

    # Add padding after first character & trim beyond 8
    values = ([condensed_values[0]] + [0]*max(0, 8 - len(condensed_values)) +
              condensed_values[1:8])
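    # values now always holds exactly eight entries (the initial code plus
    # seven padded/trailing codes), so the hash packed below fits in 64 bits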
4748
|
|
|
|
4749
|
|
|
# Combine individual character values into eudex hash |
4750
|
|
|
hash_value = 0 |
4751
|
|
|
for val in values: |
4752
|
|
|
hash_value = (hash_value << 8) | val |
4753
|
|
|
|
4754
|
|
|
return hash_value |
4755
|
|
|
|
4756
|
|
|
|
4757
|
|
|
def bmpm(word, language_arg=0, name_mode='gen', match_mode='approx', |
|
|
|
|
4758
|
|
|
concat=False, filter_langs=False): |
4759
|
|
|
"""Return the Beider-Morse Phonetic Matching algorithm code for a word. |
4760
|
|
|
|
4761
|
|
|
The Beider-Morse Phonetic Matching algorithm is described at: |
4762
|
|
|
http://stevemorse.org/phonetics/bmpm.htm |
4763
|
|
|
The reference implementation is licensed under GPLv3 and available at: |
4764
|
|
|
http://stevemorse.org/phoneticinfo.htm |
4765
|
|
|
|
4766
|
|
|
:param str word: the word to transform |
4767
|
|
|
:param str language_arg: the language of the term; supported values |
4768
|
|
|
include: |
4769
|
|
|
|
4770
|
|
|
- 'any' |
4771
|
|
|
- 'arabic' |
4772
|
|
|
- 'cyrillic' |
4773
|
|
|
- 'czech' |
4774
|
|
|
- 'dutch' |
4775
|
|
|
- 'english' |
4776
|
|
|
- 'french' |
4777
|
|
|
- 'german' |
4778
|
|
|
- 'greek' |
4779
|
|
|
- 'greeklatin' |
4780
|
|
|
- 'hebrew' |
4781
|
|
|
- 'hungarian' |
4782
|
|
|
- 'italian' |
4783
|
|
|
- 'polish' |
4784
|
|
|
- 'portuguese' |
4785
|
|
|
- 'romanian' |
4786
|
|
|
- 'russian' |
4787
|
|
|
- 'spanish' |
4788
|
|
|
- 'turkish' |
4789
|
|
|
- 'germandjsg' |
4790
|
|
|
- 'polishdjskp' |
4791
|
|
|
- 'russiandjsre' |
4792
|
|
|
|
4793
|
|
|
:param str name_mode: the name mode of the algorithm: |
4794
|
|
|
|
4795
|
|
|
- 'gen' -- general (default) |
4796
|
|
|
- 'ash' -- Ashkenazi |
4797
|
|
|
- 'sep' -- Sephardic |
4798
|
|
|
|
4799
|
|
|
:param str match_mode: matching mode: 'approx' or 'exact' |
4800
|
|
|
:param bool concat: concatenation mode |
4801
|
|
|
:param bool filter_langs: filter out incompatible languages |
4802
|
|
|
:returns: the BMPM value(s) |
4803
|
|
|
:rtype: tuple |
4804
|
|
|
|
4805
|
|
|
>>> bmpm('Christopher') |
4806
|
|
|
'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir |
4807
|
|
|
xristYfir xristopi xritopir xritopi xristofi xritofir xritofi tzristopir |
4808
|
|
|
tzristofir zristopir zristopi zritopir zritopi zristofir zristofi zritofir |
4809
|
|
|
zritofi' |
4810
|
|
|
>>> bmpm('Niall') |
4811
|
|
|
'nial niol' |
4812
|
|
|
>>> bmpm('Smith') |
4813
|
|
|
'zmit' |
4814
|
|
|
>>> bmpm('Schmidt') |
4815
|
|
|
'zmit stzmit' |
4816
|
|
|
|
4817
|
|
|
>>> bmpm('Christopher', language_arg='German') |
4818
|
|
|
'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir |
4819
|
|
|
xristYfir' |
4820
|
|
|
>>> bmpm('Christopher', language_arg='English') |
4821
|
|
|
'tzristofir tzrQstofir tzristafir tzrQstafir xristofir xrQstofir xristafir |
4822
|
|
|
xrQstafir' |
4823
|
|
|
>>> bmpm('Christopher', language_arg='German', name_mode='ash') |
4824
|
|
|
'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir |
4825
|
|
|
xristYfir' |
4826
|
|
|
|
4827
|
|
|
>>> bmpm('Christopher', language_arg='German', match_mode='exact') |
4828
|
|
|
'xriStopher xriStofer xristopher xristofer' |
4829
|
|
|
""" |
4830
|
|
|
return _bmpm(word, language_arg, name_mode, match_mode, |
4831
|
|
|
concat, filter_langs) |
4832
|
|
|
|
4833
|
|
|
|
4834
|
|
|
if __name__ == '__main__': |
4835
|
|
|
import doctest |
4836
|
|
|
doctest.testmod() |
4837
|
|
|
|