# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic.

The phonetic module implements phonetic algorithms including:

    - Robert C. Russell's Index
    - American Soundex
    - Refined Soundex
    - Daitch-Mokotoff Soundex
    - Kölner Phonetik
    - NYSIIS
    - Match Rating Algorithm
    - Metaphone
    - Double Metaphone
    - Caverphone
    - Alpha Search Inquiry System
    - Fuzzy Soundex
    - Phonex
    - Phonem
    - Phonix
    - SfinxBis
    - phonet
    - Standardized Phonetic Frequency Code
    - Statistics Canada
    - Lein
    - Roger Root
    - Oxford Name Compression Algorithm (ONCA)
    - Eudex phonetic hash
    - Haase Phonetik
    - Reth-Schek Phonetik
    - FONEM
    - Parmar-Kumbharana
    - Davidson's Consonant Code
    - SoundD
    - PSHP Soundex/Viewex Coding
    - an early version of Henry Code
    - Norphone
    - Dolby Code
    - Phonetic Spanish
    - Spanish Metaphone
    - MetaSoundex
    - Beider-Morse Phonetic Matching
"""

from __future__ import division, unicode_literals

from collections import Counter
from itertools import groupby, product
from re import compile as re_compile
from unicodedata import normalize

from six import text_type
from six.moves import range

from ._bm import _bmpm

_INFINITY = float('inf')


def _delete_consecutive_repeats(word):
    """Delete consecutive repeated characters in a word.

    :param str word: the word to transform
    :returns: word with consecutive repeating characters collapsed to
        a single instance
    :rtype: str
    """
    return ''.join(char for char, _ in groupby(word))
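# Illustrative example (not part of the original source): since groupby
# collapses runs of identical characters,
# _delete_consecutive_repeats('aabbbcca') returns 'abca'.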


def russell_index(word):
    """Return the Russell Index (integer output) of a word.

    This follows Robert C. Russell's Index algorithm, as described in
    :cite:`Russell:1917`.

    :param str word: the word to transform
    :returns: the Russell Index value
    :rtype: int

    >>> russell_index('Christopher')
    3813428
    >>> russell_index('Niall')
    715
    >>> russell_index('Smith')
    3614
    >>> russell_index('Schmidt')
    3614
    """
    _russell_translation = dict(zip((ord(_) for _ in
                                     'ABCDEFGIKLMNOPQRSTUVXYZ'),
                                    '12341231356712383412313'))

    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = word.replace('GH', '')  # discard gh (rule 3)
    word = word.rstrip('SZ')  # discard /[sz]$/ (rule 3)

    # translate according to Russell's mapping
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'I', 'K', 'L', 'M', 'N',
                    'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'X', 'Y', 'Z'})
    sdx = word.translate(_russell_translation)

    # remove any 1s after the first occurrence
    one = sdx.find('1')+1
    if one:
        sdx = sdx[:one] + ''.join(c for c in sdx[one:] if c != '1')

    # remove repeating characters
    sdx = _delete_consecutive_repeats(sdx)

    # return as an int
    return int(sdx) if sdx else float('NaN')


def russell_index_num_to_alpha(num):
    """Convert the Russell Index integer to an alphabetic string.

    This follows Robert C. Russell's Index algorithm, as described in
    :cite:`Russell:1917`.

    :param int num: a Russell Index integer value
    :returns: the Russell Index as an alphabetic string
    :rtype: str

    >>> russell_index_num_to_alpha(3813428)
    'CRACDBR'
    >>> russell_index_num_to_alpha(715)
    'NAL'
    >>> russell_index_num_to_alpha(3614)
    'CMAD'
    """
    _russell_num_translation = dict(zip((ord(_) for _ in '12345678'),
                                        'ABCDLMNR'))
    num = ''.join(c for c in text_type(num) if c in {'1', '2', '3', '4', '5',
                                                     '6', '7', '8'})
    if num:
        return num.translate(_russell_num_translation)
    return ''


def russell_index_alpha(word):
    """Return the Russell Index (alphabetic output) for the word.

    This follows Robert C. Russell's Index algorithm, as described in
    :cite:`Russell:1917`.

    :param str word: the word to transform
    :returns: the Russell Index value as an alphabetic string
    :rtype: str

    >>> russell_index_alpha('Christopher')
    'CRACDBR'
    >>> russell_index_alpha('Niall')
    'NAL'
    >>> russell_index_alpha('Smith')
    'CMAD'
    >>> russell_index_alpha('Schmidt')
    'CMAD'
    """
    if word:
        return russell_index_num_to_alpha(russell_index(word))
    return ''


def soundex(word, maxlength=4, var='American', reverse=False, zero_pad=True):
    """Return the Soundex code for a word.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 4)
    :param str var: the variant of the algorithm to employ (defaults to
        'American'):

        - 'American' follows the American Soundex algorithm, as described at
          :cite:`US:2007` and in :cite:`Knuth:1998`; this is also called
          Miracode
        - 'special' follows the rules from the 1880-1910 US Census
          retrospective re-analysis, in which h & w are not treated as blocking
          consonants but as vowels. Cf. :cite:`Repici:2013`.
        - 'Census' follows the rules laid out in GIL 55 :cite:`US:1997` by the
          US Census, including coding prefixed and unprefixed versions of some
          names

    :param bool reverse: reverse the word before computing the selected Soundex
        (defaults to False); This results in "Reverse Soundex"
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Soundex value
    :rtype: str

    >>> soundex("Christopher")
    'C623'
    >>> soundex("Niall")
    'N400'
    >>> soundex('Smith')
    'S530'
    >>> soundex('Schmidt')
    'S530'


    >>> soundex('Christopher', maxlength=_INFINITY)
    'C623160000000000000000000000000000000000000000000000000000000000'
    >>> soundex('Christopher', maxlength=_INFINITY, zero_pad=False)
    'C62316'

    >>> soundex('Christopher', reverse=True)
    'R132'

    >>> soundex('Ashcroft')
    'A261'
    >>> soundex('Asicroft')
    'A226'
    >>> soundex('Ashcroft', var='special')
    'A226'
    >>> soundex('Asicroft', var='special')
    'A226'
    """
    _soundex_translation = dict(zip((ord(_) for _ in
                                     'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                    '01230129022455012623019202'))

    # Require a maxlength of at least 4 and not more than 64
    if maxlength is not None:
        maxlength = min(max(4, maxlength), 64)
    else:
        maxlength = 64

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    if var == 'Census':
        # Should these prefixes be supplemented? (VANDE, DELA, VON)
        if word[:3] in {'VAN', 'CON'} and len(word) > 4:
            return (soundex(word, maxlength, 'American', reverse, zero_pad),
                    soundex(word[3:], maxlength, 'American', reverse,
                            zero_pad))
        if word[:2] in {'DE', 'DI', 'LA', 'LE'} and len(word) > 3:
            return (soundex(word, maxlength, 'American', reverse, zero_pad),
                    soundex(word[2:], maxlength, 'American', reverse,
                            zero_pad))
        # Otherwise, proceed as usual (var='American' mode, ostensibly)

    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    # Nothing to convert, return base case
    if not word:
        if zero_pad:
            return '0'*maxlength
        return '0'

    # Reverse word if computing Reverse Soundex
    if reverse:
        word = word[::-1]

    # apply the Soundex algorithm
    sdx = word.translate(_soundex_translation)

    if var == 'special':
        sdx = sdx.replace('9', '0')  # special rule for 1880-1910 census
    else:
        sdx = sdx.replace('9', '')  # rule 1
    sdx = _delete_consecutive_repeats(sdx)  # rule 3

    if word[0] in 'HW':
        sdx = word[0] + sdx
    else:
        sdx = word[0] + sdx[1:]
    sdx = sdx.replace('0', '')  # rule 1

    if zero_pad:
        sdx += ('0'*maxlength)  # rule 4

    return sdx[:maxlength]
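# Illustrative note (not part of the original source): with var='Census',
# names carrying a recognized prefix (VAN, CON, DE, DI, LA, LE) are returned
# as a tuple of two codes -- one for the full name and one for the name with
# the prefix stripped -- e.g. soundex('VanDeusen', var='Census') is expected
# to give ('V532', 'D250'); all other inputs return a single code string.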


def refined_soundex(word, maxlength=_INFINITY, reverse=False, zero_pad=False,
                    retain_vowels=False):
    """Return the Refined Soundex code for a word.

    This is Soundex, but with more character classes. It was defined at
    :cite:`Boyce:1998`.

    :param word: the word to transform
    :param maxlength: the length of the code returned (defaults to unlimited)
    :param reverse: reverse the word before computing the selected Soundex
        (defaults to False); This results in "Reverse Soundex"
    :param zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :param retain_vowels: retain vowels (as 0) in the resulting code
    :returns: the Refined Soundex value
    :rtype: str

    >>> refined_soundex('Christopher')
    'C393619'
    >>> refined_soundex('Niall')
    'N87'
    >>> refined_soundex('Smith')
    'S386'
    >>> refined_soundex('Schmidt')
    'S386'
    """
    _ref_soundex_translation = dict(zip((ord(_) for _ in
                                         'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                        '01360240043788015936020505'))

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    # Reverse word if computing Reverse Soundex
    if reverse:
        word = word[::-1]

    # apply the Soundex algorithm
    sdx = word[:1] + word.translate(_ref_soundex_translation)
    sdx = _delete_consecutive_repeats(sdx)
    if not retain_vowels:
        sdx = sdx.replace('0', '')  # Delete vowels, H, W, Y

    if maxlength < _INFINITY:
        if zero_pad:
            sdx += ('0' * maxlength)
        if maxlength:
            sdx = sdx[:maxlength]

    return sdx


def dm_soundex(word, maxlength=6, reverse=False, zero_pad=True):
    """Return the Daitch-Mokotoff Soundex code for a word.

    Based on Daitch-Mokotoff Soundex :cite:`Mokotoff:1997`, this returns values
    of a word as a set. A collection is necessary since there can be multiple
    values for a single word.

    :param word: the word to transform
    :param maxlength: the length of the code returned (defaults to 6)
    :param reverse: reverse the word before computing the selected Soundex
        (defaults to False); This results in "Reverse Soundex"
    :param zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Daitch-Mokotoff Soundex value
    :rtype: str

    >>> sorted(dm_soundex('Christopher'))
    ['494379', '594379']
    >>> dm_soundex('Niall')
    {'680000'}
    >>> dm_soundex('Smith')
    {'463000'}
    >>> dm_soundex('Schmidt')
    {'463000'}

    >>> sorted(dm_soundex('The quick brown fox', maxlength=20, zero_pad=False))
    ['35457976754', '3557976754']
    """
    _dms_table = {'STCH': (2, 4, 4), 'DRZ': (4, 4, 4), 'ZH': (4, 4, 4),
                  'ZHDZH': (2, 4, 4), 'DZH': (4, 4, 4), 'DRS': (4, 4, 4),
                  'DZS': (4, 4, 4), 'SCHTCH': (2, 4, 4), 'SHTSH': (2, 4, 4),
                  'SZCZ': (2, 4, 4), 'TZS': (4, 4, 4), 'SZCS': (2, 4, 4),
                  'STSH': (2, 4, 4), 'SHCH': (2, 4, 4), 'D': (3, 3, 3),
                  'H': (5, 5, '_'), 'TTSCH': (4, 4, 4), 'THS': (4, 4, 4),
                  'L': (8, 8, 8), 'P': (7, 7, 7), 'CHS': (5, 54, 54),
                  'T': (3, 3, 3), 'X': (5, 54, 54), 'OJ': (0, 1, '_'),
                  'OI': (0, 1, '_'), 'SCHTSH': (2, 4, 4), 'OY': (0, 1, '_'),
                  'Y': (1, '_', '_'), 'TSH': (4, 4, 4), 'ZDZ': (2, 4, 4),
                  'TSZ': (4, 4, 4), 'SHT': (2, 43, 43), 'SCHTSCH': (2, 4, 4),
                  'TTSZ': (4, 4, 4), 'TTZ': (4, 4, 4), 'SCH': (4, 4, 4),
                  'TTS': (4, 4, 4), 'SZD': (2, 43, 43), 'AI': (0, 1, '_'),
                  'PF': (7, 7, 7), 'TCH': (4, 4, 4), 'PH': (7, 7, 7),
                  'TTCH': (4, 4, 4), 'SZT': (2, 43, 43), 'ZDZH': (2, 4, 4),
                  'EI': (0, 1, '_'), 'G': (5, 5, 5), 'EJ': (0, 1, '_'),
                  'ZD': (2, 43, 43), 'IU': (1, '_', '_'), 'K': (5, 5, 5),
                  'O': (0, '_', '_'), 'SHTCH': (2, 4, 4), 'S': (4, 4, 4),
                  'TRZ': (4, 4, 4), 'SHD': (2, 43, 43), 'DSH': (4, 4, 4),
                  'CSZ': (4, 4, 4), 'EU': (1, 1, '_'), 'TRS': (4, 4, 4),
                  'ZS': (4, 4, 4), 'STRZ': (2, 4, 4), 'UY': (0, 1, '_'),
                  'STRS': (2, 4, 4), 'CZS': (4, 4, 4),
                  'MN': ('6_6', '6_6', '6_6'), 'UI': (0, 1, '_'),
                  'UJ': (0, 1, '_'), 'UE': (0, '_', '_'), 'EY': (0, 1, '_'),
                  'W': (7, 7, 7), 'IA': (1, '_', '_'), 'FB': (7, 7, 7),
                  'STSCH': (2, 4, 4), 'SCHT': (2, 43, 43),
                  'NM': ('6_6', '6_6', '6_6'), 'SCHD': (2, 43, 43),
                  'B': (7, 7, 7), 'DSZ': (4, 4, 4), 'F': (7, 7, 7),
                  'N': (6, 6, 6), 'CZ': (4, 4, 4), 'R': (9, 9, 9),
                  'U': (0, '_', '_'), 'V': (7, 7, 7), 'CS': (4, 4, 4),
                  'Z': (4, 4, 4), 'SZ': (4, 4, 4), 'TSCH': (4, 4, 4),
                  'KH': (5, 5, 5), 'ST': (2, 43, 43), 'KS': (5, 54, 54),
                  'SH': (4, 4, 4), 'SC': (2, 4, 4), 'SD': (2, 43, 43),
                  'DZ': (4, 4, 4), 'ZHD': (2, 43, 43), 'DT': (3, 3, 3),
                  'ZSH': (4, 4, 4), 'DS': (4, 4, 4), 'TZ': (4, 4, 4),
                  'TS': (4, 4, 4), 'TH': (3, 3, 3), 'TC': (4, 4, 4),
                  'A': (0, '_', '_'), 'E': (0, '_', '_'), 'I': (0, '_', '_'),
                  'AJ': (0, 1, '_'), 'M': (6, 6, 6), 'Q': (5, 5, 5),
                  'AU': (0, 7, '_'), 'IO': (1, '_', '_'), 'AY': (0, 1, '_'),
                  'IE': (1, '_', '_'), 'ZSCH': (4, 4, 4),
                  'CH': ((5, 4), (5, 4), (5, 4)),
                  'CK': ((5, 45), (5, 45), (5, 45)),
                  'C': ((5, 4), (5, 4), (5, 4)),
                  'J': ((1, 4), ('_', 4), ('_', 4)),
                  'RZ': ((94, 4), (94, 4), (94, 4)),
                  'RS': ((94, 4), (94, 4), (94, 4))}

    _dms_order = {'A': ('AI', 'AJ', 'AU', 'AY', 'A'),
                  'B': ('B'),
                  'C': ('CHS', 'CSZ', 'CZS', 'CH', 'CK', 'CS', 'CZ', 'C'),
                  'D': ('DRS', 'DRZ', 'DSH', 'DSZ', 'DZH', 'DZS', 'DS', 'DT',
                        'DZ', 'D'),
                  'E': ('EI', 'EJ', 'EU', 'EY', 'E'),
                  'F': ('FB', 'F'),
                  'G': ('G'),
                  'H': ('H'),
                  'I': ('IA', 'IE', 'IO', 'IU', 'I'),
                  'J': ('J'),
                  'K': ('KH', 'KS', 'K'),
                  'L': ('L'),
                  'M': ('MN', 'M'),
                  'N': ('NM', 'N'),
                  'O': ('OI', 'OJ', 'OY', 'O'),
                  'P': ('PF', 'PH', 'P'),
                  'Q': ('Q'),
                  'R': ('RS', 'RZ', 'R'),
                  'S': ('SCHTSCH', 'SCHTCH', 'SCHTSH', 'SHTCH', 'SHTSH',
                        'STSCH', 'SCHD', 'SCHT', 'SHCH', 'STCH', 'STRS',
                        'STRZ', 'STSH', 'SZCS', 'SZCZ', 'SCH', 'SHD', 'SHT',
                        'SZD', 'SZT', 'SC', 'SD', 'SH', 'ST', 'SZ', 'S'),
                  'T': ('TTSCH', 'TSCH', 'TTCH', 'TTSZ', 'TCH', 'THS', 'TRS',
                        'TRZ', 'TSH', 'TSZ', 'TTS', 'TTZ', 'TZS', 'TC', 'TH',
                        'TS', 'TZ', 'T'),
                  'U': ('UE', 'UI', 'UJ', 'UY', 'U'),
                  'V': ('V'),
                  'W': ('W'),
                  'X': ('X'),
                  'Y': ('Y'),
                  'Z': ('ZHDZH', 'ZDZH', 'ZSCH', 'ZDZ', 'ZHD', 'ZSH', 'ZD',
                        'ZH', 'ZS', 'Z')}

    _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}
    dms = ['']  # initialize empty code list

    # Require a maxlength of at least 6 and not more than 64
    if maxlength is not None:
        maxlength = min(max(6, maxlength), 64)
    else:
        maxlength = 64

    # uppercase, normalize, decompose, and filter non-A-Z
    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    # Nothing to convert, return base case
    if not word:
        if zero_pad:
            return {'0'*maxlength}
        return {'0'}

    # Reverse word if computing Reverse Soundex
    if reverse:
        word = word[::-1]

    pos = 0
    while pos < len(word):
        # Iterate through _dms_order, which specifies the possible substrings
        # for which codes exist in the Daitch-Mokotoff coding
        for sstr in _dms_order[word[pos]]:  # pragma: no branch
            if word[pos:].startswith(sstr):
                # Having determined a valid substring start, retrieve the code
                dm_val = _dms_table[sstr]

                # Having retrieved the code (triple), determine the correct
                # positional variant (first, pre-vocalic, elsewhere)
                if pos == 0:
                    dm_val = dm_val[0]
                elif (pos+len(sstr) < len(word) and
                      word[pos+len(sstr)] in _vowels):
                    dm_val = dm_val[1]
                else:
                    dm_val = dm_val[2]

                # Build the code strings
                if isinstance(dm_val, tuple):
                    dms = [_ + text_type(dm_val[0]) for _ in dms] \
                        + [_ + text_type(dm_val[1]) for _ in dms]
                else:
                    dms = [_ + text_type(dm_val) for _ in dms]
                pos += len(sstr)
                break

    # Filter out double letters and _ placeholders
    dms = (''.join(c for c in _delete_consecutive_repeats(_) if c != '_')
           for _ in dms)

    # Trim codes and return set
    if zero_pad:
        dms = ((_ + ('0'*maxlength))[:maxlength] for _ in dms)
    else:
        dms = (_[:maxlength] for _ in dms)
    return set(dms)


def koelner_phonetik(word):
    """Return the Kölner Phonetik (numeric output) code for a word.

    Based on the algorithm defined by :cite:`Postel:1969`.

    While the output code is numeric, it is still a str because 0s can lead
    the code.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as a numeric string
    :rtype: str

    >>> koelner_phonetik('Christopher')
    '478237'
    >>> koelner_phonetik('Niall')
    '65'
    >>> koelner_phonetik('Smith')
    '862'
    >>> koelner_phonetik('Schmidt')
    '862'
    >>> koelner_phonetik('Müller')
    '657'
    >>> koelner_phonetik('Zimmermann')
    '86766'
    """
    # pylint: disable=too-many-branches
    def _after(word, i, letters):
        """Return True if word[i] follows one of the supplied letters."""
        if i > 0 and word[i-1] in letters:
            return True
        return False

    def _before(word, i, letters):
        """Return True if word[i] precedes one of the supplied letters."""
        if i+1 < len(word) and word[i+1] in letters:
            return True
        return False

    _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}

    sdx = ''

    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    word = word.replace('Ä', 'AE')
    word = word.replace('Ö', 'OE')
    word = word.replace('Ü', 'UE')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    # Nothing to convert, return base case
    if not word:
        return sdx

    for i in range(len(word)):
        if word[i] in _vowels:
            sdx += '0'
        elif word[i] == 'B':
            sdx += '1'
        elif word[i] == 'P':
            if _before(word, i, {'H'}):
                sdx += '3'
            else:
                sdx += '1'
        elif word[i] in {'D', 'T'}:
            if _before(word, i, {'C', 'S', 'Z'}):
                sdx += '8'
            else:
                sdx += '2'
        elif word[i] in {'F', 'V', 'W'}:
            sdx += '3'
        elif word[i] in {'G', 'K', 'Q'}:
            sdx += '4'
        elif word[i] == 'C':
            if _after(word, i, {'S', 'Z'}):
                sdx += '8'
            elif i == 0:
                if _before(word, i, {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U',
                                     'X'}):
                    sdx += '4'
                else:
                    sdx += '8'
            elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}):
                sdx += '4'
            else:
                sdx += '8'
        elif word[i] == 'X':
            if _after(word, i, {'C', 'K', 'Q'}):
                sdx += '8'
            else:
                sdx += '48'
        elif word[i] == 'L':
            sdx += '5'
        elif word[i] in {'M', 'N'}:
            sdx += '6'
        elif word[i] == 'R':
            sdx += '7'
        elif word[i] in {'S', 'Z'}:
            sdx += '8'

    sdx = _delete_consecutive_repeats(sdx)

    if sdx:
        sdx = sdx[:1] + sdx[1:].replace('0', '')

    return sdx


def koelner_phonetik_num_to_alpha(num):
    """Convert a Kölner Phonetik code from numeric to alphabetic.

    :param str num: a numeric Kölner Phonetik representation
    :returns: an alphabetic representation of the same word
    :rtype: str

    >>> koelner_phonetik_num_to_alpha(862)
    'SNT'
    >>> koelner_phonetik_num_to_alpha(657)
    'NLR'
    >>> koelner_phonetik_num_to_alpha(86766)
    'SNRNN'
    """
    _koelner_num_translation = dict(zip((ord(_) for _ in '012345678'),
                                        'APTFKLNRS'))
    num = ''.join(c for c in text_type(num) if c in {'0', '1', '2', '3', '4',
                                                     '5', '6', '7', '8'})
    return num.translate(_koelner_num_translation)


def koelner_phonetik_alpha(word):
    """Return the Kölner Phonetik (alphabetic output) code for a word.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as an alphabetic string
    :rtype: str

    >>> koelner_phonetik_alpha('Smith')
    'SNT'
    >>> koelner_phonetik_alpha('Schmidt')
    'SNT'
    >>> koelner_phonetik_alpha('Müller')
    'NLR'
    >>> koelner_phonetik_alpha('Zimmermann')
    'SNRNN'
    """
    return koelner_phonetik_num_to_alpha(koelner_phonetik(word))


def nysiis(word, maxlength=6, modified=False):
    """Return the NYSIIS code for a word.

    The New York State Identification and Intelligence System algorithm is
    defined in :cite:`Taft:1970`.

    The modified version of this algorithm is described in Appendix B of
    :cite:`Lynch:1977`.

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 6) of the code to return
    :param bool modified: indicates whether to use USDA modified NYSIIS
    :returns: the NYSIIS value
    :rtype: str

    >>> nysiis('Christopher')
    'CRASTA'
    >>> nysiis('Niall')
    'NAL'
    >>> nysiis('Smith')
    'SNAT'
    >>> nysiis('Schmidt')
    'SNAD'

    >>> nysiis('Christopher', maxlength=_INFINITY)
    'CRASTAFAR'

    >>> nysiis('Christopher', maxlength=8, modified=True)
    'CRASTAFA'
    >>> nysiis('Niall', maxlength=8, modified=True)
    'NAL'
    >>> nysiis('Smith', maxlength=8, modified=True)
    'SNAT'
    >>> nysiis('Schmidt', maxlength=8, modified=True)
    'SNAD'
    """
    # Require a maxlength of at least 6
    if maxlength:
        maxlength = max(6, maxlength)

    _vowels = {'A', 'E', 'I', 'O', 'U'}

    word = ''.join(c for c in word.upper() if c.isalpha())
    word = word.replace('ß', 'SS')

    # exit early if there are no alphas
    if not word:
        return ''

    if modified:
        original_first_char = word[0]

    if word[:3] == 'MAC':
        word = 'MCC'+word[3:]
    elif word[:2] == 'KN':
        word = 'NN'+word[2:]
    elif word[:1] == 'K':
        word = 'C'+word[1:]
    elif word[:2] in {'PH', 'PF'}:
        word = 'FF'+word[2:]
    elif word[:3] == 'SCH':
        word = 'SSS'+word[3:]
    elif modified:
        if word[:2] == 'WR':
            word = 'RR'+word[2:]
        elif word[:2] == 'RH':
            word = 'RR'+word[2:]
        elif word[:2] == 'DG':
            word = 'GG'+word[2:]
        elif word[:1] in _vowels:
            word = 'A'+word[1:]

    if modified and word[-1:] in {'S', 'Z'}:
        word = word[:-1]

    if word[-2:] == 'EE' or word[-2:] == 'IE' or (modified and
                                                  word[-2:] == 'YE'):
        word = word[:-2]+'Y'
    elif word[-2:] in {'DT', 'RT', 'RD'}:
        word = word[:-2]+'D'
    elif word[-2:] in {'NT', 'ND'}:
        word = word[:-2]+('N' if modified else 'D')
    elif modified:
        if word[-2:] == 'IX':
            word = word[:-2]+'ICK'
        elif word[-2:] == 'EX':
            word = word[:-2]+'ECK'
        elif word[-2:] in {'JR', 'SR'}:
            return 'ERROR'  # TODO: decide how best to return an error

    key = word[:1]

    skip = 0
    for i in range(1, len(word)):
        if i >= len(word):
            continue
        elif skip:
            skip -= 1
            continue
        elif word[i:i+2] == 'EV':
            word = word[:i] + 'AF' + word[i+2:]
            skip = 1
        elif word[i] in _vowels:
            word = word[:i] + 'A' + word[i+1:]
        elif modified and i != len(word)-1 and word[i] == 'Y':
            word = word[:i] + 'A' + word[i+1:]
        elif word[i] == 'Q':
            word = word[:i] + 'G' + word[i+1:]
        elif word[i] == 'Z':
            word = word[:i] + 'S' + word[i+1:]
        elif word[i] == 'M':
            word = word[:i] + 'N' + word[i+1:]
        elif word[i:i+2] == 'KN':
            word = word[:i] + 'N' + word[i+2:]
        elif word[i] == 'K':
            word = word[:i] + 'C' + word[i+1:]
        elif modified and i == len(word)-3 and word[i:i+3] == 'SCH':
            word = word[:i] + 'SSA'
            skip = 2
        elif word[i:i+3] == 'SCH':
            word = word[:i] + 'SSS' + word[i+3:]
            skip = 2
        elif modified and i == len(word)-2 and word[i:i+2] == 'SH':
            word = word[:i] + 'SA'
            skip = 1
        elif word[i:i+2] == 'SH':
            word = word[:i] + 'SS' + word[i+2:]
            skip = 1
        elif word[i:i+2] == 'PH':
            word = word[:i] + 'FF' + word[i+2:]
            skip = 1
        elif modified and word[i:i+3] == 'GHT':
            word = word[:i] + 'TTT' + word[i+3:]
            skip = 2
        elif modified and word[i:i+2] == 'DG':
            word = word[:i] + 'GG' + word[i+2:]
            skip = 1
        elif modified and word[i:i+2] == 'WR':
            word = word[:i] + 'RR' + word[i+2:]
            skip = 1
        elif word[i] == 'H' and (word[i-1] not in _vowels or
                                 word[i+1:i+2] not in _vowels):
            word = word[:i] + word[i-1] + word[i+1:]
        elif word[i] == 'W' and word[i-1] in _vowels:
            word = word[:i] + word[i-1] + word[i+1:]

        if word[i:i+skip+1] != key[-1:]:
            key += word[i:i+skip+1]

    key = _delete_consecutive_repeats(key)

    if key[-1:] == 'S':
        key = key[:-1]
    if key[-2:] == 'AY':
        key = key[:-2] + 'Y'
    if key[-1:] == 'A':
        key = key[:-1]
    if modified and key[:1] == 'A':
        key = original_first_char + key[1:]

    if maxlength and maxlength < _INFINITY:
        key = key[:maxlength]

    return key


def mra(word):
    """Return the MRA personal numeric identifier (PNI) for a word.

    A description of the Western Airlines Surname Match Rating Algorithm can
    be found on page 18 of :cite:`Moore:1977`.

    :param str word: the word to transform
    :returns: the MRA PNI
    :rtype: str

    >>> mra('Christopher')
    'CHRPHR'
    >>> mra('Niall')
    'NL'
    >>> mra('Smith')
    'SMTH'
    >>> mra('Schmidt')
    'SCHMDT'
    """
    if not word:
        return word
    word = word.upper()
    word = word.replace('ß', 'SS')
    word = word[0]+''.join(c for c in word[1:] if
                           c not in {'A', 'E', 'I', 'O', 'U'})
    word = _delete_consecutive_repeats(word)
    if len(word) > 6:
        word = word[:3]+word[-3:]
    return word
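# Illustrative example (not part of the original source): mra('Byrne') gives
# 'BYRN' and mra('Boern') gives 'BRN'; the match rating comparison step that
# consumes these codes is not shown here.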


def metaphone(word, maxlength=_INFINITY):
    """Return the Metaphone code for a word.

    Based on Lawrence Philips' Pick BASIC code from 1990 :cite:`Philips:1990`,
    as described in :cite:`Philips:1990b`.
    This incorporates some corrections to the above code, particularly
    some of those suggested by Michael Kuhn in :cite:`Kuhn:1995`.

    :param str word: the word to transform
    :param int maxlength: the maximum length of the returned Metaphone code
        (defaults to unlimited, but in Philips' original implementation
        this was 4)
    :returns: the Metaphone value
    :rtype: str


    >>> metaphone('Christopher')
    'KRSTFR'
    >>> metaphone('Niall')
    'NL'
    >>> metaphone('Smith')
    'SM0'
    >>> metaphone('Schmidt')
    'SKMTT'
    """
    # pylint: disable=too-many-branches
    _vowels = {'A', 'E', 'I', 'O', 'U'}
    _frontv = {'E', 'I', 'Y'}
    _varson = {'C', 'G', 'P', 'S', 'T'}

    # Require a maxlength of at least 4
    if maxlength is not None:
        maxlength = max(4, maxlength)
    else:
        maxlength = 64

    # As in variable sound--those modified by adding an "h"
    ename = ''.join(c for c in word.upper() if c.isalnum())
    ename = ename.replace('ß', 'SS')

    # Delete nonalphanumeric characters and make all caps
    if not ename:
        return ''
    if ename[0:2] in {'PN', 'AE', 'KN', 'GN', 'WR'}:
        ename = ename[1:]
    elif ename[0] == 'X':
        ename = 'S' + ename[1:]
    elif ename[0:2] == 'WH':
        ename = 'W' + ename[2:]

    # Convert to metaph
    elen = len(ename)-1
    metaph = ''
    for i in range(len(ename)):
        if len(metaph) >= maxlength:
            break
        if ((ename[i] not in {'G', 'T'} and
             i > 0 and ename[i-1] == ename[i])):
            continue

        if ename[i] in _vowels and i == 0:
            metaph = ename[i]

        elif ename[i] == 'B':
            if i != elen or ename[i-1] != 'M':
                metaph += ename[i]

        elif ename[i] == 'C':
            if not (i > 0 and ename[i-1] == 'S' and ename[i+1:i+2] in _frontv):
                if ename[i+1:i+3] == 'IA':
                    metaph += 'X'
                elif ename[i+1:i+2] in _frontv:
                    metaph += 'S'
                elif i > 0 and ename[i-1:i+2] == 'SCH':
                    metaph += 'K'
                elif ename[i+1:i+2] == 'H':
                    if i == 0 and i+1 < elen and ename[i+2:i+3] not in _vowels:
                        metaph += 'K'
                    else:
                        metaph += 'X'
                else:
                    metaph += 'K'

        elif ename[i] == 'D':
            if ename[i+1:i+2] == 'G' and ename[i+2:i+3] in _frontv:
                metaph += 'J'
            else:
                metaph += 'T'

        elif ename[i] == 'G':
            if ename[i+1:i+2] == 'H' and not (i+1 == elen or
                                              ename[i+2:i+3] not in _vowels):
                continue
            elif i > 0 and ((i+1 == elen and ename[i+1] == 'N') or
                            (i+3 == elen and ename[i+1:i+4] == 'NED')):
                continue
            elif (i-1 > 0 and i+1 <= elen and ename[i-1] == 'D' and
                  ename[i+1] in _frontv):
                continue
            elif ename[i+1:i+2] == 'G':
                continue
            elif ename[i+1:i+2] in _frontv:
                if i == 0 or ename[i-1] != 'G':
                    metaph += 'J'
                else:
                    metaph += 'K'
            else:
                metaph += 'K'

        elif ename[i] == 'H':
            if ((i > 0 and ename[i-1] in _vowels and
                 ename[i+1:i+2] not in _vowels)):
                continue
            elif i > 0 and ename[i-1] in _varson:
                continue
            else:
                metaph += 'H'

        elif ename[i] in {'F', 'J', 'L', 'M', 'N', 'R'}:
            metaph += ename[i]

        elif ename[i] == 'K':
            if i > 0 and ename[i-1] == 'C':
                continue
            else:
                metaph += 'K'

        elif ename[i] == 'P':
            if ename[i+1:i+2] == 'H':
                metaph += 'F'
            else:
                metaph += 'P'

        elif ename[i] == 'Q':
            metaph += 'K'

        elif ename[i] == 'S':
            if ((i > 0 and i+2 <= elen and ename[i+1] == 'I' and
                 ename[i+2] in 'OA')):
                metaph += 'X'
            elif ename[i+1:i+2] == 'H':
                metaph += 'X'
            else:
                metaph += 'S'

        elif ename[i] == 'T':
            if ((i > 0 and i+2 <= elen and ename[i+1] == 'I' and
                 ename[i+2] in {'A', 'O'})):
                metaph += 'X'
            elif ename[i+1:i+2] == 'H':
                metaph += '0'
            elif ename[i+1:i+3] != 'CH':
                if ename[i-1:i] != 'T':
                    metaph += 'T'

        elif ename[i] == 'V':
            metaph += 'F'

        elif ename[i] in 'WY':
            if ename[i+1:i+2] in _vowels:
                metaph += ename[i]

        elif ename[i] == 'X':
            metaph += 'KS'

        elif ename[i] == 'Z':
            metaph += 'S'

    return metaph
|
1048
|
|
|
|
|
1049
|
|
|
|
|
1050
|
|
|
def double_metaphone(word, maxlength=_INFINITY): |
|
1051
|
|
|
"""Return the Double Metaphone code for a word. |
|
1052
|
|
|
|
|
1053
|
|
|
Based on Lawrence Philips' (Visual) C++ code from 1999 |
|
1054
|
|
|
:cite:`Philips:2000`. |
|
1055
|
|
|
|
|
1056
|
|
|
:param word: the word to transform |
|
1057
|
|
|
:param maxlength: the maximum length of the returned Double Metaphone codes |
|
1058
|
|
|
(defaults to unlimited, but in Philips' original implementation this |
|
1059
|
|
|
was 4) |
|
1060
|
|
|
:returns: the Double Metaphone value(s) |
|
1061
|
|
|
:rtype: tuple |
|
1062
|
|
|
|
|
1063
|
|
|
>>> double_metaphone('Christopher') |
|
1064
|
|
|
('KRSTFR', '') |
|
1065
|
|
|
>>> double_metaphone('Niall') |
|
1066
|
|
|
('NL', '') |
|
1067
|
|
|
>>> double_metaphone('Smith') |
|
1068
|
|
|
('SM0', 'XMT') |
|
1069
|
|
|
>>> double_metaphone('Schmidt') |
|
1070
|
|
|
('XMT', 'SMT') |
|
1071
|
|
|
""" |
|
1072
|
|
|
# pylint: disable=too-many-branches |
|
1073
|
|
|
# Require a maxlength of at least 4 |
|
1074
|
|
|
if maxlength is not None: |
|
1075
|
|
|
maxlength = max(4, maxlength) |
|
1076
|
|
|
else: |
|
1077
|
|
|
maxlength = 64 |
|
1078
|
|
|
|
|
1079
|
|
|
primary = '' |
|
1080
|
|
|
secondary = '' |
|
1081
|
|
|
|
|
1082
|
|
|
def _slavo_germanic(): |
|
1083
|
|
|
"""Return True if the word appears to be Slavic or Germanic.""" |
|
1084
|
|
|
if 'W' in word or 'K' in word or 'CZ' in word: |
|
1085
|
|
|
return True |
|
1086
|
|
|
return False |
|
1087
|
|
|
|
|
1088
|
|
|
def _metaph_add(pri, sec=''): |
|
1089
|
|
|
"""Return a new metaphone tuple with the supplied elements.""" |
|
1090
|
|
|
newpri = primary |
|
1091
|
|
|
newsec = secondary |
|
1092
|
|
|
if pri: |
|
1093
|
|
|
newpri += pri |
|
1094
|
|
|
if sec: |
|
1095
|
|
|
if sec != ' ': |
|
1096
|
|
|
newsec += sec |
|
1097
|
|
|
else: |
|
1098
|
|
|
newsec += pri |
|
1099
|
|
|
return (newpri, newsec) |
|
1100
|
|
|
|
|
1101
|
|
|
def _is_vowel(pos): |
|
1102
|
|
|
"""Return True if the character at word[pos] is a vowel.""" |
|
1103
|
|
|
if pos >= 0 and word[pos] in {'A', 'E', 'I', 'O', 'U', 'Y'}: |
|
1104
|
|
|
return True |
|
1105
|
|
|
return False |
|
1106
|
|
|
|
|
1107
|
|
|
def _get_at(pos): |
|
1108
|
|
|
"""Return the character at word[pos].""" |
|
1109
|
|
|
return word[pos] |
|
1110
|
|
|
|
|
1111
|
|
|
def _string_at(pos, slen, substrings): |
|
1112
|
|
|
"""Return True if word[pos:pos+slen] is in substrings.""" |
|
1113
|
|
|
if pos < 0: |
|
1114
|
|
|
return False |
|
1115
|
|
|
return word[pos:pos+slen] in substrings |
|
1116
|
|
|
|
|
1117
|
|
|
current = 0 |
|
1118
|
|
|
length = len(word) |
|
1119
|
|
|
if length < 1: |
|
1120
|
|
|
return ('', '') |
|
1121
|
|
|
last = length - 1 |
|
1122
|
|
|
|
|
1123
|
|
|
word = word.upper() |
|
1124
|
|
|
word = word.replace('ß', 'SS') |
|
1125
|
|
|
|
|
1126
|
|
|
# Pad the original string so that we can index beyond the edge of the world |
|
1127
|
|
|
word += ' ' |
|
1128
|
|
|
|
|
1129
|
|
|
# Skip these when at start of word |
|
1130
|
|
|
if word[0:2] in {'GN', 'KN', 'PN', 'WR', 'PS'}: |
|
1131
|
|
|
current += 1 |
|
1132
|
|
|
|
|
1133
|
|
|
# Initial 'X' is pronounced 'Z' e.g. 'Xavier' |
|
1134
|
|
|
if _get_at(0) == 'X': |
|
1135
|
|
|
(primary, secondary) = _metaph_add('S') # 'Z' maps to 'S' |
|
1136
|
|
|
current += 1 |
|
1137
|
|
|
|
|
1138
|
|
|
# Main loop |
|
1139
|
|
|
while True: |
|
|
|
|
|
|
1140
|
|
|
if current >= length: |
|
1141
|
|
|
break |
|
1142
|
|
|
|
|
1143
|
|
|
if _get_at(current) in {'A', 'E', 'I', 'O', 'U', 'Y'}: |
|
1144
|
|
|
if current == 0: |
|
1145
|
|
|
# All init vowels now map to 'A' |
|
1146
|
|
|
(primary, secondary) = _metaph_add('A') |
|
1147
|
|
|
current += 1 |
|
1148
|
|
|
continue |
|
1149
|
|
|
|
|
1150
|
|
|
elif _get_at(current) == 'B': |
|
1151
|
|
|
# "-mb", e.g", "dumb", already skipped over... |
|
1152
|
|
|
(primary, secondary) = _metaph_add('P') |
|
1153
|
|
|
if _get_at(current + 1) == 'B': |
|
1154
|
|
|
current += 2 |
|
1155
|
|
|
else: |
|
1156
|
|
|
current += 1 |
|
1157
|
|
|
continue |
|
1158
|
|
|
|
|
1159
|
|
|
elif _get_at(current) == 'Ç': |
|
1160
|
|
|
(primary, secondary) = _metaph_add('S') |
|
1161
|
|
|
current += 1 |
|
1162
|
|
|
continue |
|
1163
|
|
|
|
|
1164
|
|
|
elif _get_at(current) == 'C': |
|
1165
|
|
|
# Various Germanic |
|
1166
|
|
|
if (current > 1 and not _is_vowel(current - 2) and |
|
|
|
|
|
|
1167
|
|
|
_string_at((current - 1), 3, {'ACH'}) and |
|
1168
|
|
|
((_get_at(current + 2) != 'I') and |
|
1169
|
|
|
((_get_at(current + 2) != 'E') or |
|
1170
|
|
|
_string_at((current - 2), 6, |
|
1171
|
|
|
{'BACHER', 'MACHER'})))): |
|
1172
|
|
|
(primary, secondary) = _metaph_add('K') |
|
1173
|
|
|
current += 2 |
|
1174
|
|
|
continue |
|
1175
|
|
|
|
|
1176
|
|
|
# Special case 'caesar' |
|
1177
|
|
|
elif current == 0 and _string_at(current, 6, {'CAESAR'}): |
|
1178
|
|
|
(primary, secondary) = _metaph_add('S') |
|
1179
|
|
|
current += 2 |
|
1180
|
|
|
continue |
|
1181
|
|
|
|
|
1182
|
|
|
# Italian 'chianti' |
|
1183
|
|
|
elif _string_at(current, 4, {'CHIA'}): |
|
1184
|
|
|
(primary, secondary) = _metaph_add('K') |
|
1185
|
|
|
current += 2 |
|
1186
|
|
|
continue |
|
1187
|
|
|
|
|
1188
|
|
|
elif _string_at(current, 2, {'CH'}): |
|
1189
|
|
|
# Find 'Michael' |
|
1190
|
|
|
if current > 0 and _string_at(current, 4, {'CHAE'}): |
|
1191
|
|
|
(primary, secondary) = _metaph_add('K', 'X') |
|
1192
|
|
|
current += 2 |
|
1193
|
|
|
continue |
|
1194
|
|
|
|
|
1195
|
|
|
# Greek roots e.g. 'chemistry', 'chorus' |
|
1196
|
|
|
elif (current == 0 and |
|
1197
|
|
|
(_string_at((current + 1), 5, |
|
1198
|
|
|
{'HARAC', 'HARIS'}) or |
|
1199
|
|
|
_string_at((current + 1), 3, |
|
1200
|
|
|
{'HOR', 'HYM', 'HIA', 'HEM'})) and |
|
1201
|
|
|
not _string_at(0, 5, {'CHORE'})): |
|
1202
|
|
|
(primary, secondary) = _metaph_add('K') |
|
1203
|
|
|
current += 2 |
|
1204
|
|
|
continue |
|
1205
|
|
|
|
|
1206
|
|
|
# Germanic, Greek, or otherwise 'ch' for 'kh' sound |
|
1207
|
|
|
elif ((_string_at(0, 4, {'VAN ', 'VON '}) or |
|
|
|
|
|
|
1208
|
|
|
_string_at(0, 3, {'SCH'})) or |
|
1209
|
|
|
# 'architect but not 'arch', 'orchestra', 'orchid' |
|
1210
|
|
|
_string_at((current - 2), 6, |
|
1211
|
|
|
{'ORCHES', 'ARCHIT', 'ORCHID'}) or |
|
1212
|
|
|
_string_at((current + 2), 1, {'T', 'S'}) or |
|
1213
|
|
|
((_string_at((current - 1), 1, |
|
1214
|
|
|
{'A', 'O', 'U', 'E'}) or |
|
1215
|
|
|
(current == 0)) and |
|
1216
|
|
|
# e.g., 'wachtler', 'wechsler', but not 'tichner' |
|
1217
|
|
|
_string_at((current + 2), 1, |
|
1218
|
|
|
{'L', 'R', 'N', 'M', 'B', 'H', 'F', 'V', 'W', |
|
1219
|
|
|
' '}))): |
|
1220
|
|
|
(primary, secondary) = _metaph_add('K') |
|
1221
|
|
|
|
|
1222
|
|
|
else: |
|
1223
|
|
|
if current > 0: |
|
1224
|
|
|
if _string_at(0, 2, {'MC'}): |
|
1225
|
|
|
# e.g., "McHugh" |
|
1226
|
|
|
(primary, secondary) = _metaph_add('K') |
|
1227
|
|
|
else: |
|
1228
|
|
|
(primary, secondary) = _metaph_add('X', 'K') |
|
1229
|
|
|
else: |
|
1230
|
|
|
(primary, secondary) = _metaph_add('X') |
|
1231
|
|
|
|
|
1232
|
|
|
current += 2 |
|
1233
|
|
|
continue |
|
1234
|
|
|
|
|
1235
|
|
|
# e.g, 'czerny' |
|
1236
|
|
|
elif (_string_at(current, 2, {'CZ'}) and |
|
1237
|
|
|
not _string_at((current - 2), 4, {'WICZ'})): |
|
1238
|
|
|
(primary, secondary) = _metaph_add('S', 'X') |
|
1239
|
|
|
current += 2 |
|
1240
|
|
|
continue |
|
1241
|
|
|
|
|
1242
|
|
|
# e.g., 'focaccia' |
|
1243
|
|
|
elif _string_at((current + 1), 3, {'CIA'}): |
|
1244
|
|
|
(primary, secondary) = _metaph_add('X') |
|
1245
|
|
|
current += 3 |
|
1246
|
|
|
|
|
1247
|
|
|
# double 'C', but not if e.g. 'McClellan' |
|
1248
|
|
|
elif (_string_at(current, 2, {'CC'}) and |
|
1249
|
|
|
not ((current == 1) and (_get_at(0) == 'M'))): |
|
1250
|
|
|
# 'bellocchio' but not 'bacchus' |
|
1251
|
|
|
if ((_string_at((current + 2), 1, |
|
1252
|
|
|
{'I', 'E', 'H'}) and |
|
1253
|
|
|
not _string_at((current + 2), 2, ['HU']))): |
|
1254
|
|
|
# 'accident', 'accede' 'succeed' |
|
1255
|
|
|
if ((((current == 1) and _get_at(current - 1) == 'A') or |
|
1256
|
|
|
_string_at((current - 1), 5, |
|
1257
|
|
|
{'UCCEE', 'UCCES'}))): |
|
1258
|
|
|
(primary, secondary) = _metaph_add('KS') |
|
1259
|
|
|
# 'bacci', 'bertucci', other italian |
|
1260
|
|
|
else: |
|
1261
|
|
|
(primary, secondary) = _metaph_add('X') |
|
1262
|
|
|
current += 3 |
|
1263
|
|
|
continue |
|
1264
|
|
|
else: # Pierce's rule |
|
1265
|
|
|
(primary, secondary) = _metaph_add('K') |
|
1266
|
|
|
current += 2 |
|
1267
|
|
|
continue |
|
1268
|
|
|
|
|
1269
|
|
|
elif _string_at(current, 2, {'CK', 'CG', 'CQ'}): |
|
1270
|
|
|
(primary, secondary) = _metaph_add('K') |
|
1271
|
|
|
current += 2 |
|
1272
|
|
|
continue |
|
1273
|
|
|
|
|
1274
|
|
|
elif _string_at(current, 2, {'CI', 'CE', 'CY'}): |
|
1275
|
|
|
# Italian vs. English |
|
1276
|
|
|
if _string_at(current, 3, {'CIO', 'CIE', 'CIA'}): |
|
1277
|
|
|
(primary, secondary) = _metaph_add('S', 'X') |
|
1278
|
|
|
else: |
|
1279
|
|
|
(primary, secondary) = _metaph_add('S') |
|
1280
|
|
|
current += 2 |
|
1281
|
|
|
continue |
|
1282
|
|
|
|
|
1283
|
|
|
# else |
|
1284
|
|
|
else: |
|
1285
|
|
|
(primary, secondary) = _metaph_add('K') |
|
1286
|
|
|
|
|
1287
|
|
|
# name sent in 'mac caffrey', 'mac gregor |
|
1288
|
|
|
if _string_at((current + 1), 2, {' C', ' Q', ' G'}): |
|
1289
|
|
|
current += 3 |
|
1290
|
|
|
elif (_string_at((current + 1), 1, |
|
1291
|
|
|
{'C', 'K', 'Q'}) and |
|
1292
|
|
|
not _string_at((current + 1), 2, {'CE', 'CI'})): |
|
1293
|
|
|
current += 2 |
|
1294
|
|
|
else: |
|
1295
|
|
|
current += 1 |
|
1296
|
|
|
continue |
|
1297
|
|
|
|
|
1298
|
|
|
elif _get_at(current) == 'D': |
|
1299
|
|
|
if _string_at(current, 2, {'DG'}): |
|
1300
|
|
|
if _string_at((current + 2), 1, {'I', 'E', 'Y'}): |
|
1301
|
|
|
# e.g. 'edge' |
|
1302
|
|
|
(primary, secondary) = _metaph_add('J') |
|
1303
|
|
|
current += 3 |
|
1304
|
|
|
continue |
|
1305
|
|
|
else: |
|
1306
|
|
|
# e.g. 'edgar' |
|
1307
|
|
|
(primary, secondary) = _metaph_add('TK') |
|
1308
|
|
|
current += 2 |
|
1309
|
|
|
continue |
|
1310
|
|
|
|
|
1311
|
|
|
elif _string_at(current, 2, {'DT', 'DD'}): |
|
1312
|
|
|
(primary, secondary) = _metaph_add('T') |
|
1313
|
|
|
current += 2 |
|
1314
|
|
|
continue |
|
1315
|
|
|
|
|
1316
|
|
|
# else |
|
1317
|
|
|
else: |
|
1318
|
|
|
(primary, secondary) = _metaph_add('T') |
|
1319
|
|
|
current += 1 |
|
1320
|
|
|
continue |
|
1321
|
|
|
|
|
1322
|
|
|
elif _get_at(current) == 'F': |
|
1323
|
|
|
if _get_at(current + 1) == 'F': |
|
1324
|
|
|
current += 2 |
|
1325
|
|
|
else: |
|
1326
|
|
|
current += 1 |
|
1327
|
|
|
(primary, secondary) = _metaph_add('F') |
|
1328
|
|
|
continue |
|
1329
|
|
|
|
|
1330
|
|
|
elif _get_at(current) == 'G': |
|
1331
|
|
|
if _get_at(current + 1) == 'H': |
|
1332
|
|
|
if (current > 0) and not _is_vowel(current - 1): |
|
1333
|
|
|
(primary, secondary) = _metaph_add('K') |
|
1334
|
|
|
current += 2 |
|
1335
|
|
|
continue |
|
1336
|
|
|
|
|
1337
|
|
|
# 'ghislane', ghiradelli |
|
1338
|
|
|
elif current == 0: |
|
1339
|
|
|
if _get_at(current + 2) == 'I': |
|
1340
|
|
|
(primary, secondary) = _metaph_add('J') |
|
1341
|
|
|
else: |
|
1342
|
|
|
(primary, secondary) = _metaph_add('K') |
|
1343
|
|
|
current += 2 |
|
1344
|
|
|
continue |
|
1345
|
|
|
|
|
1346
|
|
|
# Parker's rule (with some further refinements) - e.g., 'hugh' |
|
1347
|
|
|
elif (((current > 1) and |
|
|
|
|
|
|
1348
|
|
|
_string_at((current - 2), 1, {'B', 'H', 'D'})) or |
|
1349
|
|
|
# e.g., 'bough' |
|
1350
|
|
|
((current > 2) and |
|
1351
|
|
|
_string_at((current - 3), 1, {'B', 'H', 'D'})) or |
|
1352
|
|
|
# e.g., 'broughton' |
|
1353
|
|
|
((current > 3) and |
|
1354
|
|
|
_string_at((current - 4), 1, {'B', 'H'}))): |
|
1355
|
|
|
current += 2 |
|
1356
|
|
|
continue |
|
1357
|
|
|
else: |
|
1358
|
|
|
# e.g. 'laugh', 'McLaughlin', 'cough', |
|
1359
|
|
|
# 'gough', 'rough', 'tough' |
|
1360
|
|
|
if ((current > 2) and |
|
1361
|
|
|
(_get_at(current - 1) == 'U') and |
|
1362
|
|
|
(_string_at((current - 3), 1, |
|
1363
|
|
|
{'C', 'G', 'L', 'R', 'T'}))): |
|
1364
|
|
|
(primary, secondary) = _metaph_add('F') |
|
1365
|
|
|
elif (current > 0) and _get_at(current - 1) != 'I': |
|
1366
|
|
|
(primary, secondary) = _metaph_add('K') |
|
1367
|
|
|
current += 2 |
|
1368
|
|
|
continue |
|
1369
|
|
|
|
|
1370
|
|
|
elif _get_at(current + 1) == 'N': |
|
1371
|
|
|
if (current == 1) and _is_vowel(0) and not _slavo_germanic(): |
|
1372
|
|
|
(primary, secondary) = _metaph_add('KN', 'N') |
|
1373
|
|
|
# not e.g. 'cagney' |
|
1374
|
|
|
elif (not _string_at((current + 2), 2, {'EY'}) and |
|
1375
|
|
|
(_get_at(current + 1) != 'Y') and |
|
1376
|
|
|
not _slavo_germanic()): |
|
1377
|
|
|
(primary, secondary) = _metaph_add('N', 'KN') |
|
1378
|
|
|
else: |
|
1379
|
|
|
(primary, secondary) = _metaph_add('KN') |
|
1380
|
|
|
current += 2 |
|
1381
|
|
|
continue |
|
1382
|
|
|
|
|
1383
|
|
|
# 'tagliaro' |
|
1384
|
|
|
elif (_string_at((current + 1), 2, {'LI'}) and |
|
1385
|
|
|
not _slavo_germanic()): |
|
1386
|
|
|
(primary, secondary) = _metaph_add('KL', 'L') |
|
1387
|
|
|
current += 2 |
|
1388
|
|
|
continue |
|
1389
|
|
|
|
|
1390
|
|
|
# -ges-, -gep-, -gel-, -gie- at beginning |
|
1391
|
|
|
elif ((current == 0) and |
|
1392
|
|
|
((_get_at(current + 1) == 'Y') or |
|
1393
|
|
|
_string_at((current + 1), 2, {'ES', 'EP', 'EB', 'EL', 'EY', |
|
1394
|
|
|
'IB', 'IL', 'IN', 'IE', 'EI', |
|
1395
|
|
|
'ER'}))): |
|
1396
|
|
|
(primary, secondary) = _metaph_add('K', 'J') |
|
1397
|
|
|
current += 2 |
|
1398
|
|
|
continue |
|
1399
|
|
|
|
|
1400
|
|
|
# -ger-, -gy- |
|
1401
|
|
|
elif ((_string_at((current + 1), 2, {'ER'}) or |
|
1402
|
|
|
(_get_at(current + 1) == 'Y')) and not |
|
1403
|
|
|
_string_at(0, 6, {'DANGER', 'RANGER', 'MANGER'}) and not |
|
1404
|
|
|
_string_at((current - 1), 1, {'E', 'I'}) and not |
|
1405
|
|
|
_string_at((current - 1), 3, {'RGY', 'OGY'})): |
|
1406
|
|
|
(primary, secondary) = _metaph_add('K', 'J') |
|
1407
|
|
|
current += 2 |
|
1408
|
|
|
continue |
|
1409
|
|
|
|
|
1410
|
|
|
# italian e.g, 'biaggi' |
|
1411
|
|
|
elif (_string_at((current + 1), 1, {'E', 'I', 'Y'}) or |
|
1412
|
|
|
_string_at((current - 1), 4, {'AGGI', 'OGGI'})): |
|
1413
|
|
|
# obvious germanic |
|
1414
|
|
|
if (((_string_at(0, 4, {'VAN ', 'VON '}) or |
|
1415
|
|
|
_string_at(0, 3, {'SCH'})) or |
|
1416
|
|
|
_string_at((current + 1), 2, {'ET'}))): |
|
1417
|
|
|
(primary, secondary) = _metaph_add('K') |
|
1418
|
|
|
elif _string_at((current + 1), 4, {'IER '}): |
|
1419
|
|
|
(primary, secondary) = _metaph_add('J') |
|
1420
|
|
|
else: |
|
1421
|
|
|
(primary, secondary) = _metaph_add('J', 'K') |
|
1422
|
|
|
current += 2 |
|
1423
|
|
|
continue |
|
1424
|
|
|
|
|
1425
|
|
|
else: |
|
1426
|
|
|
if _get_at(current + 1) == 'G': |
|
1427
|
|
|
current += 2 |
|
1428
|
|
|
else: |
|
1429
|
|
|
current += 1 |
|
1430
|
|
|
(primary, secondary) = _metaph_add('K') |
|
1431
|
|
|
continue |
|
1432
|
|
|
|
|
1433
|
|
|
elif _get_at(current) == 'H': |
|
1434
|
|
|
# only keep if first & before vowel or btw. 2 vowels |
|
1435
|
|
|
if ((((current == 0) or _is_vowel(current - 1)) and |
|
1436
|
|
|
_is_vowel(current + 1))): |
|
1437
|
|
|
(primary, secondary) = _metaph_add('H') |
|
1438
|
|
|
current += 2 |
|
1439
|
|
|
else: # also takes care of 'HH' |
|
1440
|
|
|
current += 1 |
|
1441
|
|
|
continue |
|
1442
|
|
|
|
|
1443
|
|
|
elif _get_at(current) == 'J': |
|
1444
|
|
|
# obvious spanish, 'jose', 'san jacinto' |
|
1445
|
|
|
if _string_at(current, 4, {'JOSE'}) or _string_at(0, 4, {'SAN '}):
|
1446
|
|
|
if ((((current == 0) and (_get_at(current + 4) == ' ')) or |
|
1447
|
|
|
_string_at(0, 4, {'SAN '}))):
|
1448
|
|
|
(primary, secondary) = _metaph_add('H') |
|
1449
|
|
|
else: |
|
1450
|
|
|
(primary, secondary) = _metaph_add('J', 'H') |
|
1451
|
|
|
current += 1 |
|
1452
|
|
|
continue |
|
1453
|
|
|
|
|
1454
|
|
|
elif (current == 0) and not _string_at(current, 4, {'JOSE'}): |
|
1455
|
|
|
# Yankelovich/Jankelowicz |
|
1456
|
|
|
(primary, secondary) = _metaph_add('J', 'A') |
|
1457
|
|
|
# Spanish pron. of e.g. 'bajador' |
|
1458
|
|
|
elif (_is_vowel(current - 1) and |
|
1459
|
|
|
not _slavo_germanic() and |
|
1460
|
|
|
((_get_at(current + 1) == 'A') or |
|
1461
|
|
|
(_get_at(current + 1) == 'O'))): |
|
1462
|
|
|
(primary, secondary) = _metaph_add('J', 'H') |
|
1463
|
|
|
elif current == last: |
|
1464
|
|
|
(primary, secondary) = _metaph_add('J', ' ') |
|
1465
|
|
|
elif (not _string_at((current + 1), 1, |
|
1466
|
|
|
{'L', 'T', 'K', 'S', 'N', 'M', 'B', 'Z'}) and |
|
1467
|
|
|
not _string_at((current - 1), 1, {'S', 'K', 'L'})): |
|
1468
|
|
|
(primary, secondary) = _metaph_add('J') |
|
1469
|
|
|
|
|
1470
|
|
|
if _get_at(current + 1) == 'J': # it could happen! |
|
1471
|
|
|
current += 2 |
|
1472
|
|
|
else: |
|
1473
|
|
|
current += 1 |
|
1474
|
|
|
continue |
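# Informal illustration (not from the original source): per the checks above,
# a word beginning 'SAN ' (e.g. 'SAN JACINTO') or a bare initial 'JOSE'
# encodes its J as 'H' (Spanish pronunciation); an initial J elsewhere yields
# 'J' as the primary and 'A' as the secondary, so 'Jankelowicz' can match
# 'Yankelovich'.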
|
1475
|
|
|
|
|
1476
|
|
|
elif _get_at(current) == 'K': |
|
1477
|
|
|
if _get_at(current + 1) == 'K': |
|
1478
|
|
|
current += 2 |
|
1479
|
|
|
else: |
|
1480
|
|
|
current += 1 |
|
1481
|
|
|
(primary, secondary) = _metaph_add('K') |
|
1482
|
|
|
continue |
|
1483
|
|
|
|
|
1484
|
|
|
elif _get_at(current) == 'L': |
|
1485
|
|
|
if _get_at(current + 1) == 'L': |
|
1486
|
|
|
# Spanish e.g. 'cabrillo', 'gallegos' |
|
1487
|
|
|
if (((current == (length - 3)) and |
|
1488
|
|
|
_string_at((current - 1), 4, {'ILLO', 'ILLA', 'ALLE'})) or |
|
1489
|
|
|
((_string_at((last - 1), 2, {'AS', 'OS'}) or |
|
1490
|
|
|
_string_at(last, 1, {'A', 'O'})) and |
|
1491
|
|
|
_string_at((current - 1), 4, {'ALLE'}))): |
|
1492
|
|
|
(primary, secondary) = _metaph_add('L', ' ') |
|
1493
|
|
|
current += 2 |
|
1494
|
|
|
continue |
|
1495
|
|
|
current += 2 |
|
1496
|
|
|
else: |
|
1497
|
|
|
current += 1 |
|
1498
|
|
|
(primary, secondary) = _metaph_add('L') |
|
1499
|
|
|
continue |
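# Informal illustration (not from the original source): in 'CABRILLO' the
# first L of the final 'LLO' satisfies the 'ILLO' test above, so the primary
# records 'L' while the secondary records only a space -- i.e. the secondary
# code effectively drops the Spanish silent 'll'.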
|
1500
|
|
|
|
|
1501
|
|
|
elif _get_at(current) == 'M': |
|
1502
|
|
|
if (((_string_at((current - 1), 3, {'UMB'}) and |
|
1503
|
|
|
(((current + 1) == last) or |
|
1504
|
|
|
_string_at((current + 2), 2, {'ER'}))) or |
|
1505
|
|
|
# 'dumb', 'thumb' |
|
1506
|
|
|
(_get_at(current + 1) == 'M'))): |
|
1507
|
|
|
current += 2 |
|
1508
|
|
|
else: |
|
1509
|
|
|
current += 1 |
|
1510
|
|
|
(primary, secondary) = _metaph_add('M') |
|
1511
|
|
|
continue |
|
1512
|
|
|
|
|
1513
|
|
|
elif _get_at(current) == 'N': |
|
1514
|
|
|
if _get_at(current + 1) == 'N': |
|
1515
|
|
|
current += 2 |
|
1516
|
|
|
else: |
|
1517
|
|
|
current += 1 |
|
1518
|
|
|
(primary, secondary) = _metaph_add('N') |
|
1519
|
|
|
continue |
|
1520
|
|
|
|
|
1521
|
|
|
elif _get_at(current) == 'Ñ': |
|
1522
|
|
|
current += 1 |
|
1523
|
|
|
(primary, secondary) = _metaph_add('N') |
|
1524
|
|
|
continue |
|
1525
|
|
|
|
|
1526
|
|
|
elif _get_at(current) == 'P': |
|
1527
|
|
|
if _get_at(current + 1) == 'H': |
|
1528
|
|
|
(primary, secondary) = _metaph_add('F') |
|
1529
|
|
|
current += 2 |
|
1530
|
|
|
continue |
|
1531
|
|
|
|
|
1532
|
|
|
# also account for "campbell", "raspberry" |
|
1533
|
|
|
elif _string_at((current + 1), 1, {'P', 'B'}): |
|
1534
|
|
|
current += 2 |
|
1535
|
|
|
else: |
|
1536
|
|
|
current += 1 |
|
1537
|
|
|
(primary, secondary) = _metaph_add('P') |
|
1538
|
|
|
continue |
|
1539
|
|
|
|
|
1540
|
|
|
elif _get_at(current) == 'Q': |
|
1541
|
|
|
if _get_at(current + 1) == 'Q': |
|
1542
|
|
|
current += 2 |
|
1543
|
|
|
else: |
|
1544
|
|
|
current += 1 |
|
1545
|
|
|
(primary, secondary) = _metaph_add('K') |
|
1546
|
|
|
continue |
|
1547
|
|
|
|
|
1548
|
|
|
elif _get_at(current) == 'R': |
|
1549
|
|
|
# french e.g. 'rogier', but exclude 'hochmeier' |
|
1550
|
|
|
if (((current == last) and |
|
1551
|
|
|
not _slavo_germanic() and |
|
1552
|
|
|
_string_at((current - 2), 2, {'IE'}) and |
|
1553
|
|
|
not _string_at((current - 4), 2, {'ME', 'MA'}))): |
|
1554
|
|
|
(primary, secondary) = _metaph_add('', 'R') |
|
1555
|
|
|
else: |
|
1556
|
|
|
(primary, secondary) = _metaph_add('R') |
|
1557
|
|
|
|
|
1558
|
|
|
if _get_at(current + 1) == 'R': |
|
1559
|
|
|
current += 2 |
|
1560
|
|
|
else: |
|
1561
|
|
|
current += 1 |
|
1562
|
|
|
continue |
|
1563
|
|
|
|
|
1564
|
|
|
elif _get_at(current) == 'S': |
|
1565
|
|
|
# special cases 'island', 'isle', 'carlisle', 'carlysle' |
|
1566
|
|
|
if _string_at((current - 1), 3, {'ISL', 'YSL'}): |
|
1567
|
|
|
current += 1 |
|
1568
|
|
|
continue |
|
1569
|
|
|
|
|
1570
|
|
|
# special case 'sugar-' |
|
1571
|
|
|
elif (current == 0) and _string_at(current, 5, {'SUGAR'}): |
|
1572
|
|
|
(primary, secondary) = _metaph_add('X', 'S') |
|
1573
|
|
|
current += 1 |
|
1574
|
|
|
continue |
|
1575
|
|
|
|
|
1576
|
|
|
elif _string_at(current, 2, {'SH'}): |
|
1577
|
|
|
# Germanic |
|
1578
|
|
|
if _string_at((current + 1), 4, |
|
1579
|
|
|
{'HEIM', 'HOEK', 'HOLM', 'HOLZ'}): |
|
1580
|
|
|
(primary, secondary) = _metaph_add('S') |
|
1581
|
|
|
else: |
|
1582
|
|
|
(primary, secondary) = _metaph_add('X') |
|
1583
|
|
|
current += 2 |
|
1584
|
|
|
continue |
|
1585
|
|
|
|
|
1586
|
|
|
# Italian & Armenian |
|
1587
|
|
|
elif (_string_at(current, 3, {'SIO', 'SIA'}) or |
|
1588
|
|
|
_string_at(current, 4, {'SIAN'})): |
|
1589
|
|
|
if not _slavo_germanic(): |
|
1590
|
|
|
(primary, secondary) = _metaph_add('S', 'X') |
|
1591
|
|
|
else: |
|
1592
|
|
|
(primary, secondary) = _metaph_add('S') |
|
1593
|
|
|
current += 3 |
|
1594
|
|
|
continue |
|
1595
|
|
|
|
|
1596
|
|
|
# German & anglicisations, e.g. 'smith' matches 'schmidt',
# 'snider' matches 'schneider'
# also, -sz- in Slavic languages, although in Hungarian it is
# pronounced 's'
|
1600
|
|
|
elif (((current == 0) and |
|
1601
|
|
|
_string_at((current + 1), 1, {'M', 'N', 'L', 'W'})) or |
|
1602
|
|
|
_string_at((current + 1), 1, {'Z'})): |
|
1603
|
|
|
(primary, secondary) = _metaph_add('S', 'X') |
|
1604
|
|
|
if _string_at((current + 1), 1, {'Z'}): |
|
1605
|
|
|
current += 2 |
|
1606
|
|
|
else: |
|
1607
|
|
|
current += 1 |
|
1608
|
|
|
continue |
|
1609
|
|
|
|
|
1610
|
|
|
elif _string_at(current, 2, {'SC'}): |
|
1611
|
|
|
# Schlesinger's rule |
|
1612
|
|
|
if _get_at(current + 2) == 'H': |
|
1613
|
|
|
# dutch origin, e.g. 'school', 'schooner' |
|
1614
|
|
|
if _string_at((current + 3), 2, |
|
1615
|
|
|
{'OO', 'ER', 'EN', 'UY', 'ED', 'EM'}): |
|
1616
|
|
|
# 'schermerhorn', 'schenker' |
|
1617
|
|
|
if _string_at((current + 3), 2, {'ER', 'EN'}): |
|
1618
|
|
|
(primary, secondary) = _metaph_add('X', 'SK') |
|
1619
|
|
|
else: |
|
1620
|
|
|
(primary, secondary) = _metaph_add('SK') |
|
1621
|
|
|
current += 3 |
|
1622
|
|
|
continue |
|
1623
|
|
|
else: |
|
1624
|
|
|
if (((current == 0) and not _is_vowel(3) and |
|
1625
|
|
|
(_get_at(3) != 'W'))): |
|
1626
|
|
|
(primary, secondary) = _metaph_add('X', 'S') |
|
1627
|
|
|
else: |
|
1628
|
|
|
(primary, secondary) = _metaph_add('X') |
|
1629
|
|
|
current += 3 |
|
1630
|
|
|
continue |
|
1631
|
|
|
|
|
1632
|
|
|
elif _string_at((current + 2), 1, {'I', 'E', 'Y'}): |
|
1633
|
|
|
(primary, secondary) = _metaph_add('S') |
|
1634
|
|
|
current += 3 |
|
1635
|
|
|
continue |
|
1636
|
|
|
|
|
1637
|
|
|
# else |
|
1638
|
|
|
else: |
|
1639
|
|
|
(primary, secondary) = _metaph_add('SK') |
|
1640
|
|
|
current += 3 |
|
1641
|
|
|
continue |
|
1642
|
|
|
|
|
1643
|
|
|
else: |
|
1644
|
|
|
# french e.g. 'resnais', 'artois' |
|
1645
|
|
|
if (current == last) and _string_at((current - 2), 2, |
|
1646
|
|
|
{'AI', 'OI'}): |
|
1647
|
|
|
(primary, secondary) = _metaph_add('', 'S') |
|
1648
|
|
|
else: |
|
1649
|
|
|
(primary, secondary) = _metaph_add('S') |
|
1650
|
|
|
|
|
1651
|
|
|
if _string_at((current + 1), 1, {'S', 'Z'}): |
|
1652
|
|
|
current += 2 |
|
1653
|
|
|
else: |
|
1654
|
|
|
current += 1 |
|
1655
|
|
|
continue |
|
1656
|
|
|
|
|
1657
|
|
|
elif _get_at(current) == 'T': |
|
1658
|
|
|
if _string_at(current, 4, {'TION'}): |
|
1659
|
|
|
(primary, secondary) = _metaph_add('X') |
|
1660
|
|
|
current += 3 |
|
1661
|
|
|
continue |
|
1662
|
|
|
|
|
1663
|
|
|
elif _string_at(current, 3, {'TIA', 'TCH'}): |
|
1664
|
|
|
(primary, secondary) = _metaph_add('X') |
|
1665
|
|
|
current += 3 |
|
1666
|
|
|
continue |
|
1667
|
|
|
|
|
1668
|
|
|
elif (_string_at(current, 2, {'TH'}) or |
|
1669
|
|
|
_string_at(current, 3, {'TTH'})): |
|
1670
|
|
|
# special case 'thomas', 'thames' or germanic |
|
1671
|
|
|
if ((_string_at((current + 2), 2, {'OM', 'AM'}) or |
|
1672
|
|
|
_string_at(0, 4, {'VAN ', 'VON '}) or |
|
1673
|
|
|
_string_at(0, 3, {'SCH'}))): |
|
1674
|
|
|
(primary, secondary) = _metaph_add('T') |
|
1675
|
|
|
else: |
|
1676
|
|
|
(primary, secondary) = _metaph_add('0', 'T') |
|
1677
|
|
|
current += 2 |
|
1678
|
|
|
continue |
|
1679
|
|
|
|
|
1680
|
|
|
elif _string_at((current + 1), 1, {'T', 'D'}): |
|
1681
|
|
|
current += 2 |
|
1682
|
|
|
else: |
|
1683
|
|
|
current += 1 |
|
1684
|
|
|
(primary, secondary) = _metaph_add('T') |
|
1685
|
|
|
continue |
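# Note (added for clarity, not in the original source): '0' (zero) is Double
# Metaphone's symbol for the unvoiced 'th' sound, so 'TH' normally encodes as
# '0' with 'T' as the secondary, while Germanic contexts ('Thomas',
# 'VAN '/'VON '/'SCH' names) force a plain 'T'.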
|
1686
|
|
|
|
|
1687
|
|
|
elif _get_at(current) == 'V': |
|
1688
|
|
|
if _get_at(current + 1) == 'V': |
|
1689
|
|
|
current += 2 |
|
1690
|
|
|
else: |
|
1691
|
|
|
current += 1 |
|
1692
|
|
|
(primary, secondary) = _metaph_add('F') |
|
1693
|
|
|
continue |
|
1694
|
|
|
|
|
1695
|
|
|
elif _get_at(current) == 'W': |
|
1696
|
|
|
# can also be in middle of word |
|
1697
|
|
|
if _string_at(current, 2, {'WR'}): |
|
1698
|
|
|
(primary, secondary) = _metaph_add('R') |
|
1699
|
|
|
current += 2 |
|
1700
|
|
|
continue |
|
1701
|
|
|
elif ((current == 0) and |
|
1702
|
|
|
(_is_vowel(current + 1) or _string_at(current, 2, {'WH'}))): |
|
1703
|
|
|
# Wasserman should match Vasserman |
|
1704
|
|
|
if _is_vowel(current + 1): |
|
1705
|
|
|
(primary, secondary) = _metaph_add('A', 'F') |
|
1706
|
|
|
else: |
|
1707
|
|
|
# need Uomo to match Womo |
|
1708
|
|
|
(primary, secondary) = _metaph_add('A') |
|
1709
|
|
|
|
|
1710
|
|
|
# Arnow should match Arnoff |
|
1711
|
|
|
if ((((current == last) and _is_vowel(current - 1)) or |
|
1712
|
|
|
_string_at((current - 1), 5, |
|
1713
|
|
|
{'EWSKI', 'EWSKY', 'OWSKI', 'OWSKY'}) or |
|
1714
|
|
|
_string_at(0, 3, {'SCH'}))):
|
1715
|
|
|
(primary, secondary) = _metaph_add('', 'F') |
|
1716
|
|
|
current += 1 |
|
1717
|
|
|
continue |
|
1718
|
|
|
# Polish e.g. 'filipowicz' |
|
1719
|
|
|
elif _string_at(current, 4, {'WICZ', 'WITZ'}): |
|
1720
|
|
|
(primary, secondary) = _metaph_add('TS', 'FX') |
|
1721
|
|
|
current += 4 |
|
1722
|
|
|
continue |
|
1723
|
|
|
# else skip it |
|
1724
|
|
|
else: |
|
1725
|
|
|
current += 1 |
|
1726
|
|
|
continue |
|
1727
|
|
|
|
|
1728
|
|
|
elif _get_at(current) == 'X': |
|
1729
|
|
|
# French e.g. breaux |
|
1730
|
|
|
if (not ((current == last) and |
|
1731
|
|
|
(_string_at((current - 3), 3, {'IAU', 'EAU'}) or |
|
1732
|
|
|
_string_at((current - 2), 2, {'AU', 'OU'})))): |
|
1733
|
|
|
(primary, secondary) = _metaph_add('KS') |
|
1734
|
|
|
|
|
1735
|
|
|
if _string_at((current + 1), 1, {'C', 'X'}): |
|
1736
|
|
|
current += 2 |
|
1737
|
|
|
else: |
|
1738
|
|
|
current += 1 |
|
1739
|
|
|
continue |
|
1740
|
|
|
|
|
1741
|
|
|
elif _get_at(current) == 'Z': |
|
1742
|
|
|
# Chinese Pinyin e.g. 'zhao' |
|
1743
|
|
|
if _get_at(current + 1) == 'H': |
|
1744
|
|
|
(primary, secondary) = _metaph_add('J') |
|
1745
|
|
|
current += 2 |
|
1746
|
|
|
continue |
|
1747
|
|
|
elif (_string_at((current + 1), 2, {'ZO', 'ZI', 'ZA'}) or |
|
1748
|
|
|
(_slavo_germanic() and ((current > 0) and |
|
1749
|
|
|
_get_at(current - 1) != 'T'))): |
|
1750
|
|
|
(primary, secondary) = _metaph_add('S', 'TS') |
|
1751
|
|
|
else: |
|
1752
|
|
|
(primary, secondary) = _metaph_add('S') |
|
1753
|
|
|
|
|
1754
|
|
|
if _get_at(current + 1) == 'Z': |
|
1755
|
|
|
current += 2 |
|
1756
|
|
|
else: |
|
1757
|
|
|
current += 1 |
|
1758
|
|
|
continue |
|
1759
|
|
|
|
|
1760
|
|
|
else: |
|
1761
|
|
|
current += 1 |
|
1762
|
|
|
|
|
1763
|
|
|
if maxlength and maxlength < _INFINITY: |
|
1764
|
|
|
primary = primary[:maxlength] |
|
1765
|
|
|
secondary = secondary[:maxlength] |
|
1766
|
|
|
if primary == secondary: |
|
1767
|
|
|
secondary = '' |
|
1768
|
|
|
|
|
1769
|
|
|
return (primary, secondary) |
|
1770
|
|
|
|
|
1771
|
|
|
|
|
1772
|
|
|
def caverphone(word, version=2): |
|
1773
|
|
|
"""Return the Caverphone code for a word. |
|
1774
|
|
|
|
|
1775
|
|
|
A description of version 1 of the algorithm can be found in |
|
1776
|
|
|
:cite:`Hood:2002`. |
|
1777
|
|
|
|
|
1778
|
|
|
A description of version 2 of the algorithm can be found in |
|
1779
|
|
|
:cite:`Hood:2004`. |
|
1780
|
|
|
|
|
1781
|
|
|
:param str word: the word to transform |
|
1782
|
|
|
:param int version: the version of Caverphone to employ for encoding |
|
1783
|
|
|
(defaults to 2) |
|
1784
|
|
|
:returns: the Caverphone value |
|
1785
|
|
|
:rtype: str |
|
1786
|
|
|
|
|
1787
|
|
|
>>> caverphone('Christopher') |
|
1788
|
|
|
'KRSTFA1111' |
|
1789
|
|
|
>>> caverphone('Niall') |
|
1790
|
|
|
'NA11111111' |
|
1791
|
|
|
>>> caverphone('Smith') |
|
1792
|
|
|
'SMT1111111' |
|
1793
|
|
|
>>> caverphone('Schmidt') |
|
1794
|
|
|
'SKMT111111' |
|
1795
|
|
|
|
|
1796
|
|
|
>>> caverphone('Christopher', 1) |
|
1797
|
|
|
'KRSTF1' |
|
1798
|
|
|
>>> caverphone('Niall', 1) |
|
1799
|
|
|
'N11111' |
|
1800
|
|
|
>>> caverphone('Smith', 1) |
|
1801
|
|
|
'SMT111' |
|
1802
|
|
|
>>> caverphone('Schmidt', 1) |
|
1803
|
|
|
'SKMT11' |
|
1804
|
|
|
""" |
|
1805
|
|
|
_vowels = {'a', 'e', 'i', 'o', 'u'} |
|
1806
|
|
|
|
|
1807
|
|
|
word = word.lower() |
|
1808
|
|
|
word = ''.join(c for c in word if c in |
|
1809
|
|
|
{'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', |
|
1810
|
|
|
'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', |
|
1811
|
|
|
'y', 'z'}) |
|
1812
|
|
|
|
|
1813
|
|
|
def _squeeze_replace(word, char, new_char): |
|
1814
|
|
|
"""Convert strings of char in word to one instance of new_char.""" |
|
1815
|
|
|
while char * 2 in word: |
|
1816
|
|
|
word = word.replace(char * 2, char) |
|
1817
|
|
|
return word.replace(char, new_char) |
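# Illustrative example (not part of the original docstring): runs of `char`
# are first collapsed and the survivor is then case-marked, e.g.
#     _squeeze_replace('bassett', 's', 'S')   ->  'baSett'
#     _squeeze_replace('missing', 's', 'S')   ->  'miSing'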
|
1818
|
|
|
|
|
1819
|
|
|
# the main replacement algorithm
|
1820
|
|
|
if version != 1 and word[-1:] == 'e': |
|
1821
|
|
|
word = word[:-1] |
|
1822
|
|
|
if word: |
|
1823
|
|
|
if word[:5] == 'cough': |
|
1824
|
|
|
word = 'cou2f'+word[5:] |
|
1825
|
|
|
if word[:5] == 'rough': |
|
1826
|
|
|
word = 'rou2f'+word[5:] |
|
1827
|
|
|
if word[:5] == 'tough': |
|
1828
|
|
|
word = 'tou2f'+word[5:] |
|
1829
|
|
|
if word[:6] == 'enough': |
|
1830
|
|
|
word = 'enou2f'+word[6:] |
|
1831
|
|
|
if version != 1 and word[:6] == 'trough': |
|
1832
|
|
|
word = 'trou2f'+word[6:] |
|
1833
|
|
|
if word[:2] == 'gn': |
|
1834
|
|
|
word = '2n'+word[2:] |
|
1835
|
|
|
if word[-2:] == 'mb': |
|
1836
|
|
|
word = word[:-1]+'2' |
|
1837
|
|
|
word = word.replace('cq', '2q') |
|
1838
|
|
|
word = word.replace('ci', 'si') |
|
1839
|
|
|
word = word.replace('ce', 'se') |
|
1840
|
|
|
word = word.replace('cy', 'sy') |
|
1841
|
|
|
word = word.replace('tch', '2ch') |
|
1842
|
|
|
word = word.replace('c', 'k') |
|
1843
|
|
|
word = word.replace('q', 'k') |
|
1844
|
|
|
word = word.replace('x', 'k') |
|
1845
|
|
|
word = word.replace('v', 'f') |
|
1846
|
|
|
word = word.replace('dg', '2g') |
|
1847
|
|
|
word = word.replace('tio', 'sio') |
|
1848
|
|
|
word = word.replace('tia', 'sia') |
|
1849
|
|
|
word = word.replace('d', 't') |
|
1850
|
|
|
word = word.replace('ph', 'fh') |
|
1851
|
|
|
word = word.replace('b', 'p') |
|
1852
|
|
|
word = word.replace('sh', 's2') |
|
1853
|
|
|
word = word.replace('z', 's') |
|
1854
|
|
|
if word[0] in _vowels: |
|
1855
|
|
|
word = 'A'+word[1:] |
|
1856
|
|
|
word = word.replace('a', '3') |
|
1857
|
|
|
word = word.replace('e', '3') |
|
1858
|
|
|
word = word.replace('i', '3') |
|
1859
|
|
|
word = word.replace('o', '3') |
|
1860
|
|
|
word = word.replace('u', '3') |
|
1861
|
|
|
if version != 1: |
|
1862
|
|
|
word = word.replace('j', 'y') |
|
1863
|
|
|
if word[:2] == 'y3': |
|
1864
|
|
|
word = 'Y3'+word[2:] |
|
1865
|
|
|
if word[:1] == 'y': |
|
1866
|
|
|
word = 'A'+word[1:] |
|
1867
|
|
|
word = word.replace('y', '3') |
|
1868
|
|
|
word = word.replace('3gh3', '3kh3') |
|
1869
|
|
|
word = word.replace('gh', '22') |
|
1870
|
|
|
word = word.replace('g', 'k') |
|
1871
|
|
|
|
|
1872
|
|
|
word = _squeeze_replace(word, 's', 'S') |
|
1873
|
|
|
word = _squeeze_replace(word, 't', 'T') |
|
1874
|
|
|
word = _squeeze_replace(word, 'p', 'P') |
|
1875
|
|
|
word = _squeeze_replace(word, 'k', 'K') |
|
1876
|
|
|
word = _squeeze_replace(word, 'f', 'F') |
|
1877
|
|
|
word = _squeeze_replace(word, 'm', 'M') |
|
1878
|
|
|
word = _squeeze_replace(word, 'n', 'N') |
|
1879
|
|
|
|
|
1880
|
|
|
word = word.replace('w3', 'W3') |
|
1881
|
|
|
if version == 1: |
|
1882
|
|
|
word = word.replace('wy', 'Wy') |
|
1883
|
|
|
word = word.replace('wh3', 'Wh3') |
|
1884
|
|
|
if version == 1: |
|
1885
|
|
|
word = word.replace('why', 'Why') |
|
1886
|
|
|
if version != 1 and word[-1:] == 'w': |
|
1887
|
|
|
word = word[:-1]+'3' |
|
1888
|
|
|
word = word.replace('w', '2') |
|
1889
|
|
|
if word[:1] == 'h': |
|
1890
|
|
|
word = 'A'+word[1:] |
|
1891
|
|
|
word = word.replace('h', '2') |
|
1892
|
|
|
word = word.replace('r3', 'R3') |
|
1893
|
|
|
if version == 1: |
|
1894
|
|
|
word = word.replace('ry', 'Ry') |
|
1895
|
|
|
if version != 1 and word[-1:] == 'r': |
|
1896
|
|
|
word = word[:-1]+'3' |
|
1897
|
|
|
word = word.replace('r', '2') |
|
1898
|
|
|
word = word.replace('l3', 'L3') |
|
1899
|
|
|
if version == 1: |
|
1900
|
|
|
word = word.replace('ly', 'Ly') |
|
1901
|
|
|
if version != 1 and word[-1:] == 'l': |
|
1902
|
|
|
word = word[:-1]+'3' |
|
1903
|
|
|
word = word.replace('l', '2') |
|
1904
|
|
|
if version == 1: |
|
1905
|
|
|
word = word.replace('j', 'y') |
|
1906
|
|
|
word = word.replace('y3', 'Y3') |
|
1907
|
|
|
word = word.replace('y', '2') |
|
1908
|
|
|
word = word.replace('2', '') |
|
1909
|
|
|
if version != 1 and word[-1:] == '3': |
|
1910
|
|
|
word = word[:-1]+'A' |
|
1911
|
|
|
word = word.replace('3', '') |
|
1912
|
|
|
|
|
1913
|
|
|
# pad with 1s, then extract the necessary length of code |
|
1914
|
|
|
word = word+'1'*10 |
|
1915
|
|
|
if version != 1: |
|
1916
|
|
|
word = word[:10] |
|
1917
|
|
|
else: |
|
1918
|
|
|
word = word[:6] |
|
1919
|
|
|
|
|
1920
|
|
|
return word |
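# Worked example (informal, consistent with the 'Smith' doctest above):
# 'Smith' -> lowercase 'smith' -> vowels become '3' ('sm3th') -> retained
# consonants are squeezed and uppercased ('SM3Th') -> 'h' maps to '2' and is
# dropped, as is the '3' ('SMT') -> padded with 1s to ten characters:
# 'SMT1111111'.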
|
1921
|
|
|
|
|
1922
|
|
|
|
|
1923
|
|
|
def alpha_sis(word, maxlength=14): |
|
1924
|
|
|
"""Return the IBM Alpha Search Inquiry System code for a word. |
|
1925
|
|
|
|
|
1926
|
|
|
The Alpha Search Inquiry System code is defined in :cite:`IBM:1973`. |
|
1927
|
|
|
This implementation is based on the description in :cite:`Moore:1977`. |
|
1928
|
|
|
|
|
1929
|
|
|
A collection is necessary since there can be multiple values for a |
|
1930
|
|
|
single word. But the collection must be ordered since the first value |
|
1931
|
|
|
is the primary coding. |
|
1932
|
|
|
|
|
1933
|
|
|
:param str word: the word to transform |
|
1934
|
|
|
:param int maxlength: the length of the code returned (defaults to 14) |
|
1935
|
|
|
:returns: the Alpha SIS value |
|
1936
|
|
|
:rtype: tuple |
|
1937
|
|
|
|
|
1938
|
|
|
>>> alpha_sis('Christopher') |
|
1939
|
|
|
('06401840000000', '07040184000000', '04018400000000') |
|
1940
|
|
|
>>> alpha_sis('Niall') |
|
1941
|
|
|
('02500000000000',) |
|
1942
|
|
|
>>> alpha_sis('Smith') |
|
1943
|
|
|
('03100000000000',) |
|
1944
|
|
|
>>> alpha_sis('Schmidt') |
|
1945
|
|
|
('06310000000000',) |
|
1946
|
|
|
""" |
|
1947
|
|
|
_alpha_sis_initials = {'GF': '08', 'GM': '03', 'GN': '02', 'KN': '02', |
|
1948
|
|
|
'PF': '08', 'PN': '02', 'PS': '00', 'WR': '04', |
|
1949
|
|
|
'A': '1', 'E': '1', 'H': '2', 'I': '1', 'J': '3', |
|
1950
|
|
|
'O': '1', 'U': '1', 'W': '4', 'Y': '5'} |
|
1951
|
|
|
_alpha_sis_initials_order = ('GF', 'GM', 'GN', 'KN', 'PF', 'PN', 'PS', |
|
1952
|
|
|
'WR', 'A', 'E', 'H', 'I', 'J', 'O', 'U', 'W', |
|
1953
|
|
|
'Y') |
|
1954
|
|
|
_alpha_sis_basic = {'SCH': '6', 'CZ': ('70', '6', '0'), |
|
1955
|
|
|
'CH': ('6', '70', '0'), 'CK': ('7', '6'), |
|
1956
|
|
|
'DS': ('0', '10'), 'DZ': ('0', '10'), |
|
1957
|
|
|
'TS': ('0', '10'), 'TZ': ('0', '10'), 'CI': '0', |
|
1958
|
|
|
'CY': '0', 'CE': '0', 'SH': '6', 'DG': '7', 'PH': '8', |
|
1959
|
|
|
'C': ('7', '6'), 'K': ('7', '6'), 'Z': '0', 'S': '0', |
|
1960
|
|
|
'D': '1', 'T': '1', 'N': '2', 'M': '3', 'R': '4', |
|
1961
|
|
|
'L': '5', 'J': '6', 'G': '7', 'Q': '7', 'X': '7', |
|
1962
|
|
|
'F': '8', 'V': '8', 'B': '9', 'P': '9'} |
|
1963
|
|
|
_alpha_sis_basic_order = ('SCH', 'CZ', 'CH', 'CK', 'DS', 'DZ', 'TS', 'TZ', |
|
1964
|
|
|
'CI', 'CY', 'CE', 'SH', 'DG', 'PH', 'C', 'K', |
|
1965
|
|
|
'Z', 'S', 'D', 'T', 'N', 'M', 'R', 'L', 'J', 'C', |
|
1966
|
|
|
'G', 'K', 'Q', 'X', 'F', 'V', 'B', 'P') |
|
1967
|
|
|
|
|
1968
|
|
|
alpha = [''] |
|
1969
|
|
|
pos = 0 |
|
1970
|
|
|
word = normalize('NFKD', text_type(word.upper())) |
|
1971
|
|
|
word = word.replace('ß', 'SS') |
|
1972
|
|
|
word = ''.join(c for c in word if c in |
|
1973
|
|
|
{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
|
1974
|
|
|
'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
|
1975
|
|
|
'Y', 'Z'}) |
|
1976
|
|
|
|
|
1977
|
|
|
# Clamp maxlength to [4, 64] |
|
1978
|
|
|
if maxlength is not None: |
|
1979
|
|
|
maxlength = min(max(4, maxlength), 64) |
|
1980
|
|
|
else: |
|
1981
|
|
|
maxlength = 64 |
|
1982
|
|
|
|
|
1983
|
|
|
# Do special processing for initial substrings |
|
1984
|
|
|
for k in _alpha_sis_initials_order: |
|
1985
|
|
|
if word.startswith(k): |
|
1986
|
|
|
alpha[0] += _alpha_sis_initials[k] |
|
1987
|
|
|
pos += len(k) |
|
1988
|
|
|
break |
|
1989
|
|
|
|
|
1990
|
|
|
# Add a '0' if alpha is still empty |
|
1991
|
|
|
if not alpha[0]: |
|
1992
|
|
|
alpha[0] += '0' |
|
1993
|
|
|
|
|
1994
|
|
|
# Whether or not any special initial codes were encoded, iterate |
|
1995
|
|
|
# through the length of the word in the main encoding loop |
|
1996
|
|
|
while pos < len(word): |
|
1997
|
|
|
origpos = pos |
|
1998
|
|
|
for k in _alpha_sis_basic_order: |
|
1999
|
|
|
if word[pos:].startswith(k): |
|
2000
|
|
|
if isinstance(_alpha_sis_basic[k], tuple): |
|
2001
|
|
|
newalpha = [] |
|
2002
|
|
|
for i in range(len(_alpha_sis_basic[k])): |
|
2003
|
|
|
newalpha += [_ + _alpha_sis_basic[k][i] for _ in alpha] |
|
2004
|
|
|
alpha = newalpha |
|
2005
|
|
|
else: |
|
2006
|
|
|
alpha = [_ + _alpha_sis_basic[k] for _ in alpha] |
|
2007
|
|
|
pos += len(k) |
|
2008
|
|
|
break |
|
2009
|
|
|
if pos == origpos: |
|
2010
|
|
|
alpha = [_ + '_' for _ in alpha] |
|
2011
|
|
|
pos += 1 |
|
2012
|
|
|
|
|
2013
|
|
|
# Trim doublets and placeholders |
|
2014
|
|
|
for i in range(len(alpha)): |
|
2015
|
|
|
pos = 1 |
|
2016
|
|
|
while pos < len(alpha[i]): |
|
2017
|
|
|
if alpha[i][pos] == alpha[i][pos-1]: |
|
2018
|
|
|
alpha[i] = alpha[i][:pos]+alpha[i][pos+1:] |
|
2019
|
|
|
pos += 1 |
|
2020
|
|
|
alpha = (_.replace('_', '') for _ in alpha) |
|
|
|
|
|
|
2021
|
|
|
|
|
2022
|
|
|
# Trim codes and return tuple |
|
2023
|
|
|
alpha = ((_ + ('0'*maxlength))[:maxlength] for _ in alpha) |
|
2024
|
|
|
return tuple(alpha) |
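# Note (added for clarity, not in the original source): the return value is a
# tuple because several _alpha_sis_basic entries (e.g. 'CH', 'CK', 'CZ') have
# more than one possible code, so a single name can expand into several
# candidate encodings; the first element is the primary coding, as the
# 'Christopher' doctest above illustrates.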
|
2025
|
|
|
|
|
2026
|
|
|
|
|
2027
|
|
|
def fuzzy_soundex(word, maxlength=5, zero_pad=True): |
|
2028
|
|
|
"""Return the Fuzzy Soundex code for a word. |
|
2029
|
|
|
|
|
2030
|
|
|
Fuzzy Soundex is an algorithm derived from Soundex, defined in |
|
2031
|
|
|
:cite:`Holmes:2002`. |
|
2032
|
|
|
|
|
2033
|
|
|
:param str word: the word to transform |
|
2034
|
|
|
:param int maxlength: the length of the code returned (defaults to 5)
|
2035
|
|
|
:param bool zero_pad: pad the end of the return value with 0s to achieve |
|
2036
|
|
|
a maxlength string |
|
2037
|
|
|
:returns: the Fuzzy Soundex value |
|
2038
|
|
|
:rtype: str |
|
2039
|
|
|
|
|
2040
|
|
|
>>> fuzzy_soundex('Christopher') |
|
2041
|
|
|
'K6931' |
|
2042
|
|
|
>>> fuzzy_soundex('Niall') |
|
2043
|
|
|
'N4000' |
|
2044
|
|
|
>>> fuzzy_soundex('Smith') |
|
2045
|
|
|
'S5300' |
|
2046
|
|
|
>>> fuzzy_soundex('Smith') |
|
2047
|
|
|
'S5300' |
|
2048
|
|
|
""" |
|
2049
|
|
|
_fuzzy_soundex_translation = dict(zip((ord(_) for _ in |
|
|
|
|
|
|
2050
|
|
|
'ABCDEFGHIJKLMNOPQRSTUVWXYZ'), |
|
2051
|
|
|
'0193017-07745501769301-7-9')) |
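# Reading of the table above (added for clarity): the digit string maps A-Z
# positionally; '0' marks vowels and '-' marks H, W and Y. Both markers are
# stripped later ('-' right after translation, '0' near the end), so only the
# consonant classes survive in the code.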
|
2052
|
|
|
|
|
2053
|
|
|
word = normalize('NFKD', text_type(word.upper())) |
|
2054
|
|
|
word = word.replace('ß', 'SS') |
|
2055
|
|
|
|
|
2056
|
|
|
# Clamp maxlength to [4, 64] |
|
2057
|
|
|
if maxlength is not None: |
|
2058
|
|
|
maxlength = min(max(4, maxlength), 64) |
|
2059
|
|
|
else: |
|
2060
|
|
|
maxlength = 64 |
|
2061
|
|
|
|
|
2062
|
|
|
if not word: |
|
2063
|
|
|
if zero_pad: |
|
2064
|
|
|
return '0' * maxlength |
|
2065
|
|
|
return '0' |
|
2066
|
|
|
|
|
2067
|
|
|
if word[:2] in {'CS', 'CZ', 'TS', 'TZ'}: |
|
2068
|
|
|
word = 'SS' + word[2:] |
|
2069
|
|
|
elif word[:2] == 'GN': |
|
2070
|
|
|
word = 'NN' + word[2:] |
|
2071
|
|
|
elif word[:2] in {'HR', 'WR'}: |
|
2072
|
|
|
word = 'RR' + word[2:] |
|
2073
|
|
|
elif word[:2] == 'HW': |
|
2074
|
|
|
word = 'WW' + word[2:] |
|
2075
|
|
|
elif word[:2] in {'KN', 'NG'}: |
|
2076
|
|
|
word = 'NN' + word[2:] |
|
2077
|
|
|
|
|
2078
|
|
|
if word[-2:] == 'CH': |
|
2079
|
|
|
word = word[:-2] + 'KK' |
|
2080
|
|
|
elif word[-2:] == 'NT': |
|
2081
|
|
|
word = word[:-2] + 'TT' |
|
2082
|
|
|
elif word[-2:] == 'RT': |
|
2083
|
|
|
word = word[:-2] + 'RR' |
|
2084
|
|
|
elif word[-3:] == 'RDT': |
|
2085
|
|
|
word = word[:-3] + 'RR' |
|
2086
|
|
|
|
|
2087
|
|
|
word = word.replace('CA', 'KA') |
|
2088
|
|
|
word = word.replace('CC', 'KK') |
|
2089
|
|
|
word = word.replace('CK', 'KK') |
|
2090
|
|
|
word = word.replace('CE', 'SE') |
|
2091
|
|
|
word = word.replace('CHL', 'KL') |
|
2092
|
|
|
word = word.replace('CL', 'KL') |
|
2093
|
|
|
word = word.replace('CHR', 'KR') |
|
2094
|
|
|
word = word.replace('CR', 'KR') |
|
2095
|
|
|
word = word.replace('CI', 'SI') |
|
2096
|
|
|
word = word.replace('CO', 'KO') |
|
2097
|
|
|
word = word.replace('CU', 'KU') |
|
2098
|
|
|
word = word.replace('CY', 'SY') |
|
2099
|
|
|
word = word.replace('DG', 'GG') |
|
2100
|
|
|
word = word.replace('GH', 'HH') |
|
2101
|
|
|
word = word.replace('MAC', 'MK') |
|
2102
|
|
|
word = word.replace('MC', 'MK') |
|
2103
|
|
|
word = word.replace('NST', 'NSS') |
|
2104
|
|
|
word = word.replace('PF', 'FF') |
|
2105
|
|
|
word = word.replace('PH', 'FF') |
|
2106
|
|
|
word = word.replace('SCH', 'SSS') |
|
2107
|
|
|
word = word.replace('TIO', 'SIO') |
|
2108
|
|
|
word = word.replace('TIA', 'SIO') |
|
2109
|
|
|
word = word.replace('TCH', 'CHH') |
|
2110
|
|
|
|
|
2111
|
|
|
sdx = word.translate(_fuzzy_soundex_translation) |
|
2112
|
|
|
sdx = sdx.replace('-', '') |
|
2113
|
|
|
|
|
2114
|
|
|
# remove repeating characters |
|
2115
|
|
|
sdx = _delete_consecutive_repeats(sdx) |
|
2116
|
|
|
|
|
2117
|
|
|
if word[0] in {'H', 'W', 'Y'}: |
|
2118
|
|
|
sdx = word[0] + sdx |
|
2119
|
|
|
else: |
|
2120
|
|
|
sdx = word[0] + sdx[1:] |
|
2121
|
|
|
|
|
2122
|
|
|
sdx = sdx.replace('0', '') |
|
2123
|
|
|
|
|
2124
|
|
|
if zero_pad: |
|
2125
|
|
|
sdx += ('0'*maxlength) |
|
2126
|
|
|
|
|
2127
|
|
|
return sdx[:maxlength] |
|
2128
|
|
|
|
|
2129
|
|
|
|
|
2130
|
|
|
def phonex(word, maxlength=4, zero_pad=True): |
|
2131
|
|
|
"""Return the Phonex code for a word. |
|
2132
|
|
|
|
|
2133
|
|
|
Phonex is an algorithm derived from Soundex, defined in :cite:`Lait:1996`. |
|
2134
|
|
|
|
|
2135
|
|
|
:param str word: the word to transform |
|
2136
|
|
|
:param int maxlength: the length of the code returned (defaults to 4) |
|
2137
|
|
|
:param bool zero_pad: pad the end of the return value with 0s to achieve |
|
2138
|
|
|
a maxlength string |
|
2139
|
|
|
:returns: the Phonex value |
|
2140
|
|
|
:rtype: str |
|
2141
|
|
|
|
|
2142
|
|
|
>>> phonex('Christopher') |
|
2143
|
|
|
'C623' |
|
2144
|
|
|
>>> phonex('Niall') |
|
2145
|
|
|
'N400' |
|
2146
|
|
|
>>> phonex('Schmidt') |
|
2147
|
|
|
'S253' |
|
2148
|
|
|
>>> phonex('Smith') |
|
2149
|
|
|
'S530' |
|
2150
|
|
|
""" |
|
2151
|
|
|
name = normalize('NFKD', text_type(word.upper())) |
|
2152
|
|
|
name = name.replace('ß', 'SS') |
|
2153
|
|
|
|
|
2154
|
|
|
# Clamp maxlength to [4, 64] |
|
2155
|
|
|
if maxlength is not None: |
|
2156
|
|
|
maxlength = min(max(4, maxlength), 64) |
|
2157
|
|
|
else: |
|
2158
|
|
|
maxlength = 64 |
|
2159
|
|
|
|
|
2160
|
|
|
name_code = last = '' |
|
2161
|
|
|
|
|
2162
|
|
|
# Deletions effected by replacing with next letter which |
|
2163
|
|
|
# will be ignored due to duplicate handling of Soundex code. |
|
2164
|
|
|
# This is faster than 'moving' all subsequent letters. |
|
2165
|
|
|
|
|
2166
|
|
|
# Remove any trailing Ss |
|
2167
|
|
|
while name[-1:] == 'S': |
|
2168
|
|
|
name = name[:-1] |
|
2169
|
|
|
|
|
2170
|
|
|
# Phonetic equivalents of first 2 characters |
|
2171
|
|
|
# Works since duplicate letters are ignored |
|
2172
|
|
|
if name[:2] == 'KN': |
|
2173
|
|
|
name = 'N' + name[2:] # KN.. == N.. |
|
2174
|
|
|
elif name[:2] == 'PH': |
|
2175
|
|
|
name = 'F' + name[2:] # PH.. == F.. (H ignored anyway) |
|
2176
|
|
|
elif name[:2] == 'WR': |
|
2177
|
|
|
name = 'R' + name[2:] # WR.. == R.. |
|
2178
|
|
|
|
|
2179
|
|
|
if name: |
|
2180
|
|
|
# Special case, ignore H first letter (subsequent Hs ignored anyway) |
|
2181
|
|
|
# Works since duplicate letters are ignored |
|
2182
|
|
|
if name[0] == 'H': |
|
2183
|
|
|
name = name[1:] |
|
2184
|
|
|
|
|
2185
|
|
|
if name: |
|
2186
|
|
|
# Phonetic equivalents of first character |
|
2187
|
|
|
if name[0] in {'A', 'E', 'I', 'O', 'U', 'Y'}: |
|
2188
|
|
|
name = 'A' + name[1:] |
|
2189
|
|
|
elif name[0] in {'B', 'P'}: |
|
2190
|
|
|
name = 'B' + name[1:] |
|
2191
|
|
|
elif name[0] in {'V', 'F'}: |
|
2192
|
|
|
name = 'F' + name[1:] |
|
2193
|
|
|
elif name[0] in {'C', 'K', 'Q'}: |
|
2194
|
|
|
name = 'C' + name[1:] |
|
2195
|
|
|
elif name[0] in {'G', 'J'}: |
|
2196
|
|
|
name = 'G' + name[1:] |
|
2197
|
|
|
elif name[0] in {'S', 'Z'}: |
|
2198
|
|
|
name = 'S' + name[1:] |
|
2199
|
|
|
|
|
2200
|
|
|
name_code = last = name[0] |
|
2201
|
|
|
|
|
2202
|
|
|
# MODIFIED SOUNDEX CODE |
|
2203
|
|
|
for i in range(1, len(name)): |
|
2204
|
|
|
code = '0' |
|
2205
|
|
|
if name[i] in {'B', 'F', 'P', 'V'}: |
|
2206
|
|
|
code = '1' |
|
2207
|
|
|
elif name[i] in {'C', 'G', 'J', 'K', 'Q', 'S', 'X', 'Z'}: |
|
2208
|
|
|
code = '2' |
|
2209
|
|
|
elif name[i] in {'D', 'T'}: |
|
2210
|
|
|
if name[i+1:i+2] != 'C': |
|
2211
|
|
|
code = '3' |
|
2212
|
|
|
elif name[i] == 'L': |
|
2213
|
|
|
if (name[i+1:i+2] in {'A', 'E', 'I', 'O', 'U', 'Y'} or |
|
2214
|
|
|
i+1 == len(name)): |
|
2215
|
|
|
code = '4' |
|
2216
|
|
|
elif name[i] in {'M', 'N'}: |
|
2217
|
|
|
if name[i+1:i+2] in {'D', 'G'}: |
|
2218
|
|
|
name = name[:i+1] + name[i] + name[i+2:] |
|
2219
|
|
|
code = '5' |
|
2220
|
|
|
elif name[i] == 'R': |
|
2221
|
|
|
if (name[i+1:i+2] in {'A', 'E', 'I', 'O', 'U', 'Y'} or |
|
2222
|
|
|
i+1 == len(name)): |
|
2223
|
|
|
code = '6' |
|
2224
|
|
|
|
|
2225
|
|
|
if code != last and code != '0' and i != 0: |
|
2226
|
|
|
name_code += code |
|
2227
|
|
|
|
|
2228
|
|
|
last = name_code[-1] |
|
2229
|
|
|
|
|
2230
|
|
|
if zero_pad: |
|
2231
|
|
|
name_code += '0' * maxlength |
|
2232
|
|
|
if not name_code: |
|
2233
|
|
|
name_code = '0' |
|
2234
|
|
|
return name_code[:maxlength] |
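# Worked example (informal, consistent with the 'Smith' doctest above): for
# 'SMITH' the first letter stays 'S'; M -> '5', I -> '0' (skipped),
# T -> '3' (the next letter is not 'C'), H -> '0' (skipped); zero-padding
# then gives 'S530'.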
|
2235
|
|
|
|
|
2236
|
|
|
|
|
2237
|
|
|
def phonem(word): |
|
2238
|
|
|
"""Return the Phonem code for a word. |
|
2239
|
|
|
|
|
2240
|
|
|
Phonem is defined in :cite:`Wilde:1988`. |
|
2241
|
|
|
|
|
2242
|
|
|
This version is based on the Perl implementation documented at |
|
2243
|
|
|
:cite:`Wilz:2005`. |
|
2244
|
|
|
It includes some enhancements presented in the Java port at |
|
2245
|
|
|
:cite:`dcm4che:2011`. |
|
2246
|
|
|
|
|
2247
|
|
|
Phonem is intended chiefly for German names/words. |
|
2248
|
|
|
|
|
2249
|
|
|
:param str word: the word to transform |
|
2250
|
|
|
:returns: the Phonem value |
|
2251
|
|
|
:rtype: str |
|
2252
|
|
|
|
|
2253
|
|
|
>>> phonem('Christopher') |
|
2254
|
|
|
'CRYSDOVR' |
|
2255
|
|
|
>>> phonem('Niall') |
|
2256
|
|
|
'NYAL' |
|
2257
|
|
|
>>> phonem('Smith') |
|
2258
|
|
|
'SMYD' |
|
2259
|
|
|
>>> phonem('Schmidt') |
|
2260
|
|
|
'CMYD' |
|
2261
|
|
|
""" |
|
2262
|
|
|
_phonem_substitutions = (('SC', 'C'), ('SZ', 'C'), ('CZ', 'C'), |
|
2263
|
|
|
('TZ', 'C'), ('TS', 'C'), ('KS', 'X'), |
|
2264
|
|
|
('PF', 'V'), ('QU', 'KW'), ('PH', 'V'), |
|
2265
|
|
|
('UE', 'Y'), ('AE', 'E'), ('OE', 'Ö'), |
|
2266
|
|
|
('EI', 'AY'), ('EY', 'AY'), ('EU', 'OY'), |
|
2267
|
|
|
('AU', 'A§'), ('OU', '§')) |
|
2268
|
|
|
_phonem_translation = dict(zip((ord(_) for _ in |
|
|
|
|
|
|
2269
|
|
|
'ZKGQÇÑßFWPTÁÀÂÃÅÄÆÉÈÊËIJÌÍÎÏÜݧÚÙÛÔÒÓÕØ'), |
|
2270
|
|
|
'CCCCCNSVVBDAAAAAEEEEEEYYYYYYYYUUUUOOOOÖ')) |
|
2271
|
|
|
|
|
2272
|
|
|
word = normalize('NFC', text_type(word.upper())) |
|
2273
|
|
|
for i, j in _phonem_substitutions: |
|
2274
|
|
|
word = word.replace(i, j) |
|
2275
|
|
|
word = word.translate(_phonem_translation) |
|
2276
|
|
|
|
|
2277
|
|
|
return ''.join(c for c in _delete_consecutive_repeats(word) |
|
2278
|
|
|
if c in {'A', 'B', 'C', 'D', 'L', 'M', 'N', 'O', 'R', 'S', |
|
2279
|
|
|
'U', 'V', 'W', 'X', 'Y', 'Ö'}) |
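# Worked example (informal, consistent with the 'Schmidt' doctest above):
# 'SCHMIDT' -> 'CHMIDT' (SC -> C) -> 'CHMYDD' via the translation table
# (I -> Y, T -> D) -> 'CHMYD' after collapsing the doubled D -> 'CMYD' once
# the H (not in the retained-character set) is removed.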
|
2280
|
|
|
|
|
2281
|
|
|
|
|
2282
|
|
|
def phonix(word, maxlength=4, zero_pad=True): |
|
2283
|
|
|
"""Return the Phonix code for a word. |
|
2284
|
|
|
|
|
2285
|
|
|
Phonix is a Soundex-like algorithm defined in :cite:`Gadd:1990`. |
|
2286
|
|
|
|
|
2287
|
|
|
This implementation is based on: |
|
2288
|
|
|
- :cite:`Pfeifer:2000` |
|
2289
|
|
|
- :cite:`Christen:2011` |
|
2290
|
|
|
- :cite:`Kollar:2007` |
|
2291
|
|
|
|
|
2292
|
|
|
:param str word: the word to transform |
|
2293
|
|
|
:param int maxlength: the length of the code returned (defaults to 4) |
|
2294
|
|
|
:param bool zero_pad: pad the end of the return value with 0s to achieve |
|
2295
|
|
|
a maxlength string |
|
2296
|
|
|
:returns: the Phonix value |
|
2297
|
|
|
:rtype: str |
|
2298
|
|
|
|
|
2299
|
|
|
>>> phonix('Christopher') |
|
2300
|
|
|
'K683' |
|
2301
|
|
|
>>> phonix('Niall') |
|
2302
|
|
|
'N400' |
|
2303
|
|
|
>>> phonix('Smith') |
|
2304
|
|
|
'S530' |
|
2305
|
|
|
>>> phonix('Schmidt') |
|
2306
|
|
|
'S530' |
|
2307
|
|
|
""" |
|
2308
|
|
|
# pylint: disable=too-many-branches |
|
2309
|
|
|
def _start_repl(word, src, tar, post=None): |
|
2310
|
|
|
r"""Replace src with tar at the start of word.""" |
|
2311
|
|
|
if post: |
|
2312
|
|
|
for i in post: |
|
2313
|
|
|
if word.startswith(src+i): |
|
2314
|
|
|
return tar + word[len(src):] |
|
2315
|
|
|
elif word.startswith(src): |
|
2316
|
|
|
return tar + word[len(src):] |
|
2317
|
|
|
return word |
|
2318
|
|
|
|
|
2319
|
|
|
def _end_repl(word, src, tar, pre=None): |
|
2320
|
|
|
r"""Replace src with tar at the end of word.""" |
|
2321
|
|
|
if pre: |
|
2322
|
|
|
for i in pre: |
|
2323
|
|
|
if word.endswith(i+src): |
|
2324
|
|
|
return word[:-len(src)] + tar |
|
2325
|
|
|
elif word.endswith(src): |
|
2326
|
|
|
return word[:-len(src)] + tar |
|
2327
|
|
|
return word |
|
2328
|
|
|
|
|
2329
|
|
|
def _mid_repl(word, src, tar, pre=None, post=None): |
|
2330
|
|
|
r"""Replace src with tar in the middle of word.""" |
|
2331
|
|
|
if pre or post: |
|
2332
|
|
|
if not pre: |
|
2333
|
|
|
return word[0] + _all_repl(word[1:], src, tar, pre, post) |
|
2334
|
|
|
elif not post: |
|
2335
|
|
|
return _all_repl(word[:-1], src, tar, pre, post) + word[-1] |
|
2336
|
|
|
return _all_repl(word, src, tar, pre, post) |
|
2337
|
|
|
return (word[0] + _all_repl(word[1:-1], src, tar, pre, post) + |
|
2338
|
|
|
word[-1]) |
|
2339
|
|
|
|
|
2340
|
|
|
def _all_repl(word, src, tar, pre=None, post=None): |
|
2341
|
|
|
r"""Replace src with tar anywhere in word.""" |
|
2342
|
|
|
if pre or post: |
|
2343
|
|
|
post = post if post else frozenset(('',))
pre = pre if pre else frozenset(('',))
|
2351
|
|
|
|
|
2352
|
|
|
for i, j in ((i, j) for i in pre for j in post): |
|
2353
|
|
|
word = word.replace(i+src+j, i+tar+j) |
|
2354
|
|
|
return word |
|
2355
|
|
|
else: |
|
2356
|
|
|
return word.replace(src, tar) |
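# Illustrative example (not from the original source): the helpers above take
# optional pre/post character classes, e.g. (using the _vow set defined
# below)
#     _start_repl('CHRIS', 'CHR', 'KR', _vow)  ->  'KRIS'
# because the leading 'CHR' is followed by a vowel; without a following vowel
# the word would be returned unchanged.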
|
2357
|
|
|
|
|
2358
|
|
|
_vow = {'A', 'E', 'I', 'O', 'U'} |
|
2359
|
|
|
_con = {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', |
|
2360
|
|
|
'R', 'S', 'T', 'V', 'W', 'X', 'Y', 'Z'} |
|
2361
|
|
|
|
|
2362
|
|
|
_phonix_substitutions = ((_all_repl, 'DG', 'G'), |
|
2363
|
|
|
(_all_repl, 'CO', 'KO'), |
|
2364
|
|
|
(_all_repl, 'CA', 'KA'), |
|
2365
|
|
|
(_all_repl, 'CU', 'KU'), |
|
2366
|
|
|
(_all_repl, 'CY', 'SI'), |
|
2367
|
|
|
(_all_repl, 'CI', 'SI'), |
|
2368
|
|
|
(_all_repl, 'CE', 'SE'), |
|
2369
|
|
|
(_start_repl, 'CL', 'KL', _vow), |
|
2370
|
|
|
(_all_repl, 'CK', 'K'), |
|
2371
|
|
|
(_end_repl, 'GC', 'K'), |
|
2372
|
|
|
(_end_repl, 'JC', 'K'), |
|
2373
|
|
|
(_start_repl, 'CHR', 'KR', _vow), |
|
2374
|
|
|
(_start_repl, 'CR', 'KR', _vow), |
|
2375
|
|
|
(_start_repl, 'WR', 'R'), |
|
2376
|
|
|
(_all_repl, 'NC', 'NK'), |
|
2377
|
|
|
(_all_repl, 'CT', 'KT'), |
|
2378
|
|
|
(_all_repl, 'PH', 'F'), |
|
2379
|
|
|
(_all_repl, 'AA', 'AR'), |
|
2380
|
|
|
(_all_repl, 'SCH', 'SH'), |
|
2381
|
|
|
(_all_repl, 'BTL', 'TL'), |
|
2382
|
|
|
(_all_repl, 'GHT', 'T'), |
|
2383
|
|
|
(_all_repl, 'AUGH', 'ARF'), |
|
2384
|
|
|
(_mid_repl, 'LJ', 'LD', _vow, _vow), |
|
2385
|
|
|
(_all_repl, 'LOUGH', 'LOW'), |
|
2386
|
|
|
(_start_repl, 'Q', 'KW'), |
|
2387
|
|
|
(_start_repl, 'KN', 'N'), |
|
2388
|
|
|
(_end_repl, 'GN', 'N'), |
|
2389
|
|
|
(_all_repl, 'GHN', 'N'), |
|
2390
|
|
|
(_end_repl, 'GNE', 'N'), |
|
2391
|
|
|
(_all_repl, 'GHNE', 'NE'), |
|
2392
|
|
|
(_end_repl, 'GNES', 'NS'), |
|
2393
|
|
|
(_start_repl, 'GN', 'N'), |
|
2394
|
|
|
(_mid_repl, 'GN', 'N', None, _con), |
|
2395
|
|
|
(_end_repl, 'GN', 'N'), |
|
2396
|
|
|
(_start_repl, 'PS', 'S'), |
|
2397
|
|
|
(_start_repl, 'PT', 'T'), |
|
2398
|
|
|
(_start_repl, 'CZ', 'C'), |
|
2399
|
|
|
(_mid_repl, 'WZ', 'Z', _vow), |
|
2400
|
|
|
(_mid_repl, 'CZ', 'CH'), |
|
2401
|
|
|
(_all_repl, 'LZ', 'LSH'), |
|
2402
|
|
|
(_all_repl, 'RZ', 'RSH'), |
|
2403
|
|
|
(_mid_repl, 'Z', 'S', None, _vow), |
|
2404
|
|
|
(_all_repl, 'ZZ', 'TS'), |
|
2405
|
|
|
(_mid_repl, 'Z', 'TS', _con), |
|
2406
|
|
|
(_all_repl, 'HROUG', 'REW'), |
|
2407
|
|
|
(_all_repl, 'OUGH', 'OF'), |
|
2408
|
|
|
(_mid_repl, 'Q', 'KW', _vow, _vow), |
|
2409
|
|
|
(_mid_repl, 'J', 'Y', _vow, _vow), |
|
2410
|
|
|
(_start_repl, 'YJ', 'Y', _vow), |
|
2411
|
|
|
(_start_repl, 'GH', 'G'), |
|
2412
|
|
|
(_end_repl, 'GH', 'E', _vow), |
|
2413
|
|
|
(_start_repl, 'CY', 'S'), |
|
2414
|
|
|
(_all_repl, 'NX', 'NKS'), |
|
2415
|
|
|
(_start_repl, 'PF', 'F'), |
|
2416
|
|
|
(_end_repl, 'DT', 'T'), |
|
2417
|
|
|
(_end_repl, 'TL', 'TIL'), |
|
2418
|
|
|
(_end_repl, 'DL', 'DIL'), |
|
2419
|
|
|
(_all_repl, 'YTH', 'ITH'), |
|
2420
|
|
|
(_start_repl, 'TJ', 'CH', _vow), |
|
2421
|
|
|
(_start_repl, 'TSJ', 'CH', _vow), |
|
2422
|
|
|
(_start_repl, 'TS', 'T', _vow), |
|
2423
|
|
|
(_all_repl, 'TCH', 'CH'), |
|
2424
|
|
|
(_mid_repl, 'WSK', 'VSKIE', _vow), |
|
2425
|
|
|
(_end_repl, 'WSK', 'VSKIE', _vow), |
|
2426
|
|
|
(_start_repl, 'MN', 'N', _vow), |
|
2427
|
|
|
(_start_repl, 'PN', 'N', _vow), |
|
2428
|
|
|
(_mid_repl, 'STL', 'SL', _vow), |
|
2429
|
|
|
(_end_repl, 'STL', 'SL', _vow), |
|
2430
|
|
|
(_end_repl, 'TNT', 'ENT'), |
|
2431
|
|
|
(_end_repl, 'EAUX', 'OH'), |
|
2432
|
|
|
(_all_repl, 'EXCI', 'ECS'), |
|
2433
|
|
|
(_all_repl, 'X', 'ECS'), |
|
2434
|
|
|
(_end_repl, 'NED', 'ND'), |
|
2435
|
|
|
(_all_repl, 'JR', 'DR'), |
|
2436
|
|
|
(_end_repl, 'EE', 'EA'), |
|
2437
|
|
|
(_all_repl, 'ZS', 'S'), |
|
2438
|
|
|
(_mid_repl, 'R', 'AH', _vow, _con), |
|
2439
|
|
|
(_end_repl, 'R', 'AH', _vow), |
|
2440
|
|
|
(_mid_repl, 'HR', 'AH', _vow, _con), |
|
2441
|
|
|
(_end_repl, 'HR', 'AH', _vow), |
|
2442
|
|
|
(_end_repl, 'HR', 'AH', _vow), |
|
2443
|
|
|
(_end_repl, 'RE', 'AR'), |
|
2444
|
|
|
(_end_repl, 'R', 'AH', _vow), |
|
2445
|
|
|
(_all_repl, 'LLE', 'LE'), |
|
2446
|
|
|
(_end_repl, 'LE', 'ILE', _con), |
|
2447
|
|
|
(_end_repl, 'LES', 'ILES', _con), |
|
2448
|
|
|
(_end_repl, 'E', ''), |
|
2449
|
|
|
(_end_repl, 'ES', 'S'), |
|
2450
|
|
|
(_end_repl, 'SS', 'AS', _vow), |
|
2451
|
|
|
(_end_repl, 'MB', 'M', _vow), |
|
2452
|
|
|
(_all_repl, 'MPTS', 'MPS'), |
|
2453
|
|
|
(_all_repl, 'MPS', 'MS'), |
|
2454
|
|
|
(_all_repl, 'MPT', 'MT')) |
|
2455
|
|
|
|
|
2456
|
|
|
_phonix_translation = dict(zip((ord(_) for _ in |
|
|
|
|
|
|
2457
|
|
|
'ABCDEFGHIJKLMNOPQRSTUVWXYZ'), |
|
2458
|
|
|
'01230720022455012683070808')) |
|
2459
|
|
|
|
|
2460
|
|
|
sdx = '' |
|
2461
|
|
|
|
|
2462
|
|
|
word = normalize('NFKD', text_type(word.upper())) |
|
2463
|
|
|
word = word.replace('ß', 'SS') |
|
2464
|
|
|
word = ''.join(c for c in word if c in |
|
2465
|
|
|
{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
|
2466
|
|
|
'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
|
2467
|
|
|
'Y', 'Z'}) |
|
2468
|
|
|
if word: |
|
2469
|
|
|
for trans in _phonix_substitutions: |
|
2470
|
|
|
word = trans[0](word, *trans[1:]) |
|
2471
|
|
|
if word[0] in {'A', 'E', 'I', 'O', 'U', 'Y'}: |
|
2472
|
|
|
sdx = 'v' + word[1:].translate(_phonix_translation) |
|
2473
|
|
|
else: |
|
2474
|
|
|
sdx = word[0] + word[1:].translate(_phonix_translation) |
|
2475
|
|
|
sdx = _delete_consecutive_repeats(sdx) |
|
2476
|
|
|
sdx = sdx.replace('0', '') |
|
2477
|
|
|
|
|
2478
|
|
|
# Clamp maxlength to [4, 64] |
|
2479
|
|
|
if maxlength is not None: |
|
2480
|
|
|
maxlength = min(max(4, maxlength), 64) |
|
2481
|
|
|
else: |
|
2482
|
|
|
maxlength = 64 |
|
2483
|
|
|
|
|
2484
|
|
|
if zero_pad: |
|
2485
|
|
|
sdx += '0' * maxlength |
|
2486
|
|
|
if not sdx: |
|
2487
|
|
|
sdx = '0' |
|
2488
|
|
|
return sdx[:maxlength] |
|
2489
|
|
|
|
|
2490
|
|
|
|
|
2491
|
|
|
def sfinxbis(word, maxlength=None): |
|
2492
|
|
|
"""Return the SfinxBis code for a word. |
|
2493
|
|
|
|
|
2494
|
|
|
SfinxBis is a Soundex-like algorithm defined in :cite:`Axelsson:2009`. |
|
2495
|
|
|
|
|
2496
|
|
|
This implementation follows the reference implementation: |
|
2497
|
|
|
:cite:`Sjoo:2009`. |
|
2498
|
|
|
|
|
2499
|
|
|
SfinxBis is intended chiefly for Swedish names. |
|
2500
|
|
|
|
|
2501
|
|
|
:param str word: the word to transform |
|
2502
|
|
|
:param int maxlength: the length of the code returned (defaults to |
|
2503
|
|
|
unlimited) |
|
2504
|
|
|
:returns: the SfinxBis value |
|
2505
|
|
|
:rtype: tuple |
|
2506
|
|
|
|
|
2507
|
|
|
>>> sfinxbis('Christopher') |
|
2508
|
|
|
('K68376',) |
|
2509
|
|
|
>>> sfinxbis('Niall') |
|
2510
|
|
|
('N4',) |
|
2511
|
|
|
>>> sfinxbis('Smith') |
|
2512
|
|
|
('S53',) |
|
2513
|
|
|
>>> sfinxbis('Schmidt') |
|
2514
|
|
|
('S53',) |
|
2515
|
|
|
|
|
2516
|
|
|
>>> sfinxbis('Johansson') |
|
2517
|
|
|
('J585',) |
|
2518
|
|
|
>>> sfinxbis('Sjöberg') |
|
2519
|
|
|
('#162',) |
|
2520
|
|
|
""" |
|
2521
|
|
|
adelstitler = (' DE LA ', ' DE LAS ', ' DE LOS ', ' VAN DE ', ' VAN DEN ', |
|
2522
|
|
|
' VAN DER ', ' VON DEM ', ' VON DER ', |
|
2523
|
|
|
' AF ', ' AV ', ' DA ', ' DE ', ' DEL ', ' DEN ', ' DES ', |
|
2524
|
|
|
' DI ', ' DO ', ' DON ', ' DOS ', ' DU ', ' E ', ' IN ', |
|
2525
|
|
|
' LA ', ' LE ', ' MAC ', ' MC ', ' VAN ', ' VON ', ' Y ', |
|
2526
|
|
|
' S:T ') |
|
2527
|
|
|
|
|
2528
|
|
|
_harde_vokaler = {'A', 'O', 'U', 'Å'} |
|
2529
|
|
|
_mjuka_vokaler = {'E', 'I', 'Y', 'Ä', 'Ö'} |
|
2530
|
|
|
_konsonanter = {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', |
|
2531
|
|
|
'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Z'} |
|
2532
|
|
|
_alfabet = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
|
2533
|
|
|
'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
|
2534
|
|
|
'Y', 'Z', 'Ä', 'Å', 'Ö'} |
|
2535
|
|
|
|
|
2536
|
|
|
_sfinxbis_translation = dict(zip((ord(_) for _ in |
|
|
|
|
|
|
2537
|
|
|
'BCDFGHJKLMNPQRSTVZAOUÅEIYÄÖ'), |
|
2538
|
|
|
'123729224551268378999999999')) |
|
2539
|
|
|
|
|
2540
|
|
|
_sfinxbis_substitutions = dict(zip((ord(_) for _ in |
|
2541
|
|
|
'WZÀÁÂÃÆÇÈÉÊËÌÍÎÏÑÒÓÔÕØÙÚÛÜÝ'), |
|
2542
|
|
|
'VSAAAAÄCEEEEIIIINOOOOÖUUUYY')) |
|
2543
|
|
|
|
|
2544
|
|
|
def _foersvensker(ordet): |
|
2545
|
|
|
"""Return the Swedish-ized form of the word.""" |
|
2546
|
|
|
ordet = ordet.replace('STIERN', 'STJÄRN') |
|
2547
|
|
|
ordet = ordet.replace('HIE', 'HJ') |
|
2548
|
|
|
ordet = ordet.replace('SIÖ', 'SJÖ') |
|
2549
|
|
|
ordet = ordet.replace('SCH', 'SH') |
|
2550
|
|
|
ordet = ordet.replace('QU', 'KV') |
|
2551
|
|
|
ordet = ordet.replace('IO', 'JO') |
|
2552
|
|
|
ordet = ordet.replace('PH', 'F') |
|
2553
|
|
|
|
|
2554
|
|
|
for i in _harde_vokaler: |
|
2555
|
|
|
ordet = ordet.replace(i+'Ü', i+'J') |
|
2556
|
|
|
ordet = ordet.replace(i+'Y', i+'J') |
|
2557
|
|
|
ordet = ordet.replace(i+'I', i+'J') |
|
2558
|
|
|
for i in _mjuka_vokaler: |
|
2559
|
|
|
ordet = ordet.replace(i+'Ü', i+'J') |
|
2560
|
|
|
ordet = ordet.replace(i+'Y', i+'J') |
|
2561
|
|
|
ordet = ordet.replace(i+'I', i+'J') |
|
2562
|
|
|
|
|
2563
|
|
|
if 'H' in ordet: |
|
2564
|
|
|
for i in _konsonanter: |
|
2565
|
|
|
ordet = ordet.replace('H'+i, i) |
|
2566
|
|
|
|
|
2567
|
|
|
ordet = ordet.translate(_sfinxbis_substitutions) |
|
2568
|
|
|
|
|
2569
|
|
|
ordet = ordet.replace('Ð', 'ETH') |
|
2570
|
|
|
ordet = ordet.replace('Þ', 'TH') |
|
2571
|
|
|
ordet = ordet.replace('ß', 'SS') |
|
2572
|
|
|
|
|
2573
|
|
|
return ordet |
|
2574
|
|
|
|
|
2575
|
|
|
def _koda_foersta_ljudet(ordet): |
|
2576
|
|
|
"""Return the word with the first sound coded.""" |
|
2577
|
|
|
if ordet[0:1] in _mjuka_vokaler or ordet[0:1] in _harde_vokaler: |
|
2578
|
|
|
ordet = '$' + ordet[1:] |
|
2579
|
|
|
elif ordet[0:2] in ('DJ', 'GJ', 'HJ', 'LJ'): |
|
2580
|
|
|
ordet = 'J' + ordet[2:] |
|
2581
|
|
|
elif ordet[0:1] == 'G' and ordet[1:2] in _mjuka_vokaler: |
|
2582
|
|
|
ordet = 'J' + ordet[1:] |
|
2583
|
|
|
elif ordet[0:1] == 'Q': |
|
2584
|
|
|
ordet = 'K' + ordet[1:] |
|
2585
|
|
|
elif (ordet[0:2] == 'CH' and |
|
2586
|
|
|
ordet[2:3] in frozenset(_mjuka_vokaler | _harde_vokaler)): |
|
2587
|
|
|
ordet = '#' + ordet[2:] |
|
2588
|
|
|
elif ordet[0:1] == 'C' and ordet[1:2] in _harde_vokaler: |
|
2589
|
|
|
ordet = 'K' + ordet[1:] |
|
2590
|
|
|
elif ordet[0:1] == 'C' and ordet[1:2] in _konsonanter: |
|
2591
|
|
|
ordet = 'K' + ordet[1:] |
|
2592
|
|
|
elif ordet[0:1] == 'X': |
|
2593
|
|
|
ordet = 'S' + ordet[1:] |
|
2594
|
|
|
elif ordet[0:1] == 'C' and ordet[1:2] in _mjuka_vokaler: |
|
2595
|
|
|
ordet = 'S' + ordet[1:] |
|
2596
|
|
|
elif ordet[0:3] in ('SKJ', 'STJ', 'SCH'): |
|
2597
|
|
|
ordet = '#' + ordet[3:] |
|
2598
|
|
|
elif ordet[0:2] in ('SH', 'KJ', 'TJ', 'SJ'): |
|
2599
|
|
|
ordet = '#' + ordet[2:] |
|
2600
|
|
|
elif ordet[0:2] == 'SK' and ordet[2:3] in _mjuka_vokaler: |
|
2601
|
|
|
ordet = '#' + ordet[2:] |
|
2602
|
|
|
elif ordet[0:1] == 'K' and ordet[1:2] in _mjuka_vokaler: |
|
2603
|
|
|
ordet = '#' + ordet[1:] |
|
2604
|
|
|
return ordet |
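# Illustrative example (not from the original source): for 'SJÖBERG' the
# initial 'SJ' falls under the ('SH', 'KJ', 'TJ', 'SJ') rule above, so the
# first sound is coded as '#' -- matching the '#162' result in the 'Sjöberg'
# doctest.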
|
2605
|
|
|
|
|
2606
|
|
|
# Step 1: convert to uppercase
|
2607
|
|
|
word = normalize('NFC', text_type(word.upper())) |
|
2608
|
|
|
word = word.replace('ß', 'SS') |
|
2609
|
|
|
word = word.replace('-', ' ') |
|
2610
|
|
|
|
|
2611
|
|
|
# Step 2: remove nobility prefixes
|
2612
|
|
|
for adelstitel in adelstitler: |
|
2613
|
|
|
while adelstitel in word: |
|
2614
|
|
|
word = word.replace(adelstitel, ' ') |
|
2615
|
|
|
if word.startswith(adelstitel[1:]): |
|
2616
|
|
|
word = word[len(adelstitel)-1:] |
|
2617
|
|
|
|
|
2618
|
|
|
# Split word into tokens |
|
2619
|
|
|
ordlista = word.split() |
|
2620
|
|
|
|
|
2621
|
|
|
# Step 3: remove doubled letters at the beginning of the name
|
2622
|
|
|
ordlista = [_delete_consecutive_repeats(ordet) for ordet in ordlista] |
|
2623
|
|
|
if not ordlista: |
|
2624
|
|
|
return ('',) |
|
2625
|
|
|
|
|
2626
|
|
|
# Step 4: Swedish-ize the word (försvenskning)
|
2627
|
|
|
ordlista = [_foersvensker(ordet) for ordet in ordlista] |
|
2628
|
|
|
|
|
2629
|
|
|
# Step 5: remove all characters that are not A-Ö (65-90, 196, 197, 214)
|
2630
|
|
|
ordlista = [''.join(c for c in ordet if c in _alfabet) |
|
2631
|
|
|
for ordet in ordlista] |
|
2632
|
|
|
|
|
2633
|
|
|
# Step 6: encode the first sound
|
2634
|
|
|
ordlista = [_koda_foersta_ljudet(ordet) for ordet in ordlista] |
|
2635
|
|
|
|
|
2636
|
|
|
# Step 7: split the name into two parts
|
2637
|
|
|
rest = [ordet[1:] for ordet in ordlista] |
|
2638
|
|
|
|
|
2639
|
|
|
# Step 8: apply phonetic transformations to the remainder
|
2640
|
|
|
rest = [ordet.replace('DT', 'T') for ordet in rest] |
|
2641
|
|
|
rest = [ordet.replace('X', 'KS') for ordet in rest] |
|
2642
|
|
|
|
|
2643
|
|
|
# Step 9: encode the remainder as a digit code
|
2644
|
|
|
for vokal in _mjuka_vokaler: |
|
2645
|
|
|
rest = [ordet.replace('C'+vokal, '8'+vokal) for ordet in rest] |
|
2646
|
|
|
rest = [ordet.translate(_sfinxbis_translation) for ordet in rest] |
|
2647
|
|
|
|
|
2648
|
|
|
# Step 10: remove adjacent duplicates
|
2649
|
|
|
rest = [_delete_consecutive_repeats(ordet) for ordet in rest] |
|
2650
|
|
|
|
|
2651
|
|
|
# Step 11: remove all '9's
|
2652
|
|
|
rest = [ordet.replace('9', '') for ordet in rest] |
|
2653
|
|
|
|
|
2654
|
|
|
# Step 12: put the parts back together
|
2655
|
|
|
ordlista = [''.join(ordet) for ordet in |
|
2656
|
|
|
zip((_[0:1] for _ in ordlista), rest)] |
|
2657
|
|
|
|
|
2658
|
|
|
# truncate, if maxlength is set |
|
2659
|
|
|
if maxlength and maxlength < _INFINITY: |
|
2660
|
|
|
ordlista = [ordet[:maxlength] for ordet in ordlista] |
|
2661
|
|
|
|
|
2662
|
|
|
return tuple(ordlista) |
|
2663
|
|
|
|
|
2664
|
|
|
|
|
2665
|
|
|
def phonet(word, mode=1, lang='de', trace=False): |
|
2666
|
|
|
"""Return the phonet code for a word. |
|
2667
|
|
|
|
|
2668
|
|
|
phonet ("Hannoveraner Phonetik") was developed by Jörg Michael and |
|
2669
|
|
|
documented in :cite:`Michael:1999`. |
|
2670
|
|
|
|
|
2671
|
|
|
This is a port of Jesper Zedlitz's code, which is licensed LGPL |
|
2672
|
|
|
:cite:`Zedlitz:2015`. |
|
2673
|
|
|
|
|
2674
|
|
|
That is, in turn, based on Michael's C code, which is also licensed LGPL |
|
2675
|
|
|
:cite:`Michael:2007`. |
|
2676
|
|
|
|
|
2677
|
|
|
:param str word: the word to transform |
|
2678
|
|
|
:param int mode: the phonet variant to employ (1 or 2)
|
2679
|
|
|
:param str lang: 'de' (default) for German |
|
2680
|
|
|
'none' for no language |
|
2681
|
|
|
:param bool trace: prints debugging info if True |
|
2682
|
|
|
:returns: the phonet value |
|
2683
|
|
|
:rtype: str |
|
2684
|
|
|
|
|
2685
|
|
|
>>> phonet('Christopher') |
|
2686
|
|
|
'KRISTOFA' |
|
2687
|
|
|
>>> phonet('Niall') |
|
2688
|
|
|
'NIAL' |
|
2689
|
|
|
>>> phonet('Smith') |
|
2690
|
|
|
'SMIT' |
|
2691
|
|
|
>>> phonet('Schmidt') |
|
2692
|
|
|
'SHMIT' |
|
2693
|
|
|
|
|
2694
|
|
|
>>> phonet('Christopher', mode=2) |
|
2695
|
|
|
'KRIZTUFA' |
|
2696
|
|
|
>>> phonet('Niall', mode=2) |
|
2697
|
|
|
'NIAL' |
|
2698
|
|
|
>>> phonet('Smith', mode=2) |
|
2699
|
|
|
'ZNIT' |
|
2700
|
|
|
>>> phonet('Schmidt', mode=2) |
|
2701
|
|
|
'ZNIT' |
|
2702
|
|
|
|
|
2703
|
|
|
>>> phonet('Christopher', lang='none') |
|
2704
|
|
|
'CHRISTOPHER' |
|
2705
|
|
|
>>> phonet('Niall', lang='none') |
|
2706
|
|
|
'NIAL' |
|
2707
|
|
|
>>> phonet('Smith', lang='none') |
|
2708
|
|
|
'SMITH' |
|
2709
|
|
|
>>> phonet('Schmidt', lang='none') |
|
2710
|
|
|
'SCHMIDT' |
|
2711
|
|
|
""" |
|
2712
|
|
|
# pylint: disable=too-many-branches |
|
2713
|
|
|
|
|
2714
|
|
|
_phonet_rules_no_lang = ( # separator chars |
|
2715
|
|
|
'´', ' ', ' ', |
|
2716
|
|
|
'"', ' ', ' ', |
|
2717
|
|
|
'`$', '', '', |
|
2718
|
|
|
'\'', ' ', ' ', |
|
2719
|
|
|
',', ',', ',', |
|
2720
|
|
|
';', ',', ',', |
|
2721
|
|
|
'-', ' ', ' ', |
|
2722
|
|
|
' ', ' ', ' ', |
|
2723
|
|
|
'.', '.', '.', |
|
2724
|
|
|
':', '.', '.', |
|
2725
|
|
|
# German umlauts |
|
2726
|
|
|
'Ä', 'AE', 'AE', |
|
2727
|
|
|
'Ö', 'OE', 'OE', |
|
2728
|
|
|
'Ü', 'UE', 'UE', |
|
2729
|
|
|
'ß', 'S', 'S', |
|
2730
|
|
|
# international umlauts |
|
2731
|
|
|
'À', 'A', 'A', |
|
2732
|
|
|
'Á', 'A', 'A', |
|
2733
|
|
|
'Â', 'A', 'A', |
|
2734
|
|
|
'Ã', 'A', 'A', |
|
2735
|
|
|
'Å', 'A', 'A', |
|
2736
|
|
|
'Æ', 'AE', 'AE', |
|
2737
|
|
|
'Ç', 'C', 'C', |
|
2738
|
|
|
'Ð', 'DJ', 'DJ', |
|
2739
|
|
|
'È', 'E', 'E', |
|
2740
|
|
|
'É', 'E', 'E', |
|
2741
|
|
|
'Ê', 'E', 'E', |
|
2742
|
|
|
'Ë', 'E', 'E', |
|
2743
|
|
|
'Ì', 'I', 'I', |
|
2744
|
|
|
'Í', 'I', 'I', |
|
2745
|
|
|
'Î', 'I', 'I', |
|
2746
|
|
|
'Ï', 'I', 'I', |
|
2747
|
|
|
'Ñ', 'NH', 'NH', |
|
2748
|
|
|
'Ò', 'O', 'O', |
|
2749
|
|
|
'Ó', 'O', 'O', |
|
2750
|
|
|
'Ô', 'O', 'O', |
|
2751
|
|
|
'Õ', 'O', 'O', |
|
2752
|
|
|
'Œ', 'OE', 'OE', |
|
2753
|
|
|
'Ø', 'OE', 'OE', |
|
2754
|
|
|
'Š', 'SH', 'SH', |
|
2755
|
|
|
'Þ', 'TH', 'TH', |
|
2756
|
|
|
'Ù', 'U', 'U', |
|
2757
|
|
|
'Ú', 'U', 'U', |
|
2758
|
|
|
'Û', 'U', 'U', |
|
2759
|
|
|
'Ý', 'Y', 'Y', |
|
2760
|
|
|
'Ÿ', 'Y', 'Y', |
|
2761
|
|
|
# 'normal' letters (A-Z) |
|
2762
|
|
|
'MC^', 'MAC', 'MAC', |
|
2763
|
|
|
'MC^', 'MAC', 'MAC', |
|
2764
|
|
|
'M´^', 'MAC', 'MAC', |
|
2765
|
|
|
'M\'^', 'MAC', 'MAC', |
|
2766
|
|
|
'O´^', 'O', 'O', |
|
2767
|
|
|
'O\'^', 'O', 'O', |
|
2768
|
|
|
'VAN DEN ^', 'VANDEN', 'VANDEN', |
|
2769
|
|
|
None, None, None) |
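# Reading of the table above (added for clarity; this is an interpretation,
# not original documentation): the rules are flat triples of
# (pattern, replacement for mode 1, replacement for mode 2); a None
# replacement appears to mean the rule is not applied in that mode, and the
# final (None, None, None) terminates the table.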
|
2770
|
|
|
|
|
2771
|
|
|
_phonet_rules_german = ( # separator chars |
|
2772
|
|
|
'´', ' ', ' ', |
|
2773
|
|
|
'"', ' ', ' ', |
|
2774
|
|
|
'`$', '', '', |
|
2775
|
|
|
'\'', ' ', ' ', |
|
2776
|
|
|
',', ' ', ' ', |
|
2777
|
|
|
';', ' ', ' ', |
|
2778
|
|
|
'-', ' ', ' ', |
|
2779
|
|
|
' ', ' ', ' ', |
|
2780
|
|
|
'.', '.', '.', |
|
2781
|
|
|
':', '.', '.', |
|
2782
|
|
|
# German umlauts |
|
2783
|
|
|
'ÄE', 'E', 'E', |
|
2784
|
|
|
'ÄU<', 'EU', 'EU', |
|
2785
|
|
|
'ÄV(AEOU)-<', 'EW', None, |
|
2786
|
|
|
'Ä$', 'Ä', None, |
|
2787
|
|
|
'Ä<', None, 'E', |
|
2788
|
|
|
'Ä', 'E', None, |
|
2789
|
|
|
'ÖE', 'Ö', 'Ö', |
|
2790
|
|
|
'ÖU', 'Ö', 'Ö', |
|
2791
|
|
|
'ÖVER--<', 'ÖW', None, |
|
2792
|
|
|
'ÖV(AOU)-', 'ÖW', None, |
|
2793
|
|
|
'ÜBEL(GNRW)-^^', 'ÜBL ', 'IBL ', |
|
2794
|
|
|
'ÜBER^^', 'ÜBA', 'IBA', |
|
2795
|
|
|
'ÜE', 'Ü', 'I', |
|
2796
|
|
|
'ÜVER--<', 'ÜW', None, |
|
2797
|
|
|
'ÜV(AOU)-', 'ÜW', None, |
|
2798
|
|
|
'Ü', None, 'I', |
|
2799
|
|
|
'ßCH<', None, 'Z', |
|
2800
|
|
|
'ß<', 'S', 'Z', |
|
2801
|
|
|
# international umlauts |
|
2802
|
|
|
'À<', 'A', 'A', |
|
2803
|
|
|
'Á<', 'A', 'A', |
|
2804
|
|
|
'Â<', 'A', 'A', |
|
2805
|
|
|
'Ã<', 'A', 'A', |
|
2806
|
|
|
'Å<', 'A', 'A', |
|
2807
|
|
|
'ÆER-', 'E', 'E', |
|
2808
|
|
|
'ÆU<', 'EU', 'EU', |
|
2809
|
|
|
'ÆV(AEOU)-<', 'EW', None, |
|
2810
|
|
|
'Æ$', 'Ä', None, |
|
2811
|
|
|
'Æ<', None, 'E', |
|
2812
|
|
|
'Æ', 'E', None, |
|
2813
|
|
|
'Ç', 'Z', 'Z', |
|
2814
|
|
|
'ÐÐ-', '', '', |
|
2815
|
|
|
'Ð', 'DI', 'TI', |
|
2816
|
|
|
'È<', 'E', 'E', |
|
2817
|
|
|
'É<', 'E', 'E', |
|
2818
|
|
|
'Ê<', 'E', 'E', |
|
2819
|
|
|
'Ë', 'E', 'E', |
|
2820
|
|
|
'Ì<', 'I', 'I', |
|
2821
|
|
|
'Í<', 'I', 'I', |
|
2822
|
|
|
'Î<', 'I', 'I', |
|
2823
|
|
|
'Ï', 'I', 'I', |
|
2824
|
|
|
'ÑÑ-', '', '', |
|
2825
|
|
|
'Ñ', 'NI', 'NI', |
|
2826
|
|
|
'Ò<', 'O', 'U', |
|
2827
|
|
|
'Ó<', 'O', 'U', |
|
2828
|
|
|
'Ô<', 'O', 'U', |
|
2829
|
|
|
'Õ<', 'O', 'U', |
|
2830
|
|
|
'Œ<', 'Ö', 'Ö', |
|
2831
|
|
|
'Ø(IJY)-<', 'E', 'E', |
|
2832
|
|
|
'Ø<', 'Ö', 'Ö', |
|
2833
|
|
|
'Š', 'SH', 'Z', |
|
2834
|
|
|
'Þ', 'T', 'T', |
|
2835
|
|
|
'Ù<', 'U', 'U', |
|
2836
|
|
|
'Ú<', 'U', 'U', |
|
2837
|
|
|
'Û<', 'U', 'U', |
|
2838
|
|
|
'Ý<', 'I', 'I', |
|
2839
|
|
|
'Ÿ<', 'I', 'I', |
|
2840
|
|
|
# 'normal' letters (A-Z) |
|
2841
|
|
|
'ABELLE$', 'ABL', 'ABL', |
|
2842
|
|
|
'ABELL$', 'ABL', 'ABL', |
|
2843
|
|
|
'ABIENNE$', 'ABIN', 'ABIN', |
|
2844
|
|
|
'ACHME---^', 'ACH', 'AK', |
|
2845
|
|
|
'ACEY$', 'AZI', 'AZI', |
|
2846
|
|
|
'ADV', 'ATW', None, |
|
2847
|
|
|
'AEGL-', 'EK', None, |
|
2848
|
|
|
'AEU<', 'EU', 'EU', |
|
2849
|
|
|
'AE2', 'E', 'E', |
|
2850
|
|
|
'AFTRAUBEN------', 'AFT ', 'AFT ', |
|
2851
|
|
|
'AGL-1', 'AK', None, |
|
2852
|
|
|
'AGNI-^', 'AKN', 'AKN', |
|
2853
|
|
|
'AGNIE-', 'ANI', 'ANI', |
|
2854
|
|
|
'AGN(AEOU)-$', 'ANI', 'ANI', |
|
2855
|
|
|
'AH(AIOÖUÜY)-', 'AH', None, |
|
2856
|
|
|
'AIA2', 'AIA', 'AIA', |
|
2857
|
|
|
'AIE$', 'E', 'E', |
|
2858
|
|
|
'AILL(EOU)-', 'ALI', 'ALI', |
|
2859
|
|
|
'AINE$', 'EN', 'EN', |
|
2860
|
|
|
'AIRE$', 'ER', 'ER', |
|
2861
|
|
|
'AIR-', 'E', 'E', |
|
2862
|
|
|
'AISE$', 'ES', 'EZ', |
|
2863
|
|
|
'AISSANCE$', 'ESANS', 'EZANZ', |
|
2864
|
|
|
'AISSE$', 'ES', 'EZ', |
|
2865
|
|
|
'AIX$', 'EX', 'EX', |
|
2866
|
|
|
'AJ(AÄEÈÉÊIOÖUÜ)--', 'A', 'A', |
|
2867
|
|
|
'AKTIE', 'AXIE', 'AXIE', |
|
2868
|
|
|
'AKTUEL', 'AKTUEL', None, |
|
2869
|
|
|
'ALOI^', 'ALOI', 'ALUI', # Don't merge these rules |
|
2870
|
|
|
'ALOY^', 'ALOI', 'ALUI', # needed by 'check_rules' |
|
2871
|
|
|
'AMATEU(RS)-', 'AMATÖ', 'ANATÖ', |
|
2872
|
|
|
'ANCH(OEI)-', 'ANSH', 'ANZ', |
|
2873
|
|
|
'ANDERGEGANG----', 'ANDA GE', 'ANTA KE', |
|
2874
|
|
|
'ANDERGEHE----', 'ANDA ', 'ANTA ', |
|
2875
|
|
|
'ANDERGESETZ----', 'ANDA GE', 'ANTA KE', |
|
2876
|
|
|
'ANDERGING----', 'ANDA ', 'ANTA ', |
|
2877
|
|
|
'ANDERSETZ(ET)-----', 'ANDA ', 'ANTA ', |
|
2878
|
|
|
'ANDERZUGEHE----', 'ANDA ZU ', 'ANTA ZU ', |
|
2879
|
|
|
'ANDERZUSETZE-----', 'ANDA ZU ', 'ANTA ZU ', |
|
2880
|
|
|
'ANER(BKO)---^^', 'AN', None, |
|
2881
|
|
|
'ANHAND---^$', 'AN H', 'AN ', |
|
2882
|
|
|
'ANH(AÄEIOÖUÜY)--^^', 'AN', None, |
|
2883
|
|
|
'ANIELLE$', 'ANIEL', 'ANIL', |
|
2884
|
|
|
'ANIEL', 'ANIEL', None, |
|
2885
|
|
|
'ANSTELLE----^$', 'AN ST', 'AN ZT', |
|
2886
|
|
|
'ANTI^^', 'ANTI', 'ANTI', |
|
2887
|
|
|
'ANVER^^', 'ANFA', 'ANFA', |
|
2888
|
|
|
'ATIA$', 'ATIA', 'ATIA', |
|
2889
|
|
|
'ATIA(NS)--', 'ATI', 'ATI', |
|
2890
|
|
|
'ATI(AÄOÖUÜ)-', 'AZI', 'AZI', |
|
2891
|
|
|
'AUAU--', '', '', |
|
2892
|
|
|
'AUERE$', 'AUERE', None, |
|
2893
|
|
|
'AUERE(NS)-$', 'AUERE', None, |
|
2894
|
|
|
'AUERE(AIOUY)--', 'AUER', None, |
|
2895
|
|
|
'AUER(AÄIOÖUÜY)-', 'AUER', None, |
|
2896
|
|
|
'AUER<', 'AUA', 'AUA', |
|
2897
|
|
|
'AUF^^', 'AUF', 'AUF', |
|
2898
|
|
|
'AULT$', 'O', 'U', |
|
2899
|
|
|
'AUR(BCDFGKLMNQSTVWZ)-', 'AUA', 'AUA', |
|
2900
|
|
|
'AUR$', 'AUA', 'AUA', |
|
2901
|
|
|
'AUSSE$', 'OS', 'UZ', |
|
2902
|
|
|
'AUS(ST)-^', 'AUS', 'AUS', |
|
2903
|
|
|
'AUS^^', 'AUS', 'AUS', |
|
2904
|
|
|
'AUTOFAHR----', 'AUTO ', 'AUTU ', |
|
2905
|
|
|
'AUTO^^', 'AUTO', 'AUTU', |
|
2906
|
|
|
'AUX(IY)-', 'AUX', 'AUX', |
|
2907
|
|
|
'AUX', 'O', 'U', |
|
2908
|
|
|
'AU', 'AU', 'AU', |
|
2909
|
|
|
'AVER--<', 'AW', None, |
|
2910
|
|
|
'AVIER$', 'AWIE', 'AFIE', |
|
2911
|
|
|
'AV(EÈÉÊI)-^', 'AW', None, |
|
2912
|
|
|
'AV(AOU)-', 'AW', None, |
|
2913
|
|
|
'AYRE$', 'EIRE', 'EIRE', |
|
2914
|
|
|
'AYRE(NS)-$', 'EIRE', 'EIRE', |
|
2915
|
|
|
'AYRE(AIOUY)--', 'EIR', 'EIR', |
|
2916
|
|
|
'AYR(AÄIOÖUÜY)-', 'EIR', 'EIR', |
|
2917
|
|
|
'AYR<', 'EIA', 'EIA', |
|
2918
|
|
|
'AYER--<', 'EI', 'EI', |
|
2919
|
|
|
'AY(AÄEIOÖUÜY)--', 'A', 'A', |
|
2920
|
|
|
'AË', 'E', 'E', |
|
2921
|
|
|
'A(IJY)<', 'EI', 'EI', |
|
2922
|
|
|
'BABY^$', 'BEBI', 'BEBI', |
|
2923
|
|
|
'BAB(IY)^', 'BEBI', 'BEBI', |
|
2924
|
|
|
'BEAU^$', 'BO', None, |
|
2925
|
|
|
'BEA(BCMNRU)-^', 'BEA', 'BEA', |
|
2926
|
|
|
'BEAT(AEIMORU)-^', 'BEAT', 'BEAT', |
|
2927
|
|
|
'BEE$', 'BI', 'BI', |
|
2928
|
|
|
'BEIGE^$', 'BESH', 'BEZ', |
|
2929
|
|
|
'BENOIT--', 'BENO', 'BENU', |
|
2930
|
|
|
'BER(DT)-', 'BER', None, |
|
2931
|
|
|
'BERN(DT)-', 'BERN', None, |
|
2932
|
|
|
'BE(LMNRST)-^', 'BE', 'BE', |
|
2933
|
|
|
'BETTE$', 'BET', 'BET', |
|
2934
|
|
|
'BEVOR^$', 'BEFOR', None, |
|
2935
|
|
|
'BIC$', 'BIZ', 'BIZ', |
|
2936
|
|
|
'BOWL(EI)-', 'BOL', 'BUL', |
|
2937
|
|
|
'BP(AÄEÈÉÊIÌÍÎOÖRUÜY)-', 'B', 'B', |
|
2938
|
|
|
'BRINGEND-----^', 'BRI', 'BRI', |
|
2939
|
|
|
'BRINGEND-----', ' BRI', ' BRI', |
|
2940
|
|
|
'BROW(NS)-', 'BRAU', 'BRAU', |
|
2941
|
|
|
'BUDGET7', 'BÜGE', 'BIKE', |
|
2942
|
|
|
'BUFFET7', 'BÜFE', 'BIFE', |
|
2943
|
|
|
'BYLLE$', 'BILE', 'BILE', |
|
2944
|
|
|
'BYLL$', 'BIL', 'BIL', |
|
2945
|
|
|
'BYPA--^', 'BEI', 'BEI', |
|
2946
|
|
|
'BYTE<', 'BEIT', 'BEIT', |
|
2947
|
|
|
'BY9^', 'BÜ', None, |
|
2948
|
|
|
'B(SßZ)$', 'BS', None, |
|
2949
|
|
|
'CACH(EI)-^', 'KESH', 'KEZ', |
|
2950
|
|
|
'CAE--', 'Z', 'Z', |
|
2951
|
|
|
'CA(IY)$', 'ZEI', 'ZEI', |
|
2952
|
|
|
'CE(EIJUY)--', 'Z', 'Z', |
|
2953
|
|
|
'CENT<', 'ZENT', 'ZENT', |
|
2954
|
|
|
'CERST(EI)----^', 'KE', 'KE', |
|
2955
|
|
|
'CER$', 'ZA', 'ZA', |
|
2956
|
|
|
'CE3', 'ZE', 'ZE', |
|
2957
|
|
|
'CH\'S$', 'X', 'X', |
|
2958
|
|
|
'CH´S$', 'X', 'X', |
|
2959
|
|
|
'CHAO(ST)-', 'KAO', 'KAU', |
|
2960
|
|
|
'CHAMPIO-^', 'SHEMPI', 'ZENBI', |
|
2961
|
|
|
'CHAR(AI)-^', 'KAR', 'KAR', |
|
2962
|
|
|
'CHAU(CDFSVWXZ)-', 'SHO', 'ZU', |
|
2963
|
|
|
'CHÄ(CF)-', 'SHE', 'ZE', |
|
2964
|
|
|
'CHE(CF)-', 'SHE', 'ZE', |
|
2965
|
|
|
'CHEM-^', 'KE', 'KE', # or: 'CHE', 'KE' |
|
2966
|
|
|
'CHEQUE<', 'SHEK', 'ZEK', |
|
2967
|
|
|
'CHI(CFGPVW)-', 'SHI', 'ZI', |
|
2968
|
|
|
'CH(AEUY)-<^', 'SH', 'Z', |
|
2969
|
|
|
'CHK-', '', '', |
|
2970
|
|
|
'CHO(CKPS)-^', 'SHO', 'ZU', |
|
2971
|
|
|
'CHRIS-', 'KRI', None, |
|
2972
|
|
|
'CHRO-', 'KR', None, |
|
2973
|
|
|
'CH(LOR)-<^', 'K', 'K', |
|
2974
|
|
|
'CHST-', 'X', 'X', |
|
2975
|
|
|
'CH(SßXZ)3', 'X', 'X', |
|
2976
|
|
|
'CHTNI-3', 'CHN', 'KN', |
|
2977
|
|
|
'CH^', 'K', 'K', # or: 'CH', 'K' |
|
2978
|
|
|
'CH', 'CH', 'K', |
|
2979
|
|
|
'CIC$', 'ZIZ', 'ZIZ', |
|
2980
|
|
|
'CIENCEFICT----', 'EIENS ', 'EIENZ ', |
|
2981
|
|
|
'CIENCE$', 'EIENS', 'EIENZ', |
|
2982
|
|
|
'CIER$', 'ZIE', 'ZIE', |
|
2983
|
|
|
'CYB-^', 'ZEI', 'ZEI', |
|
2984
|
|
|
'CY9^', 'ZÜ', 'ZI', |
|
2985
|
|
|
'C(IJY)-<3', 'Z', 'Z', |
|
2986
|
|
|
'CLOWN-', 'KLAU', 'KLAU', |
|
2987
|
|
|
'CCH', 'Z', 'Z', |
|
2988
|
|
|
'CCE-', 'X', 'X', |
|
2989
|
|
|
'C(CK)-', '', '', |
|
2990
|
|
|
'CLAUDET---', 'KLO', 'KLU', |
|
2991
|
|
|
'CLAUDINE^$', 'KLODIN', 'KLUTIN', |
|
2992
|
|
|
'COACH', 'KOSH', 'KUZ', |
|
2993
|
|
|
'COLE$', 'KOL', 'KUL', |
|
2994
|
|
|
'COUCH', 'KAUSH', 'KAUZ', |
|
2995
|
|
|
'COW', 'KAU', 'KAU', |
|
2996
|
|
|
'CQUES$', 'K', 'K', |
|
2997
|
|
|
'CQUE', 'K', 'K', |
|
2998
|
|
|
'CRASH--9', 'KRE', 'KRE', |
|
2999
|
|
|
'CREAT-^', 'KREA', 'KREA', |
|
3000
|
|
|
'CST', 'XT', 'XT', |
|
3001
|
|
|
'CS<^', 'Z', 'Z', |
|
3002
|
|
|
'C(SßX)', 'X', 'X', |
|
3003
|
|
|
'CT\'S$', 'X', 'X', |
|
3004
|
|
|
'CT(SßXZ)', 'X', 'X', |
|
3005
|
|
|
'CZ<', 'Z', 'Z', |
|
3006
|
|
|
'C(ÈÉÊÌÍÎÝ)3', 'Z', 'Z', |
|
3007
|
|
|
'C.^', 'C.', 'C.', |
|
3008
|
|
|
'CÄ-', 'Z', 'Z', |
|
3009
|
|
|
'CÜ$', 'ZÜ', 'ZI', |
|
3010
|
|
|
'C\'S$', 'X', 'X', |
|
3011
|
|
|
'C<', 'K', 'K', |
|
3012
|
|
|
'DAHER^$', 'DAHER', None, |
|
3013
|
|
|
'DARAUFFOLGE-----', 'DARAUF ', 'TARAUF ', |
|
3014
|
|
|
'DAVO(NR)-^$', 'DAFO', 'TAFU', |
|
3015
|
|
|
'DD(SZ)--<', '', '', |
|
3016
|
|
|
'DD9', 'D', None, |
|
3017
|
|
|
'DEPOT7', 'DEPO', 'TEBU', |
|
3018
|
|
|
'DESIGN', 'DISEIN', 'TIZEIN', |
|
3019
|
|
|
'DE(LMNRST)-3^', 'DE', 'TE', |
|
3020
|
|
|
'DETTE$', 'DET', 'TET', |
|
3021
|
|
|
'DH$', 'T', None, |
|
3022
|
|
|
'DIC$', 'DIZ', 'TIZ', |
|
3023
|
|
|
'DIDR-^', 'DIT', None, |
|
3024
|
|
|
'DIEDR-^', 'DIT', None, |
|
3025
|
|
|
'DJ(AEIOU)-^', 'I', 'I', |
|
3026
|
|
|
'DMITR-^', 'DIMIT', 'TINIT', |
|
3027
|
|
|
'DRY9^', 'DRÜ', None, |
|
3028
|
|
|
'DT-', '', '', |
|
3029
|
|
|
'DUIS-^', 'DÜ', 'TI', |
|
3030
|
|
|
'DURCH^^', 'DURCH', 'TURK', |
|
3031
|
|
|
'DVA$', 'TWA', None, |
|
3032
|
|
|
'DY9^', 'DÜ', None, |
|
3033
|
|
|
'DYS$', 'DIS', None, |
|
3034
|
|
|
'DS(CH)--<', 'T', 'T', |
|
3035
|
|
|
'DST', 'ZT', 'ZT', |
|
3036
|
|
|
'DZS(CH)--', 'T', 'T', |
|
3037
|
|
|
'D(SßZ)', 'Z', 'Z', |
|
3038
|
|
|
'D(AÄEIOÖRUÜY)-', 'D', None, |
|
3039
|
|
|
'D(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'D', None, |
|
3040
|
|
|
'D\'H^', 'D', 'T', |
|
3041
|
|
|
'D´H^', 'D', 'T', |
|
3042
|
|
|
'D`H^', 'D', 'T', |
|
3043
|
|
|
'D\'S3$', 'Z', 'Z', |
|
3044
|
|
|
'D´S3$', 'Z', 'Z', |
|
3045
|
|
|
'D^', 'D', None, |
|
3046
|
|
|
'D', 'T', 'T', |
|
3047
|
|
|
'EAULT$', 'O', 'U', |
|
3048
|
|
|
'EAUX$', 'O', 'U', |
|
3049
|
|
|
'EAU', 'O', 'U', |
|
3050
|
|
|
'EAV', 'IW', 'IF', |
|
3051
|
|
|
'EAS3$', 'EAS', None, |
|
3052
|
|
|
'EA(AÄEIOÖÜY)-3', 'EA', 'EA', |
|
3053
|
|
|
'EA3$', 'EA', 'EA', |
|
3054
|
|
|
'EA3', 'I', 'I', |
|
3055
|
|
|
'EBENSO^$', 'EBNSO', 'EBNZU', |
|
3056
|
|
|
'EBENSO^^', 'EBNSO ', 'EBNZU ', |
|
3057
|
|
|
'EBEN^^', 'EBN', 'EBN', |
|
3058
|
|
|
'EE9', 'E', 'E', |
|
3059
|
|
|
'EGL-1', 'EK', None, |
|
3060
|
|
|
'EHE(IUY)--1', 'EH', None, |
|
3061
|
|
|
'EHUNG---1', 'E', None, |
|
3062
|
|
|
'EH(AÄIOÖUÜY)-1', 'EH', None, |
|
3063
|
|
|
'EIEI--', '', '', |
|
3064
|
|
|
'EIERE^$', 'EIERE', None, |
|
3065
|
|
|
'EIERE$', 'EIERE', None, |
|
3066
|
|
|
'EIERE(NS)-$', 'EIERE', None, |
|
3067
|
|
|
'EIERE(AIOUY)--', 'EIER', None, |
|
3068
|
|
|
'EIER(AÄIOÖUÜY)-', 'EIER', None, |
|
3069
|
|
|
'EIER<', 'EIA', None, |
|
3070
|
|
|
'EIGL-1', 'EIK', None, |
|
3071
|
|
|
'EIGH$', 'EI', 'EI', |
|
3072
|
|
|
'EIH--', 'E', 'E', |
|
3073
|
|
|
'EILLE$', 'EI', 'EI', |
|
3074
|
|
|
'EIR(BCDFGKLMNQSTVWZ)-', 'EIA', 'EIA', |
|
3075
|
|
|
'EIR$', 'EIA', 'EIA', |
|
3076
|
|
|
'EITRAUBEN------', 'EIT ', 'EIT ', |
|
3077
|
|
|
'EI', 'EI', 'EI', |
|
3078
|
|
|
'EJ$', 'EI', 'EI', |
|
3079
|
|
|
'ELIZ^', 'ELIS', None, |
|
3080
|
|
|
'ELZ^', 'ELS', None, |
|
3081
|
|
|
'EL-^', 'E', 'E', |
|
3082
|
|
|
'ELANG----1', 'E', 'E', |
|
3083
|
|
|
'EL(DKL)--1', 'E', 'E', |
|
3084
|
|
|
'EL(MNT)--1$', 'E', 'E', |
|
3085
|
|
|
'ELYNE$', 'ELINE', 'ELINE', |
|
3086
|
|
|
'ELYN$', 'ELIN', 'ELIN', |
|
3087
|
|
|
'EL(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'EL', 'EL', |
|
3088
|
|
|
'EL-1', 'L', 'L', |
|
3089
|
|
|
'EM-^', None, 'E', |
|
3090
|
|
|
'EM(DFKMPQT)--1', None, 'E', |
|
3091
|
|
|
'EM(AÄEÈÉÊIÌÍÎOÖUÜY)--1', None, 'E', |
|
3092
|
|
|
'EM-1', None, 'N', |
|
3093
|
|
|
'ENGAG-^', 'ANGA', 'ANKA', |
|
3094
|
|
|
'EN-^', 'E', 'E', |
|
3095
|
|
|
'ENTUEL', 'ENTUEL', None, |
|
3096
|
|
|
'EN(CDGKQSTZ)--1', 'E', 'E', |
|
3097
|
|
|
'EN(AÄEÈÉÊIÌÍÎNOÖUÜY)-1', 'EN', 'EN', |
|
3098
|
|
|
'EN-1', '', '', |
|
3099
|
|
|
'ERH(AÄEIOÖUÜ)-^', 'ERH', 'ER', |
|
3100
|
|
|
'ER-^', 'E', 'E', |
|
3101
|
|
|
'ERREGEND-----', ' ER', ' ER', |
|
3102
|
|
|
'ERT1$', 'AT', None, |
|
3103
|
|
|
'ER(DGLKMNRQTZß)-1', 'ER', None, |
|
3104
|
|
|
'ER(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'ER', 'A', |
|
3105
|
|
|
'ER1$', 'A', 'A', |
|
3106
|
|
|
'ER<1', 'A', 'A', |
|
3107
|
|
|
'ETAT7', 'ETA', 'ETA', |
|
3108
|
|
|
'ETI(AÄOÖÜU)-', 'EZI', 'EZI', |
|
3109
|
|
|
'EUERE$', 'EUERE', None, |
|
3110
|
|
|
'EUERE(NS)-$', 'EUERE', None, |
|
3111
|
|
|
'EUERE(AIOUY)--', 'EUER', None, |
|
3112
|
|
|
'EUER(AÄIOÖUÜY)-', 'EUER', None, |
|
3113
|
|
|
'EUER<', 'EUA', None, |
|
3114
|
|
|
'EUEU--', '', '', |
|
3115
|
|
|
'EUILLE$', 'Ö', 'Ö', |
|
3116
|
|
|
'EUR$', 'ÖR', 'ÖR', |
|
3117
|
|
|
'EUX', 'Ö', 'Ö', |
|
3118
|
|
|
'EUSZ$', 'EUS', None, |
|
3119
|
|
|
'EUTZ$', 'EUS', None, |
|
3120
|
|
|
'EUYS$', 'EUS', 'EUZ', |
|
3121
|
|
|
'EUZ$', 'EUS', None, |
|
3122
|
|
|
'EU', 'EU', 'EU', |
|
3123
|
|
|
'EVER--<1', 'EW', None, |
|
3124
|
|
|
'EV(ÄOÖUÜ)-1', 'EW', None, |
|
3125
|
|
|
'EYER<', 'EIA', 'EIA', |
|
3126
|
|
|
'EY<', 'EI', 'EI', |
|
3127
|
|
|
'FACETTE', 'FASET', 'FAZET', |
|
3128
|
|
|
'FANS--^$', 'FE', 'FE', |
|
3129
|
|
|
'FAN-^$', 'FE', 'FE', |
|
3130
|
|
|
'FAULT-', 'FOL', 'FUL', |
|
3131
|
|
|
'FEE(DL)-', 'FI', 'FI', |
|
3132
|
|
|
'FEHLER', 'FELA', 'FELA', |
|
3133
|
|
|
'FE(LMNRST)-3^', 'FE', 'FE', |
|
3134
|
|
|
'FOERDERN---^', 'FÖRD', 'FÖRT', |
|
3135
|
|
|
'FOERDERN---', ' FÖRD', ' FÖRT', |
|
3136
|
|
|
'FOND7', 'FON', 'FUN', |
|
3137
|
|
|
'FRAIN$', 'FRA', 'FRA', |
|
3138
|
|
|
'FRISEU(RS)-', 'FRISÖ', 'FRIZÖ', |
|
3139
|
|
|
'FY9^', 'FÜ', None, |
|
3140
|
|
|
'FÖRDERN---^', 'FÖRD', 'FÖRT', |
|
3141
|
|
|
'FÖRDERN---', ' FÖRD', ' FÖRT', |
|
3142
|
|
|
'GAGS^$', 'GEX', 'KEX', |
|
3143
|
|
|
'GAG^$', 'GEK', 'KEK', |
|
3144
|
|
|
'GD', 'KT', 'KT', |
|
3145
|
|
|
'GEGEN^^', 'GEGN', 'KEKN', |
|
3146
|
|
|
'GEGENGEKOM-----', 'GEGN ', 'KEKN ', |
|
3147
|
|
|
'GEGENGESET-----', 'GEGN ', 'KEKN ', |
|
3148
|
|
|
'GEGENKOMME-----', 'GEGN ', 'KEKN ', |
|
3149
|
|
|
'GEGENZUKOM---', 'GEGN ZU ', 'KEKN ZU ', |
|
3150
|
|
|
'GENDETWAS-----$', 'GENT ', 'KENT ', |
|
3151
|
|
|
'GENRE', 'IORE', 'IURE', |
|
3152
|
|
|
'GE(LMNRST)-3^', 'GE', 'KE', |
|
3153
|
|
|
'GER(DKT)-', 'GER', None, |
|
3154
|
|
|
'GETTE$', 'GET', 'KET', |
|
3155
|
|
|
'GGF.', 'GF.', None, |
|
3156
|
|
|
'GG-', '', '', |
|
3157
|
|
|
'GH', 'G', None, |
|
3158
|
|
|
'GI(AOU)-^', 'I', 'I', |
|
3159
|
|
|
'GION-3', 'KIO', 'KIU', |
|
3160
|
|
|
'G(CK)-', '', '', |
|
3161
|
|
|
'GJ(AEIOU)-^', 'I', 'I', |
|
3162
|
|
|
'GMBH^$', 'GMBH', 'GMBH', |
|
3163
|
|
|
'GNAC$', 'NIAK', 'NIAK', |
|
3164
|
|
|
'GNON$', 'NION', 'NIUN', |
|
3165
|
|
|
'GN$', 'N', 'N', |
|
3166
|
|
|
'GONCAL-^', 'GONZA', 'KUNZA', |
|
3167
|
|
|
'GRY9^', 'GRÜ', None, |
|
3168
|
|
|
'G(SßXZ)-<', 'K', 'K', |
|
3169
|
|
|
'GUCK-', 'KU', 'KU', |
|
3170
|
|
|
'GUISEP-^', 'IUSE', 'IUZE', |
|
3171
|
|
|
'GUI-^', 'G', 'K', |
|
3172
|
|
|
'GUTAUSSEH------^', 'GUT ', 'KUT ', |
|
3173
|
|
|
'GUTGEHEND------^', 'GUT ', 'KUT ', |
|
3174
|
|
|
'GY9^', 'GÜ', None, |
|
3175
|
|
|
'G(AÄEILOÖRUÜY)-', 'G', None, |
|
3176
|
|
|
'G(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'G', None, |
|
3177
|
|
|
'G\'S$', 'X', 'X', |
|
3178
|
|
|
'G´S$', 'X', 'X', |
|
3179
|
|
|
'G^', 'G', None, |
|
3180
|
|
|
'G', 'K', 'K', |
|
3181
|
|
|
'HA(HIUY)--1', 'H', None, |
|
3182
|
|
|
'HANDVOL---^', 'HANT ', 'ANT ', |
|
3183
|
|
|
'HANNOVE-^', 'HANOF', None, |
|
3184
|
|
|
'HAVEN7$', 'HAFN', None, |
|
3185
|
|
|
'HEAD-', 'HE', 'E', |
|
3186
|
|
|
'HELIEGEN------', 'E ', 'E ', |
|
3187
|
|
|
'HESTEHEN------', 'E ', 'E ', |
|
3188
|
|
|
'HE(LMNRST)-3^', 'HE', 'E', |
|
3189
|
|
|
'HE(LMN)-1', 'E', 'E', |
|
3190
|
|
|
'HEUR1$', 'ÖR', 'ÖR', |
|
3191
|
|
|
'HE(HIUY)--1', 'H', None, |
|
3192
|
|
|
'HIH(AÄEIOÖUÜY)-1', 'IH', None, |
|
3193
|
|
|
'HLH(AÄEIOÖUÜY)-1', 'LH', None, |
|
3194
|
|
|
'HMH(AÄEIOÖUÜY)-1', 'MH', None, |
|
3195
|
|
|
'HNH(AÄEIOÖUÜY)-1', 'NH', None, |
|
3196
|
|
|
'HOBBY9^', 'HOBI', None, |
|
3197
|
|
|
'HOCHBEGAB-----^', 'HOCH ', 'UK ', |
|
3198
|
|
|
'HOCHTALEN-----^', 'HOCH ', 'UK ', |
|
3199
|
|
|
'HOCHZUFRI-----^', 'HOCH ', 'UK ', |
|
3200
|
|
|
'HO(HIY)--1', 'H', None, |
|
3201
|
|
|
'HRH(AÄEIOÖUÜY)-1', 'RH', None, |
|
3202
|
|
|
'HUH(AÄEIOÖUÜY)-1', 'UH', None, |
|
3203
|
|
|
'HUIS^^', 'HÜS', 'IZ', |
|
3204
|
|
|
'HUIS$', 'ÜS', 'IZ', |
|
3205
|
|
|
'HUI--1', 'H', None, |
|
3206
|
|
|
'HYGIEN^', 'HÜKIEN', None, |
|
3207
|
|
|
'HY9^', 'HÜ', None, |
|
3208
|
|
|
'HY(BDGMNPST)-', 'Ü', None, |
|
3209
|
|
|
'H.^', None, 'H.', |
|
3210
|
|
|
'HÄU--1', 'H', None, |
|
3211
|
|
|
'H^', 'H', '', |
|
3212
|
|
|
'H', '', '', |
|
3213
|
|
|
'ICHELL---', 'ISH', 'IZ', |
|
3214
|
|
|
'ICHI$', 'ISHI', 'IZI', |
|
3215
|
|
|
'IEC$', 'IZ', 'IZ', |
|
3216
|
|
|
'IEDENSTELLE------', 'IDN ', 'ITN ', |
|
3217
|
|
|
'IEI-3', '', '', |
|
3218
|
|
|
'IELL3', 'IEL', 'IEL', |
|
3219
|
|
|
'IENNE$', 'IN', 'IN', |
|
3220
|
|
|
'IERRE$', 'IER', 'IER', |
|
3221
|
|
|
'IERZULAN---', 'IR ZU ', 'IR ZU ', |
|
3222
|
|
|
'IETTE$', 'IT', 'IT', |
|
3223
|
|
|
'IEU', 'IÖ', 'IÖ', |
|
3224
|
|
|
'IE<4', 'I', 'I', |
|
3225
|
|
|
'IGL-1', 'IK', None, |
|
3226
|
|
|
'IGHT3$', 'EIT', 'EIT', |
|
3227
|
|
|
'IGNI(EO)-', 'INI', 'INI', |
|
3228
|
|
|
'IGN(AEOU)-$', 'INI', 'INI', |
|
3229
|
|
|
'IHER(DGLKRT)--1', 'IHE', None, |
|
3230
|
|
|
'IHE(IUY)--', 'IH', None, |
|
3231
|
|
|
'IH(AIOÖUÜY)-', 'IH', None, |
|
3232
|
|
|
'IJ(AOU)-', 'I', 'I', |
|
3233
|
|
|
'IJ$', 'I', 'I', |
|
3234
|
|
|
'IJ<', 'EI', 'EI', |
|
3235
|
|
|
'IKOLE$', 'IKOL', 'IKUL', |
|
3236
|
|
|
'ILLAN(STZ)--4', 'ILIA', 'ILIA', |
|
3237
|
|
|
'ILLAR(DT)--4', 'ILIA', 'ILIA', |
|
3238
|
|
|
'IMSTAN----^', 'IM ', 'IN ', |
|
3239
|
|
|
'INDELERREGE------', 'INDL ', 'INTL ', |
|
3240
|
|
|
'INFRAGE-----^$', 'IN ', 'IN ', |
|
3241
|
|
|
'INTERN(AOU)-^', 'INTAN', 'INTAN', |
|
3242
|
|
|
'INVER-', 'INWE', 'INFE', |
|
3243
|
|
|
'ITI(AÄIOÖUÜ)-', 'IZI', 'IZI', |
|
3244
|
|
|
'IUSZ$', 'IUS', None, |
|
3245
|
|
|
'IUTZ$', 'IUS', None, |
|
3246
|
|
|
'IUZ$', 'IUS', None, |
|
3247
|
|
|
'IVER--<', 'IW', None, |
|
3248
|
|
|
'IVIER$', 'IWIE', 'IFIE', |
|
3249
|
|
|
'IV(ÄOÖUÜ)-', 'IW', None, |
|
3250
|
|
|
'IV<3', 'IW', None, |
|
3251
|
|
|
'IY2', 'I', None, |
|
3252
|
|
|
'I(ÈÉÊ)<4', 'I', 'I', |
|
3253
|
|
|
'JAVIE---<^', 'ZA', 'ZA', |
|
3254
|
|
|
'JEANS^$', 'JINS', 'INZ', |
|
3255
|
|
|
'JEANNE^$', 'IAN', 'IAN', |
|
3256
|
|
|
'JEAN-^', 'IA', 'IA', |
|
3257
|
|
|
'JER-^', 'IE', 'IE', |
|
3258
|
|
|
'JE(LMNST)-', 'IE', 'IE', |
|
3259
|
|
|
'JI^', 'JI', None, |
|
3260
|
|
|
'JOR(GK)^$', 'IÖRK', 'IÖRK', |
|
3261
|
|
|
'J', 'I', 'I', |
|
3262
|
|
|
'KC(ÄEIJ)-', 'X', 'X', |
|
3263
|
|
|
'KD', 'KT', None, |
|
3264
|
|
|
'KE(LMNRST)-3^', 'KE', 'KE', |
|
3265
|
|
|
'KG(AÄEILOÖRUÜY)-', 'K', None, |
|
3266
|
|
|
'KH<^', 'K', 'K', |
|
3267
|
|
|
'KIC$', 'KIZ', 'KIZ', |
|
3268
|
|
|
'KLE(LMNRST)-3^', 'KLE', 'KLE', |
|
3269
|
|
|
'KOTELE-^', 'KOTL', 'KUTL', |
|
3270
|
|
|
'KREAT-^', 'KREA', 'KREA', |
|
3271
|
|
|
'KRÜS(TZ)--^', 'KRI', None, |
|
3272
|
|
|
'KRYS(TZ)--^', 'KRI', None, |
|
3273
|
|
|
'KRY9^', 'KRÜ', None, |
|
3274
|
|
|
'KSCH---', 'K', 'K', |
|
3275
|
|
|
'KSH--', 'K', 'K', |
|
3276
|
|
|
'K(SßXZ)7', 'X', 'X', # implies 'KST' -> 'XT' |
|
3277
|
|
|
'KT\'S$', 'X', 'X', |
|
3278
|
|
|
'KTI(AIOU)-3', 'XI', 'XI', |
|
3279
|
|
|
'KT(SßXZ)', 'X', 'X', |
|
3280
|
|
|
'KY9^', 'KÜ', None, |
|
3281
|
|
|
'K\'S$', 'X', 'X', |
|
3282
|
|
|
'K´S$', 'X', 'X', |
|
3283
|
|
|
'LANGES$', ' LANGES', ' LANKEZ', |
|
3284
|
|
|
'LANGE$', ' LANGE', ' LANKE', |
|
3285
|
|
|
'LANG$', ' LANK', ' LANK', |
|
3286
|
|
|
'LARVE-', 'LARF', 'LARF', |
|
3287
|
|
|
'LD(SßZ)$', 'LS', 'LZ', |
|
3288
|
|
|
'LD\'S$', 'LS', 'LZ', |
|
3289
|
|
|
'LD´S$', 'LS', 'LZ', |
|
3290
|
|
|
'LEAND-^', 'LEAN', 'LEAN', |
|
3291
|
|
|
'LEERSTEHE-----^', 'LER ', 'LER ', |
|
3292
|
|
|
'LEICHBLEIB-----', 'LEICH ', 'LEIK ', |
|
3293
|
|
|
'LEICHLAUTE-----', 'LEICH ', 'LEIK ', |
|
3294
|
|
|
'LEIDERREGE------', 'LEIT ', 'LEIT ', |
|
3295
|
|
|
'LEIDGEPR----^', 'LEIT ', 'LEIT ', |
|
3296
|
|
|
'LEINSTEHE-----', 'LEIN ', 'LEIN ', |
|
3297
|
|
|
'LEL-', 'LE', 'LE', |
|
3298
|
|
|
'LE(MNRST)-3^', 'LE', 'LE', |
|
3299
|
|
|
'LETTE$', 'LET', 'LET', |
|
3300
|
|
|
'LFGNAG-', 'LFGAN', 'LFKAN', |
|
3301
|
|
|
'LICHERWEIS----', 'LICHA ', 'LIKA ', |
|
3302
|
|
|
'LIC$', 'LIZ', 'LIZ', |
|
3303
|
|
|
'LIVE^$', 'LEIF', 'LEIF', |
|
3304
|
|
|
'LT(SßZ)$', 'LS', 'LZ', |
|
3305
|
|
|
'LT\'S$', 'LS', 'LZ', |
|
3306
|
|
|
'LT´S$', 'LS', 'LZ', |
|
3307
|
|
|
'LUI(GS)--', 'LU', 'LU', |
|
3308
|
|
|
'LV(AIO)-', 'LW', None, |
|
3309
|
|
|
'LY9^', 'LÜ', None, |
|
3310
|
|
|
'LSTS$', 'LS', 'LZ', |
|
3311
|
|
|
'LZ(BDFGKLMNPQRSTVWX)-', 'LS', None, |
|
3312
|
|
|
'L(SßZ)$', 'LS', None, |
|
3313
|
|
|
'MAIR-<', 'MEI', 'NEI', |
|
3314
|
|
|
'MANAG-', 'MENE', 'NENE', |
|
3315
|
|
|
'MANUEL', 'MANUEL', None, |
|
3316
|
|
|
'MASSEU(RS)-', 'MASÖ', 'NAZÖ', |
|
3317
|
|
|
'MATCH', 'MESH', 'NEZ', |
|
3318
|
|
|
'MAURICE', 'MORIS', 'NURIZ', |
|
3319
|
|
|
'MBH^$', 'MBH', 'MBH', |
|
3320
|
|
|
'MB(ßZ)$', 'MS', None, |
|
3321
|
|
|
'MB(SßTZ)-', 'M', 'N', |
|
3322
|
|
|
'MCG9^', 'MAK', 'NAK', |
|
3323
|
|
|
'MC9^', 'MAK', 'NAK', |
|
3324
|
|
|
'MEMOIR-^', 'MEMOA', 'NENUA', |
|
3325
|
|
|
'MERHAVEN$', 'MAHAFN', None, |
|
3326
|
|
|
'ME(LMNRST)-3^', 'ME', 'NE', |
|
3327
|
|
|
'MEN(STZ)--3', 'ME', None, |
|
3328
|
|
|
'MEN$', 'MEN', None, |
|
3329
|
|
|
'MIGUEL-', 'MIGE', 'NIKE', |
|
3330
|
|
|
'MIKE^$', 'MEIK', 'NEIK', |
|
3331
|
|
|
'MITHILFE----^$', 'MIT H', 'NIT ', |
|
3332
|
|
|
'MN$', 'M', None, |
|
3333
|
|
|
'MN', 'N', 'N', |
|
3334
|
|
|
'MPJUTE-', 'MPUT', 'NBUT', |
|
3335
|
|
|
'MP(ßZ)$', 'MS', None, |
|
3336
|
|
|
'MP(SßTZ)-', 'M', 'N', |
|
3337
|
|
|
'MP(BDJLMNPQVW)-', 'MB', 'NB', |
|
3338
|
|
|
'MY9^', 'MÜ', None, |
|
3339
|
|
|
'M(ßZ)$', 'MS', None, |
|
3340
|
|
|
'M´G7^', 'MAK', 'NAK', |
|
3341
|
|
|
'M\'G7^', 'MAK', 'NAK', |
|
3342
|
|
|
'M´^', 'MAK', 'NAK', |
|
3343
|
|
|
'M\'^', 'MAK', 'NAK', |
|
3344
|
|
|
'M', None, 'N', |
|
3345
|
|
|
'NACH^^', 'NACH', 'NAK', |
|
3346
|
|
|
'NADINE', 'NADIN', 'NATIN', |
|
3347
|
|
|
'NAIV--', 'NA', 'NA', |
|
3348
|
|
|
'NAISE$', 'NESE', 'NEZE', |
|
3349
|
|
|
'NAUGENOMM------', 'NAU ', 'NAU ', |
|
3350
|
|
|
'NAUSOGUT$', 'NAUSO GUT', 'NAUZU KUT', |
|
3351
|
|
|
'NCH$', 'NSH', 'NZ', |
|
3352
|
|
|
'NCOISE$', 'SOA', 'ZUA', |
|
3353
|
|
|
'NCOIS$', 'SOA', 'ZUA', |
|
3354
|
|
|
'NDAR$', 'NDA', 'NTA', |
|
3355
|
|
|
'NDERINGEN------', 'NDE ', 'NTE ', |
|
3356
|
|
|
'NDRO(CDKTZ)-', 'NTRO', None, |
|
3357
|
|
|
'ND(BFGJLMNPQVW)-', 'NT', None, |
|
3358
|
|
|
'ND(SßZ)$', 'NS', 'NZ', |
|
3359
|
|
|
'ND\'S$', 'NS', 'NZ', |
|
3360
|
|
|
'ND´S$', 'NS', 'NZ', |
|
3361
|
|
|
'NEBEN^^', 'NEBN', 'NEBN', |
|
3362
|
|
|
'NENGELERN------', 'NEN ', 'NEN ', |
|
3363
|
|
|
'NENLERN(ET)---', 'NEN LE', 'NEN LE', |
|
3364
|
|
|
'NENZULERNE---', 'NEN ZU LE', 'NEN ZU LE', |
|
3365
|
|
|
'NE(LMNRST)-3^', 'NE', 'NE', |
|
3366
|
|
|
'NEN-3', 'NE', 'NE', |
|
3367
|
|
|
'NETTE$', 'NET', 'NET', |
|
3368
|
|
|
'NGU^^', 'NU', 'NU', |
|
3369
|
|
|
'NG(BDFJLMNPQRTVW)-', 'NK', 'NK', |
|
3370
|
|
|
'NH(AUO)-$', 'NI', 'NI', |
|
3371
|
|
|
'NICHTSAHNEN-----', 'NIX ', 'NIX ', |
|
3372
|
|
|
'NICHTSSAGE----', 'NIX ', 'NIX ', |
|
3373
|
|
|
'NICHTS^^', 'NIX', 'NIX', |
|
3374
|
|
|
'NICHT^^', 'NICHT', 'NIKT', |
|
3375
|
|
|
'NINE$', 'NIN', 'NIN', |
|
3376
|
|
|
'NON^^', 'NON', 'NUN', |
|
3377
|
|
|
'NOTLEIDE-----^', 'NOT ', 'NUT ', |
|
3378
|
|
|
'NOT^^', 'NOT', 'NUT', |
|
3379
|
|
|
'NTI(AIOU)-3', 'NZI', 'NZI', |
|
3380
|
|
|
'NTIEL--3', 'NZI', 'NZI', |
|
3381
|
|
|
'NT(SßZ)$', 'NS', 'NZ', |
|
3382
|
|
|
'NT\'S$', 'NS', 'NZ', |
|
3383
|
|
|
'NT´S$', 'NS', 'NZ', |
|
3384
|
|
|
'NYLON', 'NEILON', 'NEILUN', |
|
3385
|
|
|
'NY9^', 'NÜ', None, |
|
3386
|
|
|
'NSTZUNEH---', 'NST ZU ', 'NZT ZU ', |
|
3387
|
|
|
'NSZ-', 'NS', None, |
|
3388
|
|
|
'NSTS$', 'NS', 'NZ', |
|
3389
|
|
|
'NZ(BDFGKLMNPQRSTVWX)-', 'NS', None, |
|
3390
|
|
|
'N(SßZ)$', 'NS', None, |
|
3391
|
|
|
'OBERE-', 'OBER', None, |
|
3392
|
|
|
'OBER^^', 'OBA', 'UBA', |
|
3393
|
|
|
'OEU2', 'Ö', 'Ö', |
|
3394
|
|
|
'OE<2', 'Ö', 'Ö', |
|
3395
|
|
|
'OGL-', 'OK', None, |
|
3396
|
|
|
'OGNIE-', 'ONI', 'UNI', |
|
3397
|
|
|
'OGN(AEOU)-$', 'ONI', 'UNI', |
|
3398
|
|
|
'OH(AIOÖUÜY)-', 'OH', None, |
|
3399
|
|
|
'OIE$', 'Ö', 'Ö', |
|
3400
|
|
|
'OIRE$', 'OA', 'UA', |
|
3401
|
|
|
'OIR$', 'OA', 'UA', |
|
3402
|
|
|
'OIX', 'OA', 'UA', |
|
3403
|
|
|
'OI<3', 'EU', 'EU', |
|
3404
|
|
|
'OKAY^$', 'OKE', 'UKE', |
|
3405
|
|
|
'OLYN$', 'OLIN', 'ULIN', |
|
3406
|
|
|
'OO(DLMZ)-', 'U', None, |
|
3407
|
|
|
'OO$', 'U', None, |
|
3408
|
|
|
'OO-', '', '', |
|
3409
|
|
|
'ORGINAL-----', 'ORI', 'URI', |
|
3410
|
|
|
'OTI(AÄOÖUÜ)-', 'OZI', 'UZI', |
|
3411
|
|
|
'OUI^', 'WI', 'FI', |
|
3412
|
|
|
'OUILLE$', 'ULIE', 'ULIE', |
|
3413
|
|
|
'OU(DT)-^', 'AU', 'AU', |
|
3414
|
|
|
'OUSE$', 'AUS', 'AUZ', |
|
3415
|
|
|
'OUT-', 'AU', 'AU', |
|
3416
|
|
|
'OU', 'U', 'U', |
|
3417
|
|
|
'O(FV)$', 'AU', 'AU', # due to 'OW$' -> 'AU' |
|
3418
|
|
|
'OVER--<', 'OW', None, |
|
3419
|
|
|
'OV(AOU)-', 'OW', None, |
|
3420
|
|
|
'OW$', 'AU', 'AU', |
|
3421
|
|
|
'OWS$', 'OS', 'UZ', |
|
3422
|
|
|
'OJ(AÄEIOÖUÜ)--', 'O', 'U', |
|
3423
|
|
|
'OYER', 'OIA', None, |
|
3424
|
|
|
'OY(AÄEIOÖUÜ)--', 'O', 'U', |
|
3425
|
|
|
'O(JY)<', 'EU', 'EU', |
|
3426
|
|
|
'OZ$', 'OS', None, |
|
3427
|
|
|
'O´^', 'O', 'U', |
|
3428
|
|
|
'O\'^', 'O', 'U', |
|
3429
|
|
|
'O', None, 'U', |
|
3430
|
|
|
'PATIEN--^', 'PAZI', 'PAZI', |
|
3431
|
|
|
'PENSIO-^', 'PANSI', 'PANZI', |
|
3432
|
|
|
'PE(LMNRST)-3^', 'PE', 'PE', |
|
3433
|
|
|
'PFER-^', 'FE', 'FE', |
|
3434
|
|
|
'P(FH)<', 'F', 'F', |
|
3435
|
|
|
'PIC^$', 'PIK', 'PIK', |
|
3436
|
|
|
'PIC$', 'PIZ', 'PIZ', |
|
3437
|
|
|
'PIPELINE', 'PEIBLEIN', 'PEIBLEIN', |
|
3438
|
|
|
'POLYP-', 'POLÜ', None, |
|
3439
|
|
|
'POLY^^', 'POLI', 'PULI', |
|
3440
|
|
|
'PORTRAIT7', 'PORTRE', 'PURTRE', |
|
3441
|
|
|
'POWER7', 'PAUA', 'PAUA', |
|
3442
|
|
|
'PP(FH)--<', 'B', 'B', |
|
3443
|
|
|
'PP-', '', '', |
|
3444
|
|
|
'PRODUZ-^', 'PRODU', 'BRUTU', |
|
3445
|
|
|
'PRODUZI--', ' PRODU', ' BRUTU', |
|
3446
|
|
|
'PRIX^$', 'PRI', 'PRI', |
|
3447
|
|
|
'PS-^^', 'P', None, |
|
3448
|
|
|
'P(SßZ)^', None, 'Z', |
|
3449
|
|
|
'P(SßZ)$', 'BS', None, |
|
3450
|
|
|
'PT-^', '', '', |
|
3451
|
|
|
'PTI(AÄOÖUÜ)-3', 'BZI', 'BZI', |
|
3452
|
|
|
'PY9^', 'PÜ', None, |
|
3453
|
|
|
'P(AÄEIOÖRUÜY)-', 'P', 'P', |
|
3454
|
|
|
'P(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'P', None, |
|
3455
|
|
|
'P.^', None, 'P.', |
|
3456
|
|
|
'P^', 'P', None, |
|
3457
|
|
|
'P', 'B', 'B', |
|
3458
|
|
|
'QI-', 'Z', 'Z', |
|
3459
|
|
|
'QUARANT--', 'KARA', 'KARA', |
|
3460
|
|
|
'QUE(LMNRST)-3', 'KWE', 'KFE', |
|
3461
|
|
|
'QUE$', 'K', 'K', |
|
3462
|
|
|
'QUI(NS)$', 'KI', 'KI', |
|
3463
|
|
|
'QUIZ7', 'KWIS', None, |
|
3464
|
|
|
'Q(UV)7', 'KW', 'KF', |
|
3465
|
|
|
'Q<', 'K', 'K', |
|
3466
|
|
|
'RADFAHR----', 'RAT ', 'RAT ', |
|
3467
|
|
|
'RAEFTEZEHRE-----', 'REFTE ', 'REFTE ', |
|
3468
|
|
|
'RCH', 'RCH', 'RK', |
|
3469
|
|
|
'REA(DU)---3^', 'R', None, |
|
3470
|
|
|
'REBSERZEUG------', 'REBS ', 'REBZ ', |
|
3471
|
|
|
'RECHERCH^', 'RESHASH', 'REZAZ', |
|
3472
|
|
|
'RECYCL--', 'RIZEI', 'RIZEI', |
|
3473
|
|
|
'RE(ALST)-3^', 'RE', None, |
|
3474
|
|
|
'REE$', 'RI', 'RI', |
|
3475
|
|
|
'RER$', 'RA', 'RA', |
|
3476
|
|
|
'RE(MNR)-4', 'RE', 'RE', |
|
3477
|
|
|
'RETTE$', 'RET', 'RET', |
|
3478
|
|
|
'REUZ$', 'REUZ', None, |
|
3479
|
|
|
'REW$', 'RU', 'RU', |
|
3480
|
|
|
'RH<^', 'R', 'R', |
|
3481
|
|
|
'RJA(MN)--', 'RI', 'RI', |
|
3482
|
|
|
'ROWD-^', 'RAU', 'RAU', |
|
3483
|
|
|
'RTEMONNAIE-', 'RTMON', 'RTNUN', |
|
3484
|
|
|
'RTI(AÄOÖUÜ)-3', 'RZI', 'RZI', |
|
3485
|
|
|
'RTIEL--3', 'RZI', 'RZI', |
|
3486
|
|
|
'RV(AEOU)-3', 'RW', None, |
|
3487
|
|
|
'RY(KN)-$', 'RI', 'RI', |
|
3488
|
|
|
'RY9^', 'RÜ', None, |
|
3489
|
|
|
'RÄFTEZEHRE-----', 'REFTE ', 'REFTE ', |
|
3490
|
|
|
'SAISO-^', 'SES', 'ZEZ', |
|
3491
|
|
|
'SAFE^$', 'SEIF', 'ZEIF', |
|
3492
|
|
|
'SAUCE-^', 'SOS', 'ZUZ', |
|
3493
|
|
|
'SCHLAGGEBEN-----<', 'SHLAK ', 'ZLAK ', |
|
3494
|
|
|
'SCHSCH---7', '', '', |
|
3495
|
|
|
'SCHTSCH', 'SH', 'Z', |
|
3496
|
|
|
'SC(HZ)<', 'SH', 'Z', |
|
3497
|
|
|
'SC', 'SK', 'ZK', |
|
3498
|
|
|
'SELBSTST--7^^', 'SELB', 'ZELB', |
|
3499
|
|
|
'SELBST7^^', 'SELBST', 'ZELBZT', |
|
3500
|
|
|
'SERVICE7^', 'SÖRWIS', 'ZÖRFIZ', |
|
3501
|
|
|
'SERVI-^', 'SERW', None, |
|
3502
|
|
|
'SE(LMNRST)-3^', 'SE', 'ZE', |
|
3503
|
|
|
'SETTE$', 'SET', 'ZET', |
|
3504
|
|
|
'SHP-^', 'S', 'Z', |
|
3505
|
|
|
'SHST', 'SHT', 'ZT', |
|
3506
|
|
|
'SHTSH', 'SH', 'Z', |
|
3507
|
|
|
'SHT', 'ST', 'Z', |
|
3508
|
|
|
'SHY9^', 'SHÜ', None, |
|
3509
|
|
|
'SH^^', 'SH', None, |
|
3510
|
|
|
'SH3', 'SH', 'Z', |
|
3511
|
|
|
'SICHERGEGAN-----^', 'SICHA ', 'ZIKA ', |
|
3512
|
|
|
'SICHERGEHE----^', 'SICHA ', 'ZIKA ', |
|
3513
|
|
|
'SICHERGESTEL------^', 'SICHA ', 'ZIKA ', |
|
3514
|
|
|
'SICHERSTELL-----^', 'SICHA ', 'ZIKA ', |
|
3515
|
|
|
'SICHERZU(GS)--^', 'SICHA ZU ', 'ZIKA ZU ', |
|
3516
|
|
|
'SIEGLI-^', 'SIKL', 'ZIKL', |
|
3517
|
|
|
'SIGLI-^', 'SIKL', 'ZIKL', |
|
3518
|
|
|
'SIGHT', 'SEIT', 'ZEIT', |
|
3519
|
|
|
'SIGN', 'SEIN', 'ZEIN', |
|
3520
|
|
|
'SKI(NPZ)-', 'SKI', 'ZKI', |
|
3521
|
|
|
'SKI<^', 'SHI', 'ZI', |
|
3522
|
|
|
'SODASS^$', 'SO DAS', 'ZU TAZ', |
|
3523
|
|
|
'SODAß^$', 'SO DAS', 'ZU TAZ', |
|
3524
|
|
|
'SOGENAN--^', 'SO GEN', 'ZU KEN', |
|
3525
|
|
|
'SOUND-', 'SAUN', 'ZAUN', |
|
3526
|
|
|
'STAATS^^', 'STAZ', 'ZTAZ', |
|
3527
|
|
|
'STADT^^', 'STAT', 'ZTAT', |
|
3528
|
|
|
'STANDE$', ' STANDE', ' ZTANTE', |
|
3529
|
|
|
'START^^', 'START', 'ZTART', |
|
3530
|
|
|
'STAURANT7', 'STORAN', 'ZTURAN', |
|
3531
|
|
|
'STEAK-', 'STE', 'ZTE', |
|
3532
|
|
|
'STEPHEN-^$', 'STEW', None, |
|
3533
|
|
|
'STERN', 'STERN', None, |
|
3534
|
|
|
'STRAF^^', 'STRAF', 'ZTRAF', |
|
3535
|
|
|
'ST\'S$', 'Z', 'Z', |
|
3536
|
|
|
'ST´S$', 'Z', 'Z', |
|
3537
|
|
|
'STST--', '', '', |
|
3538
|
|
|
'STS(ACEÈÉÊHIÌÍÎOUÄÜÖ)--', 'ST', 'ZT', |
|
3539
|
|
|
'ST(SZ)', 'Z', 'Z', |
|
3540
|
|
|
'SPAREN---^', 'SPA', 'ZPA', |
|
3541
|
|
|
'SPAREND----', ' SPA', ' ZPA', |
|
3542
|
|
|
'S(PTW)-^^', 'S', None, |
|
3543
|
|
|
'SP', 'SP', None, |
|
3544
|
|
|
'STYN(AE)-$', 'STIN', 'ZTIN', |
|
3545
|
|
|
'ST', 'ST', 'ZT', |
|
3546
|
|
|
'SUITE<', 'SIUT', 'ZIUT', |
|
3547
|
|
|
'SUKE--$', 'S', 'Z', |
|
3548
|
|
|
'SURF(EI)-', 'SÖRF', 'ZÖRF', |
|
3549
|
|
|
'SV(AEÈÉÊIÌÍÎOU)-<^', 'SW', None, |
|
3550
|
|
|
'SYB(IY)--^', 'SIB', None, |
|
3551
|
|
|
'SYL(KVW)--^', 'SI', None, |
|
3552
|
|
|
'SY9^', 'SÜ', None, |
|
3553
|
|
|
'SZE(NPT)-^', 'ZE', 'ZE', |
|
3554
|
|
|
'SZI(ELN)-^', 'ZI', 'ZI', |
|
3555
|
|
|
'SZCZ<', 'SH', 'Z', |
|
3556
|
|
|
'SZT<', 'ST', 'ZT', |
|
3557
|
|
|
'SZ<3', 'SH', 'Z', |
|
3558
|
|
|
'SÜL(KVW)--^', 'SI', None, |
|
3559
|
|
|
'S', None, 'Z', |
|
3560
|
|
|
'TCH', 'SH', 'Z', |
|
3561
|
|
|
'TD(AÄEIOÖRUÜY)-', 'T', None, |
|
3562
|
|
|
'TD(ÀÁÂÃÅÈÉÊËÌÍÎÏÒÓÔÕØÙÚÛÝŸ)-', 'T', None, |
|
3563
|
|
|
'TEAT-^', 'TEA', 'TEA', |
|
3564
|
|
|
'TERRAI7^', 'TERA', 'TERA', |
|
3565
|
|
|
'TE(LMNRST)-3^', 'TE', 'TE', |
|
3566
|
|
|
'TH<', 'T', 'T', |
|
3567
|
|
|
'TICHT-', 'TIK', 'TIK', |
|
3568
|
|
|
'TICH$', 'TIK', 'TIK', |
|
3569
|
|
|
'TIC$', 'TIZ', 'TIZ', |
|
3570
|
|
|
'TIGGESTELL-------', 'TIK ', 'TIK ', |
|
3571
|
|
|
'TIGSTELL-----', 'TIK ', 'TIK ', |
|
3572
|
|
|
'TOAS-^', 'TO', 'TU', |
|
3573
|
|
|
'TOILET-', 'TOLE', 'TULE', |
|
3574
|
|
|
'TOIN-', 'TOA', 'TUA', |
|
3575
|
|
|
'TRAECHTI-^', 'TRECHT', 'TREKT', |
|
3576
|
|
|
'TRAECHTIG--', ' TRECHT', ' TREKT', |
|
3577
|
|
|
'TRAINI-', 'TREN', 'TREN', |
|
3578
|
|
|
'TRÄCHTI-^', 'TRECHT', 'TREKT', |
|
3579
|
|
|
'TRÄCHTIG--', ' TRECHT', ' TREKT', |
|
3580
|
|
|
'TSCH', 'SH', 'Z', |
|
3581
|
|
|
'TSH', 'SH', 'Z', |
|
3582
|
|
|
'TST', 'ZT', 'ZT', |
|
3583
|
|
|
'T(Sß)', 'Z', 'Z', |
|
3584
|
|
|
'TT(SZ)--<', '', '', |
|
3585
|
|
|
'TT9', 'T', 'T', |
|
3586
|
|
|
'TV^$', 'TV', 'TV', |
|
3587
|
|
|
'TX(AEIOU)-3', 'SH', 'Z', |
|
3588
|
|
|
'TY9^', 'TÜ', None, |
|
3589
|
|
|
'TZ-', '', '', |
|
3590
|
|
|
'T\'S3$', 'Z', 'Z', |
|
3591
|
|
|
'T´S3$', 'Z', 'Z', |
|
3592
|
|
|
'UEBEL(GNRW)-^^', 'ÜBL ', 'IBL ', |
|
3593
|
|
|
'UEBER^^', 'ÜBA', 'IBA', |
|
3594
|
|
|
'UE2', 'Ü', 'I', |
|
3595
|
|
|
'UGL-', 'UK', None, |
|
3596
|
|
|
'UH(AOÖUÜY)-', 'UH', None, |
|
3597
|
|
|
'UIE$', 'Ü', 'I', |
|
3598
|
|
|
'UM^^', 'UM', 'UN', |
|
3599
|
|
|
'UNTERE--3', 'UNTE', 'UNTE', |
|
3600
|
|
|
'UNTER^^', 'UNTA', 'UNTA', |
|
3601
|
|
|
'UNVER^^', 'UNFA', 'UNFA', |
|
3602
|
|
|
'UN^^', 'UN', 'UN', |
|
3603
|
|
|
'UTI(AÄOÖUÜ)-', 'UZI', 'UZI', |
|
3604
|
|
|
'UVE-4', 'UW', None, |
|
3605
|
|
|
'UY2', 'UI', None, |
|
3606
|
|
|
'UZZ', 'AS', 'AZ', |
|
3607
|
|
|
'VACL-^', 'WAZ', 'FAZ', |
|
3608
|
|
|
'VAC$', 'WAZ', 'FAZ', |
|
3609
|
|
|
'VAN DEN ^', 'FANDN', 'FANTN', |
|
3610
|
|
|
'VANES-^', 'WANE', None, |
|
3611
|
|
|
'VATRO-', 'WATR', None, |
|
3612
|
|
|
'VA(DHJNT)--^', 'F', None, |
|
3613
|
|
|
'VEDD-^', 'FE', 'FE', |
|
3614
|
|
|
'VE(BEHIU)--^', 'F', None, |
|
3615
|
|
|
'VEL(BDLMNT)-^', 'FEL', None, |
|
3616
|
|
|
'VENTZ-^', 'FEN', None, |
|
3617
|
|
|
'VEN(NRSZ)-^', 'FEN', None, |
|
3618
|
|
|
'VER(AB)-^$', 'WER', None, |
|
3619
|
|
|
'VERBAL^$', 'WERBAL', None, |
|
3620
|
|
|
'VERBAL(EINS)-^', 'WERBAL', None, |
|
3621
|
|
|
'VERTEBR--', 'WERTE', None, |
|
3622
|
|
|
'VEREIN-----', 'F', None, |
|
3623
|
|
|
'VEREN(AEIOU)-^', 'WEREN', None, |
|
3624
|
|
|
'VERIFI', 'WERIFI', None, |
|
3625
|
|
|
'VERON(AEIOU)-^', 'WERON', None, |
|
3626
|
|
|
'VERSEN^', 'FERSN', 'FAZN', |
|
3627
|
|
|
'VERSIERT--^', 'WERSI', None, |
|
3628
|
|
|
'VERSIO--^', 'WERS', None, |
|
3629
|
|
|
'VERSUS', 'WERSUS', None, |
|
3630
|
|
|
'VERTI(GK)-', 'WERTI', None, |
|
3631
|
|
|
'VER^^', 'FER', 'FA', |
|
3632
|
|
|
'VERSPRECHE-------', ' FER', ' FA', |
|
3633
|
|
|
'VER$', 'WA', None, |
|
3634
|
|
|
'VER', 'FA', 'FA', |
|
3635
|
|
|
'VET(HT)-^', 'FET', 'FET', |
|
3636
|
|
|
'VETTE$', 'WET', 'FET', |
|
3637
|
|
|
'VE^', 'WE', None, |
|
3638
|
|
|
'VIC$', 'WIZ', 'FIZ', |
|
3639
|
|
|
'VIELSAGE----', 'FIL ', 'FIL ', |
|
3640
|
|
|
'VIEL', 'FIL', 'FIL', |
|
3641
|
|
|
'VIEW', 'WIU', 'FIU', |
|
3642
|
|
|
'VILL(AE)-', 'WIL', None, |
|
3643
|
|
|
'VIS(ACEIKUVWZ)-<^', 'WIS', None, |
|
3644
|
|
|
'VI(ELS)--^', 'F', None, |
|
3645
|
|
|
'VILLON--', 'WILI', 'FILI', |
|
3646
|
|
|
'VIZE^^', 'FIZE', 'FIZE', |
|
3647
|
|
|
'VLIE--^', 'FL', None, |
|
3648
|
|
|
'VL(AEIOU)--', 'W', None, |
|
3649
|
|
|
'VOKA-^', 'WOK', None, |
|
3650
|
|
|
'VOL(ATUVW)--^', 'WO', None, |
|
3651
|
|
|
'VOR^^', 'FOR', 'FUR', |
|
3652
|
|
|
'VR(AEIOU)--', 'W', None, |
|
3653
|
|
|
'VV9', 'W', None, |
|
3654
|
|
|
'VY9^', 'WÜ', 'FI', |
|
3655
|
|
|
'V(ÜY)-', 'W', None, |
|
3656
|
|
|
'V(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'W', None, |
|
3657
|
|
|
'V(AEIJLRU)-<', 'W', None, |
|
3658
|
|
|
'V.^', 'V.', None, |
|
3659
|
|
|
'V<', 'F', 'F', |
|
3660
|
|
|
'WEITERENTWI-----^', 'WEITA ', 'FEITA ', |
|
3661
|
|
|
'WEITREICH-----^', 'WEIT ', 'FEIT ', |
|
3662
|
|
|
'WEITVER^', 'WEIT FER', 'FEIT FA', |
|
3663
|
|
|
'WE(LMNRST)-3^', 'WE', 'FE', |
|
3664
|
|
|
'WER(DST)-', 'WER', None, |
|
3665
|
|
|
'WIC$', 'WIZ', 'FIZ', |
|
3666
|
|
|
'WIEDERU--', 'WIDE', 'FITE', |
|
3667
|
|
|
'WIEDER^$', 'WIDA', 'FITA', |
|
3668
|
|
|
'WIEDER^^', 'WIDA ', 'FITA ', |
|
3669
|
|
|
'WIEVIEL', 'WI FIL', 'FI FIL', |
|
3670
|
|
|
'WISUEL', 'WISUEL', None, |
|
3671
|
|
|
'WR-^', 'W', None, |
|
3672
|
|
|
'WY9^', 'WÜ', 'FI', |
|
3673
|
|
|
'W(BDFGJKLMNPQRSTZ)-', 'F', None, |
|
3674
|
|
|
'W$', 'F', None, |
|
3675
|
|
|
'W', None, 'F', |
|
3676
|
|
|
'X<^', 'Z', 'Z', |
|
3677
|
|
|
'XHAVEN$', 'XAFN', None, |
|
3678
|
|
|
'X(CSZ)', 'X', 'X', |
|
3679
|
|
|
'XTS(CH)--', 'XT', 'XT', |
|
3680
|
|
|
'XT(SZ)', 'Z', 'Z', |
|
3681
|
|
|
'YE(LMNRST)-3^', 'IE', 'IE', |
|
3682
|
|
|
'YE-3', 'I', 'I', |
|
3683
|
|
|
'YOR(GK)^$', 'IÖRK', 'IÖRK', |
|
3684
|
|
|
'Y(AOU)-<7', 'I', 'I', |
|
3685
|
|
|
'Y(BKLMNPRSTX)-1', 'Ü', None, |
|
3686
|
|
|
'YVES^$', 'IF', 'IF', |
|
3687
|
|
|
'YVONNE^$', 'IWON', 'IFUN', |
|
3688
|
|
|
'Y.^', 'Y.', None, |
|
3689
|
|
|
'Y', 'I', 'I', |
|
3690
|
|
|
'ZC(AOU)-', 'SK', 'ZK', |
|
3691
|
|
|
'ZE(LMNRST)-3^', 'ZE', 'ZE', |
|
3692
|
|
|
'ZIEJ$', 'ZI', 'ZI', |
|
3693
|
|
|
'ZIGERJA(HR)-3', 'ZIGA IA', 'ZIKA IA', |
|
3694
|
|
|
'ZL(AEIOU)-', 'SL', None, |
|
3695
|
|
|
'ZS(CHT)--', '', '', |
|
3696
|
|
|
'ZS', 'SH', 'Z', |
|
3697
|
|
|
'ZUERST', 'ZUERST', 'ZUERST', |
|
3698
|
|
|
'ZUGRUNDE^$', 'ZU GRUNDE', 'ZU KRUNTE', |
|
3699
|
|
|
'ZUGRUNDE', 'ZU GRUNDE ', 'ZU KRUNTE ', |
|
3700
|
|
|
'ZUGUNSTEN', 'ZU GUNSTN', 'ZU KUNZTN', |
|
3701
|
|
|
'ZUHAUSE-', 'ZU HAUS', 'ZU AUZ', |
|
3702
|
|
|
'ZULASTEN^$', 'ZU LASTN', 'ZU LAZTN', |
|
3703
|
|
|
'ZURUECK^^', 'ZURÜK', 'ZURIK', |
|
3704
|
|
|
'ZURZEIT', 'ZUR ZEIT', 'ZUR ZEIT', |
|
3705
|
|
|
'ZURÜCK^^', 'ZURÜK', 'ZURIK', |
|
3706
|
|
|
'ZUSTANDE', 'ZU STANDE', 'ZU ZTANTE', |
|
3707
|
|
|
'ZUTAGE', 'ZU TAGE', 'ZU TAKE', |
|
3708
|
|
|
'ZUVER^^', 'ZUFA', 'ZUFA', |
|
3709
|
|
|
'ZUVIEL', 'ZU FIL', 'ZU FIL', |
|
3710
|
|
|
'ZUWENIG', 'ZU WENIK', 'ZU FENIK', |
|
3711
|
|
|
'ZY9^', 'ZÜ', None, |
|
3712
|
|
|
'ZYK3$', 'ZIK', None, |
|
3713
|
|
|
'Z(VW)7^', 'SW', None, |
|
3714
|
|
|
None, None, None) |
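# An informal reading of the rule-table format above (derived from the
# matcher in _phonet() further down, not from separate documentation, so
# treat it as a summary rather than a specification):
#
#   - rules come in triples: (pattern, replacement for mode 1,
#     replacement for mode 2); a None replacement means the rule is
#     skipped for that mode, and (None, None, None) terminates the table;
#   - in a pattern, leading letters are matched literally, '(..)'
#     matches one character out of the set, a trailing '-' marks matched
#     context that is not consumed, '<' writes the replacement back into
#     the source string so it is re-coded, a digit sets the rule
#     priority (default 5), '^' anchors at a word start, '$' at a word
#     end, and '^^' additionally restarts coding right after the match;
#   - two rules from the table as examples: ('AUX(IY)-', 'AUX', 'AUX')
#     matches "AUX" followed by an unconsumed I or Y, while
#     ('VER^^', 'FER', 'FA') fires only at a word start and, after
#     emitting FER/FA, codes the rest of the word as if it were new.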
|
3715
|
|
|
|
|
3716
|
|
|
phonet_hash = Counter() |
|
3717
|
|
|
alpha_pos = Counter() |
|
3718
|
|
|
|
|
3719
|
|
|
phonet_hash_1 = Counter() |
|
3720
|
|
|
phonet_hash_2 = Counter() |
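# Counter (rather than a plain dict) gives these tables a default of 0
# for keys that were never registered, so lookups on punctuation or
# other non-letters fall through to the "not a letter" branches below
# without raising KeyError.  A tiny illustration:
#
#     probe = Counter()
#     probe['?']    # -> 0, where a plain dict would raise KeyError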
|
3721
|
|
|
|
|
3722
|
|
|
_phonet_upper_translation = dict(zip((ord(_) for _ in |
|
|
|
|
|
|
3723
|
|
|
'abcdefghijklmnopqrstuvwxyzàáâãåäæ' + |
|
3724
|
|
|
'çðèéêëìíîïñòóôõöøœšßþùúûüýÿ'), |
|
3725
|
|
|
'ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÅÄÆ' + |
|
3726
|
|
|
'ÇÐÈÉÊËÌÍÎÏÑÒÓÔÕÖØŒŠßÞÙÚÛÜÝŸ')) |
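# A quick illustration of the mapping above ('münchhausen' is an
# arbitrary sample word, not taken from the library's tests):
#
#     >>> 'münchhausen'.translate(_phonet_upper_translation)
#     'MÜNCHHAUSEN'
#
# Note that the table maps 'ß' to itself, so translate() keeps it as a
# single character, whereas str.upper() on Python 3 would expand it to
# 'SS'.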
|
3727
|
|
|
|
|
3728
|
|
|
def _trinfo(text, rule, err_text, lang): |
|
3729
|
|
|
"""Output debug information.""" |
|
3730
|
|
|
if lang == 'none': |
|
3731
|
|
|
_phonet_rules = _phonet_rules_no_lang |
|
3732
|
|
|
else: |
|
3733
|
|
|
_phonet_rules = _phonet_rules_german |
|
3734
|
|
|
|
|
3735
|
|
|
from_rule = ('(NULL)' if _phonet_rules[rule] is None else |
|
3736
|
|
|
_phonet_rules[rule]) |
|
3737
|
|
|
to_rule1 = ('(NULL)' if (_phonet_rules[rule + 1] is None) else |
|
3738
|
|
|
_phonet_rules[rule + 1]) |
|
3739
|
|
|
to_rule2 = ('(NULL)' if (_phonet_rules[rule + 2] is None) else |
|
3740
|
|
|
_phonet_rules[rule + 2]) |
|
3741
|
|
|
print('"{} {}: "{}"{}"{}" {}'.format(text, ((rule / 3) + 1), |
|
3742
|
|
|
from_rule, to_rule1, to_rule2, |
|
3743
|
|
|
err_text)) |
|
3744
|
|
|
|
|
3745
|
|
|
def _initialize_phonet(lang): |
|
3746
|
|
|
"""Initialize phonet variables.""" |
|
3747
|
|
|
if lang == 'none': |
|
3748
|
|
|
_phonet_rules = _phonet_rules_no_lang |
|
3749
|
|
|
else: |
|
3750
|
|
|
_phonet_rules = _phonet_rules_german |
|
3751
|
|
|
|
|
3752
|
|
|
phonet_hash[''] = -1 |
|
3753
|
|
|
|
|
3754
|
|
|
# German and international umlauts |
|
3755
|
|
|
for j in {'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë', |
|
3756
|
|
|
'Ì', 'Í', 'Î', 'Ï', 'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', 'Ø', |
|
3757
|
|
|
'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß', 'Œ', 'Š', 'Ÿ'}: |
|
3758
|
|
|
alpha_pos[j] = 1 |
|
3759
|
|
|
phonet_hash[j] = -1 |
|
3760
|
|
|
|
|
3761
|
|
|
# "normal" letters ('A'-'Z') |
|
3762
|
|
|
for i, j in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ'): |
|
3763
|
|
|
alpha_pos[j] = i + 2 |
|
3764
|
|
|
phonet_hash[j] = -1 |
|
3765
|
|
|
|
|
3766
|
|
|
for i in range(26): |
|
3767
|
|
|
for j in range(28): |
|
3768
|
|
|
phonet_hash_1[i, j] = -1 |
|
3769
|
|
|
phonet_hash_2[i, j] = -1 |
|
3770
|
|
|
|
|
3771
|
|
|
# for each phonetic rule |
|
3772
|
|
|
for i in range(len(_phonet_rules)): |
|
3773
|
|
|
rule = _phonet_rules[i] |
|
3774
|
|
|
|
|
3775
|
|
|
if rule and i % 3 == 0: |
|
3776
|
|
|
# calculate first hash value |
|
3777
|
|
|
k = _phonet_rules[i][0] |
|
3778
|
|
|
|
|
3779
|
|
|
if phonet_hash[k] < 0 and (_phonet_rules[i+1] or |
|
|
|
|
|
|
3780
|
|
|
_phonet_rules[i+2]): |
|
3781
|
|
|
phonet_hash[k] = i |
|
3782
|
|
|
|
|
3783
|
|
|
# calculate second hash values |
|
3784
|
|
|
if k and alpha_pos[k] >= 2: |
|
|
|
|
|
|
3785
|
|
|
k = alpha_pos[k] |
|
3786
|
|
|
|
|
3787
|
|
|
j = k-2 |
|
3788
|
|
|
rule = rule[1:] |
|
3789
|
|
|
|
|
3790
|
|
|
if not rule: |
|
3791
|
|
|
rule = ' ' |
|
3792
|
|
|
elif rule[0] == '(': |
|
3793
|
|
|
rule = rule[1:] |
|
3794
|
|
|
else: |
|
3795
|
|
|
rule = rule[0] |
|
3796
|
|
|
|
|
3797
|
|
|
while rule and (rule[0] != ')'): |
|
3798
|
|
|
k = alpha_pos[rule[0]] |
|
3799
|
|
|
|
|
3800
|
|
|
if k > 0: |
|
3801
|
|
|
# add hash value for this letter |
|
3802
|
|
|
if phonet_hash_1[j, k] < 0: |
|
|
|
|
|
|
3803
|
|
|
phonet_hash_1[j, k] = i |
|
3804
|
|
|
phonet_hash_2[j, k] = i |
|
3805
|
|
|
|
|
3806
|
|
|
if phonet_hash_2[j, k] >= (i-30): |
|
|
|
|
|
|
3807
|
|
|
phonet_hash_2[j, k] = i |
|
3808
|
|
|
else: |
|
3809
|
|
|
k = -1 |
|
3810
|
|
|
|
|
3811
|
|
|
if k <= 0: |
|
3812
|
|
|
# add hash value for all letters |
|
3813
|
|
|
if phonet_hash_1[j, 0] < 0: |
|
3814
|
|
|
phonet_hash_1[j, 0] = i |
|
3815
|
|
|
|
|
3816
|
|
|
phonet_hash_2[j, 0] = i |
|
3817
|
|
|
|
|
3818
|
|
|
rule = rule[1:] |
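# Informal summary of the lookup tables filled in above (read off the
# loop itself rather than any external documentation):
#
#   alpha_pos[c]         1 for the umlaut/special letters, 2..27 for
#                        'A'..'Z', and 0 (the Counter default) otherwise.
#   phonet_hash[c]       index of the first rule whose pattern starts
#                        with c, or -1 if there is none.
#   phonet_hash_1[j, k]  roughly the first and last rule index for
#   phonet_hash_2[j, k]  patterns starting with the letter whose
#                        alpha_pos is j + 2, followed by a letter whose
#                        alpha_pos is k; column k == 0 collects rules
#                        that accept any following character.  Both
#                        default to -1.
#
# _phonet() uses these tables to jump straight to the candidate rules
# for the current two-character window instead of scanning every rule.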
|
3819
|
|
|
|
|
3820
|
|
|
def _phonet(term, mode, lang, trace): |
|
3821
|
|
|
"""Return the phonet coded form of a term.""" |
|
3822
|
|
|
if lang == 'none': |
|
3823
|
|
|
_phonet_rules = _phonet_rules_no_lang |
|
3824
|
|
|
else: |
|
3825
|
|
|
_phonet_rules = _phonet_rules_german |
|
3826
|
|
|
|
|
3827
|
|
|
char0 = '' |
|
3828
|
|
|
dest = term |
|
3829
|
|
|
|
|
3830
|
|
|
if not term: |
|
3831
|
|
|
return '' |
|
3832
|
|
|
|
|
3833
|
|
|
term_length = len(term) |
|
3834
|
|
|
|
|
3835
|
|
|
# convert input string to upper-case |
|
3836
|
|
|
src = term.translate(_phonet_upper_translation) |
|
3837
|
|
|
|
|
3838
|
|
|
# check "src" |
|
3839
|
|
|
i = 0 |
|
3840
|
|
|
j = 0 |
|
3841
|
|
|
zeta = 0 |
|
3842
|
|
|
|
|
3843
|
|
|
while i < len(src): |
|
3844
|
|
|
char = src[i] |
|
3845
|
|
|
|
|
3846
|
|
|
if trace: |
|
3847
|
|
|
print('\ncheck position {}: src = "{}", dest = "{}"'.format |
|
3848
|
|
|
(j, src[i:], dest[:j])) |
|
3849
|
|
|
|
|
3850
|
|
|
pos = alpha_pos[char] |
|
3851
|
|
|
|
|
3852
|
|
|
if pos >= 2: |
|
3853
|
|
|
xpos = pos-2 |
|
3854
|
|
|
|
|
3855
|
|
|
if i+1 == len(src): |
|
3856
|
|
|
pos = alpha_pos[''] |
|
3857
|
|
|
else: |
|
3858
|
|
|
pos = alpha_pos[src[i+1]] |
|
3859
|
|
|
|
|
3860
|
|
|
start1 = phonet_hash_1[xpos, pos] |
|
3861
|
|
|
start2 = phonet_hash_1[xpos, 0] |
|
3862
|
|
|
end1 = phonet_hash_2[xpos, pos] |
|
3863
|
|
|
end2 = phonet_hash_2[xpos, 0] |
|
3864
|
|
|
|
|
3865
|
|
|
# preserve rule priorities |
|
3866
|
|
|
if (start2 >= 0) and ((start1 < 0) or (start2 < start1)): |
|
3867
|
|
|
pos = start1 |
|
3868
|
|
|
start1 = start2 |
|
3869
|
|
|
start2 = pos |
|
3870
|
|
|
pos = end1 |
|
3871
|
|
|
end1 = end2 |
|
3872
|
|
|
end2 = pos |
|
3873
|
|
|
|
|
3874
|
|
|
if (end1 >= start2) and (start2 >= 0): |
|
3875
|
|
|
if end2 > end1: |
|
3876
|
|
|
end1 = end2 |
|
3877
|
|
|
|
|
3878
|
|
|
start2 = -1 |
|
3879
|
|
|
end2 = -1 |
|
3880
|
|
|
else: |
|
3881
|
|
|
pos = phonet_hash[char] |
|
3882
|
|
|
start1 = pos |
|
3883
|
|
|
end1 = 10000 |
|
3884
|
|
|
start2 = -1 |
|
3885
|
|
|
end2 = -1 |
|
3886
|
|
|
|
|
3887
|
|
|
pos = start1 |
|
3888
|
|
|
zeta0 = 0 |
|
3889
|
|
|
|
|
3890
|
|
|
if pos >= 0: |
|
3891
|
|
|
# check rules for this char |
|
3892
|
|
|
while ((_phonet_rules[pos] is None) or |
|
3893
|
|
|
(_phonet_rules[pos][0] == char)): |
|
3894
|
|
|
if pos > end1: |
|
3895
|
|
|
if start2 > 0: |
|
3896
|
|
|
pos = start2 |
|
3897
|
|
|
start1 = start2 |
|
3898
|
|
|
start2 = -1 |
|
3899
|
|
|
end1 = end2 |
|
3900
|
|
|
end2 = -1 |
|
3901
|
|
|
continue |
|
3902
|
|
|
|
|
3903
|
|
|
break |
|
3904
|
|
|
|
|
3905
|
|
|
if (((_phonet_rules[pos] is None) or |
|
3906
|
|
|
(_phonet_rules[pos + mode] is None))): |
|
3907
|
|
|
# no conversion rule available |
|
3908
|
|
|
pos += 3 |
|
3909
|
|
|
continue |
|
3910
|
|
|
|
|
3911
|
|
|
if trace: |
|
3912
|
|
|
_trinfo('> rule no.', pos, 'is being checked', lang) |
|
3913
|
|
|
|
|
3914
|
|
|
# check whole string |
|
3915
|
|
|
matches = 1 # number of matching letters |
|
3916
|
|
|
priority = 5 # default priority |
|
3917
|
|
|
rule = _phonet_rules[pos] |
|
3918
|
|
|
rule = rule[1:] |
|
3919
|
|
|
|
|
3920
|
|
|
while (rule and |
|
3921
|
|
|
(len(src) > (i + matches)) and |
|
3922
|
|
|
(src[i + matches] == rule[0]) and |
|
3923
|
|
|
not rule[0].isdigit() and |
|
3924
|
|
|
(rule not in '(-<^$')): |
|
3925
|
|
|
matches += 1 |
|
3926
|
|
|
rule = rule[1:] |
|
3927
|
|
|
|
|
3928
|
|
|
if rule and (rule[0] == '('): |
|
3929
|
|
|
# check an array of letters |
|
3930
|
|
|
if (((len(src) > (i + matches)) and |
|
3931
|
|
|
src[i + matches].isalpha() and |
|
3932
|
|
|
(src[i + matches] in rule[1:]))): |
|
3933
|
|
|
matches += 1 |
|
3934
|
|
|
|
|
3935
|
|
|
while rule and rule[0] != ')': |
|
3936
|
|
|
rule = rule[1:] |
|
3937
|
|
|
|
|
3938
|
|
|
# if rule[0] == ')': |
|
3939
|
|
|
rule = rule[1:] |
|
3940
|
|
|
|
|
3941
|
|
|
if rule: |
|
3942
|
|
|
priority0 = ord(rule[0]) |
|
3943
|
|
|
else: |
|
3944
|
|
|
priority0 = 0 |
|
3945
|
|
|
|
|
3946
|
|
|
matches0 = matches |
|
3947
|
|
|
|
|
3948
|
|
|
while rule and rule[0] == '-' and matches > 1: |
|
3949
|
|
|
matches -= 1 |
|
3950
|
|
|
rule = rule[1:] |
|
3951
|
|
|
|
|
3952
|
|
|
if rule and rule[0] == '<': |
|
3953
|
|
|
rule = rule[1:] |
|
3954
|
|
|
|
|
3955
|
|
|
if rule and rule[0].isdigit(): |
|
3956
|
|
|
# read priority |
|
3957
|
|
|
priority = int(rule[0]) |
|
3958
|
|
|
rule = rule[1:] |
|
3959
|
|
|
|
|
3960
|
|
|
if rule and rule[0:2] == '^^': |
|
3961
|
|
|
rule = rule[1:] |
|
3962
|
|
|
|
|
3963
|
|
|
if (not rule or |
|
3964
|
|
|
((rule[0] == '^') and |
|
3965
|
|
|
((i == 0) or not src[i-1].isalpha()) and |
|
3966
|
|
|
((rule[1:2] != '$') or |
|
3967
|
|
|
(not (src[i+matches0:i+matches0+1].isalpha()) and |
|
3968
|
|
|
(src[i+matches0:i+matches0+1] != '.')))) or |
|
3969
|
|
|
((rule[0] == '$') and (i > 0) and |
|
3970
|
|
|
src[i-1].isalpha() and |
|
3971
|
|
|
((not src[i+matches0:i+matches0+1].isalpha()) and |
|
3972
|
|
|
(src[i+matches0:i+matches0+1] != '.')))): |
|
3973
|
|
|
# look for continuation, if: |
|
3974
|
|
|
# matches > 1 and NO '-' in first string |
|
3975
|
|
|
pos0 = -1 |
|
3976
|
|
|
|
|
3977
|
|
|
start3 = 0 |
|
3978
|
|
|
start4 = 0 |
|
3979
|
|
|
end3 = 0 |
|
3980
|
|
|
end4 = 0 |
|
3981
|
|
|
|
|
3982
|
|
|
if (((matches > 1) and |
|
3983
|
|
|
src[i+matches:i+matches+1] and |
|
3984
|
|
|
(priority0 != ord('-')))): |
|
3985
|
|
|
char0 = src[i+matches-1] |
|
3986
|
|
|
pos0 = alpha_pos[char0] |
|
3987
|
|
|
|
|
3988
|
|
|
if pos0 >= 2 and src[i+matches]: |
|
3989
|
|
|
xpos = pos0 - 2 |
|
3990
|
|
|
pos0 = alpha_pos[src[i+matches]] |
|
3991
|
|
|
start3 = phonet_hash_1[xpos, pos0] |
|
3992
|
|
|
start4 = phonet_hash_1[xpos, 0] |
|
3993
|
|
|
end3 = phonet_hash_2[xpos, pos0] |
|
3994
|
|
|
end4 = phonet_hash_2[xpos, 0] |
|
3995
|
|
|
|
|
3996
|
|
|
# preserve rule priorities |
|
3997
|
|
|
if (((start4 >= 0) and |
|
3998
|
|
|
((start3 < 0) or (start4 < start3)))): |
|
3999
|
|
|
pos0 = start3 |
|
4000
|
|
|
start3 = start4 |
|
4001
|
|
|
start4 = pos0 |
|
4002
|
|
|
pos0 = end3 |
|
4003
|
|
|
end3 = end4 |
|
4004
|
|
|
end4 = pos0 |
|
4005
|
|
|
|
|
4006
|
|
|
if (end3 >= start4) and (start4 >= 0): |
|
4007
|
|
|
if end4 > end3: |
|
4008
|
|
|
end3 = end4 |
|
4009
|
|
|
|
|
4010
|
|
|
start4 = -1 |
|
4011
|
|
|
end4 = -1 |
|
4012
|
|
|
else: |
|
4013
|
|
|
pos0 = phonet_hash[char0] |
|
4014
|
|
|
start3 = pos0 |
|
4015
|
|
|
end3 = 10000 |
|
4016
|
|
|
start4 = -1 |
|
4017
|
|
|
end4 = -1 |
|
4018
|
|
|
|
|
4019
|
|
|
pos0 = start3 |
|
4020
|
|
|
|
|
4021
|
|
|
# check continuation rules for src[i+matches] |
|
4022
|
|
|
if pos0 >= 0: |
|
4023
|
|
|
while ((_phonet_rules[pos0] is None) or |
|
4024
|
|
|
(_phonet_rules[pos0][0] == char0)): |
|
4025
|
|
|
if pos0 > end3: |
|
4026
|
|
|
if start4 > 0: |
|
4027
|
|
|
pos0 = start4 |
|
4028
|
|
|
start3 = start4 |
|
4029
|
|
|
start4 = -1 |
|
4030
|
|
|
end3 = end4 |
|
4031
|
|
|
end4 = -1 |
|
4032
|
|
|
continue |
|
4033
|
|
|
|
|
4034
|
|
|
priority0 = -1 |
|
4035
|
|
|
|
|
4036
|
|
|
# important |
|
4037
|
|
|
break |
|
4038
|
|
|
|
|
4039
|
|
|
if (((_phonet_rules[pos0] is None) or |
|
4040
|
|
|
(_phonet_rules[pos0 + mode] is None))): |
|
4041
|
|
|
# no conversion rule available |
|
4042
|
|
|
pos0 += 3 |
|
4043
|
|
|
continue |
|
4044
|
|
|
|
|
4045
|
|
|
if trace: |
|
4046
|
|
|
_trinfo('> > continuation rule no.', pos0, |
|
4047
|
|
|
'is being checked', lang) |
|
4048
|
|
|
|
|
4049
|
|
|
# check whole string |
|
4050
|
|
|
matches0 = matches |
|
4051
|
|
|
priority0 = 5 |
|
4052
|
|
|
rule = _phonet_rules[pos0] |
|
4053
|
|
|
rule = rule[1:] |
|
4054
|
|
|
|
|
4055
|
|
|
while (rule and |
|
4056
|
|
|
(src[i+matches0:i+matches0+1] == |
|
4057
|
|
|
rule[0]) and |
|
4058
|
|
|
(not rule[0].isdigit() or |
|
4059
|
|
|
(rule in '(-<^$'))): |
|
4060
|
|
|
matches0 += 1 |
|
4061
|
|
|
rule = rule[1:] |
|
4062
|
|
|
|
|
4063
|
|
|
if rule and rule[0] == '(': |
|
4064
|
|
|
# check an array of letters |
|
4065
|
|
|
if ((src[i+matches0:i+matches0+1] |
|
4066
|
|
|
.isalpha() and |
|
4067
|
|
|
(src[i+matches0] in rule[1:]))): |
|
4068
|
|
|
matches0 += 1 |
|
4069
|
|
|
|
|
4070
|
|
|
while rule and rule[0] != ')': |
|
4071
|
|
|
rule = rule[1:] |
|
4072
|
|
|
|
|
4073
|
|
|
# if rule[0] == ')': |
|
4074
|
|
|
rule = rule[1:] |
|
4075
|
|
|
|
|
4076
|
|
|
while rule and rule[0] == '-': |
|
4077
|
|
|
# "matches0" is NOT decremented |
|
4078
|
|
|
# because of "if (matches0 == matches)" |
|
4079
|
|
|
rule = rule[1:] |
|
4080
|
|
|
|
|
4081
|
|
|
if rule and rule[0] == '<': |
|
4082
|
|
|
rule = rule[1:] |
|
4083
|
|
|
|
|
4084
|
|
|
if rule and rule[0].isdigit(): |
|
4085
|
|
|
priority0 = int(rule[0]) |
|
4086
|
|
|
rule = rule[1:] |
|
4087
|
|
|
|
|
4088
|
|
|
if (not rule or |
|
4089
|
|
|
# rule == '^' is not possible here |
|
4090
|
|
|
((rule[0] == '$') and not |
|
4091
|
|
|
src[i+matches0:i+matches0+1] |
|
4092
|
|
|
.isalpha() and |
|
4093
|
|
|
(src[i+matches0:i+matches0+1] |
|
4094
|
|
|
!= '.'))): |
|
4095
|
|
|
if matches0 == matches: |
|
4096
|
|
|
# this is only a partial string |
|
4097
|
|
|
if trace: |
|
4098
|
|
|
_trinfo('> > continuation ' + |
|
4099
|
|
|
'rule no.', |
|
4100
|
|
|
pos0, |
|
4101
|
|
|
'not used (too short)', |
|
4102
|
|
|
lang) |
|
4103
|
|
|
|
|
4104
|
|
|
pos0 += 3 |
|
4105
|
|
|
continue |
|
4106
|
|
|
|
|
4107
|
|
|
if priority0 < priority: |
|
4108
|
|
|
# priority is too low |
|
4109
|
|
|
if trace: |
|
4110
|
|
|
_trinfo('> > continuation ' + |
|
4111
|
|
|
'rule no.', |
|
4112
|
|
|
pos0, |
|
4113
|
|
|
'not used (priority)', |
|
4114
|
|
|
lang) |
|
4115
|
|
|
|
|
4116
|
|
|
pos0 += 3 |
|
4117
|
|
|
continue |
|
4118
|
|
|
|
|
4119
|
|
|
# continuation rule found |
|
4120
|
|
|
break |
|
4121
|
|
|
|
|
4122
|
|
|
if trace: |
|
4123
|
|
|
_trinfo('> > continuation rule no.', pos0, |
|
4124
|
|
|
'not used', lang) |
|
4125
|
|
|
|
|
4126
|
|
|
pos0 += 3 |
|
4127
|
|
|
|
|
4128
|
|
|
# end of "while" |
|
4129
|
|
|
if ((priority0 >= priority) and |
|
4130
|
|
|
((_phonet_rules[pos0] is not None) and |
|
4131
|
|
|
(_phonet_rules[pos0][0] == char0))): |
|
4132
|
|
|
|
|
4133
|
|
|
if trace: |
|
4134
|
|
|
_trinfo('> rule no.', pos, '', lang) |
|
4135
|
|
|
_trinfo('> not used because of ' + |
|
4136
|
|
|
'continuation', pos0, '', lang) |
|
4137
|
|
|
|
|
4138
|
|
|
pos += 3 |
|
4139
|
|
|
continue |
|
4140
|
|
|
|
|
4141
|
|
|
# replace string |
|
4142
|
|
|
if trace: |
|
4143
|
|
|
_trinfo('Rule no.', pos, 'is applied', lang) |
|
4144
|
|
|
|
|
4145
|
|
|
if ((_phonet_rules[pos] and |
|
4146
|
|
|
('<' in _phonet_rules[pos][1:]))): |
|
4147
|
|
|
priority0 = 1 |
|
4148
|
|
|
else: |
|
4149
|
|
|
priority0 = 0 |
|
4150
|
|
|
|
|
4151
|
|
|
rule = _phonet_rules[pos + mode] |
|
4152
|
|
|
|
|
4153
|
|
|
if (priority0 == 1) and (zeta == 0): |
|
4154
|
|
|
# rule with '<' is applied |
|
4155
|
|
|
if ((j > 0) and rule and |
|
4156
|
|
|
((dest[j-1] == char) or |
|
4157
|
|
|
(dest[j-1] == rule[0]))): |
|
4158
|
|
|
j -= 1 |
|
4159
|
|
|
|
|
4160
|
|
|
zeta0 = 1 |
|
4161
|
|
|
zeta += 1 |
|
4162
|
|
|
matches0 = 0 |
|
4163
|
|
|
|
|
4164
|
|
|
while rule and src[i+matches0]: |
|
4165
|
|
|
src = (src[0:i+matches0] + rule[0] + |
|
4166
|
|
|
src[i+matches0+1:]) |
|
4167
|
|
|
matches0 += 1 |
|
4168
|
|
|
rule = rule[1:] |
|
4169
|
|
|
|
|
4170
|
|
|
if matches0 < matches: |
|
4171
|
|
|
src = (src[0:i+matches0] + |
|
4172
|
|
|
src[i+matches:]) |
|
4173
|
|
|
|
|
4174
|
|
|
char = src[i] |
|
4175
|
|
|
else: |
|
4176
|
|
|
i = i + matches - 1 |
|
4177
|
|
|
zeta = 0 |
|
4178
|
|
|
|
|
4179
|
|
|
while len(rule) > 1: |
|
4180
|
|
|
if (j == 0) or (dest[j - 1] != rule[0]): |
|
4181
|
|
|
dest = (dest[0:j] + rule[0] + |
|
4182
|
|
|
dest[min(len(dest), j+1):]) |
|
4183
|
|
|
j += 1 |
|
4184
|
|
|
|
|
4185
|
|
|
rule = rule[1:] |
|
4186
|
|
|
|
|
4187
|
|
|
# new "current char" |
|
4188
|
|
|
if not rule: |
|
4189
|
|
|
rule = '' |
|
4190
|
|
|
char = '' |
|
4191
|
|
|
else: |
|
4192
|
|
|
char = rule[0] |
|
4193
|
|
|
|
|
4194
|
|
|
if ((_phonet_rules[pos] and |
|
4195
|
|
|
'^^' in _phonet_rules[pos][1:])): |
|
4196
|
|
|
if char: # pragma: no branch |
|
4197
|
|
|
dest = (dest[0:j] + char + |
|
4198
|
|
|
dest[min(len(dest), j + 1):]) |
|
4199
|
|
|
j += 1 |
|
4200
|
|
|
|
|
4201
|
|
|
src = src[i + 1:] |
|
4202
|
|
|
i = 0 |
|
4203
|
|
|
zeta0 = 1 |
|
4204
|
|
|
|
|
4205
|
|
|
break |
|
4206
|
|
|
|
|
4207
|
|
|
pos += 3 |
|
4208
|
|
|
|
|
4209
|
|
|
if pos > end1 and start2 > 0: |
|
4210
|
|
|
pos = start2 |
|
4211
|
|
|
start1 = start2 |
|
4212
|
|
|
end1 = end2 |
|
4213
|
|
|
start2 = -1 |
|
4214
|
|
|
end2 = -1 |
|
4215
|
|
|
|
|
4216
|
|
|
if zeta0 == 0: |
|
4217
|
|
|
if char and ((j == 0) or (dest[j-1] != char)): |
|
4218
|
|
|
# delete multiple letters only |
|
4219
|
|
|
dest = dest[0:j] + char + dest[min(j+1, term_length):] |
|
4220
|
|
|
j += 1 |
|
4221
|
|
|
|
|
4222
|
|
|
i += 1 |
|
4223
|
|
|
zeta = 0 |
|
4224
|
|
|
|
|
4225
|
|
|
dest = dest[0:j] |
|
4226
|
|
|
|
|
4227
|
|
|
return dest |
|
4228
|
|
|
|
|
4229
|
|
|
_initialize_phonet(lang) |
|
4230
|
|
|
|
|
4231
|
|
|
word = normalize('NFKC', text_type(word)) |
|
4232
|
|
|
return _phonet(word, mode, lang, trace) |
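# Illustrative calls to the public phonet() wrapper above (assuming the
# signature phonet(word, mode=1, lang='de', trace=False) declared
# earlier in this module; result strings are deliberately not shown
# here):
#
#     code_de   = phonet('Müller')               # first German rule variant
#     code_de2  = phonet('Müller', mode=2)       # coarser second variant
#     code_none = phonet('Miller', lang='none')  # language-independent rules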
|
4233
|
|
|
|
|
4234
|
|
|
|
|
4235
|
|
|
def spfc(word): |
|
4236
|
|
|
"""Return the Standardized Phonetic Frequency Code (SPFC) of a word. |
|
4237
|
|
|
|
|
4238
|
|
|
Standardized Phonetic Frequency Code is roughly Soundex-like. |
|
4239
|
|
|
This implementation is based on page 19-21 of :cite:`Moore:1977`. |
|
4240
|
|
|
|
|
4241
|
|
|
:param str word: the word to transform |
|
4242
|
|
|
:returns: the SPFC value |
|
4243
|
|
|
:rtype: str |
|
4244
|
|
|
|
|
4245
|
|
|
>>> spfc('Christopher Smith') |
|
4246
|
|
|
'01160' |
|
4247
|
|
|
>>> spfc('Christopher Schmidt') |
|
4248
|
|
|
'01160' |
|
4249
|
|
|
>>> spfc('Niall Smith') |
|
4250
|
|
|
'01660' |
|
4251
|
|
|
>>> spfc('Niall Schmidt') |
|
4252
|
|
|
'01660' |
|
4253
|
|
|
|
|
4254
|
|
|
>>> spfc('L.Smith') |
|
4255
|
|
|
'01960' |
|
4256
|
|
|
>>> spfc('R.Miller') |
|
4257
|
|
|
'65490' |
|
4258
|
|
|
|
|
4259
|
|
|
>>> spfc(('L', 'Smith')) |
|
4260
|
|
|
'01960' |
|
4261
|
|
|
>>> spfc(('R', 'Miller')) |
|
4262
|
|
|
'65490' |
|
4263
|
|
|
""" |
|
4264
|
|
|
_pf1 = dict(zip((ord(_) for _ in 'SZCKQVFPUWABLORDHIEMNXGJT'), |
|
|
|
|
|
|
4265
|
|
|
'0011112222334445556666777')) |
|
4266
|
|
|
_pf2 = dict(zip((ord(_) for _ in |
|
4267
|
|
|
'SZCKQFPXABORDHIMNGJTUVWEL'), |
|
4268
|
|
|
'0011122233445556677788899')) |
|
4269
|
|
|
_pf3 = dict(zip((ord(_) for _ in |
|
4270
|
|
|
'BCKQVDTFLPGJXMNRSZAEHIOUWY'), |
|
4271
|
|
|
'00000112223334456677777777')) |
|
4272
|
|
|
|
|
4273
|
|
|
_substitutions = (('DK', 'K'), ('DT', 'T'), ('SC', 'S'), ('KN', 'N'), |
|
4274
|
|
|
('MN', 'N')) |
|
4275
|
|
|
|
|
4276
|
|
|
def _raise_word_ex(): |
|
4277
|
|
|
"""Raise an AttributeError.""" |
|
4278
|
|
|
raise AttributeError('word attribute must be a string with a space ' + |
|
4279
|
|
|
'or period dividing the first and last names ' + |
|
4280
|
|
|
'or a tuple/list consisting of the first and ' + |
|
4281
|
|
|
'last names') |
|
4282
|
|
|
|
|
4283
|
|
|
if not word: |
|
4284
|
|
|
return '' |
|
4285
|
|
|
|
|
4286
|
|
|
if isinstance(word, (str, text_type)): |
|
4287
|
|
|
names = word.split('.', 1) |
|
4288
|
|
|
if len(names) != 2: |
|
4289
|
|
|
names = word.split(' ', 1) |
|
4290
|
|
|
if len(names) != 2: |
|
4291
|
|
|
_raise_word_ex() |
|
4292
|
|
|
elif hasattr(word, '__iter__'): |
|
4293
|
|
|
if len(word) != 2: |
|
4294
|
|
|
_raise_word_ex() |
|
4295
|
|
|
names = word |
|
4296
|
|
|
else: |
|
4297
|
|
|
_raise_word_ex() |
|
4298
|
|
|
|
|
4299
|
|
|
names = [normalize('NFKD', text_type(_.strip() |
|
4300
|
|
|
.replace('ß', 'SS') |
|
4301
|
|
|
.upper())) |
|
4302
|
|
|
for _ in names] |
|
|
|
|
|
|
4303
|
|
|
code = '' |
|
4304
|
|
|
|
|
4305
|
|
|
def steps_one_to_three(name): |
|
4306
|
|
|
"""Perform the first three steps of SPFC.""" |
|
4307
|
|
|
# filter out non A-Z |
|
4308
|
|
|
name = ''.join(_ for _ in name if _ in |
|
4309
|
|
|
{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', |
|
4310
|
|
|
'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', |
|
4311
|
|
|
'W', 'X', 'Y', 'Z'}) |
|
4312
|
|
|
|
|
4313
|
|
|
# 1. In the name field, convert DK to K, DT to T, SC to S, KN to N, |
|
4314
|
|
|
# and MN to N |
|
4315
|
|
|
for subst in _substitutions: |
|
4316
|
|
|
name = name.replace(subst[0], subst[1]) |
|
4317
|
|
|
|
|
4318
|
|
|
# 2. In the name field, replace multiple letters with a single letter |
|
4319
|
|
|
name = _delete_consecutive_repeats(name) |
|
4320
|
|
|
|
|
4321
|
|
|
# 3. Remove vowels, W, H, and Y, but keep the first letter in the name |
|
4322
|
|
|
# field. |
|
4323
|
|
|
if name: |
|
4324
|
|
|
name = name[0] + ''.join(_ for _ in name[1:] if _ not in |
|
4325
|
|
|
{'A', 'E', 'H', 'I', 'O', 'U', 'W', 'Y'}) |
|
4326
|
|
|
return name |
|
4327
|
|
|
|
|
4328
|
|
|
names = [steps_one_to_three(_) for _ in names] |
|
4329
|
|
|
|
|
4330
|
|
|
# 4. The first digit of the code is obtained using PF1 and the first letter |
|
4331
|
|
|
# of the name field. Remove this letter after coding. |
|
4332
|
|
|
if names[1]: |
|
4333
|
|
|
code += names[1][0].translate(_pf1) |
|
4334
|
|
|
names[1] = names[1][1:] |
|
4335
|
|
|
|
|
4336
|
|
|
# 5. Using the last letters of the name, use Table PF3 to obtain the |
|
4337
|
|
|
# second digit of the code. Use as many letters as possible and remove |
|
4338
|
|
|
# after coding. |
|
4339
|
|
|
if names[1]: |
|
4340
|
|
|
if names[1][-3:] == 'STN' or names[1][-3:] == 'PRS': |
|
4341
|
|
|
code += '8' |
|
4342
|
|
|
names[1] = names[1][:-3] |
|
4343
|
|
|
elif names[1][-2:] == 'SN': |
|
4344
|
|
|
code += '8' |
|
4345
|
|
|
names[1] = names[1][:-2] |
|
4346
|
|
|
elif names[1][-3:] == 'STR': |
|
4347
|
|
|
code += '9' |
|
4348
|
|
|
names[1] = names[1][:-3] |
|
4349
|
|
|
elif names[1][-2:] in {'SR', 'TN', 'TD'}: |
|
4350
|
|
|
code += '9' |
|
4351
|
|
|
names[1] = names[1][:-2] |
|
4352
|
|
|
elif names[1][-3:] == 'DRS': |
|
4353
|
|
|
code += '7' |
|
4354
|
|
|
names[1] = names[1][:-3] |
|
4355
|
|
|
elif names[1][-2:] in {'TR', 'MN'}: |
|
4356
|
|
|
code += '7' |
|
4357
|
|
|
names[1] = names[1][:-2] |
|
4358
|
|
|
else: |
|
4359
|
|
|
code += names[1][-1].translate(_pf3) |
|
4360
|
|
|
names[1] = names[1][:-1] |
|
4361
|
|
|
|
|
4362
|
|
|
# 6. The third digit is found using Table PF2 and the first character of |
|
4363
|
|
|
# the first name. Remove after coding. |
|
4364
|
|
|
if names[0]: |
|
4365
|
|
|
code += names[0][0].translate(_pf2) |
|
4366
|
|
|
names[0] = names[0][1:] |
|
4367
|
|
|
|
|
4368
|
|
|
# 7. The fourth digit is found using Table PF2 and the first character of |
|
4369
|
|
|
# the name field. If no letters remain use zero. After coding remove the |
|
4370
|
|
|
# letter. |
|
4371
|
|
|
# 8. The fifth digit is found in the same manner as the fourth using the |
|
4372
|
|
|
# remaining characters of the name field if any. |
|
4373
|
|
|
for _ in range(2): |
|
4374
|
|
|
if names[1]: |
|
4375
|
|
|
code += names[1][0].translate(_pf2) |
|
4376
|
|
|
names[1] = names[1][1:] |
|
4377
|
|
|
else: |
|
4378
|
|
|
code += '0' |
|
4379
|
|
|
|
|
4380
|
|
|
return code |
|
4381
|
|
|
|
|
4382
|
|
|
|
|
4383
|
|
|
def statistics_canada(word, maxlength=4): |
|
4384
|
|
|
"""Return the Statistics Canada code for a word. |
|
4385
|
|
|
|
|
4386
|
|
|
The original description of this algorithm could not be located, and |
|
4387
|
|
|
may only have been specified in an unpublished TR. The coding does not |
|
4388
|
|
|
appear to be in use by Statistics Canada any longer. In its place, this is |
|
4389
|
|
|
an implementation of the "Census modified Statistics Canada name coding |
|
4390
|
|
|
procedure". |
|
4391
|
|
|
|
|
4392
|
|
|
The modified version of this algorithm is described in Appendix B of |
|
4393
|
|
|
:cite:`Moore:1977`. |
|
4394
|
|
|
|
|
4395
|
|
|
:param str word: the word to transform |
|
4396
|
|
|
    :param int maxlength: the maximum length (default 4) of the code to return
|
4397
|
|
|
|
4398
|
|
|
:returns: the Statistics Canada name code value |
|
4399
|
|
|
:rtype: str |
|
4400
|
|
|
|
|
4401
|
|
|
>>> statistics_canada('Christopher') |
|
4402
|
|
|
'CHRS' |
|
4403
|
|
|
>>> statistics_canada('Niall') |
|
4404
|
|
|
'NL' |
|
4405
|
|
|
>>> statistics_canada('Smith') |
|
4406
|
|
|
'SMTH' |
|
4407
|
|
|
>>> statistics_canada('Schmidt') |
|
4408
|
|
|
'SCHM' |
|
4409
|
|
|
""" |
|
4410
|
|
|
# uppercase, normalize, decompose, and filter non-A-Z out |
|
4411
|
|
|
word = normalize('NFKD', text_type(word.upper())) |
|
4412
|
|
|
word = word.replace('ß', 'SS') |
|
4413
|
|
|
word = ''.join(c for c in word if c in |
|
4414
|
|
|
{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
|
4415
|
|
|
'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
|
4416
|
|
|
'Y', 'Z'}) |
|
4417
|
|
|
if not word: |
|
4418
|
|
|
return '' |
|
4419
|
|
|
|
|
4420
|
|
|
code = word[1:] |
|
4421
|
|
|
for vowel in {'A', 'E', 'I', 'O', 'U', 'Y'}: |
|
4422
|
|
|
code = code.replace(vowel, '') |
|
4423
|
|
|
code = word[0]+code |
|
4424
|
|
|
code = _delete_consecutive_repeats(code) |
|
4425
|
|
|
code = code.replace(' ', '') |
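    # Worked example (cf. the doctests above): 'Christopher' -> keep 'C',
    # strip vowels from 'HRISTOPHER' -> 'HRSTPHR', giving 'CHRSTPHR', which
    # is truncated to 'CHRS'.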
|
4426
|
|
|
|
|
4427
|
|
|
return code[:maxlength] |
|
4428
|
|
|
|
|
4429
|
|
|
|
|
4430
|
|
|
def lein(word, maxlength=4, zero_pad=True): |
|
4431
|
|
|
"""Return the Lein code for a word. |
|
4432
|
|
|
|
|
4433
|
|
|
This is Lein name coding, described in :cite:`Moore:1977`. |
|
4434
|
|
|
|
|
4435
|
|
|
:param str word: the word to transform |
|
4436
|
|
|
:param int maxlength: the maximum length (default 4) of the code to return |
|
4437
|
|
|
:param bool zero_pad: pad the end of the return value with 0s to achieve a |
|
4438
|
|
|
maxlength string |
|
4439
|
|
|
:returns: the Lein code |
|
4440
|
|
|
:rtype: str |
|
4441
|
|
|
|
|
4442
|
|
|
>>> lein('Christopher') |
|
4443
|
|
|
'C351' |
|
4444
|
|
|
>>> lein('Niall') |
|
4445
|
|
|
'N300' |
|
4446
|
|
|
>>> lein('Smith') |
|
4447
|
|
|
'S210' |
|
4448
|
|
|
>>> lein('Schmidt') |
|
4449
|
|
|
'S521' |
|
4450
|
|
|
""" |
|
4451
|
|
|
_lein_translation = dict(zip((ord(_) for _ in |
|
|
|
|
|
|
4452
|
|
|
'BCDFGJKLMNPQRSTVXZ'), |
|
4453
|
|
|
'451455532245351455')) |
|
4454
|
|
|
|
|
4455
|
|
|
# uppercase, normalize, decompose, and filter non-A-Z out |
|
4456
|
|
|
word = normalize('NFKD', text_type(word.upper())) |
|
4457
|
|
|
word = word.replace('ß', 'SS') |
|
4458
|
|
|
word = ''.join(c for c in word if c in |
|
4459
|
|
|
{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
|
4460
|
|
|
'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
|
4461
|
|
|
'Y', 'Z'}) |
|
4462
|
|
|
|
|
4463
|
|
|
if not word: |
|
4464
|
|
|
return '' |
|
4465
|
|
|
|
|
4466
|
|
|
code = word[0] # Rule 1 |
|
4467
|
|
|
word = word[1:].translate({32: None, 65: None, 69: None, 72: None, |
|
4468
|
|
|
73: None, 79: None, 85: None, 87: None, |
|
4469
|
|
|
89: None}) # Rule 2 |
|
4470
|
|
|
word = _delete_consecutive_repeats(word) # Rule 3 |
|
4471
|
|
|
code += word.translate(_lein_translation) # Rule 4 |
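    # Worked example (cf. the doctests above): 'Niall' -> keep 'N', Rule 2
    # leaves 'LL', Rule 3 collapses it to 'L', Rule 4 codes it '3', and
    # zero-padding yields 'N300'.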
|
4472
|
|
|
|
|
4473
|
|
|
if zero_pad: |
|
4474
|
|
|
code += ('0'*maxlength) # Rule 4 |
|
4475
|
|
|
|
|
4476
|
|
|
return code[:maxlength] |
|
4477
|
|
|
|
|
4478
|
|
|
|
|
4479
|
|
|
def roger_root(word, maxlength=5, zero_pad=True): |
|
4480
|
|
|
"""Return the Roger Root code for a word. |
|
4481
|
|
|
|
|
4482
|
|
|
This is Roger Root name coding, described in :cite:`Moore:1977`. |
|
4483
|
|
|
|
|
4484
|
|
|
:param str word: the word to transform |
|
4485
|
|
|
:param int maxlength: the maximum length (default 5) of the code to return |
|
4486
|
|
|
:param bool zero_pad: pad the end of the return value with 0s to achieve a |
|
4487
|
|
|
maxlength string |
|
4488
|
|
|
:returns: the Roger Root code |
|
4489
|
|
|
:rtype: str |
|
4490
|
|
|
|
|
4491
|
|
|
>>> roger_root('Christopher') |
|
4492
|
|
|
'06401' |
|
4493
|
|
|
>>> roger_root('Niall') |
|
4494
|
|
|
'02500' |
|
4495
|
|
|
>>> roger_root('Smith') |
|
4496
|
|
|
'00310' |
|
4497
|
|
|
>>> roger_root('Schmidt') |
|
4498
|
|
|
'06310' |
|
4499
|
|
|
""" |
|
4500
|
|
|
# uppercase, normalize, decompose, and filter non-A-Z out |
|
4501
|
|
|
word = normalize('NFKD', text_type(word.upper())) |
|
4502
|
|
|
word = word.replace('ß', 'SS') |
|
4503
|
|
|
word = ''.join(c for c in word if c in |
|
4504
|
|
|
{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
|
4505
|
|
|
'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
|
4506
|
|
|
'Y', 'Z'}) |
|
4507
|
|
|
|
|
4508
|
|
|
if not word: |
|
4509
|
|
|
return '' |
|
4510
|
|
|
|
|
4511
|
|
|
# '*' is used to prevent combining by _delete_consecutive_repeats() |
|
4512
|
|
|
_init_patterns = {4: {'TSCH': '06'}, |
|
4513
|
|
|
3: {'TSH': '06', 'SCH': '06'}, |
|
4514
|
|
|
2: {'CE': '0*0', 'CH': '06', 'CI': '0*0', 'CY': '0*0', |
|
4515
|
|
|
'DG': '07', 'GF': '08', 'GM': '03', 'GN': '02', |
|
4516
|
|
|
'KN': '02', 'PF': '08', 'PH': '08', 'PN': '02', |
|
4517
|
|
|
'SH': '06', 'TS': '0*0', 'WR': '04'}, |
|
4518
|
|
|
1: {'A': '1', 'B': '09', 'C': '07', 'D': '01', 'E': '1', |
|
4519
|
|
|
'F': '08', 'G': '07', 'H': '2', 'I': '1', 'J': '3', |
|
4520
|
|
|
'K': '07', 'L': '05', 'M': '03', 'N': '02', 'O': '1', |
|
4521
|
|
|
'P': '09', 'Q': '07', 'R': '04', 'S': '0*0', |
|
4522
|
|
|
'T': '01', 'U': '1', 'V': '08', 'W': '4', 'X': '07', |
|
4523
|
|
|
'Y': '5', 'Z': '0*0'}} |
|
4524
|
|
|
|
|
4525
|
|
|
_med_patterns = {4: {'TSCH': '6'}, |
|
4526
|
|
|
3: {'TSH': '6', 'SCH': '6'}, |
|
4527
|
|
|
2: {'CE': '0', 'CH': '6', 'CI': '0', 'CY': '0', 'DG': '7', |
|
4528
|
|
|
'PH': '8', 'SH': '6', 'TS': '0'}, |
|
4529
|
|
|
1: {'B': '9', 'C': '7', 'D': '1', 'F': '8', 'G': '7', |
|
4530
|
|
|
'J': '6', 'K': '7', 'L': '5', 'M': '3', 'N': '2', |
|
4531
|
|
|
'P': '9', 'Q': '7', 'R': '4', 'S': '0', 'T': '1', |
|
4532
|
|
|
'V': '8', 'X': '7', 'Z': '0', |
|
4533
|
|
|
'A': '*', 'E': '*', 'H': '*', 'I': '*', 'O': '*', |
|
4534
|
|
|
'U': '*', 'W': '*', 'Y': '*'}} |
|
4535
|
|
|
|
|
4536
|
|
|
code = '' |
|
4537
|
|
|
pos = 0 |
|
4538
|
|
|
|
|
4539
|
|
|
# Do first digit(s) first |
|
4540
|
|
|
for num in range(4, 0, -1): |
|
4541
|
|
|
if word[:num] in _init_patterns[num]: |
|
4542
|
|
|
code = _init_patterns[num][word[:num]] |
|
4543
|
|
|
pos += num |
|
4544
|
|
|
break |
|
4545
|
|
|
else: |
|
4546
|
|
|
pos += 1 # Advance if nothing is recognized |
|
4547
|
|
|
|
|
4548
|
|
|
# Then code subsequent digits |
|
4549
|
|
|
while pos < len(word): |
|
4550
|
|
|
for num in range(4, 0, -1): |
|
4551
|
|
|
if word[pos:pos+num] in _med_patterns[num]: |
|
4552
|
|
|
code += _med_patterns[num][word[pos:pos+num]] |
|
4553
|
|
|
pos += num |
|
4554
|
|
|
break |
|
4555
|
|
|
else: |
|
4556
|
|
|
pos += 1 # Advance if nothing is recognized |
|
4557
|
|
|
|
|
4558
|
|
|
code = _delete_consecutive_repeats(code) |
|
4559
|
|
|
code = code.replace('*', '') |
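    # Worked example (cf. the doctests above): 'Smith' -> initial 'S' codes
    # '0*0', then M, I, T, H give '3', '*', '1', '*', i.e. '0*03*1*';
    # dropping the '*' markers leaves '0031', zero-padded to '00310'.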
|
4560
|
|
|
|
|
4561
|
|
|
if zero_pad: |
|
4562
|
|
|
code += '0'*maxlength |
|
4563
|
|
|
|
|
4564
|
|
|
return code[:maxlength] |
|
4565
|
|
|
|
|
4566
|
|
|
|
|
4567
|
|
|
def onca(word, maxlength=4, zero_pad=True): |
|
4568
|
|
|
"""Return the Oxford Name Compression Algorithm (ONCA) code for a word. |
|
4569
|
|
|
|
|
4570
|
|
|
This is the Oxford Name Compression Algorithm, based on :cite:`Gill:1997`. |
|
4571
|
|
|
|
|
4572
|
|
|
I can find no complete description of the "anglicised version of the NYSIIS |
|
4573
|
|
|
method" identified as the first step in this algorithm, so this is likely |
|
4574
|
|
|
not a precisely correct implementation, in that it employs the standard |
|
4575
|
|
|
NYSIIS algorithm. |
|
4576
|
|
|
|
|
4577
|
|
|
:param str word: the word to transform |
|
4578
|
|
|
    :param int maxlength: the maximum length (default 4) of the code to return
|
4579
|
|
|
:param bool zero_pad: pad the end of the return value with 0s to achieve a |
|
4580
|
|
|
maxlength string |
|
4581
|
|
|
:returns: the ONCA code |
|
4582
|
|
|
:rtype: str |
|
4583
|
|
|
|
|
4584
|
|
|
>>> onca('Christopher') |
|
4585
|
|
|
'C623' |
|
4586
|
|
|
>>> onca('Niall') |
|
4587
|
|
|
'N400' |
|
4588
|
|
|
>>> onca('Smith') |
|
4589
|
|
|
'S530' |
|
4590
|
|
|
>>> onca('Schmidt') |
|
4591
|
|
|
'S530' |
|
4592
|
|
|
""" |
|
4593
|
|
|
# In the most extreme case, 3 characters of NYSIIS input can be compressed |
|
4594
|
|
|
# to one character of output, so give it triple the maxlength. |
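    # e.g. for 'Smith' the NYSIIS pass produces an interim key that the
    # Soundex pass then reduces to 'S530' (cf. the doctests above).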
|
4595
|
|
|
return soundex(nysiis(word, maxlength=maxlength*3), maxlength, |
|
4596
|
|
|
zero_pad=zero_pad) |
|
4597
|
|
|
|
|
4598
|
|
|
|
|
4599
|
|
|
def eudex(word, maxlength=8): |
|
4600
|
|
|
"""Return the eudex phonetic hash of a word. |
|
4601
|
|
|
|
|
4602
|
|
|
This implementation of eudex phonetic hashing is based on the specification |
|
4603
|
|
|
(not the reference implementation) at :cite:`Ticki:2016`. |
|
4604
|
|
|
|
|
4605
|
|
|
Further details can be found at :cite:`Ticki:2016b`. |
|
4606
|
|
|
|
|
4607
|
|
|
:param str word: the word to transform |
|
4608
|
|
|
:param int maxlength: the length of the code returned (defaults to 8) |
|
4609
|
|
|
:returns: the eudex hash |
|
4610
|
|
|
    :rtype: int
|
4611
|
|
|
""" |
|
4612
|
|
|
_trailing_phones = { |
|
4613
|
|
|
'a': 0, # a |
|
4614
|
|
|
'b': 0b01001000, # b |
|
4615
|
|
|
'c': 0b00001100, # c |
|
4616
|
|
|
'd': 0b00011000, # d |
|
4617
|
|
|
'e': 0, # e |
|
4618
|
|
|
'f': 0b01000100, # f |
|
4619
|
|
|
'g': 0b00001000, # g |
|
4620
|
|
|
'h': 0b00000100, # h |
|
4621
|
|
|
'i': 1, # i |
|
4622
|
|
|
'j': 0b00000101, # j |
|
4623
|
|
|
'k': 0b00001001, # k |
|
4624
|
|
|
'l': 0b10100000, # l |
|
4625
|
|
|
'm': 0b00000010, # m |
|
4626
|
|
|
'n': 0b00010010, # n |
|
4627
|
|
|
'o': 0, # o |
|
4628
|
|
|
'p': 0b01001001, # p |
|
4629
|
|
|
'q': 0b10101000, # q |
|
4630
|
|
|
'r': 0b10100001, # r |
|
4631
|
|
|
's': 0b00010100, # s |
|
4632
|
|
|
't': 0b00011101, # t |
|
4633
|
|
|
'u': 1, # u |
|
4634
|
|
|
'v': 0b01000101, # v |
|
4635
|
|
|
'w': 0b00000000, # w |
|
4636
|
|
|
'x': 0b10000100, # x |
|
4637
|
|
|
'y': 1, # y |
|
4638
|
|
|
'z': 0b10010100, # z |
|
4639
|
|
|
|
|
4640
|
|
|
'ß': 0b00010101, # ß |
|
4641
|
|
|
'à': 0, # à |
|
4642
|
|
|
'á': 0, # á |
|
4643
|
|
|
'â': 0, # â |
|
4644
|
|
|
'ã': 0, # ã |
|
4645
|
|
|
'ä': 0, # ä[æ] |
|
4646
|
|
|
'å': 1, # å[oː] |
|
4647
|
|
|
'æ': 0, # æ[æ] |
|
4648
|
|
|
'ç': 0b10010101, # ç[t͡ʃ] |
|
4649
|
|
|
'è': 1, # è |
|
4650
|
|
|
'é': 1, # é |
|
4651
|
|
|
'ê': 1, # ê |
|
4652
|
|
|
'ë': 1, # ë |
|
4653
|
|
|
'ì': 1, # ì |
|
4654
|
|
|
'í': 1, # í |
|
4655
|
|
|
'î': 1, # î |
|
4656
|
|
|
'ï': 1, # ï |
|
4657
|
|
|
'ð': 0b00010101, # ð[ð̠](represented as a non-plosive T) |
|
4658
|
|
|
'ñ': 0b00010111, # ñ[nj](represented as a combination of n and j) |
|
4659
|
|
|
'ò': 0, # ò |
|
4660
|
|
|
'ó': 0, # ó |
|
4661
|
|
|
'ô': 0, # ô |
|
4662
|
|
|
'õ': 0, # õ |
|
4663
|
|
|
'ö': 1, # ö[ø] |
|
4664
|
|
|
'÷': 0b11111111, # ÷ |
|
4665
|
|
|
'ø': 1, # ø[ø] |
|
4666
|
|
|
'ù': 1, # ù |
|
4667
|
|
|
'ú': 1, # ú |
|
4668
|
|
|
'û': 1, # û |
|
4669
|
|
|
'ü': 1, # ü |
|
4670
|
|
|
'ý': 1, # ý |
|
4671
|
|
|
'þ': 0b00010101, # þ[ð̠](represented as a non-plosive T) |
|
4672
|
|
|
'ÿ': 1, # ÿ |
|
4673
|
|
|
} |
|
4674
|
|
|
|
|
4675
|
|
|
_initial_phones = { |
|
4676
|
|
|
'a': 0b10000100, # a* |
|
4677
|
|
|
'b': 0b00100100, # b |
|
4678
|
|
|
'c': 0b00000110, # c |
|
4679
|
|
|
'd': 0b00001100, # d |
|
4680
|
|
|
'e': 0b11011000, # e* |
|
4681
|
|
|
'f': 0b00100010, # f |
|
4682
|
|
|
'g': 0b00000100, # g |
|
4683
|
|
|
'h': 0b00000010, # h |
|
4684
|
|
|
'i': 0b11111000, # i* |
|
4685
|
|
|
'j': 0b00000011, # j |
|
4686
|
|
|
'k': 0b00000101, # k |
|
4687
|
|
|
'l': 0b01010000, # l |
|
4688
|
|
|
'm': 0b00000001, # m |
|
4689
|
|
|
'n': 0b00001001, # n |
|
4690
|
|
|
'o': 0b10010100, # o* |
|
4691
|
|
|
'p': 0b00100101, # p |
|
4692
|
|
|
'q': 0b01010100, # q |
|
4693
|
|
|
'r': 0b01010001, # r |
|
4694
|
|
|
's': 0b00001010, # s |
|
4695
|
|
|
't': 0b00001110, # t |
|
4696
|
|
|
'u': 0b11100000, # u* |
|
4697
|
|
|
'v': 0b00100011, # v |
|
4698
|
|
|
'w': 0b00000000, # w |
|
4699
|
|
|
'x': 0b01000010, # x |
|
4700
|
|
|
'y': 0b11100100, # y* |
|
4701
|
|
|
'z': 0b01001010, # z |
|
4702
|
|
|
|
|
4703
|
|
|
'ß': 0b00001011, # ß |
|
4704
|
|
|
'à': 0b10000101, # à |
|
4705
|
|
|
'á': 0b10000101, # á |
|
4706
|
|
|
'â': 0b10000000, # â |
|
4707
|
|
|
'ã': 0b10000110, # ã |
|
4708
|
|
|
'ä': 0b10100110, # ä [æ] |
|
4709
|
|
|
'å': 0b11000010, # å [oː] |
|
4710
|
|
|
'æ': 0b10100111, # æ [æ] |
|
4711
|
|
|
'ç': 0b01010100, # ç [t͡ʃ] |
|
4712
|
|
|
'è': 0b11011001, # è |
|
4713
|
|
|
'é': 0b11011001, # é |
|
4714
|
|
|
'ê': 0b11011001, # ê |
|
4715
|
|
|
'ë': 0b11000110, # ë [ə] or [œ] |
|
4716
|
|
|
'ì': 0b11111001, # ì |
|
4717
|
|
|
'í': 0b11111001, # í |
|
4718
|
|
|
'î': 0b11111001, # î |
|
4719
|
|
|
'ï': 0b11111001, # ï |
|
4720
|
|
|
'ð': 0b00001011, # ð [ð̠] (represented as a non-plosive T) |
|
4721
|
|
|
'ñ': 0b00001011, # ñ [nj] (represented as a combination of n and j) |
|
4722
|
|
|
'ò': 0b10010101, # ò |
|
4723
|
|
|
'ó': 0b10010101, # ó |
|
4724
|
|
|
'ô': 0b10010101, # ô |
|
4725
|
|
|
'õ': 0b10010101, # õ |
|
4726
|
|
|
'ö': 0b11011100, # ö [œ] or [ø] |
|
4727
|
|
|
'÷': 0b11111111, # ÷ |
|
4728
|
|
|
'ø': 0b11011101, # ø [œ] or [ø] |
|
4729
|
|
|
'ù': 0b11100001, # ù |
|
4730
|
|
|
'ú': 0b11100001, # ú |
|
4731
|
|
|
'û': 0b11100001, # û |
|
4732
|
|
|
'ü': 0b11100101, # ü |
|
4733
|
|
|
'ý': 0b11100101, # ý |
|
4734
|
|
|
'þ': 0b00001011, # þ [ð̠] (represented as a non-plosive T) |
|
4735
|
|
|
'ÿ': 0b11100101, # ÿ |
|
4736
|
|
|
} |
|
4737
|
|
|
# Lowercase input & filter unknown characters |
|
4738
|
|
|
word = ''.join(char for char in word.lower() if char in _initial_phones) |
|
4739
|
|
|
|
|
4740
|
|
|
if not word: |
|
4741
|
|
|
word = '÷' |
|
4742
|
|
|
|
|
4743
|
|
|
# Perform initial eudex coding of each character |
|
4744
|
|
|
values = [_initial_phones[word[0]]] |
|
4745
|
|
|
values += [_trailing_phones[char] for char in word[1:]] |
|
4746
|
|
|
|
|
4747
|
|
|
# Right-shift by one to determine if second instance should be skipped |
|
4748
|
|
|
shifted_values = [_ >> 1 for _ in values] |
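    # Dropping the least-significant bit means two adjacent characters whose
    # codes differ only in that bit are treated as repeats and collapsed.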
|
4749
|
|
|
condensed_values = [values[0]] |
|
4750
|
|
|
for n in range(1, len(shifted_values)): |
|
4751
|
|
|
if shifted_values[n] != shifted_values[n-1]: |
|
4752
|
|
|
condensed_values.append(values[n]) |
|
4753
|
|
|
|
|
4754
|
|
|
# Add padding after first character & trim beyond maxlength |
|
4755
|
|
|
values = ([condensed_values[0]] + |
|
4756
|
|
|
[0]*max(0, maxlength - len(condensed_values)) + |
|
4757
|
|
|
condensed_values[1:maxlength]) |
|
4758
|
|
|
|
|
4759
|
|
|
# Combine individual character values into eudex hash |
|
4760
|
|
|
hash_value = 0 |
|
4761
|
|
|
for val in values: |
|
4762
|
|
|
hash_value = (hash_value << 8) | val |
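    # The hash always packs exactly maxlength bytes, with the first
    # character's code in the most-significant byte.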
|
4763
|
|
|
|
|
4764
|
|
|
return hash_value |
|
4765
|
|
|
|
|
4766
|
|
|
|
|
4767
|
|
|
def haase_phonetik(word, primary_only=False): |
|
4768
|
|
|
"""Return the Haase Phonetik (numeric output) code for a word. |
|
4769
|
|
|
|
|
4770
|
|
|
Based on the algorithm described at :cite:`Prante:2015`. |
|
4771
|
|
|
|
|
4772
|
|
|
Based on the original :cite:`Haase:2000`. |
|
4773
|
|
|
|
|
4774
|
|
|
    While the output codes are numeric, they are nevertheless strs.
|
4775
|
|
|
|
|
4776
|
|
|
    :param str word: the word to transform
    :param bool primary_only: if True, only the code of the primary variant
        is returned (the alternative expansions are skipped)
|
4777
|
|
|
    :returns: the Haase Phonetik values as a tuple of numeric strings
|
4778
|
|
|
    :rtype: tuple
|
4779
|
|
|
""" |
|
4780
|
|
|
def _after(word, i, letters): |
|
4781
|
|
|
"""Return True if word[i] follows one of the supplied letters.""" |
|
4782
|
|
|
if i > 0 and word[i-1] in letters: |
|
4783
|
|
|
return True |
|
4784
|
|
|
return False |
|
4785
|
|
|
|
|
4786
|
|
|
def _before(word, i, letters): |
|
4787
|
|
|
"""Return True if word[i] precedes one of the supplied letters.""" |
|
4788
|
|
|
if i+1 < len(word) and word[i+1] in letters: |
|
4789
|
|
|
return True |
|
4790
|
|
|
return False |
|
4791
|
|
|
|
|
4792
|
|
|
_vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'} |
|
4793
|
|
|
|
|
4794
|
|
|
word = normalize('NFKD', text_type(word.upper())) |
|
4795
|
|
|
word = word.replace('ß', 'SS') |
|
4796
|
|
|
|
|
4797
|
|
|
word = word.replace('Ä', 'AE') |
|
4798
|
|
|
word = word.replace('Ö', 'OE') |
|
4799
|
|
|
word = word.replace('Ü', 'UE') |
|
4800
|
|
|
word = ''.join(c for c in word if c in |
|
4801
|
|
|
{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
|
4802
|
|
|
'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
|
4803
|
|
|
'Y', 'Z'}) |
|
4804
|
|
|
|
|
4805
|
|
|
# Nothing to convert, return base case |
|
4806
|
|
|
if not word: |
|
4807
|
|
|
return '' |
|
4808
|
|
|
|
|
4809
|
|
|
variants = [] |
|
4810
|
|
|
if primary_only: |
|
4811
|
|
|
variants = [word] |
|
4812
|
|
|
else: |
|
4813
|
|
|
pos = 0 |
|
4814
|
|
|
if word[:2] == 'CH': |
|
4815
|
|
|
variants.append(('CH', 'SCH')) |
|
4816
|
|
|
pos += 2 |
|
4817
|
|
|
len_3_vars = {'OWN': 'AUN', 'WSK': 'RSK', 'SCH': 'CH', 'GLI': 'LI', |
|
4818
|
|
|
'AUX': 'O', 'EUX': 'O'} |
|
4819
|
|
|
while pos < len(word): |
|
4820
|
|
|
if word[pos:pos+4] == 'ILLE': |
|
4821
|
|
|
variants.append(('ILLE', 'I')) |
|
4822
|
|
|
pos += 4 |
|
4823
|
|
|
elif word[pos:pos+3] in len_3_vars: |
|
4824
|
|
|
variants.append((word[pos:pos+3], len_3_vars[word[pos:pos+3]])) |
|
4825
|
|
|
pos += 3 |
|
4826
|
|
|
elif word[pos:pos+2] == 'RB': |
|
4827
|
|
|
variants.append(('RB', 'RW')) |
|
4828
|
|
|
pos += 2 |
|
4829
|
|
|
elif len(word[pos:]) == 3 and word[pos:] == 'EAU': |
|
4830
|
|
|
variants.append(('EAU', 'O')) |
|
4831
|
|
|
pos += 3 |
|
4832
|
|
|
elif len(word[pos:]) == 1 and word[pos:] in {'A', 'O'}: |
|
4833
|
|
|
if word[pos:] == 'O': |
|
4834
|
|
|
variants.append(('O', 'OW')) |
|
4835
|
|
|
else: |
|
4836
|
|
|
variants.append(('A', 'AR')) |
|
4837
|
|
|
pos += 1 |
|
4838
|
|
|
else: |
|
4839
|
|
|
variants.append((word[pos],)) |
|
4840
|
|
|
pos += 1 |
|
4841
|
|
|
|
|
4842
|
|
|
variants = [''.join(letters) for letters in product(*variants)] |
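    # e.g. for 'CHRIS' the leading 'CH' expands to ('CH', 'SCH'), so both
    # 'CHRIS' and 'SCHRIS' are coded and a tuple of codes is returned.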
|
4843
|
|
|
|
|
4844
|
|
|
def _haase_code(word): |
|
4845
|
|
|
sdx = '' |
|
4846
|
|
|
for i in range(len(word)): |
|
4847
|
|
if word[i] in _vowels: |
|
|
|
|
|
|
4848
|
|
|
sdx += '9' |
|
4849
|
|
|
elif word[i] == 'B': |
|
4850
|
|
|
sdx += '1' |
|
4851
|
|
|
elif word[i] == 'P': |
|
4852
|
|
|
if _before(word, i, {'H'}): |
|
4853
|
|
|
sdx += '3' |
|
4854
|
|
|
else: |
|
4855
|
|
|
sdx += '1' |
|
4856
|
|
|
elif word[i] in {'D', 'T'}: |
|
4857
|
|
|
if _before(word, i, {'C', 'S', 'Z'}): |
|
4858
|
|
|
sdx += '8' |
|
4859
|
|
|
else: |
|
4860
|
|
|
sdx += '2' |
|
4861
|
|
|
elif word[i] in {'F', 'V', 'W'}: |
|
4862
|
|
|
sdx += '3' |
|
4863
|
|
|
elif word[i] in {'G', 'K', 'Q'}: |
|
4864
|
|
|
sdx += '4' |
|
4865
|
|
|
elif word[i] == 'C': |
|
4866
|
|
|
if _after(word, i, {'S', 'Z'}): |
|
4867
|
|
|
sdx += '8' |
|
4868
|
|
|
elif i == 0: |
|
4869
|
|
|
if _before(word, i, {'A', 'H', 'K', 'L', 'O', 'Q', 'R', |
|
4870
|
|
|
'U', 'X'}): |
|
4871
|
|
|
sdx += '4' |
|
4872
|
|
|
else: |
|
4873
|
|
|
sdx += '8' |
|
4874
|
|
|
elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}): |
|
4875
|
|
|
sdx += '4' |
|
4876
|
|
|
else: |
|
4877
|
|
|
sdx += '8' |
|
4878
|
|
|
elif word[i] == 'X': |
|
4879
|
|
|
if _after(word, i, {'C', 'K', 'Q'}): |
|
4880
|
|
|
sdx += '8' |
|
4881
|
|
|
else: |
|
4882
|
|
|
sdx += '48' |
|
4883
|
|
|
elif word[i] == 'L': |
|
4884
|
|
|
sdx += '5' |
|
4885
|
|
|
elif word[i] in {'M', 'N'}: |
|
4886
|
|
|
sdx += '6' |
|
4887
|
|
|
elif word[i] == 'R': |
|
4888
|
|
|
sdx += '7' |
|
4889
|
|
|
elif word[i] in {'S', 'Z'}: |
|
4890
|
|
|
sdx += '8' |
|
4891
|
|
|
|
|
4892
|
|
|
sdx = _delete_consecutive_repeats(sdx) |
|
4893
|
|
|
|
|
4894
|
|
|
# if sdx: |
|
4895
|
|
|
# sdx = sdx[0] + sdx[1:].replace('9', '') |
|
4896
|
|
|
|
|
4897
|
|
|
return sdx |
|
4898
|
|
|
|
|
4899
|
|
|
return tuple(_haase_code(word) for word in variants) |
|
4900
|
|
|
|
|
4901
|
|
|
|
|
4902
|
|
|
def reth_schek_phonetik(word): |
|
4903
|
|
|
"""Return Reth-Schek Phonetik code for a word. |
|
4904
|
|
|
|
|
4905
|
|
|
This algorithm is proposed in :cite:`Reth:1977`. |
|
4906
|
|
|
|
|
4907
|
|
|
Since I couldn't secure a copy of that document (maybe I'll look for it |
|
4908
|
|
|
next time I'm in Germany), this implementation is based on what I could |
|
4909
|
|
|
glean from the implementations published by German Record Linkage |
|
4910
|
|
|
Center (www.record-linkage.de): |
|
4911
|
|
|
|
|
4912
|
|
|
- Privacy-preserving Record Linkage (PPRL) (in R) :cite:`Rukasz:2018` |
|
4913
|
|
|
- Merge ToolBox (in Java) :cite:`Schnell:2004` |
|
4914
|
|
|
|
|
4915
|
|
|
Rules that are unclear: |
|
4916
|
|
|
|
|
4917
|
|
|
- Should 'C' become 'G' or 'Z'? (PPRL has both, 'Z' rule blocked) |
|
4918
|
|
|
- Should 'CC' become 'G'? (PPRL has blocked 'CK' that may be typo) |
|
4919
|
|
|
- Should 'TUI' -> 'ZUI' rule exist? (PPRL has rule, but I can't |
|
4920
|
|
|
think of a German word with '-tui-' in it.) |
|
4921
|
|
|
- Should we really change 'SCH' -> 'CH' and then 'CH' -> 'SCH'? |
|
4922
|
|
|
|
|
4923
|
|
|
:param word: |
|
4924
|
|
|
:return: |
|
4925
|
|
|
""" |
|
4926
|
|
|
replacements = {3: {'AEH': 'E', 'IEH': 'I', 'OEH': 'OE', 'UEH': 'UE', |
|
4927
|
|
|
'SCH': 'CH', 'ZIO': 'TIO', 'TIU': 'TIO', 'ZIU': 'TIO', |
|
4928
|
|
|
'CHS': 'X', 'CKS': 'X', 'AEU': 'OI'}, |
|
4929
|
|
|
2: {'LL': 'L', 'AA': 'A', 'AH': 'A', 'BB': 'B', 'PP': 'B', |
|
4930
|
|
|
'BP': 'B', 'PB': 'B', 'DD': 'D', 'DT': 'D', 'TT': 'D', |
|
4931
|
|
|
'TH': 'D', 'EE': 'E', 'EH': 'E', 'AE': 'E', 'FF': 'F', |
|
4932
|
|
|
'PH': 'F', 'KK': 'K', 'GG': 'G', 'GK': 'G', 'KG': 'G', |
|
4933
|
|
|
'CK': 'G', 'CC': 'C', 'IE': 'I', 'IH': 'I', 'MM': 'M', |
|
4934
|
|
|
'NN': 'N', 'OO': 'O', 'OH': 'O', 'SZ': 'S', 'UH': 'U', |
|
4935
|
|
|
'GS': 'X', 'KS': 'X', 'TZ': 'Z', 'AY': 'AI', |
|
4936
|
|
|
'EI': 'AI', 'EY': 'AI', 'EU': 'OI', 'RR': 'R', |
|
4937
|
|
|
'SS': 'S', 'KW': 'QU'}, |
|
4938
|
|
|
1: {'P': 'B', 'T': 'D', 'V': 'F', 'W': 'F', 'C': 'G', |
|
4939
|
|
|
'K': 'G', 'Y': 'I'}} |
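    # In the loop below the longest (3-letter) patterns are tried first at
    # each position; after a substitution the scan advances by only one
    # character, so the tail of a replacement may be rewritten by later rules.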
|
4940
|
|
|
|
|
4941
|
|
|
# Uppercase |
|
4942
|
|
|
word = word.upper() |
|
4943
|
|
|
|
|
4944
|
|
|
# Replace umlauts/eszett |
|
4945
|
|
|
word = word.replace('Ä', 'AE') |
|
4946
|
|
|
word = word.replace('Ö', 'OE') |
|
4947
|
|
|
word = word.replace('Ü', 'UE') |
|
4948
|
|
|
word = word.replace('ß', 'SS') |
|
4949
|
|
|
|
|
4950
|
|
|
# Main loop, using above replacements table |
|
4951
|
|
|
pos = 0 |
|
4952
|
|
|
while pos < len(word): |
|
4953
|
|
|
for num in range(3, 0, -1): |
|
4954
|
|
|
if word[pos:pos+num] in replacements[num]: |
|
4955
|
|
|
word = (word[:pos] + replacements[num][word[pos:pos+num]] |
|
4956
|
|
|
+ word[pos+num:]) |
|
4957
|
|
|
pos += 1 |
|
4958
|
|
|
break |
|
4959
|
|
|
else: |
|
4960
|
|
|
pos += 1 # Advance if nothing is recognized |
|
4961
|
|
|
|
|
4962
|
|
|
# Change 'CH' back(?) to 'SCH' |
|
4963
|
|
|
word = word.replace('CH', 'SCH') |
|
4964
|
|
|
|
|
4965
|
|
|
# Replace final sequences |
|
4966
|
|
|
if word[-2:] == 'ER': |
|
4967
|
|
|
word = word[:-2]+'R' |
|
4968
|
|
|
elif word[-2:] == 'EL': |
|
4969
|
|
|
word = word[:-2]+'L' |
|
4970
|
|
|
elif word[-1] == 'H': |
|
4971
|
|
|
word = word[:-1] |
|
4972
|
|
|
|
|
4973
|
|
|
return word |
|
4974
|
|
|
|
|
4975
|
|
|
|
|
4976
|
|
|
def fonem(word): |
|
4977
|
|
|
"""Return the FONEM code of a word. |
|
4978
|
|
|
|
|
4979
|
|
|
FONEM is a phonetic algorithm designed for French (particularly surnames in |
|
4980
|
|
|
Saguenay, Canada), defined in :cite:`Bouchard:1981`. |
|
4981
|
|
|
|
|
4982
|
|
|
Guillaume Plique's Javascript implementation :cite:`Plique:2018` at |
|
4983
|
|
|
https://github.com/Yomguithereal/talisman/blob/master/src/phonetics/french/fonem.js |
|
4984
|
|
|
was also consulted for this implementation. |
|
4985
|
|
|
|
|
4986
|
|
|
:param str word: the word to transform |
|
4987
|
|
|
:returns: the FONEM code |
|
4988
|
|
|
:rtype: str |
|
4989
|
|
|
""" |
|
4990
|
|
|
# I don't see a sane way of doing this without regexps :( |
|
4991
|
|
|
rule_table = { |
|
4992
|
|
|
# Vowels & groups of vowels |
|
4993
|
|
|
'V-1': (re_compile('E?AU'), 'O'), |
|
4994
|
|
|
'V-2,5': (re_compile('(E?AU|O)L[TX]$'), 'O'), |
|
4995
|
|
|
'V-3,4': (re_compile('E?AU[TX]$'), 'O'), |
|
4996
|
|
|
'V-6': (re_compile('E?AUL?D$'), 'O'), |
|
4997
|
|
|
'V-7': (re_compile(r'(?<!G)AY$'), 'E'), |
|
4998
|
|
|
'V-8': (re_compile('EUX$'), 'EU'), |
|
4999
|
|
|
'V-9': (re_compile('EY(?=$|[BCDFGHJKLMNPQRSTVWXZ])'), 'E'), |
|
5000
|
|
|
'V-10': ('Y', 'I'), |
|
5001
|
|
|
'V-11': (re_compile('(?<=[AEIOUY])I(?=[AEIOUY])'), 'Y'), |
|
5002
|
|
|
'V-12': (re_compile('(?<=[AEIOUY])ILL'), 'Y'), |
|
5003
|
|
|
'V-13': (re_compile('OU(?=[AEOU]|I(?!LL))'), 'W'), |
|
5004
|
|
|
'V-14': (re_compile(r'([AEIOUY])(?=\1)'), ''), |
|
5005
|
|
|
# Nasal vowels |
|
5006
|
|
|
'V-15': (re_compile('[AE]M(?=[BCDFGHJKLMPQRSTVWXZ])(?!$)'), 'EN'), |
|
5007
|
|
|
'V-16': (re_compile('OM(?=[BCDFGHJKLMPQRSTVWXZ])'), 'ON'), |
|
5008
|
|
|
'V-17': (re_compile('AN(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'EN'), |
|
5009
|
|
|
'V-18': (re_compile('(AI[MN]|EIN)(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), |
|
5010
|
|
|
'IN'), |
|
5011
|
|
|
'V-19': (re_compile('B(O|U|OU)RNE?$'), 'BURN'), |
|
5012
|
|
|
'V-20': (re_compile('(^IM|(?<=[BCDFGHJKLMNPQRSTVWXZ])' + |
|
5013
|
|
|
'IM(?=[BCDFGHJKLMPQRSTVWXZ]))'), 'IN'), |
|
5014
|
|
|
# Consonants and groups of consonants |
|
5015
|
|
|
'C-1': ('BV', 'V'), |
|
5016
|
|
|
'C-2': (re_compile('(?<=[AEIOUY])C(?=[EIY])'), 'SS'), |
|
5017
|
|
|
'C-3': (re_compile('(?<=[BDFGHJKLMNPQRSTVWZ])C(?=[EIY])'), 'S'), |
|
5018
|
|
|
'C-4': (re_compile('^C(?=[EIY])'), 'S'), |
|
5019
|
|
|
'C-5': (re_compile('^C(?=[OUA])'), 'K'), |
|
5020
|
|
|
'C-6': (re_compile('(?<=[AEIOUY])C$'), 'K'), |
|
5021
|
|
|
'C-7': (re_compile('C(?=[BDFGJKLMNPQRSTVWXZ])'), 'K'), |
|
5022
|
|
|
'C-8': (re_compile('CC(?=[AOU])'), 'K'), |
|
5023
|
|
|
'C-9': (re_compile('CC(?=[EIY])'), 'X'), |
|
5024
|
|
|
'C-10': (re_compile('G(?=[EIY])'), 'J'), |
|
5025
|
|
|
'C-11': (re_compile('GA(?=I?[MN])'), 'G#'), |
|
5026
|
|
|
'C-12': (re_compile('GE(O|AU)'), 'JO'), |
|
5027
|
|
|
'C-13': (re_compile('GNI(?=[AEIOUY])'), 'GN'), |
|
5028
|
|
|
'C-14': (re_compile('(?<![PCS])H'), ''), |
|
5029
|
|
|
'C-15': ('JEA', 'JA'), |
|
5030
|
|
|
'C-16': (re_compile('^MAC(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'MA#'), |
|
5031
|
|
|
'C-17': (re_compile('^MC'), 'MA#'), |
|
5032
|
|
|
'C-18': ('PH', 'F'), |
|
5033
|
|
|
'C-19': ('QU', 'K'), |
|
5034
|
|
|
'C-20': (re_compile('^SC(?=[EIY])'), 'S'), |
|
5035
|
|
|
'C-21': (re_compile('(?<=.)SC(?=[EIY])'), 'SS'), |
|
5036
|
|
|
'C-22': (re_compile('(?<=.)SC(?=[AOU])'), 'SK'), |
|
5037
|
|
|
'C-23': ('SH', 'CH'), |
|
5038
|
|
|
'C-24': (re_compile('TIA$'), 'SSIA'), |
|
5039
|
|
|
'C-25': (re_compile('(?<=[AIOUY])W'), ''), |
|
5040
|
|
|
'C-26': (re_compile('X[CSZ]'), 'X'), |
|
5041
|
|
|
'C-27': (re_compile('(?<=[AEIOUY])Z|(?<=[BCDFGHJKLMNPQRSTVWXZ])' + |
|
5042
|
|
|
'Z(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'S'), |
|
5043
|
|
|
'C-28': (re_compile(r'([BDFGHJKMNPQRTVWXZ])\1'), r'\1'), |
|
5044
|
|
|
'C-28a': (re_compile('CC(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'C'), |
|
5045
|
|
|
'C-28b': (re_compile('((?<=[BCDFGHJKLMNPQRSTVWXZ])|^)SS'), 'S'), |
|
5046
|
|
|
'C-28bb': (re_compile('SS(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'S'), |
|
5047
|
|
|
'C-28c': (re_compile('((?<=[^I])|^)LL'), 'L'), |
|
5048
|
|
|
'C-28d': (re_compile('ILE$'), 'ILLE'), |
|
5049
|
|
|
'C-29': (re_compile('(ILS|[CS]H|[MN]P|R[CFKLNSX])$|([BCDFGHJKL' + |
|
5050
|
|
|
'MNPQRSTVWXZ])[BCDFGHJKLMNPQRSTVWXZ]$'), |
|
5051
|
|
|
lambda m: (m.group(1) or '') + (m.group(2) or '')), |
|
5052
|
|
|
'C-30,32': (re_compile('^(SA?INT?|SEI[NM]|CINQ?|ST)(?!E)-?'), 'ST-'), |
|
5053
|
|
|
'C-31,33': (re_compile('^(SAINTE|STE)-?'), 'STE-'), |
|
5054
|
|
|
# Rules to undo rule bleeding prevention in C-11, C-16, C-17 |
|
5055
|
|
|
'C-34': ('G#', 'GA'), |
|
5056
|
|
|
'C-35': ('MA#', 'MAC') |
|
5057
|
|
|
} |
|
5058
|
|
|
rule_order = [ |
|
5059
|
|
|
'V-14', 'C-28', 'C-28a', 'C-28b', 'C-28bb', 'C-28c', 'C-28d', |
|
5060
|
|
|
'C-12', |
|
5061
|
|
|
'C-8', 'C-9', 'C-10', |
|
5062
|
|
|
'C-16', 'C-17', 'C-2', 'C-3', 'C-7', |
|
5063
|
|
|
'V-2,5', 'V-3,4', 'V-6', |
|
5064
|
|
|
'V-1', 'C-14', |
|
5065
|
|
|
'C-31,33', 'C-30,32', |
|
5066
|
|
|
'C-11', 'V-15', 'V-17', 'V-18', |
|
5067
|
|
|
'V-7', 'V-8', 'V-9', 'V-10', 'V-11', 'V-12', 'V-13', 'V-16', |
|
5068
|
|
|
'V-19', 'V-20', |
|
5069
|
|
|
'C-1', 'C-4', 'C-5', 'C-6', 'C-13', 'C-15', |
|
5070
|
|
|
'C-18', 'C-19', 'C-20', 'C-21', 'C-22', 'C-23', 'C-24', |
|
5071
|
|
|
'C-25', 'C-26', 'C-27', |
|
5072
|
|
|
'C-29', |
|
5073
|
|
|
'V-14', 'C-28', 'C-28a', 'C-28b', 'C-28bb', 'C-28c', 'C-28d', |
|
5074
|
|
|
'C-34', 'C-35' |
|
5075
|
|
|
] |
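    # Rules are applied strictly in the order given above; the
    # duplicate-collapsing rules (V-14 and the C-28 family) run both at the
    # start and again near the end, after the other substitutions.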
|
5076
|
|
|
|
|
5077
|
|
|
# normalize, upper-case, and filter non-French letters |
|
5078
|
|
|
word = normalize('NFKD', text_type(word.upper())) |
|
5079
|
|
|
word = word.translate({198: 'AE', 338: 'OE'}) |
|
5080
|
|
|
word = ''.join(c for c in word if c in |
|
5081
|
|
|
{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
|
5082
|
|
|
'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
|
5083
|
|
|
'Y', 'Z', '-'}) |
|
5084
|
|
|
|
|
5085
|
|
|
for rule in rule_order: |
|
5086
|
|
|
regex, repl = rule_table[rule] |
|
5087
|
|
|
if isinstance(regex, text_type): |
|
5088
|
|
|
word = word.replace(regex, repl) |
|
5089
|
|
|
else: |
|
5090
|
|
|
word = regex.sub(repl, word) |
|
5091
|
|
|
# print(rule, word) |
|
5092
|
|
|
|
|
5093
|
|
|
return word |
|
5094
|
|
|
|
|
5095
|
|
|
|
|
5096
|
|
|
def parmar_kumbharana(word): |
|
5097
|
|
|
"""Return the Parmar-Kumbharana encoding of a word. |
|
5098
|
|
|
|
|
5099
|
|
|
This is based on the phonetic algorithm proposed in :cite:`Parmar:2014`. |
|
5100
|
|
|
|
|
5101
|
|
|
:param word: |
|
5102
|
|
|
:return: |
|
5103
|
|
|
""" |
|
5104
|
|
|
rule_table = {4: {'OUGH': 'F'}, |
|
5105
|
|
|
3: {'DGE': 'J', |
|
5106
|
|
|
'OUL': 'U', |
|
5107
|
|
|
'GHT': 'T'}, |
|
5108
|
|
|
2: {'CE': 'S', 'CI': 'S', 'CY': 'S', |
|
5109
|
|
|
'GE': 'J', 'GI': 'J', 'GY': 'J', |
|
5110
|
|
|
'WR': 'R', |
|
5111
|
|
|
'GN': 'N', 'KN': 'N', 'PN': 'N', |
|
5112
|
|
|
'CK': 'K', |
|
5113
|
|
|
'SH': 'S'}} |
|
5114
|
|
|
vowel_trans = {65: '', 69: '', 73: '', 79: '', 85: '', 89: ''} |
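    # vowel_trans maps the ordinals of A, E, I, O, U and Y to deletion; it is
    # used in Rule 6 below to drop non-initial vowels.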
|
5115
|
|
|
|
|
5116
|
|
|
word = word.upper() # Rule 3 |
|
5117
|
|
|
word = _delete_consecutive_repeats(word) # Rule 4 |
|
5118
|
|
|
|
|
5119
|
|
|
# Rule 5 |
|
5120
|
|
|
i = 0 |
|
5121
|
|
|
while i < len(word): |
|
5122
|
|
|
for match_len in range(4, 1, -1): |
|
5123
|
|
|
if word[i:i+match_len] in rule_table[match_len]: |
|
5124
|
|
|
repl = rule_table[match_len][word[i:i+match_len]] |
|
5125
|
|
|
word = (word[:i] + repl + word[i+match_len:]) |
|
5126
|
|
|
i += len(repl) |
|
5127
|
|
|
break |
|
5128
|
|
|
else: |
|
5129
|
|
|
i += 1 |
|
5130
|
|
|
|
|
5131
|
|
|
word = word[0]+word[1:].translate(vowel_trans) # Rule 6 |
|
5132
|
|
|
return word |
|
5133
|
|
|
|
|
5134
|
|
|
|
|
5135
|
|
|
def davidson(lname, fname='.', omit_fname=False): |
|
5136
|
|
|
"""Return Davidson's Consonant Code. |
|
5137
|
|
|
|
|
5138
|
|
|
This is based on the name compression system described in |
|
5139
|
|
|
:cite:`Davidson:1962`. |
|
5140
|
|
|
|
|
5141
|
|
|
:cite:`Dolby:1970` identifies this as having been the name compression |
|
5142
|
|
|
algorithm used by SABRE. |
|
5143
|
|
|
|
|
5144
|
|
|
:param str lname: Last name (or word) to be encoded |
|
5145
|
|
|
:param str fname: First name (optional), of which the first character is |
|
5146
|
|
|
included in the code. |
|
5147
|
|
|
    :param bool omit_fname: Set to True to completely omit the first character
|
5148
|
|
|
of the first name |
|
5149
|
|
|
:return: Davidson's Consonant Code |
|
5150
|
|
|
""" |
|
5151
|
|
|
trans = {65: '', 69: '', 73: '', 79: '', 85: '', 72: '', 87: '', 89: ''} |
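    # trans maps the ordinals of A, E, I, O, U, H, W and Y to deletion; these
    # are stripped from all but the first letter of the last name below.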
|
5152
|
|
|
|
|
5153
|
|
|
lname = text_type(lname.upper()) |
|
5154
|
|
|
code = _delete_consecutive_repeats(lname[:1] + lname[1:].translate(trans)) |
|
5155
|
|
|
code = code[:4] + (4-len(code))*' ' |
|
5156
|
|
|
|
|
5157
|
|
|
if not omit_fname: |
|
5158
|
|
|
code += fname[:1].upper() |
|
5159
|
|
|
|
|
5160
|
|
|
return code |
|
5161
|
|
|
|
|
5162
|
|
|
|
|
5163
|
|
|
def sound_d(word, maxlength=4): |
|
5164
|
|
|
"""Return the SoundD code. |
|
5165
|
|
|
|
|
5166
|
|
|
SoundD is defined in :cite:`Varol:2012`. |
|
5167
|
|
|
|
|
5168
|
|
|
:param str word: the word to transform |
|
5169
|
|
|
:param int maxlength: the length of the code returned (defaults to 4) |
|
5170
|
|
|
:return: |
|
5171
|
|
|
""" |
|
5172
|
|
|
_ref_soundd_translation = dict(zip((ord(_) for _ in |
|
|
|
|
|
|
5173
|
|
|
'ABCDEFGHIJKLMNOPQRSTUVWXYZ'), |
|
5174
|
|
|
'01230120022455012623010202')) |
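    # This digit table is identical to that of American Soundex
    # (B/F/P/V->1, C/G/J/K/Q/S/X/Z->2, D/T->3, L->4, M/N->5, R->6).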
|
5175
|
|
|
|
|
5176
|
|
|
word = normalize('NFKD', text_type(word.upper())) |
|
5177
|
|
|
word = word.replace('ß', 'SS') |
|
5178
|
|
|
word = ''.join(c for c in word if c in |
|
5179
|
|
|
{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
|
5180
|
|
|
'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
|
5181
|
|
|
'Y', 'Z'}) |
|
5182
|
|
|
|
|
5183
|
|
|
if word[:2] in {'KN', 'GN', 'PN', 'AC', 'WR'}: |
|
5184
|
|
|
word = word[1:] |
|
5185
|
|
|
elif word[:1] == 'X': |
|
5186
|
|
|
word = 'S'+word[1:] |
|
5187
|
|
|
elif word[:2] == 'WH': |
|
5188
|
|
|
word = 'W'+word[2:] |
|
5189
|
|
|
|
|
5190
|
|
|
word = word.replace('DGE', '20').replace('DGI', '20').replace('GH', '0') |
|
5191
|
|
|
|
|
5192
|
|
|
word = word.translate(_ref_soundd_translation) |
|
5193
|
|
|
word = _delete_consecutive_repeats(word) |
|
5194
|
|
|
word = word.replace('0', '') |
|
5195
|
|
|
|
|
5196
|
|
|
if maxlength is not None: |
|
5197
|
|
|
if len(word) < maxlength: |
|
5198
|
|
|
word += '0' * (maxlength-len(word)) |
|
5199
|
|
|
else: |
|
5200
|
|
|
word = word[:maxlength] |
|
5201
|
|
|
|
|
5202
|
|
|
return word |
|
5203
|
|
|
|
|
5204
|
|
|
|
|
5205
|
|
|
def pshp_soundex_last(lname, maxlength=4, german=False): |
|
5206
|
|
|
"""Calculate the PSHP Soundex/Viewex Coding of a last name. |
|
5207
|
|
|
|
|
5208
|
|
|
This coding is based on :cite:`Hershberg:1976`. |
|
5209
|
|
|
|
|
5210
|
|
|
Reference was also made to the German version of the same: |
|
5211
|
|
|
:cite:`Hershberg:1979`. |
|
5212
|
|
|
|
|
5213
|
|
|
A separate function, pshp_soundex_first() is used for first names. |
|
5214
|
|
|
|
|
5215
|
|
|
:param lname: the last name to encode |
|
5216
|
|
|
:param german: set to True if the name is German (different rules apply) |
|
5217
|
|
|
:return: |
|
5218
|
|
|
""" |
|
5219
|
|
|
lname = normalize('NFKD', text_type(lname.upper())) |
|
5220
|
|
|
lname = lname.replace('ß', 'SS') |
|
5221
|
|
|
lname = ''.join(c for c in lname if c in |
|
5222
|
|
|
{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', |
|
5223
|
|
|
'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', |
|
5224
|
|
|
'W', 'X', 'Y', 'Z'}) |
|
5225
|
|
|
|
|
5226
|
|
|
# A. Prefix treatment |
|
5227
|
|
|
if lname[:3] == 'VON' or lname[:3] == 'VAN': |
|
5228
|
|
|
lname = lname[3:].strip() |
|
5229
|
|
|
|
|
5230
|
|
|
# The rule implemented below says "MC, MAC become 1". I believe it meant to |
|
5231
|
|
|
# say they become M except in German data (where superscripted 1 indicates |
|
5232
|
|
|
# "except in German data"). It doesn't make sense for them to become 1 |
|
5233
|
|
|
# (BPFV -> 1) or to apply outside German. Unfortunately, both articles have |
|
5234
|
|
|
# this error(?). |
|
5235
|
|
|
if not german: |
|
5236
|
|
|
if lname[:3] == 'MAC': |
|
5237
|
|
|
lname = 'M'+lname[3:] |
|
5238
|
|
|
elif lname[:2] == 'MC': |
|
5239
|
|
|
lname = 'M'+lname[2:] |
|
5240
|
|
|
|
|
5241
|
|
|
# The non-German-only rule to strip ' is unnecessary due to filtering |
|
5242
|
|
|
|
|
5243
|
|
|
if lname[:1] in {'E', 'I', 'O', 'U'}: |
|
5244
|
|
|
lname = 'A' + lname[1:] |
|
5245
|
|
|
elif lname[:2] in {'GE', 'GI', 'GY'}: |
|
5246
|
|
|
lname = 'J' + lname[1:] |
|
5247
|
|
|
elif lname[:2] in {'CE', 'CI', 'CY'}: |
|
5248
|
|
|
lname = 'S' + lname[1:] |
|
5249
|
|
|
elif lname[:3] == 'CHR': |
|
5250
|
|
|
lname = 'K' + lname[1:] |
|
5251
|
|
|
elif lname[:1] == 'C' and lname[:2] != 'CH': |
|
5252
|
|
|
lname = 'K' + lname[1:] |
|
5253
|
|
|
|
|
5254
|
|
|
if lname[:2] == 'KN': |
|
5255
|
|
|
lname = 'N' + lname[1:] |
|
5256
|
|
|
elif lname[:2] == 'PH': |
|
5257
|
|
|
lname = 'F' + lname[1:] |
|
5258
|
|
|
elif lname[:3] in {'WIE', 'WEI'}: |
|
5259
|
|
|
lname = 'V' + lname[1:] |
|
5260
|
|
|
|
|
5261
|
|
|
if german and lname[:1] in {'W', 'M', 'Y', 'Z'}: |
|
5262
|
|
|
lname = {'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}[lname[0]]+lname[1:] |
|
5263
|
|
|
|
|
5264
|
|
|
code = lname[:1] |
|
5265
|
|
|
|
|
5266
|
|
|
# B. Postfix treatment |
|
5267
|
|
|
if lname[-1:] == 'R': |
|
5268
|
|
|
lname = lname[:-1] + 'N' |
|
5269
|
|
|
elif lname[-2:] in {'SE', 'CE'}: |
|
5270
|
|
|
lname = lname[:-2] |
|
5271
|
|
|
if lname[-2:] == 'SS': |
|
5272
|
|
|
lname = lname[:-2] |
|
5273
|
|
|
elif lname[-1:] == 'S': |
|
5274
|
|
|
lname = lname[:-1] |
|
5275
|
|
|
|
|
5276
|
|
|
if not german: |
|
5277
|
|
|
l5_repl = {'STOWN': 'SAWON', 'MPSON': 'MASON'} |
|
5278
|
|
|
l4_repl = {'NSEN': 'ASEN', 'MSON': 'ASON', 'STEN': 'SAEN', |
|
5279
|
|
|
'STON': 'SAON'} |
|
5280
|
|
|
if lname[-5:] in l5_repl: |
|
5281
|
|
|
lname = lname[:-5] + l5_repl[lname[-5:]] |
|
5282
|
|
|
elif lname[-4:] in l4_repl: |
|
5283
|
|
|
lname = lname[:-4] + l4_repl[lname[-4:]] |
|
5284
|
|
|
|
|
5285
|
|
|
if lname[-2:] in {'NG', 'ND'}: |
|
5286
|
|
|
lname = lname[:-1] |
|
5287
|
|
|
if not german and lname[-3:] in {'GAN', 'GEN'}: |
|
5288
|
|
|
lname = lname[:-3]+'A'+lname[-2:] |
|
5289
|
|
|
|
|
5290
|
|
|
if german: |
|
5291
|
|
|
if lname[-3:] == 'TES': |
|
5292
|
|
|
lname = lname[:-3] |
|
5293
|
|
|
elif lname[-2:] == 'TS': |
|
5294
|
|
|
lname = lname[:-2] |
|
5295
|
|
|
if lname[-3:] == 'TZE': |
|
5296
|
|
|
lname = lname[:-3] |
|
5297
|
|
|
elif lname[-2:] == 'ZE': |
|
5298
|
|
|
lname = lname[:-2] |
|
5299
|
|
|
if lname[-1:] == 'Z': |
|
5300
|
|
|
lname = lname[:-1] |
|
5301
|
|
|
elif lname[-2:] == 'TE': |
|
5302
|
|
|
lname = lname[:-2] |
|
5303
|
|
|
|
|
5304
|
|
|
# C. Infix Treatment |
|
5305
|
|
|
lname = lname.replace('CK', 'C') |
|
5306
|
|
|
lname = lname.replace('SCH', 'S') |
|
5307
|
|
|
lname = lname.replace('DT', 'T') |
|
5308
|
|
|
lname = lname.replace('ND', 'N') |
|
5309
|
|
|
lname = lname.replace('NG', 'N') |
|
5310
|
|
|
lname = lname.replace('LM', 'M') |
|
5311
|
|
|
lname = lname.replace('MN', 'M') |
|
5312
|
|
|
lname = lname.replace('WIE', 'VIE') |
|
5313
|
|
|
lname = lname.replace('WEI', 'VEI') |
|
5314
|
|
|
|
|
5315
|
|
|
# D. Soundexing |
|
5316
|
|
|
# code for X & Y are unspecified, but presumably are 2 & 0 |
|
5317
|
|
|
_pshp_translation = dict(zip((ord(_) for _ in |
|
|
|
|
|
|
5318
|
|
|
'ABCDEFGHIJKLMNOPQRSTUVWXYZ'), |
|
5319
|
|
|
'01230120022455012523010202')) |
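    # Note: R is coded 5 here (grouped with the nasals M and N), unlike
    # American Soundex where R is 6.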
|
5320
|
|
|
|
|
5321
|
|
|
lname = lname.translate(_pshp_translation) |
|
5322
|
|
|
lname = _delete_consecutive_repeats(lname) |
|
5323
|
|
|
|
|
5324
|
|
|
code += lname[1:] |
|
5325
|
|
|
code = code.replace('0', '') # rule 1 |
|
5326
|
|
|
|
|
5327
|
|
|
if maxlength is not None: |
|
5328
|
|
|
if len(code) < maxlength: |
|
5329
|
|
|
code += '0' * (maxlength-len(code)) |
|
5330
|
|
|
else: |
|
5331
|
|
|
code = code[:maxlength] |
|
5332
|
|
|
|
|
5333
|
|
|
return code |
|
5334
|
|
|
|
|
5335
|
|
|
|
|
5336
|
|
|
def pshp_soundex_first(fname, maxlength=4, german=False): |
|
5337
|
|
|
"""Calculate the PSHP Soundex/Viewex Coding of a first name. |
|
5338
|
|
|
|
|
5339
|
|
|
This coding is based on :cite:`Hershberg:1976`. |
|
5340
|
|
|
|
|
5341
|
|
|
Reference was also made to the German version of the same: |
|
5342
|
|
|
:cite:`Hershberg:1979`. |
|
5343
|
|
|
|
|
5344
|
|
|
A separate function, pshp_soundex_last() is used for last names. |
|
5345
|
|
|
|
|
5346
|
|
|
:param fname: the first name to encode |
|
5347
|
|
|
:param german: set to True if the name is German (different rules apply) |
|
5348
|
|
|
:return: |
|
5349
|
|
|
""" |
|
5350
|
|
|
fname = normalize('NFKD', text_type(fname.upper())) |
|
5351
|
|
|
fname = fname.replace('ß', 'SS') |
|
5352
|
|
|
fname = ''.join(c for c in fname if c in |
|
5353
|
|
|
{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', |
|
5354
|
|
|
'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', |
|
5355
|
|
|
'W', 'X', 'Y', 'Z'}) |
|
5356
|
|
|
|
|
5357
|
|
|
# special rules |
|
5358
|
|
|
if fname == 'JAMES': |
|
5359
|
|
|
code = 'J7' |
|
5360
|
|
|
elif fname == 'PAT': |
|
5361
|
|
|
code = 'P7' |
|
5362
|
|
|
|
|
5363
|
|
|
else: |
|
5364
|
|
|
# A. Prefix treatment |
|
5365
|
|
|
if fname[:2] in {'GE', 'GI', 'GY'}: |
|
5366
|
|
|
fname = 'J' + fname[1:] |
|
5367
|
|
|
elif fname[:2] in {'CE', 'CI', 'CY'}: |
|
5368
|
|
|
fname = 'S' + fname[1:] |
|
5369
|
|
|
elif fname[:3] == 'CHR': |
|
5370
|
|
|
fname = 'K' + fname[1:] |
|
5371
|
|
|
elif fname[:1] == 'C' and fname[:2] != 'CH': |
|
5372
|
|
|
fname = 'K' + fname[1:] |
|
5373
|
|
|
|
|
5374
|
|
|
if fname[:2] == 'KN': |
|
5375
|
|
|
fname = 'N' + fname[1:] |
|
5376
|
|
|
elif fname[:2] == 'PH': |
|
5377
|
|
|
fname = 'F' + fname[1:] |
|
5378
|
|
|
elif fname[:3] in {'WIE', 'WEI'}: |
|
5379
|
|
|
fname = 'V' + fname[1:] |
|
5380
|
|
|
|
|
5381
|
|
|
if german and fname[:1] in {'W', 'M', 'Y', 'Z'}: |
|
5382
|
|
|
fname = ({'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}[fname[0]] + |
|
5383
|
|
|
fname[1:]) |
|
5384
|
|
|
|
|
5385
|
|
|
code = fname[:1] |
|
5386
|
|
|
|
|
5387
|
|
|
# B. Soundex coding |
|
5388
|
|
|
# code for Y unspecified, but presumably is 0 |
|
5389
|
|
|
_pshp_translation = dict(zip((ord(_) for _ in |
|
|
|
|
|
|
5390
|
|
|
'ABCDEFGHIJKLMNOPQRSTUVWXYZ'), |
|
5391
|
|
|
'01230120022455012523010202')) |
|
5392
|
|
|
|
|
5393
|
|
|
fname = fname.translate(_pshp_translation) |
|
5394
|
|
|
fname = _delete_consecutive_repeats(fname) |
|
5395
|
|
|
|
|
5396
|
|
|
code += fname[1:] |
|
5397
|
|
|
syl_ptr = code.find('0') |
|
5398
|
|
|
syl2_ptr = code[syl_ptr + 1:].find('0') |
|
5399
|
|
|
if syl_ptr != -1 and syl2_ptr != -1 and syl2_ptr - syl_ptr > -1: |
|
5400
|
|
|
code = code[:syl_ptr + 2] |
|
5401
|
|
|
|
|
5402
|
|
|
code = code.replace('0', '') # rule 1 |
|
5403
|
|
|
|
|
5404
|
|
|
if maxlength is not None: |
|
5405
|
|
|
if len(code) < maxlength: |
|
5406
|
|
|
code += '0' * (maxlength-len(code)) |
|
5407
|
|
|
else: |
|
5408
|
|
|
code = code[:maxlength] |
|
5409
|
|
|
|
|
5410
|
|
|
return code |
|
5411
|
|
|
|
|
5412
|
|
|
|
|
5413
|
|
|
def henry_early(word, maxlength=3): |
|
5414
|
|
|
"""Calculate the early version of the Henry code for a word. |
|
5415
|
|
|
|
|
5416
|
|
|
The early version of Henry coding is given in :cite:`Legare:1972`. This is |
|
5417
|
|
|
different from the later version defined in :cite:`Henry:1976`. |
|
5418
|
|
|
|
|
5419
|
|
|
:param word: |
|
5420
|
|
|
:param int maxlength: the length of the code returned (defaults to 3) |
|
5421
|
|
|
:return: |
|
5422
|
|
|
""" |
|
5423
|
|
|
_cons = {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q', |
|
5424
|
|
|
'R', 'S', 'T', 'V', 'W', 'X', 'Z'} |
|
5425
|
|
|
_vows = {'A', 'E', 'I', 'O', 'U', 'Y'} |
|
5426
|
|
|
_diph = {'AI': 'E', 'AY': 'E', 'EI': 'E', 'AU': 'O', 'OI': 'O', 'OU': 'O', |
|
5427
|
|
|
'EU': 'U'} |
|
5428
|
|
|
_unaltered = {'B', 'D', 'F', 'J', 'K', 'L', 'M', 'N', 'R', 'T', 'V'} |
|
5429
|
|
|
    _simple = {'W': 'V', 'X': 'S', 'Z': 'S'}
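    # Rule II below: letters in _unaltered are copied as-is (IIa), letters in
    # _simple take a fixed substitute (IIb), and C, G, P, Q and S are handled
    # contextually further down.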
|
5430
|
|
|
|
|
5431
|
|
|
word = normalize('NFKD', text_type(word.upper())) |
|
5432
|
|
|
word = ''.join(c for c in word if c in |
|
5433
|
|
|
{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', |
|
5434
|
|
|
'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', |
|
5435
|
|
|
'Y', 'Z'}) |
|
5436
|
|
|
|
|
5437
|
|
|
if not word: |
|
5438
|
|
|
return '' |
|
5439
|
|
|
|
|
5440
|
|
|
# Rule Ia seems to be covered entirely in II |
|
5441
|
|
|
|
|
5442
|
|
|
# Rule Ib |
|
5443
|
|
|
if word[0] in _vows: |
|
5444
|
|
|
# Ib1 |
|
5445
|
|
|
if (((word[1:2] in _cons-{'M', 'N'} and word[2:3] in _cons) or |
|
5446
|
|
|
(word[1:2] in _cons and word[2:3] not in _cons))): |
|
5447
|
|
|
if word[0] == 'Y': |
|
5448
|
|
|
word = 'I'+word[1:] |
|
5449
|
|
|
# Ib2 |
|
5450
|
|
|
elif word[1:2] in {'M', 'N'} and word[2:3] in _cons: |
|
5451
|
|
|
if word[0] == 'E': |
|
5452
|
|
|
word = 'A'+word[1:] |
|
5453
|
|
|
elif word[0] in {'I', 'U', 'Y'}: |
|
5454
|
|
|
word = 'E'+word[1:] |
|
5455
|
|
|
# Ib3 |
|
5456
|
|
|
elif word[:2] in _diph: |
|
5457
|
|
|
word = _diph[word[:2]]+word[2:] |
|
5458
|
|
|
# Ib4 |
|
5459
|
|
|
elif word[1:2] in _vows and word[0] == 'Y': |
|
5460
|
|
|
word = 'I' + word[1:] |
|
5461
|
|
|
|
|
5462
|
|
|
code = '' |
|
5463
|
|
|
skip = 0 |
|
5464
|
|
|
|
|
5465
|
|
|
# Rule II |
|
5466
|
|
|
for pos, char in enumerate(word): |
|
5467
|
|
|
        nxch = word[pos+1:pos+2]
|
5468
|
|
|
        prev = word[pos-1:pos]
|
5469
|
|
|
|
|
5470
|
|
|
if skip: |
|
5471
|
|
|
skip -= 1 |
|
5472
|
|
|
elif char in _vows: |
|
5473
|
|
|
code += char |
|
5474
|
|
|
# IIc |
|
5475
|
|
|
elif char == nxch: |
|
5476
|
|
|
skip = 1 |
|
5477
|
|
|
code += char |
|
5478
|
|
|
elif word[pos:pos+2] in {'CQ', 'DT', 'SC'}: |
|
5479
|
|
|
skip = 1 |
|
5480
|
|
|
code += word[pos+1] |
|
5481
|
|
|
# IId |
|
5482
|
|
|
elif char == 'H' and prev in _cons: |
|
5483
|
|
|
continue |
|
5484
|
|
|
elif char == 'S' and nxch in _cons: |
|
5485
|
|
|
continue |
|
5486
|
|
|
elif char in _cons-{'L', 'R'} and nxch in _cons-{'L', 'R'}: |
|
5487
|
|
|
continue |
|
5488
|
|
|
elif char == 'L' and nxch in {'M', 'N'}: |
|
5489
|
|
|
continue |
|
5490
|
|
|
elif char in {'M', 'N'} and prev in _vows and nxch in _cons: |
|
5491
|
|
|
continue |
|
5492
|
|
|
# IIa |
|
5493
|
|
|
elif char in _unaltered: |
|
5494
|
|
|
code += char |
|
5495
|
|
|
# IIb |
|
5496
|
|
|
elif char in _simple: |
|
5497
|
|
|
code += _simple[char] |
|
5498
|
|
|
elif char in {'C', 'G', 'P', 'Q', 'S'}: |
|
5499
|
|
|
if char == 'C': |
|
5500
|
|
|
if nxch in {'A', 'O', 'U', 'L', 'R'}: |
|
5501
|
|
|
code += 'K' |
|
5502
|
|
|
elif nxch in {'E', 'I', 'Y'}: |
|
5503
|
|
|
code += 'J' |
|
5504
|
|
|
elif nxch == 'H': |
|
5505
|
|
|
if word[pos+2:pos+3] in _vows: |
|
5506
|
|
|
code += 'C' |
|
5507
|
|
|
elif word[pos+2:pos+3] in {'R', 'L'}: |
|
5508
|
|
|
code += 'K' |
|
5509
|
|
|
elif char == 'G': |
|
5510
|
|
|
if nxch in {'A', 'O', 'U', 'L', 'R'}: |
|
5511
|
|
|
code += 'G' |
|
5512
|
|
|
elif nxch in {'E', 'I', 'Y'}: |
|
5513
|
|
|
code += 'J' |
|
5514
|
|
|
elif nxch == 'N': |
|
5515
|
|
|
code += 'N' |
|
5516
|
|
|
elif char == 'P': |
|
5517
|
|
|
if nxch != 'H': |
|
5518
|
|
|
code += 'P' |
|
5519
|
|
|
else: |
|
5520
|
|
|
code += 'F' |
|
5521
|
|
|
elif char == 'Q': |
|
5522
|
|
|
                if word[pos+1:pos+3] in {'UE', 'UI', 'UY'}:
|
5523
|
|
|
                    code += 'G'
|
5524
|
|
|
                elif word[pos+1:pos+3] in {'UA', 'UO'}:
|
5525
|
|
|
                    code += 'K'
|
5526
|
|
|
elif char == 'S': |
|
5527
|
|
|
if word[pos:pos+6] == 'SAINTE': |
|
5528
|
|
|
code += 'X' |
|
5529
|
|
|
skip = 5 |
|
5530
|
|
|
elif word[pos:pos+5] == 'SAINT': |
|
5531
|
|
|
code += 'X' |
|
5532
|
|
|
skip = 4 |
|
5533
|
|
|
elif word[pos:pos+3] == 'STE': |
|
5534
|
|
|
code += 'X' |
|
5535
|
|
|
skip = 2 |
|
5536
|
|
|
elif word[pos:pos+2] == 'ST': |
|
5537
|
|
|
code += 'X' |
|
5538
|
|
|
skip = 1 |
|
5539
|
|
|
else: |
|
5540
|
|
|
code += 'S' |
|
5541
|
|
|
else: # this should not be possible |
|
5542
|
|
|
continue |
|
5543
|
|
|
|
|
5544
|
|
|
# IIe1 |
|
5545
|
|
|
if code[-4:] in {'AULT', 'EULT', 'OULT'}: |
|
5546
|
|
|
code = code[:-2] |
|
5547
|
|
|
elif code[-4:-3] in _vows and code[-3:] == 'MPS': |
|
5548
|
|
|
code = code[:-3] |
|
5549
|
|
|
elif code[-3:-2] in _vows and code[-2:] in {'MB', 'MP', 'ND', 'NS', 'NT'}: |
|
5550
|
|
|
code = code[:-2] |
|
5551
|
|
|
elif code[-2:-1] == 'R' and code[-1:] in _cons: |
|
5552
|
|
|
code = code[:-1] |
|
5553
|
|
|
# IIe2 |
|
5554
|
|
|
elif code[-2:-1] in _vows and code[-1:] in {'D', 'M', 'N', 'S', 'T'}: |
|
5555
|
|
|
code = code[:-1] |
|
5556
|
|
|
elif code[-2:] == 'ER': |
|
5557
|
|
|
code = code[:-1] |
|
5558
|
|
|
|
|
5559
|
|
|
# Drop non-initial vowels |
|
5560
|
|
|
code = code[:1]+code[1:].translate({65: '', 69: '', 73: '', 79: '', 85: '', |
|
5561
|
|
|
89: ''}) |
|
5562
|
|
|
|
|
5563
|
|
|
if maxlength is not None: |
|
5564
|
|
|
code = code[:maxlength] |
|
|
|
|
|
|
5565
|
|
|
|
|
5566
|
|
|
return code |
|
5567
|
|
|
|
|
5568
|
|
|
|
|
5569
|
|
|
def norphone(word): |
|
5570
|
|
|
"""Return the Norphone code. |
|
5571
|
|
|
|
|
5572
|
|
|
The reference implementation by Lars Marius Garshol is available in |
|
5573
|
|
|
:cite:`Garshol:2015`. |
|
5574
|
|
|
|
|
5575
|
|
|
Norphone was designed for Norwegian, but this implementation has been |
|
5576
|
|
|
extended to support Swedish vowels as well. This function incorporates |
|
5577
|
|
|
the "not implemented" rules from the above file's rule set. |
|
5578
|
|
|
|
|
5579
|
|
|
:param word: |
|
5580
|
|
|
:return: |
|
5581
|
|
|
""" |
|
5582
|
|
|
_vowels = {'A', 'E', 'I', 'O', 'U', 'Y', 'Å', 'Æ', 'Ø', 'Ä', 'Ö'} |
|
5583
|
|
|
|
|
5584
|
|
|
replacements = {4: {'SKEI': 'X'}, |
|
5585
|
|
|
3: {'SKJ': 'X', 'KEI': 'X'}, |
|
5586
|
|
|
2: {'CH': 'K', 'CK': 'K', 'GJ': 'J', 'GH': 'K', 'HG': 'K', |
|
5587
|
|
|
'HJ': 'J', 'HL': 'L', 'HR': 'R', 'KJ': 'X', 'KI': 'X', |
|
5588
|
|
|
'LD': 'L', 'ND': 'N', 'PH': 'F', 'TH': 'T', 'SJ': 'X'}, |
|
5589
|
|
|
1: {'W': 'V', 'X': 'KS', 'Z': 'S', 'D': 'T', 'G': 'K'}} |
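    # Substitutions are attempted longest-first at each position; vowels are
    # retained only in word-initial position (see the loop below).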
|
5590
|
|
|
|
|
5591
|
|
|
word = word.upper() |
|
5592
|
|
|
|
|
5593
|
|
|
code = '' |
|
5594
|
|
|
skip = 0 |
|
5595
|
|
|
|
|
5596
|
|
|
if word[0:2] == 'AA': |
|
5597
|
|
|
code = 'Å' |
|
5598
|
|
|
skip = 2 |
|
5599
|
|
|
elif word[0:2] == 'GI': |
|
5600
|
|
|
code = 'J' |
|
5601
|
|
|
skip = 2 |
|
5602
|
|
|
elif word[0:3] == 'SKY': |
|
5603
|
|
|
code = 'X' |
|
5604
|
|
|
skip = 3 |
|
5605
|
|
|
elif word[0:2] == 'EI': |
|
5606
|
|
|
code = 'Æ' |
|
5607
|
|
|
skip = 2 |
|
5608
|
|
|
elif word[0:2] == 'KY': |
|
5609
|
|
|
code = 'X' |
|
5610
|
|
|
skip = 2 |
|
5611
|
|
|
elif word[:1] == 'C': |
|
5612
|
|
|
code = 'K' |
|
5613
|
|
|
skip = 1 |
|
5614
|
|
|
elif word[:1] == 'Ä': |
|
5615
|
|
|
code = 'Æ' |
|
5616
|
|
|
skip = 1 |
|
5617
|
|
|
elif word[:1] == 'Ö': |
|
5618
|
|
|
code = 'Ø' |
|
5619
|
|
|
skip = 1 |
|
5620
|
|
|
|
|
5621
|
|
|
if word[-2:] == 'DT': |
|
5622
|
|
|
word = word[:-2]+'T' |
|
5623
|
|
|
# Though the rules indicate this rule applies in all positions, the |
|
5624
|
|
|
# reference implementation indicates it applies only in final position. |
|
5625
|
|
|
elif word[-2:-1] in _vowels and word[-1:] == 'D': |
|
5626
|
|
|
word = word[:-2] |
|
5627
|
|
|
|
|
5628
|
|
|
for pos, char in enumerate(word): |
|
5629
|
|
|
if skip: |
|
5630
|
|
|
skip -= 1 |
|
5631
|
|
|
else: |
|
5632
|
|
|
for length in sorted(replacements, reverse=True): |
|
5633
|
|
|
if word[pos:pos+length] in replacements[length]: |
|
5634
|
|
|
code += replacements[length][word[pos:pos+length]] |
|
5635
|
|
|
skip = length-1 |
|
5636
|
|
|
break |
|
5637
|
|
|
else: |
|
5638
|
|
|
if not pos or char not in _vowels: |
|
5639
|
|
|
code += char |
|
5640
|
|
|
|
|
5641
|
|
|
code = _delete_consecutive_repeats(code) |
|
5642
|
|
|
|
|
5643
|
|
|
return code |
|
5644
|
|
|
|
|
5645
|
|
|
|
|
5646
|
|
|
def dolby(word, maxlength=None, keep_vowels=False, vowel_char='*'): |
|
5647
|
|
|
r"""Return the Dolby Code of a name. |
|
5648
|
|
|
|
|
5649
|
|
|
This follows "A Spelling Equivalent Abbreviation Algorithm For Personal |
|
5650
|
|
|
Names" from :cite:`Dolby:1970` and :cite:`Cunningham:1969`. |
|
5651
|
|
|
|
|
5652
|
|
|
:param word: the word to encode |
|
5653
|
|
|
:param maxlength: maximum length of the returned Dolby code -- this also |
|
5654
|
|
|
activates the fixed-length code mode |
|
5655
|
|
|
:param keep_vowels: if True, retains all vowel markers |
|
5656
|
|
|
    :param vowel_char: the vowel marker character (defaults to \*)
|
5657
|
|
|
:return: |
|
5658
|
|
|
""" |
|
    _vowels = {'A', 'E', 'I', 'O', 'U', 'Y'}

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    # Rule 1 (FL2)
    if word[:3] in {'MCG', 'MAG', 'MAC'}:
        word = 'MK'+word[3:]
    elif word[:2] == 'MC':
        word = 'MK'+word[2:]

    # Rule 2 (FL3)
    pos = len(word)-2
    while pos > -1:
        if word[pos:pos+2] in {'DT', 'LD', 'ND', 'NT', 'RC', 'RD', 'RT', 'SC',
                               'SK', 'ST'}:
            word = word[:pos+1]+word[pos+2:]
            pos += 1
        pos -= 1

    # Rule 3 (FL4)
    # Although the rule indicates "after the first letter", the test cases make
    # it clear that these apply to the first letter also.
    word = word.replace('X', 'KS')
    word = word.replace('CE', 'SE')
    word = word.replace('CI', 'SI')
    word = word.replace('CY', 'SI')

    # not in the rule set, but they seem to have intended it
    word = word.replace('TCH', 'CH')

    pos = word.find('CH', 1)
    while pos != -1:
        if word[pos-1:pos] not in _vowels:
            word = word[:pos]+'S'+word[pos+1:]
        pos = word.find('CH', pos+1)

    word = word.replace('C', 'K')
    word = word.replace('Z', 'S')

    word = word.replace('WR', 'R')
    word = word.replace('DG', 'G')
    word = word.replace('QU', 'K')
    word = word.replace('T', 'D')
    word = word.replace('PH', 'F')

    # Rule 4 (FL5)
    # Although the rule indicates "after the first letter", the test cases make
    # it clear that these apply to the first letter also.
    pos = word.find('K', 0)
    while pos != -1:
        if pos > 1 and word[pos-1:pos] not in _vowels | {'L', 'N', 'R'}:
            word = word[:pos-1]+word[pos:]
            pos -= 1
        pos = word.find('K', pos+1)

    # Rule FL6
    if maxlength and word[-1:] == 'E':
        word = word[:-1]

    # Rule 5 (FL7)
    word = _delete_consecutive_repeats(word)

    # Rule 6 (FL8)
    if word[:2] == 'PF':
        word = word[1:]
    if word[-2:] == 'PF':
        word = word[:-1]
    elif word[-2:] == 'GH':
        if word[-3:-2] in _vowels:
            word = word[:-2]+'F'
        else:
            word = word[:-2]+'G'
    word = word.replace('GH', '')

    # Rule FL9
    if maxlength:
        word = word.replace('V', 'F')

    # Rules 7-9 (FL10-FL12)
    first = 1 + (1 if maxlength else 0)
    code = ''
    for pos, char in enumerate(word):
        if char in _vowels:
            if first or keep_vowels:
                code += vowel_char
                first -= 1
            else:
                continue
        elif pos > 0 and char in {'W', 'H'}:
            continue
        else:
            code += char
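
    # Fixed-length handling below (rules FL13-FL16): when maxlength is set,
    # a trailing 'S' is dropped from an over-long code, surplus vowel markers
    # are removed (unless keep_vowels is set), the code is truncated to
    # maxlength, and the result is right-padded with spaces.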

    if maxlength:
        # Rule FL13
        if len(code) > maxlength and code[-1:] == 'S':
            code = code[:-1]
        if keep_vowels:
            code = code[:maxlength]
        else:
            # Rule FL14
            code = code[:maxlength + 2]
            # Rule FL15
            while len(code) > maxlength:
                vowels = len(code) - maxlength
                excess = vowels - 1
                word = code
                code = ''
                for char in word:
                    if char == vowel_char:
                        if vowels:
                            code += char
                            vowels -= 1
                    else:
                        code += char
                code = code[:maxlength + excess]

        # Rule FL16
        code += ' ' * (maxlength - len(code))

    return code
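
# Illustrative usage of dolby() (a sketch, not doctests; the example surname
# is arbitrary and the returned codes are not asserted here):
#
#     dolby('Hansen')                   # variable-length code
#     dolby('Hansen', maxlength=6)      # fixed-length mode, space-padded
#     dolby('Hansen', keep_vowels=True, vowel_char='*')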


def phonetic_spanish(word, maxlength=None):
    """Return the PhoneticSpanish coding of word.

    This follows the coding described in :cite:`Amon:2012` and
    :cite:`delPilarAngeles:2015`.

    :param word: the word to transform
    :param maxlength: the maximum length of the returned code
    :returns: the PhoneticSpanish code
    :rtype: str
    """
    _es_soundex_translation = dict(zip((ord(_) for _ in
                                         'BCDFGHJKLMNPQRSTVXYZ'),
                                        '14328287566079431454'))

    # uppercase, normalize, and decompose, filter to A-Z minus vowels & W
    word = normalize('NFKD', text_type(word.upper()))
    word = ''.join(c for c in word if c in
                   {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N',
                    'P', 'Q', 'R', 'S', 'T', 'V', 'X', 'Y', 'Z'})

    # merge repeated Ls & Rs
    word = word.replace('LL', 'L')
    word = word.replace('RR', 'R')

    # apply the Soundex algorithm
    sdx = word.translate(_es_soundex_translation)

    if maxlength:
        sdx = sdx[:maxlength]

    return sdx
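
# Illustrative usage of phonetic_spanish() (a sketch, not doctests; example
# names are arbitrary and the returned codes are not asserted here). Each
# retained consonant is mapped to a single digit by the table above:
#
#     phonetic_spanish('Garcia')
#     phonetic_spanish('Gutierrez', maxlength=4)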


def spanish_metaphone(word, maxlength=6, modified=False):
    """Return the Spanish Metaphone of a word.

    This is a quick rewrite of the Spanish Metaphone Algorithm, as presented at
    https://github.com/amsqr/Spanish-Metaphone and discussed in
    :cite:`Mosquera:2012`.

    Modified version based on :cite:`delPilarAngeles:2016`.

    :param word: the word to transform
    :param maxlength: the maximum length of the returned code
    :param modified: Set to True to use del Pilar Angeles & Bailón-Miguel's
        modified version of the algorithm
    :returns: the Spanish Metaphone code
    :rtype: str
    """
    def _is_vowel(pos):
        """Return True if the character at word[pos] is a vowel."""
        if pos < len(word) and word[pos] in {'A', 'E', 'I', 'O', 'U'}:
            return True
        return False

    word = normalize('NFC', text_type(word.upper()))

    meta_key = ''
    pos = 0

    # do some replacements for the modified version
    if modified:
        word = word.replace('MB', 'NB')
        word = word.replace('MP', 'NP')
        word = word.replace('BS', 'S')
        if word[:2] == 'PS':
            word = word[1:]

    # simple replacements
    word = word.replace('Á', 'A')
    word = word.replace('CH', 'X')
    word = word.replace('Ç', 'S')
    word = word.replace('É', 'E')
    word = word.replace('Í', 'I')
    word = word.replace('Ó', 'O')
    word = word.replace('Ú', 'U')
    word = word.replace('Ñ', 'NY')
    word = word.replace('GÜ', 'W')
    word = word.replace('Ü', 'U')
    word = word.replace('B', 'V')
    word = word.replace('LL', 'Y')

    while len(meta_key) < maxlength:
        if pos >= len(word):
            break

        # get the next character
        current_char = word[pos]

        # if a vowel in pos 0, add to key
        if _is_vowel(pos) and pos == 0:
            meta_key += current_char
            pos += 1
        # otherwise, do consonant rules
        else:
            # simple consonants (unmutated)
            if current_char in {'D', 'F', 'J', 'K', 'M', 'N', 'P', 'T', 'V',
                                'L', 'Y'}:
                meta_key += current_char
                # skip doubled consonants
                if word[pos+1:pos+2] == current_char:
                    pos += 2
                else:
                    pos += 1
            else:
                if current_char == 'C':
                    # special case 'acción', 'reacción', etc.
                    if word[pos+1:pos+2] == 'C':
                        meta_key += 'X'
                        pos += 2
                    # special case 'cesar', 'cien', 'cid', 'conciencia'
                    elif word[pos+1:pos+2] in {'E', 'I'}:
                        meta_key += 'Z'
                        pos += 2
                    # base case
                    else:
                        meta_key += 'K'
                        pos += 1
                elif current_char == 'G':
                    # special case 'gente', 'ecologia', etc.
                    if word[pos + 1:pos + 2] in {'E', 'I'}:
                        meta_key += 'J'
                        pos += 2
                    # base case
                    else:
                        meta_key += 'G'
                        pos += 1
                elif current_char == 'H':
                    # since the letter 'H' is silent in Spanish,
                    # set the meta key to the vowel after the letter 'H'
                    if _is_vowel(pos+1):
                        meta_key += word[pos+1]
                        pos += 2
                    else:
                        meta_key += 'H'
                        pos += 1
                elif current_char == 'Q':
                    if word[pos+1:pos+2] == 'U':
                        pos += 2
                    else:
                        pos += 1
                    meta_key += 'K'
                elif current_char == 'W':
                    meta_key += 'U'
                    pos += 1
                elif current_char == 'R':
                    meta_key += 'R'
                    pos += 1
                elif current_char == 'S':
                    if not _is_vowel(pos+1) and pos == 0:
                        meta_key += 'ES'
                        pos += 1
                    else:
                        meta_key += 'S'
                        pos += 1
                elif current_char == 'Z':
                    meta_key += 'Z'
                    pos += 1
                elif current_char == 'X':
                    if len(word) > 1 and pos == 0 and not _is_vowel(pos+1):
                        meta_key += 'EX'
                        pos += 1
                    else:
                        meta_key += 'X'
                        pos += 1
                else:
                    pos += 1

    # Final change from S to Z in modified version
    if modified:
        meta_key = meta_key.replace('S', 'Z')

    return meta_key
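
# Illustrative usage of spanish_metaphone() (a sketch, not doctests; example
# names are arbitrary and the returned codes are not asserted here):
#
#     spanish_metaphone('Guillermo')
#     spanish_metaphone('Acciona', modified=True)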


def metasoundex(word, language='en'):
    """Return the MetaSoundex code for a word.

    This is based on :cite:`Koneru:2017`.

    :param word: the word to transform
    :param language: either 'en' for English or 'es' for Spanish
    :returns: the MetaSoundex code
    :rtype: str
    """
    _metasoundex_translation = dict(zip((ord(_) for _ in
                                          'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                         '07430755015866075943077514'))

    if language == 'es':
        return phonetic_spanish(spanish_metaphone(word))

    word = soundex(metaphone(word))
    word = word[0].translate(_metasoundex_translation)+word[1:]

    return word
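
# Illustrative usage of metasoundex() (a sketch, not doctests; the returned
# codes are not asserted here). For 'en', the word is passed through
# metaphone() and then soundex(), with the first character remapped by the
# digit table above; for 'es', it is passed through spanish_metaphone() and
# then phonetic_spanish():
#
#     metasoundex('Smith')
#     metasoundex('Hernandez', language='es')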


def bmpm(word, language_arg=0, name_mode='gen', match_mode='approx',
         concat=False, filter_langs=False):
    """Return the Beider-Morse Phonetic Matching algorithm code for a word.

    The Beider-Morse Phonetic Matching algorithm is described in
    :cite:`Beider:2008`.
    The reference implementation is licensed under GPLv3.

    :param str word: the word to transform
    :param str language_arg: the language of the term; supported values
        include:

        - 'any'
        - 'arabic'
        - 'cyrillic'
        - 'czech'
        - 'dutch'
        - 'english'
        - 'french'
        - 'german'
        - 'greek'
        - 'greeklatin'
        - 'hebrew'
        - 'hungarian'
        - 'italian'
        - 'polish'
        - 'portuguese'
        - 'romanian'
        - 'russian'
        - 'spanish'
        - 'turkish'
        - 'germandjsg'
        - 'polishdjskp'
        - 'russiandjsre'

    :param str name_mode: the name mode of the algorithm:

        - 'gen' -- general (default)
        - 'ash' -- Ashkenazi
        - 'sep' -- Sephardic

    :param str match_mode: matching mode: 'approx' or 'exact'
    :param bool concat: concatenation mode
    :param bool filter_langs: filter out incompatible languages
    :returns: the BMPM value(s)
    :rtype: tuple

    >>> bmpm('Christopher')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir xristopi xritopir xritopi xristofi xritofir xritofi tzristopir
    tzristofir zristopir zristopi zritopir zritopi zristofir zristofi zritofir
    zritofi'
    >>> bmpm('Niall')
    'nial niol'
    >>> bmpm('Smith')
    'zmit'
    >>> bmpm('Schmidt')
    'zmit stzmit'

    >>> bmpm('Christopher', language_arg='German')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir'
    >>> bmpm('Christopher', language_arg='English')
    'tzristofir tzrQstofir tzristafir tzrQstafir xristofir xrQstofir xristafir
    xrQstafir'
    >>> bmpm('Christopher', language_arg='German', name_mode='ash')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir'

    >>> bmpm('Christopher', language_arg='German', match_mode='exact')
    'xriStopher xriStofer xristopher xristofer'
    """
    return _bmpm(word, language_arg, name_mode, match_mode,
                 concat, filter_langs)


if __name__ == '__main__':
    import doctest
    doctest.testmod()