Completed
Push — master ( cb6002...8e98e5 )
by Chris
08:42
created

abydos.phonetic.refined_soundex()   B

Complexity

Conditions 7

Size

Total Lines 48
Code Lines 16

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 7
eloc 16
nop 4
dl 0
loc 48
rs 8
c 0
b 0
f 0
1
# -*- coding: utf-8 -*-
0 ignored issues
show
coding-style introduced by
Too many lines in module (4619/1000)
Loading history...
2
3
# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic.

The phonetic module implements phonetic algorithms including:

    - Robert C. Russell's Index
    - American Soundex
    - Daitch-Mokotoff Soundex
    - Kölner Phonetik
    - NYSIIS
    - Match Rating Algorithm
    - Metaphone
    - Double Metaphone
    - Caverphone
    - Alpha Search Inquiry System
    - Fuzzy Soundex
    - Phonex
    - Phonem
    - Phonix
    - SfinxBis
    - phonet
    - Standardized Phonetic Frequency Code
    - Beider-Morse Phonetic Matching
"""
42
43
from __future__ import division
44
from __future__ import unicode_literals
45
46
import re
47
import unicodedata
48
from collections import Counter
49
from itertools import groupby
50
51
from six import text_type
52
from six.moves import range
53
54
from ._bm import _bmpm
55
56
# Sentinel meaning "no length limit": used as the default maxlength of
# refined_soundex() and metaphone(); code is only truncated when the requested
# maxlength compares strictly less than this value.
_INFINITY = float('inf')
57
58
59
def _delete_consecutive_repeats(word):
    """Delete consecutive repeated characters in a word.

    Each run of identical adjacent characters is collapsed to one character;
    repeats that are separated by a different character are kept.

    :param str word: the word to transform
    :returns: word with consecutive repeating characters collapsed to
        a single instance
    :rtype: str
    """
    # groupby yields one (key, group) pair per run of equal characters
    return ''.join(char for char, _ in groupby(word))
68
69
70
def russell_index(word):
    """Return the Russell Index (integer output) of a word.

    This follows Robert C. Russell's Index algorithm, as described in
    US Patent 1,261,167 (1917)

    :param str word: the word to transform
    :returns: the Russell Index value, or NaN (a float) when the word
        contains no codable letters
    :rtype: int

    >>> russell_index('Christopher')
    3813428
    >>> russell_index('Niall')
    715
    >>> russell_index('Smith')
    3614
    >>> russell_index('Schmidt')
    3614
    """
    # H, J and W are deliberately absent: they carry no code and are dropped
    _russell_letters = 'ABCDEFGIKLMNOPQRSTUVXYZ'
    _russell_letterset = frozenset(_russell_letters)
    _russell_translation = dict(zip((ord(char) for char in _russell_letters),
                                    '12341231356712383412313'))

    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = word.replace('GH', '')  # discard gh (rule 3)
    word = word.rstrip('SZ')  # discard /[sz]$/ (rule 3)

    # translate according to Russell's mapping
    word = ''.join(c for c in word if c in _russell_letterset)
    sdx = word.translate(_russell_translation)

    # remove any 1s after the first occurrence
    one = sdx.find('1')+1
    if one:
        sdx = sdx[:one] + ''.join(c for c in sdx[one:] if c != '1')

    # remove repeating characters
    sdx = _delete_consecutive_repeats(sdx)

    # return as an int
    return int(sdx) if sdx else float('NaN')
113
114
115
def russell_index_num_to_alpha(num):
    """Convert the Russell Index integer to an alphabetic string.

    This follows Robert C. Russell's Index algorithm, as described in
    US Patent 1,261,167 (1917)

    Characters of the textual form of num other than the digits 1-8 are
    ignored, so a value with no such digits yields an empty string.

    :param int num: a Russell Index integer value
    :returns: the Russell Index as an alphabetic string
    :rtype: str

    >>> russell_index_num_to_alpha(3813428)
    'CRACDBR'
    >>> russell_index_num_to_alpha(715)
    'NAL'
    >>> russell_index_num_to_alpha(3614)
    'CMAD'
    """
    _russell_digits = '12345678'
    _russell_digitset = frozenset(_russell_digits)
    _russell_num_translation = dict(zip((ord(digit) for digit in
                                         _russell_digits),
                                        'ABCDLMNR'))
    num = ''.join(c for c in text_type(num) if c in _russell_digitset)
    # translating an empty string simply yields an empty string
    return num.translate(_russell_num_translation)
138
139
140
def russell_index_alpha(word):
    """Return the Russell Index (alphabetic output) for the word.

    This follows Robert C. Russell's Index algorithm, as described in
    US Patent 1,261,167 (1917)

    The numeric index is computed with russell_index() and then converted
    with russell_index_num_to_alpha().

    :param str word: the word to transform
    :returns: the Russell Index value as an alphabetic string (empty for an
        empty word)
    :rtype: str

    >>> russell_index_alpha('Christopher')
    'CRACDBR'
    >>> russell_index_alpha('Niall')
    'NAL'
    >>> russell_index_alpha('Smith')
    'CMAD'
    >>> russell_index_alpha('Schmidt')
    'CMAD'
    """
    if word:
        return russell_index_num_to_alpha(russell_index(word))
    return ''
162
163
164
def soundex(word, maxlength=4, var='American', reverse=False, zero_pad=True):
    """Return the Soundex code for a word.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 4);
        for the 'American' and 'special' variants it is clamped to the range
        4..64, and None is treated as 64
    :param str var: the variant of the algorithm to employ (defaults to
        'American'):

        - 'American' follows the American Soundex algorithm, as described at
          http://www.archives.gov/publications/general-info-leaflets/55-census.html
          and in Knuth(1998:394); this is also called Miracode
        - 'special' follows the rules from the 1880-1910 US Census
          retrospective re-analysis, in which h & w are not treated as blocking
          consonants but as vowels.
          Cf. http://creativyst.com/Doc/Articles/SoundEx1/SoundEx1.htm
        - 'dm' computes the Daitch-Mokotoff Soundex (delegates to
          dm_soundex and therefore returns a set)
        - 'refined' computes the Refined Soundex (delegates to
          refined_soundex)

    :param bool reverse: reverse the word before computing the selected Soundex
        (defaults to False); This results in "Reverse Soundex"
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Soundex value
    :rtype: str

    >>> soundex("Christopher")
    'C623'
    >>> soundex("Niall")
    'N400'
    >>> soundex('Smith')
    'S530'
    >>> soundex('Schmidt')
    'S530'


    >>> soundex('Christopher', maxlength=_INFINITY)
    'C623160000000000000000000000000000000000000000000000000000000000'
    >>> soundex('Christopher', maxlength=_INFINITY, zero_pad=False)
    'C62316'

    >>> soundex('Christopher', reverse=True)
    'R132'

    >>> soundex('Ashcroft')
    'A261'
    >>> soundex('Asicroft')
    'A226'
    >>> soundex('Ashcroft', var='special')
    'A226'
    >>> soundex('Asicroft', var='special')
    'A226'

    >>> soundex('Christopher', var='dm')
    {'494379', '594379'}
    >>> soundex('Niall', var='dm')
    {'680000'}
    >>> soundex('Smith', var='dm')
    {'463000'}
    >>> soundex('Schmidt', var='dm')
    {'463000'}
    """
    # Hand off to the dedicated implementations for the other variants
    if var == 'dm':
        return dm_soundex(word, maxlength, reverse, zero_pad)
    if var == 'refined':
        return refined_soundex(word, maxlength, reverse, zero_pad)

    _soundex_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    _soundex_letterset = frozenset(_soundex_letters)
    # vowels & Y map to 0, H & W to the placeholder 9
    _soundex_translation = dict(zip((ord(char) for char in _soundex_letters),
                                    '01230129022455012623019202'))

    # Require a maxlength of at least 4 and not more than 64
    if maxlength is not None:
        maxlength = min(max(4, maxlength), 64)
    else:
        maxlength = 64

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in _soundex_letterset)

    # Nothing to convert, return base case
    if not word:
        if zero_pad:
            return '0'*maxlength
        return '0'

    # Reverse word if computing Reverse Soundex
    if reverse:
        word = word[::-1]

    # apply the Soundex algorithm
    sdx = word.translate(_soundex_translation)

    if var == 'special':
        sdx = sdx.replace('9', '0')  # special rule for 1880-1910 census
    else:
        sdx = sdx.replace('9', '')  # rule 1
    sdx = _delete_consecutive_repeats(sdx)  # rule 3

    # The code starts with the initial letter itself. An initial H or W has
    # no digit of its own left to replace in the American variant (its 9 was
    # removed above), so the letter is prepended instead of substituted.
    if word[0] in 'HW':
        sdx = word[0] + sdx
    else:
        sdx = word[0] + sdx[1:]
    sdx = sdx.replace('0', '')  # rule 1

    if zero_pad:
        sdx += ('0'*maxlength)  # rule 4

    return sdx[:maxlength]
275
276
277
def refined_soundex(word, maxlength=_INFINITY, reverse=False, zero_pad=False):
    """Return the Refined Soundex code for a word.

    This is Soundex, but with more character classes. It appears to have been
    defined by the Apache Commons:
    https://commons.apache.org/proper/commons-codec/apidocs/src-html/org/apache/commons/codec/language/RefinedSoundex.html

    :param word: the word to transform
    :param maxlength: the length of the code returned (defaults to unlimited);
        a finite, non-zero value truncates the code to that many characters
    :param reverse: reverse the word before computing the selected Soundex
        (defaults to False); This results in "Reverse Soundex"
    :param zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string (only meaningful with a finite maxlength)
    :returns: the Refined Soundex value (empty if the word has no letters and
        no padding is requested)
    :rtype: str

    >>> refined_soundex('Christopher')
    'C3090360109'
    >>> refined_soundex('Niall')
    'N807'
    >>> refined_soundex('Smith')
    'S38060'
    >>> refined_soundex('Schmidt')
    'S30806'
    >>> refined_soundex('Christopher', maxlength=4)
    'C309'
    >>> refined_soundex('Niall', maxlength=6, zero_pad=True)
    'N80700'
    """
    _ref_soundex_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    _ref_soundex_letterset = frozenset(_ref_soundex_letters)
    _ref_soundex_translation = dict(zip((ord(char) for char in
                                         _ref_soundex_letters),
                                        '01360240043788015936020505'))

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in _ref_soundex_letterset)

    # Reverse word if computing Reverse Soundex
    if reverse:
        word = word[::-1]

    # apply the Soundex algorithm; slicing (rather than indexing) keeps a
    # word without any letters from raising IndexError
    sdx = word[:1] + word.translate(_ref_soundex_translation)
    sdx = _delete_consecutive_repeats(sdx)

    if maxlength and maxlength < _INFINITY:
        sdx = sdx[:maxlength]
        if zero_pad:
            # pad only up to maxlength, and only when asked to (rule 4)
            sdx += '0' * (maxlength - len(sdx))

    return sdx
325
326
327
def dm_soundex(word, maxlength=6, reverse=False, zero_pad=True):
    """Return the Daitch-Mokotoff Soundex code for a word.

    Returns values of a word as a set. A collection is necessary since there
    can be multiple values for a single word.

    :param word: the word to transform
    :param maxlength: the length of the code returned (defaults to 6); the
        value is clamped to the range 6..64 and None is treated as 64
    :param reverse: reverse the word before computing the selected Soundex
        (defaults to False); This results in "Reverse Soundex"
    :param zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Daitch-Mokotoff Soundex value(s)
    :rtype: set

    >>> dm_soundex('Christopher')
    {'494379', '594379'}
    >>> dm_soundex('Niall')
    {'680000'}
    >>> dm_soundex('Smith')
    {'463000'}
    >>> dm_soundex('Schmidt')
    {'463000'}

    >>> dm_soundex('The quick brown fox', maxlength=20, zero_pad=False)
    {'35457976754', '3557976754'}
    """
    # Each entry is a triple of codes: (at the start of the word, before a
    # vowel, elsewhere). A nested pair means the substring is ambiguous and
    # both codes are emitted as alternatives; '_' is a placeholder that is
    # stripped at the end (it also keeps neighbouring digits from merging).
    _dms_table = {'STCH': (2, 4, 4), 'DRZ': (4, 4, 4), 'ZH': (4, 4, 4),
                  'ZHDZH': (2, 4, 4), 'DZH': (4, 4, 4), 'DRS': (4, 4, 4),
                  'DZS': (4, 4, 4), 'SCHTCH': (2, 4, 4), 'SHTSH': (2, 4, 4),
                  'SZCZ': (2, 4, 4), 'TZS': (4, 4, 4), 'SZCS': (2, 4, 4),
                  'STSH': (2, 4, 4), 'SHCH': (2, 4, 4), 'D': (3, 3, 3),
                  'H': (5, 5, '_'), 'TTSCH': (4, 4, 4), 'THS': (4, 4, 4),
                  'L': (8, 8, 8), 'P': (7, 7, 7), 'CHS': (5, 54, 54),
                  'T': (3, 3, 3), 'X': (5, 54, 54), 'OJ': (0, 1, '_'),
                  'OI': (0, 1, '_'), 'SCHTSH': (2, 4, 4), 'OY': (0, 1, '_'),
                  'Y': (1, '_', '_'), 'TSH': (4, 4, 4), 'ZDZ': (2, 4, 4),
                  'TSZ': (4, 4, 4), 'SHT': (2, 43, 43), 'SCHTSCH': (2, 4, 4),
                  'TTSZ': (4, 4, 4), 'TTZ': (4, 4, 4), 'SCH': (4, 4, 4),
                  'TTS': (4, 4, 4), 'SZD': (2, 43, 43), 'AI': (0, 1, '_'),
                  'PF': (7, 7, 7), 'TCH': (4, 4, 4), 'PH': (7, 7, 7),
                  'TTCH': (4, 4, 4), 'SZT': (2, 43, 43), 'ZDZH': (2, 4, 4),
                  'EI': (0, 1, '_'), 'G': (5, 5, 5), 'EJ': (0, 1, '_'),
                  'ZD': (2, 43, 43), 'IU': (1, '_', '_'), 'K': (5, 5, 5),
                  'O': (0, '_', '_'), 'SHTCH': (2, 4, 4), 'S': (4, 4, 4),
                  'TRZ': (4, 4, 4), 'SHD': (2, 43, 43), 'DSH': (4, 4, 4),
                  'CSZ': (4, 4, 4), 'EU': (1, 1, '_'), 'TRS': (4, 4, 4),
                  'ZS': (4, 4, 4), 'STRZ': (2, 4, 4), 'UY': (0, 1, '_'),
                  'STRS': (2, 4, 4), 'CZS': (4, 4, 4),
                  'MN': ('6_6', '6_6', '6_6'), 'UI': (0, 1, '_'),
                  'UJ': (0, 1, '_'), 'UE': (0, '_', '_'), 'EY': (0, 1, '_'),
                  'W': (7, 7, 7), 'IA': (1, '_', '_'), 'FB': (7, 7, 7),
                  'STSCH': (2, 4, 4), 'SCHT': (2, 43, 43),
                  'NM': ('6_6', '6_6', '6_6'), 'SCHD': (2, 43, 43),
                  'B': (7, 7, 7), 'DSZ': (4, 4, 4), 'F': (7, 7, 7),
                  'N': (6, 6, 6), 'CZ': (4, 4, 4), 'R': (9, 9, 9),
                  'U': (0, '_', '_'), 'V': (7, 7, 7), 'CS': (4, 4, 4),
                  'Z': (4, 4, 4), 'SZ': (4, 4, 4), 'TSCH': (4, 4, 4),
                  'KH': (5, 5, 5), 'ST': (2, 43, 43), 'KS': (5, 54, 54),
                  'SH': (4, 4, 4), 'SC': (2, 4, 4), 'SD': (2, 43, 43),
                  'DZ': (4, 4, 4), 'ZHD': (2, 43, 43), 'DT': (3, 3, 3),
                  'ZSH': (4, 4, 4), 'DS': (4, 4, 4), 'TZ': (4, 4, 4),
                  'TS': (4, 4, 4), 'TH': (3, 3, 3), 'TC': (4, 4, 4),
                  'A': (0, '_', '_'), 'E': (0, '_', '_'), 'I': (0, '_', '_'),
                  'AJ': (0, 1, '_'), 'M': (6, 6, 6), 'Q': (5, 5, 5),
                  'AU': (0, 7, '_'), 'IO': (1, '_', '_'), 'AY': (0, 1, '_'),
                  'IE': (1, '_', '_'), 'ZSCH': (4, 4, 4),
                  'CH': ((5, 4), (5, 4), (5, 4)),
                  'CK': ((5, 45), (5, 45), (5, 45)),
                  'C': ((5, 4), (5, 4), (5, 4)),
                  'J': ((1, 4), ('_', 4), ('_', 4)),
                  'RZ': ((94, 4), (94, 4), (94, 4)),
                  'RS': ((94, 4), (94, 4), (94, 4))}

    # For each initial letter, the candidate substrings in the order they
    # must be tried (longer sequences before their prefixes). Every tuple
    # ends with the bare letter, so a match is always found.
    _dms_order = {'A': ('AI', 'AJ', 'AU', 'AY', 'A'),
                  'B': ('B',),
                  'C': ('CHS', 'CSZ', 'CZS', 'CH', 'CK', 'CS', 'CZ', 'C'),
                  'D': ('DRS', 'DRZ', 'DSH', 'DSZ', 'DZH', 'DZS', 'DS', 'DT',
                        'DZ', 'D'),
                  'E': ('EI', 'EJ', 'EU', 'EY', 'E'),
                  'F': ('FB', 'F'),
                  'G': ('G',),
                  'H': ('H',),
                  'I': ('IA', 'IE', 'IO', 'IU', 'I'),
                  'J': ('J',),
                  'K': ('KH', 'KS', 'K'),
                  'L': ('L',),
                  'M': ('MN', 'M'),
                  'N': ('NM', 'N'),
                  'O': ('OI', 'OJ', 'OY', 'O'),
                  'P': ('PF', 'PH', 'P'),
                  'Q': ('Q',),
                  'R': ('RS', 'RZ', 'R'),
                  'S': ('SCHTSCH', 'SCHTCH', 'SCHTSH', 'SHTCH', 'SHTSH',
                        'STSCH', 'SCHD', 'SCHT', 'SHCH', 'STCH', 'STRS',
                        'STRZ', 'STSH', 'SZCS', 'SZCZ', 'SCH', 'SHD', 'SHT',
                        'SZD', 'SZT', 'SC', 'SD', 'SH', 'ST', 'SZ', 'S'),
                  'T': ('TTSCH', 'TSCH', 'TTCH', 'TTSZ', 'TCH', 'THS', 'TRS',
                        'TRZ', 'TSH', 'TSZ', 'TTS', 'TTZ', 'TZS', 'TC', 'TH',
                        'TS', 'TZ', 'T'),
                  'U': ('UE', 'UI', 'UJ', 'UY', 'U'),
                  'V': ('V',),
                  'W': ('W',),
                  'X': ('X',),
                  'Y': ('Y',),
                  'Z': ('ZHDZH', 'ZDZH', 'ZSCH', 'ZDZ', 'ZHD', 'ZSH', 'ZD',
                        'ZH', 'ZS', 'Z')}

    _vowels = frozenset('AEIJOUY')
    _letters = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    dms = ['']  # initialize empty code list

    # Require a maxlength of at least 6 and not more than 64
    if maxlength is not None:
        maxlength = min(max(6, maxlength), 64)
    else:
        maxlength = 64

    # uppercase, normalize, decompose, and filter non-A-Z
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in _letters)

    # Nothing to convert, return base case
    if not word:
        if zero_pad:
            return {'0'*maxlength}
        return {'0'}

    # Reverse word if computing Reverse Soundex
    if reverse:
        word = word[::-1]

    pos = 0
    while pos < len(word):
        # Iterate through _dms_order, which specifies the possible substrings
        # for which codes exist in the Daitch-Mokotoff coding
        for sstr in _dms_order[word[pos]]:  # pragma: no branch
            if word.startswith(sstr, pos):
                # Having determined a valid substring start, retrieve the code
                dm_val = _dms_table[sstr]
                following = pos + len(sstr)

                # Having retrieved the code (triple), determine the correct
                # positional variant (first, pre-vocalic, elsewhere)
                if pos == 0:
                    dm_val = dm_val[0]
                elif following < len(word) and word[following] in _vowels:
                    dm_val = dm_val[1]
                else:
                    dm_val = dm_val[2]

                # Build the code strings; an ambiguous substring doubles the
                # number of candidate codes
                if isinstance(dm_val, tuple):
                    dms = ([code + text_type(dm_val[0]) for code in dms] +
                           [code + text_type(dm_val[1]) for code in dms])
                else:
                    dms = [code + text_type(dm_val) for code in dms]
                pos = following
                break

    # Filter out double letters and _ placeholders
    codes = (''.join(c for c in _delete_consecutive_repeats(code) if c != '_')
             for code in dms)

    # Trim codes and return set
    if zero_pad:
        return {(code + ('0'*maxlength))[:maxlength] for code in codes}
    return {code[:maxlength] for code in codes}
498
499
500
def koelner_phonetik(word):
    """Return the Kölner Phonetik (numeric output) code for a word.

    Based on the algorithm described at
    https://de.wikipedia.org/wiki/Kölner_Phonetik

    While the output code is numeric, it is still a str because 0s can lead
    the code.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as a numeric string (empty if the
        word contains no letters A-Z)
    :rtype: str

    >>> koelner_phonetik('Christopher')
    '478237'
    >>> koelner_phonetik('Niall')
    '65'
    >>> koelner_phonetik('Smith')
    '862'
    >>> koelner_phonetik('Schmidt')
    '862'
    >>> koelner_phonetik('Müller')
    '657'
    >>> koelner_phonetik('Zimmermann')
    '86766'
    """
    # pylint: disable=too-many-branches
    def _after(word, i, letters):
        """Return True if word[i] follows one of the supplied letters."""
        return i > 0 and word[i-1] in letters

    def _before(word, i, letters):
        """Return True if word[i] precedes one of the supplied letters."""
        return i+1 < len(word) and word[i+1] in letters

    _letters = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    _vowels = frozenset('AEIJYOU')
    _dt = frozenset('DT')
    _fvw = frozenset('FVW')
    _gkq = frozenset('GKQ')
    _ckq = frozenset('CKQ')
    _mn = frozenset('MN')
    _sz = frozenset('SZ')
    _csz = frozenset('CSZ')
    _c_hard_initial = frozenset('AHKLOQRUX')  # initial C sounds like K
    _c_hard_medial = frozenset('AHKOQUX')  # non-initial C sounds like K

    word = text_type(word.upper())
    word = word.replace('ß', 'SS')

    # Expand umlauts before NFKD: afterwards they are decomposed into a base
    # letter plus a combining mark and these replacements could never match.
    word = word.replace('Ä', 'AE')
    word = word.replace('Ö', 'OE')
    word = word.replace('Ü', 'UE')
    word = unicodedata.normalize('NFKD', word)
    word = ''.join(c for c in word if c in _letters)

    # Nothing to convert, return base case
    if not word:
        return ''

    codes = []
    for i, char in enumerate(word):
        if char in _vowels:
            codes.append('0')
        elif char == 'B':
            codes.append('1')
        elif char == 'P':
            codes.append('3' if _before(word, i, 'H') else '1')
        elif char in _dt:
            codes.append('8' if _before(word, i, _csz) else '2')
        elif char in _fvw:
            codes.append('3')
        elif char in _gkq:
            codes.append('4')
        elif char == 'C':
            if _after(word, i, _sz):
                codes.append('8')
            elif i == 0:
                codes.append('4' if _before(word, i, _c_hard_initial)
                             else '8')
            elif _before(word, i, _c_hard_medial):
                codes.append('4')
            else:
                codes.append('8')
        elif char == 'X':
            codes.append('8' if _after(word, i, _ckq) else '48')
        elif char == 'L':
            codes.append('5')
        elif char in _mn:
            codes.append('6')
        elif char == 'R':
            codes.append('7')
        elif char in _sz:
            codes.append('8')
        # H has no code of its own and is skipped

    sdx = _delete_consecutive_repeats(''.join(codes))

    # 0 (vowel) is only retained at the very start of the code
    if sdx:
        sdx = sdx[0] + sdx[1:].replace('0', '')

    return sdx
607
608
609
def koelner_phonetik_num_to_alpha(num):
    """Convert a Kölner Phonetik code from numeric to alphabetic.

    The value is converted to text first, so an int is accepted as well; a
    str should be preferred because an int cannot carry a leading 0. Any
    character other than the digits 0-8 is ignored.

    :param str num: a numeric Kölner Phonetik representation
    :returns: an alphabetic representation of the same word
    :rtype: str

    >>> koelner_phonetik_num_to_alpha(862)
    'SNT'
    >>> koelner_phonetik_num_to_alpha(657)
    'NLR'
    >>> koelner_phonetik_num_to_alpha(86766)
    'SNRNN'
    """
    _koelner_digits = '012345678'
    _koelner_digitset = frozenset(_koelner_digits)
    _koelner_num_translation = dict(zip((ord(digit) for digit in
                                         _koelner_digits),
                                        'APTFKLNRS'))
    num = ''.join(c for c in text_type(num) if c in _koelner_digitset)
    return num.translate(_koelner_num_translation)
627
628
629
def koelner_phonetik_alpha(word):
    """Return the Kölner Phonetik (alphabetic output) code for a word.

    The numeric code is computed with koelner_phonetik() and then converted
    with koelner_phonetik_num_to_alpha().

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as an alphabetic string
    :rtype: str

    >>> koelner_phonetik_alpha('Smith')
    'SNT'
    >>> koelner_phonetik_alpha('Schmidt')
    'SNT'
    >>> koelner_phonetik_alpha('Müller')
    'NLR'
    >>> koelner_phonetik_alpha('Zimmermann')
    'SNRNN'
    """
    return koelner_phonetik_num_to_alpha(koelner_phonetik(word))
646
647
648
def nysiis(word, maxlength=6, modified=False):
    """Return the NYSIIS code for a word.

    A description of the New York State Identification and Intelligence System
    algorithm can be found at
    https://en.wikipedia.org/wiki/New_York_State_Identification_and_Intelligence_System

    The modified version of this algorithm is described in Appendix B of
    Lynch, Billy T. and William L. Arends. `Selection of a Surname Coding
    Procedure for the SRS Record Linkage System.` Statistical Reporting
    Service, U.S. Department of Agriculture, Washington, D.C. February 1977.
    https://naldc.nal.usda.gov/download/27833/PDF

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 6) of the code to return;
        values below 6 are raised to 6, and a false value (None or 0) or
        infinity disables truncation
    :param bool modified: indicates whether to use USDA modified NYSIIS
    :returns: the NYSIIS value; an empty string if nothing codable remains,
        or the literal 'ERROR' for a modified-mode word ending in JR or SR
    :rtype: str

    >>> nysiis('Christopher')
    'CRASTA'
    >>> nysiis('Niall')
    'NAL'
    >>> nysiis('Smith')
    'SNAT'
    >>> nysiis('Schmidt')
    'SNAD'

    >>> nysiis('Christopher', maxlength=_INFINITY)
    'CRASTAFAR'

    >>> nysiis('Christopher', maxlength=8, modified=True)
    'CRASTAFA'
    >>> nysiis('Niall', maxlength=8, modified=True)
    'NAL'
    >>> nysiis('Smith', maxlength=8, modified=True)
    'SNAT'
    >>> nysiis('Schmidt', maxlength=8, modified=True)
    'SNAD'
    """
    # Require a maxlength of at least 6
    if maxlength:
        maxlength = max(6, maxlength)

    _vowels = {'A', 'E', 'I', 'O', 'U'}

    word = ''.join(c for c in word.upper() if c.isalpha())
    word = word.replace('ß', 'SS')

    # exit early if there are no alphas
    if not word:
        return ''

    # Only consulted in modified mode, but always bound so that no code path
    # can reach it undefined.
    original_first_char = word[0]

    # Transcode the start of the word
    if word[:3] == 'MAC':
        word = 'MCC'+word[3:]
    elif word[:2] == 'KN':
        word = 'NN'+word[2:]
    elif word[:1] == 'K':
        word = 'C'+word[1:]
    elif word[:2] in {'PH', 'PF'}:
        word = 'FF'+word[2:]
    elif word[:3] == 'SCH':
        word = 'SSS'+word[3:]
    elif modified:
        if word[:2] == 'WR':
            word = 'RR'+word[2:]
        elif word[:2] == 'RH':
            word = 'RR'+word[2:]
        elif word[:2] == 'DG':
            word = 'GG'+word[2:]
        elif word[:1] in _vowels:
            word = 'A'+word[1:]

    # This may leave the word empty (e.g. for 'S'); everything below uses
    # slices rather than indexing so that case yields '' instead of raising.
    if modified and word[-1:] in {'S', 'Z'}:
        word = word[:-1]

    # Transcode the end of the word
    if word[-2:] == 'EE' or word[-2:] == 'IE' or (modified and
                                                  word[-2:] == 'YE'):
        word = word[:-2]+'Y'
    elif word[-2:] in {'DT', 'RT', 'RD'}:
        word = word[:-2]+'D'
    elif word[-2:] in {'NT', 'ND'}:
        word = word[:-2]+('N' if modified else 'D')
    elif modified:
        if word[-2:] == 'IX':
            word = word[:-2]+'ICK'
        elif word[-2:] == 'EX':
            word = word[:-2]+'ECK'
        elif word[-2:] in {'JR', 'SR'}:
            return 'ERROR'  # TODO: decide how best to return an error

    key = word[:1]

    # Rewrite the word in place from the second character on, appending each
    # transcoded piece to the key. `skip` counts the extra characters a
    # multi-character rule has already consumed.
    skip = 0
    for i in range(1, len(word)):
        if i >= len(word):
            # the word can shrink while looping (KN -> N)
            continue
        elif skip:
            skip -= 1
            continue
        elif word[i:i+2] == 'EV':
            word = word[:i] + 'AF' + word[i+2:]
            skip = 1
        elif word[i] in _vowels:
            word = word[:i] + 'A' + word[i+1:]
        elif modified and i != len(word)-1 and word[i] == 'Y':
            word = word[:i] + 'A' + word[i+1:]
        elif word[i] == 'Q':
            word = word[:i] + 'G' + word[i+1:]
        elif word[i] == 'Z':
            word = word[:i] + 'S' + word[i+1:]
        elif word[i] == 'M':
            word = word[:i] + 'N' + word[i+1:]
        elif word[i:i+2] == 'KN':
            word = word[:i] + 'N' + word[i+2:]
        elif word[i] == 'K':
            word = word[:i] + 'C' + word[i+1:]
        elif modified and i == len(word)-3 and word[i:i+3] == 'SCH':
            word = word[:i] + 'SSA'
            skip = 2
        elif word[i:i+3] == 'SCH':
            word = word[:i] + 'SSS' + word[i+3:]
            skip = 2
        elif modified and i == len(word)-2 and word[i:i+2] == 'SH':
            word = word[:i] + 'SA'
            skip = 1
        elif word[i:i+2] == 'SH':
            word = word[:i] + 'SS' + word[i+2:]
            skip = 1
        elif word[i:i+2] == 'PH':
            word = word[:i] + 'FF' + word[i+2:]
            skip = 1
        elif modified and word[i:i+3] == 'GHT':
            word = word[:i] + 'TTT' + word[i+3:]
            skip = 2
        elif modified and word[i:i+2] == 'DG':
            word = word[:i] + 'GG' + word[i+2:]
            skip = 1
        elif modified and word[i:i+2] == 'WR':
            word = word[:i] + 'RR' + word[i+2:]
            skip = 1
        elif word[i] == 'H' and (word[i-1] not in _vowels or
                                 word[i+1:i+2] not in _vowels):
            word = word[:i] + word[i-1] + word[i+1:]
        elif word[i] == 'W' and word[i-1] in _vowels:
            word = word[:i] + word[i-1] + word[i+1:]

        if word[i:i+skip+1] != key[-1:]:
            key += word[i:i+skip+1]

    key = _delete_consecutive_repeats(key)

    # Clean up the end of the key
    if key[-1:] == 'S':
        key = key[:-1]
    if key[-2:] == 'AY':
        key = key[:-2] + 'Y'
    if key[-1:] == 'A':
        key = key[:-1]
    if modified and key[:1] == 'A':
        key = original_first_char + key[1:]

    if maxlength and maxlength < _INFINITY:
        key = key[:maxlength]

    return key
816
817
818
def mra(word):
    """Return the MRA personal numeric identifier (PNI) for a word.

    The Western Airlines Surname Match Rating Algorithm is described on
    page 18 of
    https://archive.org/details/accessingindivid00moor

    The word is uppercased and 'ß' is replaced by 'SS'. Every A, E, I, O or U
    after the first character is then dropped, the result is passed through
    ``_delete_consecutive_repeats``, and anything longer than six characters
    is reduced to its first three plus its last three characters.

    :param str word: the word to transform
    :returns: the MRA PNI (a falsy ``word`` is returned unchanged)
    :rtype: str

    >>> mra('Christopher')
    'CHRPHR'
    >>> mra('Niall')
    'NL'
    >>> mra('Smith')
    'SMTH'
    >>> mra('Schmidt')
    'SCHMDT'
    """
    if not word:
        return word

    vowels = frozenset('AEIOU')
    upper = word.upper().replace('ß', 'SS')

    # The leading character is always kept, even when it is a vowel.
    head, tail = upper[0], upper[1:]
    consonants = [char for char in tail if char not in vowels]
    codex = _delete_consecutive_repeats(head + ''.join(consonants))

    if len(codex) <= 6:
        return codex
    # Over-long codes keep only their two ends.
    return codex[:3] + codex[-3:]
848
849
850
def metaphone(word, maxlength=_INFINITY):
    """Return the Metaphone code for a word.

    Based on Lawrence Philips' Pick BASIC code from 1990:
    http://aspell.net/metaphone/metaphone.basic
    This incorporates some corrections to the above code, particularly
    some of those suggested by Michael Kuhn in:
    http://aspell.net/metaphone/metaphone-kuhn.txt

    :param str word: the word to transform
    :param int maxlength: the maximum length of the returned Metaphone code
        (defaults to unlimited, but in Philips' original implementation
        this was 4); values below 4 are raised to 4 and None is treated
        as 64
    :returns: the Metaphone value, never longer than the effective maxlength
        (an empty string if the word contains no alphanumeric characters)
    :rtype: str


    >>> metaphone('Christopher')
    'KRSTFR'
    >>> metaphone('Niall')
    'NL'
    >>> metaphone('Smith')
    'SM0'
    >>> metaphone('Schmidt')
    'SKMTT'
    """
    # pylint: disable=too-many-branches
    _vowels = frozenset('AEIOU')
    _frontv = frozenset('EIY')
    # As in variable sound--those modified by adding an "h"
    _varson = frozenset('CSPTG')
    # Letters that are NOT dropped when they repeat the previous letter
    _keep_doubled = frozenset('GT')
    # Letters that are copied to the code unchanged
    _unchanged = frozenset('FJLMNR')
    _oa = frozenset('OA')
    _wy = frozenset('WY')
    # Initial pairs whose first letter is dropped
    _drop_first = frozenset(['PN', 'AE', 'KN', 'GN', 'WR'])

    # Require a maxlength of at least 4
    if maxlength is not None:
        maxlength = max(4, maxlength)
    else:
        maxlength = 64

    # Delete nonalphanumeric characters and make all caps
    ename = ''.join(c for c in word.upper() if c.isalnum())
    ename = ename.replace('ß', 'SS')

    if not ename:
        return ''

    # Normalize the start of the word
    if ename[0:2] in _drop_first:
        ename = ename[1:]
    elif ename[0] == 'X':
        ename = 'S' + ename[1:]
    elif ename[0:2] == 'WH':
        ename = 'W' + ename[2:]

    # Convert to metaph
    elen = len(ename)-1  # index of the last character
    metaph = ''
    for i, char in enumerate(ename):
        if len(metaph) >= maxlength:
            break
        # Skip the second of two identical letters (except G and T)
        if char not in _keep_doubled and i > 0 and ename[i-1] == char:
            continue

        # Only a word-initial vowel is encoded; later vowels match none of
        # the branches below and so contribute nothing.
        if char in _vowels and i == 0:
            metaph = char

        elif char == 'B':
            # Dropped only as the final letter after 'M'
            if i != elen or ename[i-1] != 'M':
                metaph += char

        elif char == 'C':
            if not (i > 0 and ename[i-1] == 'S' and ename[i+1:i+2] in _frontv):
                if ename[i+1:i+3] == 'IA':
                    metaph += 'X'
                elif ename[i+1:i+2] in _frontv:
                    metaph += 'S'
                elif i > 0 and ename[i-1:i+2] == 'SCH':
                    metaph += 'K'
                elif ename[i+1:i+2] == 'H':
                    if i == 0 and i+1 < elen and ename[i+2:i+3] not in _vowels:
                        metaph += 'K'
                    else:
                        metaph += 'X'
                else:
                    metaph += 'K'

        elif char == 'D':
            if ename[i+1:i+2] == 'G' and ename[i+2:i+3] in _frontv:
                metaph += 'J'
            else:
                metaph += 'T'

        elif char == 'G':
            if ename[i+1:i+2] == 'H' and not (i+1 == elen or
                                              ename[i+2:i+3] not in _vowels):
                continue
            elif i > 0 and ((i+1 == elen and ename[i+1] == 'N') or
                            (i+3 == elen and ename[i+1:i+4] == 'NED')):
                continue
            elif (i-1 > 0 and i+1 <= elen and ename[i-1] == 'D' and
                  ename[i+1] in _frontv):
                continue
            elif ename[i+1:i+2] == 'G':
                continue
            elif ename[i+1:i+2] in _frontv:
                if i == 0 or ename[i-1] != 'G':
                    metaph += 'J'
                else:
                    metaph += 'K'
            else:
                metaph += 'K'

        elif char == 'H':
            if ((i > 0 and ename[i-1] in _vowels and
                 ename[i+1:i+2] not in _vowels)):
                continue
            elif i > 0 and ename[i-1] in _varson:
                continue
            else:
                metaph += 'H'

        elif char in _unchanged:
            metaph += char

        elif char == 'K':
            if i > 0 and ename[i-1] == 'C':
                continue
            else:
                metaph += 'K'

        elif char == 'P':
            if ename[i+1:i+2] == 'H':
                metaph += 'F'
            else:
                metaph += 'P'

        elif char == 'Q':
            metaph += 'K'

        elif char == 'S':
            if ((i > 0 and i+2 <= elen and ename[i+1] == 'I' and
                 ename[i+2] in _oa)):
                metaph += 'X'
            elif ename[i+1:i+2] == 'H':
                metaph += 'X'
            else:
                metaph += 'S'

        elif char == 'T':
            if ((i > 0 and i+2 <= elen and ename[i+1] == 'I' and
                 ename[i+2] in _oa)):
                metaph += 'X'
            elif ename[i+1:i+2] == 'H':
                metaph += '0'
            elif ename[i+1:i+3] != 'CH':
                if ename[i-1:i] != 'T':
                    metaph += 'T'

        elif char == 'V':
            metaph += 'F'

        elif char in _wy:
            if ename[i+1:i+2] in _vowels:
                metaph += char

        elif char == 'X':
            metaph += 'KS'

        elif char == 'Z':
            metaph += 'S'

    # 'X' appends two characters at once, so the guard at the top of the loop
    # can let the code overshoot maxlength by one. Trim it to honour the
    # documented limit. (The comparison is False for an infinite maxlength,
    # so int() is only ever applied to a finite value.)
    if len(metaph) > maxlength:
        metaph = metaph[:int(maxlength)]

    return metaph
1020
1021
1022
def double_metaphone(word, maxlength=_INFINITY):
1023
    """Return the Double Metaphone code for a word.
1024
1025
    Based on Lawrence Philips' (Visual) C++ code from 1999:
1026
    http://aspell.net/metaphone/dmetaph.cpp
1027
1028
    :param word: the word to transform
1029
    :param maxlength: the maximum length of the returned Double Metaphone codes
1030
        (defaults to unlimited, but in Philips' original implementation this
1031
        was 4)
1032
    :returns: the Double Metaphone value(s)
1033
    :rtype: tuple
1034
1035
    >>> double_metaphone('Christopher')
1036
    ('KRSTFR', '')
1037
    >>> double_metaphone('Niall')
1038
    ('NL', '')
1039
    >>> double_metaphone('Smith')
1040
    ('SM0', 'XMT')
1041
    >>> double_metaphone('Schmidt')
1042
    ('XMT', 'SMT')
1043
    """
1044
    # pylint: disable=too-many-branches
1045
    # Require a maxlength of at least 4
1046
    if maxlength is not None:
1047
        maxlength = max(4, maxlength)
1048
    else:
1049
        maxlength = 64
1050
1051
    primary = ''
1052
    secondary = ''
1053
1054
    def _slavo_germanic():
1055
        """Return True if the word appears to be Slavic or Germanic."""
1056
        if 'W' in word or 'K' in word or 'CZ' in word:
1057
            return True
1058
        return False
1059
1060
    def _metaph_add(pri, sec=''):
1061
        """Return a new metaphone tuple with the supplied elements."""
1062
        newpri = primary
1063
        newsec = secondary
1064
        if pri:
1065
            newpri += pri
1066
        if sec:
1067
            if sec != ' ':
1068
                newsec += sec
1069
        else:
1070
            newsec += pri
1071
        return (newpri, newsec)
1072
1073
    def _is_vowel(pos):
1074
        """Return True if the character at word[pos] is a vowel."""
1075
        if pos >= 0 and word[pos] in frozenset('AEIOUY'):
1076
            return True
1077
        return False
1078
1079
    def _get_at(pos):
1080
        """Return the character at word[pos]."""
1081
        return word[pos]
1082
1083
    def _string_at(pos, slen, substrings):
1084
        """Return True if word[pos:pos+slen] is in substrings."""
1085
        if pos < 0:
1086
            return False
1087
        return word[pos:pos+slen] in substrings
1088
1089
    current = 0
1090
    length = len(word)
1091
    if length < 1:
1092
        return ('', '')
1093
    last = length - 1
1094
1095
    word = word.upper()
1096
    word = word.replace('ß', 'SS')
1097
1098
    # Pad the original string so that we can index beyond the edge of the world
1099
    word += '     '
1100
1101
    # Skip these when at start of word
1102
    if word[0:2] in frozenset(['GN', 'KN', 'PN', 'WR', 'PS']):
1103
        current += 1
1104
1105
    # Initial 'X' is pronounced 'Z' e.g. 'Xavier'
1106
    if _get_at(0) == 'X':
1107
        (primary, secondary) = _metaph_add('S')  # 'Z' maps to 'S'
1108
        current += 1
1109
1110
    # Main loop
1111
    while True:
0 ignored issues
show
unused-code introduced by
Too many nested blocks (6/5)
Loading history...
1112
        if current >= length:
1113
            break
1114
1115
        if _get_at(current) in frozenset('AEIOUY'):
1116
            if current == 0:
1117
                # All init vowels now map to 'A'
1118
                (primary, secondary) = _metaph_add('A')
1119
            current += 1
1120
            continue
1121
1122
        elif _get_at(current) == 'B':
1123
            # "-mb", e.g", "dumb", already skipped over...
1124
            (primary, secondary) = _metaph_add('P')
1125
            if _get_at(current + 1) == 'B':
1126
                current += 2
1127
            else:
1128
                current += 1
1129
            continue
1130
1131
        elif _get_at(current) == 'Ç':
1132
            (primary, secondary) = _metaph_add('S')
1133
            current += 1
1134
            continue
1135
1136
        elif _get_at(current) == 'C':
1137
            # Various Germanic
1138
            if (current > 1 and not _is_vowel(current - 2) and
0 ignored issues
show
best-practice introduced by
Too many boolean expressions in if statement (6/5)
Loading history...
1139
                    _string_at((current - 1), 3, ['ACH']) and
1140
                    ((_get_at(current + 2) != 'I') and
1141
                     ((_get_at(current + 2) != 'E') or
1142
                      _string_at((current - 2), 6,
1143
                                 frozenset(['BACHER', 'MACHER']))))):
1144
                (primary, secondary) = _metaph_add('K')
1145
                current += 2
1146
                continue
1147
1148
            # Special case 'caesar'
1149
            elif current == 0 and _string_at(current, 6, ['CAESAR']):
1150
                (primary, secondary) = _metaph_add('S')
1151
                current += 2
1152
                continue
1153
1154
            # Italian 'chianti'
1155
            elif _string_at(current, 4, ['CHIA']):
1156
                (primary, secondary) = _metaph_add('K')
1157
                current += 2
1158
                continue
1159
1160
            elif _string_at(current, 2, ['CH']):
1161
                # Find 'Michael'
1162
                if current > 0 and _string_at(current, 4, ['CHAE']):
1163
                    (primary, secondary) = _metaph_add('K', 'X')
1164
                    current += 2
1165
                    continue
1166
1167
                # Greek roots e.g. 'chemistry', 'chorus'
1168
                elif (current == 0 and
1169
                      (_string_at((current + 1), 5,
1170
                                  frozenset(['HARAC', 'HARIS'])) or
1171
                       _string_at((current + 1), 3,
1172
                                  frozenset(['HOR', 'HYM', 'HIA', 'HEM']))) and
1173
                      not _string_at(0, 5, ['CHORE'])):
1174
                    (primary, secondary) = _metaph_add('K')
1175
                    current += 2
1176
                    continue
1177
1178
                # Germanic, Greek, or otherwise 'ch' for 'kh' sound
1179
                elif ((_string_at(0, 4, frozenset(['VAN ', 'VON '])) or
0 ignored issues
show
best-practice introduced by
Too many boolean expressions in if statement (7/5)
Loading history...
1180
                       _string_at(0, 3, ['SCH'])) or
1181
                      # 'architect but not 'arch', 'orchestra', 'orchid'
1182
                      _string_at((current - 2), 6,
1183
                                 frozenset(['ORCHES', 'ARCHIT', 'ORCHID'])) or
1184
                      _string_at((current + 2), 1, frozenset(['T', 'S'])) or
1185
                      ((_string_at((current - 1), 1,
1186
                                   frozenset(['A', 'O', 'U', 'E'])) or
1187
                        (current == 0)) and
1188
                       # e.g., 'wachtler', 'wechsler', but not 'tichner'
1189
                       _string_at((current + 2), 1,
1190
                                  frozenset(['L', 'R', 'N', 'M', 'B', 'H',
1191
                                             'F', 'V', 'W', ' '])))):
1192
                    (primary, secondary) = _metaph_add('K')
1193
1194
                else:
1195
                    if current > 0:
1196
                        if _string_at(0, 2, ['MC']):
1197
                            # e.g., "McHugh"
1198
                            (primary, secondary) = _metaph_add('K')
1199
                        else:
1200
                            (primary, secondary) = _metaph_add('X', 'K')
1201
                    else:
1202
                        (primary, secondary) = _metaph_add('X')
1203
1204
                current += 2
1205
                continue
1206
1207
            # e.g, 'czerny'
1208
            elif (_string_at(current, 2, ['CZ']) and
1209
                  not _string_at((current - 2), 4, ['WICZ'])):
1210
                (primary, secondary) = _metaph_add('S', 'X')
1211
                current += 2
1212
                continue
1213
1214
            # e.g., 'focaccia'
1215
            elif _string_at((current + 1), 3, ['CIA']):
1216
                (primary, secondary) = _metaph_add('X')
1217
                current += 3
1218
1219
            # double 'C', but not if e.g. 'McClellan'
1220
            elif (_string_at(current, 2, ['CC']) and
1221
                  not ((current == 1) and (_get_at(0) == 'M'))):
1222
                # 'bellocchio' but not 'bacchus'
1223
                if ((_string_at((current + 2), 1,
1224
                                frozenset(['I', 'E', 'H'])) and
1225
                     not _string_at((current + 2), 2, ['HU']))):
1226
                    # 'accident', 'accede' 'succeed'
1227
                    if ((((current == 1) and _get_at(current - 1) == 'A') or
1228
                         _string_at((current - 1), 5,
1229
                                    frozenset(['UCCEE', 'UCCES'])))):
1230
                        (primary, secondary) = _metaph_add('KS')
1231
                    # 'bacci', 'bertucci', other italian
1232
                    else:
1233
                        (primary, secondary) = _metaph_add('X')
1234
                    current += 3
1235
                    continue
1236
                else:  # Pierce's rule
1237
                    (primary, secondary) = _metaph_add('K')
1238
                    current += 2
1239
                    continue
1240
1241
            elif _string_at(current, 2, frozenset(['CK', 'CG', 'CQ'])):
1242
                (primary, secondary) = _metaph_add('K')
1243
                current += 2
1244
                continue
1245
1246
            elif _string_at(current, 2, frozenset(['CI', 'CE', 'CY'])):
1247
                # Italian vs. English
1248
                if _string_at(current, 3, frozenset(['CIO', 'CIE', 'CIA'])):
1249
                    (primary, secondary) = _metaph_add('S', 'X')
1250
                else:
1251
                    (primary, secondary) = _metaph_add('S')
1252
                current += 2
1253
                continue
1254
1255
            # else
1256
            else:
1257
                (primary, secondary) = _metaph_add('K')
1258
1259
                # name sent in 'mac caffrey', 'mac gregor
1260
                if _string_at((current + 1), 2, frozenset([' C', ' Q', ' G'])):
1261
                    current += 3
1262
                elif (_string_at((current + 1), 1,
1263
                                 frozenset(['C', 'K', 'Q'])) and
1264
                      not _string_at((current + 1), 2,
1265
                                     frozenset(['CE', 'CI']))):
1266
                    current += 2
1267
                else:
1268
                    current += 1
1269
                continue
1270
1271
        elif _get_at(current) == 'D':
1272
            if _string_at(current, 2, ['DG']):
1273
                if _string_at((current + 2), 1, frozenset(['I', 'E', 'Y'])):
1274
                    # e.g. 'edge'
1275
                    (primary, secondary) = _metaph_add('J')
1276
                    current += 3
1277
                    continue
1278
                else:
1279
                    # e.g. 'edgar'
1280
                    (primary, secondary) = _metaph_add('TK')
1281
                    current += 2
1282
                    continue
1283
1284
            elif _string_at(current, 2, frozenset(['DT', 'DD'])):
1285
                (primary, secondary) = _metaph_add('T')
1286
                current += 2
1287
                continue
1288
1289
            # else
1290
            else:
1291
                (primary, secondary) = _metaph_add('T')
1292
                current += 1
1293
                continue
1294
1295
        elif _get_at(current) == 'F':
1296
            if _get_at(current + 1) == 'F':
1297
                current += 2
1298
            else:
1299
                current += 1
1300
            (primary, secondary) = _metaph_add('F')
1301
            continue
1302
1303
        elif _get_at(current) == 'G':
1304
            if _get_at(current + 1) == 'H':
1305
                if (current > 0) and not _is_vowel(current - 1):
1306
                    (primary, secondary) = _metaph_add('K')
1307
                    current += 2
1308
                    continue
1309
1310
                # 'ghislane', ghiradelli
1311
                elif current == 0:
1312
                    if _get_at(current + 2) == 'I':
1313
                        (primary, secondary) = _metaph_add('J')
1314
                    else:
1315
                        (primary, secondary) = _metaph_add('K')
1316
                    current += 2
1317
                    continue
1318
1319
                # Parker's rule (with some further refinements) - e.g., 'hugh'
1320
                elif (((current > 1) and
0 ignored issues
show
best-practice introduced by
Too many boolean expressions in if statement (6/5)
Loading history...
1321
                       _string_at((current - 2), 1,
1322
                                  frozenset(['B', 'H', 'D']))) or
1323
                      # e.g., 'bough'
1324
                      ((current > 2) and
1325
                       _string_at((current - 3), 1,
1326
                                  frozenset(['B', 'H', 'D']))) or
1327
                      # e.g., 'broughton'
1328
                      ((current > 3) and
1329
                       _string_at((current - 4), 1,
1330
                                  frozenset(['B', 'H'])))):
1331
                    current += 2
1332
                    continue
1333
                else:
1334
                    # e.g. 'laugh', 'McLaughlin', 'cough',
1335
                    #      'gough', 'rough', 'tough'
1336
                    if ((current > 2) and
1337
                            (_get_at(current - 1) == 'U') and
1338
                            (_string_at((current - 3), 1,
1339
                                        frozenset(['C', 'G', 'L', 'R',
1340
                                                   'T'])))):
1341
                        (primary, secondary) = _metaph_add('F')
1342
                    elif (current > 0) and _get_at(current - 1) != 'I':
1343
                        (primary, secondary) = _metaph_add('K')
1344
                    current += 2
1345
                    continue
1346
1347
            elif _get_at(current + 1) == 'N':
1348
                if (current == 1) and _is_vowel(0) and not _slavo_germanic():
1349
                    (primary, secondary) = _metaph_add('KN', 'N')
1350
                # not e.g. 'cagney'
1351
                elif (not _string_at((current + 2), 2, ['EY']) and
1352
                      (_get_at(current + 1) != 'Y') and
1353
                      not _slavo_germanic()):
1354
                    (primary, secondary) = _metaph_add('N', 'KN')
1355
                else:
1356
                    (primary, secondary) = _metaph_add('KN')
1357
                current += 2
1358
                continue
1359
1360
            # 'tagliaro'
1361
            elif (_string_at((current + 1), 2, ['LI']) and
1362
                  not _slavo_germanic()):
1363
                (primary, secondary) = _metaph_add('KL', 'L')
1364
                current += 2
1365
                continue
1366
1367
            # -ges-, -gep-, -gel-, -gie- at beginning
1368
            elif ((current == 0) and
1369
                  ((_get_at(current + 1) == 'Y') or
1370
                   _string_at((current + 1), 2,
1371
                              frozenset(['ES', 'EP', 'EB', 'EL', 'EY', 'IB',
1372
                                         'IL', 'IN', 'IE', 'EI', 'ER'])))):
1373
                (primary, secondary) = _metaph_add('K', 'J')
1374
                current += 2
1375
                continue
1376
1377
            #  -ger-,  -gy-
1378
            elif ((_string_at((current + 1), 2, ['ER']) or
1379
                   (_get_at(current + 1) == 'Y')) and not
1380
                  _string_at(0, 6,
1381
                             frozenset(['DANGER', 'RANGER', 'MANGER'])) and not
1382
                  _string_at((current - 1), 1, frozenset(['E', 'I'])) and not
1383
                  _string_at((current - 1), 3, frozenset(['RGY', 'OGY']))):
1384
                (primary, secondary) = _metaph_add('K', 'J')
1385
                current += 2
1386
                continue
1387
1388
            #  italian e.g, 'biaggi'
1389
            elif (_string_at((current + 1), 1, frozenset(['E', 'I', 'Y'])) or
1390
                  _string_at((current - 1), 4, frozenset(['AGGI', 'OGGI']))):
1391
                # obvious germanic
1392
                if (((_string_at(0, 4, frozenset(['VAN ', 'VON '])) or
1393
                      _string_at(0, 3, ['SCH'])) or
1394
                     _string_at((current + 1), 2, ['ET']))):
1395
                    (primary, secondary) = _metaph_add('K')
1396
                elif _string_at((current + 1), 4, ['IER ']):
1397
                    (primary, secondary) = _metaph_add('J')
1398
                else:
1399
                    (primary, secondary) = _metaph_add('J', 'K')
1400
                current += 2
1401
                continue
1402
1403
            else:
1404
                if _get_at(current + 1) == 'G':
1405
                    current += 2
1406
                else:
1407
                    current += 1
1408
                (primary, secondary) = _metaph_add('K')
1409
                continue
1410
1411
        elif _get_at(current) == 'H':
1412
            # only keep if first & before vowel or btw. 2 vowels
1413
            if ((((current == 0) or _is_vowel(current - 1)) and
1414
                 _is_vowel(current + 1))):
1415
                (primary, secondary) = _metaph_add('H')
1416
                current += 2
1417
            else:  # also takes care of 'HH'
1418
                current += 1
1419
            continue
1420
1421
        elif _get_at(current) == 'J':
1422
            # obvious spanish, 'jose', 'san jacinto'
1423
            if _string_at(current, 4, ['JOSE']) or _string_at(0, 4, ['SAN ']):
1424
                if ((((current == 0) and (_get_at(current + 4) == ' ')) or
1425
                     _string_at(0, 4, ['SAN ']))):
1426
                    (primary, secondary) = _metaph_add('H')
1427
                else:
1428
                    (primary, secondary) = _metaph_add('J', 'H')
1429
                current += 1
1430
                continue
1431
1432
            elif (current == 0) and not _string_at(current, 4, ['JOSE']):
1433
                # Yankelovich/Jankelowicz
1434
                (primary, secondary) = _metaph_add('J', 'A')
1435
            # Spanish pron. of e.g. 'bajador'
1436
            elif (_is_vowel(current - 1) and
1437
                  not _slavo_germanic() and
1438
                  ((_get_at(current + 1) == 'A') or
1439
                   (_get_at(current + 1) == 'O'))):
1440
                (primary, secondary) = _metaph_add('J', 'H')
1441
            elif current == last:
1442
                (primary, secondary) = _metaph_add('J', ' ')
1443
            elif (not _string_at((current + 1), 1,
1444
                                 frozenset(['L', 'T', 'K', 'S', 'N', 'M', 'B',
1445
                                            'Z'])) and
1446
                  not _string_at((current - 1), 1,
1447
                                 frozenset(['S', 'K', 'L']))):
1448
                (primary, secondary) = _metaph_add('J')
1449
1450
            if _get_at(current + 1) == 'J':  # it could happen!
1451
                current += 2
1452
            else:
1453
                current += 1
1454
            continue
1455
1456
        elif _get_at(current) == 'K':
1457
            if _get_at(current + 1) == 'K':
1458
                current += 2
1459
            else:
1460
                current += 1
1461
            (primary, secondary) = _metaph_add('K')
1462
            continue
1463
1464
        elif _get_at(current) == 'L':
1465
            if _get_at(current + 1) == 'L':
1466
                # Spanish e.g. 'cabrillo', 'gallegos'
1467
                if (((current == (length - 3)) and
1468
                     _string_at((current - 1), 4,
1469
                                frozenset(['ILLO', 'ILLA', 'ALLE']))) or
1470
                        ((_string_at((last - 1), 2, frozenset(['AS', 'OS'])) or
1471
                          _string_at(last, 1, frozenset(['A', 'O']))) and
1472
                         _string_at((current - 1), 4, ['ALLE']))):
1473
                    (primary, secondary) = _metaph_add('L', ' ')
1474
                    current += 2
1475
                    continue
1476
                current += 2
1477
            else:
1478
                current += 1
1479
            (primary, secondary) = _metaph_add('L')
1480
            continue
1481
1482
        elif _get_at(current) == 'M':
1483
            if (((_string_at((current - 1), 3, ['UMB']) and
1484
                  (((current + 1) == last) or
1485
                   _string_at((current + 2), 2, ['ER']))) or
1486
                 # 'dumb', 'thumb'
1487
                 (_get_at(current + 1) == 'M'))):
1488
                current += 2
1489
            else:
1490
                current += 1
1491
            (primary, secondary) = _metaph_add('M')
1492
            continue
1493
1494
        elif _get_at(current) == 'N':
1495
            if _get_at(current + 1) == 'N':
1496
                current += 2
1497
            else:
1498
                current += 1
1499
            (primary, secondary) = _metaph_add('N')
1500
            continue
1501
1502
        elif _get_at(current) == 'Ñ':
1503
            current += 1
1504
            (primary, secondary) = _metaph_add('N')
1505
            continue
1506
1507
        elif _get_at(current) == 'P':
1508
            if _get_at(current + 1) == 'H':
1509
                (primary, secondary) = _metaph_add('F')
1510
                current += 2
1511
                continue
1512
1513
            # also account for "campbell", "raspberry"
1514
            elif _string_at((current + 1), 1, frozenset(['P', 'B'])):
1515
                current += 2
1516
            else:
1517
                current += 1
1518
            (primary, secondary) = _metaph_add('P')
1519
            continue
1520
1521
        elif _get_at(current) == 'Q':
1522
            if _get_at(current + 1) == 'Q':
1523
                current += 2
1524
            else:
1525
                current += 1
1526
            (primary, secondary) = _metaph_add('K')
1527
            continue
1528
1529
        elif _get_at(current) == 'R':
1530
            # french e.g. 'rogier', but exclude 'hochmeier'
1531
            if (((current == last) and
1532
                 not _slavo_germanic() and
1533
                 _string_at((current - 2), 2, ['IE']) and
1534
                 not _string_at((current - 4), 2, frozenset(['ME', 'MA'])))):
1535
                (primary, secondary) = _metaph_add('', 'R')
1536
            else:
1537
                (primary, secondary) = _metaph_add('R')
1538
1539
            if _get_at(current + 1) == 'R':
1540
                current += 2
1541
            else:
1542
                current += 1
1543
            continue
1544
1545
        elif _get_at(current) == 'S':
1546
            # special cases 'island', 'isle', 'carlisle', 'carlysle'
1547
            if _string_at((current - 1), 3, frozenset(['ISL', 'YSL'])):
1548
                current += 1
1549
                continue
1550
1551
            # special case 'sugar-'
1552
            elif (current == 0) and _string_at(current, 5, ['SUGAR']):
1553
                (primary, secondary) = _metaph_add('X', 'S')
1554
                current += 1
1555
                continue
1556
1557
            elif _string_at(current, 2, ['SH']):
1558
                # Germanic
1559
                if _string_at((current + 1), 4,
1560
                              frozenset(['HEIM', 'HOEK', 'HOLM', 'HOLZ'])):
1561
                    (primary, secondary) = _metaph_add('S')
1562
                else:
1563
                    (primary, secondary) = _metaph_add('X')
1564
                current += 2
1565
                continue
1566
1567
            # Italian & Armenian
1568
            elif (_string_at(current, 3, frozenset(['SIO', 'SIA'])) or
1569
                  _string_at(current, 4, ['SIAN'])):
1570
                if not _slavo_germanic():
1571
                    (primary, secondary) = _metaph_add('S', 'X')
1572
                else:
1573
                    (primary, secondary) = _metaph_add('S')
1574
                current += 3
1575
                continue
1576
1577
            # German & anglicisations, e.g. 'smith' match 'schmidt',
1578
            #                               'snider' match 'schneider'
1579
            # also, -sz- in Slavic language although in Hungarian it is
1580
            #       pronounced 's'
1581
            elif (((current == 0) and
1582
                   _string_at((current + 1), 1,
1583
                              frozenset(['M', 'N', 'L', 'W']))) or
1584
                  _string_at((current + 1), 1, ['Z'])):
1585
                (primary, secondary) = _metaph_add('S', 'X')
1586
                if _string_at((current + 1), 1, ['Z']):
1587
                    current += 2
1588
                else:
1589
                    current += 1
1590
                continue
1591
1592
            elif _string_at(current, 2, ['SC']):
1593
                # Schlesinger's rule
1594
                if _get_at(current + 2) == 'H':
1595
                    # dutch origin, e.g. 'school', 'schooner'
1596
                    if _string_at((current + 3), 2,
1597
                                  frozenset(['OO', 'ER', 'EN', 'UY', 'ED',
1598
                                             'EM'])):
1599
                        # 'schermerhorn', 'schenker'
1600
                        if _string_at((current + 3), 2,
1601
                                      frozenset(['ER', 'EN'])):
1602
                            (primary, secondary) = _metaph_add('X', 'SK')
1603
                        else:
1604
                            (primary, secondary) = _metaph_add('SK')
1605
                        current += 3
1606
                        continue
1607
                    else:
1608
                        if (((current == 0) and not _is_vowel(3) and
1609
                             (_get_at(3) != 'W'))):
1610
                            (primary, secondary) = _metaph_add('X', 'S')
1611
                        else:
1612
                            (primary, secondary) = _metaph_add('X')
1613
                        current += 3
1614
                        continue
1615
1616
                elif _string_at((current + 2), 1,
1617
                                frozenset(['I', 'E', 'Y'])):
1618
                    (primary, secondary) = _metaph_add('S')
1619
                    current += 3
1620
                    continue
1621
1622
                # else
1623
                else:
1624
                    (primary, secondary) = _metaph_add('SK')
1625
                    current += 3
1626
                    continue
1627
1628
            else:
1629
                # french e.g. 'resnais', 'artois'
1630
                if (current == last) and _string_at((current - 2), 2,
1631
                                                    frozenset(['AI', 'OI'])):
1632
                    (primary, secondary) = _metaph_add('', 'S')
1633
                else:
1634
                    (primary, secondary) = _metaph_add('S')
1635
1636
                if _string_at((current + 1), 1, frozenset(['S', 'Z'])):
1637
                    current += 2
1638
                else:
1639
                    current += 1
1640
                continue
1641
1642
        elif _get_at(current) == 'T':
1643
            if _string_at(current, 4, ['TION']):
1644
                (primary, secondary) = _metaph_add('X')
1645
                current += 3
1646
                continue
1647
1648
            elif _string_at(current, 3, frozenset(['TIA', 'TCH'])):
1649
                (primary, secondary) = _metaph_add('X')
1650
                current += 3
1651
                continue
1652
1653
            elif (_string_at(current, 2, ['TH']) or
1654
                  _string_at(current, 3, ['TTH'])):
1655
                # special case 'thomas', 'thames' or germanic
1656
                if ((_string_at((current + 2), 2, frozenset(['OM', 'AM'])) or
1657
                     _string_at(0, 4, frozenset(['VAN ', 'VON '])) or
1658
                     _string_at(0, 3, ['SCH']))):
1659
                    (primary, secondary) = _metaph_add('T')
1660
                else:
1661
                    (primary, secondary) = _metaph_add('0', 'T')
1662
                current += 2
1663
                continue
1664
1665
            elif _string_at((current + 1), 1, frozenset(['T', 'D'])):
1666
                current += 2
1667
            else:
1668
                current += 1
1669
            (primary, secondary) = _metaph_add('T')
1670
            continue
1671
1672
        elif _get_at(current) == 'V':
1673
            if _get_at(current + 1) == 'V':
1674
                current += 2
1675
            else:
1676
                current += 1
1677
            (primary, secondary) = _metaph_add('F')
1678
            continue
1679
1680
        elif _get_at(current) == 'W':
1681
            # can also be in middle of word
1682
            if _string_at(current, 2, ['WR']):
1683
                (primary, secondary) = _metaph_add('R')
1684
                current += 2
1685
                continue
1686
            elif ((current == 0) and
1687
                  (_is_vowel(current + 1) or _string_at(current, 2, ['WH']))):
1688
                # Wasserman should match Vasserman
1689
                if _is_vowel(current + 1):
1690
                    (primary, secondary) = _metaph_add('A', 'F')
1691
                else:
1692
                    # need Uomo to match Womo
1693
                    (primary, secondary) = _metaph_add('A')
1694
1695
            # Arnow should match Arnoff
1696
            if ((((current == last) and _is_vowel(current - 1)) or
1697
                 _string_at((current - 1), 5,
1698
                            frozenset(['EWSKI', 'EWSKY', 'OWSKI', 'OWSKY'])) or
1699
                 _string_at(0, 3, ['SCH']))):
1700
                (primary, secondary) = _metaph_add('', 'F')
1701
                current += 1
1702
                continue
1703
            # Polish e.g. 'filipowicz'
1704
            elif _string_at(current, 4, frozenset(['WICZ', 'WITZ'])):
1705
                (primary, secondary) = _metaph_add('TS', 'FX')
1706
                current += 4
1707
                continue
1708
            # else skip it
1709
            else:
1710
                current += 1
1711
                continue
1712
1713
        elif _get_at(current) == 'X':
1714
            # French e.g. breaux
1715
            if (not ((current == last) and
1716
                     (_string_at((current - 3), 3,
1717
                                 frozenset(['IAU', 'EAU'])) or
1718
                      _string_at((current - 2), 2, frozenset(['AU', 'OU']))))):
1719
                (primary, secondary) = _metaph_add('KS')
1720
1721
            if _string_at((current + 1), 1, frozenset(['C', 'X'])):
1722
                current += 2
1723
            else:
1724
                current += 1
1725
            continue
1726
1727
        elif _get_at(current) == 'Z':
1728
            # Chinese Pinyin e.g. 'zhao'
1729
            if _get_at(current + 1) == 'H':
1730
                (primary, secondary) = _metaph_add('J')
1731
                current += 2
1732
                continue
1733
            elif (_string_at((current + 1), 2,
1734
                             frozenset(['ZO', 'ZI', 'ZA'])) or
1735
                  (_slavo_germanic() and ((current > 0) and
1736
                                          _get_at(current - 1) != 'T'))):
1737
                (primary, secondary) = _metaph_add('S', 'TS')
1738
            else:
1739
                (primary, secondary) = _metaph_add('S')
1740
1741
            if _get_at(current + 1) == 'Z':
1742
                current += 2
1743
            else:
1744
                current += 1
1745
            continue
1746
1747
        else:
1748
            current += 1
1749
1750
    if maxlength and maxlength < _INFINITY:
1751
        primary = primary[:maxlength]
1752
        secondary = secondary[:maxlength]
1753
    if primary == secondary:
1754
        secondary = ''
1755
1756
    return (primary, secondary)
1757
1758
1759
def caverphone(word, version=2):
    """Return the Caverphone code for a word.

    A description of version 1 of the algorithm can be found at:
    http://caversham.otago.ac.nz/files/working/ctp060902.pdf

    A description of version 2 of the algorithm can be found at:
    http://caversham.otago.ac.nz/files/working/ctp150804.pdf

    The word is lowercased and stripped of everything but a-z, then run
    through an ordered series of literal replacements. Within that pipeline,
    '2' marks a silent position and '3' marks a vowel position; both are
    removed before the code is padded with '1's.

    :param str word: the word to transform
    :param int version: the version of Caverphone to employ for encoding
        (defaults to 2); any value other than 1 selects the version 2 rules
    :returns: the Caverphone value (10 characters for version 2, 6 for
        version 1)
    :rtype: str

    >>> caverphone('Christopher')
    'KRSTFA1111'
    >>> caverphone('Niall')
    'NA11111111'
    >>> caverphone('Smith')
    'SMT1111111'
    >>> caverphone('Schmidt')
    'SKMT111111'

    >>> caverphone('Christopher', 1)
    'KRSTF1'
    >>> caverphone('Niall', 1)
    'N11111'
    >>> caverphone('Smith', 1)
    'SMT111'
    >>> caverphone('Schmidt', 1)
    'SKMT11'
    """
    vowels = frozenset('aeiou')
    letters = frozenset('abcdefghijklmnopqrstuvwxyz')
    revised = version != 1

    # Ordered literal substitutions applied before vowels are marked.
    consonant_rules = (('cq', '2q'), ('ci', 'si'), ('ce', 'se'),
                       ('cy', 'sy'), ('tch', '2ch'), ('c', 'k'), ('q', 'k'),
                       ('x', 'k'), ('v', 'f'), ('dg', '2g'), ('tio', 'sio'),
                       ('tia', 'sia'), ('d', 't'), ('ph', 'fh'), ('b', 'p'),
                       ('sh', 's2'), ('z', 's'))

    word = ''.join(ch for ch in word.lower() if ch in letters)

    # the main replacement algorithm
    if revised and word.endswith('e'):
        word = word[:-1]
    if word:
        # irregular '-ough' stems: the 'gh' is sounded as 'f'
        for stem in ('cough', 'rough', 'tough'):
            if word.startswith(stem):
                word = stem[:3] + '2f' + word[5:]
        if word.startswith('enough'):
            word = 'enou2f' + word[6:]
        if revised and word.startswith('trough'):
            word = 'trou2f' + word[6:]
        if word.startswith('gn'):
            word = '2n' + word[2:]
        if word.endswith('mb'):
            word = word[:-1] + '2'

        for old, new in consonant_rules:
            word = word.replace(old, new)

        # an initial vowel is kept as 'A'; every other vowel becomes '3'
        if word[0] in vowels:
            word = 'A' + word[1:]
        for vowel in 'aeiou':
            word = word.replace(vowel, '3')

        if revised:
            word = word.replace('j', 'y')
            if word.startswith('y3'):
                word = 'Y3' + word[2:]
            if word.startswith('y'):
                word = 'A' + word[1:]
            word = word.replace('y', '3')

        for old, new in (('3gh3', '3kh3'), ('gh', '22'), ('g', 'k')):
            word = word.replace(old, new)

        # collapse runs of these consonants into one uppercase letter
        for letter in 'stpkfmn':
            word = re.sub(letter + '+', letter.upper(), word)

        word = word.replace('w3', 'W3')
        if not revised:
            word = word.replace('wy', 'Wy')
        word = word.replace('wh3', 'Wh3')
        if not revised:
            word = word.replace('why', 'Why')
        if revised and word.endswith('w'):
            word = word[:-1] + '3'
        word = word.replace('w', '2')

        if word.startswith('h'):
            word = 'A' + word[1:]
        word = word.replace('h', '2')

        # 'r' and 'l' follow the same pattern of rules, 'r' first
        for liquid in 'rl':
            kept = liquid.upper()
            word = word.replace(liquid + '3', kept + '3')
            if not revised:
                word = word.replace(liquid + 'y', kept + 'y')
            elif word.endswith(liquid):
                word = word[:-1] + '3'
            word = word.replace(liquid, '2')

        if not revised:
            word = word.replace('j', 'y')
            word = word.replace('y3', 'Y3')
            word = word.replace('y', '2')

        word = word.replace('2', '')
        if revised and word.endswith('3'):
            word = word[:-1] + 'A'
        word = word.replace('3', '')

    # pad with 1s, then extract the necessary length of code
    code_length = 10 if revised else 6
    return (word + '1' * 10)[:code_length]
1898
1899
1900
def alpha_sis(word, maxlength=14):
    """Return the IBM Alpha Search Inquiry System code for a word.

    Based on the algorithm described in "Accessing individual records from
    personal data files using non-unique identifiers" / Gwendolyn B. Moore,
    et al.; prepared for the Institute for Computer Sciences and Technology,
    National Bureau of Standards, Washington, D.C (1977):
    https://archive.org/stream/accessingindivid00moor#page/15/mode/1up

    A collection is necessary since there can be multiple values for a
    single word. But the collection must be ordered since the first value
    is the primary coding.

    Runs of identical adjacent code digits are collapsed to a single digit,
    whatever the length of the run. Letters with no code (such as non-initial
    vowels) separate runs and are then dropped.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 14);
        clamped to the range [4, 64], with None treated as 64
    :returns: the Alpha SIS value
    :rtype: tuple

    >>> alpha_sis('Christopher')
    ('06401840000000', '07040184000000', '04018400000000')
    >>> alpha_sis('Niall')
    ('02500000000000',)
    >>> alpha_sis('Smith')
    ('03100000000000',)
    >>> alpha_sis('Schmidt')
    ('06310000000000',)
    """
    _alpha_sis_initials = {'GF': '08', 'GM': '03', 'GN': '02', 'KN': '02',
                           'PF': '08', 'PN': '02', 'PS': '00', 'WR': '04',
                           'A': '1', 'E': '1', 'H': '2', 'I': '1', 'J': '3',
                           'O': '1', 'U': '1', 'W': '4', 'Y': '5'}
    _alpha_sis_initials_order = ('GF', 'GM', 'GN', 'KN', 'PF', 'PN', 'PS',
                                 'WR', 'A', 'E', 'H', 'I', 'J', 'O', 'U', 'W',
                                 'Y')
    _alpha_sis_basic = {'SCH': '6', 'CZ': ('70', '6', '0'),
                        'CH': ('6', '70', '0'), 'CK': ('7', '6'),
                        'DS': ('0', '10'), 'DZ': ('0', '10'),
                        'TS': ('0', '10'), 'TZ': ('0', '10'), 'CI': '0',
                        'CY': '0', 'CE': '0', 'SH': '6', 'DG': '7', 'PH': '8',
                        'C': ('7', '6'), 'K': ('7', '6'), 'Z': '0', 'S': '0',
                        'D': '1', 'T': '1', 'N': '2', 'M': '3', 'R': '4',
                        'L': '5', 'J': '6', 'G': '7', 'Q': '7', 'X': '7',
                        'F': '8', 'V': '8', 'B': '9', 'P': '9'}
    # Longest keys first so multi-letter groups win over single letters.
    # Each key is listed once; only the first match is ever used.
    _alpha_sis_basic_order = ('SCH', 'CZ', 'CH', 'CK', 'DS', 'DZ', 'TS', 'TZ',
                              'CI', 'CY', 'CE', 'SH', 'DG', 'PH', 'C', 'K',
                              'Z', 'S', 'D', 'T', 'N', 'M', 'R', 'L', 'J',
                              'G', 'Q', 'X', 'F', 'V', 'B', 'P')
    _letters = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in _letters)

    # Clamp maxlength to [4, 64]
    if maxlength is not None:
        maxlength = min(max(4, maxlength), 64)
    else:
        maxlength = 64

    alpha = ['']
    pos = 0

    # Do special processing for initial substrings
    for key in _alpha_sis_initials_order:
        if word.startswith(key):
            alpha[0] += _alpha_sis_initials[key]
            pos += len(key)
            break

    # Add a '0' if alpha is still empty
    if not alpha[0]:
        alpha[0] += '0'

    # Whether or not any special initial codes were encoded, iterate
    # through the length of the word in the main encoding loop
    while pos < len(word):
        for key in _alpha_sis_basic_order:
            if word.startswith(key, pos):
                value = _alpha_sis_basic[key]
                options = value if isinstance(value, tuple) else (value,)
                # An ambiguous group forks every code built so far; the
                # first alternative stays first so the primary code leads.
                alpha = [code + option
                         for option in options for code in alpha]
                pos += len(key)
                break
        else:
            # No code for this letter: insert a placeholder so that equal
            # digits on either side of it are not mistaken for a doublet.
            alpha = [code + '_' for code in alpha]
            pos += 1

    # Trim doublets (runs of any length) and then placeholders
    alpha = [''.join(digit for digit, _ in groupby(code)).replace('_', '')
             for code in alpha]

    # Trim codes and return tuple
    return tuple((code + '0' * maxlength)[:maxlength] for code in alpha)
2003
2004
2005
def fuzzy_soundex(word, maxlength=5, zero_pad=True):
    """Return the Fuzzy Soundex code for a word.

    Fuzzy Soundex is an algorithm derived from Soundex, defined in:
    Holmes, David and M. Catherine McCabe. "Improving Precision and Recall for
    Soundex Retrieval."
    http://wayback.archive.org/web/20100629121128/http://www.ir.iit.edu/publications/downloads/IEEESoundexV5.pdf

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 5);
        clamped to the range [4, 64], with None treated as 64
    :param bool zero_pad: pad the end of the return value with 0s to achieve
        a maxlength string
    :returns: the Fuzzy Soundex value
    :rtype: str

    >>> fuzzy_soundex('Christopher')
    'K6931'
    >>> fuzzy_soundex('Niall')
    'N4000'
    >>> fuzzy_soundex('Smith')
    'S5300'
    >>> fuzzy_soundex('Schmidt')
    'S5300'
    """
    # Letter -> code digit; '-' marks letters that are dropped outright.
    translation = {ord(letter): digit for letter, digit in
                   zip('ABCDEFGHIJKLMNOPQRSTUVWXYZ',
                       '0193017-07745501769301-7-9')}
    # Two-letter rewrites that apply only at the start / end of the word.
    prefix_rules = {'CS': 'SS', 'CZ': 'SS', 'TS': 'SS', 'TZ': 'SS',
                    'GN': 'NN', 'HR': 'RR', 'WR': 'RR', 'HW': 'WW',
                    'KN': 'NN', 'NG': 'NN'}
    suffix_rules = {'CH': 'KK', 'NT': 'TT', 'RT': 'RR'}
    # Rewrites applied anywhere in the word, in this order.
    substitutions = (('CA', 'KA'), ('CC', 'KK'), ('CK', 'KK'), ('CE', 'SE'),
                     ('CHL', 'KL'), ('CL', 'KL'), ('CHR', 'KR'),
                     ('CR', 'KR'), ('CI', 'SI'), ('CO', 'KO'), ('CU', 'KU'),
                     ('CY', 'SY'), ('DG', 'GG'), ('GH', 'HH'), ('MAC', 'MK'),
                     ('MC', 'MK'), ('NST', 'NSS'), ('PF', 'FF'),
                     ('PH', 'FF'), ('SCH', 'SSS'), ('TIO', 'SIO'),
                     ('TIA', 'SIO'), ('TCH', 'CHH'))

    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    # Clamp maxlength to [4, 64]
    maxlength = 64 if maxlength is None else min(max(4, maxlength), 64)

    if not word:
        return '0' * maxlength if zero_pad else '0'

    head = word[:2]
    if head in prefix_rules:
        word = prefix_rules[head] + word[2:]

    tail = word[-2:]
    if tail in suffix_rules:
        word = word[:-2] + suffix_rules[tail]
    elif word[-3:] == 'RDT':
        word = word[:-3] + 'RR'

    for old, new in substitutions:
        word = word.replace(old, new)

    sdx = word.translate(translation).replace('-', '')

    # remove repeating characters
    sdx = _delete_consecutive_repeats(sdx)

    # The code always starts with the word's first letter. H, W and Y have
    # no digit of their own, so for them no leading digit is displaced.
    initial = word[0]
    if initial in frozenset('HWY'):
        sdx = initial + sdx
    else:
        sdx = initial + sdx[1:]

    sdx = sdx.replace('0', '')

    if zero_pad:
        sdx += '0' * maxlength

    return sdx[:maxlength]
2108
2109
2110
def phonex(word, maxlength=4, zero_pad=True):
    """Return the Phonex code for a word.

    Phonex is an algorithm derived from Soundex, defined in:
    Lait, A. J. and B. Randell. "An Assessment of Name Matching Algorithms".
    http://homepages.cs.ncl.ac.uk/brian.randell/Genealogy/NameMatching.pdf

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 4);
        clamped to the range [4, 64], with None treated as 64
    :param bool zero_pad: pad the end of the return value with 0s to achieve
        a maxlength string
    :returns: the Phonex value
    :rtype: str

    >>> phonex('Christopher')
    'C623'
    >>> phonex('Niall')
    'N400'
    >>> phonex('Schmidt')
    'S253'
    >>> phonex('Smith')
    'S530'
    """
    vowels = frozenset('AEIOUY')
    # Phonetic equivalents of the first 2 characters.
    leading_pairs = {'KN': 'N', 'PH': 'F', 'WR': 'R'}
    # Phonetic equivalents of the first character (unlisted letters stay).
    leading_letters = {'E': 'A', 'I': 'A', 'O': 'A', 'U': 'A', 'Y': 'A',
                       'P': 'B', 'V': 'F', 'K': 'C', 'Q': 'C', 'J': 'G',
                       'Z': 'S'}

    name = unicodedata.normalize('NFKD', text_type(word.upper()))
    name = name.replace('ß', 'SS')

    # Clamp maxlength to [4, 64]
    maxlength = 64 if maxlength is None else min(max(4, maxlength), 64)

    # Remove any trailing Ss
    name = name.rstrip('S')

    pair = name[:2]
    if pair in leading_pairs:
        name = leading_pairs[pair] + name[2:]

    # Special case, ignore H first letter (subsequent Hs ignored anyway)
    if name[:1] == 'H':
        name = name[1:]

    name_code = ''
    if name:
        name = leading_letters.get(name[0], name[0]) + name[1:]
        name_code = name[0]

    # MODIFIED SOUNDEX CODE
    for idx in range(1, len(name)):
        letter = name[idx]
        # Empty at the end of the name, so the set tests below are False.
        upcoming = name[idx + 1:idx + 2]
        is_final = idx + 1 == len(name)

        digit = '0'
        if letter in frozenset('BPFV'):
            digit = '1'
        elif letter in frozenset('CSKGJQXZ'):
            digit = '2'
        elif letter in frozenset('DT'):
            if upcoming != 'C':
                digit = '3'
        elif letter == 'L':
            if upcoming in vowels or is_final:
                digit = '4'
        elif letter in frozenset('MN'):
            # "Delete" a following D or G by overwriting it with this
            # letter, which the duplicate check will then skip.
            if upcoming in frozenset('DG'):
                name = name[:idx + 1] + letter + name[idx + 2:]
            digit = '5'
        elif letter == 'R':
            if upcoming in vowels or is_final:
                digit = '6'

        # Skip uncoded letters and repeats of the code's last character.
        if digit != '0' and digit != name_code[-1]:
            name_code += digit

    if zero_pad:
        name_code += '0' * maxlength
    if not name_code:
        name_code = '0'
    return name_code[:maxlength]
2215
2216
2217
def phonem(word):
    """Return the Phonem code for a word.

    Phonem is defined in Wilde, Georg and Carsten Meyer. 1999. "Doppelgaenger
    gesucht - Ein Programm fuer kontextsensitive phonetische Textumwandlung."
    ct Magazin fuer Computer & Technik 25/1999.

    This version is based on the Perl implementation documented at:
    http://phonetik.phil-fak.uni-koeln.de/fileadmin/home/ritters/Allgemeine_Dateien/Martin_Wilz.pdf
    It includes some enhancements presented in the Java port at:
    https://github.com/dcm4che/dcm4che/blob/master/dcm4che-soundex/src/main/java/org/dcm4che3/soundex/Phonem.java

    Phonem is intended chiefly for German names/words.

    :param str word: the word to transform
    :returns: the Phonem value
    :rtype: str

    >>> phonem('Christopher')
    'CRYSDOVR'
    >>> phonem('Niall')
    'NYAL'
    >>> phonem('Smith')
    'SMYD'
    >>> phonem('Schmidt')
    'CMYD'
    """
    # Multi-letter rewrites, applied in order before the per-character map.
    # '§' is an intermediate symbol that the character map turns into 'U'.
    digraph_rules = (('SC', 'C'), ('SZ', 'C'), ('CZ', 'C'),
                     ('TZ', 'C'), ('TS', 'C'), ('KS', 'X'),
                     ('PF', 'V'), ('QU', 'KW'), ('PH', 'V'),
                     ('UE', 'Y'), ('AE', 'E'), ('OE', 'Ö'),
                     ('EI', 'AY'), ('EY', 'AY'), ('EU', 'OY'),
                     ('AU', 'A§'), ('OU', '§'))
    # Per-character map: position n of the first string maps to position n
    # of the second.
    char_map = {ord(source): target for source, target in
                zip('ZKGQÇÑßFWPTÁÀÂÃÅÄÆÉÈÊËIJÌÍÎÏÜݧÚÙÛÔÒÓÕØ',
                    'CCCCCNSVVBDAAAAAEEEEEEYYYYYYYYUUUUOOOOÖ')}
    # Only these characters may appear in the final code.
    code_alphabet = frozenset('ABCDLMNORSUVWXYÖ')

    word = unicodedata.normalize('NFC', text_type(word.upper()))
    for old, new in digraph_rules:
        word = word.replace(old, new)
    word = word.translate(char_map)

    collapsed = _delete_consecutive_repeats(word)
    return ''.join(c for c in collapsed if c in code_alphabet)
2261
2262
2263
def phonix(word, maxlength=4, zero_pad=True):
    """Return the Phonix code for a word.

    Phonix is a Soundex-like algorithm defined in:
    T.N. Gadd: PHONIX --- The Algorithm, Program 24/4, 1990, p.363-366.

    This implementation is based on
    http://cpansearch.perl.org/src/ULPFR/WAIT-1.800/soundex.c
    http://cs.anu.edu.au/people/Peter.Christen/Febrl/febrl-0.4.01/encode.py
    and
    https://metacpan.org/pod/Text::Phonetic::Phonix

    The word is upper-cased, NFKD-normalized and stripped of everything but
    A-Z; an ordered table of spelling substitutions is then applied, the
    first letter is kept (or replaced by 'v' if it is a vowel or Y), and the
    remaining letters are translated to digits. Consecutive repeats and all
    0s are removed before the optional zero padding.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 4);
        values are clamped to the range [4, 64] and None is treated as 64
    :param bool zero_pad: pad the end of the return value with 0s to achieve
        a maxlength string
    :returns: the Phonix value
    :rtype: str

    >>> phonix('Christopher')
    'K683'
    >>> phonix('Niall')
    'N400'
    >>> phonix('Smith')
    'S530'
    >>> phonix('Schmidt')
    'S530'
    """
    # pylint: disable=too-many-branches
    def _start_repl(word, src, tar, post=None):
        r"""Replace src with tar at the start of word.

        If post is given, src must be followed by one of its members.
        """
        if post:
            for i in post:
                if word.startswith(src+i):
                    return tar + word[len(src):]
        elif word.startswith(src):
            return tar + word[len(src):]
        return word

    def _end_repl(word, src, tar, pre=None):
        r"""Replace src with tar at the end of word.

        If pre is given, src must be preceded by one of its members.
        """
        if pre:
            for i in pre:
                if word.endswith(i+src):
                    return word[:-len(src)] + tar
        elif word.endswith(src):
            return word[:-len(src)] + tar
        return word

    def _mid_repl(word, src, tar, pre=None, post=None):
        r"""Replace src with tar in the middle of word.

        The first letter is protected unless pre is given and the last
        letter is protected unless post is given.
        """
        if len(word) < 2:
            # A word this short has no interior. Returning it unchanged also
            # keeps word[0] + ... + word[-1] below from doubling a single
            # letter (e.g. 'B' becoming 'BB' and then being coded 'B100').
            return word
        if pre or post:
            if not pre:
                return word[0] + _all_repl(word[1:], src, tar, pre, post)
            if not post:
                return _all_repl(word[:-1], src, tar, pre, post) + word[-1]
            return _all_repl(word, src, tar, pre, post)
        return (word[0] + _all_repl(word[1:-1], src, tar, pre, post) +
                word[-1])

    def _all_repl(word, src, tar, pre=None, post=None):
        r"""Replace src with tar anywhere in word.

        If pre and/or post are given, src is only replaced where it is
        preceded by a member of pre and/or followed by a member of post.
        """
        if pre or post:
            # An absent context is modelled as the empty string so that one
            # nested loop covers the pre-only, post-only and both cases.
            pre = pre or frozenset(('',))
            post = post or frozenset(('',))

            for i, j in ((i, j) for i in pre for j in post):
                word = word.replace(i+src+j, i+tar+j)
            return word
        return word.replace(src, tar)

    _vow = frozenset('AEIOU')
    _con = frozenset('BCDFGHJKLMNPQRSTVWXYZ')

    # Each rule is (function, src, tar[, pre-or-post context ...]); the rules
    # are applied strictly in this order.
    _phonix_substitutions = ((_all_repl, 'DG', 'G'),
                             (_all_repl, 'CO', 'KO'),
                             (_all_repl, 'CA', 'KA'),
                             (_all_repl, 'CU', 'KU'),
                             (_all_repl, 'CY', 'SI'),
                             (_all_repl, 'CI', 'SI'),
                             (_all_repl, 'CE', 'SE'),
                             (_start_repl, 'CL', 'KL', _vow),
                             (_all_repl, 'CK', 'K'),
                             (_end_repl, 'GC', 'K'),
                             (_end_repl, 'JC', 'K'),
                             (_start_repl, 'CHR', 'KR', _vow),
                             (_start_repl, 'CR', 'KR', _vow),
                             (_start_repl, 'WR', 'R'),
                             (_all_repl, 'NC', 'NK'),
                             (_all_repl, 'CT', 'KT'),
                             (_all_repl, 'PH', 'F'),
                             (_all_repl, 'AA', 'AR'),
                             (_all_repl, 'SCH', 'SH'),
                             (_all_repl, 'BTL', 'TL'),
                             (_all_repl, 'GHT', 'T'),
                             (_all_repl, 'AUGH', 'ARF'),
                             (_mid_repl, 'LJ', 'LD', _vow, _vow),
                             (_all_repl, 'LOUGH', 'LOW'),
                             (_start_repl, 'Q', 'KW'),
                             (_start_repl, 'KN', 'N'),
                             (_end_repl, 'GN', 'N'),
                             (_all_repl, 'GHN', 'N'),
                             (_end_repl, 'GNE', 'N'),
                             (_all_repl, 'GHNE', 'NE'),
                             (_end_repl, 'GNES', 'NS'),
                             (_start_repl, 'GN', 'N'),
                             (_mid_repl, 'GN', 'N', None, _con),
                             (_end_repl, 'GN', 'N'),
                             (_start_repl, 'PS', 'S'),
                             (_start_repl, 'PT', 'T'),
                             (_start_repl, 'CZ', 'C'),
                             (_mid_repl, 'WZ', 'Z', _vow),
                             (_mid_repl, 'CZ', 'CH'),
                             (_all_repl, 'LZ', 'LSH'),
                             (_all_repl, 'RZ', 'RSH'),
                             (_mid_repl, 'Z', 'S', None, _vow),
                             (_all_repl, 'ZZ', 'TS'),
                             (_mid_repl, 'Z', 'TS', _con),
                             (_all_repl, 'HROUG', 'REW'),
                             (_all_repl, 'OUGH', 'OF'),
                             (_mid_repl, 'Q', 'KW', _vow, _vow),
                             (_mid_repl, 'J', 'Y', _vow, _vow),
                             (_start_repl, 'YJ', 'Y', _vow),
                             (_start_repl, 'GH', 'G'),
                             (_end_repl, 'GH', 'E', _vow),
                             (_start_repl, 'CY', 'S'),
                             (_all_repl, 'NX', 'NKS'),
                             (_start_repl, 'PF', 'F'),
                             (_end_repl, 'DT', 'T'),
                             (_end_repl, 'TL', 'TIL'),
                             (_end_repl, 'DL', 'DIL'),
                             (_all_repl, 'YTH', 'ITH'),
                             (_start_repl, 'TJ', 'CH', _vow),
                             (_start_repl, 'TSJ', 'CH', _vow),
                             (_start_repl, 'TS', 'T', _vow),
                             (_all_repl, 'TCH', 'CH'),
                             (_mid_repl, 'WSK', 'VSKIE', _vow),
                             (_end_repl, 'WSK', 'VSKIE', _vow),
                             (_start_repl, 'MN', 'N', _vow),
                             (_start_repl, 'PN', 'N', _vow),
                             (_mid_repl, 'STL', 'SL', _vow),
                             (_end_repl, 'STL', 'SL', _vow),
                             (_end_repl, 'TNT', 'ENT'),
                             (_end_repl, 'EAUX', 'OH'),
                             (_all_repl, 'EXCI', 'ECS'),
                             (_all_repl, 'X', 'ECS'),
                             (_end_repl, 'NED', 'ND'),
                             (_all_repl, 'JR', 'DR'),
                             (_end_repl, 'EE', 'EA'),
                             (_all_repl, 'ZS', 'S'),
                             (_mid_repl, 'R', 'AH', _vow, _con),
                             (_end_repl, 'R', 'AH', _vow),
                             (_mid_repl, 'HR', 'AH', _vow, _con),
                             (_end_repl, 'HR', 'AH', _vow),
                             (_end_repl, 'HR', 'AH', _vow),
                             (_end_repl, 'RE', 'AR'),
                             (_end_repl, 'R', 'AH', _vow),
                             (_all_repl, 'LLE', 'LE'),
                             (_end_repl, 'LE', 'ILE', _con),
                             (_end_repl, 'LES', 'ILES', _con),
                             (_end_repl, 'E', ''),
                             (_end_repl, 'ES', 'S'),
                             (_end_repl, 'SS', 'AS', _vow),
                             (_end_repl, 'MB', 'M', _vow),
                             (_all_repl, 'MPTS', 'MPS'),
                             (_all_repl, 'MPS', 'MS'),
                             (_all_repl, 'MPT', 'MT'))

    # Letter -> digit table applied to everything after the first letter.
    _phonix_translation = dict(zip((ord(_) for _ in
                                    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                   '01230720022455012683070808'))

    sdx = ''

    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in
                   frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ'))
    if word:
        for trans in _phonix_substitutions:
            replaced = trans[0](word, *trans[1:])
            # Skip a rule that would erase the whole word (the final-E rule
            # applied to a lone 'E'); word[0] below needs one letter left.
            if replaced:
                word = replaced
        if word[0] in frozenset('AEIOUY'):
            sdx = 'v' + word[1:].translate(_phonix_translation)
        else:
            sdx = word[0] + word[1:].translate(_phonix_translation)
        sdx = _delete_consecutive_repeats(sdx)
        sdx = sdx.replace('0', '')

    # Clamp maxlength to [4, 64]
    if maxlength is not None:
        maxlength = min(max(4, maxlength), 64)
    else:
        maxlength = 64

    if zero_pad:
        sdx += '0' * maxlength
    if not sdx:
        sdx = '0'
    return sdx[:maxlength]
2469
2470
2471
def sfinxbis(word, maxlength=None):
    """Return the SfinxBis code for a word.

    SfinxBis is a Soundex-like algorithm defined in:
    http://www.swami.se/download/18.248ad5af12aa8136533800091/SfinxBis.pdf

    This implementation follows the reference implementation:
    http://www.swami.se/download/18.248ad5af12aa8136533800093/swamiSfinxBis.java.txt

    SfinxBis is intended chiefly for Swedish names.

    The input is split on whitespace and hyphens after nobiliary particles
    are removed, and one code is produced per remaining token.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to
        unlimited)
    :returns: the SfinxBis value
    :rtype: tuple

    >>> sfinxbis('Christopher')
    ('K68376',)
    >>> sfinxbis('Niall')
    ('N4',)
    >>> sfinxbis('Smith')
    ('S53',)
    >>> sfinxbis('Schmidt')
    ('S53',)

    >>> sfinxbis('Johansson')
    ('J585',)
    >>> sfinxbis('Sjöberg')
    ('#162',)
    """
    adelstitler = (' DE LA ', ' DE LAS ', ' DE LOS ', ' VAN DE ', ' VAN DEN ',
                   ' VAN DER ', ' VON DEM ', ' VON DER ',
                   ' AF ', ' AV ', ' DA ', ' DE ', ' DEL ', ' DEN ', ' DES ',
                   ' DI ', ' DO ', ' DON ', ' DOS ', ' DU ', ' E ', ' IN ',
                   ' LA ', ' LE ', ' MAC ', ' MC ', ' VAN ', ' VON ', ' Y ',
                   ' S:T ')

    # Fixed-order spellings of the letter classes. Loops that rewrite the
    # word iterate over these strings rather than the frozensets below: set
    # iteration order depends on string hashing, and e.g. 'EYI' becomes
    # 'EJI' or 'EJJ' depending on whether 'E' or 'Y' is handled first.
    _harde_ordnade = 'AOUÅ'
    _mjuka_ordnade = 'EIYÄÖ'
    _konsonanter_ordnade = 'BCDFGHJKLMNPQRSTVWXZ'

    # The same classes as sets, used for membership tests only.
    _harde_vokaler = frozenset(_harde_ordnade)
    _mjuka_vokaler = frozenset(_mjuka_ordnade)
    _konsonanter = frozenset(_konsonanter_ordnade)
    _alfabet = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZÅÄÖ')

    # Letter -> digit table for everything after the first sound; vowels map
    # to 9, which is stripped again in step 11.
    _sfinxbis_translation = dict(zip((ord(_) for _ in
                                      'BCDFGHJKLMNPQRSTVZAOUÅEIYÄÖ'),
                                     '123729224551268378999999999'))

    # Single-character foldings of foreign letters onto Swedish ones.
    _sfinxbis_substitutions = dict(zip((ord(_) for _ in
                                        'WZÀÁÂÃÆÇÈÉÊËÌÍÎÏÑÒÓÔÕØÙÚÛÜÝ'),
                                       'VSAAAAÄCEEEEIIIINOOOOÖUUUYY'))

    def _foersvensker(ordet):
        """Return the Swedish-ized form of the word."""
        ordet = ordet.replace('STIERN', 'STJÄRN')
        ordet = ordet.replace('HIE', 'HJ')
        ordet = ordet.replace('SIÖ', 'SJÖ')
        ordet = ordet.replace('SCH', 'SH')
        ordet = ordet.replace('QU', 'KV')
        ordet = ordet.replace('IO', 'JO')
        ordet = ordet.replace('PH', 'F')

        # Vowel followed by Ü, Y or I -> vowel followed by J; hard vowels are
        # handled before soft ones, each in a fixed order (see above).
        for vokal in _harde_ordnade + _mjuka_ordnade:
            for halvvokal in 'ÜYI':
                ordet = ordet.replace(vokal+halvvokal, vokal+'J')

        # Drop an H that directly precedes a consonant.
        if 'H' in ordet:
            for konsonant in _konsonanter_ordnade:
                ordet = ordet.replace('H'+konsonant, konsonant)

        ordet = ordet.translate(_sfinxbis_substitutions)

        ordet = ordet.replace('Ð', 'ETH')
        ordet = ordet.replace('Þ', 'TH')
        ordet = ordet.replace('ß', 'SS')

        return ordet

    def _koda_foersta_ljudet(ordet):
        """Return the word with the first sound coded.

        Slices (not indexing) are used throughout so that an empty or very
        short word falls through every test and is returned unchanged.
        """
        if ordet[0:1] in _mjuka_vokaler or ordet[0:1] in _harde_vokaler:
            ordet = '$' + ordet[1:]
        elif ordet[0:2] in ('DJ', 'GJ', 'HJ', 'LJ'):
            ordet = 'J' + ordet[2:]
        elif ordet[0:1] == 'G' and ordet[1:2] in _mjuka_vokaler:
            ordet = 'J' + ordet[1:]
        elif ordet[0:1] == 'Q':
            ordet = 'K' + ordet[1:]
        elif (ordet[0:2] == 'CH' and
              ordet[2:3] in frozenset(_mjuka_vokaler | _harde_vokaler)):
            ordet = '#' + ordet[2:]
        elif ordet[0:1] == 'C' and ordet[1:2] in _harde_vokaler:
            ordet = 'K' + ordet[1:]
        elif ordet[0:1] == 'C' and ordet[1:2] in _konsonanter:
            ordet = 'K' + ordet[1:]
        elif ordet[0:1] == 'X':
            ordet = 'S' + ordet[1:]
        elif ordet[0:1] == 'C' and ordet[1:2] in _mjuka_vokaler:
            ordet = 'S' + ordet[1:]
        elif ordet[0:3] in ('SKJ', 'STJ', 'SCH'):
            ordet = '#' + ordet[3:]
        elif ordet[0:2] in ('SH', 'KJ', 'TJ', 'SJ'):
            ordet = '#' + ordet[2:]
        elif ordet[0:2] == 'SK' and ordet[2:3] in _mjuka_vokaler:
            ordet = '#' + ordet[2:]
        elif ordet[0:1] == 'K' and ordet[1:2] in _mjuka_vokaler:
            ordet = '#' + ordet[1:]
        return ordet

    # Step 1: uppercase
    word = unicodedata.normalize('NFC', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = word.replace('-', ' ')

    # Step 2: remove nobiliary particles, both inside the name and (without
    # the leading space) at its very start
    for adelstitel in adelstitler:
        while adelstitel in word:
            word = word.replace(adelstitel, ' ')
        if word.startswith(adelstitel[1:]):
            word = word[len(adelstitel)-1:]

    # Split word into tokens
    ordlista = word.split()

    # Step 3: collapse consecutive repeated characters in each token
    ordlista = [_delete_consecutive_repeats(ordet) for ordet in ordlista]
    if not ordlista:
        return ('',)

    # Step 4: Swedish-ize the spelling
    ordlista = [_foersvensker(ordet) for ordet in ordlista]

    # Step 5: remove all characters that are not A-Ö (65-90,196,197,214)
    ordlista = [''.join(c for c in ordet if c in _alfabet)
                for ordet in ordlista]

    # Step 6: code the first sound
    ordlista = [_koda_foersta_ljudet(ordet) for ordet in ordlista]

    # Step 7: split each name into its first character and the remainder
    rest = [ordet[1:] for ordet in ordlista]

    # Step 8: phonetic transformation of the remainder
    rest = [ordet.replace('DT', 'T') for ordet in rest]
    rest = [ordet.replace('X', 'KS') for ordet in rest]

    # Step 9: code the remainder as digits; C before a soft vowel is coded 8
    # here, ahead of the general table (which maps C to 2)
    for vokal in _mjuka_ordnade:
        rest = [ordet.replace('C'+vokal, '8'+vokal) for ordet in rest]
    rest = [ordet.translate(_sfinxbis_translation) for ordet in rest]

    # Step 10: remove adjacent duplicates
    rest = [_delete_consecutive_repeats(ordet) for ordet in rest]

    # Step 11: remove every "9"
    rest = [ordet.replace('9', '') for ordet in rest]

    # Step 12: join first character and coded remainder again
    ordlista = [ordet[0:1] + resten for ordet, resten in zip(ordlista, rest)]

    # truncate, if maxlength is set
    if maxlength and maxlength < _INFINITY:
        ordlista = [ordet[:maxlength] for ordet in ordlista]

    return tuple(ordlista)
2641
2642
2643
def phonet(word, mode=1, lang='de', trace=False):
2644
    """Return the phonet code for a word.
2645
2646
    phonet was developed by Jörg Michael and documented in c't magazine
2647
    vol. 25/1999, p. 252. It is a phonetic algorithm designed primarily for
2648
    German.
2649
    Cf. http://www.heise.de/ct/ftp/99/25/252/
2650
2651
    This is a port of Jesper Zedlitz's code, which is licensed LGPL:
2652
    https://code.google.com/p/phonet4java/source/browse/trunk/src/main/java/com/googlecode/phonet4java/Phonet.java
2653
2654
    That is, in turn, based on Michael's C code, which is also licensed LGPL:
2655
    ftp://ftp.heise.de/pub/ct/listings/phonet.zip
2656
2657
    :param str word: the word to transform
2658
    :param int mode: the ponet variant to employ (1 or 2)
2659
    :param str lang: 'de' (default) for German
2660
            'none' for no language
2661
    :param bool trace: prints debugging info if True
2662
    :returns: the phonet value
2663
    :rtype: str
2664
2665
    >>> phonet('Christopher')
2666
    'KRISTOFA'
2667
    >>> phonet('Niall')
2668
    'NIAL'
2669
    >>> phonet('Smith')
2670
    'SMIT'
2671
    >>> phonet('Schmidt')
2672
    'SHMIT'
2673
2674
    >>> phonet('Christopher', mode=2)
2675
    'KRIZTUFA'
2676
    >>> phonet('Niall', mode=2)
2677
    'NIAL'
2678
    >>> phonet('Smith', mode=2)
2679
    'ZNIT'
2680
    >>> phonet('Schmidt', mode=2)
2681
    'ZNIT'
2682
2683
    >>> phonet('Christopher', lang='none')
2684
    'CHRISTOPHER'
2685
    >>> phonet('Niall', lang='none')
2686
    'NIAL'
2687
    >>> phonet('Smith', lang='none')
2688
    'SMITH'
2689
    >>> phonet('Schmidt', lang='none')
2690
    'SCHMIDT'
2691
    """
2692
    # pylint: disable=too-many-branches
2693
2694
    _phonet_rules_no_lang = (  # separator chars
2695
        '´', ' ', ' ',
2696
        '"', ' ', ' ',
2697
        '`$', '', '',
2698
        '\'', ' ', ' ',
2699
        ',', ',', ',',
2700
        ';', ',', ',',
2701
        '-', ' ', ' ',
2702
        ' ', ' ', ' ',
2703
        '.', '.', '.',
2704
        ':', '.', '.',
2705
        # German umlauts
2706
        'Ä', 'AE', 'AE',
2707
        'Ö', 'OE', 'OE',
2708
        'Ü', 'UE', 'UE',
2709
        'ß', 'S', 'S',
2710
        # international umlauts
2711
        'À', 'A', 'A',
2712
        'Á', 'A', 'A',
2713
        'Â', 'A', 'A',
2714
        'Ã', 'A', 'A',
2715
        'Å', 'A', 'A',
2716
        'Æ', 'AE', 'AE',
2717
        'Ç', 'C', 'C',
2718
        'Ð', 'DJ', 'DJ',
2719
        'È', 'E', 'E',
2720
        'É', 'E', 'E',
2721
        'Ê', 'E', 'E',
2722
        'Ë', 'E', 'E',
2723
        'Ì', 'I', 'I',
2724
        'Í', 'I', 'I',
2725
        'Î', 'I', 'I',
2726
        'Ï', 'I', 'I',
2727
        'Ñ', 'NH', 'NH',
2728
        'Ò', 'O', 'O',
2729
        'Ó', 'O', 'O',
2730
        'Ô', 'O', 'O',
2731
        'Õ', 'O', 'O',
2732
        'Œ', 'OE', 'OE',
2733
        'Ø', 'OE', 'OE',
2734
        'Š', 'SH', 'SH',
2735
        'Þ', 'TH', 'TH',
2736
        'Ù', 'U', 'U',
2737
        'Ú', 'U', 'U',
2738
        'Û', 'U', 'U',
2739
        'Ý', 'Y', 'Y',
2740
        'Ÿ', 'Y', 'Y',
2741
        # 'normal' letters (A-Z)
2742
        'MC^', 'MAC', 'MAC',
2743
        'MC^', 'MAC', 'MAC',
2744
        'M´^', 'MAC', 'MAC',
2745
        'M\'^', 'MAC', 'MAC',
2746
        'O´^', 'O', 'O',
2747
        'O\'^', 'O', 'O',
2748
        'VAN DEN ^', 'VANDEN', 'VANDEN',
2749
        None, None, None)
2750
2751
    _phonet_rules_german = (  # separator chars
2752
        '´', ' ', ' ',
2753
        '"', ' ', ' ',
2754
        '`$', '', '',
2755
        '\'', ' ', ' ',
2756
        ',', ' ', ' ',
2757
        ';', ' ', ' ',
2758
        '-', ' ', ' ',
2759
        ' ', ' ', ' ',
2760
        '.', '.', '.',
2761
        ':', '.', '.',
2762
        # German umlauts
2763
        'ÄE', 'E', 'E',
2764
        'ÄU<', 'EU', 'EU',
2765
        'ÄV(AEOU)-<', 'EW', None,
2766
        'Ä$', 'Ä', None,
2767
        'Ä<', None, 'E',
2768
        'Ä', 'E', None,
2769
        'ÖE', 'Ö', 'Ö',
2770
        'ÖU', 'Ö', 'Ö',
2771
        'ÖVER--<', 'ÖW', None,
2772
        'ÖV(AOU)-', 'ÖW', None,
2773
        'ÜBEL(GNRW)-^^', 'ÜBL ', 'IBL ',
2774
        'ÜBER^^', 'ÜBA', 'IBA',
2775
        'ÜE', 'Ü', 'I',
2776
        'ÜVER--<', 'ÜW', None,
2777
        'ÜV(AOU)-', 'ÜW', None,
2778
        'Ü', None, 'I',
2779
        'ßCH<', None, 'Z',
2780
        'ß<', 'S', 'Z',
2781
        # international umlauts
2782
        'À<', 'A', 'A',
2783
        'Á<', 'A', 'A',
2784
        'Â<', 'A', 'A',
2785
        'Ã<', 'A', 'A',
2786
        'Å<', 'A', 'A',
2787
        'ÆER-', 'E', 'E',
2788
        'ÆU<', 'EU', 'EU',
2789
        'ÆV(AEOU)-<', 'EW', None,
2790
        'Æ$', 'Ä', None,
2791
        'Æ<', None, 'E',
2792
        'Æ', 'E', None,
2793
        'Ç', 'Z', 'Z',
2794
        'ÐÐ-', '', '',
2795
        'Ð', 'DI', 'TI',
2796
        'È<', 'E', 'E',
2797
        'É<', 'E', 'E',
2798
        'Ê<', 'E', 'E',
2799
        'Ë', 'E', 'E',
2800
        'Ì<', 'I', 'I',
2801
        'Í<', 'I', 'I',
2802
        'Î<', 'I', 'I',
2803
        'Ï', 'I', 'I',
2804
        'ÑÑ-', '', '',
2805
        'Ñ', 'NI', 'NI',
2806
        'Ò<', 'O', 'U',
2807
        'Ó<', 'O', 'U',
2808
        'Ô<', 'O', 'U',
2809
        'Õ<', 'O', 'U',
2810
        'Œ<', 'Ö', 'Ö',
2811
        'Ø(IJY)-<', 'E', 'E',
2812
        'Ø<', 'Ö', 'Ö',
2813
        'Š', 'SH', 'Z',
2814
        'Þ', 'T', 'T',
2815
        'Ù<', 'U', 'U',
2816
        'Ú<', 'U', 'U',
2817
        'Û<', 'U', 'U',
2818
        'Ý<', 'I', 'I',
2819
        'Ÿ<', 'I', 'I',
2820
        # 'normal' letters (A-Z)
2821
        'ABELLE$', 'ABL', 'ABL',
2822
        'ABELL$', 'ABL', 'ABL',
2823
        'ABIENNE$', 'ABIN', 'ABIN',
2824
        'ACHME---^', 'ACH', 'AK',
2825
        'ACEY$', 'AZI', 'AZI',
2826
        'ADV', 'ATW', None,
2827
        'AEGL-', 'EK', None,
2828
        'AEU<', 'EU', 'EU',
2829
        'AE2', 'E', 'E',
2830
        'AFTRAUBEN------', 'AFT ', 'AFT ',
2831
        'AGL-1', 'AK', None,
2832
        'AGNI-^', 'AKN', 'AKN',
2833
        'AGNIE-', 'ANI', 'ANI',
2834
        'AGN(AEOU)-$', 'ANI', 'ANI',
2835
        'AH(AIOÖUÜY)-', 'AH', None,
2836
        'AIA2', 'AIA', 'AIA',
2837
        'AIE$', 'E', 'E',
2838
        'AILL(EOU)-', 'ALI', 'ALI',
2839
        'AINE$', 'EN', 'EN',
2840
        'AIRE$', 'ER', 'ER',
2841
        'AIR-', 'E', 'E',
2842
        'AISE$', 'ES', 'EZ',
2843
        'AISSANCE$', 'ESANS', 'EZANZ',
2844
        'AISSE$', 'ES', 'EZ',
2845
        'AIX$', 'EX', 'EX',
2846
        'AJ(AÄEÈÉÊIOÖUÜ)--', 'A', 'A',
2847
        'AKTIE', 'AXIE', 'AXIE',
2848
        'AKTUEL', 'AKTUEL', None,
2849
        'ALOI^', 'ALOI', 'ALUI',  # Don't merge these rules
2850
        'ALOY^', 'ALOI', 'ALUI',  # needed by 'check_rules'
2851
        'AMATEU(RS)-', 'AMATÖ', 'ANATÖ',
2852
        'ANCH(OEI)-', 'ANSH', 'ANZ',
2853
        'ANDERGEGANG----', 'ANDA GE', 'ANTA KE',
2854
        'ANDERGEHE----', 'ANDA ', 'ANTA ',
2855
        'ANDERGESETZ----', 'ANDA GE', 'ANTA KE',
2856
        'ANDERGING----', 'ANDA ', 'ANTA ',
2857
        'ANDERSETZ(ET)-----', 'ANDA ', 'ANTA ',
2858
        'ANDERZUGEHE----', 'ANDA ZU ', 'ANTA ZU ',
2859
        'ANDERZUSETZE-----', 'ANDA ZU ', 'ANTA ZU ',
2860
        'ANER(BKO)---^^', 'AN', None,
2861
        'ANHAND---^$', 'AN H', 'AN ',
2862
        'ANH(AÄEIOÖUÜY)--^^', 'AN', None,
2863
        'ANIELLE$', 'ANIEL', 'ANIL',
2864
        'ANIEL', 'ANIEL', None,
2865
        'ANSTELLE----^$', 'AN ST', 'AN ZT',
2866
        'ANTI^^', 'ANTI', 'ANTI',
2867
        'ANVER^^', 'ANFA', 'ANFA',
2868
        'ATIA$', 'ATIA', 'ATIA',
2869
        'ATIA(NS)--', 'ATI', 'ATI',
2870
        'ATI(AÄOÖUÜ)-', 'AZI', 'AZI',
2871
        'AUAU--', '', '',
2872
        'AUERE$', 'AUERE', None,
2873
        'AUERE(NS)-$', 'AUERE', None,
2874
        'AUERE(AIOUY)--', 'AUER', None,
2875
        'AUER(AÄIOÖUÜY)-', 'AUER', None,
2876
        'AUER<', 'AUA', 'AUA',
2877
        'AUF^^', 'AUF', 'AUF',
2878
        'AULT$', 'O', 'U',
2879
        'AUR(BCDFGKLMNQSTVWZ)-', 'AUA', 'AUA',
2880
        'AUR$', 'AUA', 'AUA',
2881
        'AUSSE$', 'OS', 'UZ',
2882
        'AUS(ST)-^', 'AUS', 'AUS',
2883
        'AUS^^', 'AUS', 'AUS',
2884
        'AUTOFAHR----', 'AUTO ', 'AUTU ',
2885
        'AUTO^^', 'AUTO', 'AUTU',
2886
        'AUX(IY)-', 'AUX', 'AUX',
2887
        'AUX', 'O', 'U',
2888
        'AU', 'AU', 'AU',
2889
        'AVER--<', 'AW', None,
2890
        'AVIER$', 'AWIE', 'AFIE',
2891
        'AV(EÈÉÊI)-^', 'AW', None,
2892
        'AV(AOU)-', 'AW', None,
2893
        'AYRE$', 'EIRE', 'EIRE',
2894
        'AYRE(NS)-$', 'EIRE', 'EIRE',
2895
        'AYRE(AIOUY)--', 'EIR', 'EIR',
2896
        'AYR(AÄIOÖUÜY)-', 'EIR', 'EIR',
2897
        'AYR<', 'EIA', 'EIA',
2898
        'AYER--<', 'EI', 'EI',
2899
        'AY(AÄEIOÖUÜY)--', 'A', 'A',
2900
        'AË', 'E', 'E',
2901
        'A(IJY)<', 'EI', 'EI',
2902
        'BABY^$', 'BEBI', 'BEBI',
2903
        'BAB(IY)^', 'BEBI', 'BEBI',
2904
        'BEAU^$', 'BO', None,
2905
        'BEA(BCMNRU)-^', 'BEA', 'BEA',
2906
        'BEAT(AEIMORU)-^', 'BEAT', 'BEAT',
2907
        'BEE$', 'BI', 'BI',
2908
        'BEIGE^$', 'BESH', 'BEZ',
2909
        'BENOIT--', 'BENO', 'BENU',
2910
        'BER(DT)-', 'BER', None,
2911
        'BERN(DT)-', 'BERN', None,
2912
        'BE(LMNRST)-^', 'BE', 'BE',
2913
        'BETTE$', 'BET', 'BET',
2914
        'BEVOR^$', 'BEFOR', None,
2915
        'BIC$', 'BIZ', 'BIZ',
2916
        'BOWL(EI)-', 'BOL', 'BUL',
2917
        'BP(AÄEÈÉÊIÌÍÎOÖRUÜY)-', 'B', 'B',
2918
        'BRINGEND-----^', 'BRI', 'BRI',
2919
        'BRINGEND-----', ' BRI', ' BRI',
2920
        'BROW(NS)-', 'BRAU', 'BRAU',
2921
        'BUDGET7', 'BÜGE', 'BIKE',
2922
        'BUFFET7', 'BÜFE', 'BIFE',
2923
        'BYLLE$', 'BILE', 'BILE',
2924
        'BYLL$', 'BIL', 'BIL',
2925
        'BYPA--^', 'BEI', 'BEI',
2926
        'BYTE<', 'BEIT', 'BEIT',
2927
        'BY9^', 'BÜ', None,
2928
        'B(SßZ)$', 'BS', None,
2929
        'CACH(EI)-^', 'KESH', 'KEZ',
2930
        'CAE--', 'Z', 'Z',
2931
        'CA(IY)$', 'ZEI', 'ZEI',
2932
        'CE(EIJUY)--', 'Z', 'Z',
2933
        'CENT<', 'ZENT', 'ZENT',
2934
        'CERST(EI)----^', 'KE', 'KE',
2935
        'CER$', 'ZA', 'ZA',
2936
        'CE3', 'ZE', 'ZE',
2937
        'CH\'S$', 'X', 'X',
2938
        'CH´S$', 'X', 'X',
2939
        'CHAO(ST)-', 'KAO', 'KAU',
2940
        'CHAMPIO-^', 'SHEMPI', 'ZENBI',
2941
        'CHAR(AI)-^', 'KAR', 'KAR',
2942
        'CHAU(CDFSVWXZ)-', 'SHO', 'ZU',
2943
        'CHÄ(CF)-', 'SHE', 'ZE',
2944
        'CHE(CF)-', 'SHE', 'ZE',
2945
        'CHEM-^', 'KE', 'KE',  # or: 'CHE', 'KE'
2946
        'CHEQUE<', 'SHEK', 'ZEK',
2947
        'CHI(CFGPVW)-', 'SHI', 'ZI',
2948
        'CH(AEUY)-<^', 'SH', 'Z',
2949
        'CHK-', '', '',
2950
        'CHO(CKPS)-^', 'SHO', 'ZU',
2951
        'CHRIS-', 'KRI', None,
2952
        'CHRO-', 'KR', None,
2953
        'CH(LOR)-<^', 'K', 'K',
2954
        'CHST-', 'X', 'X',
2955
        'CH(SßXZ)3', 'X', 'X',
2956
        'CHTNI-3', 'CHN', 'KN',
2957
        'CH^', 'K', 'K',  # or: 'CH', 'K'
2958
        'CH', 'CH', 'K',
2959
        'CIC$', 'ZIZ', 'ZIZ',
2960
        'CIENCEFICT----', 'EIENS ', 'EIENZ ',
2961
        'CIENCE$', 'EIENS', 'EIENZ',
2962
        'CIER$', 'ZIE', 'ZIE',
2963
        'CYB-^', 'ZEI', 'ZEI',
2964
        'CY9^', 'ZÜ', 'ZI',
2965
        'C(IJY)-<3', 'Z', 'Z',
2966
        'CLOWN-', 'KLAU', 'KLAU',
2967
        'CCH', 'Z', 'Z',
2968
        'CCE-', 'X', 'X',
2969
        'C(CK)-', '', '',
2970
        'CLAUDET---', 'KLO', 'KLU',
2971
        'CLAUDINE^$', 'KLODIN', 'KLUTIN',
2972
        'COACH', 'KOSH', 'KUZ',
2973
        'COLE$', 'KOL', 'KUL',
2974
        'COUCH', 'KAUSH', 'KAUZ',
2975
        'COW', 'KAU', 'KAU',
2976
        'CQUES$', 'K', 'K',
2977
        'CQUE', 'K', 'K',
2978
        'CRASH--9', 'KRE', 'KRE',
2979
        'CREAT-^', 'KREA', 'KREA',
2980
        'CST', 'XT', 'XT',
2981
        'CS<^', 'Z', 'Z',
2982
        'C(SßX)', 'X', 'X',
2983
        'CT\'S$', 'X', 'X',
2984
        'CT(SßXZ)', 'X', 'X',
2985
        'CZ<', 'Z', 'Z',
2986
        'C(ÈÉÊÌÍÎÝ)3', 'Z', 'Z',
2987
        'C.^', 'C.', 'C.',
2988
        'CÄ-', 'Z', 'Z',
2989
        'CÜ$', 'ZÜ', 'ZI',
2990
        'C\'S$', 'X', 'X',
2991
        'C<', 'K', 'K',
2992
        'DAHER^$', 'DAHER', None,
2993
        'DARAUFFOLGE-----', 'DARAUF ', 'TARAUF ',
2994
        'DAVO(NR)-^$', 'DAFO', 'TAFU',
2995
        'DD(SZ)--<', '', '',
2996
        'DD9', 'D', None,
2997
        'DEPOT7', 'DEPO', 'TEBU',
2998
        'DESIGN', 'DISEIN', 'TIZEIN',
2999
        'DE(LMNRST)-3^', 'DE', 'TE',
3000
        'DETTE$', 'DET', 'TET',
3001
        'DH$', 'T', None,
3002
        'DIC$', 'DIZ', 'TIZ',
3003
        'DIDR-^', 'DIT', None,
3004
        'DIEDR-^', 'DIT', None,
3005
        'DJ(AEIOU)-^', 'I', 'I',
3006
        'DMITR-^', 'DIMIT', 'TINIT',
3007
        'DRY9^', 'DRÜ', None,
3008
        'DT-', '', '',
3009
        'DUIS-^', 'DÜ', 'TI',
3010
        'DURCH^^', 'DURCH', 'TURK',
3011
        'DVA$', 'TWA', None,
3012
        'DY9^', 'DÜ', None,
3013
        'DYS$', 'DIS', None,
3014
        'DS(CH)--<', 'T', 'T',
3015
        'DST', 'ZT', 'ZT',
3016
        'DZS(CH)--', 'T', 'T',
3017
        'D(SßZ)', 'Z', 'Z',
3018
        'D(AÄEIOÖRUÜY)-', 'D', None,
3019
        'D(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'D', None,
3020
        'D\'H^', 'D', 'T',
3021
        'D´H^', 'D', 'T',
3022
        'D`H^', 'D', 'T',
3023
        'D\'S3$', 'Z', 'Z',
3024
        'D´S3$', 'Z', 'Z',
3025
        'D^', 'D', None,
3026
        'D', 'T', 'T',
3027
        'EAULT$', 'O', 'U',
3028
        'EAUX$', 'O', 'U',
3029
        'EAU', 'O', 'U',
3030
        'EAV', 'IW', 'IF',
3031
        'EAS3$', 'EAS', None,
3032
        'EA(AÄEIOÖÜY)-3', 'EA', 'EA',
3033
        'EA3$', 'EA', 'EA',
3034
        'EA3', 'I', 'I',
3035
        'EBENSO^$', 'EBNSO', 'EBNZU',
3036
        'EBENSO^^', 'EBNSO ', 'EBNZU ',
3037
        'EBEN^^', 'EBN', 'EBN',
3038
        'EE9', 'E', 'E',
3039
        'EGL-1', 'EK', None,
3040
        'EHE(IUY)--1', 'EH', None,
3041
        'EHUNG---1', 'E', None,
3042
        'EH(AÄIOÖUÜY)-1', 'EH', None,
3043
        'EIEI--', '', '',
3044
        'EIERE^$', 'EIERE', None,
3045
        'EIERE$', 'EIERE', None,
3046
        'EIERE(NS)-$', 'EIERE', None,
3047
        'EIERE(AIOUY)--', 'EIER', None,
3048
        'EIER(AÄIOÖUÜY)-', 'EIER', None,
3049
        'EIER<', 'EIA', None,
3050
        'EIGL-1', 'EIK', None,
3051
        'EIGH$', 'EI', 'EI',
3052
        'EIH--', 'E', 'E',
3053
        'EILLE$', 'EI', 'EI',
3054
        'EIR(BCDFGKLMNQSTVWZ)-', 'EIA', 'EIA',
3055
        'EIR$', 'EIA', 'EIA',
3056
        'EITRAUBEN------', 'EIT ', 'EIT ',
3057
        'EI', 'EI', 'EI',
3058
        'EJ$', 'EI', 'EI',
3059
        'ELIZ^', 'ELIS', None,
3060
        'ELZ^', 'ELS', None,
3061
        'EL-^', 'E', 'E',
3062
        'ELANG----1', 'E', 'E',
3063
        'EL(DKL)--1', 'E', 'E',
3064
        'EL(MNT)--1$', 'E', 'E',
3065
        'ELYNE$', 'ELINE', 'ELINE',
3066
        'ELYN$', 'ELIN', 'ELIN',
3067
        'EL(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'EL', 'EL',
3068
        'EL-1', 'L', 'L',
3069
        'EM-^', None, 'E',
3070
        'EM(DFKMPQT)--1', None, 'E',
3071
        'EM(AÄEÈÉÊIÌÍÎOÖUÜY)--1', None, 'E',
3072
        'EM-1', None, 'N',
3073
        'ENGAG-^', 'ANGA', 'ANKA',
3074
        'EN-^', 'E', 'E',
3075
        'ENTUEL', 'ENTUEL', None,
3076
        'EN(CDGKQSTZ)--1', 'E', 'E',
3077
        'EN(AÄEÈÉÊIÌÍÎNOÖUÜY)-1', 'EN', 'EN',
3078
        'EN-1', '', '',
3079
        'ERH(AÄEIOÖUÜ)-^', 'ERH', 'ER',
3080
        'ER-^', 'E', 'E',
3081
        'ERREGEND-----', ' ER', ' ER',
3082
        'ERT1$', 'AT', None,
3083
        'ER(DGLKMNRQTZß)-1', 'ER', None,
3084
        'ER(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'ER', 'A',
3085
        'ER1$', 'A', 'A',
3086
        'ER<1', 'A', 'A',
3087
        'ETAT7', 'ETA', 'ETA',
3088
        'ETI(AÄOÖÜU)-', 'EZI', 'EZI',
3089
        'EUERE$', 'EUERE', None,
3090
        'EUERE(NS)-$', 'EUERE', None,
3091
        'EUERE(AIOUY)--', 'EUER', None,
3092
        'EUER(AÄIOÖUÜY)-', 'EUER', None,
3093
        'EUER<', 'EUA', None,
3094
        'EUEU--', '', '',
3095
        'EUILLE$', 'Ö', 'Ö',
3096
        'EUR$', 'ÖR', 'ÖR',
3097
        'EUX', 'Ö', 'Ö',
3098
        'EUSZ$', 'EUS', None,
3099
        'EUTZ$', 'EUS', None,
3100
        'EUYS$', 'EUS', 'EUZ',
3101
        'EUZ$', 'EUS', None,
3102
        'EU', 'EU', 'EU',
3103
        'EVER--<1', 'EW', None,
3104
        'EV(ÄOÖUÜ)-1', 'EW', None,
3105
        'EYER<', 'EIA', 'EIA',
3106
        'EY<', 'EI', 'EI',
3107
        'FACETTE', 'FASET', 'FAZET',
3108
        'FANS--^$', 'FE', 'FE',
3109
        'FAN-^$', 'FE', 'FE',
3110
        'FAULT-', 'FOL', 'FUL',
3111
        'FEE(DL)-', 'FI', 'FI',
3112
        'FEHLER', 'FELA', 'FELA',
3113
        'FE(LMNRST)-3^', 'FE', 'FE',
3114
        'FOERDERN---^', 'FÖRD', 'FÖRT',
3115
        'FOERDERN---', ' FÖRD', ' FÖRT',
3116
        'FOND7', 'FON', 'FUN',
3117
        'FRAIN$', 'FRA', 'FRA',
3118
        'FRISEU(RS)-', 'FRISÖ', 'FRIZÖ',
3119
        'FY9^', 'FÜ', None,
3120
        'FÖRDERN---^', 'FÖRD', 'FÖRT',
3121
        'FÖRDERN---', ' FÖRD', ' FÖRT',
3122
        'GAGS^$', 'GEX', 'KEX',
3123
        'GAG^$', 'GEK', 'KEK',
3124
        'GD', 'KT', 'KT',
3125
        'GEGEN^^', 'GEGN', 'KEKN',
3126
        'GEGENGEKOM-----', 'GEGN ', 'KEKN ',
3127
        'GEGENGESET-----', 'GEGN ', 'KEKN ',
3128
        'GEGENKOMME-----', 'GEGN ', 'KEKN ',
3129
        'GEGENZUKOM---', 'GEGN ZU ', 'KEKN ZU ',
3130
        'GENDETWAS-----$', 'GENT ', 'KENT ',
3131
        'GENRE', 'IORE', 'IURE',
3132
        'GE(LMNRST)-3^', 'GE', 'KE',
3133
        'GER(DKT)-', 'GER', None,
3134
        'GETTE$', 'GET', 'KET',
3135
        'GGF.', 'GF.', None,
3136
        'GG-', '', '',
3137
        'GH', 'G', None,
3138
        'GI(AOU)-^', 'I', 'I',
3139
        'GION-3', 'KIO', 'KIU',
3140
        'G(CK)-', '', '',
3141
        'GJ(AEIOU)-^', 'I', 'I',
3142
        'GMBH^$', 'GMBH', 'GMBH',
3143
        'GNAC$', 'NIAK', 'NIAK',
3144
        'GNON$', 'NION', 'NIUN',
3145
        'GN$', 'N', 'N',
3146
        'GONCAL-^', 'GONZA', 'KUNZA',
3147
        'GRY9^', 'GRÜ', None,
3148
        'G(SßXZ)-<', 'K', 'K',
3149
        'GUCK-', 'KU', 'KU',
3150
        'GUISEP-^', 'IUSE', 'IUZE',
3151
        'GUI-^', 'G', 'K',
3152
        'GUTAUSSEH------^', 'GUT ', 'KUT ',
3153
        'GUTGEHEND------^', 'GUT ', 'KUT ',
3154
        'GY9^', 'GÜ', None,
3155
        'G(AÄEILOÖRUÜY)-', 'G', None,
3156
        'G(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'G', None,
3157
        'G\'S$', 'X', 'X',
3158
        'G´S$', 'X', 'X',
3159
        'G^', 'G', None,
3160
        'G', 'K', 'K',
3161
        'HA(HIUY)--1', 'H', None,
3162
        'HANDVOL---^', 'HANT ', 'ANT ',
3163
        'HANNOVE-^', 'HANOF', None,
3164
        'HAVEN7$', 'HAFN', None,
3165
        'HEAD-', 'HE', 'E',
3166
        'HELIEGEN------', 'E ', 'E ',
3167
        'HESTEHEN------', 'E ', 'E ',
3168
        'HE(LMNRST)-3^', 'HE', 'E',
3169
        'HE(LMN)-1', 'E', 'E',
3170
        'HEUR1$', 'ÖR', 'ÖR',
3171
        'HE(HIUY)--1', 'H', None,
3172
        'HIH(AÄEIOÖUÜY)-1', 'IH', None,
3173
        'HLH(AÄEIOÖUÜY)-1', 'LH', None,
3174
        'HMH(AÄEIOÖUÜY)-1', 'MH', None,
3175
        'HNH(AÄEIOÖUÜY)-1', 'NH', None,
3176
        'HOBBY9^', 'HOBI', None,
3177
        'HOCHBEGAB-----^', 'HOCH ', 'UK ',
3178
        'HOCHTALEN-----^', 'HOCH ', 'UK ',
3179
        'HOCHZUFRI-----^', 'HOCH ', 'UK ',
3180
        'HO(HIY)--1', 'H', None,
3181
        'HRH(AÄEIOÖUÜY)-1', 'RH', None,
3182
        'HUH(AÄEIOÖUÜY)-1', 'UH', None,
3183
        'HUIS^^', 'HÜS', 'IZ',
3184
        'HUIS$', 'ÜS', 'IZ',
3185
        'HUI--1', 'H', None,
3186
        'HYGIEN^', 'HÜKIEN', None,
3187
        'HY9^', 'HÜ', None,
3188
        'HY(BDGMNPST)-', 'Ü', None,
3189
        'H.^', None, 'H.',
3190
        'HÄU--1', 'H', None,
3191
        'H^', 'H', '',
3192
        'H', '', '',
3193
        'ICHELL---', 'ISH', 'IZ',
3194
        'ICHI$', 'ISHI', 'IZI',
3195
        'IEC$', 'IZ', 'IZ',
3196
        'IEDENSTELLE------', 'IDN ', 'ITN ',
3197
        'IEI-3', '', '',
3198
        'IELL3', 'IEL', 'IEL',
3199
        'IENNE$', 'IN', 'IN',
3200
        'IERRE$', 'IER', 'IER',
3201
        'IERZULAN---', 'IR ZU ', 'IR ZU ',
3202
        'IETTE$', 'IT', 'IT',
3203
        'IEU', 'IÖ', 'IÖ',
3204
        'IE<4', 'I', 'I',
3205
        'IGL-1', 'IK', None,
3206
        'IGHT3$', 'EIT', 'EIT',
3207
        'IGNI(EO)-', 'INI', 'INI',
3208
        'IGN(AEOU)-$', 'INI', 'INI',
3209
        'IHER(DGLKRT)--1', 'IHE', None,
3210
        'IHE(IUY)--', 'IH', None,
3211
        'IH(AIOÖUÜY)-', 'IH', None,
3212
        'IJ(AOU)-', 'I', 'I',
3213
        'IJ$', 'I', 'I',
3214
        'IJ<', 'EI', 'EI',
3215
        'IKOLE$', 'IKOL', 'IKUL',
3216
        'ILLAN(STZ)--4', 'ILIA', 'ILIA',
3217
        'ILLAR(DT)--4', 'ILIA', 'ILIA',
3218
        'IMSTAN----^', 'IM ', 'IN ',
3219
        'INDELERREGE------', 'INDL ', 'INTL ',
3220
        'INFRAGE-----^$', 'IN ', 'IN ',
3221
        'INTERN(AOU)-^', 'INTAN', 'INTAN',
3222
        'INVER-', 'INWE', 'INFE',
3223
        'ITI(AÄIOÖUÜ)-', 'IZI', 'IZI',
3224
        'IUSZ$', 'IUS', None,
3225
        'IUTZ$', 'IUS', None,
3226
        'IUZ$', 'IUS', None,
3227
        'IVER--<', 'IW', None,
3228
        'IVIER$', 'IWIE', 'IFIE',
3229
        'IV(ÄOÖUÜ)-', 'IW', None,
3230
        'IV<3', 'IW', None,
3231
        'IY2', 'I', None,
3232
        'I(ÈÉÊ)<4', 'I', 'I',
3233
        'JAVIE---<^', 'ZA', 'ZA',
3234
        'JEANS^$', 'JINS', 'INZ',
3235
        'JEANNE^$', 'IAN', 'IAN',
3236
        'JEAN-^', 'IA', 'IA',
3237
        'JER-^', 'IE', 'IE',
3238
        'JE(LMNST)-', 'IE', 'IE',
3239
        'JI^', 'JI', None,
3240
        'JOR(GK)^$', 'IÖRK', 'IÖRK',
3241
        'J', 'I', 'I',
3242
        'KC(ÄEIJ)-', 'X', 'X',
3243
        'KD', 'KT', None,
3244
        'KE(LMNRST)-3^', 'KE', 'KE',
3245
        'KG(AÄEILOÖRUÜY)-', 'K', None,
3246
        'KH<^', 'K', 'K',
3247
        'KIC$', 'KIZ', 'KIZ',
3248
        'KLE(LMNRST)-3^', 'KLE', 'KLE',
3249
        'KOTELE-^', 'KOTL', 'KUTL',
3250
        'KREAT-^', 'KREA', 'KREA',
3251
        'KRÜS(TZ)--^', 'KRI', None,
3252
        'KRYS(TZ)--^', 'KRI', None,
3253
        'KRY9^', 'KRÜ', None,
3254
        'KSCH---', 'K', 'K',
3255
        'KSH--', 'K', 'K',
3256
        'K(SßXZ)7', 'X', 'X',  # implies 'KST' -> 'XT'
3257
        'KT\'S$', 'X', 'X',
3258
        'KTI(AIOU)-3', 'XI', 'XI',
3259
        'KT(SßXZ)', 'X', 'X',
3260
        'KY9^', 'KÜ', None,
3261
        'K\'S$', 'X', 'X',
3262
        'K´S$', 'X', 'X',
3263
        'LANGES$', ' LANGES', ' LANKEZ',
3264
        'LANGE$', ' LANGE', ' LANKE',
3265
        'LANG$', ' LANK', ' LANK',
3266
        'LARVE-', 'LARF', 'LARF',
3267
        'LD(SßZ)$', 'LS', 'LZ',
3268
        'LD\'S$', 'LS', 'LZ',
3269
        'LD´S$', 'LS', 'LZ',
3270
        'LEAND-^', 'LEAN', 'LEAN',
3271
        'LEERSTEHE-----^', 'LER ', 'LER ',
3272
        'LEICHBLEIB-----', 'LEICH ', 'LEIK ',
3273
        'LEICHLAUTE-----', 'LEICH ', 'LEIK ',
3274
        'LEIDERREGE------', 'LEIT ', 'LEIT ',
3275
        'LEIDGEPR----^', 'LEIT ', 'LEIT ',
3276
        'LEINSTEHE-----', 'LEIN ', 'LEIN ',
3277
        'LEL-', 'LE', 'LE',
3278
        'LE(MNRST)-3^', 'LE', 'LE',
3279
        'LETTE$', 'LET', 'LET',
3280
        'LFGNAG-', 'LFGAN', 'LFKAN',
3281
        'LICHERWEIS----', 'LICHA ', 'LIKA ',
3282
        'LIC$', 'LIZ', 'LIZ',
3283
        'LIVE^$', 'LEIF', 'LEIF',
3284
        'LT(SßZ)$', 'LS', 'LZ',
3285
        'LT\'S$', 'LS', 'LZ',
3286
        'LT´S$', 'LS', 'LZ',
3287
        'LUI(GS)--', 'LU', 'LU',
3288
        'LV(AIO)-', 'LW', None,
3289
        'LY9^', 'LÜ', None,
3290
        'LSTS$', 'LS', 'LZ',
3291
        'LZ(BDFGKLMNPQRSTVWX)-', 'LS', None,
3292
        'L(SßZ)$', 'LS', None,
3293
        'MAIR-<', 'MEI', 'NEI',
3294
        'MANAG-', 'MENE', 'NENE',
3295
        'MANUEL', 'MANUEL', None,
3296
        'MASSEU(RS)-', 'MASÖ', 'NAZÖ',
3297
        'MATCH', 'MESH', 'NEZ',
3298
        'MAURICE', 'MORIS', 'NURIZ',
3299
        'MBH^$', 'MBH', 'MBH',
3300
        'MB(ßZ)$', 'MS', None,
3301
        'MB(SßTZ)-', 'M', 'N',
3302
        'MCG9^', 'MAK', 'NAK',
3303
        'MC9^', 'MAK', 'NAK',
3304
        'MEMOIR-^', 'MEMOA', 'NENUA',
3305
        'MERHAVEN$', 'MAHAFN', None,
3306
        'ME(LMNRST)-3^', 'ME', 'NE',
3307
        'MEN(STZ)--3', 'ME', None,
3308
        'MEN$', 'MEN', None,
3309
        'MIGUEL-', 'MIGE', 'NIKE',
3310
        'MIKE^$', 'MEIK', 'NEIK',
3311
        'MITHILFE----^$', 'MIT H', 'NIT ',
3312
        'MN$', 'M', None,
3313
        'MN', 'N', 'N',
3314
        'MPJUTE-', 'MPUT', 'NBUT',
3315
        'MP(ßZ)$', 'MS', None,
3316
        'MP(SßTZ)-', 'M', 'N',
3317
        'MP(BDJLMNPQVW)-', 'MB', 'NB',
3318
        'MY9^', 'MÜ', None,
3319
        'M(ßZ)$', 'MS', None,
3320
        'M´G7^', 'MAK', 'NAK',
3321
        'M\'G7^', 'MAK', 'NAK',
3322
        'M´^', 'MAK', 'NAK',
3323
        'M\'^', 'MAK', 'NAK',
3324
        'M', None, 'N',
3325
        'NACH^^', 'NACH', 'NAK',
3326
        'NADINE', 'NADIN', 'NATIN',
3327
        'NAIV--', 'NA', 'NA',
3328
        'NAISE$', 'NESE', 'NEZE',
3329
        'NAUGENOMM------', 'NAU ', 'NAU ',
3330
        'NAUSOGUT$', 'NAUSO GUT', 'NAUZU KUT',
3331
        'NCH$', 'NSH', 'NZ',
3332
        'NCOISE$', 'SOA', 'ZUA',
3333
        'NCOIS$', 'SOA', 'ZUA',
3334
        'NDAR$', 'NDA', 'NTA',
3335
        'NDERINGEN------', 'NDE ', 'NTE ',
3336
        'NDRO(CDKTZ)-', 'NTRO', None,
3337
        'ND(BFGJLMNPQVW)-', 'NT', None,
3338
        'ND(SßZ)$', 'NS', 'NZ',
3339
        'ND\'S$', 'NS', 'NZ',
3340
        'ND´S$', 'NS', 'NZ',
3341
        'NEBEN^^', 'NEBN', 'NEBN',
3342
        'NENGELERN------', 'NEN ', 'NEN ',
3343
        'NENLERN(ET)---', 'NEN LE', 'NEN LE',
3344
        'NENZULERNE---', 'NEN ZU LE', 'NEN ZU LE',
3345
        'NE(LMNRST)-3^', 'NE', 'NE',
3346
        'NEN-3', 'NE', 'NE',
3347
        'NETTE$', 'NET', 'NET',
3348
        'NGU^^', 'NU', 'NU',
3349
        'NG(BDFJLMNPQRTVW)-', 'NK', 'NK',
3350
        'NH(AUO)-$', 'NI', 'NI',
3351
        'NICHTSAHNEN-----', 'NIX ', 'NIX ',
3352
        'NICHTSSAGE----', 'NIX ', 'NIX ',
3353
        'NICHTS^^', 'NIX', 'NIX',
3354
        'NICHT^^', 'NICHT', 'NIKT',
3355
        'NINE$', 'NIN', 'NIN',
3356
        'NON^^', 'NON', 'NUN',
3357
        'NOTLEIDE-----^', 'NOT ', 'NUT ',
3358
        'NOT^^', 'NOT', 'NUT',
3359
        'NTI(AIOU)-3', 'NZI', 'NZI',
3360
        'NTIEL--3', 'NZI', 'NZI',
3361
        'NT(SßZ)$', 'NS', 'NZ',
3362
        'NT\'S$', 'NS', 'NZ',
3363
        'NT´S$', 'NS', 'NZ',
3364
        'NYLON', 'NEILON', 'NEILUN',
3365
        'NY9^', 'NÜ', None,
3366
        'NSTZUNEH---', 'NST ZU ', 'NZT ZU ',
3367
        'NSZ-', 'NS', None,
3368
        'NSTS$', 'NS', 'NZ',
3369
        'NZ(BDFGKLMNPQRSTVWX)-', 'NS', None,
3370
        'N(SßZ)$', 'NS', None,
3371
        'OBERE-', 'OBER', None,
3372
        'OBER^^', 'OBA', 'UBA',
3373
        'OEU2', 'Ö', 'Ö',
3374
        'OE<2', 'Ö', 'Ö',
3375
        'OGL-', 'OK', None,
3376
        'OGNIE-', 'ONI', 'UNI',
3377
        'OGN(AEOU)-$', 'ONI', 'UNI',
3378
        'OH(AIOÖUÜY)-', 'OH', None,
3379
        'OIE$', 'Ö', 'Ö',
3380
        'OIRE$', 'OA', 'UA',
3381
        'OIR$', 'OA', 'UA',
3382
        'OIX', 'OA', 'UA',
3383
        'OI<3', 'EU', 'EU',
3384
        'OKAY^$', 'OKE', 'UKE',
3385
        'OLYN$', 'OLIN', 'ULIN',
3386
        'OO(DLMZ)-', 'U', None,
3387
        'OO$', 'U', None,
3388
        'OO-', '', '',
3389
        'ORGINAL-----', 'ORI', 'URI',
3390
        'OTI(AÄOÖUÜ)-', 'OZI', 'UZI',
3391
        'OUI^', 'WI', 'FI',
3392
        'OUILLE$', 'ULIE', 'ULIE',
3393
        'OU(DT)-^', 'AU', 'AU',
3394
        'OUSE$', 'AUS', 'AUZ',
3395
        'OUT-', 'AU', 'AU',
3396
        'OU', 'U', 'U',
3397
        'O(FV)$', 'AU', 'AU',  # due to 'OW$' -> 'AU'
3398
        'OVER--<', 'OW', None,
3399
        'OV(AOU)-', 'OW', None,
3400
        'OW$', 'AU', 'AU',
3401
        'OWS$', 'OS', 'UZ',
3402
        'OJ(AÄEIOÖUÜ)--', 'O', 'U',
3403
        'OYER', 'OIA', None,
3404
        'OY(AÄEIOÖUÜ)--', 'O', 'U',
3405
        'O(JY)<', 'EU', 'EU',
3406
        'OZ$', 'OS', None,
3407
        'O´^', 'O', 'U',
3408
        'O\'^', 'O', 'U',
3409
        'O', None, 'U',
3410
        'PATIEN--^', 'PAZI', 'PAZI',
3411
        'PENSIO-^', 'PANSI', 'PANZI',
3412
        'PE(LMNRST)-3^', 'PE', 'PE',
3413
        'PFER-^', 'FE', 'FE',
3414
        'P(FH)<', 'F', 'F',
3415
        'PIC^$', 'PIK', 'PIK',
3416
        'PIC$', 'PIZ', 'PIZ',
3417
        'PIPELINE', 'PEIBLEIN', 'PEIBLEIN',
3418
        'POLYP-', 'POLÜ', None,
3419
        'POLY^^', 'POLI', 'PULI',
3420
        'PORTRAIT7', 'PORTRE', 'PURTRE',
3421
        'POWER7', 'PAUA', 'PAUA',
3422
        'PP(FH)--<', 'B', 'B',
3423
        'PP-', '', '',
3424
        'PRODUZ-^', 'PRODU', 'BRUTU',
3425
        'PRODUZI--', ' PRODU', ' BRUTU',
3426
        'PRIX^$', 'PRI', 'PRI',
3427
        'PS-^^', 'P', None,
3428
        'P(SßZ)^', None, 'Z',
3429
        'P(SßZ)$', 'BS', None,
3430
        'PT-^', '', '',
3431
        'PTI(AÄOÖUÜ)-3', 'BZI', 'BZI',
3432
        'PY9^', 'PÜ', None,
3433
        'P(AÄEIOÖRUÜY)-', 'P', 'P',
3434
        'P(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'P', None,
3435
        'P.^', None, 'P.',
3436
        'P^', 'P', None,
3437
        'P', 'B', 'B',
3438
        'QI-', 'Z', 'Z',
3439
        'QUARANT--', 'KARA', 'KARA',
3440
        'QUE(LMNRST)-3', 'KWE', 'KFE',
3441
        'QUE$', 'K', 'K',
3442
        'QUI(NS)$', 'KI', 'KI',
3443
        'QUIZ7', 'KWIS', None,
3444
        'Q(UV)7', 'KW', 'KF',
3445
        'Q<', 'K', 'K',
3446
        'RADFAHR----', 'RAT ', 'RAT ',
3447
        'RAEFTEZEHRE-----', 'REFTE ', 'REFTE ',
3448
        'RCH', 'RCH', 'RK',
3449
        'REA(DU)---3^', 'R', None,
3450
        'REBSERZEUG------', 'REBS ', 'REBZ ',
3451
        'RECHERCH^', 'RESHASH', 'REZAZ',
3452
        'RECYCL--', 'RIZEI', 'RIZEI',
3453
        'RE(ALST)-3^', 'RE', None,
3454
        'REE$', 'RI', 'RI',
3455
        'RER$', 'RA', 'RA',
3456
        'RE(MNR)-4', 'RE', 'RE',
3457
        'RETTE$', 'RET', 'RET',
3458
        'REUZ$', 'REUZ', None,
3459
        'REW$', 'RU', 'RU',
3460
        'RH<^', 'R', 'R',
3461
        'RJA(MN)--', 'RI', 'RI',
3462
        'ROWD-^', 'RAU', 'RAU',
3463
        'RTEMONNAIE-', 'RTMON', 'RTNUN',
3464
        'RTI(AÄOÖUÜ)-3', 'RZI', 'RZI',
3465
        'RTIEL--3', 'RZI', 'RZI',
3466
        'RV(AEOU)-3', 'RW', None,
3467
        'RY(KN)-$', 'RI', 'RI',
3468
        'RY9^', 'RÜ', None,
3469
        'RÄFTEZEHRE-----', 'REFTE ', 'REFTE ',
3470
        'SAISO-^', 'SES', 'ZEZ',
3471
        'SAFE^$', 'SEIF', 'ZEIF',
3472
        'SAUCE-^', 'SOS', 'ZUZ',
3473
        'SCHLAGGEBEN-----<', 'SHLAK ', 'ZLAK ',
3474
        'SCHSCH---7', '', '',
3475
        'SCHTSCH', 'SH', 'Z',
3476
        'SC(HZ)<', 'SH', 'Z',
3477
        'SC', 'SK', 'ZK',
3478
        'SELBSTST--7^^', 'SELB', 'ZELB',
3479
        'SELBST7^^', 'SELBST', 'ZELBZT',
3480
        'SERVICE7^', 'SÖRWIS', 'ZÖRFIZ',
3481
        'SERVI-^', 'SERW', None,
3482
        'SE(LMNRST)-3^', 'SE', 'ZE',
3483
        'SETTE$', 'SET', 'ZET',
3484
        'SHP-^', 'S', 'Z',
3485
        'SHST', 'SHT', 'ZT',
3486
        'SHTSH', 'SH', 'Z',
3487
        'SHT', 'ST', 'Z',
3488
        'SHY9^', 'SHÜ', None,
3489
        'SH^^', 'SH', None,
3490
        'SH3', 'SH', 'Z',
3491
        'SICHERGEGAN-----^', 'SICHA ', 'ZIKA ',
3492
        'SICHERGEHE----^', 'SICHA ', 'ZIKA ',
3493
        'SICHERGESTEL------^', 'SICHA ', 'ZIKA ',
3494
        'SICHERSTELL-----^', 'SICHA ', 'ZIKA ',
3495
        'SICHERZU(GS)--^', 'SICHA ZU ', 'ZIKA ZU ',
3496
        'SIEGLI-^', 'SIKL', 'ZIKL',
3497
        'SIGLI-^', 'SIKL', 'ZIKL',
3498
        'SIGHT', 'SEIT', 'ZEIT',
3499
        'SIGN', 'SEIN', 'ZEIN',
3500
        'SKI(NPZ)-', 'SKI', 'ZKI',
3501
        'SKI<^', 'SHI', 'ZI',
3502
        'SODASS^$', 'SO DAS', 'ZU TAZ',
3503
        'SODAß^$', 'SO DAS', 'ZU TAZ',
3504
        'SOGENAN--^', 'SO GEN', 'ZU KEN',
3505
        'SOUND-', 'SAUN', 'ZAUN',
3506
        'STAATS^^', 'STAZ', 'ZTAZ',
3507
        'STADT^^', 'STAT', 'ZTAT',
3508
        'STANDE$', ' STANDE', ' ZTANTE',
3509
        'START^^', 'START', 'ZTART',
3510
        'STAURANT7', 'STORAN', 'ZTURAN',
3511
        'STEAK-', 'STE', 'ZTE',
3512
        'STEPHEN-^$', 'STEW', None,
3513
        'STERN', 'STERN', None,
3514
        'STRAF^^', 'STRAF', 'ZTRAF',
3515
        'ST\'S$', 'Z', 'Z',
3516
        'ST´S$', 'Z', 'Z',
3517
        'STST--', '', '',
3518
        'STS(ACEÈÉÊHIÌÍÎOUÄÜÖ)--', 'ST', 'ZT',
3519
        'ST(SZ)', 'Z', 'Z',
3520
        'SPAREN---^', 'SPA', 'ZPA',
3521
        'SPAREND----', ' SPA', ' ZPA',
3522
        'S(PTW)-^^', 'S', None,
3523
        'SP', 'SP', None,
3524
        'STYN(AE)-$', 'STIN', 'ZTIN',
3525
        'ST', 'ST', 'ZT',
3526
        'SUITE<', 'SIUT', 'ZIUT',
3527
        'SUKE--$', 'S', 'Z',
3528
        'SURF(EI)-', 'SÖRF', 'ZÖRF',
3529
        'SV(AEÈÉÊIÌÍÎOU)-<^', 'SW', None,
3530
        'SYB(IY)--^', 'SIB', None,
3531
        'SYL(KVW)--^', 'SI', None,
3532
        'SY9^', 'SÜ', None,
3533
        'SZE(NPT)-^', 'ZE', 'ZE',
3534
        'SZI(ELN)-^', 'ZI', 'ZI',
3535
        'SZCZ<', 'SH', 'Z',
3536
        'SZT<', 'ST', 'ZT',
3537
        'SZ<3', 'SH', 'Z',
3538
        'SÜL(KVW)--^', 'SI', None,
3539
        'S', None, 'Z',
3540
        'TCH', 'SH', 'Z',
3541
        'TD(AÄEIOÖRUÜY)-', 'T', None,
3542
        'TD(ÀÁÂÃÅÈÉÊËÌÍÎÏÒÓÔÕØÙÚÛÝŸ)-', 'T', None,
3543
        'TEAT-^', 'TEA', 'TEA',
3544
        'TERRAI7^', 'TERA', 'TERA',
3545
        'TE(LMNRST)-3^', 'TE', 'TE',
3546
        'TH<', 'T', 'T',
3547
        'TICHT-', 'TIK', 'TIK',
3548
        'TICH$', 'TIK', 'TIK',
3549
        'TIC$', 'TIZ', 'TIZ',
3550
        'TIGGESTELL-------', 'TIK ', 'TIK ',
3551
        'TIGSTELL-----', 'TIK ', 'TIK ',
3552
        'TOAS-^', 'TO', 'TU',
3553
        'TOILET-', 'TOLE', 'TULE',
3554
        'TOIN-', 'TOA', 'TUA',
3555
        'TRAECHTI-^', 'TRECHT', 'TREKT',
3556
        'TRAECHTIG--', ' TRECHT', ' TREKT',
3557
        'TRAINI-', 'TREN', 'TREN',
3558
        'TRÄCHTI-^', 'TRECHT', 'TREKT',
3559
        'TRÄCHTIG--', ' TRECHT', ' TREKT',
3560
        'TSCH', 'SH', 'Z',
3561
        'TSH', 'SH', 'Z',
3562
        'TST', 'ZT', 'ZT',
3563
        'T(Sß)', 'Z', 'Z',
3564
        'TT(SZ)--<', '', '',
3565
        'TT9', 'T', 'T',
3566
        'TV^$', 'TV', 'TV',
3567
        'TX(AEIOU)-3', 'SH', 'Z',
3568
        'TY9^', 'TÜ', None,
3569
        'TZ-', '', '',
3570
        'T\'S3$', 'Z', 'Z',
3571
        'T´S3$', 'Z', 'Z',
3572
        'UEBEL(GNRW)-^^', 'ÜBL ', 'IBL ',
3573
        'UEBER^^', 'ÜBA', 'IBA',
3574
        'UE2', 'Ü', 'I',
3575
        'UGL-', 'UK', None,
3576
        'UH(AOÖUÜY)-', 'UH', None,
3577
        'UIE$', 'Ü', 'I',
3578
        'UM^^', 'UM', 'UN',
3579
        'UNTERE--3', 'UNTE', 'UNTE',
3580
        'UNTER^^', 'UNTA', 'UNTA',
3581
        'UNVER^^', 'UNFA', 'UNFA',
3582
        'UN^^', 'UN', 'UN',
3583
        'UTI(AÄOÖUÜ)-', 'UZI', 'UZI',
3584
        'UVE-4', 'UW', None,
3585
        'UY2', 'UI', None,
3586
        'UZZ', 'AS', 'AZ',
3587
        'VACL-^', 'WAZ', 'FAZ',
3588
        'VAC$', 'WAZ', 'FAZ',
3589
        'VAN DEN ^', 'FANDN', 'FANTN',
3590
        'VANES-^', 'WANE', None,
3591
        'VATRO-', 'WATR', None,
3592
        'VA(DHJNT)--^', 'F', None,
3593
        'VEDD-^', 'FE', 'FE',
3594
        'VE(BEHIU)--^', 'F', None,
3595
        'VEL(BDLMNT)-^', 'FEL', None,
3596
        'VENTZ-^', 'FEN', None,
3597
        'VEN(NRSZ)-^', 'FEN', None,
3598
        'VER(AB)-^$', 'WER', None,
3599
        'VERBAL^$', 'WERBAL', None,
3600
        'VERBAL(EINS)-^', 'WERBAL', None,
3601
        'VERTEBR--', 'WERTE', None,
3602
        'VEREIN-----', 'F', None,
3603
        'VEREN(AEIOU)-^', 'WEREN', None,
3604
        'VERIFI', 'WERIFI', None,
3605
        'VERON(AEIOU)-^', 'WERON', None,
3606
        'VERSEN^', 'FERSN', 'FAZN',
3607
        'VERSIERT--^', 'WERSI', None,
3608
        'VERSIO--^', 'WERS', None,
3609
        'VERSUS', 'WERSUS', None,
3610
        'VERTI(GK)-', 'WERTI', None,
3611
        'VER^^', 'FER', 'FA',
3612
        'VERSPRECHE-------', ' FER', ' FA',
3613
        'VER$', 'WA', None,
3614
        'VER', 'FA', 'FA',
3615
        'VET(HT)-^', 'FET', 'FET',
3616
        'VETTE$', 'WET', 'FET',
3617
        'VE^', 'WE', None,
3618
        'VIC$', 'WIZ', 'FIZ',
3619
        'VIELSAGE----', 'FIL ', 'FIL ',
3620
        'VIEL', 'FIL', 'FIL',
3621
        'VIEW', 'WIU', 'FIU',
3622
        'VILL(AE)-', 'WIL', None,
3623
        'VIS(ACEIKUVWZ)-<^', 'WIS', None,
3624
        'VI(ELS)--^', 'F', None,
3625
        'VILLON--', 'WILI', 'FILI',
3626
        'VIZE^^', 'FIZE', 'FIZE',
3627
        'VLIE--^', 'FL', None,
3628
        'VL(AEIOU)--', 'W', None,
3629
        'VOKA-^', 'WOK', None,
3630
        'VOL(ATUVW)--^', 'WO', None,
3631
        'VOR^^', 'FOR', 'FUR',
3632
        'VR(AEIOU)--', 'W', None,
3633
        'VV9', 'W', None,
3634
        'VY9^', 'WÜ', 'FI',
3635
        'V(ÜY)-', 'W', None,
3636
        'V(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'W', None,
3637
        'V(AEIJLRU)-<', 'W', None,
3638
        'V.^', 'V.', None,
3639
        'V<', 'F', 'F',
3640
        'WEITERENTWI-----^', 'WEITA ', 'FEITA ',
3641
        'WEITREICH-----^', 'WEIT ', 'FEIT ',
3642
        'WEITVER^', 'WEIT FER', 'FEIT FA',
3643
        'WE(LMNRST)-3^', 'WE', 'FE',
3644
        'WER(DST)-', 'WER', None,
3645
        'WIC$', 'WIZ', 'FIZ',
3646
        'WIEDERU--', 'WIDE', 'FITE',
3647
        'WIEDER^$', 'WIDA', 'FITA',
3648
        'WIEDER^^', 'WIDA ', 'FITA ',
3649
        'WIEVIEL', 'WI FIL', 'FI FIL',
3650
        'WISUEL', 'WISUEL', None,
3651
        'WR-^', 'W', None,
3652
        'WY9^', 'WÜ', 'FI',
3653
        'W(BDFGJKLMNPQRSTZ)-', 'F', None,
3654
        'W$', 'F', None,
3655
        'W', None, 'F',
3656
        'X<^', 'Z', 'Z',
3657
        'XHAVEN$', 'XAFN', None,
3658
        'X(CSZ)', 'X', 'X',
3659
        'XTS(CH)--', 'XT', 'XT',
3660
        'XT(SZ)', 'Z', 'Z',
3661
        'YE(LMNRST)-3^', 'IE', 'IE',
3662
        'YE-3', 'I', 'I',
3663
        'YOR(GK)^$', 'IÖRK', 'IÖRK',
3664
        'Y(AOU)-<7', 'I', 'I',
3665
        'Y(BKLMNPRSTX)-1', 'Ü', None,
3666
        'YVES^$', 'IF', 'IF',
3667
        'YVONNE^$', 'IWON', 'IFUN',
3668
        'Y.^', 'Y.', None,
3669
        'Y', 'I', 'I',
3670
        'ZC(AOU)-', 'SK', 'ZK',
3671
        'ZE(LMNRST)-3^', 'ZE', 'ZE',
3672
        'ZIEJ$', 'ZI', 'ZI',
3673
        'ZIGERJA(HR)-3', 'ZIGA IA', 'ZIKA IA',
3674
        'ZL(AEIOU)-', 'SL', None,
3675
        'ZS(CHT)--', '', '',
3676
        'ZS', 'SH', 'Z',
3677
        'ZUERST', 'ZUERST', 'ZUERST',
3678
        'ZUGRUNDE^$', 'ZU GRUNDE', 'ZU KRUNTE',
3679
        'ZUGRUNDE', 'ZU GRUNDE ', 'ZU KRUNTE ',
3680
        'ZUGUNSTEN', 'ZU GUNSTN', 'ZU KUNZTN',
3681
        'ZUHAUSE-', 'ZU HAUS', 'ZU AUZ',
3682
        'ZULASTEN^$', 'ZU LASTN', 'ZU LAZTN',
3683
        'ZURUECK^^', 'ZURÜK', 'ZURIK',
3684
        'ZURZEIT', 'ZUR ZEIT', 'ZUR ZEIT',
3685
        'ZURÜCK^^', 'ZURÜK', 'ZURIK',
3686
        'ZUSTANDE', 'ZU STANDE', 'ZU ZTANTE',
3687
        'ZUTAGE', 'ZU TAGE', 'ZU TAKE',
3688
        'ZUVER^^', 'ZUFA', 'ZUFA',
3689
        'ZUVIEL', 'ZU FIL', 'ZU FIL',
3690
        'ZUWENIG', 'ZU WENIK', 'ZU FENIK',
3691
        'ZY9^', 'ZÜ', None,
3692
        'ZYK3$', 'ZIK', None,
3693
        'Z(VW)7^', 'SW', None,
3694
        None, None, None)
3695
3696
    # Lookup tables filled in by _initialize_phonet(). Counter is used so
    # that a key which was never assigned reads back as 0.
    alpha_pos = Counter()
    phonet_hash = Counter()

    phonet_hash_1 = Counter()
    phonet_hash_2 = Counter()

    # Translation table (code point -> replacement character) mapping each
    # lower-case letter to the upper-case letter at the same position.
    _phonet_upper_translation = {
        ord(lower): upper for lower, upper in zip(
            'abcdefghijklmnopqrstuvwxyzàáâãåäæ'
            'çðèéêëìíîïñòóôõöøœšßþùúûüýÿ',
            'ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÅÄÆ'
            'ÇÐÈÉÊËÌÍÎÏÑÒÓÔÕÖØŒŠßÞÙÚÛÜÝŸ')}
3707
3708
    def _trinfo(text, rule, err_text, lang):
        """Output debug information about a single rule.

        Prints one line containing the label, the 1-based ordinal of the
        rule, its three entries (with None shown as '(NULL)'), and a
        trailing message.

        :param str text: label printed in front of the rule number
        :param int rule: index of the rule's first entry in the rule tuple;
            the entries at ``rule + 1`` and ``rule + 2`` are printed as well
        :param str err_text: message printed after the rule entries
        :param str lang: ``'none'`` selects ``_phonet_rules_no_lang``; any
            other value selects ``_phonet_rules_german``
        """
        if lang == 'none':
            _phonet_rules = _phonet_rules_no_lang
        else:
            _phonet_rules = _phonet_rules_german

        from_rule = ('(NULL)' if _phonet_rules[rule] is None else
                     _phonet_rules[rule])
        to_rule1 = ('(NULL)' if (_phonet_rules[rule + 1] is None) else
                    _phonet_rules[rule + 1])
        to_rule2 = ('(NULL)' if (_phonet_rules[rule + 2] is None) else
                    _phonet_rules[rule + 2])

        # Rules occupy three consecutive entries, so the ordinal is the
        # index divided by three. Floor division is required because this
        # module enables true division, which would print e.g. "2.0".
        rule_number = (rule // 3) + 1

        print('"{} {}:  "{}"{}"{}" {}'.format(text, rule_number,
                                              from_rule, to_rule1, to_rule2,
                                              err_text))
3724
3725
    def _initialize_phonet(lang):
        """Populate the phonet lookup tables for the selected rule set.

        Fills ``alpha_pos``, ``phonet_hash``, ``phonet_hash_1`` and
        ``phonet_hash_2`` in place; nothing is returned.

        :param str lang: ``'none'`` selects ``_phonet_rules_no_lang``; any
            other value selects ``_phonet_rules_german``
        """
        rules = (_phonet_rules_no_lang if lang == 'none'
                 else _phonet_rules_german)

        phonet_hash[''] = -1

        # German and international umlauts all share position 1
        for letter in frozenset('ÀÁÂÃÅÄÆÇÐÈÉÊËÌÍÎÏÑÒÓÔÕÖØŒŠßÞÙÚÛÜÝŸ'):
            alpha_pos[letter] = 1
            phonet_hash[letter] = -1

        # "normal" letters ('A'-'Z') take positions 2..27
        for position, letter in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ', 2):
            alpha_pos[letter] = position
            phonet_hash[letter] = -1

        # mark every (first letter, following letter position) slot unset
        for row in range(26):
            for col in range(28):
                phonet_hash_1[row, col] = -1
                phonet_hash_2[row, col] = -1

        # Only every third entry is a search pattern; the two entries after
        # it are its replacements.
        for index in range(0, len(rules), 3):
            pattern = rules[index]
            if not pattern:
                continue

            # first hash: the earliest pattern starting with this character
            # that has at least one non-empty replacement
            first = pattern[0]
            if phonet_hash[first] < 0 and (rules[index + 1] or
                                           rules[index + 2]):
                phonet_hash[first] = index

            # second hashes exist only for patterns starting with 'A'-'Z'
            if alpha_pos[first] < 2:
                continue
            row = alpha_pos[first] - 2

            # Work out which characters may follow the first letter: a
            # single blank when the pattern has no second character, the
            # contents of a parenthesised group, or just the second
            # character itself.
            remainder = pattern[1:]
            if not remainder:
                followers = ' '
            elif remainder[0] == '(':
                followers = remainder[1:]
            else:
                followers = remainder[0]

            for follower in followers:
                if follower == ')':
                    break

                col = alpha_pos[follower]

                if col > 0:
                    # record the hash value for this specific letter
                    if phonet_hash_1[row, col] < 0:
                        phonet_hash_1[row, col] = index
                        phonet_hash_2[row, col] = index

                    if phonet_hash_2[row, col] >= (index - 30):
                        phonet_hash_2[row, col] = index
                    else:
                        # last recorded index is more than 30 entries back:
                        # fall through to the slot for all letters instead
                        col = -1

                if col <= 0:
                    # record the hash value for all letters
                    if phonet_hash_1[row, 0] < 0:
                        phonet_hash_1[row, 0] = index

                    phonet_hash_2[row, 0] = index
3797
3798
    def _phonet(term, mode, lang, trace):
0 ignored issues
show
Comprehensibility introduced by
This function exceeds the maximum number of variables (30/15).
Loading history...
3799
        """Return the phonet coded form of a term."""
3800
        if lang == 'none':
3801
            _phonet_rules = _phonet_rules_no_lang
3802
        else:
3803
            _phonet_rules = _phonet_rules_german
3804
3805
        char0 = ''
3806
        dest = term
3807
3808
        if not term:
3809
            return ''
3810
3811
        term_length = len(term)
3812
3813
        # convert input string to upper-case
3814
        src = term.translate(_phonet_upper_translation)
3815
3816
        # check "src"
3817
        i = 0
3818
        j = 0
3819
        zeta = 0
3820
3821
        while i < len(src):
0 ignored issues
show
unused-code introduced by
Too many nested blocks (6/5)
Loading history...
unused-code introduced by
Too many nested blocks (8/5)
Loading history...
unused-code introduced by
Too many nested blocks (9/5)
Loading history...
unused-code introduced by
Too many nested blocks (7/5)
Loading history...
3822
            char = src[i]
3823
3824
            if trace:
3825
                print('\ncheck position {}:  src = "{}",  dest = "{}"'.format
3826
                      (j, src[i:], dest[:j]))
3827
3828
            pos = alpha_pos[char]
3829
3830
            if pos >= 2:
3831
                xpos = pos-2
3832
3833
                if i+1 == len(src):
3834
                    pos = alpha_pos['']
3835
                else:
3836
                    pos = alpha_pos[src[i+1]]
3837
3838
                start1 = phonet_hash_1[xpos, pos]
3839
                start2 = phonet_hash_1[xpos, 0]
3840
                end1 = phonet_hash_2[xpos, pos]
3841
                end2 = phonet_hash_2[xpos, 0]
3842
3843
                # preserve rule priorities
3844
                if (start2 >= 0) and ((start1 < 0) or (start2 < start1)):
3845
                    pos = start1
0 ignored issues
show
Unused Code introduced by
Consider using tuple unpacking for swapping variables
Loading history...
3846
                    start1 = start2
3847
                    start2 = pos
3848
                    pos = end1
0 ignored issues
show
Unused Code introduced by
Consider using tuple unpacking for swapping variables
Loading history...
3849
                    end1 = end2
3850
                    end2 = pos
3851
3852
                if (end1 >= start2) and (start2 >= 0):
0 ignored issues
show
Unused Code introduced by
Simplify chained comparison between the operands
Loading history...
3853
                    if end2 > end1:
3854
                        end1 = end2
3855
3856
                    start2 = -1
3857
                    end2 = -1
3858
            else:
3859
                pos = phonet_hash[char]
3860
                start1 = pos
3861
                end1 = 10000
3862
                start2 = -1
3863
                end2 = -1
3864
3865
            pos = start1
3866
            zeta0 = 0
3867
3868
            if pos >= 0:
3869
                # check rules for this char
3870
                while ((_phonet_rules[pos] is None) or
3871
                       (_phonet_rules[pos][0] == char)):
3872
                    if pos > end1:
3873
                        if start2 > 0:
3874
                            pos = start2
3875
                            start1 = start2
3876
                            start2 = -1
3877
                            end1 = end2
3878
                            end2 = -1
3879
                            continue
3880
3881
                        break
3882
3883
                    if (((_phonet_rules[pos] is None) or
3884
                         (_phonet_rules[pos + mode] is None))):
3885
                        # no conversion rule available
3886
                        pos += 3
3887
                        continue
3888
3889
                    if trace:
3890
                        _trinfo('> rule no.', pos, 'is being checked', lang)
3891
3892
                    # check whole string
3893
                    matches = 1  # number of matching letters
3894
                    priority = 5  # default priority
3895
                    rule = _phonet_rules[pos]
3896
                    rule = rule[1:]
3897
3898
                    while (rule and
3899
                           (len(src) > (i + matches)) and
3900
                           (src[i + matches] == rule[0]) and
3901
                           not rule[0].isdigit() and
3902
                           (rule not in '(-<^$')):
3903
                        matches += 1
3904
                        rule = rule[1:]
3905
3906
                    if rule and (rule[0] == '('):
3907
                        # check an array of letters
3908
                        if (((len(src) > (i + matches)) and
3909
                             src[i + matches].isalpha() and
3910
                             (src[i + matches] in rule[1:]))):
3911
                            matches += 1
3912
3913
                            while rule and rule[0] != ')':
3914
                                rule = rule[1:]
3915
3916
                            # if rule[0] == ')':
3917
                            rule = rule[1:]
3918
3919
                    if rule:
3920
                        priority0 = ord(rule[0])
3921
                    else:
3922
                        priority0 = 0
3923
3924
                    matches0 = matches
3925
3926
                    while rule and rule[0] == '-' and matches > 1:
3927
                        matches -= 1
3928
                        rule = rule[1:]
3929
3930
                    if rule and rule[0] == '<':
3931
                        rule = rule[1:]
3932
3933
                    if rule and rule[0].isdigit():
3934
                        # read priority
3935
                        priority = int(rule[0])
3936
                        rule = rule[1:]
3937
3938
                    if rule and rule[0:2] == '^^':
3939
                        rule = rule[1:]
3940
3941
                    if (not rule or
0 ignored issues
show
best-practice introduced by
Too many boolean expressions in if statement (12/5)
Loading history...
3942
                            ((rule[0] == '^') and
3943
                             ((i == 0) or not src[i-1].isalpha()) and
3944
                             ((rule[1:2] != '$') or
3945
                              (not (src[i+matches0:i+matches0+1].isalpha()) and
3946
                               (src[i+matches0:i+matches0+1] != '.')))) or
3947
                            ((rule[0] == '$') and (i > 0) and
3948
                             src[i-1].isalpha() and
3949
                             ((not src[i+matches0:i+matches0+1].isalpha()) and
3950
                              (src[i+matches0:i+matches0+1] != '.')))):
3951
                        # look for continuation, if:
3952
                        # matches > 1 und NO '-' in first string */
3953
                        pos0 = -1
3954
3955
                        start3 = 0
3956
                        start4 = 0
3957
                        end3 = 0
3958
                        end4 = 0
3959
3960
                        if (((matches > 1) and
3961
                             src[i+matches:i+matches+1] and
3962
                             (priority0 != ord('-')))):
3963
                            char0 = src[i+matches-1]
3964
                            pos0 = alpha_pos[char0]
3965
3966
                            if pos0 >= 2 and src[i+matches]:
3967
                                xpos = pos0 - 2
3968
                                pos0 = alpha_pos[src[i+matches]]
3969
                                start3 = phonet_hash_1[xpos, pos0]
3970
                                start4 = phonet_hash_1[xpos, 0]
3971
                                end3 = phonet_hash_2[xpos, pos0]
3972
                                end4 = phonet_hash_2[xpos, 0]
3973
3974
                                # preserve rule priorities
3975
                                if (((start4 >= 0) and
3976
                                     ((start3 < 0) or (start4 < start3)))):
3977
                                    pos0 = start3
0 ignored issues
show
Unused Code introduced by
Consider using tuple unpacking for swapping variables
Loading history...
3978
                                    start3 = start4
3979
                                    start4 = pos0
3980
                                    pos0 = end3
0 ignored issues
show
Unused Code introduced by
Consider using tuple unpacking for swapping variables
Loading history...
3981
                                    end3 = end4
3982
                                    end4 = pos0
3983
3984
                                if (end3 >= start4) and (start4 >= 0):
0 ignored issues
show
Unused Code introduced by
Simplify chained comparison between the operands
Loading history...
3985
                                    if end4 > end3:
3986
                                        end3 = end4
3987
3988
                                    start4 = -1
3989
                                    end4 = -1
3990
                            else:
3991
                                pos0 = phonet_hash[char0]
3992
                                start3 = pos0
3993
                                end3 = 10000
3994
                                start4 = -1
3995
                                end4 = -1
3996
3997
                            pos0 = start3
3998
3999
                        # check continuation rules for src[i+matches]
4000
                        if pos0 >= 0:
4001
                            while ((_phonet_rules[pos0] is None) or
4002
                                   (_phonet_rules[pos0][0] == char0)):
0 ignored issues
show
introduced by
Value '_phonet_rules[pos0]' is unsubscriptable
Loading history...
4003
                                if pos0 > end3:
4004
                                    if start4 > 0:
4005
                                        pos0 = start4
4006
                                        start3 = start4
4007
                                        start4 = -1
4008
                                        end3 = end4
4009
                                        end4 = -1
4010
                                        continue
4011
4012
                                    priority0 = -1
4013
4014
                                    # important
4015
                                    break
4016
4017
                                if (((_phonet_rules[pos0] is None) or
4018
                                     (_phonet_rules[pos0 + mode] is None))):
4019
                                    # no conversion rule available
4020
                                    pos0 += 3
4021
                                    continue
4022
4023
                                if trace:
4024
                                    _trinfo('> > continuation rule no.', pos0,
4025
                                            'is being checked', lang)
4026
4027
                                # check whole string
4028
                                matches0 = matches
4029
                                priority0 = 5
4030
                                rule = _phonet_rules[pos0]
4031
                                rule = rule[1:]
0 ignored issues
show
introduced by
Value 'rule' is unsubscriptable
Loading history...
4032
4033
                                while (rule and
4034
                                       (src[i+matches0:i+matches0+1] ==
4035
                                        rule[0]) and
4036
                                       (not rule[0].isdigit() or
4037
                                        (rule in '(-<^$'))):
4038
                                    matches0 += 1
4039
                                    rule = rule[1:]
4040
4041
                                if rule and rule[0] == '(':
4042
                                    # check an array of letters
4043
                                    if ((src[i+matches0:i+matches0+1]
4044
                                         .isalpha() and
4045
                                         (src[i+matches0] in rule[1:]))):
4046
                                        matches0 += 1
4047
4048
                                        while rule and rule[0] != ')':
4049
                                            rule = rule[1:]
4050
4051
                                        # if rule[0] == ')':
4052
                                        rule = rule[1:]
4053
4054
                                while rule and rule[0] == '-':
4055
                                    # "matches0" is NOT decremented
4056
                                    # because of  "if (matches0 == matches)"
4057
                                    rule = rule[1:]
4058
4059
                                if rule and rule[0] == '<':
4060
                                    rule = rule[1:]
4061
4062
                                if rule and rule[0].isdigit():
4063
                                    priority0 = int(rule[0])
4064
                                    rule = rule[1:]
4065
4066
                                if (not rule or
4067
                                        # rule == '^' is not possible here
4068
                                        ((rule[0] == '$') and not
4069
                                         src[i+matches0:i+matches0+1]
4070
                                         .isalpha() and
4071
                                         (src[i+matches0:i+matches0+1]
4072
                                          != '.'))):
4073
                                    if matches0 == matches:
4074
                                        # this is only a partial string
4075
                                        if trace:
4076
                                            _trinfo('> > continuation ' +
4077
                                                    'rule no.',
4078
                                                    pos0,
4079
                                                    'not used (too short)',
4080
                                                    lang)
4081
4082
                                        pos0 += 3
4083
                                        continue
4084
4085
                                    if priority0 < priority:
4086
                                        # priority is too low
4087
                                        if trace:
4088
                                            _trinfo('> > continuation ' +
4089
                                                    'rule no.',
4090
                                                    pos0,
4091
                                                    'not used (priority)',
4092
                                                    lang)
4093
4094
                                        pos0 += 3
4095
                                        continue
4096
4097
                                    # continuation rule found
4098
                                    break
4099
4100
                                if trace:
4101
                                    _trinfo('> > continuation rule no.', pos0,
4102
                                            'not used', lang)
4103
4104
                                pos0 += 3
4105
4106
                            # end of "while"
4107
                            if ((priority0 >= priority) and
4108
                                    ((_phonet_rules[pos0] is not None) and
4109
                                     (_phonet_rules[pos0][0] == char0))):
0 ignored issues
show
introduced by
Value '_phonet_rules[pos0]' is unsubscriptable
Loading history...
4110
4111
                                if trace:
4112
                                    _trinfo('> rule no.', pos, '', lang)
4113
                                    _trinfo('> not used because of ' +
4114
                                            'continuation', pos0, '', lang)
4115
4116
                                pos += 3
4117
                                continue
4118
4119
                        # replace string
4120
                        if trace:
4121
                            _trinfo('Rule no.', pos, 'is applied', lang)
4122
4123
                        if ((_phonet_rules[pos] and
4124
                             ('<' in _phonet_rules[pos][1:]))):
4125
                            priority0 = 1
4126
                        else:
4127
                            priority0 = 0
4128
4129
                        rule = _phonet_rules[pos + mode]
4130
4131
                        if (priority0 == 1) and (zeta == 0):
4132
                            # rule with '<' is applied
4133
                            if ((j > 0) and rule and
4134
                                    ((dest[j-1] == char) or
4135
                                     (dest[j-1] == rule[0]))):
4136
                                j -= 1
4137
4138
                            zeta0 = 1
4139
                            zeta += 1
4140
                            matches0 = 0
4141
4142
                            while rule and src[i+matches0]:
4143
                                src = (src[0:i+matches0] + rule[0] +
4144
                                       src[i+matches0+1:])
4145
                                matches0 += 1
4146
                                rule = rule[1:]
4147
4148
                            if matches0 < matches:
4149
                                src = (src[0:i+matches0] +
4150
                                       src[i+matches:])
4151
4152
                            char = src[i]
4153
                        else:
4154
                            i = i + matches - 1
4155
                            zeta = 0
4156
4157
                            while len(rule) > 1:
4158
                                if (j == 0) or (dest[j - 1] != rule[0]):
4159
                                    dest = (dest[0:j] + rule[0] +
4160
                                            dest[min(len(dest), j+1):])
4161
                                    j += 1
4162
4163
                                rule = rule[1:]
4164
4165
                            # new "current char"
4166
                            if not rule:
4167
                                rule = ''
4168
                                char = ''
4169
                            else:
4170
                                char = rule[0]
4171
4172
                            if ((_phonet_rules[pos] and
4173
                                 '^^' in _phonet_rules[pos][1:])):
4174
                                if char:  # pragma: no branch
4175
                                    dest = (dest[0:j] + char +
4176
                                            dest[min(len(dest), j + 1):])
4177
                                    j += 1
4178
4179
                                src = src[i + 1:]
4180
                                i = 0
4181
                                zeta0 = 1
4182
4183
                        break
4184
4185
                    pos += 3
4186
4187
                    if pos > end1 and start2 > 0:
4188
                        pos = start2
4189
                        start1 = start2
4190
                        end1 = end2
4191
                        start2 = -1
4192
                        end2 = -1
4193
4194
            if zeta0 == 0:
4195
                if char and ((j == 0) or (dest[j-1] != char)):
4196
                    # delete multiple letters only
4197
                    dest = dest[0:j] + char + dest[min(j+1, term_length):]
4198
                    j += 1
4199
4200
                i += 1
4201
                zeta = 0
4202
4203
        dest = dest[0:j]
4204
4205
        return dest
4206
4207
    _initialize_phonet(lang)
4208
4209
    word = unicodedata.normalize('NFKC', text_type(word))
4210
    return _phonet(word, mode, lang, trace)
4211
4212
4213
def spfc(word):
    """Return the Standardized Phonetic Frequency Code (SPFC) of a word.

    Standardized Phonetic Frequency Code is roughly Soundex-like.
    This implementation is based on page 19-21 of
    https://archive.org/stream/accessingindivid00moor#page/19/mode/1up

    :param word: the name to transform; either a single string in which a
        period or a space divides the first name from the last name, or a
        two-element tuple/list of (first name, last name)
    :returns: the SPFC value (an empty string if word is empty)
    :rtype: str
    :raises AttributeError: if word is a string that cannot be split into
        exactly two names, an iterable whose length is not 2, or any other
        non-empty value

    >>> spfc('Christopher Smith')
    '01160'
    >>> spfc('Christopher Schmidt')
    '01160'
    >>> spfc('Niall Smith')
    '01660'
    >>> spfc('Niall Schmidt')
    '01660'

    >>> spfc('L.Smith')
    '01960'
    >>> spfc('R.Miller')
    '65490'

    >>> spfc(('L', 'Smith'))
    '01960'
    >>> spfc(('R', 'Miller'))
    '65490'
    """
    # Translation tables (code point -> digit) for str.translate()
    _pf1 = dict(zip((ord(char) for char in 'SZCKQVFPUWABLORDHIEMNXGJT'),
                    '0011112222334445556666777'))
    _pf2 = dict(zip((ord(char) for char in 'SZCKQFPXABORDHIMNGJTUVWEL'),
                    '0011122233445556677788899'))
    _pf3 = dict(zip((ord(char) for char in 'BCKQVDTFLPGJXMNRSZAEHIOUWY'),
                    '00000112223334456677777777'))

    _substitutions = (('DK', 'K'), ('DT', 'T'), ('SC', 'S'), ('KN', 'N'),
                      ('MN', 'N'))

    # Multi-letter endings recognized in step 5, paired with their digit.
    # The order matters: the first ending that matches wins, so the longer
    # 'STN'/'STR' must be tried before the 'TN'/'TR' they end in.
    _pf3_endings = (('STN', '8'), ('PRS', '8'), ('SN', '8'), ('STR', '9'),
                    ('SR', '9'), ('TN', '9'), ('TD', '9'), ('DRS', '7'),
                    ('TR', '7'), ('MN', '7'))

    # Built once here rather than once per character in the filters below
    _letters = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    _dropped = frozenset('AEIOUWHY')

    if not word:
        return ''

    if isinstance(word, (str, text_type)):
        # prefer a period as the divider, then fall back to a space
        names = word.split('.', 1)
        if len(names) != 2:
            names = word.split(' ', 1)
    elif hasattr(word, '__iter__'):
        names = word
    else:
        names = ()

    if len(names) != 2:
        raise AttributeError('word attribute must be a string with a space ' +
                             'or period dividing the first and last names ' +
                             'or a tuple/list consisting of the first and ' +
                             'last names')

    names = [unicodedata.normalize('NFKD', text_type(name.strip()
                                                     .replace('ß', 'SS')
                                                     .upper()))
             for name in names]

    def steps_one_to_three(name):
        """Perform the first three steps of SPFC."""
        # filter out non A-Z
        name = ''.join(char for char in name if char in _letters)

        # 1. In the field, convert DK to K, DT to T, SC to S, KN to N,
        # and MN to N
        for old, new in _substitutions:
            name = name.replace(old, new)

        # 2. In the name field, replace multiple letters with a single letter
        name = _delete_consecutive_repeats(name)

        # 3. Remove vowels, W, H, and Y, but keep the first letter in the name
        # field.
        if name:
            name = name[0] + ''.join(char for char in name[1:]
                                     if char not in _dropped)
        return name

    first, last = (steps_one_to_three(name) for name in names)
    code = ''

    # 4. The first digit of the code is obtained using PF1 and the first letter
    # of the name field. Remove this letter after coding.
    if last:
        code += last[0].translate(_pf1)
        last = last[1:]

    # 5. Using the last letters of the name, use Table PF3 to obtain the
    # second digit of the code. Use as many letters as possible and remove
    # after coding.
    if last:
        for ending, digit in _pf3_endings:
            if last.endswith(ending):
                code += digit
                last = last[:-len(ending)]
                break
        else:
            # no multi-letter ending applies: code the final letter alone
            code += last[-1].translate(_pf3)
            last = last[:-1]

    # 6. The third digit is found using Table PF2 and the first character of
    # the first name. Remove after coding.
    if first:
        code += first[0].translate(_pf2)

    # 7. The fourth digit is found using Table PF2 and the first character of
    # the name field. If no letters remain use zero. After coding remove the
    # letter.
    # 8. The fifth digit is found in the same manner as the fourth using the
    # remaining characters of the name field if any.
    for _ in range(2):
        if last:
            code += last[0].translate(_pf2)
            last = last[1:]
        else:
            code += '0'

    return code
4357
4358
4359
def statistics_canada(word, maxlength=4):
    """Return the Statistics Canada code for a word.

    The original description of this algorithm could not be located, and
    may only have been specified in an unpublished TR. The coding does not
    appear to be in use by Statistics Canada any longer. In its place, this is
    an implementation of the "Census modified Statistics Canada name coding
    procedure".

    The modified version of this algorithm is described in Appendix B of
    Lynch, Billy T. and William L. Arends. `Selection of a Surname Coding
    Procedure for the SRS Record Linkage System.` Statistical Reporting
    Service, U.S. Department of Agriculture, Washington, D.C. February 1977.
    https://naldc.nal.usda.gov/download/27833/PDF

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 4) of the code to return
    :returns: the Statistics Canada name code value (an empty string if word
        contains no letters A-Z after normalization)
    :rtype: str

    >>> statistics_canada('Christopher')
    'CHRS'
    >>> statistics_canada('Niall')
    'NL'
    >>> statistics_canada('Smith')
    'SMTH'
    >>> statistics_canada('Schmidt')
    'SCHM'
    """
    # Built once here rather than once per character in the filters below
    letters = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    vowels = frozenset('AEIOUY')

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in letters)
    if not word:
        return ''

    # keep the first letter as-is; drop vowels (and Y) from the remainder
    code = word[0] + ''.join(c for c in word[1:] if c not in vowels)
    # collapse runs of the same letter; word is already restricted to A-Z,
    # so there are no spaces left to strip afterwards
    code = _delete_consecutive_repeats(code)

    return code[:maxlength]
4405
4406
4407
def lein(word, maxlength=4, zero_pad=True):
    """Return the Lein code for a word.

    This is Lein name coding, based on
    https://naldc-legacy.nal.usda.gov/naldc/download.xhtml?id=27833&content=PDF

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 4) of the code to return
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Lein code (an empty string if word contains no letters A-Z
        after normalization)
    :rtype: str

    >>> lein('Christopher')
    'C351'
    >>> lein('Niall')
    'N300'
    >>> lein('Smith')
    'S210'
    >>> lein('Schmidt')
    'S521'
    """
    # code point -> digit table for the consonants coded by Rule 4
    _lein_translation = dict(zip((ord(char) for char in
                                  'BCDFGJKLMNPQRSTVXZ'),
                                 '451455532245351455'))

    # Built once here rather than once per character in the filters below
    letters = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    dropped = frozenset('AEIOUYWH')

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in letters)

    if not word:
        return ''

    code = word[0]  # Rule 1
    # Rule 2: drop vowels, Y, W, and H after the first letter. A plain filter
    # is used instead of str.maketrans(), which only exists on Python 3;
    # spaces need no handling since word is already restricted to A-Z.
    word = ''.join(c for c in word[1:] if c not in dropped)
    word = _delete_consecutive_repeats(word)  # Rule 3
    code += word.translate(_lein_translation)  # Rule 4

    if zero_pad:
        # append enough zeros that the slice below always yields exactly
        # maxlength characters
        code += ('0'*maxlength)

    return code[:maxlength]
4451
4452
4453
def roger_root(word, maxlength=5, zero_pad=True):
    """Return the Roger Root code for a word.

    This is Roger Root name coding, based on
    https://naldc-legacy.nal.usda.gov/naldc/download.xhtml?id=27833&content=PDF

    The start of the word is coded from one table and every later position
    from a second table; at each position the longest matching letter group
    (up to four letters) is consumed.

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 5) of the code to return
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Roger Root code
    :rtype: str

    >>> roger_root('Christopher')
    '06401'
    >>> roger_root('Niall')
    '02500'
    >>> roger_root('Smith')
    '00310'
    >>> roger_root('Schmidt')
    '06310'
    """
    # uppercase, normalize, decompose, and filter non-A-Z out
    alphabet = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(char for char in word if char in alphabet)

    if not word:
        return ''

    # Codes for the start of the word, keyed by letter-group length.
    # '*' is used to prevent combining by _delete_consecutive_repeats()
    initial_codes = {
        4: {'TSCH': '06'},
        3: {'TSH': '06', 'SCH': '06'},
        2: {'CE': '0*0', 'CH': '06', 'CI': '0*0', 'CY': '0*0', 'DG': '07',
            'GF': '08', 'GM': '03', 'GN': '02', 'KN': '02', 'PF': '08',
            'PH': '08', 'PN': '02', 'SH': '06', 'TS': '0*0', 'WR': '04'},
        1: {'A': '1', 'B': '09', 'C': '07', 'D': '01', 'E': '1', 'F': '08',
            'G': '07', 'H': '2', 'I': '1', 'J': '3', 'K': '07', 'L': '05',
            'M': '03', 'N': '02', 'O': '1', 'P': '09', 'Q': '07', 'R': '04',
            'S': '0*0', 'T': '01', 'U': '1', 'V': '08', 'W': '4', 'X': '07',
            'Y': '5', 'Z': '0*0'},
    }

    # Codes for every position after the start, keyed the same way
    medial_codes = {
        4: {'TSCH': '6'},
        3: {'TSH': '6', 'SCH': '6'},
        2: {'CE': '0', 'CH': '6', 'CI': '0', 'CY': '0', 'DG': '7', 'PH': '8',
            'SH': '6', 'TS': '0'},
        1: {'B': '9', 'C': '7', 'D': '1', 'F': '8', 'G': '7', 'J': '6',
            'K': '7', 'L': '5', 'M': '3', 'N': '2', 'P': '9', 'Q': '7',
            'R': '4', 'S': '0', 'T': '1', 'V': '8', 'X': '7', 'Z': '0',
            'A': '*', 'E': '*', 'H': '*', 'I': '*', 'O': '*', 'U': '*',
            'W': '*', 'Y': '*'},
    }

    def _longest_match(table, start):
        """Return (code, letters consumed) for the longest group at start."""
        for width in (4, 3, 2, 1):
            group = word[start:start + width]
            if group in table[width]:
                return table[width][group], width
        # Advance by one letter if nothing is recognized
        return '', 1

    # Do first digit(s) first
    code, pos = _longest_match(initial_codes, 0)

    # Then code subsequent digits
    while pos < len(word):
        digits, consumed = _longest_match(medial_codes, pos)
        code += digits
        pos += consumed

    code = _delete_consecutive_repeats(code)
    code = code.replace('*', '')

    if zero_pad:
        code += '0'*maxlength

    return code[:maxlength]
4538
4539
4540
def bmpm(word, language_arg=0, name_mode='gen', match_mode='approx',
         concat=False, filter_langs=False):
    """Encode a word with Beider-Morse Phonetic Matching (BMPM).

    This is the public entry point for BMPM: all six arguments are
    forwarded unchanged, positionally and in order, to ``_bmpm``, and its
    result is returned as-is.

    A description of the algorithm is available at:
    http://stevemorse.org/phonetics/bmpm.htm
    The reference implementation, licensed under GPLv3, is available at:
    http://stevemorse.org/phoneticinfo.htm

    :param str word: the word to transform
    :param str language_arg: the language of the term (note that the
        default value is the integer 0, not a string -- presumably meaning
        "no language specified"; TODO confirm against ``_bmpm``);
        supported values include:

            - 'any'
            - 'arabic'
            - 'cyrillic'
            - 'czech'
            - 'dutch'
            - 'english'
            - 'french'
            - 'german'
            - 'greek'
            - 'greeklatin'
            - 'hebrew'
            - 'hungarian'
            - 'italian'
            - 'polish'
            - 'portuguese'
            - 'romanian'
            - 'russian'
            - 'spanish'
            - 'turkish'
            - 'germandjsg'
            - 'polishdjskp'
            - 'russiandjsre'

    :param str name_mode: the name mode of the algorithm:

            - 'gen' -- general (default)
            - 'ash' -- Ashkenazi
            - 'sep' -- Sephardic

    :param str match_mode: matching mode: 'approx' (default) or 'exact'
    :param bool concat: concatenation mode (default False)
    :param bool filter_langs: filter out incompatible languages
        (default False)
    :returns: the BMPM value(s)
    :rtype: tuple

    NOTE(review): the return type is documented as a tuple, but the
    examples below show a single space-separated string -- confirm the
    actual type against ``_bmpm``.

    >>> bmpm('Christopher')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir xristopi xritopir xritopi xristofi xritofir xritofi tzristopir
    tzristofir zristopir zristopi zritopir zritopi zristofir zristofi zritofir
    zritofi'
    >>> bmpm('Niall')
    'nial niol'
    >>> bmpm('Smith')
    'zmit'
    >>> bmpm('Schmidt')
    'zmit stzmit'

    >>> bmpm('Christopher', language_arg='German')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir'
    >>> bmpm('Christopher', language_arg='English')
    'tzristofir tzrQstofir tzristafir tzrQstafir xristofir xrQstofir xristafir
    xrQstafir'
    >>> bmpm('Christopher', language_arg='German', name_mode='ash')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir'

    >>> bmpm('Christopher', language_arg='German', match_mode='exact')
    'xriStopher xriStofer xristopher xristofer'
    """
    # Pure pass-through: the encoding itself is done by _bmpm.
    return _bmpm(
        word, language_arg, name_mode, match_mode, concat, filter_langs
    )
4615
4616
4617
if __name__ == '__main__':
    # When run as a script, execute the doctest examples embedded in this
    # module's docstrings.
    import doctest
    # Some examples (e.g. in bmpm's docstring) wrap their expected output
    # across several lines. NORMALIZE_WHITESPACE makes doctest compare
    # output modulo runs of whitespace, so the line breaks introduced by
    # wrapping do not by themselves cause a mismatch.
    doctest.testmod(optionflags=doctest.NORMALIZE_WHITESPACE)
4620