Completed
Push — master ( a6c366...b119e0 )
by Chris
10:32
created

abydos.phonetic.russell_index()   B

Complexity

Conditions 8

Size

Total Lines 43
Code Lines 16

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 8
eloc 16
nop 1
dl 0
loc 43
rs 7.3333
c 0
b 0
f 0
1
# -*- coding: utf-8 -*-
0 ignored issues
show
coding-style introduced by
Too many lines in module (4796/1000)
Loading history...
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19
"""abydos.phonetic.
20
21
The phonetic module implements phonetic algorithms including:
22
23
    - Robert C. Russell's Index
24
    - American Soundex
25
    - Refined Soundex
26
    - Daitch-Mokotoff Soundex
27
    - Kölner Phonetik
28
    - NYSIIS
29
    - Match Rating Algorithm
30
    - Metaphone
31
    - Double Metaphone
32
    - Caverphone
33
    - Alpha Search Inquiry System
34
    - Fuzzy Soundex
35
    - Phonex
36
    - Phonem
37
    - Phonix
38
    - SfinxBis
39
    - phonet
40
    - Standardized Phonetic Frequency Code
41
    - Statistics Canada
42
    - Lein
43
    - Roger Root
44
    - Oxford Name Compression Algorithm (ONCA)
45
    - Beider-Morse Phonetic Matching
46
"""
47
48
from __future__ import division, unicode_literals
49
50
import re
51
import unicodedata
52
from collections import Counter
53
from itertools import groupby
54
55
from six import text_type
56
from six.moves import range
57
58
from ._bm import _bmpm
59
60
# Sentinel for "no length limit": functions in this module accept it as a
# maxlength value and compare maxlength against it before truncating a code.
_INFINITY = float('inf')
61
62
63
def _delete_consecutive_repeats(word):
64
    """Delete consecutive repeated characters in a word.
65
66
    :param str word: the word to transform
67
    :returns: word with consecutive repeating characters collapsed to
68
        a single instance
69
    :rtype: str
70
    """
71
    return ''.join(char for char, _ in groupby(word))
72
73
74
def russell_index(word):
    """Return the Russell Index (integer output) of a word.

    This follows Robert C. Russell's Index algorithm, as described in
    US Patent 1,261,167 (1917)

    :param str word: the word to transform
    :returns: the Russell Index value; a float NaN is returned when no
        codable letters remain after filtering
    :rtype: int

    >>> russell_index('Christopher')
    3813428
    >>> russell_index('Niall')
    715
    >>> russell_index('Smith')
    3614
    >>> russell_index('Schmidt')
    3614
    """
    # Letters that receive a code, paired position-by-position with digits
    coded_letters = 'ABCDEFGIKLMNOPQRSTUVXYZ'
    letter_to_digit = dict(zip((ord(letter) for letter in coded_letters),
                               '12341231356712383412313'))
    keep = frozenset(coded_letters)

    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = word.replace('GH', '')  # discard gh (rule 3)
    word = word.rstrip('SZ')  # discard /[sz]$/ (rule 3)

    # drop every character that has no code, then translate to digits
    word = ''.join(c for c in word if c in keep)
    sdx = word.translate(letter_to_digit)

    # keep only the first 1; every later 1 is removed
    first_one = sdx.find('1')
    if first_one != -1:
        head = sdx[:first_one + 1]
        sdx = head + sdx[first_one + 1:].replace('1', '')

    # remove repeating characters
    sdx = _delete_consecutive_repeats(sdx)

    # an empty code cannot be converted to an int
    if not sdx:
        return float('NaN')
    return int(sdx)
117
118
119
def russell_index_num_to_alpha(num):
    """Convert the Russell Index integer to an alphabetic string.

    This follows Robert C. Russell's Index algorithm, as described in
    US Patent 1,261,167 (1917)

    :param int num: a Russell Index integer value
    :returns: the Russell Index as an alphabetic string; characters of the
        textual form of num other than the digits 1-8 are ignored
    :rtype: str

    >>> russell_index_num_to_alpha(3813428)
    'CRACDBR'
    >>> russell_index_num_to_alpha(715)
    'NAL'
    >>> russell_index_num_to_alpha(3614)
    'CMAD'
    """
    digits = '12345678'
    digit_to_letter = dict(zip((ord(digit) for digit in digits), 'ABCDLMNR'))
    valid = frozenset(digits)

    # work on the textual form and keep only the digits that have a letter
    num = ''.join(c for c in text_type(num) if c in valid)
    return num.translate(digit_to_letter) if num else ''
142
143
144
def russell_index_alpha(word):
    """Return the Russell Index (alphabetic output) for the word.

    This follows Robert C. Russell's Index algorithm, as described in
    US Patent 1,261,167 (1917)

    :param str word: the word to transform
    :returns: the Russell Index value as an alphabetic string (the empty
        string for an empty word)
    :rtype: str

    >>> russell_index_alpha('Christopher')
    'CRACDBR'
    >>> russell_index_alpha('Niall')
    'NAL'
    >>> russell_index_alpha('Smith')
    'CMAD'
    >>> russell_index_alpha('Schmidt')
    'CMAD'
    """
    if not word:
        return ''
    # compute the numeric index, then map its digits to letters
    numeric_index = russell_index(word)
    return russell_index_num_to_alpha(numeric_index)
166
167
168
def soundex(word, maxlength=4, var='American', reverse=False, zero_pad=True):
    """Return the Soundex code for a word.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 4);
        values are clamped to the range 4-64 and None is treated as 64
    :param str var: the variant of the algorithm to employ (defaults to
        'American'):

        - 'American' follows the American Soundex algorithm, as described at
          http://www.archives.gov/publications/general-info-leaflets/55-census.html
          and in Knuth(1998:394); this is also called Miracode
        - 'special' follows the rules from the 1880-1910 US Census
          retrospective re-analysis, in which h & w are not treated as blocking
          consonants but as vowels.
          Cf. http://creativyst.com/Doc/Articles/SoundEx1/SoundEx1.htm
        - 'dm' computes the Daitch-Mokotoff Soundex (delegates to dm_soundex)
        - 'refined' computes the Refined Soundex (delegates to
          refined_soundex)

    :param bool reverse: reverse the word before computing the selected Soundex
        (defaults to False); This results in "Reverse Soundex"
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Soundex value (whatever dm_soundex or refined_soundex
        returns for the 'dm' and 'refined' variants)
    :rtype: str

    >>> soundex("Christopher")
    'C623'
    >>> soundex("Niall")
    'N400'
    >>> soundex('Smith')
    'S530'
    >>> soundex('Schmidt')
    'S530'


    >>> soundex('Christopher', maxlength=_INFINITY)
    'C623160000000000000000000000000000000000000000000000000000000000'
    >>> soundex('Christopher', maxlength=_INFINITY, zero_pad=False)
    'C62316'

    >>> soundex('Christopher', reverse=True)
    'R132'

    >>> soundex('Ashcroft')
    'A261'
    >>> soundex('Asicroft')
    'A226'
    >>> soundex('Ashcroft', var='special')
    'A226'
    >>> soundex('Asicroft', var='special')
    'A226'

    >>> soundex('Christopher', var='dm')
    {'494379', '594379'}
    >>> soundex('Niall', var='dm')
    {'680000'}
    >>> soundex('Smith', var='dm')
    {'463000'}
    >>> soundex('Schmidt', var='dm')
    {'463000'}
    """
    # Hand the other variants off to their dedicated implementations
    if var == 'dm':
        return dm_soundex(word, maxlength, reverse, zero_pad)
    if var == 'refined':
        return refined_soundex(word, maxlength, reverse, zero_pad)

    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    letter_to_digit = dict(zip((ord(letter) for letter in alphabet),
                               '01230129022455012623019202'))
    letters = frozenset(alphabet)

    # Require a maxlength of at least 4 and not more than 64
    maxlength = 64 if maxlength is None else min(max(4, maxlength), 64)

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in letters)

    # Nothing to convert, return base case
    if not word:
        return '0' * maxlength if zero_pad else '0'

    # Reverse word if computing Reverse Soundex
    if reverse:
        word = word[::-1]

    # apply the Soundex algorithm
    sdx = word.translate(letter_to_digit)

    # H and W are coded 9: the 1880-1910 census rule turns them into the
    # vowel code 0, otherwise they are dropped outright (rule 1)
    sdx = sdx.replace('9', '0' if var == 'special' else '')
    sdx = _delete_consecutive_repeats(sdx)  # rule 3

    # The code starts with the initial letter itself. For an initial H or W
    # the letter is prepended to the digits as they stand; for any other
    # initial the first digit is replaced by the letter.
    initial = word[0]
    if initial in 'HW':
        sdx = initial + sdx
    else:
        sdx = initial + sdx[1:]
    sdx = sdx.replace('0', '')  # rule 1

    if zero_pad:
        sdx += '0' * maxlength  # rule 4

    return sdx[:maxlength]
279
280
281
def refined_soundex(word, maxlength=_INFINITY, reverse=False, zero_pad=False):
    """Return the Refined Soundex code for a word.

    This is Soundex, but with more character classes. It appears to have been
    defined by the Apache Commons:
    https://commons.apache.org/proper/commons-codec/apidocs/src-html/org/apache/commons/codec/language/RefinedSoundex.html

    :param str word: the word to transform
    :param maxlength: the length of the code returned (defaults to unlimited);
        a falsy or infinite value disables both truncation and padding
    :param bool reverse: reverse the word before computing the selected
        Soundex (defaults to False); This results in "Reverse Soundex"
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string (only applies when maxlength is finite)
    :returns: the Refined Soundex value; the empty string (or all zeros when
        zero-padding to a finite maxlength) if the word contains no letters
    :rtype: str

    >>> refined_soundex('Christopher')
    'C3090360109'
    >>> refined_soundex('Niall')
    'N807'
    >>> refined_soundex('Smith')
    'S38060'
    >>> refined_soundex('Schmidt')
    'S30806'
    """
    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    _ref_soundex_translation = dict(zip((ord(letter) for letter in alphabet),
                                        '01360240043788015936020505'))
    letters = frozenset(alphabet)

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in letters)

    # Reverse word if computing Reverse Soundex
    if reverse:
        word = word[::-1]

    # apply the Soundex algorithm; word[:1] (not word[0]) so that a word
    # with no letters yields an empty code instead of raising IndexError
    sdx = word[:1] + word.translate(_ref_soundex_translation)
    sdx = _delete_consecutive_repeats(sdx)

    if maxlength and maxlength < _INFINITY:
        # pad first (only on request), then trim, so the result never
        # exceeds maxlength characters
        if zero_pad:
            sdx += '0' * maxlength  # rule 4
        sdx = sdx[:maxlength]

    return sdx
329
330
331
def dm_soundex(word, maxlength=6, reverse=False, zero_pad=True):
    """Return the Daitch-Mokotoff Soundex code for a word.

    Returns values of a word as a set. A collection is necessary since there
    can be multiple values for a single word.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 6);
        values are clamped to the range 6-64 and None is treated as 64
    :param bool reverse: reverse the word before computing the selected
        Soundex (defaults to False); This results in "Reverse Soundex"
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Daitch-Mokotoff Soundex values
    :rtype: set of str

    >>> dm_soundex('Christopher')
    {'494379', '594379'}
    >>> dm_soundex('Niall')
    {'680000'}
    >>> dm_soundex('Smith')
    {'463000'}
    >>> dm_soundex('Schmidt')
    {'463000'}

    >>> dm_soundex('The quick brown fox', maxlength=20, zero_pad=False)
    {'35457976754', '3557976754'}
    """
    # Each entry maps a letter sequence to a triple of codes, selected by
    # position: (start of word, before a vowel, elsewhere). A code that is
    # itself a tuple holds two alternatives and makes the result branch.
    _dms_table = {'STCH': (2, 4, 4), 'DRZ': (4, 4, 4), 'ZH': (4, 4, 4),
                  'ZHDZH': (2, 4, 4), 'DZH': (4, 4, 4), 'DRS': (4, 4, 4),
                  'DZS': (4, 4, 4), 'SCHTCH': (2, 4, 4), 'SHTSH': (2, 4, 4),
                  'SZCZ': (2, 4, 4), 'TZS': (4, 4, 4), 'SZCS': (2, 4, 4),
                  'STSH': (2, 4, 4), 'SHCH': (2, 4, 4), 'D': (3, 3, 3),
                  'H': (5, 5, '_'), 'TTSCH': (4, 4, 4), 'THS': (4, 4, 4),
                  'L': (8, 8, 8), 'P': (7, 7, 7), 'CHS': (5, 54, 54),
                  'T': (3, 3, 3), 'X': (5, 54, 54), 'OJ': (0, 1, '_'),
                  'OI': (0, 1, '_'), 'SCHTSH': (2, 4, 4), 'OY': (0, 1, '_'),
                  'Y': (1, '_', '_'), 'TSH': (4, 4, 4), 'ZDZ': (2, 4, 4),
                  'TSZ': (4, 4, 4), 'SHT': (2, 43, 43), 'SCHTSCH': (2, 4, 4),
                  'TTSZ': (4, 4, 4), 'TTZ': (4, 4, 4), 'SCH': (4, 4, 4),
                  'TTS': (4, 4, 4), 'SZD': (2, 43, 43), 'AI': (0, 1, '_'),
                  'PF': (7, 7, 7), 'TCH': (4, 4, 4), 'PH': (7, 7, 7),
                  'TTCH': (4, 4, 4), 'SZT': (2, 43, 43), 'ZDZH': (2, 4, 4),
                  'EI': (0, 1, '_'), 'G': (5, 5, 5), 'EJ': (0, 1, '_'),
                  'ZD': (2, 43, 43), 'IU': (1, '_', '_'), 'K': (5, 5, 5),
                  'O': (0, '_', '_'), 'SHTCH': (2, 4, 4), 'S': (4, 4, 4),
                  'TRZ': (4, 4, 4), 'SHD': (2, 43, 43), 'DSH': (4, 4, 4),
                  'CSZ': (4, 4, 4), 'EU': (1, 1, '_'), 'TRS': (4, 4, 4),
                  'ZS': (4, 4, 4), 'STRZ': (2, 4, 4), 'UY': (0, 1, '_'),
                  'STRS': (2, 4, 4), 'CZS': (4, 4, 4),
                  'MN': ('6_6', '6_6', '6_6'), 'UI': (0, 1, '_'),
                  'UJ': (0, 1, '_'), 'UE': (0, '_', '_'), 'EY': (0, 1, '_'),
                  'W': (7, 7, 7), 'IA': (1, '_', '_'), 'FB': (7, 7, 7),
                  'STSCH': (2, 4, 4), 'SCHT': (2, 43, 43),
                  'NM': ('6_6', '6_6', '6_6'), 'SCHD': (2, 43, 43),
                  'B': (7, 7, 7), 'DSZ': (4, 4, 4), 'F': (7, 7, 7),
                  'N': (6, 6, 6), 'CZ': (4, 4, 4), 'R': (9, 9, 9),
                  'U': (0, '_', '_'), 'V': (7, 7, 7), 'CS': (4, 4, 4),
                  'Z': (4, 4, 4), 'SZ': (4, 4, 4), 'TSCH': (4, 4, 4),
                  'KH': (5, 5, 5), 'ST': (2, 43, 43), 'KS': (5, 54, 54),
                  'SH': (4, 4, 4), 'SC': (2, 4, 4), 'SD': (2, 43, 43),
                  'DZ': (4, 4, 4), 'ZHD': (2, 43, 43), 'DT': (3, 3, 3),
                  'ZSH': (4, 4, 4), 'DS': (4, 4, 4), 'TZ': (4, 4, 4),
                  'TS': (4, 4, 4), 'TH': (3, 3, 3), 'TC': (4, 4, 4),
                  'A': (0, '_', '_'), 'E': (0, '_', '_'), 'I': (0, '_', '_'),
                  'AJ': (0, 1, '_'), 'M': (6, 6, 6), 'Q': (5, 5, 5),
                  'AU': (0, 7, '_'), 'IO': (1, '_', '_'), 'AY': (0, 1, '_'),
                  'IE': (1, '_', '_'), 'ZSCH': (4, 4, 4),
                  'CH': ((5, 4), (5, 4), (5, 4)),
                  'CK': ((5, 45), (5, 45), (5, 45)),
                  'C': ((5, 4), (5, 4), (5, 4)),
                  'J': ((1, 4), ('_', 4), ('_', 4)),
                  'RZ': ((94, 4), (94, 4), (94, 4)),
                  'RS': ((94, 4), (94, 4), (94, 4))}

    # For each initial letter, the sequences to try, in the order they are
    # tested; every group ends with the bare letter, so a match always exists
    _dms_order = {'A': ('AI', 'AJ', 'AU', 'AY', 'A'),
                  'B': ('B',),
                  'C': ('CHS', 'CSZ', 'CZS', 'CH', 'CK', 'CS', 'CZ', 'C'),
                  'D': ('DRS', 'DRZ', 'DSH', 'DSZ', 'DZH', 'DZS', 'DS', 'DT',
                        'DZ', 'D'),
                  'E': ('EI', 'EJ', 'EU', 'EY', 'E'),
                  'F': ('FB', 'F'),
                  'G': ('G',),
                  'H': ('H',),
                  'I': ('IA', 'IE', 'IO', 'IU', 'I'),
                  'J': ('J',),
                  'K': ('KH', 'KS', 'K'),
                  'L': ('L',),
                  'M': ('MN', 'M'),
                  'N': ('NM', 'N'),
                  'O': ('OI', 'OJ', 'OY', 'O'),
                  'P': ('PF', 'PH', 'P'),
                  'Q': ('Q',),
                  'R': ('RS', 'RZ', 'R'),
                  'S': ('SCHTSCH', 'SCHTCH', 'SCHTSH', 'SHTCH', 'SHTSH',
                        'STSCH', 'SCHD', 'SCHT', 'SHCH', 'STCH', 'STRS',
                        'STRZ', 'STSH', 'SZCS', 'SZCZ', 'SCH', 'SHD', 'SHT',
                        'SZD', 'SZT', 'SC', 'SD', 'SH', 'ST', 'SZ', 'S'),
                  'T': ('TTSCH', 'TSCH', 'TTCH', 'TTSZ', 'TCH', 'THS', 'TRS',
                        'TRZ', 'TSH', 'TSZ', 'TTS', 'TTZ', 'TZS', 'TC', 'TH',
                        'TS', 'TZ', 'T'),
                  'U': ('UE', 'UI', 'UJ', 'UY', 'U'),
                  'V': ('V',),
                  'W': ('W',),
                  'X': ('X',),
                  'Y': ('Y',),
                  'Z': ('ZHDZH', 'ZDZH', 'ZSCH', 'ZDZ', 'ZHD', 'ZSH', 'ZD',
                        'ZH', 'ZS', 'Z')}

    _vowels = frozenset('AEIJOUY')
    letters = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    # Require a maxlength of at least 6 and not more than 64
    maxlength = 64 if maxlength is None else min(max(6, maxlength), 64)

    # uppercase, normalize, decompose, and filter non-A-Z
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in letters)

    # Nothing to convert, return base case
    if not word:
        return {'0' * maxlength} if zero_pad else {'0'}

    # Reverse word if computing Reverse Soundex
    if reverse:
        word = word[::-1]

    codes = ['']  # every partial code built so far
    length = len(word)
    pos = 0
    while pos < length:
        # Try the candidate sequences for the current letter in table order
        for sstr in _dms_order[word[pos]]:  # pragma: no branch
            if not word.startswith(sstr, pos):
                continue

            following = pos + len(sstr)
            variants = _dms_table[sstr]

            # Having retrieved the code triple, pick the positional variant
            # (first, pre-vocalic, elsewhere)
            if pos == 0:
                dm_val = variants[0]
            elif following < length and word[following] in _vowels:
                dm_val = variants[1]
            else:
                dm_val = variants[2]

            # Extend the code strings; a tuple means two alternatives, so
            # every partial code is continued once with each of them
            if isinstance(dm_val, tuple):
                codes = ([code + text_type(dm_val[0]) for code in codes] +
                         [code + text_type(dm_val[1]) for code in codes])
            else:
                codes = [code + text_type(dm_val) for code in codes]

            pos = following
            break

    # Filter out double letters first, then the _ placeholders
    cleaned = [''.join(c for c in _delete_consecutive_repeats(code)
                       if c != '_')
               for code in codes]

    # Trim codes and return set
    if zero_pad:
        padding = '0' * maxlength
        return set((code + padding)[:maxlength] for code in cleaned)
    return set(code[:maxlength] for code in cleaned)
502
503
504
def koelner_phonetik(word):
    """Return the Kölner Phonetik (numeric output) code for a word.

    Based on the algorithm described at
    https://de.wikipedia.org/wiki/Kölner_Phonetik

    While the output code is numeric, it is still a str because 0s can lead
    the code.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as a numeric string
    :rtype: str

    >>> koelner_phonetik('Christopher')
    '478237'
    >>> koelner_phonetik('Niall')
    '65'
    >>> koelner_phonetik('Smith')
    '862'
    >>> koelner_phonetik('Schmidt')
    '862'
    >>> koelner_phonetik('Müller')
    '657'
    >>> koelner_phonetik('Zimmermann')
    '86766'
    """
    # pylint: disable=too-many-branches
    def _after(word, i, letters):
        """Return True if word[i] follows one of the supplied letters."""
        return i > 0 and word[i-1] in letters

    def _before(word, i, letters):
        """Return True if word[i] precedes one of the supplied letters."""
        return i+1 < len(word) and word[i+1] in letters

    _vowels = frozenset('AEIJYOU')
    letters = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    # Letters whose code does not depend on their neighbours
    fixed_codes = {'B': '1',
                   'F': '3', 'V': '3', 'W': '3',
                   'G': '4', 'K': '4', 'Q': '4',
                   'L': '5',
                   'M': '6', 'N': '6',
                   'R': '7',
                   'S': '8', 'Z': '8'}

    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    # NOTE(review): NFKD above presumably has already split the umlauts
    # into base letter + combining mark, so these may never match -- confirm
    word = word.replace('Ä', 'AE')
    word = word.replace('Ö', 'OE')
    word = word.replace('Ü', 'UE')
    word = ''.join(c for c in word if c in letters)

    # Nothing to convert, return base case
    if not word:
        return ''

    digits = []
    for i, char in enumerate(word):
        if char in _vowels:
            digits.append('0')
        elif char in fixed_codes:
            digits.append(fixed_codes[char])
        elif char == 'P':
            digits.append('3' if _before(word, i, 'H') else '1')
        elif char in 'DT':
            digits.append('8' if _before(word, i, 'CSZ') else '2')
        elif char == 'C':
            if _after(word, i, 'SZ'):
                digits.append('8')
            elif _before(word, i, 'AHKLOQRUX' if i == 0 else 'AHKOQUX'):
                # the word-initial C accepts two more followers (L and R)
                digits.append('4')
            else:
                digits.append('8')
        elif char == 'X':
            digits.append('8' if _after(word, i, 'CKQ') else '48')
        # the only remaining letter, H, contributes no code

    sdx = _delete_consecutive_repeats(''.join(digits))

    # zeros are only kept in the leading position
    if sdx:
        sdx = sdx[0] + sdx[1:].replace('0', '')

    return sdx
611
612
613
def koelner_phonetik_num_to_alpha(num):
    """Convert a Kölner Phonetik code from numeric to alphabetic.

    :param str num: a numeric Kölner Phonetik representation (it is converted
        to text first, so an int is accepted as well)
    :returns: an alphabetic representation of the same word; characters other
        than the digits 0-8 are ignored
    :rtype: str

    >>> koelner_phonetik_num_to_alpha(862)
    'SNT'
    >>> koelner_phonetik_num_to_alpha(657)
    'NLR'
    >>> koelner_phonetik_num_to_alpha(86766)
    'SNRNN'
    """
    digits = '012345678'
    digit_to_letter = dict(zip((ord(digit) for digit in digits), 'APTFKLNRS'))
    valid = frozenset(digits)

    # keep only the digits that have a letter assigned, then map them
    filtered = ''.join(c for c in text_type(num) if c in valid)
    return filtered.translate(digit_to_letter)
631
632
633
def koelner_phonetik_alpha(word):
    """Return the Kölner Phonetik (alphabetic output) code for a word.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as an alphabetic string
    :rtype: str

    >>> koelner_phonetik_alpha('Smith')
    'SNT'
    >>> koelner_phonetik_alpha('Schmidt')
    'SNT'
    >>> koelner_phonetik_alpha('Müller')
    'NLR'
    >>> koelner_phonetik_alpha('Zimmermann')
    'SNRNN'
    """
    # compute the numeric code, then map its digits to letters
    numeric_code = koelner_phonetik(word)
    return koelner_phonetik_num_to_alpha(numeric_code)
650
651
652
def nysiis(word, maxlength=6, modified=False):
    """Return the NYSIIS code for a word.

    A description of the New York State Identification and Intelligence System
    algorithm can be found at
    https://en.wikipedia.org/wiki/New_York_State_Identification_and_Intelligence_System

    The modified version of this algorithm is described in Appendix B of
    Lynch, Billy T. and William L. Arends. `Selection of a Surname Coding
    Procedure for the SRS Record Linkage System.` Statistical Reporting
    Service, U.S. Department of Agriculture, Washington, D.C. February 1977.
    https://naldc.nal.usda.gov/download/27833/PDF

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 6) of the code to
        return; values below 6 are raised to 6, and a falsy or infinite value
        disables truncation
    :param bool modified: indicates whether to use USDA modified NYSIIS
    :returns: the NYSIIS value; the empty string if the word has no
        alphabetic characters (or nothing is left after the modified
        algorithm drops a final S/Z); the literal 'ERROR' if, in modified
        mode, the word ends in JR or SR
    :rtype: str

    >>> nysiis('Christopher')
    'CRASTA'
    >>> nysiis('Niall')
    'NAL'
    >>> nysiis('Smith')
    'SNAT'
    >>> nysiis('Schmidt')
    'SNAD'

    >>> nysiis('Christopher', maxlength=_INFINITY)
    'CRASTAFAR'

    >>> nysiis('Christopher', maxlength=8, modified=True)
    'CRASTAFA'
    >>> nysiis('Niall', maxlength=8, modified=True)
    'NAL'
    >>> nysiis('Smith', maxlength=8, modified=True)
    'SNAT'
    >>> nysiis('Schmidt', maxlength=8, modified=True)
    'SNAD'
    """
    # Require a maxlength of at least 6
    if maxlength:
        maxlength = max(6, maxlength)

    _vowels = {'A', 'E', 'I', 'O', 'U'}

    word = ''.join(c for c in word.upper() if c.isalpha())
    word = word.replace('ß', 'SS')

    # exit early if there are no alphas
    if not word:
        return ''

    # Remembered so the modified algorithm can restore an initial vowel that
    # the prefix rules below rewrite to 'A' (only read when modified is set)
    original_first_char = word[0]

    # Rewrite the start of the word
    if word[:3] == 'MAC':
        word = 'MCC'+word[3:]
    elif word[:2] == 'KN':
        word = 'NN'+word[2:]
    elif word[:1] == 'K':
        word = 'C'+word[1:]
    elif word[:2] in {'PH', 'PF'}:
        word = 'FF'+word[2:]
    elif word[:3] == 'SCH':
        word = 'SSS'+word[3:]
    elif modified:
        if word[:2] == 'WR':
            word = 'RR'+word[2:]
        elif word[:2] == 'RH':
            word = 'RR'+word[2:]
        elif word[:2] == 'DG':
            word = 'GG'+word[2:]
        elif word[:1] in _vowels:
            word = 'A'+word[1:]

    if modified and word[-1] in {'S', 'Z'}:
        word = word[:-1]
        # a one-letter word consisting of S or Z is now empty; stop here
        # rather than indexing into an empty string below
        if not word:
            return ''

    # Rewrite the end of the word
    if word[-2:] == 'EE' or word[-2:] == 'IE' or (modified and
                                                  word[-2:] == 'YE'):
        word = word[:-2]+'Y'
    elif word[-2:] in {'DT', 'RT', 'RD'}:
        word = word[:-2]+'D'
    elif word[-2:] in {'NT', 'ND'}:
        word = word[:-2]+('N' if modified else 'D')
    elif modified:
        if word[-2:] == 'IX':
            word = word[:-2]+'ICK'
        elif word[-2:] == 'EX':
            word = word[:-2]+'ECK'
        elif word[-2:] in {'JR', 'SR'}:
            return 'ERROR'  # TODO: decide how best to return an error

    key = word[0]

    # Walk the rest of the word, rewriting it in place; skip counts the
    # characters already handled by a multi-letter replacement
    skip = 0
    for i in range(1, len(word)):
        if i >= len(word):
            # the word shrank during an earlier replacement
            continue
        elif skip:
            skip -= 1
            continue
        elif word[i:i+2] == 'EV':
            word = word[:i] + 'AF' + word[i+2:]
            skip = 1
        elif word[i] in _vowels:
            word = word[:i] + 'A' + word[i+1:]
        elif modified and i != len(word)-1 and word[i] == 'Y':
            word = word[:i] + 'A' + word[i+1:]
        elif word[i] == 'Q':
            word = word[:i] + 'G' + word[i+1:]
        elif word[i] == 'Z':
            word = word[:i] + 'S' + word[i+1:]
        elif word[i] == 'M':
            word = word[:i] + 'N' + word[i+1:]
        elif word[i:i+2] == 'KN':
            word = word[:i] + 'N' + word[i+2:]
        elif word[i] == 'K':
            word = word[:i] + 'C' + word[i+1:]
        elif modified and i == len(word)-3 and word[i:i+3] == 'SCH':
            word = word[:i] + 'SSA'
            skip = 2
        elif word[i:i+3] == 'SCH':
            word = word[:i] + 'SSS' + word[i+3:]
            skip = 2
        elif modified and i == len(word)-2 and word[i:i+2] == 'SH':
            word = word[:i] + 'SA'
            skip = 1
        elif word[i:i+2] == 'SH':
            word = word[:i] + 'SS' + word[i+2:]
            skip = 1
        elif word[i:i+2] == 'PH':
            word = word[:i] + 'FF' + word[i+2:]
            skip = 1
        elif modified and word[i:i+3] == 'GHT':
            word = word[:i] + 'TTT' + word[i+3:]
            skip = 2
        elif modified and word[i:i+2] == 'DG':
            word = word[:i] + 'GG' + word[i+2:]
            skip = 1
        elif modified and word[i:i+2] == 'WR':
            word = word[:i] + 'RR' + word[i+2:]
            skip = 1
        elif word[i] == 'H' and (word[i-1] not in _vowels or
                                 word[i+1:i+2] not in _vowels):
            word = word[:i] + word[i-1] + word[i+1:]
        elif word[i] == 'W' and word[i-1] in _vowels:
            word = word[:i] + word[i-1] + word[i+1:]

        if word[i:i+skip+1] != key[-1:]:
            key += word[i:i+skip+1]

    key = _delete_consecutive_repeats(key)

    # Final clean-up of the key. Slices are used instead of indexing because
    # removing a trailing S can leave the key empty.
    if key[-1:] == 'S':
        key = key[:-1]
    if key[-2:] == 'AY':
        key = key[:-2] + 'Y'
    if key[-1:] == 'A':
        key = key[:-1]
    if modified and key[:1] == 'A':
        key = original_first_char + key[1:]

    if maxlength and maxlength < _INFINITY:
        key = key[:maxlength]

    return key
820
821
822
def mra(word):
    """Return the MRA personal numeric identifier (PNI) for a word.

    The Western Airlines Surname Match Rating Algorithm is described on
    page 18 of
    https://archive.org/details/accessingindivid00moor

    The word is upper-cased, every vowel (A, E, I, O, U) after the first
    character is dropped, consecutive repeats are collapsed, and a result
    longer than six characters is cut down to its first three plus its last
    three characters.

    :param str word: the word to transform
    :returns: the MRA PNI (a falsy input is returned unchanged)
    :rtype: str

    >>> mra('Christopher')
    'CHRPHR'
    >>> mra('Niall')
    'NL'
    >>> mra('Smith')
    'SMTH'
    >>> mra('Schmidt')
    'SCHMDT'
    """
    if not word:
        return word

    vowels = frozenset('AEIOU')
    upper = word.upper().replace('ß', 'SS')

    # The leading character is always kept, even when it is a vowel.
    initial, rest = upper[0], upper[1:]
    kept = [char for char in rest if char not in vowels]
    code = _delete_consecutive_repeats(initial + ''.join(kept))

    # Overlong codes keep only their first three and last three characters.
    if len(code) > 6:
        return code[:3] + code[-3:]
    return code
852
853
854
def metaphone(word, maxlength=_INFINITY):
855
    """Return the Metaphone code for a word.
856
857
    Based on Lawrence Philips' Pick BASIC code from 1990:
858
    http://aspell.net/metaphone/metaphone.basic
859
    This incorporates some corrections to the above code, particularly
860
    some of those suggested by Michael Kuhn in:
861
    http://aspell.net/metaphone/metaphone-kuhn.txt
862
863
    :param str word: the word to transform
864
    :param int maxlength: the maximum length of the returned Metaphone code
865
        (defaults to unlimited, but in Philips' original implementation
866
        this was 4)
867
    :returns: the Metaphone value
868
    :rtype: str
869
870
871
    >>> metaphone('Christopher')
872
    'KRSTFR'
873
    >>> metaphone('Niall')
874
    'NL'
875
    >>> metaphone('Smith')
876
    'SM0'
877
    >>> metaphone('Schmidt')
878
    'SKMTT'
879
    """
880
    # pylint: disable=too-many-branches
881
    _vowels = frozenset('AEIOU')
882
    _frontv = frozenset('EIY')
883
    _varson = frozenset('CSPTG')
884
885
    # Require a maxlength of at least 4
886
    if maxlength is not None:
887
        maxlength = max(4, maxlength)
888
    else:
889
        maxlength = 64
890
891
    # As in variable sound--those modified by adding an "h"
892
    ename = ''.join(c for c in word.upper() if c.isalnum())
893
    ename = ename.replace('ß', 'SS')
894
895
    # Delete nonalphanumeric characters and make all caps
896
    if not ename:
897
        return ''
898
    if ename[0:2] in frozenset(['PN', 'AE', 'KN', 'GN', 'WR']):
899
        ename = ename[1:]
900
    elif ename[0] == 'X':
901
        ename = 'S' + ename[1:]
902
    elif ename[0:2] == 'WH':
903
        ename = 'W' + ename[2:]
904
905
    # Convert to metaph
906
    elen = len(ename)-1
907
    metaph = ''
908
    for i in range(len(ename)):
0 ignored issues
show
unused-code introduced by
Consider using enumerate instead of iterating with range and len
Loading history...
909
        if len(metaph) >= maxlength:
910
            break
911
        if ((ename[i] not in frozenset('GT') and
912
             i > 0 and ename[i-1] == ename[i])):
913
            continue
914
915
        if ename[i] in _vowels and i == 0:
916
            metaph = ename[i]
917
918
        elif ename[i] == 'B':
919
            if i != elen or ename[i-1] != 'M':
920
                metaph += ename[i]
921
922
        elif ename[i] == 'C':
923
            if not (i > 0 and ename[i-1] == 'S' and ename[i+1:i+2] in _frontv):
924
                if ename[i+1:i+3] == 'IA':
925
                    metaph += 'X'
926
                elif ename[i+1:i+2] in _frontv:
927
                    metaph += 'S'
928
                elif i > 0 and ename[i-1:i+2] == 'SCH':
929
                    metaph += 'K'
930
                elif ename[i+1:i+2] == 'H':
931
                    if i == 0 and i+1 < elen and ename[i+2:i+3] not in _vowels:
932
                        metaph += 'K'
933
                    else:
934
                        metaph += 'X'
935
                else:
936
                    metaph += 'K'
937
938
        elif ename[i] == 'D':
939
            if ename[i+1:i+2] == 'G' and ename[i+2:i+3] in _frontv:
940
                metaph += 'J'
941
            else:
942
                metaph += 'T'
943
944
        elif ename[i] == 'G':
945
            if ename[i+1:i+2] == 'H' and not (i+1 == elen or
946
                                              ename[i+2:i+3] not in _vowels):
947
                continue
948
            elif i > 0 and ((i+1 == elen and ename[i+1] == 'N') or
949
                            (i+3 == elen and ename[i+1:i+4] == 'NED')):
950
                continue
951
            elif (i-1 > 0 and i+1 <= elen and ename[i-1] == 'D' and
952
                  ename[i+1] in _frontv):
953
                continue
954
            elif ename[i+1:i+2] == 'G':
955
                continue
956
            elif ename[i+1:i+2] in _frontv:
957
                if i == 0 or ename[i-1] != 'G':
958
                    metaph += 'J'
959
                else:
960
                    metaph += 'K'
961
            else:
962
                metaph += 'K'
963
964
        elif ename[i] == 'H':
965
            if ((i > 0 and ename[i-1] in _vowels and
966
                 ename[i+1:i+2] not in _vowels)):
967
                continue
968
            elif i > 0 and ename[i-1] in _varson:
969
                continue
970
            else:
971
                metaph += 'H'
972
973
        elif ename[i] in frozenset('FJLMNR'):
974
            metaph += ename[i]
975
976
        elif ename[i] == 'K':
977
            if i > 0 and ename[i-1] == 'C':
978
                continue
979
            else:
980
                metaph += 'K'
981
982
        elif ename[i] == 'P':
983
            if ename[i+1:i+2] == 'H':
984
                metaph += 'F'
985
            else:
986
                metaph += 'P'
987
988
        elif ename[i] == 'Q':
989
            metaph += 'K'
990
991
        elif ename[i] == 'S':
992
            if ((i > 0 and i+2 <= elen and ename[i+1] == 'I' and
993
                 ename[i+2] in 'OA')):
994
                metaph += 'X'
995
            elif ename[i+1:i+2] == 'H':
996
                metaph += 'X'
997
            else:
998
                metaph += 'S'
999
1000
        elif ename[i] == 'T':
1001
            if ((i > 0 and i+2 <= elen and ename[i+1] == 'I' and
1002
                 ename[i+2] in frozenset('OA'))):
1003
                metaph += 'X'
1004
            elif ename[i+1:i+2] == 'H':
1005
                metaph += '0'
1006
            elif ename[i+1:i+3] != 'CH':
1007
                if ename[i-1:i] != 'T':
1008
                    metaph += 'T'
1009
1010
        elif ename[i] == 'V':
1011
            metaph += 'F'
1012
1013
        elif ename[i] in 'WY':
1014
            if ename[i+1:i+2] in _vowels:
1015
                metaph += ename[i]
1016
1017
        elif ename[i] == 'X':
1018
            metaph += 'KS'
1019
1020
        elif ename[i] == 'Z':
1021
            metaph += 'S'
1022
1023
    return metaph
1024
1025
1026
def double_metaphone(word, maxlength=_INFINITY):
1027
    """Return the Double Metaphone code for a word.
1028
1029
    Based on Lawrence Philips' (Visual) C++ code from 1999:
1030
    http://aspell.net/metaphone/dmetaph.cpp
1031
1032
    :param word: the word to transform
1033
    :param maxlength: the maximum length of the returned Double Metaphone codes
1034
        (defaults to unlimited, but in Philips' original implementation this
1035
        was 4)
1036
    :returns: the Double Metaphone value(s)
1037
    :rtype: tuple
1038
1039
    >>> double_metaphone('Christopher')
1040
    ('KRSTFR', '')
1041
    >>> double_metaphone('Niall')
1042
    ('NL', '')
1043
    >>> double_metaphone('Smith')
1044
    ('SM0', 'XMT')
1045
    >>> double_metaphone('Schmidt')
1046
    ('XMT', 'SMT')
1047
    """
1048
    # pylint: disable=too-many-branches
1049
    # Require a maxlength of at least 4
1050
    if maxlength is not None:
1051
        maxlength = max(4, maxlength)
1052
    else:
1053
        maxlength = 64
1054
1055
    primary = ''
1056
    secondary = ''
1057
1058
    def _slavo_germanic():
1059
        """Return True if the word appears to be Slavic or Germanic."""
1060
        if 'W' in word or 'K' in word or 'CZ' in word:
1061
            return True
1062
        return False
1063
1064
    def _metaph_add(pri, sec=''):
1065
        """Return a new metaphone tuple with the supplied elements."""
1066
        newpri = primary
1067
        newsec = secondary
1068
        if pri:
1069
            newpri += pri
1070
        if sec:
1071
            if sec != ' ':
1072
                newsec += sec
1073
        else:
1074
            newsec += pri
1075
        return (newpri, newsec)
1076
1077
    def _is_vowel(pos):
1078
        """Return True if the character at word[pos] is a vowel."""
1079
        if pos >= 0 and word[pos] in frozenset('AEIOUY'):
1080
            return True
1081
        return False
1082
1083
    def _get_at(pos):
1084
        """Return the character at word[pos]."""
1085
        return word[pos]
1086
1087
    def _string_at(pos, slen, substrings):
1088
        """Return True if word[pos:pos+slen] is in substrings."""
1089
        if pos < 0:
1090
            return False
1091
        return word[pos:pos+slen] in substrings
1092
1093
    current = 0
1094
    length = len(word)
1095
    if length < 1:
1096
        return ('', '')
1097
    last = length - 1
1098
1099
    word = word.upper()
1100
    word = word.replace('ß', 'SS')
1101
1102
    # Pad the original string so that we can index beyond the edge of the world
1103
    word += '     '
1104
1105
    # Skip these when at start of word
1106
    if word[0:2] in frozenset(['GN', 'KN', 'PN', 'WR', 'PS']):
1107
        current += 1
1108
1109
    # Initial 'X' is pronounced 'Z' e.g. 'Xavier'
1110
    if _get_at(0) == 'X':
1111
        (primary, secondary) = _metaph_add('S')  # 'Z' maps to 'S'
1112
        current += 1
1113
1114
    # Main loop
1115
    while True:
0 ignored issues
show
unused-code introduced by
Too many nested blocks (6/5)
Loading history...
1116
        if current >= length:
1117
            break
1118
1119
        if _get_at(current) in frozenset('AEIOUY'):
1120
            if current == 0:
1121
                # All init vowels now map to 'A'
1122
                (primary, secondary) = _metaph_add('A')
1123
            current += 1
1124
            continue
1125
1126
        elif _get_at(current) == 'B':
1127
            # "-mb", e.g", "dumb", already skipped over...
1128
            (primary, secondary) = _metaph_add('P')
1129
            if _get_at(current + 1) == 'B':
1130
                current += 2
1131
            else:
1132
                current += 1
1133
            continue
1134
1135
        elif _get_at(current) == 'Ç':
1136
            (primary, secondary) = _metaph_add('S')
1137
            current += 1
1138
            continue
1139
1140
        elif _get_at(current) == 'C':
1141
            # Various Germanic
1142
            if (current > 1 and not _is_vowel(current - 2) and
0 ignored issues
show
best-practice introduced by
Too many boolean expressions in if statement (6/5)
Loading history...
1143
                    _string_at((current - 1), 3, ['ACH']) and
1144
                    ((_get_at(current + 2) != 'I') and
1145
                     ((_get_at(current + 2) != 'E') or
1146
                      _string_at((current - 2), 6,
1147
                                 frozenset(['BACHER', 'MACHER']))))):
1148
                (primary, secondary) = _metaph_add('K')
1149
                current += 2
1150
                continue
1151
1152
            # Special case 'caesar'
1153
            elif current == 0 and _string_at(current, 6, ['CAESAR']):
1154
                (primary, secondary) = _metaph_add('S')
1155
                current += 2
1156
                continue
1157
1158
            # Italian 'chianti'
1159
            elif _string_at(current, 4, ['CHIA']):
1160
                (primary, secondary) = _metaph_add('K')
1161
                current += 2
1162
                continue
1163
1164
            elif _string_at(current, 2, ['CH']):
1165
                # Find 'Michael'
1166
                if current > 0 and _string_at(current, 4, ['CHAE']):
1167
                    (primary, secondary) = _metaph_add('K', 'X')
1168
                    current += 2
1169
                    continue
1170
1171
                # Greek roots e.g. 'chemistry', 'chorus'
1172
                elif (current == 0 and
1173
                      (_string_at((current + 1), 5,
1174
                                  frozenset(['HARAC', 'HARIS'])) or
1175
                       _string_at((current + 1), 3,
1176
                                  frozenset(['HOR', 'HYM', 'HIA', 'HEM']))) and
1177
                      not _string_at(0, 5, ['CHORE'])):
1178
                    (primary, secondary) = _metaph_add('K')
1179
                    current += 2
1180
                    continue
1181
1182
                # Germanic, Greek, or otherwise 'ch' for 'kh' sound
1183
                elif ((_string_at(0, 4, frozenset(['VAN ', 'VON '])) or
0 ignored issues
show
best-practice introduced by
Too many boolean expressions in if statement (7/5)
Loading history...
1184
                       _string_at(0, 3, ['SCH'])) or
1185
                      # 'architect but not 'arch', 'orchestra', 'orchid'
1186
                      _string_at((current - 2), 6,
1187
                                 frozenset(['ORCHES', 'ARCHIT', 'ORCHID'])) or
1188
                      _string_at((current + 2), 1, frozenset(['T', 'S'])) or
1189
                      ((_string_at((current - 1), 1,
1190
                                   frozenset(['A', 'O', 'U', 'E'])) or
1191
                        (current == 0)) and
1192
                       # e.g., 'wachtler', 'wechsler', but not 'tichner'
1193
                       _string_at((current + 2), 1,
1194
                                  frozenset(['L', 'R', 'N', 'M', 'B', 'H',
1195
                                             'F', 'V', 'W', ' '])))):
1196
                    (primary, secondary) = _metaph_add('K')
1197
1198
                else:
1199
                    if current > 0:
1200
                        if _string_at(0, 2, ['MC']):
1201
                            # e.g., "McHugh"
1202
                            (primary, secondary) = _metaph_add('K')
1203
                        else:
1204
                            (primary, secondary) = _metaph_add('X', 'K')
1205
                    else:
1206
                        (primary, secondary) = _metaph_add('X')
1207
1208
                current += 2
1209
                continue
1210
1211
            # e.g, 'czerny'
1212
            elif (_string_at(current, 2, ['CZ']) and
1213
                  not _string_at((current - 2), 4, ['WICZ'])):
1214
                (primary, secondary) = _metaph_add('S', 'X')
1215
                current += 2
1216
                continue
1217
1218
            # e.g., 'focaccia'
1219
            elif _string_at((current + 1), 3, ['CIA']):
1220
                (primary, secondary) = _metaph_add('X')
1221
                current += 3
1222
1223
            # double 'C', but not if e.g. 'McClellan'
1224
            elif (_string_at(current, 2, ['CC']) and
1225
                  not ((current == 1) and (_get_at(0) == 'M'))):
1226
                # 'bellocchio' but not 'bacchus'
1227
                if ((_string_at((current + 2), 1,
1228
                                frozenset(['I', 'E', 'H'])) and
1229
                     not _string_at((current + 2), 2, ['HU']))):
1230
                    # 'accident', 'accede' 'succeed'
1231
                    if ((((current == 1) and _get_at(current - 1) == 'A') or
1232
                         _string_at((current - 1), 5,
1233
                                    frozenset(['UCCEE', 'UCCES'])))):
1234
                        (primary, secondary) = _metaph_add('KS')
1235
                    # 'bacci', 'bertucci', other italian
1236
                    else:
1237
                        (primary, secondary) = _metaph_add('X')
1238
                    current += 3
1239
                    continue
1240
                else:  # Pierce's rule
1241
                    (primary, secondary) = _metaph_add('K')
1242
                    current += 2
1243
                    continue
1244
1245
            elif _string_at(current, 2, frozenset(['CK', 'CG', 'CQ'])):
1246
                (primary, secondary) = _metaph_add('K')
1247
                current += 2
1248
                continue
1249
1250
            elif _string_at(current, 2, frozenset(['CI', 'CE', 'CY'])):
1251
                # Italian vs. English
1252
                if _string_at(current, 3, frozenset(['CIO', 'CIE', 'CIA'])):
1253
                    (primary, secondary) = _metaph_add('S', 'X')
1254
                else:
1255
                    (primary, secondary) = _metaph_add('S')
1256
                current += 2
1257
                continue
1258
1259
            # else
1260
            else:
1261
                (primary, secondary) = _metaph_add('K')
1262
1263
                # name sent in 'mac caffrey', 'mac gregor
1264
                if _string_at((current + 1), 2, frozenset([' C', ' Q', ' G'])):
1265
                    current += 3
1266
                elif (_string_at((current + 1), 1,
1267
                                 frozenset(['C', 'K', 'Q'])) and
1268
                      not _string_at((current + 1), 2,
1269
                                     frozenset(['CE', 'CI']))):
1270
                    current += 2
1271
                else:
1272
                    current += 1
1273
                continue
1274
1275
        elif _get_at(current) == 'D':
1276
            if _string_at(current, 2, ['DG']):
1277
                if _string_at((current + 2), 1, frozenset(['I', 'E', 'Y'])):
1278
                    # e.g. 'edge'
1279
                    (primary, secondary) = _metaph_add('J')
1280
                    current += 3
1281
                    continue
1282
                else:
1283
                    # e.g. 'edgar'
1284
                    (primary, secondary) = _metaph_add('TK')
1285
                    current += 2
1286
                    continue
1287
1288
            elif _string_at(current, 2, frozenset(['DT', 'DD'])):
1289
                (primary, secondary) = _metaph_add('T')
1290
                current += 2
1291
                continue
1292
1293
            # else
1294
            else:
1295
                (primary, secondary) = _metaph_add('T')
1296
                current += 1
1297
                continue
1298
1299
        elif _get_at(current) == 'F':
1300
            if _get_at(current + 1) == 'F':
1301
                current += 2
1302
            else:
1303
                current += 1
1304
            (primary, secondary) = _metaph_add('F')
1305
            continue
1306
1307
        elif _get_at(current) == 'G':
1308
            if _get_at(current + 1) == 'H':
1309
                if (current > 0) and not _is_vowel(current - 1):
1310
                    (primary, secondary) = _metaph_add('K')
1311
                    current += 2
1312
                    continue
1313
1314
                # 'ghislane', ghiradelli
1315
                elif current == 0:
1316
                    if _get_at(current + 2) == 'I':
1317
                        (primary, secondary) = _metaph_add('J')
1318
                    else:
1319
                        (primary, secondary) = _metaph_add('K')
1320
                    current += 2
1321
                    continue
1322
1323
                # Parker's rule (with some further refinements) - e.g., 'hugh'
1324
                elif (((current > 1) and
0 ignored issues
show
best-practice introduced by
Too many boolean expressions in if statement (6/5)
Loading history...
1325
                       _string_at((current - 2), 1,
1326
                                  frozenset(['B', 'H', 'D']))) or
1327
                      # e.g., 'bough'
1328
                      ((current > 2) and
1329
                       _string_at((current - 3), 1,
1330
                                  frozenset(['B', 'H', 'D']))) or
1331
                      # e.g., 'broughton'
1332
                      ((current > 3) and
1333
                       _string_at((current - 4), 1,
1334
                                  frozenset(['B', 'H'])))):
1335
                    current += 2
1336
                    continue
1337
                else:
1338
                    # e.g. 'laugh', 'McLaughlin', 'cough',
1339
                    #      'gough', 'rough', 'tough'
1340
                    if ((current > 2) and
1341
                            (_get_at(current - 1) == 'U') and
1342
                            (_string_at((current - 3), 1,
1343
                                        frozenset(['C', 'G', 'L', 'R',
1344
                                                   'T'])))):
1345
                        (primary, secondary) = _metaph_add('F')
1346
                    elif (current > 0) and _get_at(current - 1) != 'I':
1347
                        (primary, secondary) = _metaph_add('K')
1348
                    current += 2
1349
                    continue
1350
1351
            elif _get_at(current + 1) == 'N':
1352
                if (current == 1) and _is_vowel(0) and not _slavo_germanic():
1353
                    (primary, secondary) = _metaph_add('KN', 'N')
1354
                # not e.g. 'cagney'
1355
                elif (not _string_at((current + 2), 2, ['EY']) and
1356
                      (_get_at(current + 1) != 'Y') and
1357
                      not _slavo_germanic()):
1358
                    (primary, secondary) = _metaph_add('N', 'KN')
1359
                else:
1360
                    (primary, secondary) = _metaph_add('KN')
1361
                current += 2
1362
                continue
1363
1364
            # 'tagliaro'
1365
            elif (_string_at((current + 1), 2, ['LI']) and
1366
                  not _slavo_germanic()):
1367
                (primary, secondary) = _metaph_add('KL', 'L')
1368
                current += 2
1369
                continue
1370
1371
            # -ges-, -gep-, -gel-, -gie- at beginning
1372
            elif ((current == 0) and
1373
                  ((_get_at(current + 1) == 'Y') or
1374
                   _string_at((current + 1), 2,
1375
                              frozenset(['ES', 'EP', 'EB', 'EL', 'EY', 'IB',
1376
                                         'IL', 'IN', 'IE', 'EI', 'ER'])))):
1377
                (primary, secondary) = _metaph_add('K', 'J')
1378
                current += 2
1379
                continue
1380
1381
            #  -ger-,  -gy-
1382
            elif ((_string_at((current + 1), 2, ['ER']) or
1383
                   (_get_at(current + 1) == 'Y')) and not
1384
                  _string_at(0, 6,
1385
                             frozenset(['DANGER', 'RANGER', 'MANGER'])) and not
1386
                  _string_at((current - 1), 1, frozenset(['E', 'I'])) and not
1387
                  _string_at((current - 1), 3, frozenset(['RGY', 'OGY']))):
1388
                (primary, secondary) = _metaph_add('K', 'J')
1389
                current += 2
1390
                continue
1391
1392
            #  italian e.g, 'biaggi'
1393
            elif (_string_at((current + 1), 1, frozenset(['E', 'I', 'Y'])) or
1394
                  _string_at((current - 1), 4, frozenset(['AGGI', 'OGGI']))):
1395
                # obvious germanic
1396
                if (((_string_at(0, 4, frozenset(['VAN ', 'VON '])) or
1397
                      _string_at(0, 3, ['SCH'])) or
1398
                     _string_at((current + 1), 2, ['ET']))):
1399
                    (primary, secondary) = _metaph_add('K')
1400
                elif _string_at((current + 1), 4, ['IER ']):
1401
                    (primary, secondary) = _metaph_add('J')
1402
                else:
1403
                    (primary, secondary) = _metaph_add('J', 'K')
1404
                current += 2
1405
                continue
1406
1407
            else:
1408
                if _get_at(current + 1) == 'G':
1409
                    current += 2
1410
                else:
1411
                    current += 1
1412
                (primary, secondary) = _metaph_add('K')
1413
                continue
1414
1415
        elif _get_at(current) == 'H':
1416
            # only keep if first & before vowel or btw. 2 vowels
1417
            if ((((current == 0) or _is_vowel(current - 1)) and
1418
                 _is_vowel(current + 1))):
1419
                (primary, secondary) = _metaph_add('H')
1420
                current += 2
1421
            else:  # also takes care of 'HH'
1422
                current += 1
1423
            continue
1424
1425
        elif _get_at(current) == 'J':
1426
            # obvious spanish, 'jose', 'san jacinto'
1427
            if _string_at(current, 4, ['JOSE']) or _string_at(0, 4, ['SAN ']):
1428
                if ((((current == 0) and (_get_at(current + 4) == ' ')) or
1429
                     _string_at(0, 4, ['SAN ']))):
1430
                    (primary, secondary) = _metaph_add('H')
1431
                else:
1432
                    (primary, secondary) = _metaph_add('J', 'H')
1433
                current += 1
1434
                continue
1435
1436
            elif (current == 0) and not _string_at(current, 4, ['JOSE']):
1437
                # Yankelovich/Jankelowicz
1438
                (primary, secondary) = _metaph_add('J', 'A')
1439
            # Spanish pron. of e.g. 'bajador'
1440
            elif (_is_vowel(current - 1) and
1441
                  not _slavo_germanic() and
1442
                  ((_get_at(current + 1) == 'A') or
1443
                   (_get_at(current + 1) == 'O'))):
1444
                (primary, secondary) = _metaph_add('J', 'H')
1445
            elif current == last:
1446
                (primary, secondary) = _metaph_add('J', ' ')
1447
            elif (not _string_at((current + 1), 1,
1448
                                 frozenset(['L', 'T', 'K', 'S', 'N', 'M', 'B',
1449
                                            'Z'])) and
1450
                  not _string_at((current - 1), 1,
1451
                                 frozenset(['S', 'K', 'L']))):
1452
                (primary, secondary) = _metaph_add('J')
1453
1454
            if _get_at(current + 1) == 'J':  # it could happen!
1455
                current += 2
1456
            else:
1457
                current += 1
1458
            continue
1459
1460
        elif _get_at(current) == 'K':
1461
            if _get_at(current + 1) == 'K':
1462
                current += 2
1463
            else:
1464
                current += 1
1465
            (primary, secondary) = _metaph_add('K')
1466
            continue
1467
1468
        elif _get_at(current) == 'L':
1469
            if _get_at(current + 1) == 'L':
1470
                # Spanish e.g. 'cabrillo', 'gallegos'
1471
                if (((current == (length - 3)) and
1472
                     _string_at((current - 1), 4,
1473
                                frozenset(['ILLO', 'ILLA', 'ALLE']))) or
1474
                        ((_string_at((last - 1), 2, frozenset(['AS', 'OS'])) or
1475
                          _string_at(last, 1, frozenset(['A', 'O']))) and
1476
                         _string_at((current - 1), 4, ['ALLE']))):
1477
                    (primary, secondary) = _metaph_add('L', ' ')
1478
                    current += 2
1479
                    continue
1480
                current += 2
1481
            else:
1482
                current += 1
1483
            (primary, secondary) = _metaph_add('L')
1484
            continue
1485
1486
        elif _get_at(current) == 'M':
1487
            if (((_string_at((current - 1), 3, ['UMB']) and
1488
                  (((current + 1) == last) or
1489
                   _string_at((current + 2), 2, ['ER']))) or
1490
                 # 'dumb', 'thumb'
1491
                 (_get_at(current + 1) == 'M'))):
1492
                current += 2
1493
            else:
1494
                current += 1
1495
            (primary, secondary) = _metaph_add('M')
1496
            continue
1497
1498
        elif _get_at(current) == 'N':
1499
            if _get_at(current + 1) == 'N':
1500
                current += 2
1501
            else:
1502
                current += 1
1503
            (primary, secondary) = _metaph_add('N')
1504
            continue
1505
1506
        elif _get_at(current) == 'Ñ':
1507
            current += 1
1508
            (primary, secondary) = _metaph_add('N')
1509
            continue
1510
1511
        elif _get_at(current) == 'P':
1512
            if _get_at(current + 1) == 'H':
1513
                (primary, secondary) = _metaph_add('F')
1514
                current += 2
1515
                continue
1516
1517
            # also account for "campbell", "raspberry"
1518
            elif _string_at((current + 1), 1, frozenset(['P', 'B'])):
1519
                current += 2
1520
            else:
1521
                current += 1
1522
            (primary, secondary) = _metaph_add('P')
1523
            continue
1524
1525
        elif _get_at(current) == 'Q':
1526
            if _get_at(current + 1) == 'Q':
1527
                current += 2
1528
            else:
1529
                current += 1
1530
            (primary, secondary) = _metaph_add('K')
1531
            continue
1532
1533
        elif _get_at(current) == 'R':
1534
            # french e.g. 'rogier', but exclude 'hochmeier'
1535
            if (((current == last) and
1536
                 not _slavo_germanic() and
1537
                 _string_at((current - 2), 2, ['IE']) and
1538
                 not _string_at((current - 4), 2, frozenset(['ME', 'MA'])))):
1539
                (primary, secondary) = _metaph_add('', 'R')
1540
            else:
1541
                (primary, secondary) = _metaph_add('R')
1542
1543
            if _get_at(current + 1) == 'R':
1544
                current += 2
1545
            else:
1546
                current += 1
1547
            continue
1548
1549
        elif _get_at(current) == 'S':
1550
            # special cases 'island', 'isle', 'carlisle', 'carlysle'
1551
            if _string_at((current - 1), 3, frozenset(['ISL', 'YSL'])):
1552
                current += 1
1553
                continue
1554
1555
            # special case 'sugar-'
1556
            elif (current == 0) and _string_at(current, 5, ['SUGAR']):
1557
                (primary, secondary) = _metaph_add('X', 'S')
1558
                current += 1
1559
                continue
1560
1561
            elif _string_at(current, 2, ['SH']):
1562
                # Germanic
1563
                if _string_at((current + 1), 4,
1564
                              frozenset(['HEIM', 'HOEK', 'HOLM', 'HOLZ'])):
1565
                    (primary, secondary) = _metaph_add('S')
1566
                else:
1567
                    (primary, secondary) = _metaph_add('X')
1568
                current += 2
1569
                continue
1570
1571
            # Italian & Armenian
1572
            elif (_string_at(current, 3, frozenset(['SIO', 'SIA'])) or
1573
                  _string_at(current, 4, ['SIAN'])):
1574
                if not _slavo_germanic():
1575
                    (primary, secondary) = _metaph_add('S', 'X')
1576
                else:
1577
                    (primary, secondary) = _metaph_add('S')
1578
                current += 3
1579
                continue
1580
1581
            # German & anglicisations, e.g. 'smith' match 'schmidt',
1582
            #                               'snider' match 'schneider'
1583
            # also, -sz- in Slavic language although in Hungarian it is
1584
            #       pronounced 's'
1585
            elif (((current == 0) and
1586
                   _string_at((current + 1), 1,
1587
                              frozenset(['M', 'N', 'L', 'W']))) or
1588
                  _string_at((current + 1), 1, ['Z'])):
1589
                (primary, secondary) = _metaph_add('S', 'X')
1590
                if _string_at((current + 1), 1, ['Z']):
1591
                    current += 2
1592
                else:
1593
                    current += 1
1594
                continue
1595
1596
            elif _string_at(current, 2, ['SC']):
1597
                # Schlesinger's rule
1598
                if _get_at(current + 2) == 'H':
1599
                    # dutch origin, e.g. 'school', 'schooner'
1600
                    if _string_at((current + 3), 2,
1601
                                  frozenset(['OO', 'ER', 'EN', 'UY', 'ED',
1602
                                             'EM'])):
1603
                        # 'schermerhorn', 'schenker'
1604
                        if _string_at((current + 3), 2,
1605
                                      frozenset(['ER', 'EN'])):
1606
                            (primary, secondary) = _metaph_add('X', 'SK')
1607
                        else:
1608
                            (primary, secondary) = _metaph_add('SK')
1609
                        current += 3
1610
                        continue
1611
                    else:
1612
                        if (((current == 0) and not _is_vowel(3) and
1613
                             (_get_at(3) != 'W'))):
1614
                            (primary, secondary) = _metaph_add('X', 'S')
1615
                        else:
1616
                            (primary, secondary) = _metaph_add('X')
1617
                        current += 3
1618
                        continue
1619
1620
                elif _string_at((current + 2), 1,
1621
                                frozenset(['I', 'E', 'Y'])):
1622
                    (primary, secondary) = _metaph_add('S')
1623
                    current += 3
1624
                    continue
1625
1626
                # else
1627
                else:
1628
                    (primary, secondary) = _metaph_add('SK')
1629
                    current += 3
1630
                    continue
1631
1632
            else:
1633
                # french e.g. 'resnais', 'artois'
1634
                if (current == last) and _string_at((current - 2), 2,
1635
                                                    frozenset(['AI', 'OI'])):
1636
                    (primary, secondary) = _metaph_add('', 'S')
1637
                else:
1638
                    (primary, secondary) = _metaph_add('S')
1639
1640
                if _string_at((current + 1), 1, frozenset(['S', 'Z'])):
1641
                    current += 2
1642
                else:
1643
                    current += 1
1644
                continue
1645
1646
        elif _get_at(current) == 'T':
1647
            if _string_at(current, 4, ['TION']):
1648
                (primary, secondary) = _metaph_add('X')
1649
                current += 3
1650
                continue
1651
1652
            elif _string_at(current, 3, frozenset(['TIA', 'TCH'])):
1653
                (primary, secondary) = _metaph_add('X')
1654
                current += 3
1655
                continue
1656
1657
            elif (_string_at(current, 2, ['TH']) or
1658
                  _string_at(current, 3, ['TTH'])):
1659
                # special case 'thomas', 'thames' or germanic
1660
                if ((_string_at((current + 2), 2, frozenset(['OM', 'AM'])) or
1661
                     _string_at(0, 4, frozenset(['VAN ', 'VON '])) or
1662
                     _string_at(0, 3, ['SCH']))):
1663
                    (primary, secondary) = _metaph_add('T')
1664
                else:
1665
                    (primary, secondary) = _metaph_add('0', 'T')
1666
                current += 2
1667
                continue
1668
1669
            elif _string_at((current + 1), 1, frozenset(['T', 'D'])):
1670
                current += 2
1671
            else:
1672
                current += 1
1673
            (primary, secondary) = _metaph_add('T')
1674
            continue
1675
1676
        elif _get_at(current) == 'V':
1677
            if _get_at(current + 1) == 'V':
1678
                current += 2
1679
            else:
1680
                current += 1
1681
            (primary, secondary) = _metaph_add('F')
1682
            continue
1683
1684
        elif _get_at(current) == 'W':
1685
            # can also be in middle of word
1686
            if _string_at(current, 2, ['WR']):
1687
                (primary, secondary) = _metaph_add('R')
1688
                current += 2
1689
                continue
1690
            elif ((current == 0) and
1691
                  (_is_vowel(current + 1) or _string_at(current, 2, ['WH']))):
1692
                # Wasserman should match Vasserman
1693
                if _is_vowel(current + 1):
1694
                    (primary, secondary) = _metaph_add('A', 'F')
1695
                else:
1696
                    # need Uomo to match Womo
1697
                    (primary, secondary) = _metaph_add('A')
1698
1699
            # Arnow should match Arnoff
1700
            if ((((current == last) and _is_vowel(current - 1)) or
1701
                 _string_at((current - 1), 5,
1702
                            frozenset(['EWSKI', 'EWSKY', 'OWSKI', 'OWSKY'])) or
1703
                 _string_at(0, 3, ['SCH']))):
1704
                (primary, secondary) = _metaph_add('', 'F')
1705
                current += 1
1706
                continue
1707
            # Polish e.g. 'filipowicz'
1708
            elif _string_at(current, 4, frozenset(['WICZ', 'WITZ'])):
1709
                (primary, secondary) = _metaph_add('TS', 'FX')
1710
                current += 4
1711
                continue
1712
            # else skip it
1713
            else:
1714
                current += 1
1715
                continue
1716
1717
        elif _get_at(current) == 'X':
1718
            # French e.g. breaux
1719
            if (not ((current == last) and
1720
                     (_string_at((current - 3), 3,
1721
                                 frozenset(['IAU', 'EAU'])) or
1722
                      _string_at((current - 2), 2, frozenset(['AU', 'OU']))))):
1723
                (primary, secondary) = _metaph_add('KS')
1724
1725
            if _string_at((current + 1), 1, frozenset(['C', 'X'])):
1726
                current += 2
1727
            else:
1728
                current += 1
1729
            continue
1730
1731
        elif _get_at(current) == 'Z':
1732
            # Chinese Pinyin e.g. 'zhao'
1733
            if _get_at(current + 1) == 'H':
1734
                (primary, secondary) = _metaph_add('J')
1735
                current += 2
1736
                continue
1737
            elif (_string_at((current + 1), 2,
1738
                             frozenset(['ZO', 'ZI', 'ZA'])) or
1739
                  (_slavo_germanic() and ((current > 0) and
1740
                                          _get_at(current - 1) != 'T'))):
1741
                (primary, secondary) = _metaph_add('S', 'TS')
1742
            else:
1743
                (primary, secondary) = _metaph_add('S')
1744
1745
            if _get_at(current + 1) == 'Z':
1746
                current += 2
1747
            else:
1748
                current += 1
1749
            continue
1750
1751
        else:
1752
            current += 1
1753
1754
    if maxlength and maxlength < _INFINITY:
1755
        primary = primary[:maxlength]
1756
        secondary = secondary[:maxlength]
1757
    if primary == secondary:
1758
        secondary = ''
1759
1760
    return (primary, secondary)
1761
1762
1763
def caverphone(word, version=2):
    """Return the Caverphone code for a word.

    A description of version 1 of the algorithm can be found at:
    http://caversham.otago.ac.nz/files/working/ctp060902.pdf

    A description of version 2 of the algorithm can be found at:
    http://caversham.otago.ac.nz/files/working/ctp150804.pdf

    The word is lower-cased and reduced to the letters a-z, then rewritten
    by an ordered series of replacements. During that rewriting, upper-case
    letters are finished code symbols, '2' marks a silent position and '3'
    marks a vowel position; both markers are deleted at the end.

    :param str word: the word to transform
    :param int version: the version of Caverphone to employ for encoding
        (defaults to 2); any value other than 1 selects the version 2 rules
    :returns: the Caverphone value, right-padded with '1' to 10 characters
        (version 2) or 6 characters (version 1)
    :rtype: str

    >>> caverphone('Christopher')
    'KRSTFA1111'
    >>> caverphone('Niall')
    'NA11111111'
    >>> caverphone('Smith')
    'SMT1111111'
    >>> caverphone('Schmidt')
    'SKMT111111'

    >>> caverphone('Christopher', 1)
    'KRSTF1'
    >>> caverphone('Niall', 1)
    'N11111'
    >>> caverphone('Smith', 1)
    'SMT111'
    >>> caverphone('Schmidt', 1)
    'SKMT11'
    """
    _vowels = frozenset('aeiou')
    _letters = frozenset('abcdefghijklmnopqrstuvwxyz')
    is_v1 = version == 1

    word = ''.join(c for c in word.lower() if c in _letters)

    # the main replacement algorithm
    # Version 2 drops one final 'e' before any other rule is applied
    if not is_v1 and word[-1:] == 'e':
        word = word[:-1]

    if word:
        # Initial '-ough' words: 'gh' becomes silent marker + 'f'.
        # The prefixes start with different letters, so at most one applies.
        ough_prefixes = ('cough', 'rough', 'tough', 'enough')
        if not is_v1:
            ough_prefixes += ('trough',)
        for prefix in ough_prefixes:
            if word.startswith(prefix):
                word = prefix[:-2] + '2f' + word[len(prefix):]
                break
        if word[:2] == 'gn':
            word = '2n' + word[2:]
        if word[-2:] == 'mb':
            word = word[:-1] + '2'

        # Order matters: digraph/trigraph rules must run before the
        # single-letter rules that would otherwise consume their letters.
        for src, tar in (('cq', '2q'), ('ci', 'si'), ('ce', 'se'),
                         ('cy', 'sy'), ('tch', '2ch'), ('c', 'k'),
                         ('q', 'k'), ('x', 'k'), ('v', 'f'), ('dg', '2g'),
                         ('tio', 'sio'), ('tia', 'sia'), ('d', 't'),
                         ('ph', 'fh'), ('b', 'p'), ('sh', 's2'),
                         ('z', 's')):
            word = word.replace(src, tar)

        # An initial vowel is kept as 'A'; every other vowel becomes '3'
        if word[0] in _vowels:
            word = 'A' + word[1:]
        for vowel in 'aeiou':
            word = word.replace(vowel, '3')

        if not is_v1:
            word = word.replace('j', 'y')
            if word[:2] == 'y3':
                word = 'Y3' + word[2:]
            if word[:1] == 'y':
                word = 'A' + word[1:]
            word = word.replace('y', '3')

        for src, tar in (('3gh3', '3kh3'), ('gh', '22'), ('g', 'k')):
            word = word.replace(src, tar)

        # Collapse each run of these consonants to one upper-case symbol
        for letter in 'stpkfmn':
            word = re.sub(letter + '+', letter.upper(), word)

        word = word.replace('w3', 'W3')
        if is_v1:
            word = word.replace('wy', 'Wy')
        word = word.replace('wh3', 'Wh3')
        if is_v1:
            word = word.replace('why', 'Why')
        if not is_v1 and word[-1:] == 'w':
            word = word[:-1] + '3'
        word = word.replace('w', '2')

        if word[:1] == 'h':
            word = 'A' + word[1:]
        word = word.replace('h', '2')

        # 'r' and 'l' follow the same pattern: kept before a vowel marker
        # (and before 'y' in version 1), turned into a vowel marker when
        # final in version 2, silent otherwise.
        for liquid in 'rl':
            word = word.replace(liquid + '3', liquid.upper() + '3')
            if is_v1:
                word = word.replace(liquid + 'y', liquid.upper() + 'y')
            elif word[-1:] == liquid:
                word = word[:-1] + '3'
            word = word.replace(liquid, '2')

        if is_v1:
            word = word.replace('j', 'y')
            word = word.replace('y3', 'Y3')
            word = word.replace('y', '2')

        # Drop the silent markers, then the vowel markers; version 2 keeps
        # a final vowel position as 'A'.
        word = word.replace('2', '')
        if not is_v1 and word[-1:] == '3':
            word = word[:-1] + 'A'
        word = word.replace('3', '')

    # pad with 1s, then extract the necessary length of code
    return (word + '1' * 10)[:6 if is_v1 else 10]
1902
1903
1904
def alpha_sis(word, maxlength=14):
    """Return the IBM Alpha Search Inquiry System code for a word.

    Based on the algorithm described in "Accessing individual records from
    personal data files using non-unique identifiers" / Gwendolyn B. Moore,
    et al.; prepared for the Institute for Computer Sciences and Technology,
    National Bureau of Standards, Washington, D.C (1977):
    https://archive.org/stream/accessingindivid00moor#page/15/mode/1up

    A collection is necessary since there can be multiple values for a
    single word. But the collection must be ordered since the first value
    is the primary coding.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 14);
        values are clamped to the range [4, 64] and None is treated as 64
    :returns: the Alpha SIS value(s), each zero-padded or truncated to
        maxlength characters
    :rtype: tuple

    >>> alpha_sis('Christopher')
    ('06401840000000', '07040184000000', '04018400000000')
    >>> alpha_sis('Niall')
    ('02500000000000',)
    >>> alpha_sis('Smith')
    ('03100000000000',)
    >>> alpha_sis('Schmidt')
    ('06310000000000',)
    """
    _alpha_sis_initials = {'GF': '08', 'GM': '03', 'GN': '02', 'KN': '02',
                           'PF': '08', 'PN': '02', 'PS': '00', 'WR': '04',
                           'A': '1', 'E': '1', 'H': '2', 'I': '1', 'J': '3',
                           'O': '1', 'U': '1', 'W': '4', 'Y': '5'}
    # Longer keys come first so that the first match is the longest match
    _alpha_sis_initials_order = ('GF', 'GM', 'GN', 'KN', 'PF', 'PN', 'PS',
                                 'WR', 'A', 'E', 'H', 'I', 'J', 'O', 'U', 'W',
                                 'Y')
    # A tuple value lists alternative codings; the first is the primary one
    _alpha_sis_basic = {'SCH': '6', 'CZ': ('70', '6', '0'),
                        'CH': ('6', '70', '0'), 'CK': ('7', '6'),
                        'DS': ('0', '10'), 'DZ': ('0', '10'),
                        'TS': ('0', '10'), 'TZ': ('0', '10'), 'CI': '0',
                        'CY': '0', 'CE': '0', 'SH': '6', 'DG': '7', 'PH': '8',
                        'C': ('7', '6'), 'K': ('7', '6'), 'Z': '0', 'S': '0',
                        'D': '1', 'T': '1', 'N': '2', 'M': '3', 'R': '4',
                        'L': '5', 'J': '6', 'G': '7', 'Q': '7', 'X': '7',
                        'F': '8', 'V': '8', 'B': '9', 'P': '9'}
    _alpha_sis_basic_order = ('SCH', 'CZ', 'CH', 'CK', 'DS', 'DZ', 'TS', 'TZ',
                              'CI', 'CY', 'CE', 'SH', 'DG', 'PH', 'C', 'K',
                              'Z', 'S', 'D', 'T', 'N', 'M', 'R', 'L', 'J',
                              'G', 'Q', 'X', 'F', 'V', 'B', 'P')
    _letters = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in _letters)

    # Clamp maxlength to [4, 64]
    maxlength = 64 if maxlength is None else min(max(4, maxlength), 64)

    # Do special processing for initial substrings; the code starts with
    # '0' when no special initial applies
    pos = 0
    initial_code = '0'
    for key in _alpha_sis_initials_order:
        if word.startswith(key):
            initial_code = _alpha_sis_initials[key]
            pos = len(key)
            break
    alpha = [initial_code]

    # Whether or not any special initial codes were encoded, iterate
    # through the length of the word in the main encoding loop
    while pos < len(word):
        for key in _alpha_sis_basic_order:
            if word.startswith(key, pos):
                values = _alpha_sis_basic[key]
                if not isinstance(values, tuple):
                    values = (values,)
                # Every alternative forks every code built so far; all
                # forks of the first alternative come first, which keeps
                # the primary coding at index 0
                alpha = [code + value for value in values for code in alpha]
                pos += len(key)
                break
        else:
            # Uncoded letter: leave a placeholder so that equal codes on
            # either side of it are not treated as a doublet
            alpha = [code + '_' for code in alpha]
            pos += 1

    # Trim doublets and placeholders. The second member of each equal
    # adjacent pair is dropped and scanning resumes after the pair, so a
    # run of three equal characters keeps two.
    for i, code in enumerate(alpha):
        pos = 1
        while pos < len(code):
            if code[pos] == code[pos - 1]:
                code = code[:pos] + code[pos + 1:]
            pos += 1
        alpha[i] = code.replace('_', '')

    # Trim codes and return tuple
    return tuple((code + '0' * maxlength)[:maxlength] for code in alpha)
2007
2008
2009
def fuzzy_soundex(word, maxlength=5, zero_pad=True):
    """Return the Fuzzy Soundex code for a word.

    Fuzzy Soundex is an algorithm derived from Soundex, defined in:
    Holmes, David and M. Catherine McCabe. "Improving Precision and Recall for
    Soundex Retrieval."
    http://wayback.archive.org/web/20100629121128/http://www.ir.iit.edu/publications/downloads/IEEESoundexV5.pdf

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 5);
        values are clamped to the range [4, 64] and None is treated as 64
    :param bool zero_pad: pad the end of the return value with 0s to achieve
        a maxlength string
    :returns: the Fuzzy Soundex value
    :rtype: str

    >>> fuzzy_soundex('Christopher')
    'K6931'
    >>> fuzzy_soundex('Niall')
    'N4000'
    >>> fuzzy_soundex('Smith')
    'S5300'
    >>> fuzzy_soundex('Schmidt')
    'S5300'
    """
    # Letter -> code table; '-' marks letters (H, W, Y) that are dropped
    _fuzzy_soundex_translation = dict(zip((ord(letter) for letter in
                                           'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                          '0193017-07745501769301-7-9'))
    # Two-letter word beginnings and what they are rewritten to
    _initial_digraphs = {'CS': 'SS', 'CZ': 'SS', 'TS': 'SS', 'TZ': 'SS',
                         'GN': 'NN', 'HR': 'RR', 'WR': 'RR', 'HW': 'WW',
                         'KN': 'NN', 'NG': 'NN'}
    # Word endings, tested in this order; only the first match is applied
    _final_clusters = (('CH', 'KK'), ('NT', 'TT'), ('RT', 'RR'),
                       ('RDT', 'RR'))
    # Substitutions applied anywhere in the word, in this order
    _substitutions = (('CA', 'KA'), ('CC', 'KK'), ('CK', 'KK'),
                      ('CE', 'SE'), ('CHL', 'KL'), ('CL', 'KL'),
                      ('CHR', 'KR'), ('CR', 'KR'), ('CI', 'SI'),
                      ('CO', 'KO'), ('CU', 'KU'), ('CY', 'SY'),
                      ('DG', 'GG'), ('GH', 'HH'), ('MAC', 'MK'),
                      ('MC', 'MK'), ('NST', 'NSS'), ('PF', 'FF'),
                      ('PH', 'FF'), ('SCH', 'SSS'), ('TIO', 'SIO'),
                      ('TIA', 'SIO'), ('TCH', 'CHH'))
    _uncoded_initials = frozenset('HWY')

    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    # Clamp maxlength to [4, 64]
    maxlength = 64 if maxlength is None else min(max(4, maxlength), 64)

    if not word:
        if zero_pad:
            return '0' * maxlength
        return '0'

    head = word[:2]
    if head in _initial_digraphs:
        word = _initial_digraphs[head] + word[2:]

    for ending, replacement in _final_clusters:
        if word.endswith(ending):
            word = word[:-len(ending)] + replacement
            break

    for src, tar in _substitutions:
        word = word.replace(src, tar)

    sdx = word.translate(_fuzzy_soundex_translation)
    sdx = sdx.replace('-', '')

    # remove repeating characters
    sdx = _delete_consecutive_repeats(sdx)

    # The code always starts with the (rewritten) first letter. H, W and Y
    # were dropped from sdx above, so the letter is prepended; otherwise it
    # takes the place of the first character of sdx.
    if word[0] in _uncoded_initials:
        sdx = word[0] + sdx
    else:
        sdx = word[0] + sdx[1:]

    # Vowel codes are removed only now, after repeats were collapsed
    sdx = sdx.replace('0', '')

    if zero_pad:
        sdx += '0' * maxlength

    return sdx[:maxlength]
2112
2113
2114
def phonex(word, maxlength=4, zero_pad=True):
    """Return the Phonex code for a word.

    Phonex is an algorithm derived from Soundex, defined in:
    Lait, A. J. and B. Randell. "An Assessment of Name Matching Algorithms".
    http://homepages.cs.ncl.ac.uk/brian.randell/Genealogy/NameMatching.pdf

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 4);
        values are clamped to the range [4, 64] and None is treated as 64
    :param bool zero_pad: pad the end of the return value with 0s to achieve
        a maxlength string
    :returns: the Phonex value ('0' if nothing could be encoded and
        zero_pad is false)
    :rtype: str

    >>> phonex('Christopher')
    'C623'
    >>> phonex('Niall')
    'N400'
    >>> phonex('Schmidt')
    'S253'
    >>> phonex('Smith')
    'S530'
    """
    # Sets (not strings) are used for membership tests on name[i+1:i+2]:
    # that slice is '' at the end of the name, and '' is "in" every string.
    _vowels_y = frozenset('AEIOUY')
    _code_1 = frozenset('BPFV')
    _code_2 = frozenset('CSKGJQXZ')
    _code_3 = frozenset('DT')
    _code_5 = frozenset('MN')
    _absorbed_after_nasal = frozenset('DG')
    # Phonetic equivalents of the first two characters
    _initial_digraphs = {'KN': 'N', 'PH': 'F', 'WR': 'R'}
    # Phonetic equivalents of the first character
    _initial_classes = (('AEIOUY', 'A'), ('BP', 'B'), ('VF', 'F'),
                        ('KQC', 'C'), ('JG', 'G'), ('ZS', 'S'))

    name = unicodedata.normalize('NFKD', text_type(word.upper()))
    name = name.replace('ß', 'SS')

    # Clamp maxlength to [4, 64]
    maxlength = 64 if maxlength is None else min(max(4, maxlength), 64)

    name_code = last = ''

    # Remove any trailing Ss
    name = name.rstrip('S')

    head = name[:2]
    if head in _initial_digraphs:
        name = _initial_digraphs[head] + name[2:]

    # Special case, ignore H first letter (subsequent Hs ignored anyway)
    if name[:1] == 'H':
        name = name[1:]

    if name:
        for letters, representative in _initial_classes:
            if name[0] in letters:
                name = representative + name[1:]
                break

        name_code = last = name[0]

    # MODIFIED SOUNDEX CODE
    for i in range(1, len(name)):
        letter = name[i]
        following = name[i+1:i+2]
        is_final = i + 1 == len(name)

        code = '0'
        if letter in _code_1:
            code = '1'
        elif letter in _code_2:
            code = '2'
        elif letter in _code_3:
            if following != 'C':
                code = '3'
        elif letter == 'L':
            if following in _vowels_y or is_final:
                code = '4'
        elif letter in _code_5:
            if following in _absorbed_after_nasal:
                # Overwrite the D/G with a copy of this letter; the copy
                # gets the same code and is then dropped as a duplicate.
                name = name[:i+1] + letter + name[i+2:]
            code = '5'
        elif letter == 'R':
            if following in _vowels_y or is_final:
                code = '6'

        if code != last and code != '0':
            name_code += code

        # last tracks the last emitted symbol, not the last letter's code,
        # so uncoded letters do not separate two equal codes
        last = name_code[-1]

    if zero_pad:
        name_code += '0' * maxlength
    if not name_code:
        name_code = '0'
    return name_code[:maxlength]
2219
2220
2221
def phonem(word):
    """Return the Phonem code for a word.

    Phonem is defined in Wilde, Georg and Carsten Meyer. 1999. "Doppelgaenger
    gesucht - Ein Programm fuer kontextsensitive phonetische Textumwandlung."
    ct Magazin fuer Computer & Technik 25/1999.

    This version is based on the Perl implementation documented at:
    http://phonetik.phil-fak.uni-koeln.de/fileadmin/home/ritters/Allgemeine_Dateien/Martin_Wilz.pdf
    It includes some enhancements presented in the Java port at:
    https://github.com/dcm4che/dcm4che/blob/master/dcm4che-soundex/src/main/java/org/dcm4che3/soundex/Phonem.java

    Phonem is intended chiefly for German names/words.

    :param str word: the word to transform
    :returns: the Phonem value
    :rtype: str

    >>> phonem('Christopher')
    'CRYSDOVR'
    >>> phonem('Niall')
    'NYAL'
    >>> phonem('Smith')
    'SMYD'
    >>> phonem('Schmidt')
    'CMYD'
    """
    # Multi-letter substitutions, applied in this order. '§' is a temporary
    # placeholder that the single-character table below turns into 'U'.
    _phonem_substitutions = (('SC', 'C'), ('SZ', 'C'), ('CZ', 'C'),
                             ('TZ', 'C'), ('TS', 'C'), ('KS', 'X'),
                             ('PF', 'V'), ('QU', 'KW'), ('PH', 'V'),
                             ('UE', 'Y'), ('AE', 'E'), ('OE', 'Ö'),
                             ('EI', 'AY'), ('EY', 'AY'), ('EU', 'OY'),
                             ('AU', 'A§'), ('OU', '§'))
    # Single-character table: code point of the n-th character of the first
    # string maps to the n-th character of the second string
    _phonem_translation = dict(zip((ord(char) for char in
                                    'ZKGQÇÑßFWPTÁÀÂÃÅÄÆÉÈÊËIJÌÍÎÏÜݧÚÙÛÔÒÓÕØ'),
                                   'CCCCCNSVVBDAAAAAEEEEEEYYYYYYYYUUUUOOOOÖ'))
    # Only these characters may appear in the result
    _phonem_alphabet = frozenset('ABCDLMNORSUVWXYÖ')

    # NFC keeps accented letters as single code points so that the
    # single-character table can match them
    word = unicodedata.normalize('NFC', text_type(word.upper()))
    for src, tar in _phonem_substitutions:
        word = word.replace(src, tar)
    word = word.translate(_phonem_translation)

    return ''.join(c for c in _delete_consecutive_repeats(word)
                   if c in _phonem_alphabet)
2265
2266
2267
def phonix(word, maxlength=4, zero_pad=True):
    """Return the Phonix code for a word.

    Phonix is a Soundex-like algorithm defined in:
    T.N. Gadd: PHONIX --- The Algorithm, Program 24/4, 1990, p.363-366.

    This implementation is based on
    http://cpansearch.perl.org/src/ULPFR/WAIT-1.800/soundex.c
    http://cs.anu.edu.au/people/Peter.Christen/Febrl/febrl-0.4.01/encode.py
    and
    https://metacpan.org/pod/Text::Phonetic::Phonix

    The word is upper-cased, NFKD-normalized and reduced to the letters A-Z,
    then rewritten by an ordered list of context-sensitive substitutions.
    The first letter is kept (any leading vowel or Y becomes 'v'), the
    remaining letters are mapped to digits, runs of equal symbols are
    collapsed and zeros are dropped.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 4);
        values are clamped to the range [4, 64] and None means 64
    :param bool zero_pad: pad the end of the return value with 0s to achieve
        a maxlength string
    :returns: the Phonix value
    :rtype: str

    >>> phonix('Christopher')
    'K683'
    >>> phonix('Niall')
    'N400'
    >>> phonix('Smith')
    'S530'
    >>> phonix('Schmidt')
    'S530'
    """
    # pylint: disable=too-many-branches
    def _start_repl(word, src, tar, post=None):
        r"""Replace src with tar at the start of word.

        If post is given, src must be followed by one of its members.
        """
        if post:
            for i in post:
                if word.startswith(src+i):
                    return tar + word[len(src):]
        elif word.startswith(src):
            return tar + word[len(src):]
        return word

    def _end_repl(word, src, tar, pre=None):
        r"""Replace src with tar at the end of word.

        If pre is given, src must be preceded by one of its members.
        """
        if pre:
            for i in pre:
                if word.endswith(i+src):
                    return word[:-len(src)] + tar
        elif word.endswith(src):
            return word[:-len(src)] + tar
        return word

    def _mid_repl(word, src, tar, pre=None, post=None):
        r"""Replace src with tar in the middle of word.

        The first and/or last character is shielded from replacement unless
        it is needed as the pre/post context of a match.
        """
        if pre or post:
            if not pre:
                return word[0] + _all_repl(word[1:], src, tar, pre, post)
            if not post:
                return _all_repl(word[:-1], src, tar, pre, post) + word[-1]
            return _all_repl(word, src, tar, pre, post)
        # A word shorter than two characters has no middle. Without this
        # guard word[0] + '' + word[-1] would duplicate a lone character.
        if len(word) < 2:
            return word
        return (word[0] + _all_repl(word[1:-1], src, tar, pre, post) +
                word[-1])

    def _all_repl(word, src, tar, pre=None, post=None):
        r"""Replace src with tar anywhere in word.

        If pre and/or post are given, src is only replaced where it is
        preceded/followed by one of their members.
        """
        if pre or post:
            # A missing context side is treated as the empty string.
            pre = pre or ('',)
            post = post or ('',)
            for i in pre:
                for j in post:
                    word = word.replace(i+src+j, i+tar+j)
            return word
        return word.replace(src, tar)

    _vow = frozenset('AEIOU')
    _con = frozenset('BCDFGHJKLMNPQRSTVWXYZ')
    _vow_or_y = frozenset('AEIOUY')
    _letters = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    # Each entry is (replacement function, src, tar[, context sets...]).
    # The rules are applied strictly in this order.
    _phonix_substitutions = ((_all_repl, 'DG', 'G'),
                             (_all_repl, 'CO', 'KO'),
                             (_all_repl, 'CA', 'KA'),
                             (_all_repl, 'CU', 'KU'),
                             (_all_repl, 'CY', 'SI'),
                             (_all_repl, 'CI', 'SI'),
                             (_all_repl, 'CE', 'SE'),
                             (_start_repl, 'CL', 'KL', _vow),
                             (_all_repl, 'CK', 'K'),
                             (_end_repl, 'GC', 'K'),
                             (_end_repl, 'JC', 'K'),
                             (_start_repl, 'CHR', 'KR', _vow),
                             (_start_repl, 'CR', 'KR', _vow),
                             (_start_repl, 'WR', 'R'),
                             (_all_repl, 'NC', 'NK'),
                             (_all_repl, 'CT', 'KT'),
                             (_all_repl, 'PH', 'F'),
                             (_all_repl, 'AA', 'AR'),
                             (_all_repl, 'SCH', 'SH'),
                             (_all_repl, 'BTL', 'TL'),
                             (_all_repl, 'GHT', 'T'),
                             (_all_repl, 'AUGH', 'ARF'),
                             (_mid_repl, 'LJ', 'LD', _vow, _vow),
                             (_all_repl, 'LOUGH', 'LOW'),
                             (_start_repl, 'Q', 'KW'),
                             (_start_repl, 'KN', 'N'),
                             (_end_repl, 'GN', 'N'),
                             (_all_repl, 'GHN', 'N'),
                             (_end_repl, 'GNE', 'N'),
                             (_all_repl, 'GHNE', 'NE'),
                             (_end_repl, 'GNES', 'NS'),
                             (_start_repl, 'GN', 'N'),
                             (_mid_repl, 'GN', 'N', None, _con),
                             (_end_repl, 'GN', 'N'),
                             (_start_repl, 'PS', 'S'),
                             (_start_repl, 'PT', 'T'),
                             (_start_repl, 'CZ', 'C'),
                             (_mid_repl, 'WZ', 'Z', _vow),
                             (_mid_repl, 'CZ', 'CH'),
                             (_all_repl, 'LZ', 'LSH'),
                             (_all_repl, 'RZ', 'RSH'),
                             (_mid_repl, 'Z', 'S', None, _vow),
                             (_all_repl, 'ZZ', 'TS'),
                             (_mid_repl, 'Z', 'TS', _con),
                             (_all_repl, 'HROUG', 'REW'),
                             (_all_repl, 'OUGH', 'OF'),
                             (_mid_repl, 'Q', 'KW', _vow, _vow),
                             (_mid_repl, 'J', 'Y', _vow, _vow),
                             (_start_repl, 'YJ', 'Y', _vow),
                             (_start_repl, 'GH', 'G'),
                             (_end_repl, 'GH', 'E', _vow),
                             (_start_repl, 'CY', 'S'),
                             (_all_repl, 'NX', 'NKS'),
                             (_start_repl, 'PF', 'F'),
                             (_end_repl, 'DT', 'T'),
                             (_end_repl, 'TL', 'TIL'),
                             (_end_repl, 'DL', 'DIL'),
                             (_all_repl, 'YTH', 'ITH'),
                             (_start_repl, 'TJ', 'CH', _vow),
                             (_start_repl, 'TSJ', 'CH', _vow),
                             (_start_repl, 'TS', 'T', _vow),
                             (_all_repl, 'TCH', 'CH'),
                             (_mid_repl, 'WSK', 'VSKIE', _vow),
                             (_end_repl, 'WSK', 'VSKIE', _vow),
                             (_start_repl, 'MN', 'N', _vow),
                             (_start_repl, 'PN', 'N', _vow),
                             (_mid_repl, 'STL', 'SL', _vow),
                             (_end_repl, 'STL', 'SL', _vow),
                             (_end_repl, 'TNT', 'ENT'),
                             (_end_repl, 'EAUX', 'OH'),
                             (_all_repl, 'EXCI', 'ECS'),
                             (_all_repl, 'X', 'ECS'),
                             (_end_repl, 'NED', 'ND'),
                             (_all_repl, 'JR', 'DR'),
                             (_end_repl, 'EE', 'EA'),
                             (_all_repl, 'ZS', 'S'),
                             (_mid_repl, 'R', 'AH', _vow, _con),
                             (_end_repl, 'R', 'AH', _vow),
                             (_mid_repl, 'HR', 'AH', _vow, _con),
                             (_end_repl, 'HR', 'AH', _vow),
                             (_end_repl, 'HR', 'AH', _vow),
                             (_end_repl, 'RE', 'AR'),
                             (_end_repl, 'R', 'AH', _vow),
                             (_all_repl, 'LLE', 'LE'),
                             (_end_repl, 'LE', 'ILE', _con),
                             (_end_repl, 'LES', 'ILES', _con),
                             (_end_repl, 'E', ''),
                             (_end_repl, 'ES', 'S'),
                             (_end_repl, 'SS', 'AS', _vow),
                             (_end_repl, 'MB', 'M', _vow),
                             (_all_repl, 'MPTS', 'MPS'),
                             (_all_repl, 'MPS', 'MS'),
                             (_all_repl, 'MPT', 'MT'))

    # Letter -> digit table applied to everything after the first letter.
    _phonix_translation = dict(zip((ord(_) for _ in
                                    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                   '01230720022455012683070808'))

    sdx = ''

    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in _letters)
    if word:
        for rule in _phonix_substitutions:
            replaced = rule[0](word, *rule[1:])
            # A rule must never erase the whole word (e.g. the final-E rule
            # applied to the single letter 'E'); word[0] is needed below.
            if replaced:
                word = replaced
        if word[0] in _vow_or_y:
            # All leading vowels (and Y) share the marker 'v'.
            sdx = 'v' + word[1:].translate(_phonix_translation)
        else:
            sdx = word[0] + word[1:].translate(_phonix_translation)
        sdx = _delete_consecutive_repeats(sdx)
        sdx = sdx.replace('0', '')

    # Clamp maxlength to [4, 64]
    if maxlength is not None:
        maxlength = min(max(4, maxlength), 64)
    else:
        maxlength = 64

    if zero_pad:
        sdx += '0' * maxlength
    if not sdx:
        sdx = '0'
    return sdx[:maxlength]
2473
2474
2475
def sfinxbis(word, maxlength=None):
    """Return the SfinxBis code for a word.

    SfinxBis is a Soundex-like algorithm defined in:
    http://www.swami.se/download/18.248ad5af12aa8136533800091/SfinxBis.pdf

    This implementation follows the reference implementation:
    http://www.swami.se/download/18.248ad5af12aa8136533800093/swamiSfinxBis.java.txt

    SfinxBis is intended chiefly for Swedish names.

    The input is split on whitespace and hyphens after nobiliary particles
    are removed, and one code is produced per remaining token.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to
        unlimited)
    :returns: the SfinxBis value, one code per name token; ('',) if no
        token remains
    :rtype: tuple

    >>> sfinxbis('Christopher')
    ('K68376',)
    >>> sfinxbis('Niall')
    ('N4',)
    >>> sfinxbis('Smith')
    ('S53',)
    >>> sfinxbis('Schmidt')
    ('S53',)

    >>> sfinxbis('Johansson')
    ('J585',)
    >>> sfinxbis('Sjöberg')
    ('#162',)
    """
    # Nobiliary particles ("adelstitlar"); multi-word entries come first so
    # that they are removed before their single-word components.
    adelstitler = (' DE LA ', ' DE LAS ', ' DE LOS ', ' VAN DE ', ' VAN DEN ',
                   ' VAN DER ', ' VON DEM ', ' VON DER ',
                   ' AF ', ' AV ', ' DA ', ' DE ', ' DEL ', ' DEN ', ' DES ',
                   ' DI ', ' DO ', ' DON ', ' DOS ', ' DU ', ' E ', ' IN ',
                   ' LA ', ' LE ', ' MAC ', ' MC ', ' VAN ', ' VON ', ' Y ',
                   ' S:T ')

    # Ordered strings are used wherever rewrite rules are applied in a loop:
    # the rules can overlap (e.g. 'EI' and 'IY' in 'EIY'), so iterating over
    # a set would make the result depend on hash order.
    _harde_vokaler_ordnade = 'AOUÅ'
    _mjuka_vokaler_ordnade = 'EIYÄÖ'
    _konsonanter_ordnade = 'BCDFGHJKLMNPQRSTVWXZ'

    # Sets are used for membership tests only (a str would wrongly report
    # that the empty slice '' is a member).
    _harde_vokaler = frozenset(_harde_vokaler_ordnade)
    _mjuka_vokaler = frozenset(_mjuka_vokaler_ordnade)
    _konsonanter = frozenset(_konsonanter_ordnade)
    _vokaler = _mjuka_vokaler | _harde_vokaler
    _alfabet = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZÅÄÖ')

    # Letter -> digit table; every vowel maps to '9', which is dropped later.
    _sfinxbis_translation = dict(zip((ord(_) for _ in
                                      'BCDFGHJKLMNPQRSTVZAOUÅEIYÄÖ'),
                                     '123729224551268378999999999'))

    # Foreign letters -> closest Swedish letter.
    _sfinxbis_substitutions = dict(zip((ord(_) for _ in
                                        'WZÀÁÂÃÆÇÈÉÊËÌÍÎÏÑÒÓÔÕØÙÚÛÜÝ'),
                                       'VSAAAAÄCEEEEIIIINOOOOÖUUUYY'))

    def _foersvensker(ordet):
        """Return the Swedish-ized form of the word."""
        ordet = ordet.replace('STIERN', 'STJÄRN')
        ordet = ordet.replace('HIE', 'HJ')
        ordet = ordet.replace('SIÖ', 'SJÖ')
        ordet = ordet.replace('SCH', 'SH')
        ordet = ordet.replace('QU', 'KV')
        ordet = ordet.replace('IO', 'JO')
        ordet = ordet.replace('PH', 'F')

        # Ü, Y or I directly after a vowel becomes J; hard vowels are
        # handled before soft vowels, each in declaration order.
        for vokal in _harde_vokaler_ordnade + _mjuka_vokaler_ordnade:
            for efter in 'ÜYI':
                ordet = ordet.replace(vokal+efter, vokal+'J')

        # Drop an H that directly precedes a consonant.
        if 'H' in ordet:
            for konsonant in _konsonanter_ordnade:
                ordet = ordet.replace('H'+konsonant, konsonant)

        ordet = ordet.translate(_sfinxbis_substitutions)

        ordet = ordet.replace('Ð', 'ETH')
        ordet = ordet.replace('Þ', 'TH')
        ordet = ordet.replace('ß', 'SS')

        return ordet

    def _koda_foersta_ljudet(ordet):
        """Return the word with the first sound coded.

        Initial vowels become '$' and the "sje"/"tje" sounds become '#';
        the first matching rule wins.
        """
        if ordet[0:1] in _vokaler:
            ordet = '$' + ordet[1:]
        elif ordet[0:2] in ('DJ', 'GJ', 'HJ', 'LJ'):
            ordet = 'J' + ordet[2:]
        elif ordet[0:1] == 'G' and ordet[1:2] in _mjuka_vokaler:
            ordet = 'J' + ordet[1:]
        elif ordet[0:1] == 'Q':
            ordet = 'K' + ordet[1:]
        elif ordet[0:2] == 'CH' and ordet[2:3] in _vokaler:
            ordet = '#' + ordet[2:]
        elif ordet[0:1] == 'C' and ordet[1:2] in _harde_vokaler:
            ordet = 'K' + ordet[1:]
        elif ordet[0:1] == 'C' and ordet[1:2] in _konsonanter:
            ordet = 'K' + ordet[1:]
        elif ordet[0:1] == 'X':
            ordet = 'S' + ordet[1:]
        elif ordet[0:1] == 'C' and ordet[1:2] in _mjuka_vokaler:
            ordet = 'S' + ordet[1:]
        elif ordet[0:3] in ('SKJ', 'STJ', 'SCH'):
            ordet = '#' + ordet[3:]
        elif ordet[0:2] in ('SH', 'KJ', 'TJ', 'SJ'):
            ordet = '#' + ordet[2:]
        elif ordet[0:2] == 'SK' and ordet[2:3] in _mjuka_vokaler:
            ordet = '#' + ordet[2:]
        elif ordet[0:1] == 'K' and ordet[1:2] in _mjuka_vokaler:
            ordet = '#' + ordet[1:]
        return ordet

    # Step 1: upper-case
    word = unicodedata.normalize('NFC', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = word.replace('-', ' ')

    # Step 2: remove nobiliary particles
    for adelstitel in adelstitler:
        # Loop because adjacent particles share their separating space, so a
        # single replace() pass can leave one of them behind.
        while adelstitel in word:
            word = word.replace(adelstitel, ' ')
        # A particle at the very start has no leading space.
        if word.startswith(adelstitel[1:]):
            word = word[len(adelstitel)-1:]

    # Split word into tokens
    ordlista = word.split()

    # Step 3: collapse repeated characters in each token
    ordlista = [_delete_consecutive_repeats(ordet) for ordet in ordlista]
    if not ordlista:
        return ('',)

    # Step 4: Swedish-ize the spelling
    ordlista = [_foersvensker(ordet) for ordet in ordlista]

    # Step 5: remove every character that is not A-Ö (65-90,196,197,214)
    ordlista = [''.join(c for c in ordet if c in _alfabet)
                for ordet in ordlista]

    # Step 6: code the first sound
    ordlista = [_koda_foersta_ljudet(ordet) for ordet in ordlista]

    # Step 7: split each name into first symbol and remainder
    rest = [ordet[1:] for ordet in ordlista]

    # Step 8: phonetic transformation of the remainder
    rest = [ordet.replace('DT', 'T') for ordet in rest]
    rest = [ordet.replace('X', 'KS') for ordet in rest]

    # Step 9: code the remainder as digits; C before a soft vowel sounds
    # like S (8) rather than K (2), so it is coded before the table lookup
    for vokal in _mjuka_vokaler_ordnade:
        rest = [ordet.replace('C'+vokal, '8'+vokal) for ordet in rest]
    rest = [ordet.translate(_sfinxbis_translation) for ordet in rest]

    # Step 10: remove adjacent duplicates
    rest = [_delete_consecutive_repeats(ordet) for ordet in rest]

    # Step 11: remove every "9" (vowel placeholder)
    rest = [ordet.replace('9', '') for ordet in rest]

    # Step 12: join first symbol and coded remainder again
    ordlista = [ordet[0:1] + kod for ordet, kod in zip(ordlista, rest)]

    # truncate, if maxlength is set
    if maxlength and maxlength < _INFINITY:
        ordlista = [ordet[:maxlength] for ordet in ordlista]

    return tuple(ordlista)
2645
2646
2647
def phonet(word, mode=1, lang='de', trace=False):
2648
    """Return the phonet code for a word.
2649
2650
    phonet was developed by Jörg Michael and documented in c't magazine
2651
    vol. 25/1999, p. 252. It is a phonetic algorithm designed primarily for
2652
    German.
2653
    Cf. http://www.heise.de/ct/ftp/99/25/252/
2654
2655
    This is a port of Jesper Zedlitz's code, which is licensed LGPL:
2656
    https://code.google.com/p/phonet4java/source/browse/trunk/src/main/java/com/googlecode/phonet4java/Phonet.java
2657
2658
    That is, in turn, based on Michael's C code, which is also licensed LGPL:
2659
    ftp://ftp.heise.de/pub/ct/listings/phonet.zip
2660
2661
    :param str word: the word to transform
2662
    :param int mode: the ponet variant to employ (1 or 2)
2663
    :param str lang: 'de' (default) for German
2664
            'none' for no language
2665
    :param bool trace: prints debugging info if True
2666
    :returns: the phonet value
2667
    :rtype: str
2668
2669
    >>> phonet('Christopher')
2670
    'KRISTOFA'
2671
    >>> phonet('Niall')
2672
    'NIAL'
2673
    >>> phonet('Smith')
2674
    'SMIT'
2675
    >>> phonet('Schmidt')
2676
    'SHMIT'
2677
2678
    >>> phonet('Christopher', mode=2)
2679
    'KRIZTUFA'
2680
    >>> phonet('Niall', mode=2)
2681
    'NIAL'
2682
    >>> phonet('Smith', mode=2)
2683
    'ZNIT'
2684
    >>> phonet('Schmidt', mode=2)
2685
    'ZNIT'
2686
2687
    >>> phonet('Christopher', lang='none')
2688
    'CHRISTOPHER'
2689
    >>> phonet('Niall', lang='none')
2690
    'NIAL'
2691
    >>> phonet('Smith', lang='none')
2692
    'SMITH'
2693
    >>> phonet('Schmidt', lang='none')
2694
    'SCHMIDT'
2695
    """
2696
    # pylint: disable=too-many-branches
2697
2698
    _phonet_rules_no_lang = (  # separator chars
2699
        '´', ' ', ' ',
2700
        '"', ' ', ' ',
2701
        '`$', '', '',
2702
        '\'', ' ', ' ',
2703
        ',', ',', ',',
2704
        ';', ',', ',',
2705
        '-', ' ', ' ',
2706
        ' ', ' ', ' ',
2707
        '.', '.', '.',
2708
        ':', '.', '.',
2709
        # German umlauts
2710
        'Ä', 'AE', 'AE',
2711
        'Ö', 'OE', 'OE',
2712
        'Ü', 'UE', 'UE',
2713
        'ß', 'S', 'S',
2714
        # international umlauts
2715
        'À', 'A', 'A',
2716
        'Á', 'A', 'A',
2717
        'Â', 'A', 'A',
2718
        'Ã', 'A', 'A',
2719
        'Å', 'A', 'A',
2720
        'Æ', 'AE', 'AE',
2721
        'Ç', 'C', 'C',
2722
        'Ð', 'DJ', 'DJ',
2723
        'È', 'E', 'E',
2724
        'É', 'E', 'E',
2725
        'Ê', 'E', 'E',
2726
        'Ë', 'E', 'E',
2727
        'Ì', 'I', 'I',
2728
        'Í', 'I', 'I',
2729
        'Î', 'I', 'I',
2730
        'Ï', 'I', 'I',
2731
        'Ñ', 'NH', 'NH',
2732
        'Ò', 'O', 'O',
2733
        'Ó', 'O', 'O',
2734
        'Ô', 'O', 'O',
2735
        'Õ', 'O', 'O',
2736
        'Œ', 'OE', 'OE',
2737
        'Ø', 'OE', 'OE',
2738
        'Š', 'SH', 'SH',
2739
        'Þ', 'TH', 'TH',
2740
        'Ù', 'U', 'U',
2741
        'Ú', 'U', 'U',
2742
        'Û', 'U', 'U',
2743
        'Ý', 'Y', 'Y',
2744
        'Ÿ', 'Y', 'Y',
2745
        # 'normal' letters (A-Z)
2746
        'MC^', 'MAC', 'MAC',
2747
        'MC^', 'MAC', 'MAC',
2748
        'M´^', 'MAC', 'MAC',
2749
        'M\'^', 'MAC', 'MAC',
2750
        'O´^', 'O', 'O',
2751
        'O\'^', 'O', 'O',
2752
        'VAN DEN ^', 'VANDEN', 'VANDEN',
2753
        None, None, None)
2754
2755
    _phonet_rules_german = (  # separator chars
2756
        '´', ' ', ' ',
2757
        '"', ' ', ' ',
2758
        '`$', '', '',
2759
        '\'', ' ', ' ',
2760
        ',', ' ', ' ',
2761
        ';', ' ', ' ',
2762
        '-', ' ', ' ',
2763
        ' ', ' ', ' ',
2764
        '.', '.', '.',
2765
        ':', '.', '.',
2766
        # German umlauts
2767
        'ÄE', 'E', 'E',
2768
        'ÄU<', 'EU', 'EU',
2769
        'ÄV(AEOU)-<', 'EW', None,
2770
        'Ä$', 'Ä', None,
2771
        'Ä<', None, 'E',
2772
        'Ä', 'E', None,
2773
        'ÖE', 'Ö', 'Ö',
2774
        'ÖU', 'Ö', 'Ö',
2775
        'ÖVER--<', 'ÖW', None,
2776
        'ÖV(AOU)-', 'ÖW', None,
2777
        'ÜBEL(GNRW)-^^', 'ÜBL ', 'IBL ',
2778
        'ÜBER^^', 'ÜBA', 'IBA',
2779
        'ÜE', 'Ü', 'I',
2780
        'ÜVER--<', 'ÜW', None,
2781
        'ÜV(AOU)-', 'ÜW', None,
2782
        'Ü', None, 'I',
2783
        'ßCH<', None, 'Z',
2784
        'ß<', 'S', 'Z',
2785
        # international umlauts
2786
        'À<', 'A', 'A',
2787
        'Á<', 'A', 'A',
2788
        'Â<', 'A', 'A',
2789
        'Ã<', 'A', 'A',
2790
        'Å<', 'A', 'A',
2791
        'ÆER-', 'E', 'E',
2792
        'ÆU<', 'EU', 'EU',
2793
        'ÆV(AEOU)-<', 'EW', None,
2794
        'Æ$', 'Ä', None,
2795
        'Æ<', None, 'E',
2796
        'Æ', 'E', None,
2797
        'Ç', 'Z', 'Z',
2798
        'ÐÐ-', '', '',
2799
        'Ð', 'DI', 'TI',
2800
        'È<', 'E', 'E',
2801
        'É<', 'E', 'E',
2802
        'Ê<', 'E', 'E',
2803
        'Ë', 'E', 'E',
2804
        'Ì<', 'I', 'I',
2805
        'Í<', 'I', 'I',
2806
        'Î<', 'I', 'I',
2807
        'Ï', 'I', 'I',
2808
        'ÑÑ-', '', '',
2809
        'Ñ', 'NI', 'NI',
2810
        'Ò<', 'O', 'U',
2811
        'Ó<', 'O', 'U',
2812
        'Ô<', 'O', 'U',
2813
        'Õ<', 'O', 'U',
2814
        'Œ<', 'Ö', 'Ö',
2815
        'Ø(IJY)-<', 'E', 'E',
2816
        'Ø<', 'Ö', 'Ö',
2817
        'Š', 'SH', 'Z',
2818
        'Þ', 'T', 'T',
2819
        'Ù<', 'U', 'U',
2820
        'Ú<', 'U', 'U',
2821
        'Û<', 'U', 'U',
2822
        'Ý<', 'I', 'I',
2823
        'Ÿ<', 'I', 'I',
2824
        # 'normal' letters (A-Z)
2825
        'ABELLE$', 'ABL', 'ABL',
2826
        'ABELL$', 'ABL', 'ABL',
2827
        'ABIENNE$', 'ABIN', 'ABIN',
2828
        'ACHME---^', 'ACH', 'AK',
2829
        'ACEY$', 'AZI', 'AZI',
2830
        'ADV', 'ATW', None,
2831
        'AEGL-', 'EK', None,
2832
        'AEU<', 'EU', 'EU',
2833
        'AE2', 'E', 'E',
2834
        'AFTRAUBEN------', 'AFT ', 'AFT ',
2835
        'AGL-1', 'AK', None,
2836
        'AGNI-^', 'AKN', 'AKN',
2837
        'AGNIE-', 'ANI', 'ANI',
2838
        'AGN(AEOU)-$', 'ANI', 'ANI',
2839
        'AH(AIOÖUÜY)-', 'AH', None,
2840
        'AIA2', 'AIA', 'AIA',
2841
        'AIE$', 'E', 'E',
2842
        'AILL(EOU)-', 'ALI', 'ALI',
2843
        'AINE$', 'EN', 'EN',
2844
        'AIRE$', 'ER', 'ER',
2845
        'AIR-', 'E', 'E',
2846
        'AISE$', 'ES', 'EZ',
2847
        'AISSANCE$', 'ESANS', 'EZANZ',
2848
        'AISSE$', 'ES', 'EZ',
2849
        'AIX$', 'EX', 'EX',
2850
        'AJ(AÄEÈÉÊIOÖUÜ)--', 'A', 'A',
2851
        'AKTIE', 'AXIE', 'AXIE',
2852
        'AKTUEL', 'AKTUEL', None,
2853
        'ALOI^', 'ALOI', 'ALUI',  # Don't merge these rules
2854
        'ALOY^', 'ALOI', 'ALUI',  # needed by 'check_rules'
2855
        'AMATEU(RS)-', 'AMATÖ', 'ANATÖ',
2856
        'ANCH(OEI)-', 'ANSH', 'ANZ',
2857
        'ANDERGEGANG----', 'ANDA GE', 'ANTA KE',
2858
        'ANDERGEHE----', 'ANDA ', 'ANTA ',
2859
        'ANDERGESETZ----', 'ANDA GE', 'ANTA KE',
2860
        'ANDERGING----', 'ANDA ', 'ANTA ',
2861
        'ANDERSETZ(ET)-----', 'ANDA ', 'ANTA ',
2862
        'ANDERZUGEHE----', 'ANDA ZU ', 'ANTA ZU ',
2863
        'ANDERZUSETZE-----', 'ANDA ZU ', 'ANTA ZU ',
2864
        'ANER(BKO)---^^', 'AN', None,
2865
        'ANHAND---^$', 'AN H', 'AN ',
2866
        'ANH(AÄEIOÖUÜY)--^^', 'AN', None,
2867
        'ANIELLE$', 'ANIEL', 'ANIL',
2868
        'ANIEL', 'ANIEL', None,
2869
        'ANSTELLE----^$', 'AN ST', 'AN ZT',
2870
        'ANTI^^', 'ANTI', 'ANTI',
2871
        'ANVER^^', 'ANFA', 'ANFA',
2872
        'ATIA$', 'ATIA', 'ATIA',
2873
        'ATIA(NS)--', 'ATI', 'ATI',
2874
        'ATI(AÄOÖUÜ)-', 'AZI', 'AZI',
2875
        'AUAU--', '', '',
2876
        'AUERE$', 'AUERE', None,
2877
        'AUERE(NS)-$', 'AUERE', None,
2878
        'AUERE(AIOUY)--', 'AUER', None,
2879
        'AUER(AÄIOÖUÜY)-', 'AUER', None,
2880
        'AUER<', 'AUA', 'AUA',
2881
        'AUF^^', 'AUF', 'AUF',
2882
        'AULT$', 'O', 'U',
2883
        'AUR(BCDFGKLMNQSTVWZ)-', 'AUA', 'AUA',
2884
        'AUR$', 'AUA', 'AUA',
2885
        'AUSSE$', 'OS', 'UZ',
2886
        'AUS(ST)-^', 'AUS', 'AUS',
2887
        'AUS^^', 'AUS', 'AUS',
2888
        'AUTOFAHR----', 'AUTO ', 'AUTU ',
2889
        'AUTO^^', 'AUTO', 'AUTU',
2890
        'AUX(IY)-', 'AUX', 'AUX',
2891
        'AUX', 'O', 'U',
2892
        'AU', 'AU', 'AU',
2893
        'AVER--<', 'AW', None,
2894
        'AVIER$', 'AWIE', 'AFIE',
2895
        'AV(EÈÉÊI)-^', 'AW', None,
2896
        'AV(AOU)-', 'AW', None,
2897
        'AYRE$', 'EIRE', 'EIRE',
2898
        'AYRE(NS)-$', 'EIRE', 'EIRE',
2899
        'AYRE(AIOUY)--', 'EIR', 'EIR',
2900
        'AYR(AÄIOÖUÜY)-', 'EIR', 'EIR',
2901
        'AYR<', 'EIA', 'EIA',
2902
        'AYER--<', 'EI', 'EI',
2903
        'AY(AÄEIOÖUÜY)--', 'A', 'A',
2904
        'AË', 'E', 'E',
2905
        'A(IJY)<', 'EI', 'EI',
2906
        'BABY^$', 'BEBI', 'BEBI',
2907
        'BAB(IY)^', 'BEBI', 'BEBI',
2908
        'BEAU^$', 'BO', None,
2909
        'BEA(BCMNRU)-^', 'BEA', 'BEA',
2910
        'BEAT(AEIMORU)-^', 'BEAT', 'BEAT',
2911
        'BEE$', 'BI', 'BI',
2912
        'BEIGE^$', 'BESH', 'BEZ',
2913
        'BENOIT--', 'BENO', 'BENU',
2914
        'BER(DT)-', 'BER', None,
2915
        'BERN(DT)-', 'BERN', None,
2916
        'BE(LMNRST)-^', 'BE', 'BE',
2917
        'BETTE$', 'BET', 'BET',
2918
        'BEVOR^$', 'BEFOR', None,
2919
        'BIC$', 'BIZ', 'BIZ',
2920
        'BOWL(EI)-', 'BOL', 'BUL',
2921
        'BP(AÄEÈÉÊIÌÍÎOÖRUÜY)-', 'B', 'B',
2922
        'BRINGEND-----^', 'BRI', 'BRI',
2923
        'BRINGEND-----', ' BRI', ' BRI',
2924
        'BROW(NS)-', 'BRAU', 'BRAU',
2925
        'BUDGET7', 'BÜGE', 'BIKE',
2926
        'BUFFET7', 'BÜFE', 'BIFE',
2927
        'BYLLE$', 'BILE', 'BILE',
2928
        'BYLL$', 'BIL', 'BIL',
2929
        'BYPA--^', 'BEI', 'BEI',
2930
        'BYTE<', 'BEIT', 'BEIT',
2931
        'BY9^', 'BÜ', None,
2932
        'B(SßZ)$', 'BS', None,
2933
        'CACH(EI)-^', 'KESH', 'KEZ',
2934
        'CAE--', 'Z', 'Z',
2935
        'CA(IY)$', 'ZEI', 'ZEI',
2936
        'CE(EIJUY)--', 'Z', 'Z',
2937
        'CENT<', 'ZENT', 'ZENT',
2938
        'CERST(EI)----^', 'KE', 'KE',
2939
        'CER$', 'ZA', 'ZA',
2940
        'CE3', 'ZE', 'ZE',
2941
        'CH\'S$', 'X', 'X',
2942
        'CH´S$', 'X', 'X',
2943
        'CHAO(ST)-', 'KAO', 'KAU',
2944
        'CHAMPIO-^', 'SHEMPI', 'ZENBI',
2945
        'CHAR(AI)-^', 'KAR', 'KAR',
2946
        'CHAU(CDFSVWXZ)-', 'SHO', 'ZU',
2947
        'CHÄ(CF)-', 'SHE', 'ZE',
2948
        'CHE(CF)-', 'SHE', 'ZE',
2949
        'CHEM-^', 'KE', 'KE',  # or: 'CHE', 'KE'
2950
        'CHEQUE<', 'SHEK', 'ZEK',
2951
        'CHI(CFGPVW)-', 'SHI', 'ZI',
2952
        'CH(AEUY)-<^', 'SH', 'Z',
2953
        'CHK-', '', '',
2954
        'CHO(CKPS)-^', 'SHO', 'ZU',
2955
        'CHRIS-', 'KRI', None,
2956
        'CHRO-', 'KR', None,
2957
        'CH(LOR)-<^', 'K', 'K',
2958
        'CHST-', 'X', 'X',
2959
        'CH(SßXZ)3', 'X', 'X',
2960
        'CHTNI-3', 'CHN', 'KN',
2961
        'CH^', 'K', 'K',  # or: 'CH', 'K'
2962
        'CH', 'CH', 'K',
2963
        'CIC$', 'ZIZ', 'ZIZ',
2964
        'CIENCEFICT----', 'EIENS ', 'EIENZ ',
2965
        'CIENCE$', 'EIENS', 'EIENZ',
2966
        'CIER$', 'ZIE', 'ZIE',
2967
        'CYB-^', 'ZEI', 'ZEI',
2968
        'CY9^', 'ZÜ', 'ZI',
2969
        'C(IJY)-<3', 'Z', 'Z',
2970
        'CLOWN-', 'KLAU', 'KLAU',
2971
        'CCH', 'Z', 'Z',
2972
        'CCE-', 'X', 'X',
2973
        'C(CK)-', '', '',
2974
        'CLAUDET---', 'KLO', 'KLU',
2975
        'CLAUDINE^$', 'KLODIN', 'KLUTIN',
2976
        'COACH', 'KOSH', 'KUZ',
2977
        'COLE$', 'KOL', 'KUL',
2978
        'COUCH', 'KAUSH', 'KAUZ',
2979
        'COW', 'KAU', 'KAU',
2980
        'CQUES$', 'K', 'K',
2981
        'CQUE', 'K', 'K',
2982
        'CRASH--9', 'KRE', 'KRE',
2983
        'CREAT-^', 'KREA', 'KREA',
2984
        'CST', 'XT', 'XT',
2985
        'CS<^', 'Z', 'Z',
2986
        'C(SßX)', 'X', 'X',
2987
        'CT\'S$', 'X', 'X',
2988
        'CT(SßXZ)', 'X', 'X',
2989
        'CZ<', 'Z', 'Z',
2990
        'C(ÈÉÊÌÍÎÝ)3', 'Z', 'Z',
2991
        'C.^', 'C.', 'C.',
2992
        'CÄ-', 'Z', 'Z',
2993
        'CÜ$', 'ZÜ', 'ZI',
2994
        'C\'S$', 'X', 'X',
2995
        'C<', 'K', 'K',
2996
        'DAHER^$', 'DAHER', None,
2997
        'DARAUFFOLGE-----', 'DARAUF ', 'TARAUF ',
2998
        'DAVO(NR)-^$', 'DAFO', 'TAFU',
2999
        'DD(SZ)--<', '', '',
3000
        'DD9', 'D', None,
3001
        'DEPOT7', 'DEPO', 'TEBU',
3002
        'DESIGN', 'DISEIN', 'TIZEIN',
3003
        'DE(LMNRST)-3^', 'DE', 'TE',
3004
        'DETTE$', 'DET', 'TET',
3005
        'DH$', 'T', None,
3006
        'DIC$', 'DIZ', 'TIZ',
3007
        'DIDR-^', 'DIT', None,
3008
        'DIEDR-^', 'DIT', None,
3009
        'DJ(AEIOU)-^', 'I', 'I',
3010
        'DMITR-^', 'DIMIT', 'TINIT',
3011
        'DRY9^', 'DRÜ', None,
3012
        'DT-', '', '',
3013
        'DUIS-^', 'DÜ', 'TI',
3014
        'DURCH^^', 'DURCH', 'TURK',
3015
        'DVA$', 'TWA', None,
3016
        'DY9^', 'DÜ', None,
3017
        'DYS$', 'DIS', None,
3018
        'DS(CH)--<', 'T', 'T',
3019
        'DST', 'ZT', 'ZT',
3020
        'DZS(CH)--', 'T', 'T',
3021
        'D(SßZ)', 'Z', 'Z',
3022
        'D(AÄEIOÖRUÜY)-', 'D', None,
3023
        'D(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'D', None,
3024
        'D\'H^', 'D', 'T',
3025
        'D´H^', 'D', 'T',
3026
        'D`H^', 'D', 'T',
3027
        'D\'S3$', 'Z', 'Z',
3028
        'D´S3$', 'Z', 'Z',
3029
        'D^', 'D', None,
3030
        'D', 'T', 'T',
3031
        'EAULT$', 'O', 'U',
3032
        'EAUX$', 'O', 'U',
3033
        'EAU', 'O', 'U',
3034
        'EAV', 'IW', 'IF',
3035
        'EAS3$', 'EAS', None,
3036
        'EA(AÄEIOÖÜY)-3', 'EA', 'EA',
3037
        'EA3$', 'EA', 'EA',
3038
        'EA3', 'I', 'I',
3039
        'EBENSO^$', 'EBNSO', 'EBNZU',
3040
        'EBENSO^^', 'EBNSO ', 'EBNZU ',
3041
        'EBEN^^', 'EBN', 'EBN',
3042
        'EE9', 'E', 'E',
3043
        'EGL-1', 'EK', None,
3044
        'EHE(IUY)--1', 'EH', None,
3045
        'EHUNG---1', 'E', None,
3046
        'EH(AÄIOÖUÜY)-1', 'EH', None,
3047
        'EIEI--', '', '',
3048
        'EIERE^$', 'EIERE', None,
3049
        'EIERE$', 'EIERE', None,
3050
        'EIERE(NS)-$', 'EIERE', None,
3051
        'EIERE(AIOUY)--', 'EIER', None,
3052
        'EIER(AÄIOÖUÜY)-', 'EIER', None,
3053
        'EIER<', 'EIA', None,
3054
        'EIGL-1', 'EIK', None,
3055
        'EIGH$', 'EI', 'EI',
3056
        'EIH--', 'E', 'E',
3057
        'EILLE$', 'EI', 'EI',
3058
        'EIR(BCDFGKLMNQSTVWZ)-', 'EIA', 'EIA',
3059
        'EIR$', 'EIA', 'EIA',
3060
        'EITRAUBEN------', 'EIT ', 'EIT ',
3061
        'EI', 'EI', 'EI',
3062
        'EJ$', 'EI', 'EI',
3063
        'ELIZ^', 'ELIS', None,
3064
        'ELZ^', 'ELS', None,
3065
        'EL-^', 'E', 'E',
3066
        'ELANG----1', 'E', 'E',
3067
        'EL(DKL)--1', 'E', 'E',
3068
        'EL(MNT)--1$', 'E', 'E',
3069
        'ELYNE$', 'ELINE', 'ELINE',
3070
        'ELYN$', 'ELIN', 'ELIN',
3071
        'EL(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'EL', 'EL',
3072
        'EL-1', 'L', 'L',
3073
        'EM-^', None, 'E',
3074
        'EM(DFKMPQT)--1', None, 'E',
3075
        'EM(AÄEÈÉÊIÌÍÎOÖUÜY)--1', None, 'E',
3076
        'EM-1', None, 'N',
3077
        'ENGAG-^', 'ANGA', 'ANKA',
3078
        'EN-^', 'E', 'E',
3079
        'ENTUEL', 'ENTUEL', None,
3080
        'EN(CDGKQSTZ)--1', 'E', 'E',
3081
        'EN(AÄEÈÉÊIÌÍÎNOÖUÜY)-1', 'EN', 'EN',
3082
        'EN-1', '', '',
3083
        'ERH(AÄEIOÖUÜ)-^', 'ERH', 'ER',
3084
        'ER-^', 'E', 'E',
3085
        'ERREGEND-----', ' ER', ' ER',
3086
        'ERT1$', 'AT', None,
3087
        'ER(DGLKMNRQTZß)-1', 'ER', None,
3088
        'ER(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'ER', 'A',
3089
        'ER1$', 'A', 'A',
3090
        'ER<1', 'A', 'A',
3091
        'ETAT7', 'ETA', 'ETA',
3092
        'ETI(AÄOÖÜU)-', 'EZI', 'EZI',
3093
        'EUERE$', 'EUERE', None,
3094
        'EUERE(NS)-$', 'EUERE', None,
3095
        'EUERE(AIOUY)--', 'EUER', None,
3096
        'EUER(AÄIOÖUÜY)-', 'EUER', None,
3097
        'EUER<', 'EUA', None,
3098
        'EUEU--', '', '',
3099
        'EUILLE$', 'Ö', 'Ö',
3100
        'EUR$', 'ÖR', 'ÖR',
3101
        'EUX', 'Ö', 'Ö',
3102
        'EUSZ$', 'EUS', None,
3103
        'EUTZ$', 'EUS', None,
3104
        'EUYS$', 'EUS', 'EUZ',
3105
        'EUZ$', 'EUS', None,
3106
        'EU', 'EU', 'EU',
3107
        'EVER--<1', 'EW', None,
3108
        'EV(ÄOÖUÜ)-1', 'EW', None,
3109
        'EYER<', 'EIA', 'EIA',
3110
        'EY<', 'EI', 'EI',
3111
        'FACETTE', 'FASET', 'FAZET',
3112
        'FANS--^$', 'FE', 'FE',
3113
        'FAN-^$', 'FE', 'FE',
3114
        'FAULT-', 'FOL', 'FUL',
3115
        'FEE(DL)-', 'FI', 'FI',
3116
        'FEHLER', 'FELA', 'FELA',
3117
        'FE(LMNRST)-3^', 'FE', 'FE',
3118
        'FOERDERN---^', 'FÖRD', 'FÖRT',
3119
        'FOERDERN---', ' FÖRD', ' FÖRT',
3120
        'FOND7', 'FON', 'FUN',
3121
        'FRAIN$', 'FRA', 'FRA',
3122
        'FRISEU(RS)-', 'FRISÖ', 'FRIZÖ',
3123
        'FY9^', 'FÜ', None,
3124
        'FÖRDERN---^', 'FÖRD', 'FÖRT',
3125
        'FÖRDERN---', ' FÖRD', ' FÖRT',
3126
        'GAGS^$', 'GEX', 'KEX',
3127
        'GAG^$', 'GEK', 'KEK',
3128
        'GD', 'KT', 'KT',
3129
        'GEGEN^^', 'GEGN', 'KEKN',
3130
        'GEGENGEKOM-----', 'GEGN ', 'KEKN ',
3131
        'GEGENGESET-----', 'GEGN ', 'KEKN ',
3132
        'GEGENKOMME-----', 'GEGN ', 'KEKN ',
3133
        'GEGENZUKOM---', 'GEGN ZU ', 'KEKN ZU ',
3134
        'GENDETWAS-----$', 'GENT ', 'KENT ',
3135
        'GENRE', 'IORE', 'IURE',
3136
        'GE(LMNRST)-3^', 'GE', 'KE',
3137
        'GER(DKT)-', 'GER', None,
3138
        'GETTE$', 'GET', 'KET',
3139
        'GGF.', 'GF.', None,
3140
        'GG-', '', '',
3141
        'GH', 'G', None,
3142
        'GI(AOU)-^', 'I', 'I',
3143
        'GION-3', 'KIO', 'KIU',
3144
        'G(CK)-', '', '',
3145
        'GJ(AEIOU)-^', 'I', 'I',
3146
        'GMBH^$', 'GMBH', 'GMBH',
3147
        'GNAC$', 'NIAK', 'NIAK',
3148
        'GNON$', 'NION', 'NIUN',
3149
        'GN$', 'N', 'N',
3150
        'GONCAL-^', 'GONZA', 'KUNZA',
3151
        'GRY9^', 'GRÜ', None,
3152
        'G(SßXZ)-<', 'K', 'K',
3153
        'GUCK-', 'KU', 'KU',
3154
        'GUISEP-^', 'IUSE', 'IUZE',
3155
        'GUI-^', 'G', 'K',
3156
        'GUTAUSSEH------^', 'GUT ', 'KUT ',
3157
        'GUTGEHEND------^', 'GUT ', 'KUT ',
3158
        'GY9^', 'GÜ', None,
3159
        'G(AÄEILOÖRUÜY)-', 'G', None,
3160
        'G(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'G', None,
3161
        'G\'S$', 'X', 'X',
3162
        'G´S$', 'X', 'X',
3163
        'G^', 'G', None,
3164
        'G', 'K', 'K',
3165
        'HA(HIUY)--1', 'H', None,
3166
        'HANDVOL---^', 'HANT ', 'ANT ',
3167
        'HANNOVE-^', 'HANOF', None,
3168
        'HAVEN7$', 'HAFN', None,
3169
        'HEAD-', 'HE', 'E',
3170
        'HELIEGEN------', 'E ', 'E ',
3171
        'HESTEHEN------', 'E ', 'E ',
3172
        'HE(LMNRST)-3^', 'HE', 'E',
3173
        'HE(LMN)-1', 'E', 'E',
3174
        'HEUR1$', 'ÖR', 'ÖR',
3175
        'HE(HIUY)--1', 'H', None,
3176
        'HIH(AÄEIOÖUÜY)-1', 'IH', None,
3177
        'HLH(AÄEIOÖUÜY)-1', 'LH', None,
3178
        'HMH(AÄEIOÖUÜY)-1', 'MH', None,
3179
        'HNH(AÄEIOÖUÜY)-1', 'NH', None,
3180
        'HOBBY9^', 'HOBI', None,
3181
        'HOCHBEGAB-----^', 'HOCH ', 'UK ',
3182
        'HOCHTALEN-----^', 'HOCH ', 'UK ',
3183
        'HOCHZUFRI-----^', 'HOCH ', 'UK ',
3184
        'HO(HIY)--1', 'H', None,
3185
        'HRH(AÄEIOÖUÜY)-1', 'RH', None,
3186
        'HUH(AÄEIOÖUÜY)-1', 'UH', None,
3187
        'HUIS^^', 'HÜS', 'IZ',
3188
        'HUIS$', 'ÜS', 'IZ',
3189
        'HUI--1', 'H', None,
3190
        'HYGIEN^', 'HÜKIEN', None,
3191
        'HY9^', 'HÜ', None,
3192
        'HY(BDGMNPST)-', 'Ü', None,
3193
        'H.^', None, 'H.',
3194
        'HÄU--1', 'H', None,
3195
        'H^', 'H', '',
3196
        'H', '', '',
3197
        'ICHELL---', 'ISH', 'IZ',
3198
        'ICHI$', 'ISHI', 'IZI',
3199
        'IEC$', 'IZ', 'IZ',
3200
        'IEDENSTELLE------', 'IDN ', 'ITN ',
3201
        'IEI-3', '', '',
3202
        'IELL3', 'IEL', 'IEL',
3203
        'IENNE$', 'IN', 'IN',
3204
        'IERRE$', 'IER', 'IER',
3205
        'IERZULAN---', 'IR ZU ', 'IR ZU ',
3206
        'IETTE$', 'IT', 'IT',
3207
        'IEU', 'IÖ', 'IÖ',
3208
        'IE<4', 'I', 'I',
3209
        'IGL-1', 'IK', None,
3210
        'IGHT3$', 'EIT', 'EIT',
3211
        'IGNI(EO)-', 'INI', 'INI',
3212
        'IGN(AEOU)-$', 'INI', 'INI',
3213
        'IHER(DGLKRT)--1', 'IHE', None,
3214
        'IHE(IUY)--', 'IH', None,
3215
        'IH(AIOÖUÜY)-', 'IH', None,
3216
        'IJ(AOU)-', 'I', 'I',
3217
        'IJ$', 'I', 'I',
3218
        'IJ<', 'EI', 'EI',
3219
        'IKOLE$', 'IKOL', 'IKUL',
3220
        'ILLAN(STZ)--4', 'ILIA', 'ILIA',
3221
        'ILLAR(DT)--4', 'ILIA', 'ILIA',
3222
        'IMSTAN----^', 'IM ', 'IN ',
3223
        'INDELERREGE------', 'INDL ', 'INTL ',
3224
        'INFRAGE-----^$', 'IN ', 'IN ',
3225
        'INTERN(AOU)-^', 'INTAN', 'INTAN',
3226
        'INVER-', 'INWE', 'INFE',
3227
        'ITI(AÄIOÖUÜ)-', 'IZI', 'IZI',
3228
        'IUSZ$', 'IUS', None,
3229
        'IUTZ$', 'IUS', None,
3230
        'IUZ$', 'IUS', None,
3231
        'IVER--<', 'IW', None,
3232
        'IVIER$', 'IWIE', 'IFIE',
3233
        'IV(ÄOÖUÜ)-', 'IW', None,
3234
        'IV<3', 'IW', None,
3235
        'IY2', 'I', None,
3236
        'I(ÈÉÊ)<4', 'I', 'I',
3237
        'JAVIE---<^', 'ZA', 'ZA',
3238
        'JEANS^$', 'JINS', 'INZ',
3239
        'JEANNE^$', 'IAN', 'IAN',
3240
        'JEAN-^', 'IA', 'IA',
3241
        'JER-^', 'IE', 'IE',
3242
        'JE(LMNST)-', 'IE', 'IE',
3243
        'JI^', 'JI', None,
3244
        'JOR(GK)^$', 'IÖRK', 'IÖRK',
3245
        'J', 'I', 'I',
3246
        'KC(ÄEIJ)-', 'X', 'X',
3247
        'KD', 'KT', None,
3248
        'KE(LMNRST)-3^', 'KE', 'KE',
3249
        'KG(AÄEILOÖRUÜY)-', 'K', None,
3250
        'KH<^', 'K', 'K',
3251
        'KIC$', 'KIZ', 'KIZ',
3252
        'KLE(LMNRST)-3^', 'KLE', 'KLE',
3253
        'KOTELE-^', 'KOTL', 'KUTL',
3254
        'KREAT-^', 'KREA', 'KREA',
3255
        'KRÜS(TZ)--^', 'KRI', None,
3256
        'KRYS(TZ)--^', 'KRI', None,
3257
        'KRY9^', 'KRÜ', None,
3258
        'KSCH---', 'K', 'K',
3259
        'KSH--', 'K', 'K',
3260
        'K(SßXZ)7', 'X', 'X',  # implies 'KST' -> 'XT'
3261
        'KT\'S$', 'X', 'X',
3262
        'KTI(AIOU)-3', 'XI', 'XI',
3263
        'KT(SßXZ)', 'X', 'X',
3264
        'KY9^', 'KÜ', None,
3265
        'K\'S$', 'X', 'X',
3266
        'K´S$', 'X', 'X',
3267
        'LANGES$', ' LANGES', ' LANKEZ',
3268
        'LANGE$', ' LANGE', ' LANKE',
3269
        'LANG$', ' LANK', ' LANK',
3270
        'LARVE-', 'LARF', 'LARF',
3271
        'LD(SßZ)$', 'LS', 'LZ',
3272
        'LD\'S$', 'LS', 'LZ',
3273
        'LD´S$', 'LS', 'LZ',
3274
        'LEAND-^', 'LEAN', 'LEAN',
3275
        'LEERSTEHE-----^', 'LER ', 'LER ',
3276
        'LEICHBLEIB-----', 'LEICH ', 'LEIK ',
3277
        'LEICHLAUTE-----', 'LEICH ', 'LEIK ',
3278
        'LEIDERREGE------', 'LEIT ', 'LEIT ',
3279
        'LEIDGEPR----^', 'LEIT ', 'LEIT ',
3280
        'LEINSTEHE-----', 'LEIN ', 'LEIN ',
3281
        'LEL-', 'LE', 'LE',
3282
        'LE(MNRST)-3^', 'LE', 'LE',
3283
        'LETTE$', 'LET', 'LET',
3284
        'LFGNAG-', 'LFGAN', 'LFKAN',
3285
        'LICHERWEIS----', 'LICHA ', 'LIKA ',
3286
        'LIC$', 'LIZ', 'LIZ',
3287
        'LIVE^$', 'LEIF', 'LEIF',
3288
        'LT(SßZ)$', 'LS', 'LZ',
3289
        'LT\'S$', 'LS', 'LZ',
3290
        'LT´S$', 'LS', 'LZ',
3291
        'LUI(GS)--', 'LU', 'LU',
3292
        'LV(AIO)-', 'LW', None,
3293
        'LY9^', 'LÜ', None,
3294
        'LSTS$', 'LS', 'LZ',
3295
        'LZ(BDFGKLMNPQRSTVWX)-', 'LS', None,
3296
        'L(SßZ)$', 'LS', None,
3297
        'MAIR-<', 'MEI', 'NEI',
3298
        'MANAG-', 'MENE', 'NENE',
3299
        'MANUEL', 'MANUEL', None,
3300
        'MASSEU(RS)-', 'MASÖ', 'NAZÖ',
3301
        'MATCH', 'MESH', 'NEZ',
3302
        'MAURICE', 'MORIS', 'NURIZ',
3303
        'MBH^$', 'MBH', 'MBH',
3304
        'MB(ßZ)$', 'MS', None,
3305
        'MB(SßTZ)-', 'M', 'N',
3306
        'MCG9^', 'MAK', 'NAK',
3307
        'MC9^', 'MAK', 'NAK',
3308
        'MEMOIR-^', 'MEMOA', 'NENUA',
3309
        'MERHAVEN$', 'MAHAFN', None,
3310
        'ME(LMNRST)-3^', 'ME', 'NE',
3311
        'MEN(STZ)--3', 'ME', None,
3312
        'MEN$', 'MEN', None,
3313
        'MIGUEL-', 'MIGE', 'NIKE',
3314
        'MIKE^$', 'MEIK', 'NEIK',
3315
        'MITHILFE----^$', 'MIT H', 'NIT ',
3316
        'MN$', 'M', None,
3317
        'MN', 'N', 'N',
3318
        'MPJUTE-', 'MPUT', 'NBUT',
3319
        'MP(ßZ)$', 'MS', None,
3320
        'MP(SßTZ)-', 'M', 'N',
3321
        'MP(BDJLMNPQVW)-', 'MB', 'NB',
3322
        'MY9^', 'MÜ', None,
3323
        'M(ßZ)$', 'MS', None,
3324
        'M´G7^', 'MAK', 'NAK',
3325
        'M\'G7^', 'MAK', 'NAK',
3326
        'M´^', 'MAK', 'NAK',
3327
        'M\'^', 'MAK', 'NAK',
3328
        'M', None, 'N',
3329
        'NACH^^', 'NACH', 'NAK',
3330
        'NADINE', 'NADIN', 'NATIN',
3331
        'NAIV--', 'NA', 'NA',
3332
        'NAISE$', 'NESE', 'NEZE',
3333
        'NAUGENOMM------', 'NAU ', 'NAU ',
3334
        'NAUSOGUT$', 'NAUSO GUT', 'NAUZU KUT',
3335
        'NCH$', 'NSH', 'NZ',
3336
        'NCOISE$', 'SOA', 'ZUA',
3337
        'NCOIS$', 'SOA', 'ZUA',
3338
        'NDAR$', 'NDA', 'NTA',
3339
        'NDERINGEN------', 'NDE ', 'NTE ',
3340
        'NDRO(CDKTZ)-', 'NTRO', None,
3341
        'ND(BFGJLMNPQVW)-', 'NT', None,
3342
        'ND(SßZ)$', 'NS', 'NZ',
3343
        'ND\'S$', 'NS', 'NZ',
3344
        'ND´S$', 'NS', 'NZ',
3345
        'NEBEN^^', 'NEBN', 'NEBN',
3346
        'NENGELERN------', 'NEN ', 'NEN ',
3347
        'NENLERN(ET)---', 'NEN LE', 'NEN LE',
3348
        'NENZULERNE---', 'NEN ZU LE', 'NEN ZU LE',
3349
        'NE(LMNRST)-3^', 'NE', 'NE',
3350
        'NEN-3', 'NE', 'NE',
3351
        'NETTE$', 'NET', 'NET',
3352
        'NGU^^', 'NU', 'NU',
3353
        'NG(BDFJLMNPQRTVW)-', 'NK', 'NK',
3354
        'NH(AUO)-$', 'NI', 'NI',
3355
        'NICHTSAHNEN-----', 'NIX ', 'NIX ',
3356
        'NICHTSSAGE----', 'NIX ', 'NIX ',
3357
        'NICHTS^^', 'NIX', 'NIX',
3358
        'NICHT^^', 'NICHT', 'NIKT',
3359
        'NINE$', 'NIN', 'NIN',
3360
        'NON^^', 'NON', 'NUN',
3361
        'NOTLEIDE-----^', 'NOT ', 'NUT ',
3362
        'NOT^^', 'NOT', 'NUT',
3363
        'NTI(AIOU)-3', 'NZI', 'NZI',
3364
        'NTIEL--3', 'NZI', 'NZI',
3365
        'NT(SßZ)$', 'NS', 'NZ',
3366
        'NT\'S$', 'NS', 'NZ',
3367
        'NT´S$', 'NS', 'NZ',
3368
        'NYLON', 'NEILON', 'NEILUN',
3369
        'NY9^', 'NÜ', None,
3370
        'NSTZUNEH---', 'NST ZU ', 'NZT ZU ',
3371
        'NSZ-', 'NS', None,
3372
        'NSTS$', 'NS', 'NZ',
3373
        'NZ(BDFGKLMNPQRSTVWX)-', 'NS', None,
3374
        'N(SßZ)$', 'NS', None,
3375
        'OBERE-', 'OBER', None,
3376
        'OBER^^', 'OBA', 'UBA',
3377
        'OEU2', 'Ö', 'Ö',
3378
        'OE<2', 'Ö', 'Ö',
3379
        'OGL-', 'OK', None,
3380
        'OGNIE-', 'ONI', 'UNI',
3381
        'OGN(AEOU)-$', 'ONI', 'UNI',
3382
        'OH(AIOÖUÜY)-', 'OH', None,
3383
        'OIE$', 'Ö', 'Ö',
3384
        'OIRE$', 'OA', 'UA',
3385
        'OIR$', 'OA', 'UA',
3386
        'OIX', 'OA', 'UA',
3387
        'OI<3', 'EU', 'EU',
3388
        'OKAY^$', 'OKE', 'UKE',
3389
        'OLYN$', 'OLIN', 'ULIN',
3390
        'OO(DLMZ)-', 'U', None,
3391
        'OO$', 'U', None,
3392
        'OO-', '', '',
3393
        'ORGINAL-----', 'ORI', 'URI',
3394
        'OTI(AÄOÖUÜ)-', 'OZI', 'UZI',
3395
        'OUI^', 'WI', 'FI',
3396
        'OUILLE$', 'ULIE', 'ULIE',
3397
        'OU(DT)-^', 'AU', 'AU',
3398
        'OUSE$', 'AUS', 'AUZ',
3399
        'OUT-', 'AU', 'AU',
3400
        'OU', 'U', 'U',
3401
        'O(FV)$', 'AU', 'AU',  # due to 'OW$' -> 'AU'
3402
        'OVER--<', 'OW', None,
3403
        'OV(AOU)-', 'OW', None,
3404
        'OW$', 'AU', 'AU',
3405
        'OWS$', 'OS', 'UZ',
3406
        'OJ(AÄEIOÖUÜ)--', 'O', 'U',
3407
        'OYER', 'OIA', None,
3408
        'OY(AÄEIOÖUÜ)--', 'O', 'U',
3409
        'O(JY)<', 'EU', 'EU',
3410
        'OZ$', 'OS', None,
3411
        'O´^', 'O', 'U',
3412
        'O\'^', 'O', 'U',
3413
        'O', None, 'U',
3414
        'PATIEN--^', 'PAZI', 'PAZI',
3415
        'PENSIO-^', 'PANSI', 'PANZI',
3416
        'PE(LMNRST)-3^', 'PE', 'PE',
3417
        'PFER-^', 'FE', 'FE',
3418
        'P(FH)<', 'F', 'F',
3419
        'PIC^$', 'PIK', 'PIK',
3420
        'PIC$', 'PIZ', 'PIZ',
3421
        'PIPELINE', 'PEIBLEIN', 'PEIBLEIN',
3422
        'POLYP-', 'POLÜ', None,
3423
        'POLY^^', 'POLI', 'PULI',
3424
        'PORTRAIT7', 'PORTRE', 'PURTRE',
3425
        'POWER7', 'PAUA', 'PAUA',
3426
        'PP(FH)--<', 'B', 'B',
3427
        'PP-', '', '',
3428
        'PRODUZ-^', 'PRODU', 'BRUTU',
3429
        'PRODUZI--', ' PRODU', ' BRUTU',
3430
        'PRIX^$', 'PRI', 'PRI',
3431
        'PS-^^', 'P', None,
3432
        'P(SßZ)^', None, 'Z',
3433
        'P(SßZ)$', 'BS', None,
3434
        'PT-^', '', '',
3435
        'PTI(AÄOÖUÜ)-3', 'BZI', 'BZI',
3436
        'PY9^', 'PÜ', None,
3437
        'P(AÄEIOÖRUÜY)-', 'P', 'P',
3438
        'P(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'P', None,
3439
        'P.^', None, 'P.',
3440
        'P^', 'P', None,
3441
        'P', 'B', 'B',
3442
        'QI-', 'Z', 'Z',
3443
        'QUARANT--', 'KARA', 'KARA',
3444
        'QUE(LMNRST)-3', 'KWE', 'KFE',
3445
        'QUE$', 'K', 'K',
3446
        'QUI(NS)$', 'KI', 'KI',
3447
        'QUIZ7', 'KWIS', None,
3448
        'Q(UV)7', 'KW', 'KF',
3449
        'Q<', 'K', 'K',
3450
        'RADFAHR----', 'RAT ', 'RAT ',
3451
        'RAEFTEZEHRE-----', 'REFTE ', 'REFTE ',
3452
        'RCH', 'RCH', 'RK',
3453
        'REA(DU)---3^', 'R', None,
3454
        'REBSERZEUG------', 'REBS ', 'REBZ ',
3455
        'RECHERCH^', 'RESHASH', 'REZAZ',
3456
        'RECYCL--', 'RIZEI', 'RIZEI',
3457
        'RE(ALST)-3^', 'RE', None,
3458
        'REE$', 'RI', 'RI',
3459
        'RER$', 'RA', 'RA',
3460
        'RE(MNR)-4', 'RE', 'RE',
3461
        'RETTE$', 'RET', 'RET',
3462
        'REUZ$', 'REUZ', None,
3463
        'REW$', 'RU', 'RU',
3464
        'RH<^', 'R', 'R',
3465
        'RJA(MN)--', 'RI', 'RI',
3466
        'ROWD-^', 'RAU', 'RAU',
3467
        'RTEMONNAIE-', 'RTMON', 'RTNUN',
3468
        'RTI(AÄOÖUÜ)-3', 'RZI', 'RZI',
3469
        'RTIEL--3', 'RZI', 'RZI',
3470
        'RV(AEOU)-3', 'RW', None,
3471
        'RY(KN)-$', 'RI', 'RI',
3472
        'RY9^', 'RÜ', None,
3473
        'RÄFTEZEHRE-----', 'REFTE ', 'REFTE ',
3474
        'SAISO-^', 'SES', 'ZEZ',
3475
        'SAFE^$', 'SEIF', 'ZEIF',
3476
        'SAUCE-^', 'SOS', 'ZUZ',
3477
        'SCHLAGGEBEN-----<', 'SHLAK ', 'ZLAK ',
3478
        'SCHSCH---7', '', '',
3479
        'SCHTSCH', 'SH', 'Z',
3480
        'SC(HZ)<', 'SH', 'Z',
3481
        'SC', 'SK', 'ZK',
3482
        'SELBSTST--7^^', 'SELB', 'ZELB',
3483
        'SELBST7^^', 'SELBST', 'ZELBZT',
3484
        'SERVICE7^', 'SÖRWIS', 'ZÖRFIZ',
3485
        'SERVI-^', 'SERW', None,
3486
        'SE(LMNRST)-3^', 'SE', 'ZE',
3487
        'SETTE$', 'SET', 'ZET',
3488
        'SHP-^', 'S', 'Z',
3489
        'SHST', 'SHT', 'ZT',
3490
        'SHTSH', 'SH', 'Z',
3491
        'SHT', 'ST', 'Z',
3492
        'SHY9^', 'SHÜ', None,
3493
        'SH^^', 'SH', None,
3494
        'SH3', 'SH', 'Z',
3495
        'SICHERGEGAN-----^', 'SICHA ', 'ZIKA ',
3496
        'SICHERGEHE----^', 'SICHA ', 'ZIKA ',
3497
        'SICHERGESTEL------^', 'SICHA ', 'ZIKA ',
3498
        'SICHERSTELL-----^', 'SICHA ', 'ZIKA ',
3499
        'SICHERZU(GS)--^', 'SICHA ZU ', 'ZIKA ZU ',
3500
        'SIEGLI-^', 'SIKL', 'ZIKL',
3501
        'SIGLI-^', 'SIKL', 'ZIKL',
3502
        'SIGHT', 'SEIT', 'ZEIT',
3503
        'SIGN', 'SEIN', 'ZEIN',
3504
        'SKI(NPZ)-', 'SKI', 'ZKI',
3505
        'SKI<^', 'SHI', 'ZI',
3506
        'SODASS^$', 'SO DAS', 'ZU TAZ',
3507
        'SODAß^$', 'SO DAS', 'ZU TAZ',
3508
        'SOGENAN--^', 'SO GEN', 'ZU KEN',
3509
        'SOUND-', 'SAUN', 'ZAUN',
3510
        'STAATS^^', 'STAZ', 'ZTAZ',
3511
        'STADT^^', 'STAT', 'ZTAT',
3512
        'STANDE$', ' STANDE', ' ZTANTE',
3513
        'START^^', 'START', 'ZTART',
3514
        'STAURANT7', 'STORAN', 'ZTURAN',
3515
        'STEAK-', 'STE', 'ZTE',
3516
        'STEPHEN-^$', 'STEW', None,
3517
        'STERN', 'STERN', None,
3518
        'STRAF^^', 'STRAF', 'ZTRAF',
3519
        'ST\'S$', 'Z', 'Z',
3520
        'ST´S$', 'Z', 'Z',
3521
        'STST--', '', '',
3522
        'STS(ACEÈÉÊHIÌÍÎOUÄÜÖ)--', 'ST', 'ZT',
3523
        'ST(SZ)', 'Z', 'Z',
3524
        'SPAREN---^', 'SPA', 'ZPA',
3525
        'SPAREND----', ' SPA', ' ZPA',
3526
        'S(PTW)-^^', 'S', None,
3527
        'SP', 'SP', None,
3528
        'STYN(AE)-$', 'STIN', 'ZTIN',
3529
        'ST', 'ST', 'ZT',
3530
        'SUITE<', 'SIUT', 'ZIUT',
3531
        'SUKE--$', 'S', 'Z',
3532
        'SURF(EI)-', 'SÖRF', 'ZÖRF',
3533
        'SV(AEÈÉÊIÌÍÎOU)-<^', 'SW', None,
3534
        'SYB(IY)--^', 'SIB', None,
3535
        'SYL(KVW)--^', 'SI', None,
3536
        'SY9^', 'SÜ', None,
3537
        'SZE(NPT)-^', 'ZE', 'ZE',
3538
        'SZI(ELN)-^', 'ZI', 'ZI',
3539
        'SZCZ<', 'SH', 'Z',
3540
        'SZT<', 'ST', 'ZT',
3541
        'SZ<3', 'SH', 'Z',
3542
        'SÜL(KVW)--^', 'SI', None,
3543
        'S', None, 'Z',
3544
        'TCH', 'SH', 'Z',
3545
        'TD(AÄEIOÖRUÜY)-', 'T', None,
3546
        'TD(ÀÁÂÃÅÈÉÊËÌÍÎÏÒÓÔÕØÙÚÛÝŸ)-', 'T', None,
3547
        'TEAT-^', 'TEA', 'TEA',
3548
        'TERRAI7^', 'TERA', 'TERA',
3549
        'TE(LMNRST)-3^', 'TE', 'TE',
3550
        'TH<', 'T', 'T',
3551
        'TICHT-', 'TIK', 'TIK',
3552
        'TICH$', 'TIK', 'TIK',
3553
        'TIC$', 'TIZ', 'TIZ',
3554
        'TIGGESTELL-------', 'TIK ', 'TIK ',
3555
        'TIGSTELL-----', 'TIK ', 'TIK ',
3556
        'TOAS-^', 'TO', 'TU',
3557
        'TOILET-', 'TOLE', 'TULE',
3558
        'TOIN-', 'TOA', 'TUA',
3559
        'TRAECHTI-^', 'TRECHT', 'TREKT',
3560
        'TRAECHTIG--', ' TRECHT', ' TREKT',
3561
        'TRAINI-', 'TREN', 'TREN',
3562
        'TRÄCHTI-^', 'TRECHT', 'TREKT',
3563
        'TRÄCHTIG--', ' TRECHT', ' TREKT',
3564
        'TSCH', 'SH', 'Z',
3565
        'TSH', 'SH', 'Z',
3566
        'TST', 'ZT', 'ZT',
3567
        'T(Sß)', 'Z', 'Z',
3568
        'TT(SZ)--<', '', '',
3569
        'TT9', 'T', 'T',
3570
        'TV^$', 'TV', 'TV',
3571
        'TX(AEIOU)-3', 'SH', 'Z',
3572
        'TY9^', 'TÜ', None,
3573
        'TZ-', '', '',
3574
        'T\'S3$', 'Z', 'Z',
3575
        'T´S3$', 'Z', 'Z',
3576
        'UEBEL(GNRW)-^^', 'ÜBL ', 'IBL ',
3577
        'UEBER^^', 'ÜBA', 'IBA',
3578
        'UE2', 'Ü', 'I',
3579
        'UGL-', 'UK', None,
3580
        'UH(AOÖUÜY)-', 'UH', None,
3581
        'UIE$', 'Ü', 'I',
3582
        'UM^^', 'UM', 'UN',
3583
        'UNTERE--3', 'UNTE', 'UNTE',
3584
        'UNTER^^', 'UNTA', 'UNTA',
3585
        'UNVER^^', 'UNFA', 'UNFA',
3586
        'UN^^', 'UN', 'UN',
3587
        'UTI(AÄOÖUÜ)-', 'UZI', 'UZI',
3588
        'UVE-4', 'UW', None,
3589
        'UY2', 'UI', None,
3590
        'UZZ', 'AS', 'AZ',
3591
        'VACL-^', 'WAZ', 'FAZ',
3592
        'VAC$', 'WAZ', 'FAZ',
3593
        'VAN DEN ^', 'FANDN', 'FANTN',
3594
        'VANES-^', 'WANE', None,
3595
        'VATRO-', 'WATR', None,
3596
        'VA(DHJNT)--^', 'F', None,
3597
        'VEDD-^', 'FE', 'FE',
3598
        'VE(BEHIU)--^', 'F', None,
3599
        'VEL(BDLMNT)-^', 'FEL', None,
3600
        'VENTZ-^', 'FEN', None,
3601
        'VEN(NRSZ)-^', 'FEN', None,
3602
        'VER(AB)-^$', 'WER', None,
3603
        'VERBAL^$', 'WERBAL', None,
3604
        'VERBAL(EINS)-^', 'WERBAL', None,
3605
        'VERTEBR--', 'WERTE', None,
3606
        'VEREIN-----', 'F', None,
3607
        'VEREN(AEIOU)-^', 'WEREN', None,
3608
        'VERIFI', 'WERIFI', None,
3609
        'VERON(AEIOU)-^', 'WERON', None,
3610
        'VERSEN^', 'FERSN', 'FAZN',
3611
        'VERSIERT--^', 'WERSI', None,
3612
        'VERSIO--^', 'WERS', None,
3613
        'VERSUS', 'WERSUS', None,
3614
        'VERTI(GK)-', 'WERTI', None,
3615
        'VER^^', 'FER', 'FA',
3616
        'VERSPRECHE-------', ' FER', ' FA',
3617
        'VER$', 'WA', None,
3618
        'VER', 'FA', 'FA',
3619
        'VET(HT)-^', 'FET', 'FET',
3620
        'VETTE$', 'WET', 'FET',
3621
        'VE^', 'WE', None,
3622
        'VIC$', 'WIZ', 'FIZ',
3623
        'VIELSAGE----', 'FIL ', 'FIL ',
3624
        'VIEL', 'FIL', 'FIL',
3625
        'VIEW', 'WIU', 'FIU',
3626
        'VILL(AE)-', 'WIL', None,
3627
        'VIS(ACEIKUVWZ)-<^', 'WIS', None,
3628
        'VI(ELS)--^', 'F', None,
3629
        'VILLON--', 'WILI', 'FILI',
3630
        'VIZE^^', 'FIZE', 'FIZE',
3631
        'VLIE--^', 'FL', None,
3632
        'VL(AEIOU)--', 'W', None,
3633
        'VOKA-^', 'WOK', None,
3634
        'VOL(ATUVW)--^', 'WO', None,
3635
        'VOR^^', 'FOR', 'FUR',
3636
        'VR(AEIOU)--', 'W', None,
3637
        'VV9', 'W', None,
3638
        'VY9^', 'WÜ', 'FI',
3639
        'V(ÜY)-', 'W', None,
3640
        'V(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'W', None,
3641
        'V(AEIJLRU)-<', 'W', None,
3642
        'V.^', 'V.', None,
3643
        'V<', 'F', 'F',
3644
        'WEITERENTWI-----^', 'WEITA ', 'FEITA ',
3645
        'WEITREICH-----^', 'WEIT ', 'FEIT ',
3646
        'WEITVER^', 'WEIT FER', 'FEIT FA',
3647
        'WE(LMNRST)-3^', 'WE', 'FE',
3648
        'WER(DST)-', 'WER', None,
3649
        'WIC$', 'WIZ', 'FIZ',
3650
        'WIEDERU--', 'WIDE', 'FITE',
3651
        'WIEDER^$', 'WIDA', 'FITA',
3652
        'WIEDER^^', 'WIDA ', 'FITA ',
3653
        'WIEVIEL', 'WI FIL', 'FI FIL',
3654
        'WISUEL', 'WISUEL', None,
3655
        'WR-^', 'W', None,
3656
        'WY9^', 'WÜ', 'FI',
3657
        'W(BDFGJKLMNPQRSTZ)-', 'F', None,
3658
        'W$', 'F', None,
3659
        'W', None, 'F',
3660
        'X<^', 'Z', 'Z',
3661
        'XHAVEN$', 'XAFN', None,
3662
        'X(CSZ)', 'X', 'X',
3663
        'XTS(CH)--', 'XT', 'XT',
3664
        'XT(SZ)', 'Z', 'Z',
3665
        'YE(LMNRST)-3^', 'IE', 'IE',
3666
        'YE-3', 'I', 'I',
3667
        'YOR(GK)^$', 'IÖRK', 'IÖRK',
3668
        'Y(AOU)-<7', 'I', 'I',
3669
        'Y(BKLMNPRSTX)-1', 'Ü', None,
3670
        'YVES^$', 'IF', 'IF',
3671
        'YVONNE^$', 'IWON', 'IFUN',
3672
        'Y.^', 'Y.', None,
3673
        'Y', 'I', 'I',
3674
        'ZC(AOU)-', 'SK', 'ZK',
3675
        'ZE(LMNRST)-3^', 'ZE', 'ZE',
3676
        'ZIEJ$', 'ZI', 'ZI',
3677
        'ZIGERJA(HR)-3', 'ZIGA IA', 'ZIKA IA',
3678
        'ZL(AEIOU)-', 'SL', None,
3679
        'ZS(CHT)--', '', '',
3680
        'ZS', 'SH', 'Z',
3681
        'ZUERST', 'ZUERST', 'ZUERST',
3682
        'ZUGRUNDE^$', 'ZU GRUNDE', 'ZU KRUNTE',
3683
        'ZUGRUNDE', 'ZU GRUNDE ', 'ZU KRUNTE ',
3684
        'ZUGUNSTEN', 'ZU GUNSTN', 'ZU KUNZTN',
3685
        'ZUHAUSE-', 'ZU HAUS', 'ZU AUZ',
3686
        'ZULASTEN^$', 'ZU LASTN', 'ZU LAZTN',
3687
        'ZURUECK^^', 'ZURÜK', 'ZURIK',
3688
        'ZURZEIT', 'ZUR ZEIT', 'ZUR ZEIT',
3689
        'ZURÜCK^^', 'ZURÜK', 'ZURIK',
3690
        'ZUSTANDE', 'ZU STANDE', 'ZU ZTANTE',
3691
        'ZUTAGE', 'ZU TAGE', 'ZU TAKE',
3692
        'ZUVER^^', 'ZUFA', 'ZUFA',
3693
        'ZUVIEL', 'ZU FIL', 'ZU FIL',
3694
        'ZUWENIG', 'ZU WENIK', 'ZU FENIK',
3695
        'ZY9^', 'ZÜ', None,
3696
        'ZYK3$', 'ZIK', None,
3697
        'Z(VW)7^', 'SW', None,
3698
        None, None, None)
3699
3700
    # Lookup tables that _initialize_phonet() fills in.  A Counter returns 0
    # for any key that was never assigned, so lookups never raise KeyError.
    phonet_hash = Counter()  # character -> rule index (or -1)
    alpha_pos = Counter()  # character -> 1 for umlauts, 2..27 for 'A'..'Z'

    # Both are keyed by (leading-letter index, alpha_pos of the next letter)
    # and hold rule indexes (or -1).
    phonet_hash_1 = Counter()
    phonet_hash_2 = Counter()

    # Table for str.translate(): code point of each lower-case letter
    # (accented ones included) -> its upper-case form.  'ß' maps to itself.
    _phonet_upper_translation = {
        ord(lower): upper for lower, upper in zip(
            'abcdefghijklmnopqrstuvwxyzàáâãåäæ' +
            'çðèéêëìíîïñòóôõöøœšßþùúûüýÿ',
            'ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÅÄÆ' +
            'ÇÐÈÉÊËÌÍÎÏÑÒÓÔÕÖØŒŠßÞÙÚÛÜÝŸ')}
3711
3712
    def _trinfo(text, rule, err_text, lang):
        """Print debug information about one phonet rule.

        The rule table is a flat sequence of triples: a search pattern
        followed by two replacement entries. Entries that are None are
        printed as ``(NULL)``.

        :param text: label printed in front of the rule number
        :param int rule: index of the rule's search pattern in the rule table
        :param err_text: message printed after the rule
        :param lang: 'none' selects the language-independent rule table; any
            other value selects the German rule table
        """
        if lang == 'none':
            _phonet_rules = _phonet_rules_no_lang
        else:
            _phonet_rules = _phonet_rules_german

        pattern = _phonet_rules[rule]
        first = _phonet_rules[rule + 1]
        second = _phonet_rules[rule + 2]

        from_rule = '(NULL)' if pattern is None else pattern
        to_rule1 = '(NULL)' if first is None else first
        to_rule2 = '(NULL)' if second is None else second

        # Floor division: this module enables true division, so ``rule / 3``
        # would print the 1-based rule number as a float (e.g. "4.0").
        rule_number = (rule // 3) + 1

        print('"{} {}:  "{}"{}"{}" {}'.format(text, rule_number,
                                              from_rule, to_rule1, to_rule2,
                                              err_text))
3728
3729
    def _initialize_phonet(lang):
        """Initialize the phonet lookup tables for the selected rule set.

        Fills ``alpha_pos`` and ``phonet_hash`` for every known letter, resets
        the 26 x 28 grid of ``phonet_hash_1``/``phonet_hash_2`` to -1, and
        then scans the rule table (a flat sequence of triples whose first
        entry is the search pattern) to record rule indexes:

        - ``phonet_hash[c]``: index of the first rule whose pattern starts
          with ``c`` and that has at least one replacement entry.
        - ``phonet_hash_1[row, col]``: index of the first rule recorded for
          the leading letter ``row`` and following-letter position ``col``.
        - ``phonet_hash_2[row, col]``: index of the most recent such rule,
          advanced only while it lies within 30 table slots of the
          previously recorded one.

        Column 0 collects rules whose second character is not a letter, and
        rules that fall outside the 30-slot window.

        :param lang: 'none' selects the language-independent rule table; any
            other value selects the German rule table
        """
        if lang == 'none':
            _phonet_rules = _phonet_rules_no_lang
        else:
            _phonet_rules = _phonet_rules_german

        phonet_hash[''] = -1

        # German and international umlauts
        for letter in frozenset('ÀÁÂÃÅÄÆÇÐÈÉÊËÌÍÎÏÑÒÓÔÕÖØŒŠßÞÙÚÛÜÝŸ'):
            alpha_pos[letter] = 1
            phonet_hash[letter] = -1

        # "normal" letters ('A'-'Z') get positions 2..27
        for position, letter in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ', 2):
            alpha_pos[letter] = position
            phonet_hash[letter] = -1

        for row in range(26):
            for col in range(28):
                phonet_hash_1[row, col] = -1
                phonet_hash_2[row, col] = -1

        # for each phonetic rule (the search pattern is every third entry)
        for idx in range(0, len(_phonet_rules), 3):
            pattern = _phonet_rules[idx]
            if not pattern:
                continue

            # calculate first hash value
            first = pattern[0]
            if phonet_hash[first] < 0 and (_phonet_rules[idx + 1] or
                                           _phonet_rules[idx + 2]):
                phonet_hash[first] = idx

            # second hash values exist only for patterns starting in 'A'-'Z'
            if not first or alpha_pos[first] < 2:
                continue

            row = alpha_pos[first] - 2

            # Collect the characters that may follow the leading letter:
            # a parenthesized alternative list, the single next character,
            # or a blank when the pattern has only one character.
            tail = pattern[1:]
            if not tail:
                followers = ' '
            elif tail[0] == '(':
                followers = tail[1:]
            else:
                followers = tail[0]

            for follower in followers:
                if follower == ')':
                    # end of the alternative list
                    break

                col = alpha_pos[follower]

                if col > 0:
                    # add hash value for this letter
                    if phonet_hash_1[row, col] < 0:
                        phonet_hash_1[row, col] = idx
                        phonet_hash_2[row, col] = idx

                    if phonet_hash_2[row, col] >= (idx - 30):
                        phonet_hash_2[row, col] = idx
                    else:
                        # too far from the previous rule for this pair:
                        # fall through to the "all letters" column
                        col = -1

                if col <= 0:
                    # add hash value for all letters
                    if phonet_hash_1[row, 0] < 0:
                        phonet_hash_1[row, 0] = idx

                    phonet_hash_2[row, 0] = idx
3801
3802
    def _phonet(term, mode, lang, trace):
0 ignored issues
show
Comprehensibility introduced by
This function exceeds the maximum number of variables (30/15).
Loading history...
3803
        """Return the phonet coded form of a term."""
3804
        if lang == 'none':
3805
            _phonet_rules = _phonet_rules_no_lang
3806
        else:
3807
            _phonet_rules = _phonet_rules_german
3808
3809
        char0 = ''
3810
        dest = term
3811
3812
        if not term:
3813
            return ''
3814
3815
        term_length = len(term)
3816
3817
        # convert input string to upper-case
3818
        src = term.translate(_phonet_upper_translation)
3819
3820
        # check "src"
3821
        i = 0
3822
        j = 0
3823
        zeta = 0
3824
3825
        while i < len(src):
0 ignored issues
show
unused-code introduced by
Too many nested blocks (6/5)
Loading history...
unused-code introduced by
Too many nested blocks (8/5)
Loading history...
unused-code introduced by
Too many nested blocks (9/5)
Loading history...
unused-code introduced by
Too many nested blocks (7/5)
Loading history...
3826
            char = src[i]
3827
3828
            if trace:
3829
                print('\ncheck position {}:  src = "{}",  dest = "{}"'.format
3830
                      (j, src[i:], dest[:j]))
3831
3832
            pos = alpha_pos[char]
3833
3834
            if pos >= 2:
3835
                xpos = pos-2
3836
3837
                if i+1 == len(src):
3838
                    pos = alpha_pos['']
3839
                else:
3840
                    pos = alpha_pos[src[i+1]]
3841
3842
                start1 = phonet_hash_1[xpos, pos]
3843
                start2 = phonet_hash_1[xpos, 0]
3844
                end1 = phonet_hash_2[xpos, pos]
3845
                end2 = phonet_hash_2[xpos, 0]
3846
3847
                # preserve rule priorities
3848
                if (start2 >= 0) and ((start1 < 0) or (start2 < start1)):
3849
                    pos = start1
0 ignored issues
show
Unused Code introduced by
Consider using tuple unpacking for swapping variables
Loading history...
3850
                    start1 = start2
3851
                    start2 = pos
3852
                    pos = end1
0 ignored issues
show
Unused Code introduced by
Consider using tuple unpacking for swapping variables
Loading history...
3853
                    end1 = end2
3854
                    end2 = pos
3855
3856
                if (end1 >= start2) and (start2 >= 0):
0 ignored issues
show
Unused Code introduced by
Simplify chained comparison between the operands
Loading history...
3857
                    if end2 > end1:
3858
                        end1 = end2
3859
3860
                    start2 = -1
3861
                    end2 = -1
3862
            else:
3863
                pos = phonet_hash[char]
3864
                start1 = pos
3865
                end1 = 10000
3866
                start2 = -1
3867
                end2 = -1
3868
3869
            pos = start1
3870
            zeta0 = 0
3871
3872
            if pos >= 0:
3873
                # check rules for this char
3874
                while ((_phonet_rules[pos] is None) or
3875
                       (_phonet_rules[pos][0] == char)):
3876
                    if pos > end1:
3877
                        if start2 > 0:
3878
                            pos = start2
3879
                            start1 = start2
3880
                            start2 = -1
3881
                            end1 = end2
3882
                            end2 = -1
3883
                            continue
3884
3885
                        break
3886
3887
                    if (((_phonet_rules[pos] is None) or
3888
                         (_phonet_rules[pos + mode] is None))):
3889
                        # no conversion rule available
3890
                        pos += 3
3891
                        continue
3892
3893
                    if trace:
3894
                        _trinfo('> rule no.', pos, 'is being checked', lang)
3895
3896
                    # check whole string
3897
                    matches = 1  # number of matching letters
3898
                    priority = 5  # default priority
3899
                    rule = _phonet_rules[pos]
3900
                    rule = rule[1:]
3901
3902
                    while (rule and
3903
                           (len(src) > (i + matches)) and
3904
                           (src[i + matches] == rule[0]) and
3905
                           not rule[0].isdigit() and
3906
                           (rule not in '(-<^$')):
3907
                        matches += 1
3908
                        rule = rule[1:]
3909
3910
                    if rule and (rule[0] == '('):
3911
                        # check an array of letters
3912
                        if (((len(src) > (i + matches)) and
3913
                             src[i + matches].isalpha() and
3914
                             (src[i + matches] in rule[1:]))):
3915
                            matches += 1
3916
3917
                            while rule and rule[0] != ')':
3918
                                rule = rule[1:]
3919
3920
                            # if rule[0] == ')':
3921
                            rule = rule[1:]
3922
3923
                    if rule:
3924
                        priority0 = ord(rule[0])
3925
                    else:
3926
                        priority0 = 0
3927
3928
                    matches0 = matches
3929
3930
                    while rule and rule[0] == '-' and matches > 1:
3931
                        matches -= 1
3932
                        rule = rule[1:]
3933
3934
                    if rule and rule[0] == '<':
3935
                        rule = rule[1:]
3936
3937
                    if rule and rule[0].isdigit():
3938
                        # read priority
3939
                        priority = int(rule[0])
3940
                        rule = rule[1:]
3941
3942
                    if rule and rule[0:2] == '^^':
3943
                        rule = rule[1:]
3944
3945
                    if (not rule or
0 ignored issues
show
best-practice introduced by
Too many boolean expressions in if statement (12/5)
Loading history...
3946
                            ((rule[0] == '^') and
3947
                             ((i == 0) or not src[i-1].isalpha()) and
3948
                             ((rule[1:2] != '$') or
3949
                              (not (src[i+matches0:i+matches0+1].isalpha()) and
3950
                               (src[i+matches0:i+matches0+1] != '.')))) or
3951
                            ((rule[0] == '$') and (i > 0) and
3952
                             src[i-1].isalpha() and
3953
                             ((not src[i+matches0:i+matches0+1].isalpha()) and
3954
                              (src[i+matches0:i+matches0+1] != '.')))):
3955
                        # look for continuation, if:
3956
                        # matches > 1 und NO '-' in first string */
3957
                        pos0 = -1
3958
3959
                        start3 = 0
3960
                        start4 = 0
3961
                        end3 = 0
3962
                        end4 = 0
3963
3964
                        if (((matches > 1) and
3965
                             src[i+matches:i+matches+1] and
3966
                             (priority0 != ord('-')))):
3967
                            char0 = src[i+matches-1]
3968
                            pos0 = alpha_pos[char0]
3969
3970
                            if pos0 >= 2 and src[i+matches]:
3971
                                xpos = pos0 - 2
3972
                                pos0 = alpha_pos[src[i+matches]]
3973
                                start3 = phonet_hash_1[xpos, pos0]
3974
                                start4 = phonet_hash_1[xpos, 0]
3975
                                end3 = phonet_hash_2[xpos, pos0]
3976
                                end4 = phonet_hash_2[xpos, 0]
3977
3978
                                # preserve rule priorities
3979
                                if (((start4 >= 0) and
3980
                                     ((start3 < 0) or (start4 < start3)))):
3981
                                    pos0 = start3
0 ignored issues
show
Unused Code introduced by
Consider using tuple unpacking for swapping variables
Loading history...
3982
                                    start3 = start4
3983
                                    start4 = pos0
3984
                                    pos0 = end3
0 ignored issues
show
Unused Code introduced by
Consider using tuple unpacking for swapping variables
Loading history...
3985
                                    end3 = end4
3986
                                    end4 = pos0
3987
3988
                                if (end3 >= start4) and (start4 >= 0):
0 ignored issues
show
Unused Code introduced by
Simplify chained comparison between the operands
Loading history...
3989
                                    if end4 > end3:
3990
                                        end3 = end4
3991
3992
                                    start4 = -1
3993
                                    end4 = -1
3994
                            else:
3995
                                pos0 = phonet_hash[char0]
3996
                                start3 = pos0
3997
                                end3 = 10000
3998
                                start4 = -1
3999
                                end4 = -1
4000
4001
                            pos0 = start3
4002
4003
                        # check continuation rules for src[i+matches]
4004
                        if pos0 >= 0:
4005
                            while ((_phonet_rules[pos0] is None) or
4006
                                   (_phonet_rules[pos0][0] == char0)):
0 ignored issues
show
introduced by
Value '_phonet_rules[pos0]' is unsubscriptable
Loading history...
4007
                                if pos0 > end3:
4008
                                    if start4 > 0:
4009
                                        pos0 = start4
4010
                                        start3 = start4
4011
                                        start4 = -1
4012
                                        end3 = end4
4013
                                        end4 = -1
4014
                                        continue
4015
4016
                                    priority0 = -1
4017
4018
                                    # important
4019
                                    break
4020
4021
                                if (((_phonet_rules[pos0] is None) or
4022
                                     (_phonet_rules[pos0 + mode] is None))):
4023
                                    # no conversion rule available
4024
                                    pos0 += 3
4025
                                    continue
4026
4027
                                if trace:
4028
                                    _trinfo('> > continuation rule no.', pos0,
4029
                                            'is being checked', lang)
4030
4031
                                # check whole string
4032
                                matches0 = matches
4033
                                priority0 = 5
4034
                                rule = _phonet_rules[pos0]
4035
                                rule = rule[1:]
0 ignored issues
show
introduced by
Value 'rule' is unsubscriptable
Loading history...
4036
4037
                                while (rule and
4038
                                       (src[i+matches0:i+matches0+1] ==
4039
                                        rule[0]) and
4040
                                       (not rule[0].isdigit() or
4041
                                        (rule in '(-<^$'))):
4042
                                    matches0 += 1
4043
                                    rule = rule[1:]
4044
4045
                                if rule and rule[0] == '(':
4046
                                    # check an array of letters
4047
                                    if ((src[i+matches0:i+matches0+1]
4048
                                         .isalpha() and
4049
                                         (src[i+matches0] in rule[1:]))):
4050
                                        matches0 += 1
4051
4052
                                        while rule and rule[0] != ')':
4053
                                            rule = rule[1:]
4054
4055
                                        # if rule[0] == ')':
4056
                                        rule = rule[1:]
4057
4058
                                while rule and rule[0] == '-':
4059
                                    # "matches0" is NOT decremented
4060
                                    # because of  "if (matches0 == matches)"
4061
                                    rule = rule[1:]
4062
4063
                                if rule and rule[0] == '<':
4064
                                    rule = rule[1:]
4065
4066
                                if rule and rule[0].isdigit():
4067
                                    priority0 = int(rule[0])
4068
                                    rule = rule[1:]
4069
4070
                                if (not rule or
4071
                                        # rule == '^' is not possible here
4072
                                        ((rule[0] == '$') and not
4073
                                         src[i+matches0:i+matches0+1]
4074
                                         .isalpha() and
4075
                                         (src[i+matches0:i+matches0+1]
4076
                                          != '.'))):
4077
                                    if matches0 == matches:
4078
                                        # this is only a partial string
4079
                                        if trace:
4080
                                            _trinfo('> > continuation ' +
4081
                                                    'rule no.',
4082
                                                    pos0,
4083
                                                    'not used (too short)',
4084
                                                    lang)
4085
4086
                                        pos0 += 3
4087
                                        continue
4088
4089
                                    if priority0 < priority:
4090
                                        # priority is too low
4091
                                        if trace:
4092
                                            _trinfo('> > continuation ' +
4093
                                                    'rule no.',
4094
                                                    pos0,
4095
                                                    'not used (priority)',
4096
                                                    lang)
4097
4098
                                        pos0 += 3
4099
                                        continue
4100
4101
                                    # continuation rule found
4102
                                    break
4103
4104
                                if trace:
4105
                                    _trinfo('> > continuation rule no.', pos0,
4106
                                            'not used', lang)
4107
4108
                                pos0 += 3
4109
4110
                            # end of "while"
4111
                            if ((priority0 >= priority) and
4112
                                    ((_phonet_rules[pos0] is not None) and
4113
                                     (_phonet_rules[pos0][0] == char0))):
0 ignored issues
show
introduced by
Value '_phonet_rules[pos0]' is unsubscriptable
Loading history...
4114
4115
                                if trace:
4116
                                    _trinfo('> rule no.', pos, '', lang)
4117
                                    _trinfo('> not used because of ' +
4118
                                            'continuation', pos0, '', lang)
4119
4120
                                pos += 3
4121
                                continue
4122
4123
                        # replace string
4124
                        if trace:
4125
                            _trinfo('Rule no.', pos, 'is applied', lang)
4126
4127
                        if ((_phonet_rules[pos] and
4128
                             ('<' in _phonet_rules[pos][1:]))):
4129
                            priority0 = 1
4130
                        else:
4131
                            priority0 = 0
4132
4133
                        rule = _phonet_rules[pos + mode]
4134
4135
                        if (priority0 == 1) and (zeta == 0):
4136
                            # rule with '<' is applied
4137
                            if ((j > 0) and rule and
4138
                                    ((dest[j-1] == char) or
4139
                                     (dest[j-1] == rule[0]))):
4140
                                j -= 1
4141
4142
                            zeta0 = 1
4143
                            zeta += 1
4144
                            matches0 = 0
4145
4146
                            while rule and src[i+matches0]:
4147
                                src = (src[0:i+matches0] + rule[0] +
4148
                                       src[i+matches0+1:])
4149
                                matches0 += 1
4150
                                rule = rule[1:]
4151
4152
                            if matches0 < matches:
4153
                                src = (src[0:i+matches0] +
4154
                                       src[i+matches:])
4155
4156
                            char = src[i]
4157
                        else:
4158
                            i = i + matches - 1
4159
                            zeta = 0
4160
4161
                            while len(rule) > 1:
4162
                                if (j == 0) or (dest[j - 1] != rule[0]):
4163
                                    dest = (dest[0:j] + rule[0] +
4164
                                            dest[min(len(dest), j+1):])
4165
                                    j += 1
4166
4167
                                rule = rule[1:]
4168
4169
                            # new "current char"
4170
                            if not rule:
4171
                                rule = ''
4172
                                char = ''
4173
                            else:
4174
                                char = rule[0]
4175
4176
                            if ((_phonet_rules[pos] and
4177
                                 '^^' in _phonet_rules[pos][1:])):
4178
                                if char:  # pragma: no branch
4179
                                    dest = (dest[0:j] + char +
4180
                                            dest[min(len(dest), j + 1):])
4181
                                    j += 1
4182
4183
                                src = src[i + 1:]
4184
                                i = 0
4185
                                zeta0 = 1
4186
4187
                        break
4188
4189
                    pos += 3
4190
4191
                    if pos > end1 and start2 > 0:
4192
                        pos = start2
4193
                        start1 = start2
4194
                        end1 = end2
4195
                        start2 = -1
4196
                        end2 = -1
4197
4198
            if zeta0 == 0:
4199
                if char and ((j == 0) or (dest[j-1] != char)):
4200
                    # delete multiple letters only
4201
                    dest = dest[0:j] + char + dest[min(j+1, term_length):]
4202
                    j += 1
4203
4204
                i += 1
4205
                zeta = 0
4206
4207
        dest = dest[0:j]
4208
4209
        return dest
4210
4211
    _initialize_phonet(lang)
4212
4213
    word = unicodedata.normalize('NFKC', text_type(word))
4214
    return _phonet(word, mode, lang, trace)
4215
4216
4217
def spfc(word):
    """Return the Standardized Phonetic Frequency Code (SPFC) of a word.

    Standardized Phonetic Frequency Code is roughly Soundex-like.
    This implementation is based on page 19-21 of
    https://archive.org/stream/accessingindivid00moor#page/19/mode/1up

    :param word: the name to transform: either a single string in which a
        period or, failing that, a space divides the first name from the last
        name (the split is made at the first occurrence), or a 2-item
        tuple/list consisting of the first and last names
    :returns: the SPFC value (the empty string if word is empty)
    :rtype: str
    :raises AttributeError: if word is a string without such a divider, a
        sequence whose length is not 2, or not iterable at all

    >>> spfc('Christopher Smith')
    '01160'
    >>> spfc('Christopher Schmidt')
    '01160'
    >>> spfc('Niall Smith')
    '01660'
    >>> spfc('Niall Schmidt')
    '01660'

    >>> spfc('L.Smith')
    '01960'
    >>> spfc('R.Miller')
    '65490'

    >>> spfc(('L', 'Smith'))
    '01960'
    >>> spfc(('R', 'Miller'))
    '65490'
    """
    # Translation tables PF1, PF2 & PF3 (code point -> digit)
    _pf1 = dict(zip((ord(_) for _ in 'SZCKQVFPUWABLORDHIEMNXGJT'),
                    '0011112222334445556666777'))
    _pf2 = dict(zip((ord(_) for _ in
                     'SZCKQFPXABORDHIMNGJTUVWEL'),
                    '0011122233445556677788899'))
    _pf3 = dict(zip((ord(_) for _ in
                     'BCKQVDTFLPGJXMNRSZAEHIOUWY'),
                    '00000112223334456677777777'))

    # Multi-letter endings recognized in step 5, in priority order: the order
    # matters because, e.g., a name ending in STN also ends in TN.
    _pf3_endings = (('STN', '8'), ('PRS', '8'), ('SN', '8'),
                    ('STR', '9'), ('SR', '9'), ('TN', '9'), ('TD', '9'),
                    ('DRS', '7'), ('TR', '7'), ('MN', '7'))

    _substitutions = (('DK', 'K'), ('DT', 'T'), ('SC', 'S'), ('KN', 'N'),
                      ('MN', 'N'))

    _uc_set = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    _dropped = frozenset('AEIOUWHY')

    if not word:
        return ''

    # Split the input into (first name, last name)
    if isinstance(word, (str, text_type)):
        names = word.split('.', 1)
        if len(names) != 2:
            names = word.split(' ', 1)
    elif hasattr(word, '__iter__'):
        names = word
    else:
        names = ()  # not iterable: rejected by the length check below

    if len(names) != 2:
        raise AttributeError('word attribute must be a string with a space ' +
                             'or period dividing the first and last names ' +
                             'or a tuple/list consisting of the first and ' +
                             'last names')

    names = [unicodedata.normalize('NFKD', text_type(_.strip()
                                                     .replace('ß', 'SS')
                                                     .upper()))
             for _ in names]

    def steps_one_to_three(name):
        """Perform the first three steps of SPFC."""
        # filter out non A-Z
        name = ''.join(_ for _ in name if _ in _uc_set)

        # 1. In the field, convert DK to K, DT to T, SC to S, KN to N,
        # and MN to N
        for subst in _substitutions:
            name = name.replace(subst[0], subst[1])

        # 2. In the name field, replace multiple letters with a single letter
        name = _delete_consecutive_repeats(name)

        # 3. Remove vowels, W, H, and Y, but keep the first letter in the name
        # field.
        if name:
            name = name[0] + ''.join(_ for _ in name[1:]
                                     if _ not in _dropped)
        return name

    first, last = [steps_one_to_three(_) for _ in names]
    code = ''

    # 4. The first digit of the code is obtained using PF1 and the first letter
    # of the name field. Remove this letter after coding.
    if last:
        code += last[0].translate(_pf1)
        last = last[1:]

    # 5. Using the last letters of the name, use Table PF3 to obtain the
    # second digit of the code. Use as many letters as possible and remove
    # after coding.
    if last:
        for ending, digit in _pf3_endings:
            if last.endswith(ending):
                code += digit
                last = last[:-len(ending)]
                break
        else:
            code += last[-1].translate(_pf3)
            last = last[:-1]

    # 6. The third digit is found using Table PF2 and the first character of
    # the first name. Remove after coding.
    if first:
        code += first[0].translate(_pf2)

    # 7. The fourth digit is found using Table PF2 and the first character of
    # the name field. If no letters remain use zero. After coding remove the
    # letter.
    # 8. The fifth digit is found in the same manner as the fourth using the
    # remaining characters of the name field if any.
    for _ in range(2):
        if last:
            code += last[0].translate(_pf2)
            last = last[1:]
        else:
            code += '0'

    return code
4361
4362
4363
def statistics_canada(word, maxlength=4):
    """Return the Statistics Canada code for a word.

    The original description of this algorithm could not be located, and
    may only have been specified in an unpublished TR. The coding does not
    appear to be in use by Statistics Canada any longer. In its place, this is
    an implementation of the "Census modified Statistics Canada name coding
    procedure".

    The modified version of this algorithm is described in Appendix B of
    Lynch, Billy T. and William L. Arends. `Selection of a Surname Coding
    Procedure for the SRS Record Linkage System.` Statistical Reporting
    Service, U.S. Department of Agriculture, Washington, D.C. February 1977.
    https://naldc.nal.usda.gov/download/27833/PDF

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 4) of the code to return
    :returns: the Statistics Canada name code value (the empty string if the
        word contains no letters A-Z after normalization)
    :rtype: str

    >>> statistics_canada('Christopher')
    'CHRS'
    >>> statistics_canada('Niall')
    'NL'
    >>> statistics_canada('Smith')
    'SMTH'
    >>> statistics_canada('Schmidt')
    'SCHM'
    """
    _uc_set = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    _vowels = frozenset('AEIOUY')

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in _uc_set)
    if not word:
        return ''

    # keep the first letter as is; drop vowels (and Y) from the remainder
    code = word[0] + ''.join(c for c in word[1:] if c not in _vowels)
    code = _delete_consecutive_repeats(code)

    return code[:maxlength]
4409
4410
4411
def lein(word, maxlength=4, zero_pad=True):
    """Return the Lein code for a word.

    This is Lein name coding, based on
    https://naldc-legacy.nal.usda.gov/naldc/download.xhtml?id=27833&content=PDF

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 4) of the code to return
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Lein code (the empty string if the word contains no letters
        A-Z after normalization)
    :rtype: str

    >>> lein('Christopher')
    'C351'
    >>> lein('Niall')
    'N300'
    >>> lein('Smith')
    'S210'
    >>> lein('Schmidt')
    'S521'
    """
    _lein_translation = dict(zip((ord(_) for _ in
                                  'BCDFGJKLMNPQRSTVXZ'),
                                 '451455532245351455'))
    # Code point -> None deletes the character on translate(). A dict table
    # is used rather than str.maketrans(), which exists only on Python 3.
    _lein_deletions = dict.fromkeys(ord(_) for _ in 'AEIOUYWH ')
    _uc_set = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in _uc_set)

    if not word:
        return ''

    code = word[0]  # Rule 1
    word = word[1:].translate(_lein_deletions)  # Rule 2
    word = _delete_consecutive_repeats(word)  # Rule 3
    code += word.translate(_lein_translation)  # Rule 4

    if zero_pad:
        code += ('0'*maxlength)  # Rule 4

    return code[:maxlength]
4455
4456
4457
def roger_root(word, maxlength=5, zero_pad=True):
    """Return the Roger Root code for a word.

    This is Roger Root name coding, based on
    https://naldc-legacy.nal.usda.gov/naldc/download.xhtml?id=27833&content=PDF

    The word is scanned left to right; at each position the longest pattern
    (4 letters down to 1) found in the applicable table is coded. One table
    applies to the start of the word and another to every later position.

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 5) of the code to return
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Roger Root code
    :rtype: str

    >>> roger_root('Christopher')
    '06401'
    >>> roger_root('Niall')
    '02500'
    >>> roger_root('Smith')
    '00310'
    >>> roger_root('Schmidt')
    '06310'
    """
    # Patterns are keyed by length, then by letter sequence.
    # '*' is used to prevent combining by _delete_consecutive_repeats()
    _init_patterns = {4: {'TSCH': '06'},
                      3: {'TSH': '06', 'SCH': '06'},
                      2: {'CE': '0*0', 'CH': '06', 'CI': '0*0', 'CY': '0*0',
                          'DG': '07', 'GF': '08', 'GM': '03', 'GN': '02',
                          'KN': '02', 'PF': '08', 'PH': '08', 'PN': '02',
                          'SH': '06', 'TS': '0*0', 'WR': '04'},
                      1: {'A': '1', 'B': '09', 'C': '07', 'D': '01', 'E': '1',
                          'F': '08', 'G': '07', 'H': '2', 'I': '1', 'J': '3',
                          'K': '07', 'L': '05', 'M': '03', 'N': '02', 'O': '1',
                          'P': '09', 'Q': '07', 'R': '04', 'S': '0*0',
                          'T': '01', 'U': '1', 'V': '08', 'W': '4', 'X': '07',
                          'Y': '5', 'Z': '0*0'}}

    _med_patterns = {4: {'TSCH': '6'},
                     3: {'TSH': '6', 'SCH': '6'},
                     2: {'CE': '0', 'CH': '6', 'CI': '0', 'CY': '0', 'DG': '7',
                         'PH': '8', 'SH': '6', 'TS': '0'},
                     1: {'B': '9', 'C': '7', 'D': '1', 'F': '8', 'G': '7',
                         'J': '6', 'K': '7', 'L': '5', 'M': '3', 'N': '2',
                         'P': '9', 'Q': '7', 'R': '4', 'S': '0', 'T': '1',
                         'V': '8', 'X': '7', 'Z': '0',
                         'A': '*', 'E': '*', 'H': '*', 'I': '*', 'O': '*',
                         'U': '*', 'W': '*', 'Y': '*'}}

    _uc_set = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in _uc_set)

    if not word:
        return ''

    def _longest_match(table, start):
        """Return (digits, letters consumed) for the pattern at start."""
        for width in (4, 3, 2, 1):
            chunk = word[start:start + width]
            if chunk in table[width]:
                return table[width][chunk], width
        return '', 1  # Advance if nothing is recognized

    # Do first digit(s) first
    code, pos = _longest_match(_init_patterns, 0)

    # Then code subsequent digits
    while pos < len(word):
        digits, width = _longest_match(_med_patterns, pos)
        code += digits
        pos += width

    code = _delete_consecutive_repeats(code).replace('*', '')

    if zero_pad:
        code += '0'*maxlength

    return code[:maxlength]
4542
4543
4544
def onca(word, maxlength=4, zero_pad=True):
    """Return the Oxford Name Compression Algorithm (ONCA) code for a word.

    This is the Oxford Name Compression Algorithm, based on:
    Gill, Leicester E. 1997. "OX-LINK: The Oxford Medical Record Linkage
    System." In ``Record Linkage Techniques -- 1997``. Arlington, VA. March
    20--21, 1997.
    https://nces.ed.gov/FCSM/pdf/RLT97.pdf

    I can find no complete description of the "anglicised version of the NYSIIS
    method" identified as the first step in this algorithm, so this is likely
    not a correct implementation, in that it employs the standard NYSIIS
    algorithm.

    The word is first encoded with ``nysiis`` and the resulting intermediate
    code is then encoded with ``soundex``.

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 4) of the code to return
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the ONCA code
    :rtype: str

    >>> onca('Christopher')
    'C623'
    >>> onca('Niall')
    'N400'
    >>> onca('Smith')
    'S530'
    >>> onca('Schmidt')
    'S530'
    """
    # In the most extreme case, 3 characters of NYSIIS input can be compressed
    # to one character of output, so give it triple the maxlength.
    nysiis_code = nysiis(word, maxlength=maxlength*3)
    return soundex(nysiis_code, maxlength, zero_pad=zero_pad)
4577
4578
4579
def eudex(word):
    """Return the eudex hash of a word.

    NOTE: this implementation is unfinished. It only sets up two per-letter
    lookup tables and a helper for the first character; ``word`` is never
    read and no hash is computed, so the function currently falls off the
    end and returns ``None``.

    :param str word: the word to transform (currently unused)
    :returns: ``None`` for now; intended to be the eudex hash
    """
    # Per-letter byte values, keyed by code point: ASCII a-z followed by the
    # Latin-1 letters 0xdf-0xff.
    phone_table = {
        0x61: 0,           # a
        0x62: 0b01001000,  # b
        0x63: 0b00001100,  # c
        0x64: 0b00011000,  # d
        0x65: 0,           # e
        0x66: 0b01000100,  # f
        0x67: 0b00001000,  # g
        0x68: 0b00000100,  # h
        0x69: 1,           # i
        0x6a: 0b00000101,  # j
        0x6b: 0b00001001,  # k
        0x6c: 0b10100000,  # l
        0x6d: 0b00000010,  # m
        0x6e: 0b00010010,  # n
        0x6f: 0,           # o
        0x70: 0b01001001,  # p
        0x71: 0b10101000,  # q
        0x72: 0b10100001,  # r
        0x73: 0b00010100,  # s
        0x74: 0b00011101,  # t
        0x75: 1,           # u
        0x76: 0b01000101,  # v
        0x77: 0b00000000,  # w
        0x78: 0b10000100,  # x
        0x79: 1,           # y
        0x7a: 0b10010100,  # z

        0xdf: 0b00010101,  # ß
        0xe0: 0,           # à
        0xe1: 0,           # á
        0xe2: 0,           # â
        0xe3: 0,           # ã
        0xe4: 0,           # ä[æ]
        0xe5: 1,           # å[oː]
        0xe6: 0,           # æ[æ]
        0xe7: 0b10010101,  # ç[t͡ʃ]
        0xe8: 1,           # è
        0xe9: 1,           # é
        0xea: 1,           # ê
        0xeb: 1,           # ë
        0xec: 1,           # ì
        0xed: 1,           # í
        0xee: 1,           # î
        0xef: 1,           # ï
        0xf0: 0b00010101,  # ð[ð̠](represented as a non-plosive T)
        0xf1: 0b00010111,  # ñ[nj](represented as a combination of n and j)
        0xf2: 0,           # ò
        0xf3: 0,           # ó
        0xf4: 0,           # ô
        0xf5: 0,           # õ
        0xf6: 1,           # ö[ø]
        0xf7: 0b11111111,  # ÷
        0xf8: 1,           # ø[ø]
        0xf9: 1,           # ù
        0xfa: 1,           # ú
        0xfb: 1,           # û
        0xfc: 1,           # ü
        0xfd: 1,           # ý
        0xfe: 0b00010101,  # þ[ð̠](represented as a non-plosive T)
        0xff: 1,           # ÿ
    }

    # Second table over the same code points, with distinct values for the
    # vowels (entries marked * in the comments).
    injective_phone_table = {
        0x61: 0b10000100,  # a*
        0x62: 0b00100100,  # b
        0x63: 0b00000110,  # c
        0x64: 0b00001100,  # d
        0x65: 0b11011000,  # e*
        0x66: 0b00100010,  # f
        0x67: 0b00000100,  # g
        0x68: 0b00000010,  # h
        0x69: 0b11111000,  # i*
        0x6a: 0b00000011,  # j
        0x6b: 0b00000101,  # k
        0x6c: 0b01010000,  # l
        0x6d: 0b00000001,  # m
        0x6e: 0b00001001,  # n
        0x6f: 0b10010100,  # o*
        0x70: 0b00100101,  # p
        0x71: 0b01010100,  # q
        0x72: 0b01010001,  # r
        0x73: 0b00001010,  # s
        0x74: 0b00001110,  # t
        0x75: 0b11100000,  # u*
        0x76: 0b00100011,  # v
        0x77: 0b00000000,  # w
        0x78: 0b01000010,  # x
        0x79: 0b11100100,  # y*
        0x7a: 0b01001010,  # z

        0xdf: 0b00001011,  # ß
        0xe0: 0b10000101,  # à
        0xe1: 0b10000101,  # á
        0xe2: 0b10000000,  # â
        0xe3: 0b10000110,  # ã
        0xe4: 0b10100110,  # ä [æ]
        0xe5: 0b11000010,  # å [oː]
        0xe6: 0b10100111,  # æ [æ]
        0xe7: 0b01010100,  # ç [t͡ʃ]
        0xe8: 0b11011001,  # è
        0xe9: 0b11011001,  # é
        0xea: 0b11011001,  # ê
        0xeb: 0b11000110,  # ë [ə] or [œ]
        0xec: 0b11111001,  # ì
        0xed: 0b11111001,  # í
        0xee: 0b11111001,  # î
        0xef: 0b11111001,  # ï
        0xf0: 0b00001011,  # ð [ð̠] (represented as a non-plosive T)
        0xf1: 0b00001011,  # ñ [nj] (represented as a combination of n and j)
        0xf2: 0b10010101,  # ò
        0xf3: 0b10010101,  # ó
        0xf4: 0b10010101,  # ô
        0xf5: 0b10010101,  # õ
        0xf6: 0b11011100,  # ö [œ] or [ø]
        0xf7: 0b11111111,  # ÷
        0xf8: 0b11011101,  # ø [œ] or [ø]
        0xf9: 0b11100001,  # ù
        0xfa: 0b11100001,  # ú
        0xfb: 0b11100001,  # û
        0xfc: 0b11100101,  # ü
        0xfd: 0b11100101,  # ý
        0xfe: 0b00001011,  # þ [ð̠] (represented as a non-plosive T)
        0xff: 0b11100101,  # ÿ
    }

    def _map_first(letter):
        """Map the first character in a word."""
        # Setting bit 0x20 turns an ASCII uppercase code point into its
        # lowercase counterpart. The result is only rebound locally and is
        # not returned yet (the helper is unfinished and never called).
        letter = letter | 32
4715
4716
4717
def bmpm(word,
         language_arg=0,
         name_mode='gen',
         match_mode='approx',
         concat=False,
         filter_langs=False):
    """Encode a word with Beider-Morse Phonetic Matching (BMPM).

    This is the public entry point; all arguments are forwarded unchanged,
    in order, to ``_bmpm``, which does the actual work.

    A description of the algorithm can be found at:
    http://stevemorse.org/phonetics/bmpm.htm
    The GPLv3-licensed reference implementation is available at:
    http://stevemorse.org/phoneticinfo.htm

    :param str word: the word to transform
    :param language_arg: the language of the term (the signature default is
        ``0``); supported values include:

            - 'any'
            - 'arabic'
            - 'cyrillic'
            - 'czech'
            - 'dutch'
            - 'english'
            - 'french'
            - 'german'
            - 'greek'
            - 'greeklatin'
            - 'hebrew'
            - 'hungarian'
            - 'italian'
            - 'polish'
            - 'portuguese'
            - 'romanian'
            - 'russian'
            - 'spanish'
            - 'turkish'
            - 'germandjsg'
            - 'polishdjskp'
            - 'russiandjsre'

    :param str name_mode: the name mode of the algorithm:

            - 'gen' -- general (default)
            - 'ash' -- Ashkenazi
            - 'sep' -- Sephardic

    :param str match_mode: matching mode: 'approx' or 'exact'
    :param bool concat: concatenation mode
    :param bool filter_langs: filter out incompatible languages
    :returns: the BMPM value(s), exactly as returned by ``_bmpm``
    :rtype: NOTE(review): the examples below show one space-separated str,
        not a tuple -- confirm against ``_bmpm``

    >>> bmpm('Christopher')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir xristopi xritopir xritopi xristofi xritofir xritofi tzristopir
    tzristofir zristopir zristopi zritopir zritopi zristofir zristofi zritofir
    zritofi'
    >>> bmpm('Niall')
    'nial niol'
    >>> bmpm('Smith')
    'zmit'
    >>> bmpm('Schmidt')
    'zmit stzmit'

    >>> bmpm('Christopher', language_arg='German')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir'
    >>> bmpm('Christopher', language_arg='English')
    'tzristofir tzrQstofir tzristafir tzrQstafir xristofir xrQstofir xristafir
    xrQstafir'
    >>> bmpm('Christopher', language_arg='German', name_mode='ash')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir'

    >>> bmpm('Christopher', language_arg='German', match_mode='exact')
    'xriStopher xriStofer xristopher xristofer'
    """
    # Positional pass-through: the parameter names of _bmpm are not visible
    # here, so the original argument order is kept as-is.
    encoded = _bmpm(word, language_arg, name_mode, match_mode,
                    concat, filter_langs)
    return encoded
4792
4793
4794
if __name__ == '__main__':
    # When this file is executed directly as a script, run the doctest
    # examples embedded in the module's docstrings.
    import doctest
    doctest.testmod()
4797