Test Failed
Push — master ( bf99bd...78b0d5 )
by Chris
11:17
created

abydos.phonetic.soundex_br()   C

Complexity

Conditions 11

Size

Total Lines 40
Code Lines 29

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 11
eloc 29
nop 3
dl 0
loc 40
rs 5.4
c 0
b 0
f 0

How to fix   Complexity   

Complexity

Complex classes like abydos.phonetic.soundex_br() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# -*- coding: utf-8 -*-
0 ignored issues
show
coding-style introduced by
Too many lines in module (6104/1000)
Loading history...
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19
"""abydos.phonetic.
20
21
The phonetic module implements phonetic algorithms including:
22
23
    - Robert C. Russell's Index
24
    - American Soundex
25
    - Refined Soundex
26
    - Daitch-Mokotoff Soundex
27
    - Kölner Phonetik
28
    - NYSIIS
29
    - Match Rating Algorithm
30
    - Metaphone
31
    - Double Metaphone
32
    - Caverphone
33
    - Alpha Search Inquiry System
34
    - Fuzzy Soundex
35
    - Phonex
36
    - Phonem
37
    - Phonix
38
    - SfinxBis
39
    - phonet
40
    - Standardized Phonetic Frequency Code
41
    - Statistics Canada
42
    - Lein
43
    - Roger Root
44
    - Oxford Name Compression Algorithm (ONCA)
45
    - Eudex phonetic hash
46
    - Haase Phonetik
47
    - Reth-Schek Phonetik
48
    - FONEM
49
    - Parmar-Kumbharana
50
    - Davidson's Consonant Code
51
    - SoundD
52
    - PSHP Soundex/Viewex Coding
53
    - an early version of Henry Code
54
    - Norphone
55
    - Dolby Code
56
    - Phonetic Spanish
57
    - Spanish Metaphone
58
    - MetaSoundex
59
    - SoundexBR
60
    - Beider-Morse Phonetic Matching
61
"""
62
63
from __future__ import division, unicode_literals
64
65
from collections import Counter
66
from itertools import groupby, product
67
from re import compile as re_compile
68
from unicodedata import normalize
69
70
from six import text_type
71
from six.moves import range
72
73
from ._bm import _bmpm
74
75
# Sentinel for "no upper bound": functions in this module use it as a
# maxlength value meaning the resulting code should not be truncated.
_INFINITY = float('inf')
76
77
78
def _delete_consecutive_repeats(word):
79
    """Delete consecutive repeated characters in a word.
80
81
    :param str word: the word to transform
82
    :returns: word with consecutive repeating characters collapsed to
83
        a single instance
84
    :rtype: str
85
    """
86
    return ''.join(char for char, _ in groupby(word))
87
88
89
def russell_index(word):
    """Return the Russell Index (integer output) of a word.

    This follows Robert C. Russell's Index algorithm, as described in
    :cite:`Russell:1917`.

    :param str word: the word to transform
    :returns: the Russell Index value; if the word contains no codable
        letters, ``float('NaN')`` is returned instead of an int
    :rtype: int

    >>> russell_index('Christopher')
    3813428
    >>> russell_index('Niall')
    715
    >>> russell_index('Smith')
    3614
    >>> russell_index('Schmidt')
    3614
    """
    # Code points of the codable letters mapped to their digit; letters
    # absent from this table (H, J, W) and all non-letters are discarded.
    _russell_translation = dict(zip((ord(char) for char in
                                     'ABCDEFGIKLMNOPQRSTUVXYZ'),
                                    '12341231356712383412313'))

    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = word.replace('GH', '')  # discard gh (rule 3)
    word = word.rstrip('SZ')  # discard /[sz]$/ (rule 3)

    # translate according to Russell's mapping, keeping only characters
    # that the mapping knows about
    word = ''.join(char for char in word
                   if ord(char) in _russell_translation)
    sdx = word.translate(_russell_translation)

    # remove any 1s after the first occurrence
    one = sdx.find('1')+1  # 0 (falsy) when there is no '1' at all
    if one:
        sdx = sdx[:one] + ''.join(c for c in sdx[one:] if c != '1')

    # remove repeating characters
    sdx = _delete_consecutive_repeats(sdx)

    # return as an int (NaN signals "nothing to encode")
    return int(sdx) if sdx else float('NaN')
133
134
135
def russell_index_num_to_alpha(num):
    """Convert the Russell Index integer to an alphabetic string.

    This follows Robert C. Russell's Index algorithm, as described in
    :cite:`Russell:1917`.

    Characters of ``num`` other than the digits 1-8 are ignored, so a value
    with no such digits (e.g. NaN) yields the empty string.

    :param int num: a Russell Index integer value
    :returns: the Russell Index as an alphabetic string
    :rtype: str

    >>> russell_index_num_to_alpha(3813428)
    'CRACDBR'
    >>> russell_index_num_to_alpha(715)
    'NAL'
    >>> russell_index_num_to_alpha(3614)
    'CMAD'
    """
    _russell_num_translation = dict(zip('12345678', 'ABCDLMNR'))

    # Filter and translate in a single pass over the digits.
    return ''.join(_russell_num_translation[digit]
                   for digit in text_type(num)
                   if digit in _russell_num_translation)
159
160
161
def russell_index_alpha(word):
    """Return the Russell Index (alphabetic output) for the word.

    This follows Robert C. Russell's Index algorithm, as described in
    :cite:`Russell:1917`.

    :param str word: the word to transform
    :returns: the Russell Index value as an alphabetic string (empty if
        the word is empty)
    :rtype: str

    >>> russell_index_alpha('Christopher')
    'CRACDBR'
    >>> russell_index_alpha('Niall')
    'NAL'
    >>> russell_index_alpha('Smith')
    'CMAD'
    >>> russell_index_alpha('Schmidt')
    'CMAD'
    """
    if not word:
        return ''
    # Compute the numeric index first, then map its digits to letters.
    return russell_index_num_to_alpha(russell_index(word))
183
184
185
def soundex(word, maxlength=4, var='American', reverse=False, zero_pad=True):
    """Return the Soundex code for a word.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 4);
        values are clamped to the range 4..64 and None means 64
    :param str var: the variant of the algorithm to employ (defaults to
        'American'):

        - 'American' follows the American Soundex algorithm, as described at
          :cite:`US:2007` and in :cite:`Knuth:1998`; this is also called
          Miracode
        - 'special' follows the rules from the 1880-1910 US Census
          retrospective re-analysis, in which h & w are not treated as blocking
          consonants but as vowels. Cf. :cite:`Repici:2013`.
        - 'Census' follows the rules laid out in GIL 55 :cite:`US:1997` by the
          US Census, including coding prefixed and unprefixed versions of some
          names

    :param bool reverse: reverse the word before computing the selected Soundex
        (defaults to False); This results in "Reverse Soundex"
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Soundex value; with ``var='Census'`` and a word that starts
        with one of the recognized prefixes, a 2-tuple of the codes of the
        prefixed and unprefixed forms is returned instead
    :rtype: str

    >>> soundex("Christopher")
    'C623'
    >>> soundex("Niall")
    'N400'
    >>> soundex('Smith')
    'S530'
    >>> soundex('Schmidt')
    'S530'


    >>> soundex('Christopher', maxlength=_INFINITY)
    'C623160000000000000000000000000000000000000000000000000000000000'
    >>> soundex('Christopher', maxlength=_INFINITY, zero_pad=False)
    'C62316'

    >>> soundex('Christopher', reverse=True)
    'R132'

    >>> soundex('Ashcroft')
    'A261'
    >>> soundex('Asicroft')
    'A226'
    >>> soundex('Ashcroft', var='special')
    'A226'
    >>> soundex('Asicroft', var='special')
    'A226'
    """
    # A-Z code points mapped to Soundex digits: 0 marks vowels (and Y),
    # 9 marks H and W, which get variant-specific treatment below.
    _soundex_translation = dict(zip((ord(char) for char in
                                     'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                    '01230129022455012623019202'))

    # Require a maxlength of at least 4 and not more than 64
    if maxlength is not None:
        maxlength = min(max(4, maxlength), 64)
    else:
        maxlength = 64

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    if var == 'Census':
        # Should these prefixes be supplemented? (VANDE, DELA, VON)
        if word[:3] in {'VAN', 'CON'} and len(word) > 4:
            return (soundex(word, maxlength, 'American', reverse, zero_pad),
                    soundex(word[3:], maxlength, 'American', reverse,
                            zero_pad))
        if word[:2] in {'DE', 'DI', 'LA', 'LE'} and len(word) > 3:
            return (soundex(word, maxlength, 'American', reverse, zero_pad),
                    soundex(word[2:], maxlength, 'American', reverse,
                            zero_pad))
        # Otherwise, proceed as usual (var='American' mode, ostensibly)

    word = ''.join(char for char in word
                   if ord(char) in _soundex_translation)

    # Nothing to convert, return base case
    if not word:
        if zero_pad:
            return '0'*maxlength
        return '0'

    # Reverse word if computing Reverse Soundex
    if reverse:
        word = word[::-1]

    # apply the Soundex algorithm
    sdx = word.translate(_soundex_translation)

    if var == 'special':
        # special rule for 1880-1910 census: H/W behave like vowels, so
        # they separate equal codes instead of being transparent
        sdx = sdx.replace('9', '0')
    else:
        sdx = sdx.replace('9', '')  # rule 1
    sdx = _delete_consecutive_repeats(sdx)  # rule 3

    # The code always starts with the word's first letter. Outside the
    # 'special' variant a leading H/W has already lost its digit, so the
    # letter is prepended; otherwise it replaces the first digit. (In the
    # 'special' variant the leading '0' is dropped by the next step.)
    if word[0] in 'HW':
        sdx = word[0] + sdx
    else:
        sdx = word[0] + sdx[1:]
    sdx = sdx.replace('0', '')  # rule 1

    if zero_pad:
        sdx += ('0'*maxlength)  # rule 4

    return sdx[:maxlength]
297
298
299
def refined_soundex(word, maxlength=_INFINITY, reverse=False, zero_pad=False,
                    retain_vowels=False):
    """Return the Refined Soundex code for a word.

    This is Soundex, but with more character classes. It was defined at
    :cite:`Boyce:1998`.

    :param word: the word to transform
    :param maxlength: the length of the code returned (defaults to unlimited;
        None is also treated as unlimited)
    :param reverse: reverse the word before computing the selected Soundex
        (defaults to False); This results in "Reverse Soundex"
    :param zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string (only applies when maxlength is finite)
    :param retain_vowels: retain vowels (as 0) in the resulting code
    :returns: the Refined Soundex value
    :rtype: str

    >>> refined_soundex('Christopher')
    'C393619'
    >>> refined_soundex('Niall')
    'N87'
    >>> refined_soundex('Smith')
    'S386'
    >>> refined_soundex('Schmidt')
    'S386'
    """
    _ref_soundex_translation = dict(zip((ord(char) for char in
                                         'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                        '01360240043788015936020505'))

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(char for char in word
                   if ord(char) in _ref_soundex_translation)

    # Reverse word if computing Reverse Soundex
    if reverse:
        word = word[::-1]

    # apply the Soundex algorithm: first letter, then the digits of every
    # letter (including the first)
    sdx = word[:1] + word.translate(_ref_soundex_translation)
    sdx = _delete_consecutive_repeats(sdx)
    if not retain_vowels:
        sdx = sdx.replace('0', '')  # Delete vowels, H, W, Y

    # None is accepted as "unlimited", like the other Soundex variants;
    # comparing None with a float would raise TypeError on Python 3.
    if maxlength is not None and maxlength < _INFINITY:
        if zero_pad:
            sdx += ('0' * maxlength)
        if maxlength:
            sdx = sdx[:maxlength]

    return sdx
354
355
356
def dm_soundex(word, maxlength=6, reverse=False, zero_pad=True):
    """Return the Daitch-Mokotoff Soundex code for a word.

    Based on Daitch-Mokotoff Soundex :cite:`Mokotoff:1997`, this returns values
    of a word as a set. A collection is necessary since there can be multiple
    values for a single word.

    :param word: the word to transform
    :param maxlength: the length of the code returned (defaults to 6);
        values are clamped to the range 6..64 and None means 64
    :param reverse: reverse the word before computing the selected Soundex
        (defaults to False); This results in "Reverse Soundex"
    :param zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Daitch-Mokotoff Soundex value(s)
    :rtype: set of str

    >>> sorted(dm_soundex('Christopher'))
    ['494379', '594379']
    >>> dm_soundex('Niall')
    {'680000'}
    >>> dm_soundex('Smith')
    {'463000'}
    >>> dm_soundex('Schmidt')
    {'463000'}

    >>> sorted(dm_soundex('The quick brown fox', maxlength=20, zero_pad=False))
    ['35457976754', '3557976754']
    """
    # Each entry maps a letter sequence to its codes in three contexts:
    # (at the start of the word, before a vowel, anywhere else).
    # '_' is a placeholder for "not coded"; a nested tuple holds two
    # alternative codes, which makes the result branch.
    _dms_table = {'STCH': (2, 4, 4), 'DRZ': (4, 4, 4), 'ZH': (4, 4, 4),
                  'ZHDZH': (2, 4, 4), 'DZH': (4, 4, 4), 'DRS': (4, 4, 4),
                  'DZS': (4, 4, 4), 'SCHTCH': (2, 4, 4), 'SHTSH': (2, 4, 4),
                  'SZCZ': (2, 4, 4), 'TZS': (4, 4, 4), 'SZCS': (2, 4, 4),
                  'STSH': (2, 4, 4), 'SHCH': (2, 4, 4), 'D': (3, 3, 3),
                  'H': (5, 5, '_'), 'TTSCH': (4, 4, 4), 'THS': (4, 4, 4),
                  'L': (8, 8, 8), 'P': (7, 7, 7), 'CHS': (5, 54, 54),
                  'T': (3, 3, 3), 'X': (5, 54, 54), 'OJ': (0, 1, '_'),
                  'OI': (0, 1, '_'), 'SCHTSH': (2, 4, 4), 'OY': (0, 1, '_'),
                  'Y': (1, '_', '_'), 'TSH': (4, 4, 4), 'ZDZ': (2, 4, 4),
                  'TSZ': (4, 4, 4), 'SHT': (2, 43, 43), 'SCHTSCH': (2, 4, 4),
                  'TTSZ': (4, 4, 4), 'TTZ': (4, 4, 4), 'SCH': (4, 4, 4),
                  'TTS': (4, 4, 4), 'SZD': (2, 43, 43), 'AI': (0, 1, '_'),
                  'PF': (7, 7, 7), 'TCH': (4, 4, 4), 'PH': (7, 7, 7),
                  'TTCH': (4, 4, 4), 'SZT': (2, 43, 43), 'ZDZH': (2, 4, 4),
                  'EI': (0, 1, '_'), 'G': (5, 5, 5), 'EJ': (0, 1, '_'),
                  'ZD': (2, 43, 43), 'IU': (1, '_', '_'), 'K': (5, 5, 5),
                  'O': (0, '_', '_'), 'SHTCH': (2, 4, 4), 'S': (4, 4, 4),
                  'TRZ': (4, 4, 4), 'SHD': (2, 43, 43), 'DSH': (4, 4, 4),
                  'CSZ': (4, 4, 4), 'EU': (1, 1, '_'), 'TRS': (4, 4, 4),
                  'ZS': (4, 4, 4), 'STRZ': (2, 4, 4), 'UY': (0, 1, '_'),
                  'STRS': (2, 4, 4), 'CZS': (4, 4, 4),
                  'MN': ('6_6', '6_6', '6_6'), 'UI': (0, 1, '_'),
                  'UJ': (0, 1, '_'), 'UE': (0, '_', '_'), 'EY': (0, 1, '_'),
                  'W': (7, 7, 7), 'IA': (1, '_', '_'), 'FB': (7, 7, 7),
                  'STSCH': (2, 4, 4), 'SCHT': (2, 43, 43),
                  'NM': ('6_6', '6_6', '6_6'), 'SCHD': (2, 43, 43),
                  'B': (7, 7, 7), 'DSZ': (4, 4, 4), 'F': (7, 7, 7),
                  'N': (6, 6, 6), 'CZ': (4, 4, 4), 'R': (9, 9, 9),
                  'U': (0, '_', '_'), 'V': (7, 7, 7), 'CS': (4, 4, 4),
                  'Z': (4, 4, 4), 'SZ': (4, 4, 4), 'TSCH': (4, 4, 4),
                  'KH': (5, 5, 5), 'ST': (2, 43, 43), 'KS': (5, 54, 54),
                  'SH': (4, 4, 4), 'SC': (2, 4, 4), 'SD': (2, 43, 43),
                  'DZ': (4, 4, 4), 'ZHD': (2, 43, 43), 'DT': (3, 3, 3),
                  'ZSH': (4, 4, 4), 'DS': (4, 4, 4), 'TZ': (4, 4, 4),
                  'TS': (4, 4, 4), 'TH': (3, 3, 3), 'TC': (4, 4, 4),
                  'A': (0, '_', '_'), 'E': (0, '_', '_'), 'I': (0, '_', '_'),
                  'AJ': (0, 1, '_'), 'M': (6, 6, 6), 'Q': (5, 5, 5),
                  'AU': (0, 7, '_'), 'IO': (1, '_', '_'), 'AY': (0, 1, '_'),
                  'IE': (1, '_', '_'), 'ZSCH': (4, 4, 4),
                  'CH': ((5, 4), (5, 4), (5, 4)),
                  'CK': ((5, 45), (5, 45), (5, 45)),
                  'C': ((5, 4), (5, 4), (5, 4)),
                  'J': ((1, 4), ('_', 4), ('_', 4)),
                  'RZ': ((94, 4), (94, 4), (94, 4)),
                  'RS': ((94, 4), (94, 4), (94, 4))}

    # For each initial letter, the candidate sequences in the order they
    # must be tried (every list ends with the bare letter, so a match is
    # always found). The keys double as the set of accepted letters.
    _dms_order = {'A': ('AI', 'AJ', 'AU', 'AY', 'A'),
                  'B': ('B',),
                  'C': ('CHS', 'CSZ', 'CZS', 'CH', 'CK', 'CS', 'CZ', 'C'),
                  'D': ('DRS', 'DRZ', 'DSH', 'DSZ', 'DZH', 'DZS', 'DS', 'DT',
                        'DZ', 'D'),
                  'E': ('EI', 'EJ', 'EU', 'EY', 'E'),
                  'F': ('FB', 'F'),
                  'G': ('G',),
                  'H': ('H',),
                  'I': ('IA', 'IE', 'IO', 'IU', 'I'),
                  'J': ('J',),
                  'K': ('KH', 'KS', 'K'),
                  'L': ('L',),
                  'M': ('MN', 'M'),
                  'N': ('NM', 'N'),
                  'O': ('OI', 'OJ', 'OY', 'O'),
                  'P': ('PF', 'PH', 'P'),
                  'Q': ('Q',),
                  'R': ('RS', 'RZ', 'R'),
                  'S': ('SCHTSCH', 'SCHTCH', 'SCHTSH', 'SHTCH', 'SHTSH',
                        'STSCH', 'SCHD', 'SCHT', 'SHCH', 'STCH', 'STRS',
                        'STRZ', 'STSH', 'SZCS', 'SZCZ', 'SCH', 'SHD', 'SHT',
                        'SZD', 'SZT', 'SC', 'SD', 'SH', 'ST', 'SZ', 'S'),
                  'T': ('TTSCH', 'TSCH', 'TTCH', 'TTSZ', 'TCH', 'THS', 'TRS',
                        'TRZ', 'TSH', 'TSZ', 'TTS', 'TTZ', 'TZS', 'TC', 'TH',
                        'TS', 'TZ', 'T'),
                  'U': ('UE', 'UI', 'UJ', 'UY', 'U'),
                  'V': ('V',),
                  'W': ('W',),
                  'X': ('X',),
                  'Y': ('Y',),
                  'Z': ('ZHDZH', 'ZDZH', 'ZSCH', 'ZDZ', 'ZHD', 'ZSH', 'ZD',
                        'ZH', 'ZS', 'Z')}

    _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}
    dms = ['']  # initialize empty code list

    # Require a maxlength of at least 6 and not more than 64
    if maxlength is not None:
        maxlength = min(max(6, maxlength), 64)
    else:
        maxlength = 64

    # uppercase, normalize, decompose, and filter non-A-Z
    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(char for char in word if char in _dms_order)

    # Nothing to convert, return base case
    if not word:
        if zero_pad:
            return {'0'*maxlength}
        return {'0'}

    # Reverse word if computing Reverse Soundex
    if reverse:
        word = word[::-1]

    pos = 0
    while pos < len(word):
        # Iterate through _dms_order, which specifies the possible substrings
        # for which codes exist in the Daitch-Mokotoff coding
        for sstr in _dms_order[word[pos]]:  # pragma: no branch
            if word.startswith(sstr, pos):
                # Having determined a valid substring start, retrieve the code
                dm_val = _dms_table[sstr]
                next_pos = pos + len(sstr)

                # Having retrieved the code (triple), determine the correct
                # positional variant (first, pre-vocalic, elsewhere)
                if pos == 0:
                    dm_val = dm_val[0]
                elif next_pos < len(word) and word[next_pos] in _vowels:
                    dm_val = dm_val[1]
                else:
                    dm_val = dm_val[2]

                # Build the code strings; a tuple means two alternatives,
                # so every code collected so far is forked into both.
                if isinstance(dm_val, tuple):
                    dms = ([code + text_type(dm_val[0]) for code in dms] +
                           [code + text_type(dm_val[1]) for code in dms])
                else:
                    dms = [code + text_type(dm_val) for code in dms]
                pos = next_pos
                break

    # Filter out double letters and _ placeholders. The placeholders are
    # removed only after de-duplication so that they keep equal codes on
    # either side of them apart (e.g. '6_6').
    dms = (''.join(c for c in _delete_consecutive_repeats(code) if c != '_')
           for code in dms)

    # Trim codes and return set
    if zero_pad:
        dms = ((code + ('0'*maxlength))[:maxlength] for code in dms)
    else:
        dms = (code[:maxlength] for code in dms)
    return set(dms)
530
531
532
def koelner_phonetik(word):
    """Return the Kölner Phonetik (numeric output) code for a word.

    Based on the algorithm defined by :cite:`Postel:1969`.

    While the output code is numeric, it is still a str because 0s can lead
    the code.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as a numeric string
    :rtype: str

    >>> koelner_phonetik('Christopher')
    '478237'
    >>> koelner_phonetik('Niall')
    '65'
    >>> koelner_phonetik('Smith')
    '862'
    >>> koelner_phonetik('Schmidt')
    '862'
    >>> koelner_phonetik('Müller')
    '657'
    >>> koelner_phonetik('Zimmermann')
    '86766'
    """
    # pylint: disable=too-many-branches
    def _after(word, pos, letters):
        """Return True if word[pos] follows one of the supplied letters."""
        return pos > 0 and word[pos-1] in letters

    def _before(word, pos, letters):
        """Return True if word[pos] precedes one of the supplied letters."""
        return pos+1 < len(word) and word[pos+1] in letters

    _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}
    _alphabet = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    word = text_type(word.upper())
    word = word.replace('ß', 'SS')

    # Expand umlauts *before* NFKD normalization: normalization splits the
    # precomposed letters into base letter + combining mark, after which
    # these replacements could never match. (Both spellings are coded as
    # vowels, so the resulting code is the same either way.)
    word = word.replace('Ä', 'AE')
    word = word.replace('Ö', 'OE')
    word = word.replace('Ü', 'UE')

    word = normalize('NFKD', word)
    word = ''.join(char for char in word if char in _alphabet)

    # Nothing to convert, return base case
    if not word:
        return ''

    codes = []
    for i, char in enumerate(word):
        if char in _vowels:
            codes.append('0')
        elif char == 'B':
            codes.append('1')
        elif char == 'P':
            codes.append('3' if _before(word, i, {'H'}) else '1')
        elif char in {'D', 'T'}:
            codes.append('8' if _before(word, i, {'C', 'S', 'Z'}) else '2')
        elif char in {'F', 'V', 'W'}:
            codes.append('3')
        elif char in {'G', 'K', 'Q'}:
            codes.append('4')
        elif char == 'C':
            if _after(word, i, {'S', 'Z'}):
                codes.append('8')
            elif i == 0:
                # word-initial C has a larger set of "hard" followers
                if _before(word, i, {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U',
                                     'X'}):
                    codes.append('4')
                else:
                    codes.append('8')
            elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}):
                codes.append('4')
            else:
                codes.append('8')
        elif char == 'X':
            codes.append('8' if _after(word, i, {'C', 'K', 'Q'}) else '48')
        elif char == 'L':
            codes.append('5')
        elif char in {'M', 'N'}:
            codes.append('6')
        elif char == 'R':
            codes.append('7')
        elif char in {'S', 'Z'}:
            codes.append('8')
        # any other letter (i.e. H) contributes no code

    sdx = _delete_consecutive_repeats(''.join(codes))

    # A 0 (vowel) is only kept in the leading position
    if sdx:
        sdx = sdx[:1] + sdx[1:].replace('0', '')

    return sdx
641
642
643
def koelner_phonetik_num_to_alpha(num):
    """Convert a Kölner Phonetik code from numeric to alphabetic.

    Characters of ``num`` other than the digits 0-8 are ignored.

    :param str num: a numeric Kölner Phonetik representation (an int is
        accepted as well, though it cannot carry a leading 0)
    :returns: an alphabetic representation of the same word
    :rtype: str

    >>> koelner_phonetik_num_to_alpha(862)
    'SNT'
    >>> koelner_phonetik_num_to_alpha(657)
    'NLR'
    >>> koelner_phonetik_num_to_alpha(86766)
    'SNRNN'
    """
    _koelner_num_translation = dict(zip('012345678', 'APTFKLNRS'))

    # Filter and translate in a single pass over the digits.
    return ''.join(_koelner_num_translation[digit]
                   for digit in text_type(num)
                   if digit in _koelner_num_translation)
662
663
664
def koelner_phonetik_alpha(word):
    """Return the Kölner Phonetik (alphabetic output) code for a word.

    This is the numeric Kölner Phonetik code of the word with each digit
    mapped to a representative letter.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as an alphabetic string
    :rtype: str

    >>> koelner_phonetik_alpha('Smith')
    'SNT'
    >>> koelner_phonetik_alpha('Schmidt')
    'SNT'
    >>> koelner_phonetik_alpha('Müller')
    'NLR'
    >>> koelner_phonetik_alpha('Zimmermann')
    'SNRNN'
    """
    numeric_code = koelner_phonetik(word)
    return koelner_phonetik_num_to_alpha(numeric_code)
681
682
683
def nysiis(word, maxlength=6, modified=False):
    """Return the NYSIIS code for a word.

    The New York State Identification and Intelligence System algorithm is
    defined in :cite:`Taft:1970`.

    The modified version of this algorithm is described in Appendix B of
    :cite:`Lynch:1977`.

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 6) of the code to
        return; values below 6 are raised to 6, and a falsy value or
        infinity disables truncation
    :param bool modified: indicates whether to use USDA modified NYSIIS
    :returns: the NYSIIS value; in modified mode the literal string
        ``'ERROR'`` is returned for words ending in JR or SR
    :rtype: str

    >>> nysiis('Christopher')
    'CRASTA'
    >>> nysiis('Niall')
    'NAL'
    >>> nysiis('Smith')
    'SNAT'
    >>> nysiis('Schmidt')
    'SNAD'

    >>> nysiis('Christopher', maxlength=_INFINITY)
    'CRASTAFAR'

    >>> nysiis('Christopher', maxlength=8, modified=True)
    'CRASTAFA'
    >>> nysiis('Niall', maxlength=8, modified=True)
    'NAL'
    >>> nysiis('Smith', maxlength=8, modified=True)
    'SNAT'
    >>> nysiis('Schmidt', maxlength=8, modified=True)
    'SNAD'
    """
    # Require a maxlength of at least 6
    if maxlength:
        maxlength = max(6, maxlength)

    _vowels = {'A', 'E', 'I', 'O', 'U'}

    word = ''.join(c for c in word.upper() if c.isalpha())
    word = word.replace('ß', 'SS')

    # exit early if there are no alphas
    if not word:
        return ''

    # Only consulted in modified mode (to restore a leading vowel at the
    # end), but bound unconditionally so it can never be undefined.
    original_first_char = word[0]

    # --- prefix rules ---
    if word[:3] == 'MAC':
        word = 'MCC'+word[3:]
    elif word[:2] == 'KN':
        word = 'NN'+word[2:]
    elif word[:1] == 'K':
        word = 'C'+word[1:]
    elif word[:2] in {'PH', 'PF'}:
        word = 'FF'+word[2:]
    elif word[:3] == 'SCH':
        word = 'SSS'+word[3:]
    elif modified:
        if word[:2] in {'WR', 'RH'}:
            word = 'RR'+word[2:]
        elif word[:2] == 'DG':
            word = 'GG'+word[2:]
        elif word[:1] in _vowels:
            word = 'A'+word[1:]

    # --- suffix rules ---
    if modified and word[-1:] in {'S', 'Z'}:
        word = word[:-1]

    if word[-2:] in {'EE', 'IE'} or (modified and word[-2:] == 'YE'):
        word = word[:-2]+'Y'
    elif word[-2:] in {'DT', 'RT', 'RD'}:
        word = word[:-2]+'D'
    elif word[-2:] in {'NT', 'ND'}:
        word = word[:-2]+('N' if modified else 'D')
    elif modified:
        if word[-2:] == 'IX':
            word = word[:-2]+'ICK'
        elif word[-2:] == 'EX':
            word = word[:-2]+'ECK'
        elif word[-2:] in {'JR', 'SR'}:
            # NOTE: errors are signalled with a sentinel string rather
            # than an exception; how best to report them is undecided.
            return 'ERROR'

    key = word[:1]

    # --- main pass ---
    # The word is rewritten in place while being scanned. `skip` is the
    # number of following positions that belong to a multi-letter
    # replacement made at position i: those positions are not processed
    # again, and the whole replacement is appended to the key at once.
    # The range is fixed from the length at loop entry, so the first check
    # guards against the word having become shorter in the meantime.
    skip = 0
    for i in range(1, len(word)):
        if i >= len(word):
            continue
        elif skip:
            skip -= 1
            continue
        elif word[i:i+2] == 'EV':
            word = word[:i] + 'AF' + word[i+2:]
            skip = 1
        elif word[i] in _vowels:
            word = word[:i] + 'A' + word[i+1:]
        elif modified and i != len(word)-1 and word[i] == 'Y':
            word = word[:i] + 'A' + word[i+1:]
        elif word[i] == 'Q':
            word = word[:i] + 'G' + word[i+1:]
        elif word[i] == 'Z':
            word = word[:i] + 'S' + word[i+1:]
        elif word[i] == 'M':
            word = word[:i] + 'N' + word[i+1:]
        elif word[i:i+2] == 'KN':
            word = word[:i] + 'N' + word[i+2:]
        elif word[i] == 'K':
            word = word[:i] + 'C' + word[i+1:]
        elif modified and i == len(word)-3 and word[i:i+3] == 'SCH':
            word = word[:i] + 'SSA'
            skip = 2
        elif word[i:i+3] == 'SCH':
            word = word[:i] + 'SSS' + word[i+3:]
            skip = 2
        elif modified and i == len(word)-2 and word[i:i+2] == 'SH':
            word = word[:i] + 'SA'
            skip = 1
        elif word[i:i+2] == 'SH':
            word = word[:i] + 'SS' + word[i+2:]
            skip = 1
        elif word[i:i+2] == 'PH':
            word = word[:i] + 'FF' + word[i+2:]
            skip = 1
        elif modified and word[i:i+3] == 'GHT':
            word = word[:i] + 'TTT' + word[i+3:]
            skip = 2
        elif modified and word[i:i+2] == 'DG':
            word = word[:i] + 'GG' + word[i+2:]
            skip = 1
        elif modified and word[i:i+2] == 'WR':
            word = word[:i] + 'RR' + word[i+2:]
            skip = 1
        elif word[i] == 'H' and (word[i-1] not in _vowels or
                                 word[i+1:i+2] not in _vowels):
            # H not flanked by vowels on both sides copies its predecessor
            # (at the end of the word the empty slice is "not a vowel")
            word = word[:i] + word[i-1] + word[i+1:]
        elif word[i] == 'W' and word[i-1] in _vowels:
            word = word[:i] + word[i-1] + word[i+1:]

        # Append the (possibly rewritten) segment unless it merely repeats
        # the last key character
        if word[i:i+skip+1] != key[-1:]:
            key += word[i:i+skip+1]

    key = _delete_consecutive_repeats(key)

    # --- final clean-up of the key ---
    if key[-1:] == 'S':
        key = key[:-1]
    if key[-2:] == 'AY':
        key = key[:-2] + 'Y'
    if key[-1:] == 'A':
        key = key[:-1]
    if modified and key[:1] == 'A':
        key = original_first_char + key[1:]

    if maxlength and maxlength < _INFINITY:
        key = key[:maxlength]

    return key
847
848
849
def mra(word):
    """Return the MRA personal numeric identifier (PNI) for a word.

    The Western Airlines Surname Match Rating Algorithm is described on
    page 18 of :cite:`Moore:1977`.

    The word is upper-cased, every vowel (A, E, I, O, U) after the first
    character is dropped, consecutive repeats are collapsed, and a result
    longer than six characters is reduced to its first three plus its last
    three characters.

    :param str word: the word to transform
    :returns: the MRA PNI (a falsy input is returned unchanged)
    :rtype: str

    >>> mra('Christopher')
    'CHRPHR'
    >>> mra('Niall')
    'NL'
    >>> mra('Smith')
    'SMTH'
    >>> mra('Schmidt')
    'SCHMDT'
    """
    if not word:
        return word

    vowels = {'A', 'E', 'I', 'O', 'U'}
    normalized = word.upper().replace('ß', 'SS')

    # The leading character always survives, even when it is a vowel.
    head = normalized[0]
    tail = [char for char in normalized[1:] if char not in vowels]
    code = _delete_consecutive_repeats(head + ''.join(tail))

    if len(code) <= 6:
        return code
    # Too long: keep only the first three and last three characters.
    return code[:3] + code[-3:]
878
879
880
def metaphone(word, maxlength=_INFINITY):
    """Return the Metaphone code for a word.

    Based on Lawrence Philips' Pick BASIC code from 1990 :cite:`Philips:1990`,
    as described in :cite:`Philips:1990b`.
    This incorporates some corrections to the above code, particularly
    some of those suggested by Michael Kuhn in :cite:`Kuhn:1995`.

    :param str word: the word to transform
    :param int maxlength: the maximum length of the returned Metaphone code
        (defaults to unlimited, but in Philips' original implementation
        this was 4); values below 4 are raised to 4 and None is treated
        as 64
    :returns: the Metaphone value, never longer than the effective
        maxlength
    :rtype: str


    >>> metaphone('Christopher')
    'KRSTFR'
    >>> metaphone('Niall')
    'NL'
    >>> metaphone('Smith')
    'SM0'
    >>> metaphone('Schmidt')
    'SKMTT'
    """
    # pylint: disable=too-many-branches
    _vowels = {'A', 'E', 'I', 'O', 'U'}
    _frontv = {'E', 'I', 'Y'}
    # As in variable sound--those modified by adding an "h"
    _varson = {'C', 'G', 'P', 'S', 'T'}

    # Require a maxlength of at least 4
    if maxlength is not None:
        maxlength = max(4, maxlength)
    else:
        maxlength = 64

    # Delete nonalphanumeric characters and make all caps
    ename = ''.join(c for c in word.upper() if c.isalnum())
    ename = ename.replace('ß', 'SS')

    if not ename:
        return ''

    # Word-initial exceptions: drop the first letter of PN/AE/KN/GN/WR,
    # turn a leading X into S, and reduce a leading WH to W.
    if ename[0:2] in {'PN', 'AE', 'KN', 'GN', 'WR'}:
        ename = ename[1:]
    elif ename[0] == 'X':
        ename = 'S' + ename[1:]
    elif ename[0:2] == 'WH':
        ename = 'W' + ename[2:]

    # Convert to metaph
    elen = len(ename)-1  # index of the last character
    metaph = ''
    for i, char in enumerate(ename):
        if len(metaph) >= maxlength:
            break
        # Skip the second of a doubled letter (G and T are handled below)
        if char not in {'G', 'T'} and i > 0 and ename[i-1] == char:
            continue

        if char in _vowels and i == 0:
            # A vowel is only encoded when it begins the word
            metaph = char

        elif char == 'B':
            # B is dropped only when it ends the word right after an M
            if i != elen or ename[i-1] != 'M':
                metaph += char

        elif char == 'C':
            # C contributes nothing in SCE/SCI/SCY
            if not (i > 0 and ename[i-1] == 'S' and ename[i+1:i+2] in _frontv):
                if ename[i+1:i+3] == 'IA':
                    metaph += 'X'
                elif ename[i+1:i+2] in _frontv:
                    metaph += 'S'
                elif i > 0 and ename[i-1:i+2] == 'SCH':
                    metaph += 'K'
                elif ename[i+1:i+2] == 'H':
                    if i == 0 and i+1 < elen and ename[i+2:i+3] not in _vowels:
                        metaph += 'K'
                    else:
                        metaph += 'X'
                else:
                    metaph += 'K'

        elif char == 'D':
            if ename[i+1:i+2] == 'G' and ename[i+2:i+3] in _frontv:
                metaph += 'J'
            else:
                metaph += 'T'

        elif char == 'G':
            if ename[i+1:i+2] == 'H' and not (i+1 == elen or
                                              ename[i+2:i+3] not in _vowels):
                continue
            elif i > 0 and ((i+1 == elen and ename[i+1] == 'N') or
                            (i+3 == elen and ename[i+1:i+4] == 'NED')):
                continue
            elif (i-1 > 0 and i+1 <= elen and ename[i-1] == 'D' and
                  ename[i+1] in _frontv):
                continue
            elif ename[i+1:i+2] == 'G':
                continue
            elif ename[i+1:i+2] in _frontv:
                if i == 0 or ename[i-1] != 'G':
                    metaph += 'J'
                else:
                    metaph += 'K'
            else:
                metaph += 'K'

        elif char == 'H':
            if ((i > 0 and ename[i-1] in _vowels and
                 ename[i+1:i+2] not in _vowels)):
                continue
            elif i > 0 and ename[i-1] in _varson:
                continue
            else:
                metaph += 'H'

        elif char in {'F', 'J', 'L', 'M', 'N', 'R'}:
            metaph += char

        elif char == 'K':
            if i > 0 and ename[i-1] == 'C':
                continue
            else:
                metaph += 'K'

        elif char == 'P':
            if ename[i+1:i+2] == 'H':
                metaph += 'F'
            else:
                metaph += 'P'

        elif char == 'Q':
            metaph += 'K'

        elif char == 'S':
            if ((i > 0 and i+2 <= elen and ename[i+1] == 'I' and
                 ename[i+2] in 'OA')):
                metaph += 'X'
            elif ename[i+1:i+2] == 'H':
                metaph += 'X'
            else:
                metaph += 'S'

        elif char == 'T':
            if ((i > 0 and i+2 <= elen and ename[i+1] == 'I' and
                 ename[i+2] in {'A', 'O'})):
                metaph += 'X'
            elif ename[i+1:i+2] == 'H':
                metaph += '0'
            elif ename[i+1:i+3] != 'CH':
                if ename[i-1:i] != 'T':
                    metaph += 'T'

        elif char == 'V':
            metaph += 'F'

        elif char in 'WY':
            # W and Y are only kept when a vowel follows
            if ename[i+1:i+2] in _vowels:
                metaph += char

        elif char == 'X':
            metaph += 'KS'

        elif char == 'Z':
            metaph += 'S'

    # 'X' appends two symbols at once, so the loop guard alone can overshoot
    # the limit by one. An infinite maxlength never satisfies this test, so
    # the slice is only taken with a finite bound.
    if len(metaph) > maxlength:
        metaph = metaph[:int(maxlength)]

    return metaph
1049
1050
1051
def double_metaphone(word, maxlength=_INFINITY):
1052
    """Return the Double Metaphone code for a word.
1053
1054
    Based on Lawrence Philips' (Visual) C++ code from 1999
1055
    :cite:`Philips:2000`.
1056
1057
    :param word: the word to transform
1058
    :param maxlength: the maximum length of the returned Double Metaphone codes
1059
        (defaults to unlimited, but in Philips' original implementation this
1060
        was 4)
1061
    :returns: the Double Metaphone value(s)
1062
    :rtype: tuple
1063
1064
    >>> double_metaphone('Christopher')
1065
    ('KRSTFR', '')
1066
    >>> double_metaphone('Niall')
1067
    ('NL', '')
1068
    >>> double_metaphone('Smith')
1069
    ('SM0', 'XMT')
1070
    >>> double_metaphone('Schmidt')
1071
    ('XMT', 'SMT')
1072
    """
1073
    # pylint: disable=too-many-branches
1074
    # Require a maxlength of at least 4
1075
    if maxlength is not None:
1076
        maxlength = max(4, maxlength)
1077
    else:
1078
        maxlength = 64
1079
1080
    primary = ''
1081
    secondary = ''
1082
1083
    def _slavo_germanic():
1084
        """Return True if the word appears to be Slavic or Germanic."""
1085
        if 'W' in word or 'K' in word or 'CZ' in word:
1086
            return True
1087
        return False
1088
1089
    def _metaph_add(pri, sec=''):
1090
        """Return a new metaphone tuple with the supplied elements."""
1091
        newpri = primary
1092
        newsec = secondary
1093
        if pri:
1094
            newpri += pri
1095
        if sec:
1096
            if sec != ' ':
1097
                newsec += sec
1098
        else:
1099
            newsec += pri
1100
        return (newpri, newsec)
1101
1102
    def _is_vowel(pos):
1103
        """Return True if the character at word[pos] is a vowel."""
1104
        if pos >= 0 and word[pos] in {'A', 'E', 'I', 'O', 'U', 'Y'}:
1105
            return True
1106
        return False
1107
1108
    def _get_at(pos):
1109
        """Return the character at word[pos]."""
1110
        return word[pos]
1111
1112
    def _string_at(pos, slen, substrings):
1113
        """Return True if word[pos:pos+slen] is in substrings."""
1114
        if pos < 0:
1115
            return False
1116
        return word[pos:pos+slen] in substrings
1117
1118
    current = 0
1119
    length = len(word)
1120
    if length < 1:
1121
        return ('', '')
1122
    last = length - 1
1123
1124
    word = word.upper()
1125
    word = word.replace('ß', 'SS')
1126
1127
    # Pad the original string so that we can index beyond the edge of the world
1128
    word += '     '
1129
1130
    # Skip these when at start of word
1131
    if word[0:2] in {'GN', 'KN', 'PN', 'WR', 'PS'}:
1132
        current += 1
1133
1134
    # Initial 'X' is pronounced 'Z' e.g. 'Xavier'
1135
    if _get_at(0) == 'X':
1136
        (primary, secondary) = _metaph_add('S')  # 'Z' maps to 'S'
1137
        current += 1
1138
1139
    # Main loop
1140
    while True:
0 ignored issues
show
unused-code introduced by
Too many nested blocks (6/5)
Loading history...
1141
        if current >= length:
1142
            break
1143
1144
        if _get_at(current) in {'A', 'E', 'I', 'O', 'U', 'Y'}:
1145
            if current == 0:
1146
                # All init vowels now map to 'A'
1147
                (primary, secondary) = _metaph_add('A')
1148
            current += 1
1149
            continue
1150
1151
        elif _get_at(current) == 'B':
1152
            # "-mb", e.g", "dumb", already skipped over...
1153
            (primary, secondary) = _metaph_add('P')
1154
            if _get_at(current + 1) == 'B':
1155
                current += 2
1156
            else:
1157
                current += 1
1158
            continue
1159
1160
        elif _get_at(current) == 'Ç':
1161
            (primary, secondary) = _metaph_add('S')
1162
            current += 1
1163
            continue
1164
1165
        elif _get_at(current) == 'C':
1166
            # Various Germanic
1167
            if (current > 1 and not _is_vowel(current - 2) and
0 ignored issues
show
best-practice introduced by
Too many boolean expressions in if statement (6/5)
Loading history...
1168
                    _string_at((current - 1), 3, {'ACH'}) and
1169
                    ((_get_at(current + 2) != 'I') and
1170
                     ((_get_at(current + 2) != 'E') or
1171
                      _string_at((current - 2), 6,
1172
                                 {'BACHER', 'MACHER'})))):
1173
                (primary, secondary) = _metaph_add('K')
1174
                current += 2
1175
                continue
1176
1177
            # Special case 'caesar'
1178
            elif current == 0 and _string_at(current, 6, {'CAESAR'}):
1179
                (primary, secondary) = _metaph_add('S')
1180
                current += 2
1181
                continue
1182
1183
            # Italian 'chianti'
1184
            elif _string_at(current, 4, {'CHIA'}):
1185
                (primary, secondary) = _metaph_add('K')
1186
                current += 2
1187
                continue
1188
1189
            elif _string_at(current, 2, {'CH'}):
1190
                # Find 'Michael'
1191
                if current > 0 and _string_at(current, 4, {'CHAE'}):
1192
                    (primary, secondary) = _metaph_add('K', 'X')
1193
                    current += 2
1194
                    continue
1195
1196
                # Greek roots e.g. 'chemistry', 'chorus'
1197
                elif (current == 0 and
1198
                      (_string_at((current + 1), 5,
1199
                                  {'HARAC', 'HARIS'}) or
1200
                       _string_at((current + 1), 3,
1201
                                  {'HOR', 'HYM', 'HIA', 'HEM'})) and
1202
                      not _string_at(0, 5, {'CHORE'})):
1203
                    (primary, secondary) = _metaph_add('K')
1204
                    current += 2
1205
                    continue
1206
1207
                # Germanic, Greek, or otherwise 'ch' for 'kh' sound
1208
                elif ((_string_at(0, 4, {'VAN ', 'VON '}) or
0 ignored issues
show
best-practice introduced by
Too many boolean expressions in if statement (7/5)
Loading history...
1209
                       _string_at(0, 3, {'SCH'})) or
1210
                      # 'architect but not 'arch', 'orchestra', 'orchid'
1211
                      _string_at((current - 2), 6,
1212
                                 {'ORCHES', 'ARCHIT', 'ORCHID'}) or
1213
                      _string_at((current + 2), 1, {'T', 'S'}) or
1214
                      ((_string_at((current - 1), 1,
1215
                                   {'A', 'O', 'U', 'E'}) or
1216
                        (current == 0)) and
1217
                       # e.g., 'wachtler', 'wechsler', but not 'tichner'
1218
                       _string_at((current + 2), 1,
1219
                                  {'L', 'R', 'N', 'M', 'B', 'H', 'F', 'V', 'W',
1220
                                   ' '}))):
1221
                    (primary, secondary) = _metaph_add('K')
1222
1223
                else:
1224
                    if current > 0:
1225
                        if _string_at(0, 2, {'MC'}):
1226
                            # e.g., "McHugh"
1227
                            (primary, secondary) = _metaph_add('K')
1228
                        else:
1229
                            (primary, secondary) = _metaph_add('X', 'K')
1230
                    else:
1231
                        (primary, secondary) = _metaph_add('X')
1232
1233
                current += 2
1234
                continue
1235
1236
            # e.g, 'czerny'
1237
            elif (_string_at(current, 2, {'CZ'}) and
1238
                  not _string_at((current - 2), 4, {'WICZ'})):
1239
                (primary, secondary) = _metaph_add('S', 'X')
1240
                current += 2
1241
                continue
1242
1243
            # e.g., 'focaccia'
1244
            elif _string_at((current + 1), 3, {'CIA'}):
1245
                (primary, secondary) = _metaph_add('X')
1246
                current += 3
1247
1248
            # double 'C', but not if e.g. 'McClellan'
1249
            elif (_string_at(current, 2, {'CC'}) and
1250
                  not ((current == 1) and (_get_at(0) == 'M'))):
1251
                # 'bellocchio' but not 'bacchus'
1252
                if ((_string_at((current + 2), 1,
1253
                                {'I', 'E', 'H'}) and
1254
                     not _string_at((current + 2), 2, ['HU']))):
1255
                    # 'accident', 'accede' 'succeed'
1256
                    if ((((current == 1) and _get_at(current - 1) == 'A') or
1257
                         _string_at((current - 1), 5,
1258
                                    {'UCCEE', 'UCCES'}))):
1259
                        (primary, secondary) = _metaph_add('KS')
1260
                    # 'bacci', 'bertucci', other italian
1261
                    else:
1262
                        (primary, secondary) = _metaph_add('X')
1263
                    current += 3
1264
                    continue
1265
                else:  # Pierce's rule
1266
                    (primary, secondary) = _metaph_add('K')
1267
                    current += 2
1268
                    continue
1269
1270
            elif _string_at(current, 2, {'CK', 'CG', 'CQ'}):
1271
                (primary, secondary) = _metaph_add('K')
1272
                current += 2
1273
                continue
1274
1275
            elif _string_at(current, 2, {'CI', 'CE', 'CY'}):
1276
                # Italian vs. English
1277
                if _string_at(current, 3, {'CIO', 'CIE', 'CIA'}):
1278
                    (primary, secondary) = _metaph_add('S', 'X')
1279
                else:
1280
                    (primary, secondary) = _metaph_add('S')
1281
                current += 2
1282
                continue
1283
1284
            # else
1285
            else:
1286
                (primary, secondary) = _metaph_add('K')
1287
1288
                # name sent in 'mac caffrey', 'mac gregor
1289
                if _string_at((current + 1), 2, {' C', ' Q', ' G'}):
1290
                    current += 3
1291
                elif (_string_at((current + 1), 1,
1292
                                 {'C', 'K', 'Q'}) and
1293
                      not _string_at((current + 1), 2, {'CE', 'CI'})):
1294
                    current += 2
1295
                else:
1296
                    current += 1
1297
                continue
1298
1299
        elif _get_at(current) == 'D':
1300
            if _string_at(current, 2, {'DG'}):
1301
                if _string_at((current + 2), 1, {'I', 'E', 'Y'}):
1302
                    # e.g. 'edge'
1303
                    (primary, secondary) = _metaph_add('J')
1304
                    current += 3
1305
                    continue
1306
                else:
1307
                    # e.g. 'edgar'
1308
                    (primary, secondary) = _metaph_add('TK')
1309
                    current += 2
1310
                    continue
1311
1312
            elif _string_at(current, 2, {'DT', 'DD'}):
1313
                (primary, secondary) = _metaph_add('T')
1314
                current += 2
1315
                continue
1316
1317
            # else
1318
            else:
1319
                (primary, secondary) = _metaph_add('T')
1320
                current += 1
1321
                continue
1322
1323
        elif _get_at(current) == 'F':
1324
            if _get_at(current + 1) == 'F':
1325
                current += 2
1326
            else:
1327
                current += 1
1328
            (primary, secondary) = _metaph_add('F')
1329
            continue
1330
1331
        elif _get_at(current) == 'G':
1332
            if _get_at(current + 1) == 'H':
1333
                if (current > 0) and not _is_vowel(current - 1):
1334
                    (primary, secondary) = _metaph_add('K')
1335
                    current += 2
1336
                    continue
1337
1338
                # 'ghislane', ghiradelli
1339
                elif current == 0:
1340
                    if _get_at(current + 2) == 'I':
1341
                        (primary, secondary) = _metaph_add('J')
1342
                    else:
1343
                        (primary, secondary) = _metaph_add('K')
1344
                    current += 2
1345
                    continue
1346
1347
                # Parker's rule (with some further refinements) - e.g., 'hugh'
1348
                elif (((current > 1) and
0 ignored issues
show
best-practice introduced by
Too many boolean expressions in if statement (6/5)
Loading history...
1349
                       _string_at((current - 2), 1, {'B', 'H', 'D'})) or
1350
                      # e.g., 'bough'
1351
                      ((current > 2) and
1352
                       _string_at((current - 3), 1, {'B', 'H', 'D'})) or
1353
                      # e.g., 'broughton'
1354
                      ((current > 3) and
1355
                       _string_at((current - 4), 1, {'B', 'H'}))):
1356
                    current += 2
1357
                    continue
1358
                else:
1359
                    # e.g. 'laugh', 'McLaughlin', 'cough',
1360
                    #      'gough', 'rough', 'tough'
1361
                    if ((current > 2) and
1362
                            (_get_at(current - 1) == 'U') and
1363
                            (_string_at((current - 3), 1,
1364
                                        {'C', 'G', 'L', 'R', 'T'}))):
1365
                        (primary, secondary) = _metaph_add('F')
1366
                    elif (current > 0) and _get_at(current - 1) != 'I':
1367
                        (primary, secondary) = _metaph_add('K')
1368
                    current += 2
1369
                    continue
1370
1371
            elif _get_at(current + 1) == 'N':
1372
                if (current == 1) and _is_vowel(0) and not _slavo_germanic():
1373
                    (primary, secondary) = _metaph_add('KN', 'N')
1374
                # not e.g. 'cagney'
1375
                elif (not _string_at((current + 2), 2, {'EY'}) and
1376
                      (_get_at(current + 1) != 'Y') and
1377
                      not _slavo_germanic()):
1378
                    (primary, secondary) = _metaph_add('N', 'KN')
1379
                else:
1380
                    (primary, secondary) = _metaph_add('KN')
1381
                current += 2
1382
                continue
1383
1384
            # 'tagliaro'
1385
            elif (_string_at((current + 1), 2, {'LI'}) and
1386
                  not _slavo_germanic()):
1387
                (primary, secondary) = _metaph_add('KL', 'L')
1388
                current += 2
1389
                continue
1390
1391
            # -ges-, -gep-, -gel-, -gie- at beginning
1392
            elif ((current == 0) and
1393
                  ((_get_at(current + 1) == 'Y') or
1394
                   _string_at((current + 1), 2, {'ES', 'EP', 'EB', 'EL', 'EY',
1395
                                                 'IB', 'IL', 'IN', 'IE', 'EI',
1396
                                                 'ER'}))):
1397
                (primary, secondary) = _metaph_add('K', 'J')
1398
                current += 2
1399
                continue
1400
1401
            #  -ger-,  -gy-
1402
            elif ((_string_at((current + 1), 2, {'ER'}) or
1403
                   (_get_at(current + 1) == 'Y')) and not
1404
                  _string_at(0, 6, {'DANGER', 'RANGER', 'MANGER'}) and not
1405
                  _string_at((current - 1), 1, {'E', 'I'}) and not
1406
                  _string_at((current - 1), 3, {'RGY', 'OGY'})):
1407
                (primary, secondary) = _metaph_add('K', 'J')
1408
                current += 2
1409
                continue
1410
1411
            #  italian e.g, 'biaggi'
1412
            elif (_string_at((current + 1), 1, {'E', 'I', 'Y'}) or
1413
                  _string_at((current - 1), 4, {'AGGI', 'OGGI'})):
1414
                # obvious germanic
1415
                if (((_string_at(0, 4, {'VAN ', 'VON '}) or
1416
                      _string_at(0, 3, {'SCH'})) or
1417
                     _string_at((current + 1), 2, {'ET'}))):
1418
                    (primary, secondary) = _metaph_add('K')
1419
                elif _string_at((current + 1), 4, {'IER '}):
1420
                    (primary, secondary) = _metaph_add('J')
1421
                else:
1422
                    (primary, secondary) = _metaph_add('J', 'K')
1423
                current += 2
1424
                continue
1425
1426
            else:
1427
                if _get_at(current + 1) == 'G':
1428
                    current += 2
1429
                else:
1430
                    current += 1
1431
                (primary, secondary) = _metaph_add('K')
1432
                continue
1433
1434
        elif _get_at(current) == 'H':
1435
            # only keep if first & before vowel or btw. 2 vowels
1436
            if ((((current == 0) or _is_vowel(current - 1)) and
1437
                 _is_vowel(current + 1))):
1438
                (primary, secondary) = _metaph_add('H')
1439
                current += 2
1440
            else:  # also takes care of 'HH'
1441
                current += 1
1442
            continue
1443
1444
        elif _get_at(current) == 'J':
1445
            # obvious spanish, 'jose', 'san jacinto'
1446
            if _string_at(current, 4, ['JOSE']) or _string_at(0, 4, {'SAN '}):
1447
                if ((((current == 0) and (_get_at(current + 4) == ' ')) or
1448
                     _string_at(0, 4, ['SAN ']))):
1449
                    (primary, secondary) = _metaph_add('H')
1450
                else:
1451
                    (primary, secondary) = _metaph_add('J', 'H')
1452
                current += 1
1453
                continue
1454
1455
            elif (current == 0) and not _string_at(current, 4, {'JOSE'}):
1456
                # Yankelovich/Jankelowicz
1457
                (primary, secondary) = _metaph_add('J', 'A')
1458
            # Spanish pron. of e.g. 'bajador'
1459
            elif (_is_vowel(current - 1) and
1460
                  not _slavo_germanic() and
1461
                  ((_get_at(current + 1) == 'A') or
1462
                   (_get_at(current + 1) == 'O'))):
1463
                (primary, secondary) = _metaph_add('J', 'H')
1464
            elif current == last:
1465
                (primary, secondary) = _metaph_add('J', ' ')
1466
            elif (not _string_at((current + 1), 1,
1467
                                 {'L', 'T', 'K', 'S', 'N', 'M', 'B', 'Z'}) and
1468
                  not _string_at((current - 1), 1, {'S', 'K', 'L'})):
1469
                (primary, secondary) = _metaph_add('J')
1470
1471
            if _get_at(current + 1) == 'J':  # it could happen!
1472
                current += 2
1473
            else:
1474
                current += 1
1475
            continue
1476
1477
        elif _get_at(current) == 'K':
1478
            if _get_at(current + 1) == 'K':
1479
                current += 2
1480
            else:
1481
                current += 1
1482
            (primary, secondary) = _metaph_add('K')
1483
            continue
1484
1485
        elif _get_at(current) == 'L':
1486
            if _get_at(current + 1) == 'L':
1487
                # Spanish e.g. 'cabrillo', 'gallegos'
1488
                if (((current == (length - 3)) and
1489
                     _string_at((current - 1), 4, {'ILLO', 'ILLA', 'ALLE'})) or
1490
                        ((_string_at((last - 1), 2, {'AS', 'OS'}) or
1491
                          _string_at(last, 1, {'A', 'O'})) and
1492
                         _string_at((current - 1), 4, {'ALLE'}))):
1493
                    (primary, secondary) = _metaph_add('L', ' ')
1494
                    current += 2
1495
                    continue
1496
                current += 2
1497
            else:
1498
                current += 1
1499
            (primary, secondary) = _metaph_add('L')
1500
            continue
1501
1502
        elif _get_at(current) == 'M':
1503
            if (((_string_at((current - 1), 3, {'UMB'}) and
1504
                  (((current + 1) == last) or
1505
                   _string_at((current + 2), 2, {'ER'}))) or
1506
                 # 'dumb', 'thumb'
1507
                 (_get_at(current + 1) == 'M'))):
1508
                current += 2
1509
            else:
1510
                current += 1
1511
            (primary, secondary) = _metaph_add('M')
1512
            continue
1513
1514
        elif _get_at(current) == 'N':
1515
            if _get_at(current + 1) == 'N':
1516
                current += 2
1517
            else:
1518
                current += 1
1519
            (primary, secondary) = _metaph_add('N')
1520
            continue
1521
1522
        elif _get_at(current) == 'Ñ':
1523
            current += 1
1524
            (primary, secondary) = _metaph_add('N')
1525
            continue
1526
1527
        elif _get_at(current) == 'P':
1528
            if _get_at(current + 1) == 'H':
1529
                (primary, secondary) = _metaph_add('F')
1530
                current += 2
1531
                continue
1532
1533
            # also account for "campbell", "raspberry"
1534
            elif _string_at((current + 1), 1, {'P', 'B'}):
1535
                current += 2
1536
            else:
1537
                current += 1
1538
            (primary, secondary) = _metaph_add('P')
1539
            continue
1540
1541
        elif _get_at(current) == 'Q':
1542
            if _get_at(current + 1) == 'Q':
1543
                current += 2
1544
            else:
1545
                current += 1
1546
            (primary, secondary) = _metaph_add('K')
1547
            continue
1548
1549
        elif _get_at(current) == 'R':
1550
            # french e.g. 'rogier', but exclude 'hochmeier'
1551
            if (((current == last) and
1552
                 not _slavo_germanic() and
1553
                 _string_at((current - 2), 2, {'IE'}) and
1554
                 not _string_at((current - 4), 2, {'ME', 'MA'}))):
1555
                (primary, secondary) = _metaph_add('', 'R')
1556
            else:
1557
                (primary, secondary) = _metaph_add('R')
1558
1559
            if _get_at(current + 1) == 'R':
1560
                current += 2
1561
            else:
1562
                current += 1
1563
            continue
1564
1565
        elif _get_at(current) == 'S':
1566
            # special cases 'island', 'isle', 'carlisle', 'carlysle'
1567
            if _string_at((current - 1), 3, {'ISL', 'YSL'}):
1568
                current += 1
1569
                continue
1570
1571
            # special case 'sugar-'
1572
            elif (current == 0) and _string_at(current, 5, {'SUGAR'}):
1573
                (primary, secondary) = _metaph_add('X', 'S')
1574
                current += 1
1575
                continue
1576
1577
            elif _string_at(current, 2, {'SH'}):
1578
                # Germanic
1579
                if _string_at((current + 1), 4,
1580
                              {'HEIM', 'HOEK', 'HOLM', 'HOLZ'}):
1581
                    (primary, secondary) = _metaph_add('S')
1582
                else:
1583
                    (primary, secondary) = _metaph_add('X')
1584
                current += 2
1585
                continue
1586
1587
            # Italian & Armenian
1588
            elif (_string_at(current, 3, {'SIO', 'SIA'}) or
1589
                  _string_at(current, 4, {'SIAN'})):
1590
                if not _slavo_germanic():
1591
                    (primary, secondary) = _metaph_add('S', 'X')
1592
                else:
1593
                    (primary, secondary) = _metaph_add('S')
1594
                current += 3
1595
                continue
1596
1597
            # German & anglicisations, e.g. 'smith' match 'schmidt',
1598
            #                               'snider' match 'schneider'
1599
            # also, -sz- in Slavic language although in Hungarian it is
1600
            #       pronounced 's'
1601
            elif (((current == 0) and
1602
                   _string_at((current + 1), 1, {'M', 'N', 'L', 'W'})) or
1603
                  _string_at((current + 1), 1, {'Z'})):
1604
                (primary, secondary) = _metaph_add('S', 'X')
1605
                if _string_at((current + 1), 1, {'Z'}):
1606
                    current += 2
1607
                else:
1608
                    current += 1
1609
                continue
1610
1611
            elif _string_at(current, 2, {'SC'}):
1612
                # Schlesinger's rule
1613
                if _get_at(current + 2) == 'H':
1614
                    # dutch origin, e.g. 'school', 'schooner'
1615
                    if _string_at((current + 3), 2,
1616
                                  {'OO', 'ER', 'EN', 'UY', 'ED', 'EM'}):
1617
                        # 'schermerhorn', 'schenker'
1618
                        if _string_at((current + 3), 2, {'ER', 'EN'}):
1619
                            (primary, secondary) = _metaph_add('X', 'SK')
1620
                        else:
1621
                            (primary, secondary) = _metaph_add('SK')
1622
                        current += 3
1623
                        continue
1624
                    else:
1625
                        if (((current == 0) and not _is_vowel(3) and
1626
                             (_get_at(3) != 'W'))):
1627
                            (primary, secondary) = _metaph_add('X', 'S')
1628
                        else:
1629
                            (primary, secondary) = _metaph_add('X')
1630
                        current += 3
1631
                        continue
1632
1633
                elif _string_at((current + 2), 1, {'I', 'E', 'Y'}):
1634
                    (primary, secondary) = _metaph_add('S')
1635
                    current += 3
1636
                    continue
1637
1638
                # else
1639
                else:
1640
                    (primary, secondary) = _metaph_add('SK')
1641
                    current += 3
1642
                    continue
1643
1644
            else:
1645
                # french e.g. 'resnais', 'artois'
1646
                if (current == last) and _string_at((current - 2), 2,
1647
                                                    {'AI', 'OI'}):
1648
                    (primary, secondary) = _metaph_add('', 'S')
1649
                else:
1650
                    (primary, secondary) = _metaph_add('S')
1651
1652
                if _string_at((current + 1), 1, {'S', 'Z'}):
1653
                    current += 2
1654
                else:
1655
                    current += 1
1656
                continue
1657
1658
        elif _get_at(current) == 'T':
1659
            if _string_at(current, 4, {'TION'}):
1660
                (primary, secondary) = _metaph_add('X')
1661
                current += 3
1662
                continue
1663
1664
            elif _string_at(current, 3, {'TIA', 'TCH'}):
1665
                (primary, secondary) = _metaph_add('X')
1666
                current += 3
1667
                continue
1668
1669
            elif (_string_at(current, 2, {'TH'}) or
1670
                  _string_at(current, 3, {'TTH'})):
1671
                # special case 'thomas', 'thames' or germanic
1672
                if ((_string_at((current + 2), 2, {'OM', 'AM'}) or
1673
                     _string_at(0, 4, {'VAN ', 'VON '}) or
1674
                     _string_at(0, 3, {'SCH'}))):
1675
                    (primary, secondary) = _metaph_add('T')
1676
                else:
1677
                    (primary, secondary) = _metaph_add('0', 'T')
1678
                current += 2
1679
                continue
1680
1681
            elif _string_at((current + 1), 1, {'T', 'D'}):
1682
                current += 2
1683
            else:
1684
                current += 1
1685
            (primary, secondary) = _metaph_add('T')
1686
            continue
1687
1688
        elif _get_at(current) == 'V':
1689
            if _get_at(current + 1) == 'V':
1690
                current += 2
1691
            else:
1692
                current += 1
1693
            (primary, secondary) = _metaph_add('F')
1694
            continue
1695
1696
        elif _get_at(current) == 'W':
1697
            # can also be in middle of word
1698
            if _string_at(current, 2, {'WR'}):
1699
                (primary, secondary) = _metaph_add('R')
1700
                current += 2
1701
                continue
1702
            elif ((current == 0) and
1703
                  (_is_vowel(current + 1) or _string_at(current, 2, {'WH'}))):
1704
                # Wasserman should match Vasserman
1705
                if _is_vowel(current + 1):
1706
                    (primary, secondary) = _metaph_add('A', 'F')
1707
                else:
1708
                    # need Uomo to match Womo
1709
                    (primary, secondary) = _metaph_add('A')
1710
1711
            # Arnow should match Arnoff
1712
            if ((((current == last) and _is_vowel(current - 1)) or
1713
                 _string_at((current - 1), 5,
1714
                            {'EWSKI', 'EWSKY', 'OWSKI', 'OWSKY'}) or
1715
                 _string_at(0, 3, ['SCH']))):
1716
                (primary, secondary) = _metaph_add('', 'F')
1717
                current += 1
1718
                continue
1719
            # Polish e.g. 'filipowicz'
1720
            elif _string_at(current, 4, {'WICZ', 'WITZ'}):
1721
                (primary, secondary) = _metaph_add('TS', 'FX')
1722
                current += 4
1723
                continue
1724
            # else skip it
1725
            else:
1726
                current += 1
1727
                continue
1728
1729
        elif _get_at(current) == 'X':
1730
            # French e.g. breaux
1731
            if (not ((current == last) and
1732
                     (_string_at((current - 3), 3, {'IAU', 'EAU'}) or
1733
                      _string_at((current - 2), 2, {'AU', 'OU'})))):
1734
                (primary, secondary) = _metaph_add('KS')
1735
1736
            if _string_at((current + 1), 1, {'C', 'X'}):
1737
                current += 2
1738
            else:
1739
                current += 1
1740
            continue
1741
1742
        elif _get_at(current) == 'Z':
1743
            # Chinese Pinyin e.g. 'zhao'
1744
            if _get_at(current + 1) == 'H':
1745
                (primary, secondary) = _metaph_add('J')
1746
                current += 2
1747
                continue
1748
            elif (_string_at((current + 1), 2, {'ZO', 'ZI', 'ZA'}) or
1749
                  (_slavo_germanic() and ((current > 0) and
1750
                                          _get_at(current - 1) != 'T'))):
1751
                (primary, secondary) = _metaph_add('S', 'TS')
1752
            else:
1753
                (primary, secondary) = _metaph_add('S')
1754
1755
            if _get_at(current + 1) == 'Z':
1756
                current += 2
1757
            else:
1758
                current += 1
1759
            continue
1760
1761
        else:
1762
            current += 1
1763
1764
    if maxlength and maxlength < _INFINITY:
1765
        primary = primary[:maxlength]
1766
        secondary = secondary[:maxlength]
1767
    if primary == secondary:
1768
        secondary = ''
1769
1770
    return (primary, secondary)
1771
1772
1773
def caverphone(word, version=2):
    """Return the Caverphone code for a word.

    Version 1 of the algorithm is described in :cite:`Hood:2002` and
    version 2 in :cite:`Hood:2004`.

    The word is lower-cased and reduced to the letters a-z, after which a
    fixed, order-sensitive series of substitutions is applied. Digits are
    used as temporary markers (``2`` for a silent position, ``3`` for a
    vowel) and are stripped at the end. The result is right-padded with
    ``1`` and cut to 6 characters for version 1, or to 10 characters for
    any other ``version`` value.

    :param str word: the word to transform
    :param int version: the version of Caverphone to employ for encoding
        (defaults to 2)
    :returns: the Caverphone value
    :rtype: str

    >>> caverphone('Christopher')
    'KRSTFA1111'
    >>> caverphone('Niall')
    'NA11111111'
    >>> caverphone('Smith')
    'SMT1111111'
    >>> caverphone('Schmidt')
    'SKMT111111'

    >>> caverphone('Christopher', 1)
    'KRSTF1'
    >>> caverphone('Niall', 1)
    'N11111'
    >>> caverphone('Smith', 1)
    'SMT111'
    >>> caverphone('Schmidt', 1)
    'SKMT11'
    """
    legacy = version == 1
    alphabet = frozenset('abcdefghijklmnopqrstuvwxyz')
    vowels = frozenset('aeiou')

    def _substitute(text, rules):
        """Apply every (old, new) replacement to text, strictly in order."""
        for old, new in rules:
            text = text.replace(old, new)
        return text

    def _collapse_runs(text, char):
        """Turn each run of char into a single upper-case char."""
        doubled = char + char
        while doubled in text:
            text = text.replace(doubled, char)
        return text.replace(char, char.upper())

    def _swap_final(text, last, new):
        """Replace a trailing last with new; only version 2 does this."""
        if not legacy and text.endswith(last):
            return text[:-1] + new
        return text

    code = ''.join(ch for ch in word.lower() if ch in alphabet)
    code = _swap_final(code, 'e', '')

    if code:
        # Word-initial '...ough' stems: the 'gh' becomes '2f'
        stems = ('cough', 'rough', 'tough', 'enough')
        if not legacy:
            stems += ('trough',)
        for stem in stems:
            if code.startswith(stem):
                code = stem[:-2] + '2f' + code[len(stem):]
        if code.startswith('gn'):
            code = '2n' + code[2:]
        if code.endswith('mb'):
            code = code[:-1] + '2'

        # Consonant normalisation; later rules rely on earlier ones
        code = _substitute(code, (('cq', '2q'), ('ci', 'si'), ('ce', 'se'),
                                  ('cy', 'sy'), ('tch', '2ch'), ('c', 'k'),
                                  ('q', 'k'), ('x', 'k'), ('v', 'f'),
                                  ('dg', '2g'), ('tio', 'sio'),
                                  ('tia', 'sia'), ('d', 't'), ('ph', 'fh'),
                                  ('b', 'p'), ('sh', 's2'), ('z', 's')))

        # An initial vowel is kept as 'A'; all other vowels become '3'
        if code[0] in vowels:
            code = 'A' + code[1:]
        code = _substitute(code, (('a', '3'), ('e', '3'), ('i', '3'),
                                  ('o', '3'), ('u', '3')))

        if not legacy:
            code = code.replace('j', 'y')
            if code.startswith('y3'):
                code = 'Y3' + code[2:]
            if code.startswith('y'):
                code = 'A' + code[1:]
            code = code.replace('y', '3')

        code = _substitute(code, (('3gh3', '3kh3'), ('gh', '22'),
                                  ('g', 'k')))

        # Upper case marks a letter as final output
        for char in 'stpkfmn':
            code = _collapse_runs(code, char)

        code = code.replace('w3', 'W3')
        if legacy:
            code = code.replace('wy', 'Wy')
        code = code.replace('wh3', 'Wh3')
        if legacy:
            code = code.replace('why', 'Why')
        code = _swap_final(code, 'w', '3')
        code = code.replace('w', '2')

        if code.startswith('h'):
            code = 'A' + code[1:]
        code = code.replace('h', '2')

        # 'r' is processed completely before 'l', with identical rules
        for liquid in 'rl':
            kept = liquid.upper()
            code = code.replace(liquid + '3', kept + '3')
            if legacy:
                code = code.replace(liquid + 'y', kept + 'y')
            code = _swap_final(code, liquid, '3')
            code = code.replace(liquid, '2')

        if legacy:
            code = _substitute(code, (('j', 'y'), ('y3', 'Y3'), ('y', '2')))

        # Drop the temporary markers
        code = code.replace('2', '')
        code = _swap_final(code, '3', 'A')
        code = code.replace('3', '')

    # Pad with 1s, then keep only the length the version calls for
    return (code + '1' * 10)[:6 if legacy else 10]
1922
1923
1924
def alpha_sis(word, maxlength=14):
    """Return the IBM Alpha Search Inquiry System code for a word.

    The Alpha Search Inquiry System code is defined in :cite:`IBM:1973`.
    This implementation is based on the description in :cite:`Moore:1977`.

    A collection is necessary since there can be multiple values for a
    single word. But the collection must be ordered since the first value
    is the primary coding.

    The word is upper-cased, NFKD-normalized and reduced to the letters
    A-Z before encoding. Letters without a code leave a placeholder that
    keeps equal codes on either side of them from being merged as a
    doublet; the placeholders are removed before padding.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 14);
        values are clamped to the range 4-64 and None is treated as 64
    :returns: the Alpha SIS value
    :rtype: tuple

    >>> alpha_sis('Christopher')
    ('06401840000000', '07040184000000', '04018400000000')
    >>> alpha_sis('Niall')
    ('02500000000000',)
    >>> alpha_sis('Smith')
    ('03100000000000',)
    >>> alpha_sis('Schmidt')
    ('06310000000000',)
    """
    # (prefix, code) pairs tried in this order at the start of the word
    initial_codes = (('GF', '08'), ('GM', '03'), ('GN', '02'), ('KN', '02'),
                     ('PF', '08'), ('PN', '02'), ('PS', '00'), ('WR', '04'),
                     ('A', '1'), ('E', '1'), ('H', '2'), ('I', '1'),
                     ('J', '3'), ('O', '1'), ('U', '1'), ('W', '4'),
                     ('Y', '5'))
    # (substring, alternative codes) pairs tried in this order at every
    # later position; the first alternative is the primary coding
    basic_codes = (('SCH', ('6',)), ('CZ', ('70', '6', '0')),
                   ('CH', ('6', '70', '0')), ('CK', ('7', '6')),
                   ('DS', ('0', '10')), ('DZ', ('0', '10')),
                   ('TS', ('0', '10')), ('TZ', ('0', '10')),
                   ('CI', ('0',)), ('CY', ('0',)), ('CE', ('0',)),
                   ('SH', ('6',)), ('DG', ('7',)), ('PH', ('8',)),
                   ('C', ('7', '6')), ('K', ('7', '6')), ('Z', ('0',)),
                   ('S', ('0',)), ('D', ('1',)), ('T', ('1',)),
                   ('N', ('2',)), ('M', ('3',)), ('R', ('4',)),
                   ('L', ('5',)), ('J', ('6',)), ('G', ('7',)),
                   ('Q', ('7',)), ('X', ('7',)), ('F', ('8',)),
                   ('V', ('8',)), ('B', ('9',)), ('P', ('9',)))

    def _trim_doublets(code):
        """Delete a character that repeats the one just before it."""
        # NOTE(review): the index advances even after a deletion, so a run
        # of three or more equal characters keeps one extra copy. This is
        # the established behavior and is preserved deliberately.
        idx = 1
        while idx < len(code):
            if code[idx] == code[idx - 1]:
                code = code[:idx] + code[idx + 1:]
            idx += 1
        return code

    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    # Clamp maxlength to [4, 64]
    if maxlength is not None:
        maxlength = min(max(4, maxlength), 64)
    else:
        maxlength = 64

    # Special processing for initial substrings; '0' when none applies
    codes = ['0']
    pos = 0
    for prefix, initial in initial_codes:
        if word.startswith(prefix):
            codes = [initial]
            pos = len(prefix)
            break

    # Main encoding loop over the remainder of the word
    while pos < len(word):
        for key, alternatives in basic_codes:
            if word.startswith(key, pos):
                # Every alternative extends every code collected so far,
                # grouped by alternative so the primary coding stays first
                codes = [code + alt for alt in alternatives
                         for code in codes]
                pos += len(key)
                break
        else:
            # Uncoded letter: leave a placeholder
            codes = [code + '_' for code in codes]
            pos += 1

    # Trim doublets and placeholders, then pad/cut to maxlength
    padding = '0' * maxlength
    return tuple((_trim_doublets(code).replace('_', '') + padding)[:maxlength]
                 for code in codes)
2026
2027
2028
def fuzzy_soundex(word, maxlength=5, zero_pad=True):
    """Return the Fuzzy Soundex code for a word.

    Fuzzy Soundex is an algorithm derived from Soundex, defined in
    :cite:`Holmes:2002`.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 5);
        values are clamped to the range 4-64 and None is treated as 64
    :param bool zero_pad: pad the end of the return value with 0s to achieve
        a maxlength string
    :returns: the Fuzzy Soundex value
    :rtype: str

    >>> fuzzy_soundex('Christopher')
    'K6931'
    >>> fuzzy_soundex('Niall')
    'N4000'
    >>> fuzzy_soundex('Smith')
    'S5300'
    """
    # Letter -> digit table; '-' marks letters that are dropped outright
    translation = {ord(letter): digit for letter, digit in
                   zip('ABCDEFGHIJKLMNOPQRSTUVWXYZ',
                       '0193017-07745501769301-7-9')}
    # Two-letter word beginnings and their replacements
    prefix_map = {'CS': 'SS', 'CZ': 'SS', 'TS': 'SS', 'TZ': 'SS',
                  'GN': 'NN', 'HR': 'RR', 'WR': 'RR', 'HW': 'WW',
                  'KN': 'NN', 'NG': 'NN'}
    # Two-letter word endings and their replacements
    suffix_map = {'CH': 'KK', 'NT': 'TT', 'RT': 'RR'}
    # Applied anywhere in the word, strictly in this order
    substitutions = (('CA', 'KA'), ('CC', 'KK'), ('CK', 'KK'),
                     ('CE', 'SE'), ('CHL', 'KL'), ('CL', 'KL'),
                     ('CHR', 'KR'), ('CR', 'KR'), ('CI', 'SI'),
                     ('CO', 'KO'), ('CU', 'KU'), ('CY', 'SY'),
                     ('DG', 'GG'), ('GH', 'HH'), ('MAC', 'MK'),
                     ('MC', 'MK'), ('NST', 'NSS'), ('PF', 'FF'),
                     ('PH', 'FF'), ('SCH', 'SSS'), ('TIO', 'SIO'),
                     ('TIA', 'SIO'), ('TCH', 'CHH'))

    word = normalize('NFKD', text_type(word.upper())).replace('ß', 'SS')

    # Clamp maxlength to [4, 64]
    maxlength = 64 if maxlength is None else min(max(4, maxlength), 64)

    if not word:
        return '0' * maxlength if zero_pad else '0'

    head = word[:2]
    if head in prefix_map:
        word = prefix_map[head] + word[2:]

    tail = word[-2:]
    if tail in suffix_map:
        word = word[:-2] + suffix_map[tail]
    elif word.endswith('RDT'):
        word = word[:-3] + 'RR'

    for old, new in substitutions:
        word = word.replace(old, new)

    digits = word.translate(translation).replace('-', '')

    # remove repeating characters
    digits = _delete_consecutive_repeats(digits)

    # The code starts with the (rewritten) first letter. H, W and Y were
    # dropped by the translation, so there is no leading digit to replace.
    initial = word[0]
    if initial in {'H', 'W', 'Y'}:
        sdx = initial + digits
    else:
        sdx = initial + digits[1:]

    sdx = sdx.replace('0', '')

    if zero_pad:
        sdx += '0' * maxlength

    return sdx[:maxlength]
2129
2130
2131
def phonex(word, maxlength=4, zero_pad=True):
    """Return the Phonex code for a word.

    Phonex is an algorithm derived from Soundex, defined in :cite:`Lait:1996`.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 4);
        values are clamped to the range 4-64 and None is treated as 64
    :param bool zero_pad: pad the end of the return value with 0s to achieve
        a maxlength string
    :returns: the Phonex value
    :rtype: str

    >>> phonex('Christopher')
    'C623'
    >>> phonex('Niall')
    'N400'
    >>> phonex('Schmidt')
    'S253'
    >>> phonex('Smith')
    'S530'
    """
    # Two-letter beginnings that collapse to a single letter
    leading_pairs = {'KN': 'N', 'PH': 'F', 'WR': 'R'}
    # Phonetic equivalents of the first character
    leading_letter = {'A': 'A', 'E': 'A', 'I': 'A', 'O': 'A', 'U': 'A',
                      'Y': 'A', 'B': 'B', 'P': 'B', 'V': 'F', 'F': 'F',
                      'C': 'C', 'K': 'C', 'Q': 'C', 'G': 'G', 'J': 'G',
                      'S': 'S', 'Z': 'S'}
    vowels_y = frozenset('AEIOUY')
    labials = frozenset('BFPV')
    sibilants = frozenset('CGJKQSXZ')

    name = normalize('NFKD', text_type(word.upper())).replace('ß', 'SS')

    # Clamp maxlength to [4, 64]
    maxlength = 64 if maxlength is None else min(max(4, maxlength), 64)

    # Remove any trailing Ss
    name = name.rstrip('S')

    start = name[:2]
    if start in leading_pairs:
        name = leading_pairs[start] + name[2:]

    # An initial H is ignored (later Hs receive no code anyway)
    if name[:1] == 'H':
        name = name[1:]

    name_code = last = ''
    if name:
        name = leading_letter.get(name[0], name[0]) + name[1:]
        name_code = last = name[0]

    # Modified Soundex coding of everything after the first letter
    length = len(name)
    for i in range(1, length):
        letter = name[i]
        following = name[i + 1:i + 2]
        code = '0'
        if letter in labials:
            code = '1'
        elif letter in sibilants:
            code = '2'
        elif letter in 'DT':
            if following != 'C':
                code = '3'
        elif letter in 'LR':
            # Only coded before a vowel/Y or at the very end of the name
            if following in vowels_y or i + 1 == length:
                code = '4' if letter == 'L' else '6'
        elif letter in 'MN':
            if following in {'D', 'G'}:
                # "Delete" the D/G by overwriting it with this letter; the
                # duplicate is then skipped on the next pass
                name = name[:i + 1] + letter + name[i + 2:]
            code = '5'

        if code not in ('0', last):
            name_code += code

        last = name_code[-1]

    if zero_pad:
        name_code += '0' * maxlength
    if not name_code:
        name_code = '0'
    return name_code[:maxlength]
2236
2237
2238
def phonem(word):
    """Return the Phonem code for a word.

    Phonem is defined in :cite:`Wilde:1988`.

    This version is based on the Perl implementation documented at
    :cite:`Wilz:2005`.
    It includes some enhancements presented in the Java port at
    :cite:`dcm4che:2011`.

    Phonem is intended chiefly for German names/words.

    :param str word: the word to transform
    :returns: the Phonem value
    :rtype: str

    >>> phonem('Christopher')
    'CRYSDOVR'
    >>> phonem('Niall')
    'NYAL'
    >>> phonem('Smith')
    'SMYD'
    >>> phonem('Schmidt')
    'CMYD'
    """
    # Two-letter rewrites, applied to the whole word in this order
    digraphs = (('SC', 'C'), ('SZ', 'C'), ('CZ', 'C'),
                ('TZ', 'C'), ('TS', 'C'), ('KS', 'X'),
                ('PF', 'V'), ('QU', 'KW'), ('PH', 'V'),
                ('UE', 'Y'), ('AE', 'E'), ('OE', 'Ö'),
                ('EI', 'AY'), ('EY', 'AY'), ('EU', 'OY'),
                ('AU', 'A§'), ('OU', '§'))
    # Single-character rewrites, run after the two-letter rewrites
    translation = {ord(source): target for source, target in
                   zip('ZKGQÇÑßFWPTÁÀÂÃÅÄÆÉÈÊËIJÌÍÎÏÜݧÚÙÛÔÒÓÕØ',
                       'CCCCCNSVVBDAAAAAEEEEEEYYYYYYYYUUUUOOOOÖ')}
    # Only these characters may appear in the finished code
    permitted = {'A', 'B', 'C', 'D', 'L', 'M', 'N', 'O', 'R', 'S',
                 'U', 'V', 'W', 'X', 'Y', 'Ö'}

    word = normalize('NFC', text_type(word.upper()))
    for old, new in digraphs:
        word = word.replace(old, new)
    word = word.translate(translation)

    squeezed = _delete_consecutive_repeats(word)
    return ''.join(char for char in squeezed if char in permitted)
2281
2282
2283
def phonix(word, maxlength=4, zero_pad=True):
2284
    """Return the Phonix code for a word.
2285
2286
    Phonix is a Soundex-like algorithm defined in :cite:`Gadd:1990`.
2287
2288
    This implementation is based on:
2289
    - :cite:`Pfeifer:2000`
2290
    - :cite:`Christen:2011`
2291
    - :cite:`Kollar:2007`
2292
2293
    :param str word: the word to transform
2294
    :param int maxlength: the length of the code returned (defaults to 4)
2295
    :param bool zero_pad: pad the end of the return value with 0s to achieve
2296
        a maxlength string
2297
    :returns: the Phonix value
2298
    :rtype: str
2299
2300
    >>> phonix('Christopher')
2301
    'K683'
2302
    >>> phonix('Niall')
2303
    'N400'
2304
    >>> phonix('Smith')
2305
    'S530'
2306
    >>> phonix('Schmidt')
2307
    'S530'
2308
    """
2309
    # pylint: disable=too-many-branches
2310
    def _start_repl(word, src, tar, post=None):
2311
        r"""Replace src with tar at the start of word."""
2312
        if post:
2313
            for i in post:
2314
                if word.startswith(src+i):
2315
                    return tar + word[len(src):]
2316
        elif word.startswith(src):
2317
            return tar + word[len(src):]
2318
        return word
2319
2320
    def _end_repl(word, src, tar, pre=None):
2321
        r"""Replace src with tar at the end of word."""
2322
        if pre:
2323
            for i in pre:
2324
                if word.endswith(i+src):
2325
                    return word[:-len(src)] + tar
2326
        elif word.endswith(src):
2327
            return word[:-len(src)] + tar
2328
        return word
2329
2330
    def _mid_repl(word, src, tar, pre=None, post=None):
2331
        r"""Replace src with tar in the middle of word."""
2332
        if pre or post:
2333
            if not pre:
2334
                return word[0] + _all_repl(word[1:], src, tar, pre, post)
2335
            elif not post:
2336
                return _all_repl(word[:-1], src, tar, pre, post) + word[-1]
2337
            return _all_repl(word, src, tar, pre, post)
2338
        return (word[0] + _all_repl(word[1:-1], src, tar, pre, post) +
2339
                word[-1])
2340
2341
    def _all_repl(word, src, tar, pre=None, post=None):
2342
        r"""Replace src with tar anywhere in word."""
2343
        if pre or post:
2344
            if post:
2345
                post = post
2346
            else:
2347
                post = frozenset(('',))
2348
            if pre:
2349
                pre = pre
2350
            else:
2351
                pre = frozenset(('',))
2352
2353
            for i, j in ((i, j) for i in pre for j in post):
2354
                word = word.replace(i+src+j, i+tar+j)
2355
            return word
2356
        else:
2357
            return word.replace(src, tar)
2358
2359
    _vow = {'A', 'E', 'I', 'O', 'U'}
2360
    _con = {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q',
2361
            'R', 'S', 'T', 'V', 'W', 'X', 'Y', 'Z'}
2362
2363
    _phonix_substitutions = ((_all_repl, 'DG', 'G'),
2364
                             (_all_repl, 'CO', 'KO'),
2365
                             (_all_repl, 'CA', 'KA'),
2366
                             (_all_repl, 'CU', 'KU'),
2367
                             (_all_repl, 'CY', 'SI'),
2368
                             (_all_repl, 'CI', 'SI'),
2369
                             (_all_repl, 'CE', 'SE'),
2370
                             (_start_repl, 'CL', 'KL', _vow),
2371
                             (_all_repl, 'CK', 'K'),
2372
                             (_end_repl, 'GC', 'K'),
2373
                             (_end_repl, 'JC', 'K'),
2374
                             (_start_repl, 'CHR', 'KR', _vow),
2375
                             (_start_repl, 'CR', 'KR', _vow),
2376
                             (_start_repl, 'WR', 'R'),
2377
                             (_all_repl, 'NC', 'NK'),
2378
                             (_all_repl, 'CT', 'KT'),
2379
                             (_all_repl, 'PH', 'F'),
2380
                             (_all_repl, 'AA', 'AR'),
2381
                             (_all_repl, 'SCH', 'SH'),
2382
                             (_all_repl, 'BTL', 'TL'),
2383
                             (_all_repl, 'GHT', 'T'),
2384
                             (_all_repl, 'AUGH', 'ARF'),
2385
                             (_mid_repl, 'LJ', 'LD', _vow, _vow),
2386
                             (_all_repl, 'LOUGH', 'LOW'),
2387
                             (_start_repl, 'Q', 'KW'),
2388
                             (_start_repl, 'KN', 'N'),
2389
                             (_end_repl, 'GN', 'N'),
2390
                             (_all_repl, 'GHN', 'N'),
2391
                             (_end_repl, 'GNE', 'N'),
2392
                             (_all_repl, 'GHNE', 'NE'),
2393
                             (_end_repl, 'GNES', 'NS'),
2394
                             (_start_repl, 'GN', 'N'),
2395
                             (_mid_repl, 'GN', 'N', None, _con),
2396
                             (_end_repl, 'GN', 'N'),
2397
                             (_start_repl, 'PS', 'S'),
2398
                             (_start_repl, 'PT', 'T'),
2399
                             (_start_repl, 'CZ', 'C'),
2400
                             (_mid_repl, 'WZ', 'Z', _vow),
2401
                             (_mid_repl, 'CZ', 'CH'),
2402
                             (_all_repl, 'LZ', 'LSH'),
2403
                             (_all_repl, 'RZ', 'RSH'),
2404
                             (_mid_repl, 'Z', 'S', None, _vow),
2405
                             (_all_repl, 'ZZ', 'TS'),
2406
                             (_mid_repl, 'Z', 'TS', _con),
2407
                             (_all_repl, 'HROUG', 'REW'),
2408
                             (_all_repl, 'OUGH', 'OF'),
2409
                             (_mid_repl, 'Q', 'KW', _vow, _vow),
2410
                             (_mid_repl, 'J', 'Y', _vow, _vow),
2411
                             (_start_repl, 'YJ', 'Y', _vow),
2412
                             (_start_repl, 'GH', 'G'),
2413
                             (_end_repl, 'GH', 'E', _vow),
2414
                             (_start_repl, 'CY', 'S'),
2415
                             (_all_repl, 'NX', 'NKS'),
2416
                             (_start_repl, 'PF', 'F'),
2417
                             (_end_repl, 'DT', 'T'),
2418
                             (_end_repl, 'TL', 'TIL'),
2419
                             (_end_repl, 'DL', 'DIL'),
2420
                             (_all_repl, 'YTH', 'ITH'),
2421
                             (_start_repl, 'TJ', 'CH', _vow),
2422
                             (_start_repl, 'TSJ', 'CH', _vow),
2423
                             (_start_repl, 'TS', 'T', _vow),
2424
                             (_all_repl, 'TCH', 'CH'),
2425
                             (_mid_repl, 'WSK', 'VSKIE', _vow),
2426
                             (_end_repl, 'WSK', 'VSKIE', _vow),
2427
                             (_start_repl, 'MN', 'N', _vow),
2428
                             (_start_repl, 'PN', 'N', _vow),
2429
                             (_mid_repl, 'STL', 'SL', _vow),
2430
                             (_end_repl, 'STL', 'SL', _vow),
2431
                             (_end_repl, 'TNT', 'ENT'),
2432
                             (_end_repl, 'EAUX', 'OH'),
2433
                             (_all_repl, 'EXCI', 'ECS'),
2434
                             (_all_repl, 'X', 'ECS'),
2435
                             (_end_repl, 'NED', 'ND'),
2436
                             (_all_repl, 'JR', 'DR'),
2437
                             (_end_repl, 'EE', 'EA'),
2438
                             (_all_repl, 'ZS', 'S'),
2439
                             (_mid_repl, 'R', 'AH', _vow, _con),
2440
                             (_end_repl, 'R', 'AH', _vow),
2441
                             (_mid_repl, 'HR', 'AH', _vow, _con),
2442
                             (_end_repl, 'HR', 'AH', _vow),
2443
                             (_end_repl, 'HR', 'AH', _vow),
2444
                             (_end_repl, 'RE', 'AR'),
2445
                             (_end_repl, 'R', 'AH', _vow),
2446
                             (_all_repl, 'LLE', 'LE'),
2447
                             (_end_repl, 'LE', 'ILE', _con),
2448
                             (_end_repl, 'LES', 'ILES', _con),
2449
                             (_end_repl, 'E', ''),
2450
                             (_end_repl, 'ES', 'S'),
2451
                             (_end_repl, 'SS', 'AS', _vow),
2452
                             (_end_repl, 'MB', 'M', _vow),
2453
                             (_all_repl, 'MPTS', 'MPS'),
2454
                             (_all_repl, 'MPS', 'MS'),
2455
                             (_all_repl, 'MPT', 'MT'))
2456
2457
    _phonix_translation = dict(zip((ord(_) for _ in
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable _ does not seem to be defined.
Loading history...
2458
                                    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
2459
                                   '01230720022455012683070808'))
2460
2461
    sdx = ''
2462
2463
    word = normalize('NFKD', text_type(word.upper()))
2464
    word = word.replace('ß', 'SS')
2465
    word = ''.join(c for c in word if c in
2466
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
2467
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
2468
                    'Y', 'Z'})
2469
    if word:
2470
        for trans in _phonix_substitutions:
2471
            word = trans[0](word, *trans[1:])
2472
        if word[0] in {'A', 'E', 'I', 'O', 'U', 'Y'}:
2473
            sdx = 'v' + word[1:].translate(_phonix_translation)
2474
        else:
2475
            sdx = word[0] + word[1:].translate(_phonix_translation)
2476
        sdx = _delete_consecutive_repeats(sdx)
2477
        sdx = sdx.replace('0', '')
2478
2479
    # Clamp maxlength to [4, 64]
2480
    if maxlength is not None:
2481
        maxlength = min(max(4, maxlength), 64)
2482
    else:
2483
        maxlength = 64
2484
2485
    if zero_pad:
2486
        sdx += '0' * maxlength
2487
    if not sdx:
2488
        sdx = '0'
2489
    return sdx[:maxlength]
2490
2491
2492
def sfinxbis(word, maxlength=None):
    """Return the SfinxBis code for a word.

    SfinxBis is a Soundex-like algorithm defined in :cite:`Axelsson:2009`.

    This implementation follows the reference implementation:
    :cite:`Sjoo:2009`.

    SfinxBis is intended chiefly for Swedish names.

    The input is upper-cased, hyphens are treated as spaces, nobiliary
    particles (e.g. 'von', 'de la') are dropped, and every remaining
    whitespace-separated token is encoded on its own, so the returned tuple
    holds one code per token. If no token remains, ``('',)`` is returned.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to
        unlimited)
    :returns: the SfinxBis value
    :rtype: tuple

    >>> sfinxbis('Christopher')
    ('K68376',)
    >>> sfinxbis('Niall')
    ('N4',)
    >>> sfinxbis('Smith')
    ('S53',)
    >>> sfinxbis('Schmidt')
    ('S53',)

    >>> sfinxbis('Johansson')
    ('J585',)
    >>> sfinxbis('Sjöberg')
    ('#162',)
    """
    # Nobiliary particles; longer particles come first so that, e.g.,
    # ' DE LA ' is removed before ' DE ' gets a chance to match.
    adelstitler = (' DE LA ', ' DE LAS ', ' DE LOS ', ' VAN DE ', ' VAN DEN ',
                   ' VAN DER ', ' VON DEM ', ' VON DER ',
                   ' AF ', ' AV ', ' DA ', ' DE ', ' DEL ', ' DEN ', ' DES ',
                   ' DI ', ' DO ', ' DON ', ' DOS ', ' DU ', ' E ', ' IN ',
                   ' LA ', ' LE ', ' MAC ', ' MC ', ' VAN ', ' VON ', ' Y ',
                   ' S:T ')

    # The ordered tuples are used wherever the vowels are *iterated*: the
    # chained replacements below are order-dependent, and iterating a set of
    # strings would make the result vary with string hash randomization.
    # The sets are used for membership tests only.
    _harde_vokaler_ordnade = ('A', 'O', 'U', 'Å')
    _mjuka_vokaler_ordnade = ('E', 'I', 'Y', 'Ä', 'Ö')
    _harde_vokaler = set(_harde_vokaler_ordnade)
    _mjuka_vokaler = set(_mjuka_vokaler_ordnade)
    _vokaler = frozenset(_mjuka_vokaler | _harde_vokaler)
    _konsonanter = {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P',
                    'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Z'}
    _alfabet = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                'Y', 'Z', 'Ä', 'Å', 'Ö'}

    # Letter -> digit table for the tail of each token; every vowel maps to
    # '9', which is stripped again in step 11.
    _sfinxbis_translation = dict(zip((ord(tecken) for tecken in
                                      'BCDFGHJKLMNPQRSTVZAOUÅEIYÄÖ'),
                                     '123729224551268378999999999'))

    # Single-character folding of W, Z and accented letters.
    _sfinxbis_substitutions = dict(zip((ord(tecken) for tecken in
                                        'WZÀÁÂÃÆÇÈÉÊËÌÍÎÏÑÒÓÔÕØÙÚÛÜÝ'),
                                       'VSAAAAÄCEEEEIIIINOOOOÖUUUYY'))

    def _foersvensker(ordet):
        """Return the Swedish-ized form of the word."""
        for fraan, till in (('STIERN', 'STJÄRN'),
                            ('HIE', 'HJ'),
                            ('SIÖ', 'SJÖ'),
                            ('SCH', 'SH'),
                            ('QU', 'KV'),
                            ('IO', 'JO'),
                            ('PH', 'F')):
            ordet = ordet.replace(fraan, till)

        # A vowel followed by Ü, Y or I: the second letter becomes J.
        # Hard vowels are handled before soft ones, each in a fixed order.
        for vokal in _harde_vokaler_ordnade + _mjuka_vokaler_ordnade:
            for efterled in ('Ü', 'Y', 'I'):
                ordet = ordet.replace(vokal + efterled, vokal + 'J')

        # Drop an H that directly precedes a consonant (fixed order again,
        # so the outcome never depends on set iteration order).
        if 'H' in ordet:
            for konsonant in sorted(_konsonanter):
                ordet = ordet.replace('H' + konsonant, konsonant)

        ordet = ordet.translate(_sfinxbis_substitutions)

        ordet = ordet.replace('Ð', 'ETH')
        ordet = ordet.replace('Þ', 'TH')
        ordet = ordet.replace('ß', 'SS')

        return ordet

    def _koda_foersta_ljudet(ordet):
        """Return the word with the first sound coded."""
        # The branch order is significant: earlier, more specific patterns
        # must win over later, more general ones. Slices (not indexing) are
        # used throughout so that short or empty tokens are safe.
        if ordet[0:1] in _vokaler:
            ordet = '$' + ordet[1:]
        elif ordet[0:2] in ('DJ', 'GJ', 'HJ', 'LJ'):
            ordet = 'J' + ordet[2:]
        elif ordet[0:1] == 'G' and ordet[1:2] in _mjuka_vokaler:
            ordet = 'J' + ordet[1:]
        elif ordet[0:1] == 'Q':
            ordet = 'K' + ordet[1:]
        elif ordet[0:2] == 'CH' and ordet[2:3] in _vokaler:
            ordet = '#' + ordet[2:]
        elif ordet[0:1] == 'C' and ordet[1:2] in _harde_vokaler:
            ordet = 'K' + ordet[1:]
        elif ordet[0:1] == 'C' and ordet[1:2] in _konsonanter:
            ordet = 'K' + ordet[1:]
        elif ordet[0:1] == 'X':
            ordet = 'S' + ordet[1:]
        elif ordet[0:1] == 'C' and ordet[1:2] in _mjuka_vokaler:
            ordet = 'S' + ordet[1:]
        elif ordet[0:3] in ('SKJ', 'STJ', 'SCH'):
            ordet = '#' + ordet[3:]
        elif ordet[0:2] in ('SH', 'KJ', 'TJ', 'SJ'):
            ordet = '#' + ordet[2:]
        elif ordet[0:2] == 'SK' and ordet[2:3] in _mjuka_vokaler:
            ordet = '#' + ordet[2:]
        elif ordet[0:1] == 'K' and ordet[1:2] in _mjuka_vokaler:
            ordet = '#' + ordet[1:]
        return ordet

    # Step 1, upper-case
    word = normalize('NFC', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = word.replace('-', ' ')

    # Step 2, remove nobiliary particles
    for adelstitel in adelstitler:
        # Loop until none is left: removing one occurrence can expose
        # another that shared a space with it.
        while adelstitel in word:
            word = word.replace(adelstitel, ' ')
        # At the very start of the name there is no leading space to match.
        if word.startswith(adelstitel[1:]):
            word = word[len(adelstitel)-1:]

    # Split word into tokens
    ordlista = word.split()

    # Step 3, collapse repeated letters in each token
    ordlista = [_delete_consecutive_repeats(ordet) for ordet in ordlista]
    if not ordlista:
        return ('',)

    # Step 4, Swedish-ize the spelling
    ordlista = [_foersvensker(ordet) for ordet in ordlista]

    # Step 5, remove all characters that are not A-Ö (65-90,196,197,214)
    ordlista = [''.join(tecken for tecken in ordet if tecken in _alfabet)
                for ordet in ordlista]

    # Step 6, code the first sound
    ordlista = [_koda_foersta_ljudet(ordet) for ordet in ordlista]

    # Step 7, split each name into its first character and the rest
    rest = [ordet[1:] for ordet in ordlista]

    # Step 8, phonetic transformation of the rest
    rest = [ordet.replace('DT', 'T') for ordet in rest]
    rest = [ordet.replace('X', 'KS') for ordet in rest]

    # Step 9, code the rest as digits ('C' before a soft vowel becomes 8)
    for vokal in _mjuka_vokaler_ordnade:
        rest = [ordet.replace('C'+vokal, '8'+vokal) for ordet in rest]
    rest = [ordet.translate(_sfinxbis_translation) for ordet in rest]

    # Step 10, remove adjacent duplicates
    rest = [_delete_consecutive_repeats(ordet) for ordet in rest]

    # Step 11, remove every "9" (the vowel placeholder)
    rest = [ordet.replace('9', '') for ordet in rest]

    # Step 12, put the parts back together
    ordlista = [ordet[0:1] + svans for ordet, svans in zip(ordlista, rest)]

    # truncate, if maxlength is set
    if maxlength and maxlength < _INFINITY:
        ordlista = [ordet[:maxlength] for ordet in ordlista]

    return tuple(ordlista)
2664
2665
2666
def phonet(word, mode=1, lang='de', trace=False):
2667
    """Return the phonet code for a word.
2668
2669
    phonet ("Hannoveraner Phonetik") was developed by Jörg Michael and
2670
    documented in :cite:`Michael:1999`.
2671
2672
    This is a port of Jesper Zedlitz's code, which is licensed LGPL
2673
    :cite:`Zedlitz:2015`.
2674
2675
    That is, in turn, based on Michael's C code, which is also licensed LGPL
2676
    :cite:`Michael:2007`.
2677
2678
    :param str word: the word to transform
2679
    :param int mode: the phonet variant to employ (1 or 2)
2680
    :param str lang: 'de' (default) for German
2681
            'none' for no language
2682
    :param bool trace: prints debugging info if True
2683
    :returns: the phonet value
2684
    :rtype: str
2685
2686
    >>> phonet('Christopher')
2687
    'KRISTOFA'
2688
    >>> phonet('Niall')
2689
    'NIAL'
2690
    >>> phonet('Smith')
2691
    'SMIT'
2692
    >>> phonet('Schmidt')
2693
    'SHMIT'
2694
2695
    >>> phonet('Christopher', mode=2)
2696
    'KRIZTUFA'
2697
    >>> phonet('Niall', mode=2)
2698
    'NIAL'
2699
    >>> phonet('Smith', mode=2)
2700
    'ZNIT'
2701
    >>> phonet('Schmidt', mode=2)
2702
    'ZNIT'
2703
2704
    >>> phonet('Christopher', lang='none')
2705
    'CHRISTOPHER'
2706
    >>> phonet('Niall', lang='none')
2707
    'NIAL'
2708
    >>> phonet('Smith', lang='none')
2709
    'SMITH'
2710
    >>> phonet('Schmidt', lang='none')
2711
    'SCHMIDT'
2712
    """
2713
    # pylint: disable=too-many-branches
2714
2715
    _phonet_rules_no_lang = (  # separator chars
2716
        '´', ' ', ' ',
2717
        '"', ' ', ' ',
2718
        '`$', '', '',
2719
        '\'', ' ', ' ',
2720
        ',', ',', ',',
2721
        ';', ',', ',',
2722
        '-', ' ', ' ',
2723
        ' ', ' ', ' ',
2724
        '.', '.', '.',
2725
        ':', '.', '.',
2726
        # German umlauts
2727
        'Ä', 'AE', 'AE',
2728
        'Ö', 'OE', 'OE',
2729
        'Ü', 'UE', 'UE',
2730
        'ß', 'S', 'S',
2731
        # international umlauts
2732
        'À', 'A', 'A',
2733
        'Á', 'A', 'A',
2734
        'Â', 'A', 'A',
2735
        'Ã', 'A', 'A',
2736
        'Å', 'A', 'A',
2737
        'Æ', 'AE', 'AE',
2738
        'Ç', 'C', 'C',
2739
        'Ð', 'DJ', 'DJ',
2740
        'È', 'E', 'E',
2741
        'É', 'E', 'E',
2742
        'Ê', 'E', 'E',
2743
        'Ë', 'E', 'E',
2744
        'Ì', 'I', 'I',
2745
        'Í', 'I', 'I',
2746
        'Î', 'I', 'I',
2747
        'Ï', 'I', 'I',
2748
        'Ñ', 'NH', 'NH',
2749
        'Ò', 'O', 'O',
2750
        'Ó', 'O', 'O',
2751
        'Ô', 'O', 'O',
2752
        'Õ', 'O', 'O',
2753
        'Œ', 'OE', 'OE',
2754
        'Ø', 'OE', 'OE',
2755
        'Š', 'SH', 'SH',
2756
        'Þ', 'TH', 'TH',
2757
        'Ù', 'U', 'U',
2758
        'Ú', 'U', 'U',
2759
        'Û', 'U', 'U',
2760
        'Ý', 'Y', 'Y',
2761
        'Ÿ', 'Y', 'Y',
2762
        # 'normal' letters (A-Z)
2763
        'MC^', 'MAC', 'MAC',
2764
        'MC^', 'MAC', 'MAC',
2765
        'M´^', 'MAC', 'MAC',
2766
        'M\'^', 'MAC', 'MAC',
2767
        'O´^', 'O', 'O',
2768
        'O\'^', 'O', 'O',
2769
        'VAN DEN ^', 'VANDEN', 'VANDEN',
2770
        None, None, None)
2771
2772
    _phonet_rules_german = (  # separator chars
2773
        '´', ' ', ' ',
2774
        '"', ' ', ' ',
2775
        '`$', '', '',
2776
        '\'', ' ', ' ',
2777
        ',', ' ', ' ',
2778
        ';', ' ', ' ',
2779
        '-', ' ', ' ',
2780
        ' ', ' ', ' ',
2781
        '.', '.', '.',
2782
        ':', '.', '.',
2783
        # German umlauts
2784
        'ÄE', 'E', 'E',
2785
        'ÄU<', 'EU', 'EU',
2786
        'ÄV(AEOU)-<', 'EW', None,
2787
        'Ä$', 'Ä', None,
2788
        'Ä<', None, 'E',
2789
        'Ä', 'E', None,
2790
        'ÖE', 'Ö', 'Ö',
2791
        'ÖU', 'Ö', 'Ö',
2792
        'ÖVER--<', 'ÖW', None,
2793
        'ÖV(AOU)-', 'ÖW', None,
2794
        'ÜBEL(GNRW)-^^', 'ÜBL ', 'IBL ',
2795
        'ÜBER^^', 'ÜBA', 'IBA',
2796
        'ÜE', 'Ü', 'I',
2797
        'ÜVER--<', 'ÜW', None,
2798
        'ÜV(AOU)-', 'ÜW', None,
2799
        'Ü', None, 'I',
2800
        'ßCH<', None, 'Z',
2801
        'ß<', 'S', 'Z',
2802
        # international umlauts
2803
        'À<', 'A', 'A',
2804
        'Á<', 'A', 'A',
2805
        'Â<', 'A', 'A',
2806
        'Ã<', 'A', 'A',
2807
        'Å<', 'A', 'A',
2808
        'ÆER-', 'E', 'E',
2809
        'ÆU<', 'EU', 'EU',
2810
        'ÆV(AEOU)-<', 'EW', None,
2811
        'Æ$', 'Ä', None,
2812
        'Æ<', None, 'E',
2813
        'Æ', 'E', None,
2814
        'Ç', 'Z', 'Z',
2815
        'ÐÐ-', '', '',
2816
        'Ð', 'DI', 'TI',
2817
        'È<', 'E', 'E',
2818
        'É<', 'E', 'E',
2819
        'Ê<', 'E', 'E',
2820
        'Ë', 'E', 'E',
2821
        'Ì<', 'I', 'I',
2822
        'Í<', 'I', 'I',
2823
        'Î<', 'I', 'I',
2824
        'Ï', 'I', 'I',
2825
        'ÑÑ-', '', '',
2826
        'Ñ', 'NI', 'NI',
2827
        'Ò<', 'O', 'U',
2828
        'Ó<', 'O', 'U',
2829
        'Ô<', 'O', 'U',
2830
        'Õ<', 'O', 'U',
2831
        'Œ<', 'Ö', 'Ö',
2832
        'Ø(IJY)-<', 'E', 'E',
2833
        'Ø<', 'Ö', 'Ö',
2834
        'Š', 'SH', 'Z',
2835
        'Þ', 'T', 'T',
2836
        'Ù<', 'U', 'U',
2837
        'Ú<', 'U', 'U',
2838
        'Û<', 'U', 'U',
2839
        'Ý<', 'I', 'I',
2840
        'Ÿ<', 'I', 'I',
2841
        # 'normal' letters (A-Z)
2842
        'ABELLE$', 'ABL', 'ABL',
2843
        'ABELL$', 'ABL', 'ABL',
2844
        'ABIENNE$', 'ABIN', 'ABIN',
2845
        'ACHME---^', 'ACH', 'AK',
2846
        'ACEY$', 'AZI', 'AZI',
2847
        'ADV', 'ATW', None,
2848
        'AEGL-', 'EK', None,
2849
        'AEU<', 'EU', 'EU',
2850
        'AE2', 'E', 'E',
2851
        'AFTRAUBEN------', 'AFT ', 'AFT ',
2852
        'AGL-1', 'AK', None,
2853
        'AGNI-^', 'AKN', 'AKN',
2854
        'AGNIE-', 'ANI', 'ANI',
2855
        'AGN(AEOU)-$', 'ANI', 'ANI',
2856
        'AH(AIOÖUÜY)-', 'AH', None,
2857
        'AIA2', 'AIA', 'AIA',
2858
        'AIE$', 'E', 'E',
2859
        'AILL(EOU)-', 'ALI', 'ALI',
2860
        'AINE$', 'EN', 'EN',
2861
        'AIRE$', 'ER', 'ER',
2862
        'AIR-', 'E', 'E',
2863
        'AISE$', 'ES', 'EZ',
2864
        'AISSANCE$', 'ESANS', 'EZANZ',
2865
        'AISSE$', 'ES', 'EZ',
2866
        'AIX$', 'EX', 'EX',
2867
        'AJ(AÄEÈÉÊIOÖUÜ)--', 'A', 'A',
2868
        'AKTIE', 'AXIE', 'AXIE',
2869
        'AKTUEL', 'AKTUEL', None,
2870
        'ALOI^', 'ALOI', 'ALUI',  # Don't merge these rules
2871
        'ALOY^', 'ALOI', 'ALUI',  # needed by 'check_rules'
2872
        'AMATEU(RS)-', 'AMATÖ', 'ANATÖ',
2873
        'ANCH(OEI)-', 'ANSH', 'ANZ',
2874
        'ANDERGEGANG----', 'ANDA GE', 'ANTA KE',
2875
        'ANDERGEHE----', 'ANDA ', 'ANTA ',
2876
        'ANDERGESETZ----', 'ANDA GE', 'ANTA KE',
2877
        'ANDERGING----', 'ANDA ', 'ANTA ',
2878
        'ANDERSETZ(ET)-----', 'ANDA ', 'ANTA ',
2879
        'ANDERZUGEHE----', 'ANDA ZU ', 'ANTA ZU ',
2880
        'ANDERZUSETZE-----', 'ANDA ZU ', 'ANTA ZU ',
2881
        'ANER(BKO)---^^', 'AN', None,
2882
        'ANHAND---^$', 'AN H', 'AN ',
2883
        'ANH(AÄEIOÖUÜY)--^^', 'AN', None,
2884
        'ANIELLE$', 'ANIEL', 'ANIL',
2885
        'ANIEL', 'ANIEL', None,
2886
        'ANSTELLE----^$', 'AN ST', 'AN ZT',
2887
        'ANTI^^', 'ANTI', 'ANTI',
2888
        'ANVER^^', 'ANFA', 'ANFA',
2889
        'ATIA$', 'ATIA', 'ATIA',
2890
        'ATIA(NS)--', 'ATI', 'ATI',
2891
        'ATI(AÄOÖUÜ)-', 'AZI', 'AZI',
2892
        'AUAU--', '', '',
2893
        'AUERE$', 'AUERE', None,
2894
        'AUERE(NS)-$', 'AUERE', None,
2895
        'AUERE(AIOUY)--', 'AUER', None,
2896
        'AUER(AÄIOÖUÜY)-', 'AUER', None,
2897
        'AUER<', 'AUA', 'AUA',
2898
        'AUF^^', 'AUF', 'AUF',
2899
        'AULT$', 'O', 'U',
2900
        'AUR(BCDFGKLMNQSTVWZ)-', 'AUA', 'AUA',
2901
        'AUR$', 'AUA', 'AUA',
2902
        'AUSSE$', 'OS', 'UZ',
2903
        'AUS(ST)-^', 'AUS', 'AUS',
2904
        'AUS^^', 'AUS', 'AUS',
2905
        'AUTOFAHR----', 'AUTO ', 'AUTU ',
2906
        'AUTO^^', 'AUTO', 'AUTU',
2907
        'AUX(IY)-', 'AUX', 'AUX',
2908
        'AUX', 'O', 'U',
2909
        'AU', 'AU', 'AU',
2910
        'AVER--<', 'AW', None,
2911
        'AVIER$', 'AWIE', 'AFIE',
2912
        'AV(EÈÉÊI)-^', 'AW', None,
2913
        'AV(AOU)-', 'AW', None,
2914
        'AYRE$', 'EIRE', 'EIRE',
2915
        'AYRE(NS)-$', 'EIRE', 'EIRE',
2916
        'AYRE(AIOUY)--', 'EIR', 'EIR',
2917
        'AYR(AÄIOÖUÜY)-', 'EIR', 'EIR',
2918
        'AYR<', 'EIA', 'EIA',
2919
        'AYER--<', 'EI', 'EI',
2920
        'AY(AÄEIOÖUÜY)--', 'A', 'A',
2921
        'AË', 'E', 'E',
2922
        'A(IJY)<', 'EI', 'EI',
2923
        'BABY^$', 'BEBI', 'BEBI',
2924
        'BAB(IY)^', 'BEBI', 'BEBI',
2925
        'BEAU^$', 'BO', None,
2926
        'BEA(BCMNRU)-^', 'BEA', 'BEA',
2927
        'BEAT(AEIMORU)-^', 'BEAT', 'BEAT',
2928
        'BEE$', 'BI', 'BI',
2929
        'BEIGE^$', 'BESH', 'BEZ',
2930
        'BENOIT--', 'BENO', 'BENU',
2931
        'BER(DT)-', 'BER', None,
2932
        'BERN(DT)-', 'BERN', None,
2933
        'BE(LMNRST)-^', 'BE', 'BE',
2934
        'BETTE$', 'BET', 'BET',
2935
        'BEVOR^$', 'BEFOR', None,
2936
        'BIC$', 'BIZ', 'BIZ',
2937
        'BOWL(EI)-', 'BOL', 'BUL',
2938
        'BP(AÄEÈÉÊIÌÍÎOÖRUÜY)-', 'B', 'B',
2939
        'BRINGEND-----^', 'BRI', 'BRI',
2940
        'BRINGEND-----', ' BRI', ' BRI',
2941
        'BROW(NS)-', 'BRAU', 'BRAU',
2942
        'BUDGET7', 'BÜGE', 'BIKE',
2943
        'BUFFET7', 'BÜFE', 'BIFE',
2944
        'BYLLE$', 'BILE', 'BILE',
2945
        'BYLL$', 'BIL', 'BIL',
2946
        'BYPA--^', 'BEI', 'BEI',
2947
        'BYTE<', 'BEIT', 'BEIT',
2948
        'BY9^', 'BÜ', None,
2949
        'B(SßZ)$', 'BS', None,
2950
        'CACH(EI)-^', 'KESH', 'KEZ',
2951
        'CAE--', 'Z', 'Z',
2952
        'CA(IY)$', 'ZEI', 'ZEI',
2953
        'CE(EIJUY)--', 'Z', 'Z',
2954
        'CENT<', 'ZENT', 'ZENT',
2955
        'CERST(EI)----^', 'KE', 'KE',
2956
        'CER$', 'ZA', 'ZA',
2957
        'CE3', 'ZE', 'ZE',
2958
        'CH\'S$', 'X', 'X',
2959
        'CH´S$', 'X', 'X',
2960
        'CHAO(ST)-', 'KAO', 'KAU',
2961
        'CHAMPIO-^', 'SHEMPI', 'ZENBI',
2962
        'CHAR(AI)-^', 'KAR', 'KAR',
2963
        'CHAU(CDFSVWXZ)-', 'SHO', 'ZU',
2964
        'CHÄ(CF)-', 'SHE', 'ZE',
2965
        'CHE(CF)-', 'SHE', 'ZE',
2966
        'CHEM-^', 'KE', 'KE',  # or: 'CHE', 'KE'
2967
        'CHEQUE<', 'SHEK', 'ZEK',
2968
        'CHI(CFGPVW)-', 'SHI', 'ZI',
2969
        'CH(AEUY)-<^', 'SH', 'Z',
2970
        'CHK-', '', '',
2971
        'CHO(CKPS)-^', 'SHO', 'ZU',
2972
        'CHRIS-', 'KRI', None,
2973
        'CHRO-', 'KR', None,
2974
        'CH(LOR)-<^', 'K', 'K',
2975
        'CHST-', 'X', 'X',
2976
        'CH(SßXZ)3', 'X', 'X',
2977
        'CHTNI-3', 'CHN', 'KN',
2978
        'CH^', 'K', 'K',  # or: 'CH', 'K'
2979
        'CH', 'CH', 'K',
2980
        'CIC$', 'ZIZ', 'ZIZ',
2981
        'CIENCEFICT----', 'EIENS ', 'EIENZ ',
2982
        'CIENCE$', 'EIENS', 'EIENZ',
2983
        'CIER$', 'ZIE', 'ZIE',
2984
        'CYB-^', 'ZEI', 'ZEI',
2985
        'CY9^', 'ZÜ', 'ZI',
2986
        'C(IJY)-<3', 'Z', 'Z',
2987
        'CLOWN-', 'KLAU', 'KLAU',
2988
        'CCH', 'Z', 'Z',
2989
        'CCE-', 'X', 'X',
2990
        'C(CK)-', '', '',
2991
        'CLAUDET---', 'KLO', 'KLU',
2992
        'CLAUDINE^$', 'KLODIN', 'KLUTIN',
2993
        'COACH', 'KOSH', 'KUZ',
2994
        'COLE$', 'KOL', 'KUL',
2995
        'COUCH', 'KAUSH', 'KAUZ',
2996
        'COW', 'KAU', 'KAU',
2997
        'CQUES$', 'K', 'K',
2998
        'CQUE', 'K', 'K',
2999
        'CRASH--9', 'KRE', 'KRE',
3000
        'CREAT-^', 'KREA', 'KREA',
3001
        'CST', 'XT', 'XT',
3002
        'CS<^', 'Z', 'Z',
3003
        'C(SßX)', 'X', 'X',
3004
        'CT\'S$', 'X', 'X',
3005
        'CT(SßXZ)', 'X', 'X',
3006
        'CZ<', 'Z', 'Z',
3007
        'C(ÈÉÊÌÍÎÝ)3', 'Z', 'Z',
3008
        'C.^', 'C.', 'C.',
3009
        'CÄ-', 'Z', 'Z',
3010
        'CÜ$', 'ZÜ', 'ZI',
3011
        'C\'S$', 'X', 'X',
3012
        'C<', 'K', 'K',
3013
        'DAHER^$', 'DAHER', None,
3014
        'DARAUFFOLGE-----', 'DARAUF ', 'TARAUF ',
3015
        'DAVO(NR)-^$', 'DAFO', 'TAFU',
3016
        'DD(SZ)--<', '', '',
3017
        'DD9', 'D', None,
3018
        'DEPOT7', 'DEPO', 'TEBU',
3019
        'DESIGN', 'DISEIN', 'TIZEIN',
3020
        'DE(LMNRST)-3^', 'DE', 'TE',
3021
        'DETTE$', 'DET', 'TET',
3022
        'DH$', 'T', None,
3023
        'DIC$', 'DIZ', 'TIZ',
3024
        'DIDR-^', 'DIT', None,
3025
        'DIEDR-^', 'DIT', None,
3026
        'DJ(AEIOU)-^', 'I', 'I',
3027
        'DMITR-^', 'DIMIT', 'TINIT',
3028
        'DRY9^', 'DRÜ', None,
3029
        'DT-', '', '',
3030
        'DUIS-^', 'DÜ', 'TI',
3031
        'DURCH^^', 'DURCH', 'TURK',
3032
        'DVA$', 'TWA', None,
3033
        'DY9^', 'DÜ', None,
3034
        'DYS$', 'DIS', None,
3035
        'DS(CH)--<', 'T', 'T',
3036
        'DST', 'ZT', 'ZT',
3037
        'DZS(CH)--', 'T', 'T',
3038
        'D(SßZ)', 'Z', 'Z',
3039
        'D(AÄEIOÖRUÜY)-', 'D', None,
3040
        'D(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'D', None,
3041
        'D\'H^', 'D', 'T',
3042
        'D´H^', 'D', 'T',
3043
        'D`H^', 'D', 'T',
3044
        'D\'S3$', 'Z', 'Z',
3045
        'D´S3$', 'Z', 'Z',
3046
        'D^', 'D', None,
3047
        'D', 'T', 'T',
3048
        'EAULT$', 'O', 'U',
3049
        'EAUX$', 'O', 'U',
3050
        'EAU', 'O', 'U',
3051
        'EAV', 'IW', 'IF',
3052
        'EAS3$', 'EAS', None,
3053
        'EA(AÄEIOÖÜY)-3', 'EA', 'EA',
3054
        'EA3$', 'EA', 'EA',
3055
        'EA3', 'I', 'I',
3056
        'EBENSO^$', 'EBNSO', 'EBNZU',
3057
        'EBENSO^^', 'EBNSO ', 'EBNZU ',
3058
        'EBEN^^', 'EBN', 'EBN',
3059
        'EE9', 'E', 'E',
3060
        'EGL-1', 'EK', None,
3061
        'EHE(IUY)--1', 'EH', None,
3062
        'EHUNG---1', 'E', None,
3063
        'EH(AÄIOÖUÜY)-1', 'EH', None,
3064
        'EIEI--', '', '',
3065
        'EIERE^$', 'EIERE', None,
3066
        'EIERE$', 'EIERE', None,
3067
        'EIERE(NS)-$', 'EIERE', None,
3068
        'EIERE(AIOUY)--', 'EIER', None,
3069
        'EIER(AÄIOÖUÜY)-', 'EIER', None,
3070
        'EIER<', 'EIA', None,
3071
        'EIGL-1', 'EIK', None,
3072
        'EIGH$', 'EI', 'EI',
3073
        'EIH--', 'E', 'E',
3074
        'EILLE$', 'EI', 'EI',
3075
        'EIR(BCDFGKLMNQSTVWZ)-', 'EIA', 'EIA',
3076
        'EIR$', 'EIA', 'EIA',
3077
        'EITRAUBEN------', 'EIT ', 'EIT ',
3078
        'EI', 'EI', 'EI',
3079
        'EJ$', 'EI', 'EI',
3080
        'ELIZ^', 'ELIS', None,
3081
        'ELZ^', 'ELS', None,
3082
        'EL-^', 'E', 'E',
3083
        'ELANG----1', 'E', 'E',
3084
        'EL(DKL)--1', 'E', 'E',
3085
        'EL(MNT)--1$', 'E', 'E',
3086
        'ELYNE$', 'ELINE', 'ELINE',
3087
        'ELYN$', 'ELIN', 'ELIN',
3088
        'EL(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'EL', 'EL',
3089
        'EL-1', 'L', 'L',
3090
        'EM-^', None, 'E',
3091
        'EM(DFKMPQT)--1', None, 'E',
3092
        'EM(AÄEÈÉÊIÌÍÎOÖUÜY)--1', None, 'E',
3093
        'EM-1', None, 'N',
3094
        'ENGAG-^', 'ANGA', 'ANKA',
3095
        'EN-^', 'E', 'E',
3096
        'ENTUEL', 'ENTUEL', None,
3097
        'EN(CDGKQSTZ)--1', 'E', 'E',
3098
        'EN(AÄEÈÉÊIÌÍÎNOÖUÜY)-1', 'EN', 'EN',
3099
        'EN-1', '', '',
3100
        'ERH(AÄEIOÖUÜ)-^', 'ERH', 'ER',
3101
        'ER-^', 'E', 'E',
3102
        'ERREGEND-----', ' ER', ' ER',
3103
        'ERT1$', 'AT', None,
3104
        'ER(DGLKMNRQTZß)-1', 'ER', None,
3105
        'ER(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'ER', 'A',
3106
        'ER1$', 'A', 'A',
3107
        'ER<1', 'A', 'A',
3108
        'ETAT7', 'ETA', 'ETA',
3109
        'ETI(AÄOÖÜU)-', 'EZI', 'EZI',
3110
        'EUERE$', 'EUERE', None,
3111
        'EUERE(NS)-$', 'EUERE', None,
3112
        'EUERE(AIOUY)--', 'EUER', None,
3113
        'EUER(AÄIOÖUÜY)-', 'EUER', None,
3114
        'EUER<', 'EUA', None,
3115
        'EUEU--', '', '',
3116
        'EUILLE$', 'Ö', 'Ö',
3117
        'EUR$', 'ÖR', 'ÖR',
3118
        'EUX', 'Ö', 'Ö',
3119
        'EUSZ$', 'EUS', None,
3120
        'EUTZ$', 'EUS', None,
3121
        'EUYS$', 'EUS', 'EUZ',
3122
        'EUZ$', 'EUS', None,
3123
        'EU', 'EU', 'EU',
3124
        'EVER--<1', 'EW', None,
3125
        'EV(ÄOÖUÜ)-1', 'EW', None,
3126
        'EYER<', 'EIA', 'EIA',
3127
        'EY<', 'EI', 'EI',
3128
        'FACETTE', 'FASET', 'FAZET',
3129
        'FANS--^$', 'FE', 'FE',
3130
        'FAN-^$', 'FE', 'FE',
3131
        'FAULT-', 'FOL', 'FUL',
3132
        'FEE(DL)-', 'FI', 'FI',
3133
        'FEHLER', 'FELA', 'FELA',
3134
        'FE(LMNRST)-3^', 'FE', 'FE',
3135
        'FOERDERN---^', 'FÖRD', 'FÖRT',
3136
        'FOERDERN---', ' FÖRD', ' FÖRT',
3137
        'FOND7', 'FON', 'FUN',
3138
        'FRAIN$', 'FRA', 'FRA',
3139
        'FRISEU(RS)-', 'FRISÖ', 'FRIZÖ',
3140
        'FY9^', 'FÜ', None,
3141
        'FÖRDERN---^', 'FÖRD', 'FÖRT',
3142
        'FÖRDERN---', ' FÖRD', ' FÖRT',
3143
        'GAGS^$', 'GEX', 'KEX',
3144
        'GAG^$', 'GEK', 'KEK',
3145
        'GD', 'KT', 'KT',
3146
        'GEGEN^^', 'GEGN', 'KEKN',
3147
        'GEGENGEKOM-----', 'GEGN ', 'KEKN ',
3148
        'GEGENGESET-----', 'GEGN ', 'KEKN ',
3149
        'GEGENKOMME-----', 'GEGN ', 'KEKN ',
3150
        'GEGENZUKOM---', 'GEGN ZU ', 'KEKN ZU ',
3151
        'GENDETWAS-----$', 'GENT ', 'KENT ',
3152
        'GENRE', 'IORE', 'IURE',
3153
        'GE(LMNRST)-3^', 'GE', 'KE',
3154
        'GER(DKT)-', 'GER', None,
3155
        'GETTE$', 'GET', 'KET',
3156
        'GGF.', 'GF.', None,
3157
        'GG-', '', '',
3158
        'GH', 'G', None,
3159
        'GI(AOU)-^', 'I', 'I',
3160
        'GION-3', 'KIO', 'KIU',
3161
        'G(CK)-', '', '',
3162
        'GJ(AEIOU)-^', 'I', 'I',
3163
        'GMBH^$', 'GMBH', 'GMBH',
3164
        'GNAC$', 'NIAK', 'NIAK',
3165
        'GNON$', 'NION', 'NIUN',
3166
        'GN$', 'N', 'N',
3167
        'GONCAL-^', 'GONZA', 'KUNZA',
3168
        'GRY9^', 'GRÜ', None,
3169
        'G(SßXZ)-<', 'K', 'K',
3170
        'GUCK-', 'KU', 'KU',
3171
        'GUISEP-^', 'IUSE', 'IUZE',
3172
        'GUI-^', 'G', 'K',
3173
        'GUTAUSSEH------^', 'GUT ', 'KUT ',
3174
        'GUTGEHEND------^', 'GUT ', 'KUT ',
3175
        'GY9^', 'GÜ', None,
3176
        'G(AÄEILOÖRUÜY)-', 'G', None,
3177
        'G(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'G', None,
3178
        'G\'S$', 'X', 'X',
3179
        'G´S$', 'X', 'X',
3180
        'G^', 'G', None,
3181
        'G', 'K', 'K',
3182
        'HA(HIUY)--1', 'H', None,
3183
        'HANDVOL---^', 'HANT ', 'ANT ',
3184
        'HANNOVE-^', 'HANOF', None,
3185
        'HAVEN7$', 'HAFN', None,
3186
        'HEAD-', 'HE', 'E',
3187
        'HELIEGEN------', 'E ', 'E ',
3188
        'HESTEHEN------', 'E ', 'E ',
3189
        'HE(LMNRST)-3^', 'HE', 'E',
3190
        'HE(LMN)-1', 'E', 'E',
3191
        'HEUR1$', 'ÖR', 'ÖR',
3192
        'HE(HIUY)--1', 'H', None,
3193
        'HIH(AÄEIOÖUÜY)-1', 'IH', None,
3194
        'HLH(AÄEIOÖUÜY)-1', 'LH', None,
3195
        'HMH(AÄEIOÖUÜY)-1', 'MH', None,
3196
        'HNH(AÄEIOÖUÜY)-1', 'NH', None,
3197
        'HOBBY9^', 'HOBI', None,
3198
        'HOCHBEGAB-----^', 'HOCH ', 'UK ',
3199
        'HOCHTALEN-----^', 'HOCH ', 'UK ',
3200
        'HOCHZUFRI-----^', 'HOCH ', 'UK ',
3201
        'HO(HIY)--1', 'H', None,
3202
        'HRH(AÄEIOÖUÜY)-1', 'RH', None,
3203
        'HUH(AÄEIOÖUÜY)-1', 'UH', None,
3204
        'HUIS^^', 'HÜS', 'IZ',
3205
        'HUIS$', 'ÜS', 'IZ',
3206
        'HUI--1', 'H', None,
3207
        'HYGIEN^', 'HÜKIEN', None,
3208
        'HY9^', 'HÜ', None,
3209
        'HY(BDGMNPST)-', 'Ü', None,
3210
        'H.^', None, 'H.',
3211
        'HÄU--1', 'H', None,
3212
        'H^', 'H', '',
3213
        'H', '', '',
3214
        'ICHELL---', 'ISH', 'IZ',
3215
        'ICHI$', 'ISHI', 'IZI',
3216
        'IEC$', 'IZ', 'IZ',
3217
        'IEDENSTELLE------', 'IDN ', 'ITN ',
3218
        'IEI-3', '', '',
3219
        'IELL3', 'IEL', 'IEL',
3220
        'IENNE$', 'IN', 'IN',
3221
        'IERRE$', 'IER', 'IER',
3222
        'IERZULAN---', 'IR ZU ', 'IR ZU ',
3223
        'IETTE$', 'IT', 'IT',
3224
        'IEU', 'IÖ', 'IÖ',
3225
        'IE<4', 'I', 'I',
3226
        'IGL-1', 'IK', None,
3227
        'IGHT3$', 'EIT', 'EIT',
3228
        'IGNI(EO)-', 'INI', 'INI',
3229
        'IGN(AEOU)-$', 'INI', 'INI',
3230
        'IHER(DGLKRT)--1', 'IHE', None,
3231
        'IHE(IUY)--', 'IH', None,
3232
        'IH(AIOÖUÜY)-', 'IH', None,
3233
        'IJ(AOU)-', 'I', 'I',
3234
        'IJ$', 'I', 'I',
3235
        'IJ<', 'EI', 'EI',
3236
        'IKOLE$', 'IKOL', 'IKUL',
3237
        'ILLAN(STZ)--4', 'ILIA', 'ILIA',
3238
        'ILLAR(DT)--4', 'ILIA', 'ILIA',
3239
        'IMSTAN----^', 'IM ', 'IN ',
3240
        'INDELERREGE------', 'INDL ', 'INTL ',
3241
        'INFRAGE-----^$', 'IN ', 'IN ',
3242
        'INTERN(AOU)-^', 'INTAN', 'INTAN',
3243
        'INVER-', 'INWE', 'INFE',
3244
        'ITI(AÄIOÖUÜ)-', 'IZI', 'IZI',
3245
        'IUSZ$', 'IUS', None,
3246
        'IUTZ$', 'IUS', None,
3247
        'IUZ$', 'IUS', None,
3248
        'IVER--<', 'IW', None,
3249
        'IVIER$', 'IWIE', 'IFIE',
3250
        'IV(ÄOÖUÜ)-', 'IW', None,
3251
        'IV<3', 'IW', None,
3252
        'IY2', 'I', None,
3253
        'I(ÈÉÊ)<4', 'I', 'I',
3254
        'JAVIE---<^', 'ZA', 'ZA',
3255
        'JEANS^$', 'JINS', 'INZ',
3256
        'JEANNE^$', 'IAN', 'IAN',
3257
        'JEAN-^', 'IA', 'IA',
3258
        'JER-^', 'IE', 'IE',
3259
        'JE(LMNST)-', 'IE', 'IE',
3260
        'JI^', 'JI', None,
3261
        'JOR(GK)^$', 'IÖRK', 'IÖRK',
3262
        'J', 'I', 'I',
3263
        'KC(ÄEIJ)-', 'X', 'X',
3264
        'KD', 'KT', None,
3265
        'KE(LMNRST)-3^', 'KE', 'KE',
3266
        'KG(AÄEILOÖRUÜY)-', 'K', None,
3267
        'KH<^', 'K', 'K',
3268
        'KIC$', 'KIZ', 'KIZ',
3269
        'KLE(LMNRST)-3^', 'KLE', 'KLE',
3270
        'KOTELE-^', 'KOTL', 'KUTL',
3271
        'KREAT-^', 'KREA', 'KREA',
3272
        'KRÜS(TZ)--^', 'KRI', None,
3273
        'KRYS(TZ)--^', 'KRI', None,
3274
        'KRY9^', 'KRÜ', None,
3275
        'KSCH---', 'K', 'K',
3276
        'KSH--', 'K', 'K',
3277
        'K(SßXZ)7', 'X', 'X',  # implies 'KST' -> 'XT'
3278
        'KT\'S$', 'X', 'X',
3279
        'KTI(AIOU)-3', 'XI', 'XI',
3280
        'KT(SßXZ)', 'X', 'X',
3281
        'KY9^', 'KÜ', None,
3282
        'K\'S$', 'X', 'X',
3283
        'K´S$', 'X', 'X',
3284
        'LANGES$', ' LANGES', ' LANKEZ',
3285
        'LANGE$', ' LANGE', ' LANKE',
3286
        'LANG$', ' LANK', ' LANK',
3287
        'LARVE-', 'LARF', 'LARF',
3288
        'LD(SßZ)$', 'LS', 'LZ',
3289
        'LD\'S$', 'LS', 'LZ',
3290
        'LD´S$', 'LS', 'LZ',
3291
        'LEAND-^', 'LEAN', 'LEAN',
3292
        'LEERSTEHE-----^', 'LER ', 'LER ',
3293
        'LEICHBLEIB-----', 'LEICH ', 'LEIK ',
3294
        'LEICHLAUTE-----', 'LEICH ', 'LEIK ',
3295
        'LEIDERREGE------', 'LEIT ', 'LEIT ',
3296
        'LEIDGEPR----^', 'LEIT ', 'LEIT ',
3297
        'LEINSTEHE-----', 'LEIN ', 'LEIN ',
3298
        'LEL-', 'LE', 'LE',
3299
        'LE(MNRST)-3^', 'LE', 'LE',
3300
        'LETTE$', 'LET', 'LET',
3301
        'LFGNAG-', 'LFGAN', 'LFKAN',
3302
        'LICHERWEIS----', 'LICHA ', 'LIKA ',
3303
        'LIC$', 'LIZ', 'LIZ',
3304
        'LIVE^$', 'LEIF', 'LEIF',
3305
        'LT(SßZ)$', 'LS', 'LZ',
3306
        'LT\'S$', 'LS', 'LZ',
3307
        'LT´S$', 'LS', 'LZ',
3308
        'LUI(GS)--', 'LU', 'LU',
3309
        'LV(AIO)-', 'LW', None,
3310
        'LY9^', 'LÜ', None,
3311
        'LSTS$', 'LS', 'LZ',
3312
        'LZ(BDFGKLMNPQRSTVWX)-', 'LS', None,
3313
        'L(SßZ)$', 'LS', None,
3314
        'MAIR-<', 'MEI', 'NEI',
3315
        'MANAG-', 'MENE', 'NENE',
3316
        'MANUEL', 'MANUEL', None,
3317
        'MASSEU(RS)-', 'MASÖ', 'NAZÖ',
3318
        'MATCH', 'MESH', 'NEZ',
3319
        'MAURICE', 'MORIS', 'NURIZ',
3320
        'MBH^$', 'MBH', 'MBH',
3321
        'MB(ßZ)$', 'MS', None,
3322
        'MB(SßTZ)-', 'M', 'N',
3323
        'MCG9^', 'MAK', 'NAK',
3324
        'MC9^', 'MAK', 'NAK',
3325
        'MEMOIR-^', 'MEMOA', 'NENUA',
3326
        'MERHAVEN$', 'MAHAFN', None,
3327
        'ME(LMNRST)-3^', 'ME', 'NE',
3328
        'MEN(STZ)--3', 'ME', None,
3329
        'MEN$', 'MEN', None,
3330
        'MIGUEL-', 'MIGE', 'NIKE',
3331
        'MIKE^$', 'MEIK', 'NEIK',
3332
        'MITHILFE----^$', 'MIT H', 'NIT ',
3333
        'MN$', 'M', None,
3334
        'MN', 'N', 'N',
3335
        'MPJUTE-', 'MPUT', 'NBUT',
3336
        'MP(ßZ)$', 'MS', None,
3337
        'MP(SßTZ)-', 'M', 'N',
3338
        'MP(BDJLMNPQVW)-', 'MB', 'NB',
3339
        'MY9^', 'MÜ', None,
3340
        'M(ßZ)$', 'MS', None,
3341
        'M´G7^', 'MAK', 'NAK',
3342
        'M\'G7^', 'MAK', 'NAK',
3343
        'M´^', 'MAK', 'NAK',
3344
        'M\'^', 'MAK', 'NAK',
3345
        'M', None, 'N',
3346
        'NACH^^', 'NACH', 'NAK',
3347
        'NADINE', 'NADIN', 'NATIN',
3348
        'NAIV--', 'NA', 'NA',
3349
        'NAISE$', 'NESE', 'NEZE',
3350
        'NAUGENOMM------', 'NAU ', 'NAU ',
3351
        'NAUSOGUT$', 'NAUSO GUT', 'NAUZU KUT',
3352
        'NCH$', 'NSH', 'NZ',
3353
        'NCOISE$', 'SOA', 'ZUA',
3354
        'NCOIS$', 'SOA', 'ZUA',
3355
        'NDAR$', 'NDA', 'NTA',
3356
        'NDERINGEN------', 'NDE ', 'NTE ',
3357
        'NDRO(CDKTZ)-', 'NTRO', None,
3358
        'ND(BFGJLMNPQVW)-', 'NT', None,
3359
        'ND(SßZ)$', 'NS', 'NZ',
3360
        'ND\'S$', 'NS', 'NZ',
3361
        'ND´S$', 'NS', 'NZ',
3362
        'NEBEN^^', 'NEBN', 'NEBN',
3363
        'NENGELERN------', 'NEN ', 'NEN ',
3364
        'NENLERN(ET)---', 'NEN LE', 'NEN LE',
3365
        'NENZULERNE---', 'NEN ZU LE', 'NEN ZU LE',
3366
        'NE(LMNRST)-3^', 'NE', 'NE',
3367
        'NEN-3', 'NE', 'NE',
3368
        'NETTE$', 'NET', 'NET',
3369
        'NGU^^', 'NU', 'NU',
3370
        'NG(BDFJLMNPQRTVW)-', 'NK', 'NK',
3371
        'NH(AUO)-$', 'NI', 'NI',
3372
        'NICHTSAHNEN-----', 'NIX ', 'NIX ',
3373
        'NICHTSSAGE----', 'NIX ', 'NIX ',
3374
        'NICHTS^^', 'NIX', 'NIX',
3375
        'NICHT^^', 'NICHT', 'NIKT',
3376
        'NINE$', 'NIN', 'NIN',
3377
        'NON^^', 'NON', 'NUN',
3378
        'NOTLEIDE-----^', 'NOT ', 'NUT ',
3379
        'NOT^^', 'NOT', 'NUT',
3380
        'NTI(AIOU)-3', 'NZI', 'NZI',
3381
        'NTIEL--3', 'NZI', 'NZI',
3382
        'NT(SßZ)$', 'NS', 'NZ',
3383
        'NT\'S$', 'NS', 'NZ',
3384
        'NT´S$', 'NS', 'NZ',
3385
        'NYLON', 'NEILON', 'NEILUN',
3386
        'NY9^', 'NÜ', None,
3387
        'NSTZUNEH---', 'NST ZU ', 'NZT ZU ',
3388
        'NSZ-', 'NS', None,
3389
        'NSTS$', 'NS', 'NZ',
3390
        'NZ(BDFGKLMNPQRSTVWX)-', 'NS', None,
3391
        'N(SßZ)$', 'NS', None,
3392
        'OBERE-', 'OBER', None,
3393
        'OBER^^', 'OBA', 'UBA',
3394
        'OEU2', 'Ö', 'Ö',
3395
        'OE<2', 'Ö', 'Ö',
3396
        'OGL-', 'OK', None,
3397
        'OGNIE-', 'ONI', 'UNI',
3398
        'OGN(AEOU)-$', 'ONI', 'UNI',
3399
        'OH(AIOÖUÜY)-', 'OH', None,
3400
        'OIE$', 'Ö', 'Ö',
3401
        'OIRE$', 'OA', 'UA',
3402
        'OIR$', 'OA', 'UA',
3403
        'OIX', 'OA', 'UA',
3404
        'OI<3', 'EU', 'EU',
3405
        'OKAY^$', 'OKE', 'UKE',
3406
        'OLYN$', 'OLIN', 'ULIN',
3407
        'OO(DLMZ)-', 'U', None,
3408
        'OO$', 'U', None,
3409
        'OO-', '', '',
3410
        'ORGINAL-----', 'ORI', 'URI',
3411
        'OTI(AÄOÖUÜ)-', 'OZI', 'UZI',
3412
        'OUI^', 'WI', 'FI',
3413
        'OUILLE$', 'ULIE', 'ULIE',
3414
        'OU(DT)-^', 'AU', 'AU',
3415
        'OUSE$', 'AUS', 'AUZ',
3416
        'OUT-', 'AU', 'AU',
3417
        'OU', 'U', 'U',
3418
        'O(FV)$', 'AU', 'AU',  # due to 'OW$' -> 'AU'
3419
        'OVER--<', 'OW', None,
3420
        'OV(AOU)-', 'OW', None,
3421
        'OW$', 'AU', 'AU',
3422
        'OWS$', 'OS', 'UZ',
3423
        'OJ(AÄEIOÖUÜ)--', 'O', 'U',
3424
        'OYER', 'OIA', None,
3425
        'OY(AÄEIOÖUÜ)--', 'O', 'U',
3426
        'O(JY)<', 'EU', 'EU',
3427
        'OZ$', 'OS', None,
3428
        'O´^', 'O', 'U',
3429
        'O\'^', 'O', 'U',
3430
        'O', None, 'U',
3431
        'PATIEN--^', 'PAZI', 'PAZI',
3432
        'PENSIO-^', 'PANSI', 'PANZI',
3433
        'PE(LMNRST)-3^', 'PE', 'PE',
3434
        'PFER-^', 'FE', 'FE',
3435
        'P(FH)<', 'F', 'F',
3436
        'PIC^$', 'PIK', 'PIK',
3437
        'PIC$', 'PIZ', 'PIZ',
3438
        'PIPELINE', 'PEIBLEIN', 'PEIBLEIN',
3439
        'POLYP-', 'POLÜ', None,
3440
        'POLY^^', 'POLI', 'PULI',
3441
        'PORTRAIT7', 'PORTRE', 'PURTRE',
3442
        'POWER7', 'PAUA', 'PAUA',
3443
        'PP(FH)--<', 'B', 'B',
3444
        'PP-', '', '',
3445
        'PRODUZ-^', 'PRODU', 'BRUTU',
3446
        'PRODUZI--', ' PRODU', ' BRUTU',
3447
        'PRIX^$', 'PRI', 'PRI',
3448
        'PS-^^', 'P', None,
3449
        'P(SßZ)^', None, 'Z',
3450
        'P(SßZ)$', 'BS', None,
3451
        'PT-^', '', '',
3452
        'PTI(AÄOÖUÜ)-3', 'BZI', 'BZI',
3453
        'PY9^', 'PÜ', None,
3454
        'P(AÄEIOÖRUÜY)-', 'P', 'P',
3455
        'P(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'P', None,
3456
        'P.^', None, 'P.',
3457
        'P^', 'P', None,
3458
        'P', 'B', 'B',
3459
        'QI-', 'Z', 'Z',
3460
        'QUARANT--', 'KARA', 'KARA',
3461
        'QUE(LMNRST)-3', 'KWE', 'KFE',
3462
        'QUE$', 'K', 'K',
3463
        'QUI(NS)$', 'KI', 'KI',
3464
        'QUIZ7', 'KWIS', None,
3465
        'Q(UV)7', 'KW', 'KF',
3466
        'Q<', 'K', 'K',
3467
        'RADFAHR----', 'RAT ', 'RAT ',
3468
        'RAEFTEZEHRE-----', 'REFTE ', 'REFTE ',
3469
        'RCH', 'RCH', 'RK',
3470
        'REA(DU)---3^', 'R', None,
3471
        'REBSERZEUG------', 'REBS ', 'REBZ ',
3472
        'RECHERCH^', 'RESHASH', 'REZAZ',
3473
        'RECYCL--', 'RIZEI', 'RIZEI',
3474
        'RE(ALST)-3^', 'RE', None,
3475
        'REE$', 'RI', 'RI',
3476
        'RER$', 'RA', 'RA',
3477
        'RE(MNR)-4', 'RE', 'RE',
3478
        'RETTE$', 'RET', 'RET',
3479
        'REUZ$', 'REUZ', None,
3480
        'REW$', 'RU', 'RU',
3481
        'RH<^', 'R', 'R',
3482
        'RJA(MN)--', 'RI', 'RI',
3483
        'ROWD-^', 'RAU', 'RAU',
3484
        'RTEMONNAIE-', 'RTMON', 'RTNUN',
3485
        'RTI(AÄOÖUÜ)-3', 'RZI', 'RZI',
3486
        'RTIEL--3', 'RZI', 'RZI',
3487
        'RV(AEOU)-3', 'RW', None,
3488
        'RY(KN)-$', 'RI', 'RI',
3489
        'RY9^', 'RÜ', None,
3490
        'RÄFTEZEHRE-----', 'REFTE ', 'REFTE ',
3491
        'SAISO-^', 'SES', 'ZEZ',
3492
        'SAFE^$', 'SEIF', 'ZEIF',
3493
        'SAUCE-^', 'SOS', 'ZUZ',
3494
        'SCHLAGGEBEN-----<', 'SHLAK ', 'ZLAK ',
3495
        'SCHSCH---7', '', '',
3496
        'SCHTSCH', 'SH', 'Z',
3497
        'SC(HZ)<', 'SH', 'Z',
3498
        'SC', 'SK', 'ZK',
3499
        'SELBSTST--7^^', 'SELB', 'ZELB',
3500
        'SELBST7^^', 'SELBST', 'ZELBZT',
3501
        'SERVICE7^', 'SÖRWIS', 'ZÖRFIZ',
3502
        'SERVI-^', 'SERW', None,
3503
        'SE(LMNRST)-3^', 'SE', 'ZE',
3504
        'SETTE$', 'SET', 'ZET',
3505
        'SHP-^', 'S', 'Z',
3506
        'SHST', 'SHT', 'ZT',
3507
        'SHTSH', 'SH', 'Z',
3508
        'SHT', 'ST', 'Z',
3509
        'SHY9^', 'SHÜ', None,
3510
        'SH^^', 'SH', None,
3511
        'SH3', 'SH', 'Z',
3512
        'SICHERGEGAN-----^', 'SICHA ', 'ZIKA ',
3513
        'SICHERGEHE----^', 'SICHA ', 'ZIKA ',
3514
        'SICHERGESTEL------^', 'SICHA ', 'ZIKA ',
3515
        'SICHERSTELL-----^', 'SICHA ', 'ZIKA ',
3516
        'SICHERZU(GS)--^', 'SICHA ZU ', 'ZIKA ZU ',
3517
        'SIEGLI-^', 'SIKL', 'ZIKL',
3518
        'SIGLI-^', 'SIKL', 'ZIKL',
3519
        'SIGHT', 'SEIT', 'ZEIT',
3520
        'SIGN', 'SEIN', 'ZEIN',
3521
        'SKI(NPZ)-', 'SKI', 'ZKI',
3522
        'SKI<^', 'SHI', 'ZI',
3523
        'SODASS^$', 'SO DAS', 'ZU TAZ',
3524
        'SODAß^$', 'SO DAS', 'ZU TAZ',
3525
        'SOGENAN--^', 'SO GEN', 'ZU KEN',
3526
        'SOUND-', 'SAUN', 'ZAUN',
3527
        'STAATS^^', 'STAZ', 'ZTAZ',
3528
        'STADT^^', 'STAT', 'ZTAT',
3529
        'STANDE$', ' STANDE', ' ZTANTE',
3530
        'START^^', 'START', 'ZTART',
3531
        'STAURANT7', 'STORAN', 'ZTURAN',
3532
        'STEAK-', 'STE', 'ZTE',
3533
        'STEPHEN-^$', 'STEW', None,
3534
        'STERN', 'STERN', None,
3535
        'STRAF^^', 'STRAF', 'ZTRAF',
3536
        'ST\'S$', 'Z', 'Z',
3537
        'ST´S$', 'Z', 'Z',
3538
        'STST--', '', '',
3539
        'STS(ACEÈÉÊHIÌÍÎOUÄÜÖ)--', 'ST', 'ZT',
3540
        'ST(SZ)', 'Z', 'Z',
3541
        'SPAREN---^', 'SPA', 'ZPA',
3542
        'SPAREND----', ' SPA', ' ZPA',
3543
        'S(PTW)-^^', 'S', None,
3544
        'SP', 'SP', None,
3545
        'STYN(AE)-$', 'STIN', 'ZTIN',
3546
        'ST', 'ST', 'ZT',
3547
        'SUITE<', 'SIUT', 'ZIUT',
3548
        'SUKE--$', 'S', 'Z',
3549
        'SURF(EI)-', 'SÖRF', 'ZÖRF',
3550
        'SV(AEÈÉÊIÌÍÎOU)-<^', 'SW', None,
3551
        'SYB(IY)--^', 'SIB', None,
3552
        'SYL(KVW)--^', 'SI', None,
3553
        'SY9^', 'SÜ', None,
3554
        'SZE(NPT)-^', 'ZE', 'ZE',
3555
        'SZI(ELN)-^', 'ZI', 'ZI',
3556
        'SZCZ<', 'SH', 'Z',
3557
        'SZT<', 'ST', 'ZT',
3558
        'SZ<3', 'SH', 'Z',
3559
        'SÜL(KVW)--^', 'SI', None,
3560
        'S', None, 'Z',
3561
        'TCH', 'SH', 'Z',
3562
        'TD(AÄEIOÖRUÜY)-', 'T', None,
3563
        'TD(ÀÁÂÃÅÈÉÊËÌÍÎÏÒÓÔÕØÙÚÛÝŸ)-', 'T', None,
3564
        'TEAT-^', 'TEA', 'TEA',
3565
        'TERRAI7^', 'TERA', 'TERA',
3566
        'TE(LMNRST)-3^', 'TE', 'TE',
3567
        'TH<', 'T', 'T',
3568
        'TICHT-', 'TIK', 'TIK',
3569
        'TICH$', 'TIK', 'TIK',
3570
        'TIC$', 'TIZ', 'TIZ',
3571
        'TIGGESTELL-------', 'TIK ', 'TIK ',
3572
        'TIGSTELL-----', 'TIK ', 'TIK ',
3573
        'TOAS-^', 'TO', 'TU',
3574
        'TOILET-', 'TOLE', 'TULE',
3575
        'TOIN-', 'TOA', 'TUA',
3576
        'TRAECHTI-^', 'TRECHT', 'TREKT',
3577
        'TRAECHTIG--', ' TRECHT', ' TREKT',
3578
        'TRAINI-', 'TREN', 'TREN',
3579
        'TRÄCHTI-^', 'TRECHT', 'TREKT',
3580
        'TRÄCHTIG--', ' TRECHT', ' TREKT',
3581
        'TSCH', 'SH', 'Z',
3582
        'TSH', 'SH', 'Z',
3583
        'TST', 'ZT', 'ZT',
3584
        'T(Sß)', 'Z', 'Z',
3585
        'TT(SZ)--<', '', '',
3586
        'TT9', 'T', 'T',
3587
        'TV^$', 'TV', 'TV',
3588
        'TX(AEIOU)-3', 'SH', 'Z',
3589
        'TY9^', 'TÜ', None,
3590
        'TZ-', '', '',
3591
        'T\'S3$', 'Z', 'Z',
3592
        'T´S3$', 'Z', 'Z',
3593
        'UEBEL(GNRW)-^^', 'ÜBL ', 'IBL ',
3594
        'UEBER^^', 'ÜBA', 'IBA',
3595
        'UE2', 'Ü', 'I',
3596
        'UGL-', 'UK', None,
3597
        'UH(AOÖUÜY)-', 'UH', None,
3598
        'UIE$', 'Ü', 'I',
3599
        'UM^^', 'UM', 'UN',
3600
        'UNTERE--3', 'UNTE', 'UNTE',
3601
        'UNTER^^', 'UNTA', 'UNTA',
3602
        'UNVER^^', 'UNFA', 'UNFA',
3603
        'UN^^', 'UN', 'UN',
3604
        'UTI(AÄOÖUÜ)-', 'UZI', 'UZI',
3605
        'UVE-4', 'UW', None,
3606
        'UY2', 'UI', None,
3607
        'UZZ', 'AS', 'AZ',
3608
        'VACL-^', 'WAZ', 'FAZ',
3609
        'VAC$', 'WAZ', 'FAZ',
3610
        'VAN DEN ^', 'FANDN', 'FANTN',
3611
        'VANES-^', 'WANE', None,
3612
        'VATRO-', 'WATR', None,
3613
        'VA(DHJNT)--^', 'F', None,
3614
        'VEDD-^', 'FE', 'FE',
3615
        'VE(BEHIU)--^', 'F', None,
3616
        'VEL(BDLMNT)-^', 'FEL', None,
3617
        'VENTZ-^', 'FEN', None,
3618
        'VEN(NRSZ)-^', 'FEN', None,
3619
        'VER(AB)-^$', 'WER', None,
3620
        'VERBAL^$', 'WERBAL', None,
3621
        'VERBAL(EINS)-^', 'WERBAL', None,
3622
        'VERTEBR--', 'WERTE', None,
3623
        'VEREIN-----', 'F', None,
3624
        'VEREN(AEIOU)-^', 'WEREN', None,
3625
        'VERIFI', 'WERIFI', None,
3626
        'VERON(AEIOU)-^', 'WERON', None,
3627
        'VERSEN^', 'FERSN', 'FAZN',
3628
        'VERSIERT--^', 'WERSI', None,
3629
        'VERSIO--^', 'WERS', None,
3630
        'VERSUS', 'WERSUS', None,
3631
        'VERTI(GK)-', 'WERTI', None,
3632
        'VER^^', 'FER', 'FA',
3633
        'VERSPRECHE-------', ' FER', ' FA',
3634
        'VER$', 'WA', None,
3635
        'VER', 'FA', 'FA',
3636
        'VET(HT)-^', 'FET', 'FET',
3637
        'VETTE$', 'WET', 'FET',
3638
        'VE^', 'WE', None,
3639
        'VIC$', 'WIZ', 'FIZ',
3640
        'VIELSAGE----', 'FIL ', 'FIL ',
3641
        'VIEL', 'FIL', 'FIL',
3642
        'VIEW', 'WIU', 'FIU',
3643
        'VILL(AE)-', 'WIL', None,
3644
        'VIS(ACEIKUVWZ)-<^', 'WIS', None,
3645
        'VI(ELS)--^', 'F', None,
3646
        'VILLON--', 'WILI', 'FILI',
3647
        'VIZE^^', 'FIZE', 'FIZE',
3648
        'VLIE--^', 'FL', None,
3649
        'VL(AEIOU)--', 'W', None,
3650
        'VOKA-^', 'WOK', None,
3651
        'VOL(ATUVW)--^', 'WO', None,
3652
        'VOR^^', 'FOR', 'FUR',
3653
        'VR(AEIOU)--', 'W', None,
3654
        'VV9', 'W', None,
3655
        'VY9^', 'WÜ', 'FI',
3656
        'V(ÜY)-', 'W', None,
3657
        'V(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'W', None,
3658
        'V(AEIJLRU)-<', 'W', None,
3659
        'V.^', 'V.', None,
3660
        'V<', 'F', 'F',
3661
        'WEITERENTWI-----^', 'WEITA ', 'FEITA ',
3662
        'WEITREICH-----^', 'WEIT ', 'FEIT ',
3663
        'WEITVER^', 'WEIT FER', 'FEIT FA',
3664
        'WE(LMNRST)-3^', 'WE', 'FE',
3665
        'WER(DST)-', 'WER', None,
3666
        'WIC$', 'WIZ', 'FIZ',
3667
        'WIEDERU--', 'WIDE', 'FITE',
3668
        'WIEDER^$', 'WIDA', 'FITA',
3669
        'WIEDER^^', 'WIDA ', 'FITA ',
3670
        'WIEVIEL', 'WI FIL', 'FI FIL',
3671
        'WISUEL', 'WISUEL', None,
3672
        'WR-^', 'W', None,
3673
        'WY9^', 'WÜ', 'FI',
3674
        'W(BDFGJKLMNPQRSTZ)-', 'F', None,
3675
        'W$', 'F', None,
3676
        'W', None, 'F',
3677
        'X<^', 'Z', 'Z',
3678
        'XHAVEN$', 'XAFN', None,
3679
        'X(CSZ)', 'X', 'X',
3680
        'XTS(CH)--', 'XT', 'XT',
3681
        'XT(SZ)', 'Z', 'Z',
3682
        'YE(LMNRST)-3^', 'IE', 'IE',
3683
        'YE-3', 'I', 'I',
3684
        'YOR(GK)^$', 'IÖRK', 'IÖRK',
3685
        'Y(AOU)-<7', 'I', 'I',
3686
        'Y(BKLMNPRSTX)-1', 'Ü', None,
3687
        'YVES^$', 'IF', 'IF',
3688
        'YVONNE^$', 'IWON', 'IFUN',
3689
        'Y.^', 'Y.', None,
3690
        'Y', 'I', 'I',
3691
        'ZC(AOU)-', 'SK', 'ZK',
3692
        'ZE(LMNRST)-3^', 'ZE', 'ZE',
3693
        'ZIEJ$', 'ZI', 'ZI',
3694
        'ZIGERJA(HR)-3', 'ZIGA IA', 'ZIKA IA',
3695
        'ZL(AEIOU)-', 'SL', None,
3696
        'ZS(CHT)--', '', '',
3697
        'ZS', 'SH', 'Z',
3698
        'ZUERST', 'ZUERST', 'ZUERST',
3699
        'ZUGRUNDE^$', 'ZU GRUNDE', 'ZU KRUNTE',
3700
        'ZUGRUNDE', 'ZU GRUNDE ', 'ZU KRUNTE ',
3701
        'ZUGUNSTEN', 'ZU GUNSTN', 'ZU KUNZTN',
3702
        'ZUHAUSE-', 'ZU HAUS', 'ZU AUZ',
3703
        'ZULASTEN^$', 'ZU LASTN', 'ZU LAZTN',
3704
        'ZURUECK^^', 'ZURÜK', 'ZURIK',
3705
        'ZURZEIT', 'ZUR ZEIT', 'ZUR ZEIT',
3706
        'ZURÜCK^^', 'ZURÜK', 'ZURIK',
3707
        'ZUSTANDE', 'ZU STANDE', 'ZU ZTANTE',
3708
        'ZUTAGE', 'ZU TAGE', 'ZU TAKE',
3709
        'ZUVER^^', 'ZUFA', 'ZUFA',
3710
        'ZUVIEL', 'ZU FIL', 'ZU FIL',
3711
        'ZUWENIG', 'ZU WENIK', 'ZU FENIK',
3712
        'ZY9^', 'ZÜ', None,
3713
        'ZYK3$', 'ZIK', None,
3714
        'Z(VW)7^', 'SW', None,
3715
        None, None, None)
3716
3717
    # Lookup tables shared by the nested helpers below.  They are Counters, so
    # reading a key that was never stored yields 0 instead of raising KeyError.

    # Keyed by a rule's leading character; _initialize_phonet stores the index
    # of the first usable rule for that character (or -1).
    phonet_hash = Counter()
    # Position code per character; _initialize_phonet stores 1 for the listed
    # accented letters and 2..27 for 'A'..'Z'.
    alpha_pos = Counter()

    # Keyed by (letter row, follower column); _initialize_phonet stores the
    # first (hash_1) and last (hash_2) rule index for each such pair.
    phonet_hash_1 = Counter()
    phonet_hash_2 = Counter()

    # Table for str.translate(): code point of each lower-case letter mapped
    # to the upper-case character at the same position ('ß' maps to itself).
    _phonet_upper_translation = {
        ord(lower): upper
        for lower, upper in zip('abcdefghijklmnopqrstuvwxyzàáâãåäæ' +
                                'çðèéêëìíîïñòóôõöøœšßþùúûüýÿ',
                                'ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÅÄÆ' +
                                'ÇÐÈÉÊËÌÍÎÏÑÒÓÔÕÖØŒŠßÞÙÚÛÜÝŸ')}
3728
3729
    def _trinfo(text, rule, err_text, lang):
        """Print one line of trace output describing a phonet rule.

        :param str text: label printed in front of the rule number
        :param int rule: index in the rule table of the rule's search
            pattern; the two replacement strings are read from the two slots
            that follow it
        :param str err_text: status message appended to the line
        :param str lang: ``'none'`` selects the language-independent rule
            table; any other value selects the German table
        """
        if lang == 'none':
            _phonet_rules = _phonet_rules_no_lang
        else:
            _phonet_rules = _phonet_rules_german

        # A rule occupies three consecutive slots; missing entries are shown
        # as '(NULL)'.
        fields = [_phonet_rules[rule + offset] for offset in range(3)]
        from_rule, to_rule1, to_rule2 = ['(NULL)' if field is None else field
                                         for field in fields]

        # Floor division keeps the 1-based rule number an integer; plain '/'
        # is true division on Python 3 and would print e.g. '4.0'.
        rule_number = (rule // 3) + 1

        print('"{} {}:  "{}"{}"{}" {}'.format(text, rule_number,
                                              from_rule, to_rule1, to_rule2,
                                              err_text))
3745
3746
    def _initialize_phonet(lang):
        """Fill the shared lookup tables for the selected rule set.

        Resets ``phonet_hash``, ``alpha_pos``, ``phonet_hash_1`` and
        ``phonet_hash_2`` to their start values and then records, for every
        rule in the table, where rules for its leading character (and for
        each leading character / follower pair) begin and end.

        :param str lang: ``'none'`` selects the language-independent rule
            table; any other value selects the German table
        """
        rules = (_phonet_rules_no_lang if lang == 'none'
                 else _phonet_rules_german)

        phonet_hash[''] = -1

        # Accented and other non-ASCII letters all share position code 1.
        for letter in ('À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê',
                       'Ë', 'Ì', 'Í', 'Î', 'Ï', 'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ',
                       'Ö', 'Ø', 'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß', 'Œ', 'Š',
                       'Ÿ'):
            alpha_pos[letter] = 1
            phonet_hash[letter] = -1

        # 'A'..'Z' get position codes 2..27.
        for code, letter in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ', 2):
            alpha_pos[letter] = code
            phonet_hash[letter] = -1

        # Mark every (letter row, follower column) cell as "no rule yet".
        for row in range(26):
            for col in range(28):
                phonet_hash_1[row, col] = phonet_hash_2[row, col] = -1

        # Rules occupy three consecutive slots (pattern, replacement 1,
        # replacement 2), so only every third slot holds a search pattern.
        for idx in range(0, len(rules), 3):
            pattern = rules[idx]
            if not pattern:
                continue

            # First-level hash: first rule for this leading character that
            # has at least one non-empty replacement.
            lead = pattern[0]
            if phonet_hash[lead] < 0 and (rules[idx + 1] or rules[idx + 2]):
                phonet_hash[lead] = idx

            # Second-level hashes exist only for leading letters 'A'..'Z'.
            if not lead or alpha_pos[lead] < 2:
                continue
            row = alpha_pos[lead] - 2

            # Work out which characters may follow the leading letter:
            # nothing (treated as a blank), a parenthesised set, or exactly
            # one literal character.
            tail = pattern[1:]
            if not tail:
                followers = ' '
            elif tail[0] == '(':
                followers = tail[1:]
            else:
                followers = tail[0]

            for follower in followers:
                if follower == ')':
                    # end of the parenthesised set
                    break

                col = alpha_pos[follower]

                if col > 0:
                    # Record the rule for this specific follower.
                    if phonet_hash_1[row, col] < 0:
                        phonet_hash_1[row, col] = idx
                        phonet_hash_2[row, col] = idx

                    if phonet_hash_2[row, col] >= idx - 30:
                        phonet_hash_2[row, col] = idx
                    else:
                        # Previous rule for this pair lies more than 30 slots
                        # back: fall through to the catch-all column instead.
                        col = -1

                if col <= 0:
                    # Record the rule in the catch-all column 0.
                    if phonet_hash_1[row, 0] < 0:
                        phonet_hash_1[row, 0] = idx

                    phonet_hash_2[row, 0] = idx
3820
3821
    def _phonet(term, mode, lang, trace):
3822
        """Return the phonet coded form of a term."""
3823
        if lang == 'none':
3824
            _phonet_rules = _phonet_rules_no_lang
3825
        else:
3826
            _phonet_rules = _phonet_rules_german
3827
3828
        char0 = ''
3829
        dest = term
3830
3831
        if not term:
3832
            return ''
3833
3834
        term_length = len(term)
3835
3836
        # convert input string to upper-case
3837
        src = term.translate(_phonet_upper_translation)
3838
3839
        # check "src"
3840
        i = 0
3841
        j = 0
3842
        zeta = 0
3843
3844
        while i < len(src):
3845
            char = src[i]
3846
3847
            if trace:
3848
                print('\ncheck position {}:  src = "{}",  dest = "{}"'.format
3849
                      (j, src[i:], dest[:j]))
3850
3851
            pos = alpha_pos[char]
3852
3853
            if pos >= 2:
3854
                xpos = pos-2
3855
3856
                if i+1 == len(src):
3857
                    pos = alpha_pos['']
3858
                else:
3859
                    pos = alpha_pos[src[i+1]]
3860
3861
                start1 = phonet_hash_1[xpos, pos]
3862
                start2 = phonet_hash_1[xpos, 0]
3863
                end1 = phonet_hash_2[xpos, pos]
3864
                end2 = phonet_hash_2[xpos, 0]
3865
3866
                # preserve rule priorities
3867
                if (start2 >= 0) and ((start1 < 0) or (start2 < start1)):
3868
                    pos = start1
3869
                    start1 = start2
3870
                    start2 = pos
3871
                    pos = end1
3872
                    end1 = end2
3873
                    end2 = pos
3874
3875
                if (end1 >= start2) and (start2 >= 0):
3876
                    if end2 > end1:
3877
                        end1 = end2
3878
3879
                    start2 = -1
3880
                    end2 = -1
3881
            else:
3882
                pos = phonet_hash[char]
3883
                start1 = pos
3884
                end1 = 10000
3885
                start2 = -1
3886
                end2 = -1
3887
3888
            pos = start1
3889
            zeta0 = 0
3890
3891
            if pos >= 0:
3892
                # check rules for this char
3893
                while ((_phonet_rules[pos] is None) or
3894
                       (_phonet_rules[pos][0] == char)):
3895
                    if pos > end1:
3896
                        if start2 > 0:
3897
                            pos = start2
3898
                            start1 = start2
3899
                            start2 = -1
3900
                            end1 = end2
3901
                            end2 = -1
3902
                            continue
3903
3904
                        break
3905
3906
                    if (((_phonet_rules[pos] is None) or
3907
                         (_phonet_rules[pos + mode] is None))):
3908
                        # no conversion rule available
3909
                        pos += 3
3910
                        continue
3911
3912
                    if trace:
3913
                        _trinfo('> rule no.', pos, 'is being checked', lang)
3914
3915
                    # check whole string
3916
                    matches = 1  # number of matching letters
3917
                    priority = 5  # default priority
3918
                    rule = _phonet_rules[pos]
3919
                    rule = rule[1:]
3920
3921
                    while (rule and
3922
                           (len(src) > (i + matches)) and
3923
                           (src[i + matches] == rule[0]) and
3924
                           not rule[0].isdigit() and
3925
                           (rule not in '(-<^$')):
3926
                        matches += 1
3927
                        rule = rule[1:]
3928
3929
                    if rule and (rule[0] == '('):
3930
                        # check an array of letters
3931
                        if (((len(src) > (i + matches)) and
3932
                             src[i + matches].isalpha() and
3933
                             (src[i + matches] in rule[1:]))):
3934
                            matches += 1
3935
3936
                            while rule and rule[0] != ')':
3937
                                rule = rule[1:]
3938
3939
                            # if rule[0] == ')':
3940
                            rule = rule[1:]
3941
3942
                    if rule:
3943
                        priority0 = ord(rule[0])
3944
                    else:
3945
                        priority0 = 0
3946
3947
                    matches0 = matches
3948
3949
                    while rule and rule[0] == '-' and matches > 1:
3950
                        matches -= 1
3951
                        rule = rule[1:]
3952
3953
                    if rule and rule[0] == '<':
3954
                        rule = rule[1:]
3955
3956
                    if rule and rule[0].isdigit():
3957
                        # read priority
3958
                        priority = int(rule[0])
3959
                        rule = rule[1:]
3960
3961
                    if rule and rule[0:2] == '^^':
3962
                        rule = rule[1:]
3963
3964
                    if (not rule or
3965
                            ((rule[0] == '^') and
3966
                             ((i == 0) or not src[i-1].isalpha()) and
3967
                             ((rule[1:2] != '$') or
3968
                              (not (src[i+matches0:i+matches0+1].isalpha()) and
3969
                               (src[i+matches0:i+matches0+1] != '.')))) or
3970
                            ((rule[0] == '$') and (i > 0) and
3971
                             src[i-1].isalpha() and
3972
                             ((not src[i+matches0:i+matches0+1].isalpha()) and
3973
                              (src[i+matches0:i+matches0+1] != '.')))):
3974
                        # look for continuation, if:
3975
                        # matches > 1 and NO '-' in first string
3976
                        pos0 = -1
3977
3978
                        start3 = 0
3979
                        start4 = 0
3980
                        end3 = 0
3981
                        end4 = 0
3982
3983
                        if (((matches > 1) and
3984
                             src[i+matches:i+matches+1] and
3985
                             (priority0 != ord('-')))):
3986
                            char0 = src[i+matches-1]
3987
                            pos0 = alpha_pos[char0]
3988
3989
                            if pos0 >= 2 and src[i+matches]:
3990
                                xpos = pos0 - 2
3991
                                pos0 = alpha_pos[src[i+matches]]
3992
                                start3 = phonet_hash_1[xpos, pos0]
3993
                                start4 = phonet_hash_1[xpos, 0]
3994
                                end3 = phonet_hash_2[xpos, pos0]
3995
                                end4 = phonet_hash_2[xpos, 0]
3996
3997
                                # preserve rule priorities
3998
                                if (((start4 >= 0) and
3999
                                     ((start3 < 0) or (start4 < start3)))):
4000
                                    pos0 = start3
4001
                                    start3 = start4
4002
                                    start4 = pos0
4003
                                    pos0 = end3
4004
                                    end3 = end4
4005
                                    end4 = pos0
4006
4007
                                if (end3 >= start4) and (start4 >= 0):
4008
                                    if end4 > end3:
4009
                                        end3 = end4
4010
4011
                                    start4 = -1
4012
                                    end4 = -1
4013
                            else:
4014
                                pos0 = phonet_hash[char0]
4015
                                start3 = pos0
4016
                                end3 = 10000
4017
                                start4 = -1
4018
                                end4 = -1
4019
4020
                            pos0 = start3
4021
4022
                        # check continuation rules for src[i+matches]
4023
                        if pos0 >= 0:
4024
                            while ((_phonet_rules[pos0] is None) or
4025
                                   (_phonet_rules[pos0][0] == char0)):
4026
                                if pos0 > end3:
4027
                                    if start4 > 0:
4028
                                        pos0 = start4
4029
                                        start3 = start4
4030
                                        start4 = -1
4031
                                        end3 = end4
4032
                                        end4 = -1
4033
                                        continue
4034
4035
                                    priority0 = -1
4036
4037
                                    # important
4038
                                    break
4039
4040
                                if (((_phonet_rules[pos0] is None) or
4041
                                     (_phonet_rules[pos0 + mode] is None))):
4042
                                    # no conversion rule available
4043
                                    pos0 += 3
4044
                                    continue
4045
4046
                                if trace:
4047
                                    _trinfo('> > continuation rule no.', pos0,
4048
                                            'is being checked', lang)
4049
4050
                                # check whole string
4051
                                matches0 = matches
4052
                                priority0 = 5
4053
                                rule = _phonet_rules[pos0]
4054
                                rule = rule[1:]
4055
4056
                                while (rule and
4057
                                       (src[i+matches0:i+matches0+1] ==
4058
                                        rule[0]) and
4059
                                       (not rule[0].isdigit() or
4060
                                        (rule in '(-<^$'))):
4061
                                    matches0 += 1
4062
                                    rule = rule[1:]
4063
4064
                                if rule and rule[0] == '(':
4065
                                    # check an array of letters
4066
                                    if ((src[i+matches0:i+matches0+1]
4067
                                         .isalpha() and
4068
                                         (src[i+matches0] in rule[1:]))):
4069
                                        matches0 += 1
4070
4071
                                        while rule and rule[0] != ')':
4072
                                            rule = rule[1:]
4073
4074
                                        # if rule[0] == ')':
4075
                                        rule = rule[1:]
4076
4077
                                while rule and rule[0] == '-':
4078
                                    # "matches0" is NOT decremented
4079
                                    # because of  "if (matches0 == matches)"
4080
                                    rule = rule[1:]
4081
4082
                                if rule and rule[0] == '<':
4083
                                    rule = rule[1:]
4084
4085
                                if rule and rule[0].isdigit():
4086
                                    priority0 = int(rule[0])
4087
                                    rule = rule[1:]
4088
4089
                                if (not rule or
4090
                                        # rule == '^' is not possible here
4091
                                        ((rule[0] == '$') and not
4092
                                         src[i+matches0:i+matches0+1]
4093
                                         .isalpha() and
4094
                                         (src[i+matches0:i+matches0+1]
4095
                                          != '.'))):
4096
                                    if matches0 == matches:
4097
                                        # this is only a partial string
4098
                                        if trace:
4099
                                            _trinfo('> > continuation ' +
4100
                                                    'rule no.',
4101
                                                    pos0,
4102
                                                    'not used (too short)',
4103
                                                    lang)
4104
4105
                                        pos0 += 3
4106
                                        continue
4107
4108
                                    if priority0 < priority:
4109
                                        # priority is too low
4110
                                        if trace:
4111
                                            _trinfo('> > continuation ' +
4112
                                                    'rule no.',
4113
                                                    pos0,
4114
                                                    'not used (priority)',
4115
                                                    lang)
4116
4117
                                        pos0 += 3
4118
                                        continue
4119
4120
                                    # continuation rule found
4121
                                    break
4122
4123
                                if trace:
4124
                                    _trinfo('> > continuation rule no.', pos0,
4125
                                            'not used', lang)
4126
4127
                                pos0 += 3
4128
4129
                            # end of "while"
4130
                            if ((priority0 >= priority) and
4131
                                    ((_phonet_rules[pos0] is not None) and
4132
                                     (_phonet_rules[pos0][0] == char0))):
4133
4134
                                if trace:
4135
                                    _trinfo('> rule no.', pos, '', lang)
4136
                                    _trinfo('> not used because of ' +
4137
                                            'continuation', pos0, '', lang)
4138
4139
                                pos += 3
4140
                                continue
4141
4142
                        # replace string
4143
                        if trace:
4144
                            _trinfo('Rule no.', pos, 'is applied', lang)
4145
4146
                        if ((_phonet_rules[pos] and
4147
                             ('<' in _phonet_rules[pos][1:]))):
4148
                            priority0 = 1
4149
                        else:
4150
                            priority0 = 0
4151
4152
                        rule = _phonet_rules[pos + mode]
4153
4154
                        if (priority0 == 1) and (zeta == 0):
4155
                            # rule with '<' is applied
4156
                            if ((j > 0) and rule and
4157
                                    ((dest[j-1] == char) or
4158
                                     (dest[j-1] == rule[0]))):
4159
                                j -= 1
4160
4161
                            zeta0 = 1
4162
                            zeta += 1
4163
                            matches0 = 0
4164
4165
                            while rule and src[i+matches0]:
4166
                                src = (src[0:i+matches0] + rule[0] +
4167
                                       src[i+matches0+1:])
4168
                                matches0 += 1
4169
                                rule = rule[1:]
4170
4171
                            if matches0 < matches:
4172
                                src = (src[0:i+matches0] +
4173
                                       src[i+matches:])
4174
4175
                            char = src[i]
4176
                        else:
4177
                            i = i + matches - 1
4178
                            zeta = 0
4179
4180
                            while len(rule) > 1:
4181
                                if (j == 0) or (dest[j - 1] != rule[0]):
4182
                                    dest = (dest[0:j] + rule[0] +
4183
                                            dest[min(len(dest), j+1):])
4184
                                    j += 1
4185
4186
                                rule = rule[1:]
4187
4188
                            # new "current char"
4189
                            if not rule:
4190
                                rule = ''
4191
                                char = ''
4192
                            else:
4193
                                char = rule[0]
4194
4195
                            if ((_phonet_rules[pos] and
4196
                                 '^^' in _phonet_rules[pos][1:])):
4197
                                if char:  # pragma: no branch
4198
                                    dest = (dest[0:j] + char +
4199
                                            dest[min(len(dest), j + 1):])
4200
                                    j += 1
4201
4202
                                src = src[i + 1:]
4203
                                i = 0
4204
                                zeta0 = 1
4205
4206
                        break
4207
4208
                    pos += 3
4209
4210
                    if pos > end1 and start2 > 0:
4211
                        pos = start2
4212
                        start1 = start2
4213
                        end1 = end2
4214
                        start2 = -1
4215
                        end2 = -1
4216
4217
            if zeta0 == 0:
4218
                if char and ((j == 0) or (dest[j-1] != char)):
4219
                    # delete multiple letters only
4220
                    dest = dest[0:j] + char + dest[min(j+1, term_length):]
4221
                    j += 1
4222
4223
                i += 1
4224
                zeta = 0
4225
4226
        dest = dest[0:j]
4227
4228
        return dest
4229
4230
    _initialize_phonet(lang)
4231
4232
    word = normalize('NFKC', text_type(word))
4233
    return _phonet(word, mode, lang, trace)
4234
4235
4236
def spfc(word):
    """Return the Standardized Phonetic Frequency Code (SPFC) of a word.

    Standardized Phonetic Frequency Code is roughly Soundex-like.
    This implementation is based on page 19-21 of :cite:`Moore:1977`.

    :param word: the name to transform: either a single string whose first
        and last names are divided by a period or a space (a period is tried
        first), or a two-element tuple/list of (first name, last name)
    :returns: the SPFC value (an empty string if word is empty)
    :rtype: str
    :raises AttributeError: if word is neither a string that can be divided
        into two names nor a two-element iterable

    >>> spfc('Christopher Smith')
    '01160'
    >>> spfc('Christopher Schmidt')
    '01160'
    >>> spfc('Niall Smith')
    '01660'
    >>> spfc('Niall Schmidt')
    '01660'

    >>> spfc('L.Smith')
    '01960'
    >>> spfc('R.Miller')
    '65490'

    >>> spfc(('L', 'Smith'))
    '01960'
    >>> spfc(('R', 'Miller'))
    '65490'
    """
    # Tables PF1, PF2, and PF3 (letter -> code digit). A letter that is
    # missing from a table is emitted unchanged.
    pf1 = dict(zip('SZCKQVFPUWABLORDHIEMNXGJT',
                   '0011112222334445556666777'))
    pf2 = dict(zip('SZCKQFPXABORDHIMNGJTUVWEL',
                   '0011122233445556677788899'))
    pf3 = dict(zip('BCKQVDTFLPGJXMNRSZAEHIOUWY',
                   '00000112223334456677777777'))

    digraphs = (('DK', 'K'), ('DT', 'T'), ('SC', 'S'), ('KN', 'N'),
                ('MN', 'N'))

    # Last-name endings examined in step 5, in priority order; the first
    # ending that matches supplies the digit and is removed from the name.
    endings = (('STN', '8'), ('PRS', '8'), ('SN', '8'),
               ('STR', '9'), ('SR', '9'), ('TN', '9'), ('TD', '9'),
               ('DRS', '7'), ('TR', '7'), ('MN', '7'))

    alphabet = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    dropped = frozenset('AEHIOUWY')

    def _bad_word():
        """Raise an AttributeError describing the accepted input forms."""
        raise AttributeError('word attribute must be a string with a space ' +
                             'or period dividing the first and last names ' +
                             'or a tuple/list consisting of the first and ' +
                             'last names')

    def _prepare(name):
        """Normalize a name and perform the first three steps of SPFC."""
        name = normalize('NFKD', text_type(name.strip()
                                           .replace('ß', 'SS')
                                           .upper()))
        # keep A-Z only
        name = ''.join(ch for ch in name if ch in alphabet)

        # 1. In the field, convert DK to K, DT to T, SC to S, KN to N,
        # and MN to N
        for old, new in digraphs:
            name = name.replace(old, new)

        # 2. In the name field, replace multiple letters with a single letter
        name = _delete_consecutive_repeats(name)

        # 3. Remove vowels, W, H, and Y, but keep the first letter in the name
        # field.
        if name:
            name = name[0] + ''.join(ch for ch in name[1:]
                                     if ch not in dropped)
        return name

    if not word:
        return ''

    if isinstance(word, (str, text_type)):
        parts = word.split('.', 1)
        if len(parts) != 2:
            parts = word.split(' ', 1)
        if len(parts) != 2:
            _bad_word()
    elif hasattr(word, '__iter__'):
        if len(word) != 2:
            _bad_word()
        parts = word
    else:
        _bad_word()

    first, last = [_prepare(part) for part in parts]
    code = ''

    # 4. The first digit of the code is obtained using PF1 and the first letter
    # of the name field. Remove this letter after coding.
    if last:
        code += pf1.get(last[0], last[0])
        last = last[1:]

    # 5. Using the last letters of the name, use Table PF3 to obtain the
    # second digit of the code. Use as many letters as possible and remove
    # after coding.
    if last:
        for ending, digit in endings:
            if last.endswith(ending):
                code += digit
                last = last[:-len(ending)]
                break
        else:
            code += pf3.get(last[-1], last[-1])
            last = last[:-1]

    # 6. The third digit is found using Table PF2 and the first character of
    # the first name.
    if first:
        code += pf2.get(first[0], first[0])

    # 7. The fourth digit is found using Table PF2 and the first character of
    # the name field. If no letters remain use zero. After coding remove the
    # letter.
    # 8. The fifth digit is found in the same manner as the fourth using the
    # remaining characters of the name field if any.
    for _ in range(2):
        if last:
            code += pf2.get(last[0], last[0])
            last = last[1:]
        else:
            code += '0'

    return code
4382
4383
4384
def statistics_canada(word, maxlength=4):
    """Return the Statistics Canada code for a word.

    The original description of this algorithm could not be located, and
    may only have been specified in an unpublished TR. The coding does not
    appear to be in use by Statistics Canada any longer. In its place, this is
    an implementation of the "Census modified Statistics Canada name coding
    procedure".

    The modified version of this algorithm is described in Appendix B of
    :cite:`Moore:1977`.

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 4) of the code to return
    :returns: the Statistics Canada name code value (an empty string if word
        contains no letters A-Z after normalization)
    :rtype: str

    >>> statistics_canada('Christopher')
    'CHRS'
    >>> statistics_canada('Niall')
    'NL'
    >>> statistics_canada('Smith')
    'SMTH'
    >>> statistics_canada('Schmidt')
    'SCHM'
    """
    # uppercase, normalize, decompose, and filter non-A-Z out
    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})
    if not word:
        return ''

    # Drop vowels (including Y) everywhere except the first position.
    vowels = {'A', 'E', 'I', 'O', 'U', 'Y'}
    code = word[0] + ''.join(c for c in word[1:] if c not in vowels)

    # Collapse runs of the same letter. The input is already restricted to
    # A-Z, so there is no whitespace left to strip afterwards.
    code = _delete_consecutive_repeats(code)

    return code[:maxlength]
4429
4430
4431
def lein(word, maxlength=4, zero_pad=True):
    """Return the Lein code for a word.

    This is Lein name coding, described in :cite:`Moore:1977`.

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 4) of the code to return
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Lein code (an empty string if word contains no letters A-Z
        after normalization)
    :rtype: str

    >>> lein('Christopher')
    'C351'
    >>> lein('Niall')
    'N300'
    >>> lein('Smith')
    'S210'
    >>> lein('Schmidt')
    'S521'
    """
    # consonant -> Lein digit
    digit_for = dict(zip('BCDFGJKLMNPQRSTVXZ',
                         '451455532245351455'))

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    letters = ''.join(c for c in word if 'A' <= c <= 'Z')

    if not letters:
        return ''

    # Rule 1: the first letter is kept as-is
    head = letters[0]
    # Rule 2: drop A, E, H, I, O, U, W, and Y from the remainder
    rest = ''.join(c for c in letters[1:] if c not in 'AEHIOUWY')
    # Rule 3: collapse runs of the same letter
    rest = _delete_consecutive_repeats(rest)
    # Rule 4: translate the remaining consonants to digits
    code = head + ''.join(digit_for.get(c, c) for c in rest)

    if zero_pad:
        code += '0' * maxlength  # Rule 4

    return code[:maxlength]
4478
4479
4480
def roger_root(word, maxlength=5, zero_pad=True):
    """Return the Roger Root code for a word.

    This is Roger Root name coding, described in :cite:`Moore:1977`.

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 5) of the code to return
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Roger Root code (an empty string if word contains no
        letters A-Z after normalization)
    :rtype: str

    >>> roger_root('Christopher')
    '06401'
    >>> roger_root('Niall')
    '02500'
    >>> roger_root('Smith')
    '00310'
    >>> roger_root('Schmidt')
    '06310'
    """
    # uppercase, normalize, decompose, and filter non-A-Z out
    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in
                   frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ'))

    if not word:
        return ''

    # Codes for the pattern at the start of the word.
    # '*' is used to prevent combining by _delete_consecutive_repeats()
    initial_codes = {
        'TSCH': '06',
        'TSH': '06', 'SCH': '06',
        'CE': '0*0', 'CH': '06', 'CI': '0*0', 'CY': '0*0', 'DG': '07',
        'GF': '08', 'GM': '03', 'GN': '02', 'KN': '02', 'PF': '08',
        'PH': '08', 'PN': '02', 'SH': '06', 'TS': '0*0', 'WR': '04',
        'A': '1', 'B': '09', 'C': '07', 'D': '01', 'E': '1', 'F': '08',
        'G': '07', 'H': '2', 'I': '1', 'J': '3', 'K': '07', 'L': '05',
        'M': '03', 'N': '02', 'O': '1', 'P': '09', 'Q': '07', 'R': '04',
        'S': '0*0', 'T': '01', 'U': '1', 'V': '08', 'W': '4', 'X': '07',
        'Y': '5', 'Z': '0*0',
    }

    # Codes for every pattern after the initial one.
    medial_codes = {
        'TSCH': '6',
        'TSH': '6', 'SCH': '6',
        'CE': '0', 'CH': '6', 'CI': '0', 'CY': '0', 'DG': '7', 'PH': '8',
        'SH': '6', 'TS': '0',
        'B': '9', 'C': '7', 'D': '1', 'F': '8', 'G': '7', 'J': '6',
        'K': '7', 'L': '5', 'M': '3', 'N': '2', 'P': '9', 'Q': '7',
        'R': '4', 'S': '0', 'T': '1', 'V': '8', 'X': '7', 'Z': '0',
        'A': '*', 'E': '*', 'H': '*', 'I': '*', 'O': '*', 'U': '*',
        'W': '*', 'Y': '*',
    }

    def _longest_match(table, start):
        """Return (code, letters consumed) for the longest pattern at start."""
        for width in range(4, 0, -1):
            chunk = word[start:start + width]
            if chunk in table:
                return table[chunk], len(chunk)
        return '', 1  # Advance if nothing is recognized

    # Do first digit(s) first
    code, pos = _longest_match(initial_codes, 0)

    # Then code subsequent digits
    while pos < len(word):
        digits, consumed = _longest_match(medial_codes, pos)
        code += digits
        pos += consumed

    code = _delete_consecutive_repeats(code).replace('*', '')

    if zero_pad:
        code += '0' * maxlength

    return code[:maxlength]
4566
4567
4568
def onca(word, maxlength=4, zero_pad=True):
    """Return the Oxford Name Compression Algorithm (ONCA) code for a word.

    This is the Oxford Name Compression Algorithm, based on :cite:`Gill:1997`.

    I can find no complete description of the "anglicised version of the NYSIIS
    method" identified as the first step in this algorithm, so this is likely
    not a precisely correct implementation, in that it employs the standard
    NYSIIS algorithm.

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 4) of the code to return
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the ONCA code
    :rtype: str

    >>> onca('Christopher')
    'C623'
    >>> onca('Niall')
    'N400'
    >>> onca('Smith')
    'S530'
    >>> onca('Schmidt')
    'S530'
    """
    # In the most extreme case, 3 characters of NYSIIS input can be compressed
    # to one character of output, so give it triple the maxlength.
    nysiis_code = nysiis(word, maxlength=maxlength*3)

    # The Soundex of the NYSIIS code is the ONCA code.
    return soundex(nysiis_code, maxlength, zero_pad=zero_pad)
4598
4599
4600
def eudex(word, maxlength=8):
    """Return the eudex phonetic hash of a word.

    This implementation of eudex phonetic hashing is based on the specification
    (not the reference implementation) at :cite:`Ticki:2016`.

    Further details can be found at :cite:`Ticki:2016b`.

    The word is lowercased and characters without an entry in the phone
    tables are discarded. If nothing remains, the placeholder character '÷'
    is hashed instead.

    :param str word: the word to transform
    :param int maxlength: the number of 8-bit character values packed into
        the hash (defaults to 8)
    :returns: the eudex hash
    :rtype: int

    >>> eudex('Colin')
    432345564238053650
    >>> eudex('Niall')
    648518346341351840
    >>> eudex('Smith')
    720575940412906756
    """
    # Values used for every character after the first
    _trailing_phones = {
        'a': 0,  # a
        'b': 0b01001000,  # b
        'c': 0b00001100,  # c
        'd': 0b00011000,  # d
        'e': 0,  # e
        'f': 0b01000100,  # f
        'g': 0b00001000,  # g
        'h': 0b00000100,  # h
        'i': 1,  # i
        'j': 0b00000101,  # j
        'k': 0b00001001,  # k
        'l': 0b10100000,  # l
        'm': 0b00000010,  # m
        'n': 0b00010010,  # n
        'o': 0,  # o
        'p': 0b01001001,  # p
        'q': 0b10101000,  # q
        'r': 0b10100001,  # r
        's': 0b00010100,  # s
        't': 0b00011101,  # t
        'u': 1,  # u
        'v': 0b01000101,  # v
        'w': 0b00000000,  # w
        'x': 0b10000100,  # x
        'y': 1,  # y
        'z': 0b10010100,  # z

        'ß': 0b00010101,  # ß
        'à': 0,  # à
        'á': 0,  # á
        'â': 0,  # â
        'ã': 0,  # ã
        'ä': 0,  # ä[æ]
        'å': 1,  # å[oː]
        'æ': 0,  # æ[æ]
        'ç': 0b10010101,  # ç[t͡ʃ]
        'è': 1,  # è
        'é': 1,  # é
        'ê': 1,  # ê
        'ë': 1,  # ë
        'ì': 1,  # ì
        'í': 1,  # í
        'î': 1,  # î
        'ï': 1,  # ï
        'ð': 0b00010101,  # ð[ð̠](represented as a non-plosive T)
        'ñ': 0b00010111,  # ñ[nj](represented as a combination of n and j)
        'ò': 0,  # ò
        'ó': 0,  # ó
        'ô': 0,  # ô
        'õ': 0,  # õ
        'ö': 1,  # ö[ø]
        '÷': 0b11111111,  # ÷
        'ø': 1,  # ø[ø]
        'ù': 1,  # ù
        'ú': 1,  # ú
        'û': 1,  # û
        'ü': 1,  # ü
        'ý': 1,  # ý
        'þ': 0b00010101,  # þ[ð̠](represented as a non-plosive T)
        'ÿ': 1,  # ÿ
    }

    # Values used for the first character only
    _initial_phones = {
        'a': 0b10000100,  # a*
        'b': 0b00100100,  # b
        'c': 0b00000110,  # c
        'd': 0b00001100,  # d
        'e': 0b11011000,  # e*
        'f': 0b00100010,  # f
        'g': 0b00000100,  # g
        'h': 0b00000010,  # h
        'i': 0b11111000,  # i*
        'j': 0b00000011,  # j
        'k': 0b00000101,  # k
        'l': 0b01010000,  # l
        'm': 0b00000001,  # m
        'n': 0b00001001,  # n
        'o': 0b10010100,  # o*
        'p': 0b00100101,  # p
        'q': 0b01010100,  # q
        'r': 0b01010001,  # r
        's': 0b00001010,  # s
        't': 0b00001110,  # t
        'u': 0b11100000,  # u*
        'v': 0b00100011,  # v
        'w': 0b00000000,  # w
        'x': 0b01000010,  # x
        'y': 0b11100100,  # y*
        'z': 0b01001010,  # z

        'ß': 0b00001011,  # ß
        'à': 0b10000101,  # à
        'á': 0b10000101,  # á
        'â': 0b10000000,  # â
        'ã': 0b10000110,  # ã
        'ä': 0b10100110,  # ä [æ]
        'å': 0b11000010,  # å [oː]
        'æ': 0b10100111,  # æ [æ]
        'ç': 0b01010100,  # ç [t͡ʃ]
        'è': 0b11011001,  # è
        'é': 0b11011001,  # é
        'ê': 0b11011001,  # ê
        'ë': 0b11000110,  # ë [ə] or [œ]
        'ì': 0b11111001,  # ì
        'í': 0b11111001,  # í
        'î': 0b11111001,  # î
        'ï': 0b11111001,  # ï
        'ð': 0b00001011,  # ð [ð̠] (represented as a non-plosive T)
        'ñ': 0b00001011,  # ñ [nj] (represented as a combination of n and j)
        'ò': 0b10010101,  # ò
        'ó': 0b10010101,  # ó
        'ô': 0b10010101,  # ô
        'õ': 0b10010101,  # õ
        'ö': 0b11011100,  # ö [œ] or [ø]
        '÷': 0b11111111,  # ÷
        'ø': 0b11011101,  # ø [œ] or [ø]
        'ù': 0b11100001,  # ù
        'ú': 0b11100001,  # ú
        'û': 0b11100001,  # û
        'ü': 0b11100101,  # ü
        'ý': 0b11100101,  # ý
        'þ': 0b00001011,  # þ [ð̠] (represented as a non-plosive T)
        'ÿ': 0b11100101,  # ÿ
    }

    # Lowercase input & filter unknown characters
    word = ''.join(char for char in word.lower() if char in _initial_phones)

    if not word:
        word = '÷'

    # Perform initial eudex coding of each character
    values = [_initial_phones[word[0]]]
    values += [_trailing_phones[char] for char in word[1:]]

    # Skip a value when it matches the value directly before it in every bit
    # except the lowest one (i.e. the two are equal after a right-shift by 1)
    condensed_values = values[:1]
    for previous, current in zip(values, values[1:]):
        if current >> 1 != previous >> 1:
            condensed_values.append(current)

    # Add padding after first character & trim beyond maxlength
    padding = [0] * max(0, maxlength - len(condensed_values))
    values = condensed_values[:1] + padding + condensed_values[1:maxlength]

    # Combine individual character values into eudex hash, 8 bits per value,
    # first value in the most significant position
    hash_value = 0
    for val in values:
        hash_value = (hash_value << 8) | val

    return hash_value
4766
4767
4768
def haase_phonetik(word, primary_only=False):
    """Return the Haase Phonetik (numeric output) codes for a word.

    Based on the algorithm described at :cite:`Prante:2015`.

    Based on the original :cite:`Haase:2000`.

    While the output codes are numeric, they are nevertheless strings.

    :param str word: the word to transform
    :param bool primary_only: if True, encode only the word as given and skip
        the expansion into alternative spellings
    :returns: a tuple holding one numeric-string code per spelling variant,
        or the empty string if the word contains no letters A-Z
    :rtype: tuple
    """
    alphabet = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    vowel_like = frozenset('AEIJOUY')
    # Letters whose digit does not depend on their neighbours
    plain_digits = {'B': '1',
                    'F': '3', 'V': '3', 'W': '3',
                    'G': '4', 'K': '4', 'Q': '4',
                    'L': '5',
                    'M': '6', 'N': '6',
                    'R': '7',
                    'S': '8', 'Z': '8'}
    # Followers that give C the digit 4 (word-initial C accepts two more)
    hard_c_initial = frozenset('AHKLOQRUX')
    hard_c_medial = frozenset('AHKOQUX')

    word = normalize('NFKD', text_type(word.upper()))
    for source, target in (('ß', 'SS'), ('Ä', 'AE'), ('Ö', 'OE'),
                           ('Ü', 'UE')):
        word = word.replace(source, target)
    word = ''.join(char for char in word if char in alphabet)

    # Nothing to convert, return base case
    if not word:
        return ''

    if primary_only:
        spellings = [word]
    else:
        three_letter_alts = {'OWN': 'AUN', 'WSK': 'RSK', 'SCH': 'CH',
                             'GLI': 'LI', 'AUX': 'O', 'EUX': 'O'}
        options = []
        pos = 0
        # A leading CH is only ever considered at the very start of the word
        if word.startswith('CH'):
            options.append(('CH', 'SCH'))
            pos = 2
        while pos < len(word):
            rest = word[pos:]
            if rest.startswith('ILLE'):
                choice, step = ('ILLE', 'I'), 4
            elif rest[:3] in three_letter_alts:
                choice, step = (rest[:3], three_letter_alts[rest[:3]]), 3
            elif rest.startswith('RB'):
                choice, step = ('RB', 'RW'), 2
            elif rest == 'EAU':  # word-final only
                choice, step = ('EAU', 'O'), 3
            elif rest == 'O':  # word-final only
                choice, step = ('O', 'OW'), 1
            elif rest == 'A':  # word-final only
                choice, step = ('A', 'AR'), 1
            else:
                choice, step = (rest[0],), 1
            options.append(choice)
            pos += step

        # Every combination of the alternatives is one spelling variant
        spellings = [''.join(parts) for parts in product(*options)]

    def _encode(variant):
        """Translate one spelling variant into its digit string."""
        pieces = []
        for idx, char in enumerate(variant):
            prev_char = variant[idx-1:idx]  # '' for the first letter
            next_char = variant[idx+1:idx+2]  # '' for the last letter
            if char in vowel_like:
                pieces.append('9')
            elif char in plain_digits:
                pieces.append(plain_digits[char])
            elif char == 'P':
                pieces.append('3' if next_char == 'H' else '1')
            elif char in {'D', 'T'}:
                pieces.append('8' if next_char in {'C', 'S', 'Z'} else '2')
            elif char == 'C':
                if prev_char in {'S', 'Z'}:
                    pieces.append('8')
                else:
                    hard = hard_c_initial if idx == 0 else hard_c_medial
                    pieces.append('4' if next_char in hard else '8')
            elif char == 'X':
                pieces.append('8' if prev_char in {'C', 'K', 'Q'} else '48')
            # any other letter (i.e. H) contributes nothing

        return _delete_consecutive_repeats(''.join(pieces))

    return tuple(_encode(spelling) for spelling in spellings)
4901
4902
4903
def reth_schek_phonetik(word):
    """Return Reth-Schek Phonetik code for a word.

    This algorithm is proposed in :cite:`Reth:1977`.

    Since I couldn't secure a copy of that document (maybe I'll look for it
    next time I'm in Germany), this implementation is based on what I could
    glean from the implementations published by German Record Linkage
    Center (www.record-linkage.de):

    - Privacy-preserving Record Linkage (PPRL) (in R) :cite:`Rukasz:2018`
    - Merge ToolBox (in Java) :cite:`Schnell:2004`

    Rules that are unclear:

    - Should 'C' become 'G' or 'Z'? (PPRL has both, 'Z' rule blocked)
    - Should 'CC' become 'G'? (PPRL has blocked 'CK' that may be typo)
    - Should 'TUI' -> 'ZUI' rule exist? (PPRL has rule, but I can't
      think of a German word with '-tui-' in it.)
    - Should we really change 'SCH' -> 'CH' and then 'CH' -> 'SCH'?

    :param str word: the word to transform
    :returns: the Reth-Schek Phonetik code (the empty string for an empty
        word)
    :rtype: str
    """
    replacements = {3: {'AEH': 'E', 'IEH': 'I', 'OEH': 'OE', 'UEH': 'UE',
                        'SCH': 'CH', 'ZIO': 'TIO', 'TIU': 'TIO', 'ZIU': 'TIO',
                        'CHS': 'X', 'CKS': 'X', 'AEU': 'OI'},
                    2: {'LL': 'L', 'AA': 'A', 'AH': 'A', 'BB': 'B', 'PP': 'B',
                        'BP': 'B', 'PB': 'B', 'DD': 'D', 'DT': 'D', 'TT': 'D',
                        'TH': 'D', 'EE': 'E', 'EH': 'E', 'AE': 'E', 'FF': 'F',
                        'PH': 'F', 'KK': 'K', 'GG': 'G', 'GK': 'G', 'KG': 'G',
                        'CK': 'G', 'CC': 'C', 'IE': 'I', 'IH': 'I', 'MM': 'M',
                        'NN': 'N', 'OO': 'O', 'OH': 'O', 'SZ': 'S', 'UH': 'U',
                        'GS': 'X', 'KS': 'X', 'TZ': 'Z', 'AY': 'AI',
                        'EI': 'AI', 'EY': 'AI', 'EU': 'OI', 'RR': 'R',
                        'SS': 'S', 'KW': 'QU'},
                    1: {'P': 'B', 'T': 'D', 'V': 'F', 'W': 'F', 'C': 'G',
                        'K': 'G', 'Y': 'I'}}

    # Uppercase
    word = word.upper()

    # Replace umlauts/eszett
    word = word.replace('Ä', 'AE')
    word = word.replace('Ö', 'OE')
    word = word.replace('Ü', 'UE')
    word = word.replace('ß', 'SS')

    # Main loop, using above replacements table. The longest sequence that
    # matches at the current position wins; the position then advances by
    # one character whether or not anything was replaced.
    pos = 0
    while pos < len(word):
        for num in (3, 2, 1):
            chunk = word[pos:pos+num]
            if chunk in replacements[num]:
                word = word[:pos] + replacements[num][chunk] + word[pos+num:]
                break
        pos += 1

    # Change 'CH' back(?) to 'SCH'
    word = word.replace('CH', 'SCH')

    # Replace final sequences. endswith() is used rather than indexing so
    # that an empty word falls through instead of raising IndexError.
    if word.endswith('ER'):
        word = word[:-2]+'R'
    elif word.endswith('EL'):
        word = word[:-2]+'L'
    elif word.endswith('H'):
        word = word[:-1]

    return word
4975
4976
4977
def fonem(word):
    """Return the FONEM code of a word.

    FONEM is a phonetic algorithm designed for French (particularly surnames in
    Saguenay, Canada), defined in :cite:`Bouchard:1981`.

    Guillaume Plique's Javascript implementation :cite:`Plique:2018` at
    https://github.com/Yomguithereal/talisman/blob/master/src/phonetics/french/fonem.js
    was also consulted for this implementation.

    :param str word: the word to transform
    :returns: the FONEM code
    :rtype: str
    """
    def _kept_groups(match):
        """Rebuild a C-29 match from whichever capture groups took part."""
        return (match.group(1) or '') + (match.group(2) or '')

    # Each rule is a (pattern, replacement) pair. A pattern that is a plain
    # string is applied with str.replace; a compiled pattern with its sub().
    rules = {
        # Vowels & groups of vowels
        'V-1':     (re_compile('E?AU'), 'O'),
        'V-2,5':   (re_compile('(E?AU|O)L[TX]$'), 'O'),
        'V-3,4':   (re_compile('E?AU[TX]$'), 'O'),
        'V-6':     (re_compile('E?AUL?D$'), 'O'),
        'V-7':     (re_compile(r'(?<!G)AY$'), 'E'),
        'V-8':     (re_compile('EUX$'), 'EU'),
        'V-9':     (re_compile('EY(?=$|[BCDFGHJKLMNPQRSTVWXZ])'), 'E'),
        'V-10':    ('Y', 'I'),
        'V-11':    (re_compile('(?<=[AEIOUY])I(?=[AEIOUY])'), 'Y'),
        'V-12':    (re_compile('(?<=[AEIOUY])ILL'), 'Y'),
        'V-13':    (re_compile('OU(?=[AEOU]|I(?!LL))'), 'W'),
        'V-14':    (re_compile(r'([AEIOUY])(?=\1)'), ''),
        # Nasal vowels
        'V-15':    (re_compile('[AE]M(?=[BCDFGHJKLMPQRSTVWXZ])(?!$)'), 'EN'),
        'V-16':    (re_compile('OM(?=[BCDFGHJKLMPQRSTVWXZ])'), 'ON'),
        'V-17':    (re_compile('AN(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'EN'),
        'V-18':    (re_compile('(AI[MN]|EIN)(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'),
                    'IN'),
        'V-19':    (re_compile('B(O|U|OU)RNE?$'), 'BURN'),
        'V-20':    (re_compile('(^IM|(?<=[BCDFGHJKLMNPQRSTVWXZ])' +
                               'IM(?=[BCDFGHJKLMPQRSTVWXZ]))'), 'IN'),
        # Consonants and groups of consonants
        'C-1':     ('BV', 'V'),
        'C-2':     (re_compile('(?<=[AEIOUY])C(?=[EIY])'), 'SS'),
        'C-3':     (re_compile('(?<=[BDFGHJKLMNPQRSTVWZ])C(?=[EIY])'), 'S'),
        'C-4':     (re_compile('^C(?=[EIY])'), 'S'),
        'C-5':     (re_compile('^C(?=[OUA])'), 'K'),
        'C-6':     (re_compile('(?<=[AEIOUY])C$'), 'K'),
        'C-7':     (re_compile('C(?=[BDFGJKLMNPQRSTVWXZ])'), 'K'),
        'C-8':     (re_compile('CC(?=[AOU])'), 'K'),
        'C-9':     (re_compile('CC(?=[EIY])'), 'X'),
        'C-10':    (re_compile('G(?=[EIY])'), 'J'),
        'C-11':    (re_compile('GA(?=I?[MN])'), 'G#'),
        'C-12':    (re_compile('GE(O|AU)'), 'JO'),
        'C-13':    (re_compile('GNI(?=[AEIOUY])'), 'GN'),
        'C-14':    (re_compile('(?<![PCS])H'), ''),
        'C-15':    ('JEA', 'JA'),
        'C-16':    (re_compile('^MAC(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'MA#'),
        'C-17':    (re_compile('^MC'), 'MA#'),
        'C-18':    ('PH', 'F'),
        'C-19':    ('QU', 'K'),
        'C-20':    (re_compile('^SC(?=[EIY])'), 'S'),
        'C-21':    (re_compile('(?<=.)SC(?=[EIY])'), 'SS'),
        'C-22':    (re_compile('(?<=.)SC(?=[AOU])'), 'SK'),
        'C-23':    ('SH', 'CH'),
        'C-24':    (re_compile('TIA$'), 'SSIA'),
        'C-25':    (re_compile('(?<=[AIOUY])W'), ''),
        'C-26':    (re_compile('X[CSZ]'), 'X'),
        'C-27':    (re_compile('(?<=[AEIOUY])Z|(?<=[BCDFGHJKLMNPQRSTVWXZ])' +
                               'Z(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'S'),
        'C-28':    (re_compile(r'([BDFGHJKMNPQRTVWXZ])\1'), r'\1'),
        'C-28a':   (re_compile('CC(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'C'),
        'C-28b':   (re_compile('((?<=[BCDFGHJKLMNPQRSTVWXZ])|^)SS'), 'S'),
        'C-28bb':  (re_compile('SS(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'S'),
        'C-28c':   (re_compile('((?<=[^I])|^)LL'), 'L'),
        'C-28d':   (re_compile('ILE$'), 'ILLE'),
        'C-29':    (re_compile('(ILS|[CS]H|[MN]P|R[CFKLNSX])$|([BCDFGHJKL' +
                               'MNPQRSTVWXZ])[BCDFGHJKLMNPQRSTVWXZ]$'),
                    _kept_groups),
        'C-30,32': (re_compile('^(SA?INT?|SEI[NM]|CINQ?|ST)(?!E)-?'), 'ST-'),
        'C-31,33': (re_compile('^(SAINTE|STE)-?'), 'STE-'),
        # Rules to undo rule bleeding prevention in C-11, C-16, C-17
        'C-34':    ('G#', 'GA'),
        'C-35':    ('MA#', 'MAC')
    }

    # The doubled-letter rules run once at the start and once more near the
    # end, after the other rules have had a chance to create new doubles.
    dedupe = ('V-14', 'C-28', 'C-28a', 'C-28b', 'C-28bb', 'C-28c', 'C-28d')
    sequence = dedupe + (
        'C-12',
        'C-8', 'C-9', 'C-10',
        'C-16', 'C-17', 'C-2', 'C-3', 'C-7',
        'V-2,5', 'V-3,4', 'V-6',
        'V-1', 'C-14',
        'C-31,33', 'C-30,32',
        'C-11', 'V-15', 'V-17', 'V-18',
        'V-7', 'V-8', 'V-9', 'V-10', 'V-11', 'V-12', 'V-13', 'V-16',
        'V-19', 'V-20',
        'C-1', 'C-4', 'C-5', 'C-6', 'C-13', 'C-15',
        'C-18', 'C-19', 'C-20', 'C-21', 'C-22', 'C-23', 'C-24',
        'C-25', 'C-26', 'C-27',
        'C-29',
    ) + dedupe + ('C-34', 'C-35')

    # normalize, upper-case, and filter non-French letters
    allowed = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ-')
    word = normalize('NFKD', text_type(word.upper()))
    word = word.translate({198: 'AE', 338: 'OE'})
    word = ''.join(char for char in word if char in allowed)

    for label in sequence:
        pattern, replacement = rules[label]
        if isinstance(pattern, text_type):
            word = word.replace(pattern, replacement)
        else:
            word = pattern.sub(replacement, word)

    return word
5095
5096
5097
def parmar_kumbharana(word):
    """Return the Parmar-Kumbharana encoding of a word.

    This is based on the phonetic algorithm proposed in :cite:`Parmar:2014`.

    The word is upper-cased, consecutive repeated characters are collapsed,
    the multi-letter sequences in the rule table are replaced (longest match
    first), and finally every A, E, I, O, U and Y after the first character
    is removed.

    :param str word: the word to transform
    :returns: the Parmar-Kumbharana encoding
    :rtype: str
    """
    rule_table = {4: {'OUGH': 'F'},
                  3: {'DGE': 'J',
                      'OUL': 'U',
                      'GHT': 'T'},
                  2: {'CE': 'S', 'CI': 'S', 'CY': 'S',
                      'GE': 'J', 'GI': 'J', 'GY': 'J',
                      'WR': 'R',
                      'GN': 'N', 'KN': 'N', 'PN': 'N',
                      'CK': 'K',
                      'SH': 'S'}}
    # Code points of A, E, I, O, U, Y mapped to '' (i.e. deleted)
    vowel_trans = {65: '', 69: '', 73: '', 79: '', 85: '', 89: ''}

    word = word.upper()  # Rule 3
    word = _delete_consecutive_repeats(word)  # Rule 4

    # Rule 5
    i = 0
    while i < len(word):
        for match_len in range(4, 1, -1):
            if word[i:i+match_len] in rule_table[match_len]:
                repl = rule_table[match_len][word[i:i+match_len]]
                word = (word[:i] + repl + word[i+match_len:])
                # Continue after the replacement; it is not rescanned
                i += len(repl)
                break
        else:
            i += 1

    # Rule 6. word[:1] (not word[0]) so that an empty word does not raise
    # IndexError here.
    word = word[:1]+word[1:].translate(vowel_trans)
    return word
5134
5135
5136
def davidson(lname, fname='.', omit_fname=False):
    """Return Davidson's Consonant Code.

    This is based on the name compression system described in
    :cite:`Davidson:1962`.

    :cite:`Dolby:1970` identifies this as having been the name compression
    algorithm used by SABRE.

    :param str lname: Last name (or word) to be encoded
    :param str fname: First name (optional), of which the first character is
        included in the code.
    :param bool omit_fname: Set to True to completely omit the first character
        of the first name
    :return: Davidson's Consonant Code
    """
    # A, E, I, O, U, H, W and Y are deleted everywhere but the first position
    dropped = {ord(letter): '' for letter in 'AEIOUHWY'}

    surname = text_type(lname.upper())
    skeleton = surname[:1] + surname[1:].translate(dropped)
    # Four characters exactly: truncate, or pad on the right with spaces
    code = _delete_consecutive_repeats(skeleton)[:4].ljust(4)

    if omit_fname:
        return code
    return code + fname[:1].upper()
5162
5163
5164
def sound_d(word, maxlength=4):
    """Return the SoundD code.

    SoundD is defined in :cite:`Varol:2012`.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 4);
        None disables padding and truncation
    :return: the SoundD code
    """
    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    digit_for = {ord(letter): digit for letter, digit in
                 zip(alphabet, '01230120022455012623010202')}

    word = normalize('NFKD', text_type(word.upper())).replace('ß', 'SS')
    word = ''.join(char for char in word if char in alphabet)

    # Initial-letter adjustments
    if word.startswith(('KN', 'GN', 'PN', 'AC', 'WR')):
        word = word[1:]
    elif word.startswith('X'):
        word = 'S'+word[1:]
    elif word.startswith('WH'):
        word = 'W'+word[2:]

    # These sequences are coded as a unit before the letter-by-letter pass
    for sequence, digits in (('DGE', '20'), ('DGI', '20'), ('GH', '0')):
        word = word.replace(sequence, digits)

    word = _delete_consecutive_repeats(word.translate(digit_for))
    word = word.replace('0', '')

    if maxlength is not None:
        # Truncate to maxlength, or right-pad with zeros up to it
        word = word[:maxlength].ljust(maxlength, '0')

    return word
5204
5205
5206
def pshp_soundex_last(lname, maxlength=4, german=False):
    """Calculate the PSHP Soundex/Viewex Coding of a last name.

    This coding is based on :cite:`Hershberg:1976`.

    Reference was also made to the German version of the same:
    :cite:`Hershberg:1979`.

    A separate function, pshp_soundex_first() is used for first names.

    :param lname: the last name to encode
    :param maxlength: the length of the code returned (defaults to 4); None
        disables padding and truncation
    :param german: set to True if the name is German (different rules apply)
    :return: the PSHP Soundex/Viewex code
    """
    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

    name = normalize('NFKD', text_type(lname.upper())).replace('ß', 'SS')
    name = ''.join(char for char in name if char in alphabet)

    # A. Prefix treatment
    if name.startswith(('VON', 'VAN')):
        name = name[3:].strip()

    # The rule implemented below says "MC, MAC become 1". I believe it meant to
    # say they become M except in German data (where superscripted 1 indicates
    # "except in German data"). It doesn't make sense for them to become 1
    # (BPFV -> 1) or to apply outside German. Unfortunately, both articles have
    # this error(?).
    if not german:
        if name.startswith('MAC'):
            name = 'M'+name[3:]
        elif name.startswith('MC'):
            name = 'M'+name[2:]

    # The non-German-only rule to strip ' is unnecessary due to filtering

    if name.startswith(('E', 'I', 'O', 'U')):
        name = 'A' + name[1:]
    elif name.startswith(('GE', 'GI', 'GY')):
        name = 'J' + name[1:]
    elif name.startswith(('CE', 'CI', 'CY')):
        name = 'S' + name[1:]
    elif (name.startswith('CHR') or
          (name.startswith('C') and not name.startswith('CH'))):
        name = 'K' + name[1:]

    if name.startswith('KN'):
        name = 'N' + name[1:]
    elif name.startswith('PH'):
        name = 'F' + name[1:]
    elif name.startswith(('WIE', 'WEI')):
        name = 'V' + name[1:]

    german_initials = {'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}
    if german and name[:1] in german_initials:
        name = german_initials[name[0]] + name[1:]

    # The first letter of the code is fixed before the endings are touched
    code = name[:1]

    # B. Postfix treatment
    if name.endswith('R'):
        name = name[:-1] + 'N'
    elif name.endswith(('SE', 'CE')):
        name = name[:-2]
    if name.endswith('SS'):
        name = name[:-2]
    elif name.endswith('S'):
        name = name[:-1]

    if not german:
        ending_tables = (
            (5, {'STOWN': 'SAWON', 'MPSON': 'MASON'}),
            (4, {'NSEN': 'ASEN', 'MSON': 'ASON', 'STEN': 'SAEN',
                 'STON': 'SAON'}),
        )
        # Only the first (longest) matching ending is rewritten
        for size, table in ending_tables:
            ending = name[-size:]
            if ending in table:
                name = name[:-size] + table[ending]
                break

    if name.endswith(('NG', 'ND')):
        name = name[:-1]
    if not german and name.endswith(('GAN', 'GEN')):
        name = name[:-3]+'A'+name[-2:]

    if german:
        # Three successive steps; each strips the first ending if present,
        # otherwise the second
        for preferred, fallback in (('TES', 'TS'), ('TZE', 'ZE'),
                                    ('Z', 'TE')):
            if name.endswith(preferred):
                name = name[:-len(preferred)]
            elif name.endswith(fallback):
                name = name[:-len(fallback)]

    # C. Infix Treatment
    for old, new in (('CK', 'C'), ('SCH', 'S'), ('DT', 'T'), ('ND', 'N'),
                     ('NG', 'N'), ('LM', 'M'), ('MN', 'M'), ('WIE', 'VIE'),
                     ('WEI', 'VEI')):
        name = name.replace(old, new)

    # D. Soundexing
    # code for X & Y are unspecified, but presumably are 2 & 0
    digit_for = {ord(letter): digit for letter, digit in
                 zip(alphabet, '01230120022455012523010202')}
    digits = _delete_consecutive_repeats(name.translate(digit_for))

    code += digits[1:]
    code = code.replace('0', '')  # rule 1

    if maxlength is not None:
        # Truncate to maxlength, or right-pad with zeros up to it
        code = code[:maxlength].ljust(maxlength, '0')

    return code
5335
5336
5337
def pshp_soundex_first(fname, maxlength=4, german=False):
    """Calculate the PSHP Soundex/Viewex Coding of a first name.

    This coding is based on :cite:`Hershberg:1976`.

    Reference was also made to the German version of the same:
    :cite:`Hershberg:1979`.

    A separate function, pshp_soundex_last() is used for last names.

    :param fname: the first name to encode
    :param maxlength: the length of the code returned (defaults to 4); None
        disables padding and truncation
    :param german: set to True if the name is German (different rules apply)
    :return: the PSHP Soundex/Viewex code
    """
    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    # special rules: these names have fixed codes
    fixed_codes = {'JAMES': 'J7', 'PAT': 'P7'}

    name = normalize('NFKD', text_type(fname.upper())).replace('ß', 'SS')
    name = ''.join(char for char in name if char in alphabet)

    if name in fixed_codes:
        code = fixed_codes[name]
    else:
        # A. Prefix treatment
        if name.startswith(('GE', 'GI', 'GY')):
            name = 'J' + name[1:]
        elif name.startswith(('CE', 'CI', 'CY')):
            name = 'S' + name[1:]
        elif (name.startswith('CHR') or
              (name.startswith('C') and not name.startswith('CH'))):
            name = 'K' + name[1:]

        if name.startswith('KN'):
            name = 'N' + name[1:]
        elif name.startswith('PH'):
            name = 'F' + name[1:]
        elif name.startswith(('WIE', 'WEI')):
            name = 'V' + name[1:]

        german_initials = {'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}
        if german and name[:1] in german_initials:
            name = german_initials[name[0]] + name[1:]

        code = name[:1]

        # B. Soundex coding
        # code for Y unspecified, but presumably is 0
        digit_for = {ord(letter): digit for letter, digit in
                     zip(alphabet, '01230120022455012523010202')}
        digits = _delete_consecutive_repeats(name.translate(digit_for))

        code += digits[1:]
        # Position of the first '0', and of the next '0' counted from just
        # past the first one
        first_zero = code.find('0')
        next_zero = code[first_zero + 1:].find('0')
        if (first_zero != -1 and next_zero != -1 and
                next_zero >= first_zero):
            code = code[:first_zero + 2]

        code = code.replace('0', '')  # rule 1

    if maxlength is not None:
        # Truncate to maxlength, or right-pad with zeros up to it
        code = code[:maxlength].ljust(maxlength, '0')

    return code
5412
5413
5414
def henry_early(word, maxlength=3):
    """Calculate the early version of the Henry code for a word.

    The early version of Henry coding is given in :cite:`Legare:1972`. This is
    different from the later version defined in :cite:`Henry:1976`.

    The word is upper-cased, NFKD-normalized and reduced to the letters A-Z.
    An initial vowel (group) is adjusted (rule I), the word is coded letter
    by letter with reference to the neighbouring letters (rule II), certain
    endings are trimmed, and finally all vowels after the first character
    are dropped.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 3);
        None disables truncation
    :return: the early Henry code (the empty string if the word contains no
        letters A-Z)
    :rtype: str
    """
    _cons = {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q',
             'R', 'S', 'T', 'V', 'W', 'X', 'Z'}
    _vows = {'A', 'E', 'I', 'O', 'U', 'Y'}
    _diph = {'AI': 'E', 'AY': 'E', 'EI': 'E', 'AU': 'O', 'OI': 'O', 'OU': 'O',
             'EU': 'U'}
    _unaltered = {'B', 'D', 'F', 'J', 'K', 'L', 'M', 'N', 'R', 'T', 'V'}
    # NOTE(review): the 'V' entry can never be used, because V is caught by
    # _unaltered first, while Z matches no rule and is dropped. Possibly
    # 'Z': 'S' was intended -- confirm against the paper.
    _simple = {'W': 'V', 'X': 'S', 'V': 'S'}
    # Loop-invariant sets used by rules Ib1 and IId
    _non_nasal = _cons-{'M', 'N'}
    _non_liquid = _cons-{'L', 'R'}

    word = normalize('NFKD', text_type(word.upper()))
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    if not word:
        return ''

    # Rule Ia seems to be covered entirely in II

    # Rule Ib
    if word[0] in _vows:
        # Ib1
        if ((word[1:2] in _non_nasal and word[2:3] in _cons) or
                (word[1:2] in _cons and word[2:3] not in _cons)):
            if word[0] == 'Y':
                word = 'I'+word[1:]
        # Ib2
        elif word[1:2] in {'M', 'N'} and word[2:3] in _cons:
            if word[0] == 'E':
                word = 'A'+word[1:]
            elif word[0] in {'I', 'U', 'Y'}:
                word = 'E'+word[1:]
        # Ib3
        elif word[:2] in _diph:
            word = _diph[word[:2]]+word[2:]
        # Ib4
        elif word[1:2] in _vows and word[0] == 'Y':
            word = 'I' + word[1:]

    code = ''
    skip = 0  # number of upcoming letters already consumed by a rule

    # Rule II
    for pos, char in enumerate(word):
        # Neighbouring letters, taken from the word itself; each is '' at
        # the corresponding end of the word. (Slicing the one-character
        # `char` here would make nxch always empty.)
        nxch = word[pos+1:pos+2]
        prev = word[pos-1:pos]

        if skip:
            skip -= 1
        elif char in _vows:
            code += char
        # IIc
        elif char == nxch:
            skip = 1
            code += char
        elif word[pos:pos+2] in {'CQ', 'DT', 'SC'}:
            skip = 1
            code += word[pos+1]
        # IId
        elif char == 'H' and prev in _cons:
            continue
        elif char == 'S' and nxch in _cons:
            continue
        elif char in _non_liquid and nxch in _non_liquid:
            continue
        elif char == 'L' and nxch in {'M', 'N'}:
            continue
        elif char in {'M', 'N'} and prev in _vows and nxch in _cons:
            continue
        # IIa
        elif char in _unaltered:
            code += char
        # IIb
        elif char in _simple:
            code += _simple[char]
        elif char in {'C', 'G', 'P', 'Q', 'S'}:
            if char == 'C':
                if nxch in {'A', 'O', 'U', 'L', 'R'}:
                    code += 'K'
                elif nxch in {'E', 'I', 'Y'}:
                    # NOTE(review): same output as the G rule below; confirm
                    # against the paper that 'J' is intended for C here.
                    code += 'J'
                elif nxch == 'H':
                    if word[pos+2:pos+3] in _vows:
                        code += 'C'
                    elif word[pos+2:pos+3] in {'R', 'L'}:
                        code += 'K'
            elif char == 'G':
                if nxch in {'A', 'O', 'U', 'L', 'R'}:
                    code += 'G'
                elif nxch in {'E', 'I', 'Y'}:
                    code += 'J'
                elif nxch == 'N':
                    code += 'N'
            elif char == 'P':
                if nxch != 'H':
                    code += 'P'
                else:
                    code += 'F'
            elif char == 'Q':
                # The two letters after Q decide its value; the result is
                # appended to the code (not to the loop variable).
                if word[pos+1:pos+3] in {'UE', 'UI', 'UY'}:
                    code += 'G'
                elif word[pos+1:pos+3] in {'UA', 'UO'}:
                    code += 'K'
            elif char == 'S':
                if word[pos:pos+6] == 'SAINTE':
                    code += 'X'
                    skip = 5
                elif word[pos:pos+5] == 'SAINT':
                    code += 'X'
                    skip = 4
                elif word[pos:pos+3] == 'STE':
                    code += 'X'
                    skip = 2
                elif word[pos:pos+2] == 'ST':
                    code += 'X'
                    skip = 1
                else:
                    code += 'S'
        else:  # letters matched by no rule above are dropped
            continue

    # IIe1
    if code[-4:] in {'AULT', 'EULT', 'OULT'}:
        code = code[:-2]
    elif code[-4:-3] in _vows and code[-3:] == 'MPS':
        code = code[:-3]
    elif code[-3:-2] in _vows and code[-2:] in {'MB', 'MP', 'ND', 'NS', 'NT'}:
        code = code[:-2]
    elif code[-2:-1] == 'R' and code[-1:] in _cons:
        code = code[:-1]
    # IIe2
    elif code[-2:-1] in _vows and code[-1:] in {'D', 'M', 'N', 'S', 'T'}:
        code = code[:-1]
    elif code[-2:] == 'ER':
        code = code[:-1]

    # Drop non-initial vowels
    code = code[:1]+code[1:].translate({65: '', 69: '', 73: '', 79: '', 85: '',
                                        89: ''})

    if maxlength is not None:
        code = code[:maxlength]

    return code
5568
5569
5570
def norphone(word):
    """Return the Norphone code of a word.

    The reference implementation by Lars Marius Garshol is available in
    :cite:`Garshol:2015`.

    Norphone was designed for Norwegian, but this implementation has been
    extended to support Swedish vowels as well. This function incorporates
    the "not implemented" rules from the above file's rule set.

    :param word: the word to encode
    :return: the Norphone code
    """
    vowel_set = {'A', 'E', 'I', 'O', 'U', 'Y', 'Å', 'Æ', 'Ø', 'Ä', 'Ö'}

    # Substitution rules, grouped by the length of the pattern they match.
    rules = {4: {'SKEI': 'X'},
             3: {'SKJ': 'X', 'KEI': 'X'},
             2: {'CH': 'K', 'CK': 'K', 'GJ': 'J', 'GH': 'K', 'HG': 'K',
                 'HJ': 'J', 'HL': 'L', 'HR': 'R', 'KJ': 'X', 'KI': 'X',
                 'LD': 'L', 'ND': 'N', 'PH': 'F', 'TH': 'T', 'SJ': 'X'},
             1: {'W': 'V', 'X': 'KS', 'Z': 'S', 'D': 'T', 'G': 'K'}}
    # The longest patterns are always tried first.
    rule_lengths = sorted(rules, reverse=True)

    # Word-initial special cases: the first matching prefix sets the start of
    # the code and is consumed in full.
    initial_rules = (('AA', 'Å'), ('GI', 'J'), ('SKY', 'X'), ('EI', 'Æ'),
                     ('KY', 'X'), ('C', 'K'), ('Ä', 'Æ'), ('Ö', 'Ø'))

    word = word.upper()

    encoded = ''
    idx = 0
    for prefix, prefix_code in initial_rules:
        if word.startswith(prefix):
            encoded = prefix_code
            idx = len(prefix)
            break

    if word.endswith('DT'):
        word = word[:-2]+'T'
    # Though the rules indicate this rule applies in all positions, the
    # reference implementation indicates it applies only in final position.
    elif word[-1:] == 'D' and word[-2:-1] in vowel_set:
        word = word[:-2]

    # Walk the (possibly shortened) word from the first unconsumed letter.
    end = len(word)
    while idx < end:
        for size in rule_lengths:
            chunk = word[idx:idx+size]
            if chunk in rules[size]:
                encoded += rules[size][chunk]
                idx += size
                break
        else:
            # No rule matched: keep the letter unless it is a non-initial
            # vowel.
            letter = word[idx]
            if idx == 0 or letter not in vowel_set:
                encoded += letter
            idx += 1

    return _delete_consecutive_repeats(encoded)
5645
5646
5647
def dolby(word, maxlength=None, keep_vowels=False, vowel_char='*'):
    r"""Return the Dolby Code of a name.

    This follows "A Spelling Equivalent Abbreviation Algorithm For Personal
    Names" from :cite:`Dolby:1970` and :cite:`Cunningham:1969`.

    The word is uppercased, NFKD-normalized and reduced to the letters A-Z,
    then rewritten by a fixed sequence of rules (the order of the rewrites
    matters, since later rules operate on the output of earlier ones).
    Finally vowels are replaced by ``vowel_char`` or dropped.

    :param word: the word to encode
    :param maxlength: maximum length of the returned Dolby code -- this also
        activates the fixed-length code mode (rules FL6, FL9 and FL13-FL16
        are applied only when this is truthy)
    :param keep_vowels: if True, retains all vowel markers
    :param vowel_char: the vowel marker character (default to \*)
    :return: the Dolby code; in fixed-length mode it is truncated to
        ``maxlength`` characters and right-padded with spaces up to that
        length
    """
    _vowels = {'A', 'E', 'I', 'O', 'U', 'Y'}

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    # Rule 1 (FL2)
    # A leading MCG/MAG/MAC or MC is rewritten to MK.
    if word[:3] in {'MCG', 'MAG', 'MAC'}:
        word = 'MK'+word[3:]
    elif word[:2] == 'MC':
        word = 'MK'+word[2:]

    # Rule 2 (FL3)
    # Scan right to left; for each listed pair, delete its second letter.
    # After a deletion pos is bumped and then decremented, so the same
    # position is examined again against the shortened word.
    pos = len(word)-2
    while pos > -1:
        if word[pos:pos+2] in {'DT', 'LD', 'ND', 'NT', 'RC', 'RD', 'RT', 'SC',
                               'SK', 'ST'}:
            word = word[:pos+1]+word[pos+2:]
            pos += 1
        pos -= 1

    # Rule 3 (FL4)
    # Although the rule indicates "after the first letter", the test cases make
    # it clear that these apply to the first letter also.
    word = word.replace('X', 'KS')
    word = word.replace('CE', 'SE')
    word = word.replace('CI', 'SI')
    word = word.replace('CY', 'SI')

    # not in the rule set, but they seem to have intended it
    word = word.replace('TCH', 'CH')

    # For every CH found from index 1 onward that is not preceded by a vowel,
    # turn the C into S (leaving SH).
    pos = word.find('CH', 1)
    while pos != -1:
        if word[pos-1:pos] not in _vowels:
            word = word[:pos]+'S'+word[pos+1:]
        pos = word.find('CH', pos+1)

    # Any C still left at this point becomes K.
    word = word.replace('C', 'K')
    word = word.replace('Z', 'S')

    word = word.replace('WR', 'R')
    word = word.replace('DG', 'G')
    word = word.replace('QU', 'K')
    word = word.replace('T', 'D')
    word = word.replace('PH', 'F')

    # Rule 4 (FL5)
    # Although the rule indicates "after the first letter", the test cases make
    # it clear that these apply to the first letter also.
    # For each K at index 2 or later whose preceding letter is not a vowel,
    # L, N or R, delete that preceding letter; pos is decremented so it keeps
    # pointing at the same K before searching for the next one.
    pos = word.find('K', 0)
    while pos != -1:
        if pos > 1 and word[pos-1:pos] not in _vowels | {'L', 'N', 'R'}:
            word = word[:pos-1]+word[pos:]
            pos -= 1
        pos = word.find('K', pos+1)

    # Rule FL6
    # Fixed-length mode only: drop a final E.
    if maxlength and word[-1:] == 'E':
        word = word[:-1]

    # Rule 5 (FL7)
    word = _delete_consecutive_repeats(word)

    # Rule 6 (FL8)
    # Leading PF loses its P; trailing PF loses its F. A trailing GH becomes
    # F after a vowel and G otherwise; every remaining GH is removed.
    if word[:2] == 'PF':
        word = word[1:]
    if word[-2:] == 'PF':
        word = word[:-1]
    elif word[-2:] == 'GH':
        if word[-3:-2] in _vowels:
            word = word[:-2]+'F'
        else:
            word = word[:-2]+'G'
    word = word.replace('GH', '')

    # Rule FL9
    # Fixed-length mode only: V is coded as F.
    if maxlength:
        word = word.replace('V', 'F')

    # Rules 7-9 (FL10-FL12)
    # `first` counts how many vowel markers are still emitted when
    # keep_vowels is False: one normally, two in fixed-length mode.
    # Non-initial W and H are dropped; all other letters are copied.
    first = 1 + (1 if maxlength else 0)
    code = ''
    for pos, char in enumerate(word):
        if char in _vowels:
            if first or keep_vowels:
                code += vowel_char
                first -= 1
            else:
                continue
        elif pos > 0 and char in {'W', 'H'}:
            continue
        else:
            code += char

    if maxlength:
        # Rule FL13
        # An over-long code ending in S loses that S before truncation.
        if len(code) > maxlength and code[-1:] == 'S':
            code = code[:-1]
        if keep_vowels:
            code = code[:maxlength]
        else:
            # Rule FL14
            code = code[:maxlength + 2]
            # Rule FL15
            # While the code is too long: keep only as many leading vowel
            # markers as the current overshoot, drop the later ones, then
            # truncate to one character less than the previous bound.
            # `word` is reused here as a scratch copy of the current code.
            while len(code) > maxlength:
                vowels = len(code) - maxlength
                excess = vowels - 1
                word = code
                code = ''
                for char in word:
                    if char == vowel_char:
                        if vowels:
                            code += char
                            vowels -= 1
                    else:
                        code += char
                code = code[:maxlength + excess]

        # Rule FL16
        # Right-pad short codes with spaces up to maxlength.
        code += ' ' * (maxlength - len(code))

    return code
5787
5788
5789
def phonetic_spanish(word, maxlength=None):
    """Return the PhoneticSpanish coding of word.

    This follows the coding described in :cite:`Amon:2012` and
    :cite:`delPilarAngeles:2015`.

    The word is uppercased, NFKD-normalized and reduced to its consonants
    (A-Z minus the vowels and W); doubled L and R are merged; each remaining
    letter is then replaced by its digit.

    :param word: the word to encode
    :param maxlength: maximum length of the returned code; if None (or any
        other falsy value) the code is not truncated
    :return: the PhoneticSpanish code, a string of digits
    """
    _es_soundex_translation = dict(zip((ord(char) for char in
                                        'BCDFGHJKLMNPQRSTVXYZ'),
                                       '14328287566079431454'))

    # uppercase, normalize, and decompose, filter to A-Z minus vowels & W
    # (i.e. keep exactly the letters the translation table has a code for)
    word = normalize('NFKD', text_type(word.upper()))
    word = ''.join(c for c in word if ord(c) in _es_soundex_translation)

    # merge repeated Ls & Rs
    word = word.replace('LL', 'L')
    # This was previously replace('R', 'R'), a no-op that left RR unmerged.
    word = word.replace('RR', 'R')

    # apply the Soundex algorithm
    sdx = word.translate(_es_soundex_translation)

    if maxlength:
        sdx = sdx[:maxlength]

    return sdx
5819
5820
5821
def spanish_metaphone(word, maxlength=6, modified=False):
    """Return the Spanish Metaphone of a word.

    This is a quick rewrite of the Spanish Metaphone Algorithm, as presented at
    https://github.com/amsqr/Spanish-Metaphone and discussed in
    :cite:`Mosquera:2012`.

    Modified version based on :cite:`delPilarAngeles:2016`.

    :param word: the word to encode
    :param maxlength: encoding stops once the key has reached this length
        (a final two-letter step may carry it one character past)
    :param modified: Set to True to use del Pilar Angeles & Bailón-Miguel's
        modified version of the algorithm
    :return: the Spanish Metaphone key
    """
    vowels = {'A', 'E', 'I', 'O', 'U'}
    # consonants copied to the key unchanged, with doubles collapsed
    plain_consonants = {'D', 'F', 'J', 'K', 'M', 'N', 'P', 'T', 'V', 'L', 'Y'}
    # consonants with a fixed one-letter code and no look-ahead
    fixed_codes = {'W': 'U', 'R': 'R', 'Z': 'Z'}

    word = normalize('NFC', text_type(word.upper()))

    # do some replacements for the modified version
    if modified:
        for old, new in (('MB', 'NB'), ('MP', 'NP'), ('BS', 'S')):
            word = word.replace(old, new)
        if word.startswith('PS'):
            word = word[1:]

    # simple replacements (applied strictly in this order)
    for old, new in (('Á', 'A'), ('CH', 'X'), ('Ç', 'S'), ('É', 'E'),
                     ('Í', 'I'), ('Ó', 'O'), ('Ú', 'U'), ('Ñ', 'NY'),
                     ('GÜ', 'W'), ('Ü', 'U'), ('B', 'V'), ('LL', 'Y')):
        word = word.replace(old, new)

    key = ''
    idx = 0
    while len(key) < maxlength and idx < len(word):
        letter = word[idx]
        # the following character, or '' at the end of the word
        following = word[idx+1:idx+2]
        vowel_follows = following in vowels

        if idx == 0 and letter in vowels:
            # a vowel is only ever encoded in initial position
            key += letter
            idx += 1
        elif letter in plain_consonants:
            key += letter
            # skip doubled consonants
            idx += 2 if following == letter else 1
        elif letter == 'C':
            if following == 'C':
                # special case 'acción', 'reacción',etc.
                key += 'X'
                idx += 2
            elif following in {'E', 'I'}:
                # special case 'cesar', 'cien', 'cid', 'conciencia'
                key += 'Z'
                idx += 2
            else:
                key += 'K'
                idx += 1
        elif letter == 'G':
            if following in {'E', 'I'}:
                # special case 'gente', 'ecologia',etc
                key += 'J'
                idx += 2
            else:
                key += 'G'
                idx += 1
        elif letter == 'H':
            # since the letter 'H' is silent in Spanish, the key takes the
            # vowel after it instead
            if vowel_follows:
                key += following
                idx += 2
            else:
                key += 'H'
                idx += 1
        elif letter == 'Q':
            key += 'K'
            idx += 2 if following == 'U' else 1
        elif letter in fixed_codes:
            key += fixed_codes[letter]
            idx += 1
        elif letter == 'S':
            # an initial S not followed by a vowel gets a leading E
            key += 'ES' if idx == 0 and not vowel_follows else 'S'
            idx += 1
        elif letter == 'X':
            # likewise for an initial X, but only in words of 2+ letters
            if idx == 0 and len(word) > 1 and not vowel_follows:
                key += 'EX'
            else:
                key += 'X'
            idx += 1
        else:
            # anything else (including non-initial vowels) is dropped
            idx += 1

    # Final change from S to Z in modified version
    if modified:
        key = key.replace('S', 'Z')

    return key
5960
5961
5962
def metasoundex(word, language='en'):
    """Return the MetaSoundex code for a word.

    This is based on :cite:`Koneru:2017`.

    For Spanish the code is the PhoneticSpanish coding of the word's Spanish
    Metaphone. For any other language value it is the Soundex of the word's
    Metaphone, with the leading character translated to a digit as well.

    :param word: the word to encode
    :param language: either 'en' for English or 'es' for Spanish
    :return: the MetaSoundex code
    """
    if language == 'es':
        return phonetic_spanish(spanish_metaphone(word))

    # Letter-to-digit table, applied only to the first character of the
    # Soundex code.
    letter_codes = dict(zip((ord(letter) for letter in
                             'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                            '07430755015866075943077514'))

    sdx = soundex(metaphone(word))
    return sdx[0].translate(letter_codes)+sdx[1:]
5982
5983
5984
def soundex_br(word, maxlength=4, zero_pad=True):
    """Return the SoundexBR encoding of a word.

    :param word: the word to encode
    :param maxlength: the length of the returned code (longer codes are
        truncated to this length)
    :param zero_pad: if True, codes shorter than maxlength are right-padded
        with zeros
    :return: the SoundexBR code
    """
    digit_for = dict(zip((ord(letter) for letter in
                          'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                         '01230120022455012623010202'))
    keep = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
            'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'}

    # uppercase, decompose, and keep only A-Z
    word = normalize('NFKD', text_type(word.upper()))
    word = ''.join(letter for letter in word if letter in keep)

    # The first character of the code is a letter, adjusted for a few
    # spelling conventions.
    initial = word[:1]
    follower = word[1:2]
    if initial == 'H':
        # drop the leading H so that the next letter leads the code
        word = word[1:]
        first = follower
    elif initial == 'W' and follower == 'A':
        first = 'V'
    elif initial == 'K' and follower in {'A', 'O', 'U'}:
        first = 'C'
    elif initial == 'C' and follower in {'I', 'E'}:
        first = 'S'
    elif initial == 'G' and follower in {'E', 'I'}:
        first = 'J'
    elif initial == 'Y':
        first = 'I'
    else:
        first = initial

    # Everything after the first character becomes digits; runs collapse and
    # zeros are removed.
    sdx = first + word[1:].translate(digit_for)
    sdx = _delete_consecutive_repeats(sdx).replace('0', '')

    if zero_pad:
        sdx += '0' * maxlength

    return sdx[:maxlength]
6024
6025
6026
def bmpm(word, language_arg=0, name_mode='gen', match_mode='approx',
         concat=False, filter_langs=False):
    """Return the Beider-Morse Phonetic Matching algorithm code for a word.

    The Beider-Morse Phonetic Matching algorithm is described in
    :cite:`Beider:2008`.
    The reference implementation is licensed under GPLv3.

    This function is a thin public wrapper: all arguments are passed through
    positionally, unchanged, to ``_bmpm``.

    :param str word: the word to transform
    :param str language_arg: the language of the term (the default is the
        integer 0, which is handed to ``_bmpm`` as-is); supported values
        include:

            - 'any'
            - 'arabic'
            - 'cyrillic'
            - 'czech'
            - 'dutch'
            - 'english'
            - 'french'
            - 'german'
            - 'greek'
            - 'greeklatin'
            - 'hebrew'
            - 'hungarian'
            - 'italian'
            - 'polish'
            - 'portuguese'
            - 'romanian'
            - 'russian'
            - 'spanish'
            - 'turkish'
            - 'germandjsg'
            - 'polishdjskp'
            - 'russiandjsre'

    :param str name_mode: the name mode of the algorithm:

            - 'gen' -- general (default)
            - 'ash' -- Ashkenazi
            - 'sep' -- Sephardic

    :param str match_mode: matching mode: 'approx' or 'exact'
    :param bool concat: concatenation mode
    :param bool filter_langs: filter out incompatible languages
    :returns: the BMPM value(s)
    :rtype: str -- NOTE(review): this was documented as tuple, but the
        examples below show a single space-separated string; confirm against
        ``_bmpm``

    >>> bmpm('Christopher')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir xristopi xritopir xritopi xristofi xritofir xritofi tzristopir
    tzristofir zristopir zristopi zritopir zritopi zristofir zristofi zritofir
    zritofi'
    >>> bmpm('Niall')
    'nial niol'
    >>> bmpm('Smith')
    'zmit'
    >>> bmpm('Schmidt')
    'zmit stzmit'

    >>> bmpm('Christopher', language_arg='German')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir'
    >>> bmpm('Christopher', language_arg='English')
    'tzristofir tzrQstofir tzristafir tzrQstafir xristofir xrQstofir xristafir
    xrQstafir'
    >>> bmpm('Christopher', language_arg='German', name_mode='ash')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir'

    >>> bmpm('Christopher', language_arg='German', match_mode='exact')
    'xriStopher xriStofer xristopher xristofer'
    """
    return _bmpm(word, language_arg, name_mode, match_mode,
                 concat, filter_langs)
6100
6101
6102
# When executed as a script, run the doctest examples embedded in this
# module's docstrings.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
6105