Completed
Push — master ( 9304cd...7e413a )
by Chris
11:51
created

abydos.phonetic.roger_root()   B

Complexity

Conditions 7

Size

Total Lines 79
Code Lines 47

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 7
eloc 47
nop 3
dl 0
loc 79
rs 7.3345
c 0
b 0
f 0

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
# -*- coding: utf-8 -*-
0 ignored issues
show
coding-style introduced by
Too many lines in module (6456/1000)
Loading history...
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19
"""abydos.phonetic.
20
21
The phonetic module implements phonetic algorithms including:
22
23
    - Robert C. Russell's Index
24
    - American Soundex
25
    - Refined Soundex
26
    - Daitch-Mokotoff Soundex
27
    - Kölner Phonetik
28
    - NYSIIS
29
    - Match Rating Algorithm
30
    - Metaphone
31
    - Double Metaphone
32
    - Caverphone
33
    - Alpha Search Inquiry System
34
    - Fuzzy Soundex
35
    - Phonex
36
    - Phonem
37
    - Phonix
38
    - SfinxBis
39
    - phonet
40
    - Standardized Phonetic Frequency Code
41
    - Statistics Canada
42
    - Lein
43
    - Roger Root
44
    - Oxford Name Compression Algorithm (ONCA)
45
    - Eudex phonetic hash
46
    - Haase Phonetik
47
    - Reth-Schek Phonetik
48
    - FONEM
49
    - Parmar-Kumbharana
50
    - Davidson's Consonant Code
51
    - SoundD
52
    - PSHP Soundex/Viewex Coding
53
    - an early version of Henry Code
54
    - Norphone
55
    - Dolby Code
56
    - Phonetic Spanish
57
    - Spanish Metaphone
58
    - MetaSoundex
59
    - SoundexBR
60
    - NRL English-to-phoneme
61
    - Beider-Morse Phonetic Matching
62
"""
63
64
from __future__ import division, unicode_literals
65
66
from collections import Counter
67
from itertools import groupby, product
68
from re import compile as re_compile
69
from re import match as re_match
70
from unicodedata import normalize
71
72
from six import text_type
73
from six.moves import range
74
75
from ._bm import _bmpm
76
77
# Sentinel used as the "no length limit" value for maxlength parameters.
_INFINITY = float('inf')

# Names exported by a star-import of this module.
__all__ = ['alpha_sis', 'bmpm', 'caverphone', 'davidson', 'dm_soundex',
           'dolby', 'double_metaphone', 'eudex', 'fonem', 'fuzzy_soundex',
           'haase_phonetik', 'henry_early', 'koelner_phonetik',
           'koelner_phonetik_alpha', 'koelner_phonetik_num_to_alpha', 'lein',
           'metaphone', 'metasoundex', 'mra', 'norphone', 'nrl', 'nysiis',
           'onca', 'parmar_kumbharana', 'phonem', 'phonet', 'phonetic_spanish',
           'phonex', 'phonix', 'pshp_soundex_first', 'pshp_soundex_last',
           'refined_soundex', 'reth_schek_phonetik', 'roger_root',
           'russell_index', 'russell_index_alpha',
           'russell_index_num_to_alpha', 'sfinxbis', 'sound_d', 'soundex',
           'soundex_br', 'spanish_metaphone', 'spfc', 'statistics_canada']
90
91
92
def _delete_consecutive_repeats(word):
93
    """Delete consecutive repeated characters in a word.
94
95
    :param str word: the word to transform
96
    :returns: word with consecutive repeating characters collapsed to
97
        a single instance
98
    :rtype: str
99
    """
100
    return ''.join(char for char, _ in groupby(word))
101
102
103
def russell_index(word):
    """Return the Russell Index (integer output) of a word.

    This follows Robert C. Russell's Index algorithm, as described in
    :cite:`Russell:1917`.

    :param str word: the word to transform
    :returns: the Russell Index value; if the word contains no letter that
        the mapping covers, ``float('NaN')`` is returned instead of an int
    :rtype: int

    >>> russell_index('Christopher')
    3813428
    >>> russell_index('Niall')
    715
    >>> russell_index('Smith')
    3614
    >>> russell_index('Schmidt')
    3614
    """
    # code point -> digit; H, J and W are deliberately absent from the mapping
    _russell_translation = dict(zip((ord(char) for char in
                                     'ABCDEFGIKLMNOPQRSTUVXYZ'),
                                    '12341231356712383412313'))

    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = word.replace('GH', '')  # discard gh (rule 3)
    word = word.rstrip('SZ')  # discard /[sz]$/ (rule 3)

    # drop every character the mapping does not cover, then translate
    word = ''.join(c for c in word if ord(c) in _russell_translation)
    sdx = word.translate(_russell_translation)

    # remove any 1s after the first occurrence
    # (find() returns -1 when there is no '1', so `one` is then 0 == falsy)
    one = sdx.find('1')+1
    if one:
        sdx = sdx[:one] + ''.join(c for c in sdx[one:] if c != '1')

    # remove repeating characters
    sdx = _delete_consecutive_repeats(sdx)

    # return as an int (NaN signals "nothing to encode")
    return int(sdx) if sdx else float('NaN')
147
148
149
def russell_index_num_to_alpha(num):
    """Convert the Russell Index integer to an alphabetic string.

    This follows Robert C. Russell's Index algorithm, as described in
    :cite:`Russell:1917`.

    Any character of the stringified input that is not one of the digits
    1-8 is ignored, so a NaN input yields the empty string.

    :param int num: a Russell Index integer value
    :returns: the Russell Index as an alphabetic string
    :rtype: str

    >>> russell_index_num_to_alpha(3813428)
    'CRACDBR'
    >>> russell_index_num_to_alpha(715)
    'NAL'
    >>> russell_index_num_to_alpha(3614)
    'CMAD'
    """
    _russell_num_translation = dict(zip((ord(digit) for digit in '12345678'),
                                        'ABCDLMNR'))
    # keep only the digits that have an alphabetic equivalent
    digits = ''.join(c for c in text_type(num)
                     if ord(c) in _russell_num_translation)
    # translating an empty string yields '', so no special case is needed
    return digits.translate(_russell_num_translation)
173
174
175
def russell_index_alpha(word):
    """Return the Russell Index (alphabetic output) for the word.

    This follows Robert C. Russell's Index algorithm, as described in
    :cite:`Russell:1917`.

    :param str word: the word to transform
    :returns: the Russell Index value as an alphabetic string; the empty
        string for an empty word
    :rtype: str

    >>> russell_index_alpha('Christopher')
    'CRACDBR'
    >>> russell_index_alpha('Niall')
    'NAL'
    >>> russell_index_alpha('Smith')
    'CMAD'
    >>> russell_index_alpha('Schmidt')
    'CMAD'
    """
    if not word:
        return ''
    # compute the numeric index, then map its digits back to letters
    return russell_index_num_to_alpha(russell_index(word))
197
198
199
def soundex(word, maxlength=4, var='American', reverse=False, zero_pad=True):
    """Return the Soundex code for a word.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 4);
        values are clamped to the range 4-64, and None is treated as 64
    :param str var: the variant of the algorithm to employ (defaults to
        'American'):

        - 'American' follows the American Soundex algorithm, as described at
          :cite:`US:2007` and in :cite:`Knuth:1998`; this is also called
          Miracode
        - 'special' follows the rules from the 1880-1910 US Census
          retrospective re-analysis, in which h & w are not treated as blocking
          consonants but as vowels. Cf. :cite:`Repici:2013`.
        - 'Census' follows the rules laid out in GIL 55 :cite:`US:1997` by the
          US Census, including coding prefixed and unprefixed versions of some
          names

    :param bool reverse: reverse the word before computing the selected Soundex
        (defaults to False); This results in "Reverse Soundex", which is useful
        for blocking in cases where the initial elements may be in error.
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Soundex value; in 'Census' mode, a name starting with one
        of the recognized prefixes yields a 2-tuple of codes (with and
        without the prefix)
    :rtype: str

    >>> soundex("Christopher")
    'C623'
    >>> soundex("Niall")
    'N400'
    >>> soundex('Smith')
    'S530'
    >>> soundex('Schmidt')
    'S530'


    >>> soundex('Christopher', maxlength=_INFINITY)
    'C623160000000000000000000000000000000000000000000000000000000000'
    >>> soundex('Christopher', maxlength=_INFINITY, zero_pad=False)
    'C62316'

    >>> soundex('Christopher', reverse=True)
    'R132'

    >>> soundex('Ashcroft')
    'A261'
    >>> soundex('Asicroft')
    'A226'
    >>> soundex('Ashcroft', var='special')
    'A226'
    >>> soundex('Asicroft', var='special')
    'A226'
    """
    # code point -> digit; vowels and Y map to 0, H and W map to 9
    _soundex_translation = dict(zip((ord(char) for char in
                                     'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                    '01230129022455012623019202'))

    # Require a maxlength of at least 4 and not more than 64
    if maxlength is not None:
        maxlength = min(max(4, maxlength), 64)
    else:
        maxlength = 64

    # uppercase, normalize, decompose
    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    if var == 'Census':
        # TODO: Should these prefixes be supplemented? (VANDE, DELA, VON)
        if word[:3] in {'VAN', 'CON'} and len(word) > 4:
            return (soundex(word, maxlength, 'American', reverse, zero_pad),
                    soundex(word[3:], maxlength, 'American', reverse,
                            zero_pad))
        if word[:2] in {'DE', 'DI', 'LA', 'LE'} and len(word) > 3:
            return (soundex(word, maxlength, 'American', reverse, zero_pad),
                    soundex(word[2:], maxlength, 'American', reverse,
                            zero_pad))
        # Otherwise, proceed as usual (var='American' mode, ostensibly)

    # filter non-A-Z out (the translation table's keys are exactly A-Z)
    word = ''.join(c for c in word if ord(c) in _soundex_translation)

    # Nothing to convert, return base case
    if not word:
        if zero_pad:
            return '0'*maxlength
        return '0'

    # Reverse word if computing Reverse Soundex
    if reverse:
        word = word[::-1]

    # apply the Soundex algorithm
    sdx = word.translate(_soundex_translation)

    if var == 'special':
        sdx = sdx.replace('9', '0')  # special rule for 1880-1910 census
    else:
        sdx = sdx.replace('9', '')  # rule 1
    sdx = _delete_consecutive_repeats(sdx)  # rule 3

    # The code starts with the first letter itself. When that letter is H or
    # W, the whole digit string is kept after it; otherwise the first digit
    # is dropped in favor of the letter.
    if word[0] in 'HW':
        sdx = word[0] + sdx
    else:
        sdx = word[0] + sdx[1:]
    sdx = sdx.replace('0', '')  # rule 1

    if zero_pad:
        sdx += ('0'*maxlength)  # rule 4

    return sdx[:maxlength]
312
313
314
def refined_soundex(word, maxlength=_INFINITY, zero_pad=False,
                    retain_vowels=False):
    """Return the Refined Soundex code for a word.

    This is Soundex, but with more character classes. It was defined at
    :cite:`Boyce:1998`.

    :param word: the word to transform
    :param maxlength: the length of the code returned (defaults to unlimited)
    :param zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string (only applied when maxlength is finite)
    :param retain_vowels: retain vowels (as 0) in the resulting code
    :returns: the Refined Soundex value
    :rtype: str

    >>> refined_soundex('Christopher')
    'C393619'
    >>> refined_soundex('Niall')
    'N87'
    >>> refined_soundex('Smith')
    'S386'
    >>> refined_soundex('Schmidt')
    'S386'
    """
    # code point -> digit; vowels, H, W and Y map to 0
    _ref_soundex_translation = dict(zip((ord(char) for char in
                                         'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                        '01360240043788015936020505'))

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if ord(c) in _ref_soundex_translation)

    # apply the Soundex algorithm: first letter, then the code of every
    # letter (the first letter's own code included)
    sdx = word[:1] + word.translate(_ref_soundex_translation)
    sdx = _delete_consecutive_repeats(sdx)
    if not retain_vowels:
        sdx = sdx.replace('0', '')  # Delete vowels, H, W, Y

    # padding and truncation only make sense for a finite target length
    if maxlength < _INFINITY:
        if zero_pad:
            sdx += ('0' * maxlength)
        sdx = sdx[:maxlength]

    return sdx
362
363
364
def dm_soundex(word, maxlength=6, zero_pad=True):
    """Return the Daitch-Mokotoff Soundex code for a word.

    Based on Daitch-Mokotoff Soundex :cite:`Mokotoff:1997`, this returns values
    of a word as a set. A collection is necessary since there can be multiple
    values for a single word.

    :param word: the word to transform
    :param maxlength: the length of the code returned (defaults to 6);
        values are clamped to the range 6-64, and None is treated as 64
    :param zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Daitch-Mokotoff Soundex value(s)
    :rtype: set

    >>> sorted(dm_soundex('Christopher'))
    ['494379', '594379']
    >>> dm_soundex('Niall')
    {'680000'}
    >>> dm_soundex('Smith')
    {'463000'}
    >>> dm_soundex('Schmidt')
    {'463000'}

    >>> sorted(dm_soundex('The quick brown fox', maxlength=20, zero_pad=False))
    ['35457976754', '3557976754']
    """
    # substring -> (code at word start, code before a vowel, code elsewhere)
    # A '_' is a placeholder that is stripped from the final code, and a
    # nested tuple lists two alternative codes (the result branches).
    _dms_table = {'STCH': (2, 4, 4), 'DRZ': (4, 4, 4), 'ZH': (4, 4, 4),
                  'ZHDZH': (2, 4, 4), 'DZH': (4, 4, 4), 'DRS': (4, 4, 4),
                  'DZS': (4, 4, 4), 'SCHTCH': (2, 4, 4), 'SHTSH': (2, 4, 4),
                  'SZCZ': (2, 4, 4), 'TZS': (4, 4, 4), 'SZCS': (2, 4, 4),
                  'STSH': (2, 4, 4), 'SHCH': (2, 4, 4), 'D': (3, 3, 3),
                  'H': (5, 5, '_'), 'TTSCH': (4, 4, 4), 'THS': (4, 4, 4),
                  'L': (8, 8, 8), 'P': (7, 7, 7), 'CHS': (5, 54, 54),
                  'T': (3, 3, 3), 'X': (5, 54, 54), 'OJ': (0, 1, '_'),
                  'OI': (0, 1, '_'), 'SCHTSH': (2, 4, 4), 'OY': (0, 1, '_'),
                  'Y': (1, '_', '_'), 'TSH': (4, 4, 4), 'ZDZ': (2, 4, 4),
                  'TSZ': (4, 4, 4), 'SHT': (2, 43, 43), 'SCHTSCH': (2, 4, 4),
                  'TTSZ': (4, 4, 4), 'TTZ': (4, 4, 4), 'SCH': (4, 4, 4),
                  'TTS': (4, 4, 4), 'SZD': (2, 43, 43), 'AI': (0, 1, '_'),
                  'PF': (7, 7, 7), 'TCH': (4, 4, 4), 'PH': (7, 7, 7),
                  'TTCH': (4, 4, 4), 'SZT': (2, 43, 43), 'ZDZH': (2, 4, 4),
                  'EI': (0, 1, '_'), 'G': (5, 5, 5), 'EJ': (0, 1, '_'),
                  'ZD': (2, 43, 43), 'IU': (1, '_', '_'), 'K': (5, 5, 5),
                  'O': (0, '_', '_'), 'SHTCH': (2, 4, 4), 'S': (4, 4, 4),
                  'TRZ': (4, 4, 4), 'SHD': (2, 43, 43), 'DSH': (4, 4, 4),
                  'CSZ': (4, 4, 4), 'EU': (1, 1, '_'), 'TRS': (4, 4, 4),
                  'ZS': (4, 4, 4), 'STRZ': (2, 4, 4), 'UY': (0, 1, '_'),
                  'STRS': (2, 4, 4), 'CZS': (4, 4, 4),
                  'MN': ('6_6', '6_6', '6_6'), 'UI': (0, 1, '_'),
                  'UJ': (0, 1, '_'), 'UE': (0, '_', '_'), 'EY': (0, 1, '_'),
                  'W': (7, 7, 7), 'IA': (1, '_', '_'), 'FB': (7, 7, 7),
                  'STSCH': (2, 4, 4), 'SCHT': (2, 43, 43),
                  'NM': ('6_6', '6_6', '6_6'), 'SCHD': (2, 43, 43),
                  'B': (7, 7, 7), 'DSZ': (4, 4, 4), 'F': (7, 7, 7),
                  'N': (6, 6, 6), 'CZ': (4, 4, 4), 'R': (9, 9, 9),
                  'U': (0, '_', '_'), 'V': (7, 7, 7), 'CS': (4, 4, 4),
                  'Z': (4, 4, 4), 'SZ': (4, 4, 4), 'TSCH': (4, 4, 4),
                  'KH': (5, 5, 5), 'ST': (2, 43, 43), 'KS': (5, 54, 54),
                  'SH': (4, 4, 4), 'SC': (2, 4, 4), 'SD': (2, 43, 43),
                  'DZ': (4, 4, 4), 'ZHD': (2, 43, 43), 'DT': (3, 3, 3),
                  'ZSH': (4, 4, 4), 'DS': (4, 4, 4), 'TZ': (4, 4, 4),
                  'TS': (4, 4, 4), 'TH': (3, 3, 3), 'TC': (4, 4, 4),
                  'A': (0, '_', '_'), 'E': (0, '_', '_'), 'I': (0, '_', '_'),
                  'AJ': (0, 1, '_'), 'M': (6, 6, 6), 'Q': (5, 5, 5),
                  'AU': (0, 7, '_'), 'IO': (1, '_', '_'), 'AY': (0, 1, '_'),
                  'IE': (1, '_', '_'), 'ZSCH': (4, 4, 4),
                  'CH': ((5, 4), (5, 4), (5, 4)),
                  'CK': ((5, 45), (5, 45), (5, 45)),
                  'C': ((5, 4), (5, 4), (5, 4)),
                  'J': ((1, 4), ('_', 4), ('_', 4)),
                  'RZ': ((94, 4), (94, 4), (94, 4)),
                  'RS': ((94, 4), (94, 4), (94, 4))}

    # first letter -> candidate substrings, in the order they are tried;
    # each list ends with the bare letter, so a match is always found
    _dms_order = {'A': ('AI', 'AJ', 'AU', 'AY', 'A'),
                  'B': ('B',),
                  'C': ('CHS', 'CSZ', 'CZS', 'CH', 'CK', 'CS', 'CZ', 'C'),
                  'D': ('DRS', 'DRZ', 'DSH', 'DSZ', 'DZH', 'DZS', 'DS', 'DT',
                        'DZ', 'D'),
                  'E': ('EI', 'EJ', 'EU', 'EY', 'E'),
                  'F': ('FB', 'F'),
                  'G': ('G',),
                  'H': ('H',),
                  'I': ('IA', 'IE', 'IO', 'IU', 'I'),
                  'J': ('J',),
                  'K': ('KH', 'KS', 'K'),
                  'L': ('L',),
                  'M': ('MN', 'M'),
                  'N': ('NM', 'N'),
                  'O': ('OI', 'OJ', 'OY', 'O'),
                  'P': ('PF', 'PH', 'P'),
                  'Q': ('Q',),
                  'R': ('RS', 'RZ', 'R'),
                  'S': ('SCHTSCH', 'SCHTCH', 'SCHTSH', 'SHTCH', 'SHTSH',
                        'STSCH', 'SCHD', 'SCHT', 'SHCH', 'STCH', 'STRS',
                        'STRZ', 'STSH', 'SZCS', 'SZCZ', 'SCH', 'SHD', 'SHT',
                        'SZD', 'SZT', 'SC', 'SD', 'SH', 'ST', 'SZ', 'S'),
                  'T': ('TTSCH', 'TSCH', 'TTCH', 'TTSZ', 'TCH', 'THS', 'TRS',
                        'TRZ', 'TSH', 'TSZ', 'TTS', 'TTZ', 'TZS', 'TC', 'TH',
                        'TS', 'TZ', 'T'),
                  'U': ('UE', 'UI', 'UJ', 'UY', 'U'),
                  'V': ('V',),
                  'W': ('W',),
                  'X': ('X',),
                  'Y': ('Y',),
                  'Z': ('ZHDZH', 'ZDZH', 'ZSCH', 'ZDZ', 'ZHD', 'ZSH', 'ZD',
                        'ZH', 'ZS', 'Z')}

    _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}
    dms = ['']  # initialize empty code list

    # Require a maxlength of at least 6 and not more than 64
    if maxlength is not None:
        maxlength = min(max(6, maxlength), 64)
    else:
        maxlength = 64

    # uppercase, normalize, decompose, and filter non-A-Z
    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in _dms_order)

    # Nothing to convert, return base case
    if not word:
        if zero_pad:
            return {'0'*maxlength}
        return {'0'}

    pos = 0
    while pos < len(word):
        # Iterate through _dms_order, which specifies the possible substrings
        # for which codes exist in the Daitch-Mokotoff coding
        for sstr in _dms_order[word[pos]]:
            if word.startswith(sstr, pos):
                # Having determined a valid substring start, retrieve the code
                dm_val = _dms_table[sstr]

                # Having retrieved the code (triple), determine the correct
                # positional variant (first, pre-vocalic, elsewhere)
                if pos == 0:
                    dm_val = dm_val[0]
                elif (pos+len(sstr) < len(word) and
                      word[pos+len(sstr)] in _vowels):
                    dm_val = dm_val[1]
                else:
                    dm_val = dm_val[2]

                # Build the code strings; a tuple means two alternatives, so
                # every code built so far is extended with each of them
                if isinstance(dm_val, tuple):
                    dms = [code + text_type(dm_val[0]) for code in dms] \
                            + [code + text_type(dm_val[1]) for code in dms]
                else:
                    dms = [code + text_type(dm_val) for code in dms]
                pos += len(sstr)
                break

    # Filter out double letters and _ placeholders (placeholders are removed
    # only after collapsing, so they keep equal neighbors apart)
    dms = (''.join(c for c in _delete_consecutive_repeats(code) if c != '_')
           for code in dms)

    # Trim codes and return set
    if zero_pad:
        dms = ((code + ('0'*maxlength))[:maxlength] for code in dms)
    else:
        dms = (code[:maxlength] for code in dms)
    return set(dms)
532
533
534
def koelner_phonetik(word):
    """Return the Kölner Phonetik (numeric output) code for a word.

    Based on the algorithm defined by :cite:`Postel:1969`.

    While the output code is numeric, it is still a str because 0s can lead
    the code.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as a numeric string
    :rtype: str

    >>> koelner_phonetik('Christopher')
    '478237'
    >>> koelner_phonetik('Niall')
    '65'
    >>> koelner_phonetik('Smith')
    '862'
    >>> koelner_phonetik('Schmidt')
    '862'
    >>> koelner_phonetik('Müller')
    '657'
    >>> koelner_phonetik('Zimmermann')
    '86766'
    """
    def _after(word, i, letters):
        """Return True if word[i] follows one of the supplied letters."""
        return i > 0 and word[i-1] in letters

    def _before(word, i, letters):
        """Return True if word[i] precedes one of the supplied letters."""
        return i+1 < len(word) and word[i+1] in letters

    _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}

    word = text_type(word.upper())

    # Expand umlauts while they are still precomposed characters: after NFKD
    # decomposition they become base letter + combining mark and these
    # replacements could never match.
    word = word.replace('Ä', 'AE')
    word = word.replace('Ö', 'OE')
    word = word.replace('Ü', 'UE')

    word = normalize('NFKD', word)
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    # Nothing to convert, return base case
    if not word:
        return ''

    codes = []
    for i, char in enumerate(word):
        if char in _vowels:
            codes.append('0')
        elif char == 'B':
            codes.append('1')
        elif char == 'P':
            codes.append('3' if _before(word, i, {'H'}) else '1')
        elif char in {'D', 'T'}:
            codes.append('8' if _before(word, i, {'C', 'S', 'Z'}) else '2')
        elif char in {'F', 'V', 'W'}:
            codes.append('3')
        elif char in {'G', 'K', 'Q'}:
            codes.append('4')
        elif char == 'C':
            if _after(word, i, {'S', 'Z'}):
                codes.append('8')
            elif i == 0:
                # word-initial C accepts two more following letters (L, R)
                if _before(word, i, {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U',
                                     'X'}):
                    codes.append('4')
                else:
                    codes.append('8')
            elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}):
                codes.append('4')
            else:
                codes.append('8')
        elif char == 'X':
            codes.append('8' if _after(word, i, {'C', 'K', 'Q'}) else '48')
        elif char == 'L':
            codes.append('5')
        elif char in {'M', 'N'}:
            codes.append('6')
        elif char == 'R':
            codes.append('7')
        elif char in {'S', 'Z'}:
            codes.append('8')
        # H has no code of its own and is skipped

    sdx = _delete_consecutive_repeats(''.join(codes))

    # a 0 is kept only in the leading position
    if sdx:
        sdx = sdx[:1] + sdx[1:].replace('0', '')

    return sdx
642
643
644
def koelner_phonetik_num_to_alpha(num):
    """Convert a Kölner Phonetik code from numeric to alphabetic.

    The argument is stringified first, so both str and int are accepted.
    Characters other than the digits 0-8 are dropped.

    :param str num: a numeric Kölner Phonetik representation
    :returns: an alphabetic representation of the same word
    :rtype: str

    >>> koelner_phonetik_num_to_alpha(862)
    'SNT'
    >>> koelner_phonetik_num_to_alpha(657)
    'NLR'
    >>> koelner_phonetik_num_to_alpha(86766)
    'SNRNN'
    """
    _koelner_num_translation = dict(zip((ord(digit) for digit in '012345678'),
                                        'APTFKLNRS'))
    # keep only the digits that have an alphabetic equivalent
    digits = ''.join(c for c in text_type(num)
                     if ord(c) in _koelner_num_translation)
    return digits.translate(_koelner_num_translation)
663
664
665
def koelner_phonetik_alpha(word):
    """Return the Kölner Phonetik (alphabetic output) code for a word.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as an alphabetic string
    :rtype: str

    >>> koelner_phonetik_alpha('Smith')
    'SNT'
    >>> koelner_phonetik_alpha('Schmidt')
    'SNT'
    >>> koelner_phonetik_alpha('Müller')
    'NLR'
    >>> koelner_phonetik_alpha('Zimmermann')
    'SNRNN'
    """
    # compute the numeric code, then map each digit to its letter
    numeric_code = koelner_phonetik(word)
    return koelner_phonetik_num_to_alpha(numeric_code)
682
683
684
def nysiis(word, maxlength=6, modified=False):
    """Return the NYSIIS code for a word.

    The New York State Identification and Intelligence System algorithm is
    defined in :cite:`Taft:1970`.

    The modified version of this algorithm is described in Appendix B of
    :cite:`Lynch:1977`.

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 6) of the code to
        return; values below 6 are raised to 6, and a falsy value (None or 0)
        disables truncation
    :param bool modified: indicates whether to use USDA modified NYSIIS
    :returns: the NYSIIS value; the empty string if the word has no
        alphabetic characters; in modified mode, the literal 'ERROR' when the
        word ends in JR or SR and no earlier suffix rule applied
    :rtype: str

    >>> nysiis('Christopher')
    'CRASTA'
    >>> nysiis('Niall')
    'NAL'
    >>> nysiis('Smith')
    'SNAT'
    >>> nysiis('Schmidt')
    'SNAD'

    >>> nysiis('Christopher', maxlength=_INFINITY)
    'CRASTAFAR'

    >>> nysiis('Christopher', maxlength=8, modified=True)
    'CRASTAFA'
    >>> nysiis('Niall', maxlength=8, modified=True)
    'NAL'
    >>> nysiis('Smith', maxlength=8, modified=True)
    'SNAT'
    >>> nysiis('Schmidt', maxlength=8, modified=True)
    'SNAD'
    """
    # Require a maxlength of at least 6
    if maxlength:
        maxlength = max(6, maxlength)

    _vowels = {'A', 'E', 'I', 'O', 'U'}

    word = ''.join(c for c in word.upper() if c.isalpha())
    word = word.replace('ß', 'SS')

    # exit early if there are no alphas
    if not word:
        return ''

    # Remembered unconditionally (word is non-empty here) so the name is
    # always bound; it is only consulted in modified mode, at the very end.
    original_first_char = word[0]

    # --- prefix rules ---
    if word[:3] == 'MAC':
        word = 'MCC'+word[3:]
    elif word[:2] == 'KN':
        word = 'NN'+word[2:]
    elif word[:1] == 'K':
        word = 'C'+word[1:]
    elif word[:2] in {'PH', 'PF'}:
        word = 'FF'+word[2:]
    elif word[:3] == 'SCH':
        word = 'SSS'+word[3:]
    elif modified:
        if word[:2] == 'WR':
            word = 'RR'+word[2:]
        elif word[:2] == 'RH':
            word = 'RR'+word[2:]
        elif word[:2] == 'DG':
            word = 'GG'+word[2:]
        elif word[:1] in _vowels:
            word = 'A'+word[1:]

    # --- suffix rules ---
    if modified and word[-1:] in {'S', 'Z'}:
        word = word[:-1]

    if word[-2:] == 'EE' or word[-2:] == 'IE' or (modified and
                                                  word[-2:] == 'YE'):
        word = word[:-2]+'Y'
    elif word[-2:] in {'DT', 'RT', 'RD'}:
        word = word[:-2]+'D'
    elif word[-2:] in {'NT', 'ND'}:
        word = word[:-2]+('N' if modified else 'D')
    elif modified:
        if word[-2:] == 'IX':
            word = word[:-2]+'ICK'
        elif word[-2:] == 'EX':
            word = word[:-2]+'ECK'
        elif word[-2:] in {'JR', 'SR'}:
            return 'ERROR'

    # --- main pass: rewrite the word in place and accumulate the key ---
    key = word[:1]

    # `skip` counts characters already consumed by a multi-letter rule
    skip = 0
    for i in range(1, len(word)):
        # the range was computed from the original length, but some rules
        # shorten the word, so i may run past the current end
        if i >= len(word):
            continue
        elif skip:
            skip -= 1
            continue
        elif word[i:i+2] == 'EV':
            word = word[:i] + 'AF' + word[i+2:]
            skip = 1
        elif word[i] in _vowels:
            word = word[:i] + 'A' + word[i+1:]
        elif modified and i != len(word)-1 and word[i] == 'Y':
            word = word[:i] + 'A' + word[i+1:]
        elif word[i] == 'Q':
            word = word[:i] + 'G' + word[i+1:]
        elif word[i] == 'Z':
            word = word[:i] + 'S' + word[i+1:]
        elif word[i] == 'M':
            word = word[:i] + 'N' + word[i+1:]
        elif word[i:i+2] == 'KN':
            word = word[:i] + 'N' + word[i+2:]
        elif word[i] == 'K':
            word = word[:i] + 'C' + word[i+1:]
        elif modified and i == len(word)-3 and word[i:i+3] == 'SCH':
            word = word[:i] + 'SSA'
            skip = 2
        elif word[i:i+3] == 'SCH':
            word = word[:i] + 'SSS' + word[i+3:]
            skip = 2
        elif modified and i == len(word)-2 and word[i:i+2] == 'SH':
            word = word[:i] + 'SA'
            skip = 1
        elif word[i:i+2] == 'SH':
            word = word[:i] + 'SS' + word[i+2:]
            skip = 1
        elif word[i:i+2] == 'PH':
            word = word[:i] + 'FF' + word[i+2:]
            skip = 1
        elif modified and word[i:i+3] == 'GHT':
            word = word[:i] + 'TTT' + word[i+3:]
            skip = 2
        elif modified and word[i:i+2] == 'DG':
            word = word[:i] + 'GG' + word[i+2:]
            skip = 1
        elif modified and word[i:i+2] == 'WR':
            word = word[:i] + 'RR' + word[i+2:]
            skip = 1
        elif word[i] == 'H' and (word[i-1] not in _vowels or
                                 word[i+1:i+2] not in _vowels):
            # H not flanked by vowels on both sides copies its predecessor
            # (at the end of the word the empty slice is "not a vowel")
            word = word[:i] + word[i-1] + word[i+1:]
        elif word[i] == 'W' and word[i-1] in _vowels:
            word = word[:i] + word[i-1] + word[i+1:]

        # append the (possibly multi-letter) replacement unless it merely
        # repeats the last key character
        if word[i:i+skip+1] != key[-1:]:
            key += word[i:i+skip+1]

    key = _delete_consecutive_repeats(key)

    # --- key clean-up rules ---
    if key[-1:] == 'S':
        key = key[:-1]
    if key[-2:] == 'AY':
        key = key[:-2] + 'Y'
    if key[-1:] == 'A':
        key = key[:-1]
    if modified and key[:1] == 'A':
        # restore the original initial vowel that the prefix rule replaced
        key = original_first_char + key[1:]

    if maxlength and maxlength < _INFINITY:
        key = key[:maxlength]

    return key
848
849
850
def mra(word):
    """Return the MRA personal numeric identifier (PNI) for a word.

    A description of the Western Airlines Surname Match Rating Algorithm can
    be found on page 18 of :cite:`Moore:1977`.

    The word is uppercased, every vowel (A, E, I, O, U) after the first
    character is dropped, the result is passed through
    ``_delete_consecutive_repeats``, and anything longer than six characters
    is reduced to its first three plus its last three characters.

    :param str word: the word to transform
    :returns: the MRA PNI (a falsy input is returned unchanged)
    :rtype: str

    >>> mra('Christopher')
    'CHRPHR'
    >>> mra('Niall')
    'NL'
    >>> mra('Smith')
    'SMTH'
    >>> mra('Schmidt')
    'SCHMDT'
    """
    # Nothing to encode: hand the (empty) input straight back
    if not word:
        return word

    vowels = {'A', 'E', 'I', 'O', 'U'}
    upper = word.upper().replace('ß', 'SS')

    # The leading character is always kept, even when it is a vowel
    head, tail = upper[0], upper[1:]
    codex = head + ''.join([c for c in tail if c not in vowels])
    codex = _delete_consecutive_repeats(codex)

    # Codes longer than six characters keep only their two outer triples
    if len(codex) <= 6:
        return codex
    return codex[:3] + codex[-3:]
879
880
881
def metaphone(word, maxlength=_INFINITY):
    """Return the Metaphone code for a word.

    Based on Lawrence Philips' Pick BASIC code from 1990 :cite:`Philips:1990`,
    as described in :cite:`Philips:1990b`.
    This incorporates some corrections to the above code, particularly
    some of those suggested by Michael Kuhn in :cite:`Kuhn:1995`.

    :param str word: the word to transform
    :param int maxlength: the maximum length of the returned Metaphone code
        (defaults to unlimited, but in Philips' original implementation
        this was 4); values below 4 are raised to 4 and None is treated
        as 64
    :returns: the Metaphone value, never longer than the effective maxlength
    :rtype: str


    >>> metaphone('Christopher')
    'KRSTFR'
    >>> metaphone('Niall')
    'NL'
    >>> metaphone('Smith')
    'SM0'
    >>> metaphone('Schmidt')
    'SKMTT'
    >>> metaphone('Sphinx', 4)
    'SFNK'
    """
    _vowels = {'A', 'E', 'I', 'O', 'U'}
    _frontv = {'E', 'I', 'Y'}
    # As in variable sound--those modified by adding an "h"
    _varson = {'C', 'G', 'P', 'S', 'T'}

    # Require a maxlength of at least 4
    if maxlength is not None:
        maxlength = max(4, maxlength)
    else:
        maxlength = 64

    # Delete nonalphanumeric characters and make all caps
    ename = ''.join(c for c in word.upper() if c.isalnum())
    ename = ename.replace('ß', 'SS')

    if not ename:
        return ''

    # Normalize word-initial letter combinations
    if ename[0:2] in {'PN', 'AE', 'KN', 'GN', 'WR'}:
        ename = ename[1:]
    elif ename[0] == 'X':
        ename = 'S' + ename[1:]
    elif ename[0:2] == 'WH':
        ename = 'W' + ename[2:]

    # Convert to metaph
    elen = len(ename)-1  # index of the last character
    metaph = ''
    for i, char in enumerate(ename):
        if len(metaph) >= maxlength:
            break
        # Skip the second of two identical letters, except for G and T
        if char not in {'G', 'T'} and i > 0 and ename[i-1] == char:
            continue

        # Vowels are only encoded in word-initial position
        if char in _vowels and i == 0:
            metaph = char

        elif char == 'B':
            # B is dropped only when it is the last letter and follows M
            if i != elen or ename[i-1] != 'M':
                metaph += char

        elif char == 'C':
            # C adds nothing when preceded by S and followed by E, I or Y
            if not (i > 0 and ename[i-1] == 'S' and
                    ename[i+1:i+2] in _frontv):
                if ename[i+1:i+3] == 'IA':
                    metaph += 'X'
                elif ename[i+1:i+2] in _frontv:
                    metaph += 'S'
                elif i > 0 and ename[i-1:i+2] == 'SCH':
                    metaph += 'K'
                elif ename[i+1:i+2] == 'H':
                    if (i == 0 and i+1 < elen and
                            ename[i+2:i+3] not in _vowels):
                        metaph += 'K'
                    else:
                        metaph += 'X'
                else:
                    metaph += 'K'

        elif char == 'D':
            if ename[i+1:i+2] == 'G' and ename[i+2:i+3] in _frontv:
                metaph += 'J'
            else:
                metaph += 'T'

        elif char == 'G':
            if ename[i+1:i+2] == 'H' and not (i+1 == elen or
                                              ename[i+2:i+3] not in _vowels):
                continue
            elif i > 0 and ((i+1 == elen and ename[i+1] == 'N') or
                            (i+3 == elen and ename[i+1:i+4] == 'NED')):
                continue
            # NOTE(review): `i-1 > 0` excludes a 'D' at index 0; this looks
            # like a 1-based index carried over -- confirm whether `i > 0`
            # was intended. Behaviour is deliberately left unchanged here.
            elif (i-1 > 0 and i+1 <= elen and ename[i-1] == 'D' and
                  ename[i+1] in _frontv):
                continue
            elif ename[i+1:i+2] == 'G':
                continue
            elif ename[i+1:i+2] in _frontv:
                if i == 0 or ename[i-1] != 'G':
                    metaph += 'J'
                else:
                    metaph += 'K'
            else:
                metaph += 'K'

        elif char == 'H':
            if ((i > 0 and ename[i-1] in _vowels and
                 ename[i+1:i+2] not in _vowels)):
                continue
            elif i > 0 and ename[i-1] in _varson:
                continue
            else:
                metaph += 'H'

        elif char in {'F', 'J', 'L', 'M', 'N', 'R'}:
            metaph += char

        elif char == 'K':
            # "CK" was already encoded by the preceding C
            if i > 0 and ename[i-1] == 'C':
                continue
            else:
                metaph += 'K'

        elif char == 'P':
            if ename[i+1:i+2] == 'H':
                metaph += 'F'
            else:
                metaph += 'P'

        elif char == 'Q':
            metaph += 'K'

        elif char == 'S':
            if ((i > 0 and i+2 <= elen and ename[i+1] == 'I' and
                 ename[i+2] in {'A', 'O'})):
                metaph += 'X'
            elif ename[i+1:i+2] == 'H':
                metaph += 'X'
            else:
                metaph += 'S'

        elif char == 'T':
            if ((i > 0 and i+2 <= elen and ename[i+1] == 'I' and
                 ename[i+2] in {'A', 'O'})):
                metaph += 'X'
            elif ename[i+1:i+2] == 'H':
                metaph += '0'
            elif ename[i+1:i+3] != 'CH':
                if ename[i-1:i] != 'T':
                    metaph += 'T'

        elif char == 'V':
            metaph += 'F'

        elif char in {'W', 'Y'}:
            # W and Y are only kept when a vowel follows
            if ename[i+1:i+2] in _vowels:
                metaph += char

        elif char == 'X':
            metaph += 'KS'

        elif char == 'Z':
            metaph += 'S'

    # 'X' appends two symbols ('KS'), so the loop above can overshoot the
    # limit by one; trim here. The comparison is False for an infinite
    # maxlength, so the slice bound is always finite.
    if len(metaph) > maxlength:
        metaph = metaph[:int(maxlength)]

    return metaph
1049
1050
1051
def double_metaphone(word, maxlength=_INFINITY):
1052
    """Return the Double Metaphone code for a word.
1053
1054
    Based on Lawrence Philips' (Visual) C++ code from 1999
1055
    :cite:`Philips:2000`.
1056
1057
    :param word: the word to transform
1058
    :param maxlength: the maximum length of the returned Double Metaphone codes
1059
        (defaults to unlimited, but in Philips' original implementation this
1060
        was 4)
1061
    :returns: the Double Metaphone value(s)
1062
    :rtype: tuple
1063
1064
    >>> double_metaphone('Christopher')
1065
    ('KRSTFR', '')
1066
    >>> double_metaphone('Niall')
1067
    ('NL', '')
1068
    >>> double_metaphone('Smith')
1069
    ('SM0', 'XMT')
1070
    >>> double_metaphone('Schmidt')
1071
    ('XMT', 'SMT')
1072
    """
1073
    # Require a maxlength of at least 4
1074
    if maxlength is not None:
1075
        maxlength = max(4, maxlength)
1076
    else:
1077
        maxlength = 64
1078
1079
    primary = ''
1080
    secondary = ''
1081
1082
    def _slavo_germanic():
1083
        """Return True if the word appears to be Slavic or Germanic."""
1084
        if 'W' in word or 'K' in word or 'CZ' in word:
1085
            return True
1086
        return False
1087
1088
    def _metaph_add(pri, sec=''):
1089
        """Return a new metaphone tuple with the supplied elements."""
1090
        newpri = primary
1091
        newsec = secondary
1092
        if pri:
1093
            newpri += pri
1094
        if sec:
1095
            if sec != ' ':
1096
                newsec += sec
1097
        else:
1098
            newsec += pri
1099
        return (newpri, newsec)
1100
1101
    def _is_vowel(pos):
1102
        """Return True if the character at word[pos] is a vowel."""
1103
        if pos >= 0 and word[pos] in {'A', 'E', 'I', 'O', 'U', 'Y'}:
1104
            return True
1105
        return False
1106
1107
    def _get_at(pos):
1108
        """Return the character at word[pos]."""
1109
        return word[pos]
1110
1111
    def _string_at(pos, slen, substrings):
1112
        """Return True if word[pos:pos+slen] is in substrings."""
1113
        if pos < 0:
1114
            return False
1115
        return word[pos:pos+slen] in substrings
1116
1117
    current = 0
1118
    length = len(word)
1119
    if length < 1:
1120
        return ('', '')
1121
    last = length - 1
1122
1123
    word = word.upper()
1124
    word = word.replace('ß', 'SS')
1125
1126
    # Pad the original string so that we can index beyond the edge of the world
1127
    word += '     '
1128
1129
    # Skip these when at start of word
1130
    if word[0:2] in {'GN', 'KN', 'PN', 'WR', 'PS'}:
1131
        current += 1
1132
1133
    # Initial 'X' is pronounced 'Z' e.g. 'Xavier'
1134
    if _get_at(0) == 'X':
1135
        (primary, secondary) = _metaph_add('S')  # 'Z' maps to 'S'
1136
        current += 1
1137
1138
    # Main loop
1139
    while True:
0 ignored issues
show
unused-code introduced by
Too many nested blocks (6/5)
Loading history...
1140
        if current >= length:
1141
            break
1142
1143
        if _get_at(current) in {'A', 'E', 'I', 'O', 'U', 'Y'}:
1144
            if current == 0:
1145
                # All init vowels now map to 'A'
1146
                (primary, secondary) = _metaph_add('A')
1147
            current += 1
1148
            continue
1149
1150
        elif _get_at(current) == 'B':
1151
            # "-mb", e.g", "dumb", already skipped over...
1152
            (primary, secondary) = _metaph_add('P')
1153
            if _get_at(current + 1) == 'B':
1154
                current += 2
1155
            else:
1156
                current += 1
1157
            continue
1158
1159
        elif _get_at(current) == 'Ç':
1160
            (primary, secondary) = _metaph_add('S')
1161
            current += 1
1162
            continue
1163
1164
        elif _get_at(current) == 'C':
1165
            # Various Germanic
1166
            if (current > 1 and not _is_vowel(current - 2) and
0 ignored issues
show
best-practice introduced by
Too many boolean expressions in if statement (6/5)
Loading history...
1167
                    _string_at((current - 1), 3, {'ACH'}) and
1168
                    ((_get_at(current + 2) != 'I') and
1169
                     ((_get_at(current + 2) != 'E') or
1170
                      _string_at((current - 2), 6,
1171
                                 {'BACHER', 'MACHER'})))):
1172
                (primary, secondary) = _metaph_add('K')
1173
                current += 2
1174
                continue
1175
1176
            # Special case 'caesar'
1177
            elif current == 0 and _string_at(current, 6, {'CAESAR'}):
1178
                (primary, secondary) = _metaph_add('S')
1179
                current += 2
1180
                continue
1181
1182
            # Italian 'chianti'
1183
            elif _string_at(current, 4, {'CHIA'}):
1184
                (primary, secondary) = _metaph_add('K')
1185
                current += 2
1186
                continue
1187
1188
            elif _string_at(current, 2, {'CH'}):
1189
                # Find 'Michael'
1190
                if current > 0 and _string_at(current, 4, {'CHAE'}):
1191
                    (primary, secondary) = _metaph_add('K', 'X')
1192
                    current += 2
1193
                    continue
1194
1195
                # Greek roots e.g. 'chemistry', 'chorus'
1196
                elif (current == 0 and
1197
                      (_string_at((current + 1), 5,
1198
                                  {'HARAC', 'HARIS'}) or
1199
                       _string_at((current + 1), 3,
1200
                                  {'HOR', 'HYM', 'HIA', 'HEM'})) and
1201
                      not _string_at(0, 5, {'CHORE'})):
1202
                    (primary, secondary) = _metaph_add('K')
1203
                    current += 2
1204
                    continue
1205
1206
                # Germanic, Greek, or otherwise 'ch' for 'kh' sound
1207
                elif ((_string_at(0, 4, {'VAN ', 'VON '}) or
0 ignored issues
show
best-practice introduced by
Too many boolean expressions in if statement (7/5)
Loading history...
1208
                       _string_at(0, 3, {'SCH'})) or
1209
                      # 'architect but not 'arch', 'orchestra', 'orchid'
1210
                      _string_at((current - 2), 6,
1211
                                 {'ORCHES', 'ARCHIT', 'ORCHID'}) or
1212
                      _string_at((current + 2), 1, {'T', 'S'}) or
1213
                      ((_string_at((current - 1), 1,
1214
                                   {'A', 'O', 'U', 'E'}) or
1215
                        (current == 0)) and
1216
                       # e.g., 'wachtler', 'wechsler', but not 'tichner'
1217
                       _string_at((current + 2), 1,
1218
                                  {'L', 'R', 'N', 'M', 'B', 'H', 'F', 'V', 'W',
1219
                                   ' '}))):
1220
                    (primary, secondary) = _metaph_add('K')
1221
1222
                else:
1223
                    if current > 0:
1224
                        if _string_at(0, 2, {'MC'}):
1225
                            # e.g., "McHugh"
1226
                            (primary, secondary) = _metaph_add('K')
1227
                        else:
1228
                            (primary, secondary) = _metaph_add('X', 'K')
1229
                    else:
1230
                        (primary, secondary) = _metaph_add('X')
1231
1232
                current += 2
1233
                continue
1234
1235
            # e.g, 'czerny'
1236
            elif (_string_at(current, 2, {'CZ'}) and
1237
                  not _string_at((current - 2), 4, {'WICZ'})):
1238
                (primary, secondary) = _metaph_add('S', 'X')
1239
                current += 2
1240
                continue
1241
1242
            # e.g., 'focaccia'
1243
            elif _string_at((current + 1), 3, {'CIA'}):
1244
                (primary, secondary) = _metaph_add('X')
1245
                current += 3
1246
1247
            # double 'C', but not if e.g. 'McClellan'
1248
            elif (_string_at(current, 2, {'CC'}) and
1249
                  not ((current == 1) and (_get_at(0) == 'M'))):
1250
                # 'bellocchio' but not 'bacchus'
1251
                if ((_string_at((current + 2), 1,
1252
                                {'I', 'E', 'H'}) and
1253
                     not _string_at((current + 2), 2, ['HU']))):
1254
                    # 'accident', 'accede' 'succeed'
1255
                    if ((((current == 1) and _get_at(current - 1) == 'A') or
1256
                         _string_at((current - 1), 5,
1257
                                    {'UCCEE', 'UCCES'}))):
1258
                        (primary, secondary) = _metaph_add('KS')
1259
                    # 'bacci', 'bertucci', other italian
1260
                    else:
1261
                        (primary, secondary) = _metaph_add('X')
1262
                    current += 3
1263
                    continue
1264
                else:  # Pierce's rule
1265
                    (primary, secondary) = _metaph_add('K')
1266
                    current += 2
1267
                    continue
1268
1269
            elif _string_at(current, 2, {'CK', 'CG', 'CQ'}):
1270
                (primary, secondary) = _metaph_add('K')
1271
                current += 2
1272
                continue
1273
1274
            elif _string_at(current, 2, {'CI', 'CE', 'CY'}):
1275
                # Italian vs. English
1276
                if _string_at(current, 3, {'CIO', 'CIE', 'CIA'}):
1277
                    (primary, secondary) = _metaph_add('S', 'X')
1278
                else:
1279
                    (primary, secondary) = _metaph_add('S')
1280
                current += 2
1281
                continue
1282
1283
            # else
1284
            else:
1285
                (primary, secondary) = _metaph_add('K')
1286
1287
                # name sent in 'mac caffrey', 'mac gregor
1288
                if _string_at((current + 1), 2, {' C', ' Q', ' G'}):
1289
                    current += 3
1290
                elif (_string_at((current + 1), 1,
1291
                                 {'C', 'K', 'Q'}) and
1292
                      not _string_at((current + 1), 2, {'CE', 'CI'})):
1293
                    current += 2
1294
                else:
1295
                    current += 1
1296
                continue
1297
1298
        elif _get_at(current) == 'D':
1299
            if _string_at(current, 2, {'DG'}):
1300
                if _string_at((current + 2), 1, {'I', 'E', 'Y'}):
1301
                    # e.g. 'edge'
1302
                    (primary, secondary) = _metaph_add('J')
1303
                    current += 3
1304
                    continue
1305
                else:
1306
                    # e.g. 'edgar'
1307
                    (primary, secondary) = _metaph_add('TK')
1308
                    current += 2
1309
                    continue
1310
1311
            elif _string_at(current, 2, {'DT', 'DD'}):
1312
                (primary, secondary) = _metaph_add('T')
1313
                current += 2
1314
                continue
1315
1316
            # else
1317
            else:
1318
                (primary, secondary) = _metaph_add('T')
1319
                current += 1
1320
                continue
1321
1322
        elif _get_at(current) == 'F':
1323
            if _get_at(current + 1) == 'F':
1324
                current += 2
1325
            else:
1326
                current += 1
1327
            (primary, secondary) = _metaph_add('F')
1328
            continue
1329
1330
        elif _get_at(current) == 'G':
1331
            if _get_at(current + 1) == 'H':
1332
                if (current > 0) and not _is_vowel(current - 1):
1333
                    (primary, secondary) = _metaph_add('K')
1334
                    current += 2
1335
                    continue
1336
1337
                # 'ghislane', ghiradelli
1338
                elif current == 0:
1339
                    if _get_at(current + 2) == 'I':
1340
                        (primary, secondary) = _metaph_add('J')
1341
                    else:
1342
                        (primary, secondary) = _metaph_add('K')
1343
                    current += 2
1344
                    continue
1345
1346
                # Parker's rule (with some further refinements) - e.g., 'hugh'
1347
                elif (((current > 1) and
0 ignored issues
show
best-practice introduced by
Too many boolean expressions in if statement (6/5)
Loading history...
1348
                       _string_at((current - 2), 1, {'B', 'H', 'D'})) or
1349
                      # e.g., 'bough'
1350
                      ((current > 2) and
1351
                       _string_at((current - 3), 1, {'B', 'H', 'D'})) or
1352
                      # e.g., 'broughton'
1353
                      ((current > 3) and
1354
                       _string_at((current - 4), 1, {'B', 'H'}))):
1355
                    current += 2
1356
                    continue
1357
                else:
1358
                    # e.g. 'laugh', 'McLaughlin', 'cough',
1359
                    #      'gough', 'rough', 'tough'
1360
                    if ((current > 2) and
1361
                            (_get_at(current - 1) == 'U') and
1362
                            (_string_at((current - 3), 1,
1363
                                        {'C', 'G', 'L', 'R', 'T'}))):
1364
                        (primary, secondary) = _metaph_add('F')
1365
                    elif (current > 0) and _get_at(current - 1) != 'I':
1366
                        (primary, secondary) = _metaph_add('K')
1367
                    current += 2
1368
                    continue
1369
1370
            elif _get_at(current + 1) == 'N':
1371
                if (current == 1) and _is_vowel(0) and not _slavo_germanic():
1372
                    (primary, secondary) = _metaph_add('KN', 'N')
1373
                # not e.g. 'cagney'
1374
                elif (not _string_at((current + 2), 2, {'EY'}) and
1375
                      (_get_at(current + 1) != 'Y') and
1376
                      not _slavo_germanic()):
1377
                    (primary, secondary) = _metaph_add('N', 'KN')
1378
                else:
1379
                    (primary, secondary) = _metaph_add('KN')
1380
                current += 2
1381
                continue
1382
1383
            # 'tagliaro'
1384
            elif (_string_at((current + 1), 2, {'LI'}) and
1385
                  not _slavo_germanic()):
1386
                (primary, secondary) = _metaph_add('KL', 'L')
1387
                current += 2
1388
                continue
1389
1390
            # -ges-, -gep-, -gel-, -gie- at beginning
1391
            elif ((current == 0) and
1392
                  ((_get_at(current + 1) == 'Y') or
1393
                   _string_at((current + 1), 2, {'ES', 'EP', 'EB', 'EL', 'EY',
1394
                                                 'IB', 'IL', 'IN', 'IE', 'EI',
1395
                                                 'ER'}))):
1396
                (primary, secondary) = _metaph_add('K', 'J')
1397
                current += 2
1398
                continue
1399
1400
            #  -ger-,  -gy-
1401
            elif ((_string_at((current + 1), 2, {'ER'}) or
1402
                   (_get_at(current + 1) == 'Y')) and not
1403
                  _string_at(0, 6, {'DANGER', 'RANGER', 'MANGER'}) and not
1404
                  _string_at((current - 1), 1, {'E', 'I'}) and not
1405
                  _string_at((current - 1), 3, {'RGY', 'OGY'})):
1406
                (primary, secondary) = _metaph_add('K', 'J')
1407
                current += 2
1408
                continue
1409
1410
            #  italian e.g, 'biaggi'
1411
            elif (_string_at((current + 1), 1, {'E', 'I', 'Y'}) or
1412
                  _string_at((current - 1), 4, {'AGGI', 'OGGI'})):
1413
                # obvious germanic
1414
                if (((_string_at(0, 4, {'VAN ', 'VON '}) or
1415
                      _string_at(0, 3, {'SCH'})) or
1416
                     _string_at((current + 1), 2, {'ET'}))):
1417
                    (primary, secondary) = _metaph_add('K')
1418
                elif _string_at((current + 1), 4, {'IER '}):
1419
                    (primary, secondary) = _metaph_add('J')
1420
                else:
1421
                    (primary, secondary) = _metaph_add('J', 'K')
1422
                current += 2
1423
                continue
1424
1425
            else:
1426
                if _get_at(current + 1) == 'G':
1427
                    current += 2
1428
                else:
1429
                    current += 1
1430
                (primary, secondary) = _metaph_add('K')
1431
                continue
1432
1433
        elif _get_at(current) == 'H':
1434
            # only keep if first & before vowel or btw. 2 vowels
1435
            if ((((current == 0) or _is_vowel(current - 1)) and
1436
                 _is_vowel(current + 1))):
1437
                (primary, secondary) = _metaph_add('H')
1438
                current += 2
1439
            else:  # also takes care of 'HH'
1440
                current += 1
1441
            continue
1442
1443
        elif _get_at(current) == 'J':
1444
            # obvious spanish, 'jose', 'san jacinto'
1445
            if _string_at(current, 4, ['JOSE']) or _string_at(0, 4, {'SAN '}):
1446
                if ((((current == 0) and (_get_at(current + 4) == ' ')) or
1447
                     _string_at(0, 4, ['SAN ']))):
1448
                    (primary, secondary) = _metaph_add('H')
1449
                else:
1450
                    (primary, secondary) = _metaph_add('J', 'H')
1451
                current += 1
1452
                continue
1453
1454
            elif (current == 0) and not _string_at(current, 4, {'JOSE'}):
1455
                # Yankelovich/Jankelowicz
1456
                (primary, secondary) = _metaph_add('J', 'A')
1457
            # Spanish pron. of e.g. 'bajador'
1458
            elif (_is_vowel(current - 1) and
1459
                  not _slavo_germanic() and
1460
                  ((_get_at(current + 1) == 'A') or
1461
                   (_get_at(current + 1) == 'O'))):
1462
                (primary, secondary) = _metaph_add('J', 'H')
1463
            elif current == last:
1464
                (primary, secondary) = _metaph_add('J', ' ')
1465
            elif (not _string_at((current + 1), 1,
1466
                                 {'L', 'T', 'K', 'S', 'N', 'M', 'B', 'Z'}) and
1467
                  not _string_at((current - 1), 1, {'S', 'K', 'L'})):
1468
                (primary, secondary) = _metaph_add('J')
1469
1470
            if _get_at(current + 1) == 'J':  # it could happen!
1471
                current += 2
1472
            else:
1473
                current += 1
1474
            continue
1475
1476
        elif _get_at(current) == 'K':
1477
            if _get_at(current + 1) == 'K':
1478
                current += 2
1479
            else:
1480
                current += 1
1481
            (primary, secondary) = _metaph_add('K')
1482
            continue
1483
1484
        elif _get_at(current) == 'L':
1485
            if _get_at(current + 1) == 'L':
1486
                # Spanish e.g. 'cabrillo', 'gallegos'
1487
                if (((current == (length - 3)) and
1488
                     _string_at((current - 1), 4, {'ILLO', 'ILLA', 'ALLE'})) or
1489
                        ((_string_at((last - 1), 2, {'AS', 'OS'}) or
1490
                          _string_at(last, 1, {'A', 'O'})) and
1491
                         _string_at((current - 1), 4, {'ALLE'}))):
1492
                    (primary, secondary) = _metaph_add('L', ' ')
1493
                    current += 2
1494
                    continue
1495
                current += 2
1496
            else:
1497
                current += 1
1498
            (primary, secondary) = _metaph_add('L')
1499
            continue
1500
1501
        elif _get_at(current) == 'M':
1502
            if (((_string_at((current - 1), 3, {'UMB'}) and
1503
                  (((current + 1) == last) or
1504
                   _string_at((current + 2), 2, {'ER'}))) or
1505
                 # 'dumb', 'thumb'
1506
                 (_get_at(current + 1) == 'M'))):
1507
                current += 2
1508
            else:
1509
                current += 1
1510
            (primary, secondary) = _metaph_add('M')
1511
            continue
1512
1513
        elif _get_at(current) == 'N':
1514
            if _get_at(current + 1) == 'N':
1515
                current += 2
1516
            else:
1517
                current += 1
1518
            (primary, secondary) = _metaph_add('N')
1519
            continue
1520
1521
        elif _get_at(current) == 'Ñ':
1522
            current += 1
1523
            (primary, secondary) = _metaph_add('N')
1524
            continue
1525
1526
        elif _get_at(current) == 'P':
1527
            if _get_at(current + 1) == 'H':
1528
                (primary, secondary) = _metaph_add('F')
1529
                current += 2
1530
                continue
1531
1532
            # also account for "campbell", "raspberry"
1533
            elif _string_at((current + 1), 1, {'P', 'B'}):
1534
                current += 2
1535
            else:
1536
                current += 1
1537
            (primary, secondary) = _metaph_add('P')
1538
            continue
1539
1540
        elif _get_at(current) == 'Q':
1541
            if _get_at(current + 1) == 'Q':
1542
                current += 2
1543
            else:
1544
                current += 1
1545
            (primary, secondary) = _metaph_add('K')
1546
            continue
1547
1548
        elif _get_at(current) == 'R':
1549
            # french e.g. 'rogier', but exclude 'hochmeier'
1550
            if (((current == last) and
1551
                 not _slavo_germanic() and
1552
                 _string_at((current - 2), 2, {'IE'}) and
1553
                 not _string_at((current - 4), 2, {'ME', 'MA'}))):
1554
                (primary, secondary) = _metaph_add('', 'R')
1555
            else:
1556
                (primary, secondary) = _metaph_add('R')
1557
1558
            if _get_at(current + 1) == 'R':
1559
                current += 2
1560
            else:
1561
                current += 1
1562
            continue
1563
1564
        elif _get_at(current) == 'S':
1565
            # special cases 'island', 'isle', 'carlisle', 'carlysle'
1566
            if _string_at((current - 1), 3, {'ISL', 'YSL'}):
1567
                current += 1
1568
                continue
1569
1570
            # special case 'sugar-'
1571
            elif (current == 0) and _string_at(current, 5, {'SUGAR'}):
1572
                (primary, secondary) = _metaph_add('X', 'S')
1573
                current += 1
1574
                continue
1575
1576
            elif _string_at(current, 2, {'SH'}):
1577
                # Germanic
1578
                if _string_at((current + 1), 4,
1579
                              {'HEIM', 'HOEK', 'HOLM', 'HOLZ'}):
1580
                    (primary, secondary) = _metaph_add('S')
1581
                else:
1582
                    (primary, secondary) = _metaph_add('X')
1583
                current += 2
1584
                continue
1585
1586
            # Italian & Armenian
1587
            elif (_string_at(current, 3, {'SIO', 'SIA'}) or
1588
                  _string_at(current, 4, {'SIAN'})):
1589
                if not _slavo_germanic():
1590
                    (primary, secondary) = _metaph_add('S', 'X')
1591
                else:
1592
                    (primary, secondary) = _metaph_add('S')
1593
                current += 3
1594
                continue
1595
1596
            # German & anglicisations, e.g. 'smith' match 'schmidt',
1597
            #                               'snider' match 'schneider'
1598
            # also, -sz- in Slavic language although in Hungarian it is
1599
            #       pronounced 's'
1600
            elif (((current == 0) and
1601
                   _string_at((current + 1), 1, {'M', 'N', 'L', 'W'})) or
1602
                  _string_at((current + 1), 1, {'Z'})):
1603
                (primary, secondary) = _metaph_add('S', 'X')
1604
                if _string_at((current + 1), 1, {'Z'}):
1605
                    current += 2
1606
                else:
1607
                    current += 1
1608
                continue
1609
1610
            elif _string_at(current, 2, {'SC'}):
1611
                # Schlesinger's rule
1612
                if _get_at(current + 2) == 'H':
1613
                    # dutch origin, e.g. 'school', 'schooner'
1614
                    if _string_at((current + 3), 2,
1615
                                  {'OO', 'ER', 'EN', 'UY', 'ED', 'EM'}):
1616
                        # 'schermerhorn', 'schenker'
1617
                        if _string_at((current + 3), 2, {'ER', 'EN'}):
1618
                            (primary, secondary) = _metaph_add('X', 'SK')
1619
                        else:
1620
                            (primary, secondary) = _metaph_add('SK')
1621
                        current += 3
1622
                        continue
1623
                    else:
1624
                        if (((current == 0) and not _is_vowel(3) and
1625
                             (_get_at(3) != 'W'))):
1626
                            (primary, secondary) = _metaph_add('X', 'S')
1627
                        else:
1628
                            (primary, secondary) = _metaph_add('X')
1629
                        current += 3
1630
                        continue
1631
1632
                elif _string_at((current + 2), 1, {'I', 'E', 'Y'}):
1633
                    (primary, secondary) = _metaph_add('S')
1634
                    current += 3
1635
                    continue
1636
1637
                # else
1638
                else:
1639
                    (primary, secondary) = _metaph_add('SK')
1640
                    current += 3
1641
                    continue
1642
1643
            else:
1644
                # french e.g. 'resnais', 'artois'
1645
                if (current == last) and _string_at((current - 2), 2,
1646
                                                    {'AI', 'OI'}):
1647
                    (primary, secondary) = _metaph_add('', 'S')
1648
                else:
1649
                    (primary, secondary) = _metaph_add('S')
1650
1651
                if _string_at((current + 1), 1, {'S', 'Z'}):
1652
                    current += 2
1653
                else:
1654
                    current += 1
1655
                continue
1656
1657
        elif _get_at(current) == 'T':
1658
            if _string_at(current, 4, {'TION'}):
1659
                (primary, secondary) = _metaph_add('X')
1660
                current += 3
1661
                continue
1662
1663
            elif _string_at(current, 3, {'TIA', 'TCH'}):
1664
                (primary, secondary) = _metaph_add('X')
1665
                current += 3
1666
                continue
1667
1668
            elif (_string_at(current, 2, {'TH'}) or
1669
                  _string_at(current, 3, {'TTH'})):
1670
                # special case 'thomas', 'thames' or germanic
1671
                if ((_string_at((current + 2), 2, {'OM', 'AM'}) or
1672
                     _string_at(0, 4, {'VAN ', 'VON '}) or
1673
                     _string_at(0, 3, {'SCH'}))):
1674
                    (primary, secondary) = _metaph_add('T')
1675
                else:
1676
                    (primary, secondary) = _metaph_add('0', 'T')
1677
                current += 2
1678
                continue
1679
1680
            elif _string_at((current + 1), 1, {'T', 'D'}):
1681
                current += 2
1682
            else:
1683
                current += 1
1684
            (primary, secondary) = _metaph_add('T')
1685
            continue
1686
1687
        elif _get_at(current) == 'V':
1688
            if _get_at(current + 1) == 'V':
1689
                current += 2
1690
            else:
1691
                current += 1
1692
            (primary, secondary) = _metaph_add('F')
1693
            continue
1694
1695
        elif _get_at(current) == 'W':
1696
            # can also be in middle of word
1697
            if _string_at(current, 2, {'WR'}):
1698
                (primary, secondary) = _metaph_add('R')
1699
                current += 2
1700
                continue
1701
            elif ((current == 0) and
1702
                  (_is_vowel(current + 1) or _string_at(current, 2, {'WH'}))):
1703
                # Wasserman should match Vasserman
1704
                if _is_vowel(current + 1):
1705
                    (primary, secondary) = _metaph_add('A', 'F')
1706
                else:
1707
                    # need Uomo to match Womo
1708
                    (primary, secondary) = _metaph_add('A')
1709
1710
            # Arnow should match Arnoff
1711
            if ((((current == last) and _is_vowel(current - 1)) or
1712
                 _string_at((current - 1), 5,
1713
                            {'EWSKI', 'EWSKY', 'OWSKI', 'OWSKY'}) or
1714
                 _string_at(0, 3, ['SCH']))):
1715
                (primary, secondary) = _metaph_add('', 'F')
1716
                current += 1
1717
                continue
1718
            # Polish e.g. 'filipowicz'
1719
            elif _string_at(current, 4, {'WICZ', 'WITZ'}):
1720
                (primary, secondary) = _metaph_add('TS', 'FX')
1721
                current += 4
1722
                continue
1723
            # else skip it
1724
            else:
1725
                current += 1
1726
                continue
1727
1728
        elif _get_at(current) == 'X':
1729
            # French e.g. breaux
1730
            if (not ((current == last) and
1731
                     (_string_at((current - 3), 3, {'IAU', 'EAU'}) or
1732
                      _string_at((current - 2), 2, {'AU', 'OU'})))):
1733
                (primary, secondary) = _metaph_add('KS')
1734
1735
            if _string_at((current + 1), 1, {'C', 'X'}):
1736
                current += 2
1737
            else:
1738
                current += 1
1739
            continue
1740
1741
        elif _get_at(current) == 'Z':
1742
            # Chinese Pinyin e.g. 'zhao'
1743
            if _get_at(current + 1) == 'H':
1744
                (primary, secondary) = _metaph_add('J')
1745
                current += 2
1746
                continue
1747
            elif (_string_at((current + 1), 2, {'ZO', 'ZI', 'ZA'}) or
1748
                  (_slavo_germanic() and ((current > 0) and
1749
                                          _get_at(current - 1) != 'T'))):
1750
                (primary, secondary) = _metaph_add('S', 'TS')
1751
            else:
1752
                (primary, secondary) = _metaph_add('S')
1753
1754
            if _get_at(current + 1) == 'Z':
1755
                current += 2
1756
            else:
1757
                current += 1
1758
            continue
1759
1760
        else:
1761
            current += 1
1762
1763
    if maxlength and maxlength < _INFINITY:
1764
        primary = primary[:maxlength]
1765
        secondary = secondary[:maxlength]
1766
    if primary == secondary:
1767
        secondary = ''
1768
1769
    return (primary, secondary)
1770
1771
1772
def caverphone(word, version=2):
    """Return the Caverphone code for a word.

    Version 1 of the algorithm is described in :cite:`Hood:2002` and
    version 2 in :cite:`Hood:2004`.

    The word is lowercased and stripped of everything but a-z, then passed
    through an ordered series of textual rewrites. During the rewrites,
    '2' marks a silent consonant and '3' marks a vowel; uppercase letters
    are the characters that survive into the final code. The result is
    padded with '1' to a fixed width (6 for version 1, otherwise 10).

    :param str word: the word to transform
    :param int version: the version of Caverphone to employ for encoding
        (defaults to 2); any value other than 1 selects the version 2 rules
    :returns: the Caverphone value
    :rtype: str

    >>> caverphone('Christopher')
    'KRSTFA1111'
    >>> caverphone('Niall')
    'NA11111111'
    >>> caverphone('Smith')
    'SMT1111111'
    >>> caverphone('Schmidt')
    'SKMT111111'

    >>> caverphone('Christopher', 1)
    'KRSTF1'
    >>> caverphone('Niall', 1)
    'N11111'
    >>> caverphone('Smith', 1)
    'SMT111'
    >>> caverphone('Schmidt', 1)
    'SKMT11'
    """
    is_v1 = version == 1
    alphabet = frozenset('abcdefghijklmnopqrstuvwxyz')
    vowels = frozenset('aeiou')

    def _collapse(text, letter, replacement):
        """Squeeze runs of letter to a single replacement character."""
        doubled = letter + letter
        while doubled in text:
            text = text.replace(doubled, letter)
        return text.replace(letter, replacement)

    def _rewrite(text, pairs):
        """Apply each (old, new) replacement to text, in the given order."""
        for old, new in pairs:
            text = text.replace(old, new)
        return text

    code = ''.join(char for char in word.lower() if char in alphabet)

    # version 2 drops a single trailing 'e' before anything else happens
    if not is_v1 and code.endswith('e'):
        code = code[:-1]

    if code:
        # word-initial '-ough' words: the 'gh' is rewritten to '2f'
        ough_words = ['cough', 'rough', 'tough', 'enough']
        if not is_v1:
            ough_words.append('trough')
        for prefix in ough_words:
            if code.startswith(prefix):
                code = prefix[:-2] + '2f' + code[len(prefix):]

        if code.startswith('gn'):
            code = '2n' + code[2:]
        if code.endswith('mb'):
            code = code[:-1] + '2'

        # consonant normalisation; the order of these rules is significant
        code = _rewrite(code, (
            ('cq', '2q'), ('ci', 'si'), ('ce', 'se'), ('cy', 'sy'),
            ('tch', '2ch'), ('c', 'k'), ('q', 'k'), ('x', 'k'),
            ('v', 'f'), ('dg', '2g'), ('tio', 'sio'), ('tia', 'sia'),
            ('d', 't'), ('ph', 'fh'), ('b', 'p'), ('sh', 's2'),
            ('z', 's'),
        ))

        # an initial vowel is kept as 'A'; every other vowel becomes '3'
        if code[0] in vowels:
            code = 'A' + code[1:]
        for vowel in 'aeiou':
            code = code.replace(vowel, '3')

        if not is_v1:
            code = code.replace('j', 'y')
            if code.startswith('y3'):
                code = 'Y3' + code[2:]
            if code.startswith('y'):
                code = 'A' + code[1:]
            code = code.replace('y', '3')

        code = _rewrite(code, (('3gh3', '3kh3'), ('gh', '22'), ('g', 'k')))

        # squeeze doubled consonants and promote them to final code letters
        for letter in 'stpkfmn':
            code = _collapse(code, letter, letter.upper())

        # w: kept before a vowel (optionally via 'h'), otherwise silent
        code = code.replace('w3', 'W3')
        if is_v1:
            code = code.replace('wy', 'Wy')
        code = code.replace('wh3', 'Wh3')
        if is_v1:
            code = code.replace('why', 'Why')
        if not is_v1 and code.endswith('w'):
            code = code[:-1] + '3'
        code = code.replace('w', '2')

        # h: an initial h is kept as 'A', all others are silent
        if code.startswith('h'):
            code = 'A' + code[1:]
        code = code.replace('h', '2')

        # r and l follow one pattern: kept before a vowel, otherwise silent
        for liquid in 'rl':
            kept = liquid.upper()
            code = code.replace(liquid + '3', kept + '3')
            if is_v1:
                code = code.replace(liquid + 'y', kept + 'y')
            elif code.endswith(liquid):
                code = code[:-1] + '3'
            code = code.replace(liquid, '2')

        if is_v1:
            code = _rewrite(code, (('j', 'y'), ('y3', 'Y3'), ('y', '2')))

        # drop the silent markers, then the vowel markers; version 2 keeps
        # a final vowel as 'A'
        code = code.replace('2', '')
        if not is_v1 and code.endswith('3'):
            code = code[:-1] + 'A'
        code = code.replace('3', '')

    # pad with 1s, then cut to the width of the selected version
    width = 6 if is_v1 else 10
    return (code + '1' * 10)[:width]
1921
1922
1923
def alpha_sis(word, maxlength=14):
    """Return the IBM Alpha Search Inquiry System code for a word.

    The Alpha Search Inquiry System code is defined in :cite:`IBM:1973`.
    This implementation is based on the description in :cite:`Moore:1977`.

    Some letter groups have several alternative encodings, so a word may
    yield more than one code. The codes are returned as an ordered tuple
    whose first element is the primary coding.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 14);
        values are clamped to the range [4, 64] and None is treated as 64
    :returns: the Alpha SIS value
    :rtype: tuple

    >>> alpha_sis('Christopher')
    ('06401840000000', '07040184000000', '04018400000000')
    >>> alpha_sis('Niall')
    ('02500000000000',)
    >>> alpha_sis('Smith')
    ('03100000000000',)
    >>> alpha_sis('Schmidt')
    ('06310000000000',)
    """
    # (prefix, code) pairs tried in order against the start of the word
    initial_rules = (
        ('GF', '08'), ('GM', '03'), ('GN', '02'), ('KN', '02'),
        ('PF', '08'), ('PN', '02'), ('PS', '00'), ('WR', '04'),
        ('A', '1'), ('E', '1'), ('H', '2'), ('I', '1'), ('J', '3'),
        ('O', '1'), ('U', '1'), ('W', '4'), ('Y', '5'),
    )
    # (pattern, alternative codes) pairs tried in order at each position;
    # longer patterns come first so they win over their own prefixes
    basic_rules = (
        ('SCH', ('6',)), ('CZ', ('70', '6', '0')), ('CH', ('6', '70', '0')),
        ('CK', ('7', '6')), ('DS', ('0', '10')), ('DZ', ('0', '10')),
        ('TS', ('0', '10')), ('TZ', ('0', '10')), ('CI', ('0',)),
        ('CY', ('0',)), ('CE', ('0',)), ('SH', ('6',)), ('DG', ('7',)),
        ('PH', ('8',)), ('C', ('7', '6')), ('K', ('7', '6')), ('Z', ('0',)),
        ('S', ('0',)), ('D', ('1',)), ('T', ('1',)), ('N', ('2',)),
        ('M', ('3',)), ('R', ('4',)), ('L', ('5',)), ('J', ('6',)),
        ('G', ('7',)), ('Q', ('7',)), ('X', ('7',)), ('F', ('8',)),
        ('V', ('8',)), ('B', ('9',)), ('P', ('9',)),
    )
    uppercase = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    def _drop_doublets(code):
        """Remove repeated characters in one left-to-right pass.

        The index advances after every comparison, including after a
        removal, so a run of three equal characters is reduced to two.
        """
        idx = 1
        while idx < len(code):
            if code[idx] == code[idx - 1]:
                code = code[:idx] + code[idx + 1:]
            idx += 1
        return code

    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(char for char in word if char in uppercase)

    # Clamp maxlength to [4, 64]
    maxlength = 64 if maxlength is None else min(max(4, maxlength), 64)

    # Special processing for initial substrings; '0' when none applies
    codes = ['0']
    pos = 0
    for prefix, digits in initial_rules:
        if word.startswith(prefix):
            codes = [digits]
            pos = len(prefix)
            break

    # Main encoding loop over the remainder of the word
    while pos < len(word):
        for pattern, variants in basic_rules:
            if word.startswith(pattern, pos):
                # every alternative forks each code built so far
                codes = [code + variant
                         for variant in variants for code in codes]
                pos += len(pattern)
                break
        else:
            # uncoded letter: leave a placeholder so that equal codes on
            # either side of it are not treated as a doublet
            codes = [code + '_' for code in codes]
            pos += 1

    # Trim doublets and placeholders, then pad/cut to maxlength
    padding = '0' * maxlength
    return tuple((_drop_doublets(code).replace('_', '') + padding)[:maxlength]
                 for code in codes)
2025
2026
2027
def fuzzy_soundex(word, maxlength=5, zero_pad=True):
    """Return the Fuzzy Soundex code for a word.

    Fuzzy Soundex is an algorithm derived from Soundex, defined in
    :cite:`Holmes:2002`.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 5);
        values are clamped to the range [4, 64] and None is treated as 64
    :param bool zero_pad: pad the end of the return value with 0s to achieve
        a maxlength string
    :returns: the Fuzzy Soundex value
    :rtype: str

    >>> fuzzy_soundex('Christopher')
    'K6931'
    >>> fuzzy_soundex('Niall')
    'N4000'
    >>> fuzzy_soundex('Smith')
    'S5300'
    """
    # letter -> code table; '-' marks letters that are dropped outright
    translation = {ord(letter): digit for letter, digit in
                   zip('ABCDEFGHIJKLMNOPQRSTUVWXYZ',
                       '0193017-07745501769301-7-9')}
    # replacements for the first two letters of the word
    initial_pairs = {'CS': 'SS', 'CZ': 'SS', 'TS': 'SS', 'TZ': 'SS',
                     'GN': 'NN', 'HR': 'RR', 'WR': 'RR', 'HW': 'WW',
                     'KN': 'NN', 'NG': 'NN'}
    # replacements for the end of the word; only the first match applies
    final_clusters = (('CH', 'KK'), ('NT', 'TT'), ('RT', 'RR'),
                      ('RDT', 'RR'))
    # replacements applied everywhere, in this order
    substitutions = (
        ('CA', 'KA'), ('CC', 'KK'), ('CK', 'KK'), ('CE', 'SE'),
        ('CHL', 'KL'), ('CL', 'KL'), ('CHR', 'KR'), ('CR', 'KR'),
        ('CI', 'SI'), ('CO', 'KO'), ('CU', 'KU'), ('CY', 'SY'),
        ('DG', 'GG'), ('GH', 'HH'), ('MAC', 'MK'), ('MC', 'MK'),
        ('NST', 'NSS'), ('PF', 'FF'), ('PH', 'FF'), ('SCH', 'SSS'),
        ('TIO', 'SIO'), ('TIA', 'SIO'), ('TCH', 'CHH'),
    )

    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    # Clamp maxlength to [4, 64]
    maxlength = 64 if maxlength is None else min(max(4, maxlength), 64)

    if not word:
        return '0' * maxlength if zero_pad else '0'

    head = word[:2]
    if head in initial_pairs:
        word = initial_pairs[head] + word[2:]

    for ending, replacement in final_clusters:
        if word.endswith(ending):
            word = word[:-len(ending)] + replacement
            break

    for old, new in substitutions:
        word = word.replace(old, new)

    # translate to codes, drop the ignored letters, squeeze repeated codes
    sdx = word.translate(translation).replace('-', '')
    sdx = _delete_consecutive_repeats(sdx)

    # The code always starts with the (rewritten) first letter. H, W and Y
    # translate to '-' and are already gone from sdx, so they are prepended;
    # any other first letter replaces its own leading code.
    first = word[0]
    if first in {'H', 'W', 'Y'}:
        sdx = first + sdx
    else:
        sdx = first + sdx[1:]

    sdx = sdx.replace('0', '')

    if zero_pad:
        sdx += '0' * maxlength

    return sdx[:maxlength]
2128
2129
2130
def phonex(word, maxlength=4, zero_pad=True):
    """Return the Phonex code for a word.

    Phonex is an algorithm derived from Soundex, defined in :cite:`Lait:1996`.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 4);
        values are clamped to the range [4, 64] and None is treated as 64
    :param bool zero_pad: pad the end of the return value with 0s to achieve
        a maxlength string
    :returns: the Phonex value
    :rtype: str

    >>> phonex('Christopher')
    'C623'
    >>> phonex('Niall')
    'N400'
    >>> phonex('Schmidt')
    'S253'
    >>> phonex('Smith')
    'S530'
    """
    vowels_y = frozenset('AEIOUY')
    # phonetic equivalents of the first two characters
    leading_pairs = {'KN': 'N', 'PH': 'F', 'WR': 'R'}
    # phonetic equivalents of the first character
    first_letter = {'A': 'A', 'E': 'A', 'I': 'A', 'O': 'A', 'U': 'A',
                    'Y': 'A', 'B': 'B', 'P': 'B', 'V': 'F', 'F': 'F',
                    'C': 'C', 'K': 'C', 'Q': 'C', 'G': 'G', 'J': 'G',
                    'S': 'S', 'Z': 'S'}
    labials = frozenset('BFPV')
    sibilants = frozenset('CGJKQSXZ')
    dentals = frozenset('DT')
    nasals = frozenset('MN')
    # L and R are only coded before a vowel/Y or at the end of the name
    liquid_codes = {'L': '4', 'R': '6'}

    name = normalize('NFKD', text_type(word.upper()))
    name = name.replace('ß', 'SS')

    # Clamp maxlength to [4, 64]
    maxlength = 64 if maxlength is None else min(max(4, maxlength), 64)

    # Remove any trailing Ss
    name = name.rstrip('S')

    pair = name[:2]
    if pair in leading_pairs:
        name = leading_pairs[pair] + name[2:]

    # Ignore a single leading H
    if name[:1] == 'H':
        name = name[1:]

    name_code = last = ''
    if name:
        name = first_letter.get(name[0], name[0]) + name[1:]
        name_code = last = name[0]

    # MODIFIED SOUNDEX CODE
    length = len(name)
    for idx in range(1, length):
        char = name[idx]
        following = name[idx + 1:idx + 2]
        code = '0'
        if char in labials:
            code = '1'
        elif char in sibilants:
            code = '2'
        elif char in dentals:
            # D/T directly before C is left uncoded
            if following != 'C':
                code = '3'
        elif char in liquid_codes:
            if following in vowels_y or idx + 1 == length:
                code = liquid_codes[char]
        elif char in nasals:
            # A D or G after M/N is overwritten with the nasal itself, so
            # the next iteration sees a duplicate and adds nothing for it.
            if following in {'D', 'G'}:
                name = name[:idx + 1] + char + name[idx + 2:]
            code = '5'

        if code != '0' and code != last:
            name_code += code

        # compare against the last emitted character, not the last letter
        last = name_code[-1]

    if zero_pad:
        name_code += '0' * maxlength
    if not name_code:
        name_code = '0'
    return name_code[:maxlength]
2235
2236
2237
def phonem(word):
    """Return the Phonem code for a word.

    Phonem is defined in :cite:`Wilde:1988`.

    This version is based on the Perl implementation documented at
    :cite:`Wilz:2005`.
    It includes some enhancements presented in the Java port at
    :cite:`dcm4che:2011`.

    Phonem is intended chiefly for German names/words.

    The word is uppercased and NFC-normalized, multi-letter groups are
    rewritten, single characters are mapped to their class representative,
    consecutive repeats are squeezed, and finally every character outside
    the code alphabet is discarded.

    :param str word: the word to transform
    :returns: the Phonem value
    :rtype: str

    >>> phonem('Christopher')
    'CRYSDOVR'
    >>> phonem('Niall')
    'NYAL'
    >>> phonem('Smith')
    'SMYD'
    >>> phonem('Schmidt')
    'CMYD'
    """
    # two-letter groups rewritten first, in this order; '§' is a temporary
    # placeholder that the character table below turns into 'U'
    digraph_rules = (
        ('SC', 'C'), ('SZ', 'C'), ('CZ', 'C'), ('TZ', 'C'), ('TS', 'C'),
        ('KS', 'X'), ('PF', 'V'), ('QU', 'KW'), ('PH', 'V'), ('UE', 'Y'),
        ('AE', 'E'), ('OE', 'Ö'), ('EI', 'AY'), ('EY', 'AY'), ('EU', 'OY'),
        ('AU', 'A§'), ('OU', '§'),
    )
    # single-character mapping onto the representative of each class
    char_table = {ord(source): target for source, target in
                  zip('ZKGQÇÑßFWPTÁÀÂÃÅÄÆÉÈÊËIJÌÍÎÏÜݧÚÙÛÔÒÓÕØ',
                      'CCCCCNSVVBDAAAAAEEEEEEYYYYYYYYUUUUOOOOÖ')}
    # the only characters allowed in the final code (note: no 'E')
    code_alphabet = {'A', 'B', 'C', 'D', 'L', 'M', 'N', 'O', 'R', 'S', 'U',
                     'V', 'W', 'X', 'Y', 'Ö'}

    word = normalize('NFC', text_type(word.upper()))
    for old, new in digraph_rules:
        word = word.replace(old, new)
    word = word.translate(char_table)

    squeezed = _delete_consecutive_repeats(word)
    return ''.join(char for char in squeezed if char in code_alphabet)
2280
2281
2282
def phonix(word, maxlength=4, zero_pad=True):
2283
    """Return the Phonix code for a word.
2284
2285
    Phonix is a Soundex-like algorithm defined in :cite:`Gadd:1990`.
2286
2287
    This implementation is based on:
2288
    - :cite:`Pfeifer:2000`
2289
    - :cite:`Christen:2011`
2290
    - :cite:`Kollar:2007`
2291
2292
    :param str word: the word to transform
2293
    :param int maxlength: the length of the code returned (defaults to 4)
2294
    :param bool zero_pad: pad the end of the return value with 0s to achieve
2295
        a maxlength string
2296
    :returns: the Phonix value
2297
    :rtype: str
2298
2299
    >>> phonix('Christopher')
2300
    'K683'
2301
    >>> phonix('Niall')
2302
    'N400'
2303
    >>> phonix('Smith')
2304
    'S530'
2305
    >>> phonix('Schmidt')
2306
    'S530'
2307
    """
2308
    def _start_repl(word, src, tar, post=None):
2309
        r"""Replace src with tar at the start of word."""
2310
        if post:
2311
            for i in post:
2312
                if word.startswith(src+i):
2313
                    return tar + word[len(src):]
2314
        elif word.startswith(src):
2315
            return tar + word[len(src):]
2316
        return word
2317
2318
    def _end_repl(word, src, tar, pre=None):
2319
        r"""Replace src with tar at the end of word."""
2320
        if pre:
2321
            for i in pre:
2322
                if word.endswith(i+src):
2323
                    return word[:-len(src)] + tar
2324
        elif word.endswith(src):
2325
            return word[:-len(src)] + tar
2326
        return word
2327
2328
    def _mid_repl(word, src, tar, pre=None, post=None):
2329
        r"""Replace src with tar in the middle of word."""
2330
        if pre or post:
2331
            if not pre:
2332
                return word[0] + _all_repl(word[1:], src, tar, pre, post)
2333
            elif not post:
2334
                return _all_repl(word[:-1], src, tar, pre, post) + word[-1]
2335
            return _all_repl(word, src, tar, pre, post)
2336
        return (word[0] + _all_repl(word[1:-1], src, tar, pre, post) +
2337
                word[-1])
2338
2339
    def _all_repl(word, src, tar, pre=None, post=None):
2340
        r"""Replace src with tar anywhere in word."""
2341
        if pre or post:
2342
            if post:
2343
                post = post
2344
            else:
2345
                post = frozenset(('',))
2346
            if pre:
2347
                pre = pre
2348
            else:
2349
                pre = frozenset(('',))
2350
2351
            for i, j in ((i, j) for i in pre for j in post):
2352
                word = word.replace(i+src+j, i+tar+j)
2353
            return word
2354
        else:
2355
            return word.replace(src, tar)
2356
2357
    _vow = {'A', 'E', 'I', 'O', 'U'}
2358
    _con = {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q',
2359
            'R', 'S', 'T', 'V', 'W', 'X', 'Y', 'Z'}
2360
2361
    _phonix_substitutions = ((_all_repl, 'DG', 'G'),
2362
                             (_all_repl, 'CO', 'KO'),
2363
                             (_all_repl, 'CA', 'KA'),
2364
                             (_all_repl, 'CU', 'KU'),
2365
                             (_all_repl, 'CY', 'SI'),
2366
                             (_all_repl, 'CI', 'SI'),
2367
                             (_all_repl, 'CE', 'SE'),
2368
                             (_start_repl, 'CL', 'KL', _vow),
2369
                             (_all_repl, 'CK', 'K'),
2370
                             (_end_repl, 'GC', 'K'),
2371
                             (_end_repl, 'JC', 'K'),
2372
                             (_start_repl, 'CHR', 'KR', _vow),
2373
                             (_start_repl, 'CR', 'KR', _vow),
2374
                             (_start_repl, 'WR', 'R'),
2375
                             (_all_repl, 'NC', 'NK'),
2376
                             (_all_repl, 'CT', 'KT'),
2377
                             (_all_repl, 'PH', 'F'),
2378
                             (_all_repl, 'AA', 'AR'),
2379
                             (_all_repl, 'SCH', 'SH'),
2380
                             (_all_repl, 'BTL', 'TL'),
2381
                             (_all_repl, 'GHT', 'T'),
2382
                             (_all_repl, 'AUGH', 'ARF'),
2383
                             (_mid_repl, 'LJ', 'LD', _vow, _vow),
2384
                             (_all_repl, 'LOUGH', 'LOW'),
2385
                             (_start_repl, 'Q', 'KW'),
2386
                             (_start_repl, 'KN', 'N'),
2387
                             (_end_repl, 'GN', 'N'),
2388
                             (_all_repl, 'GHN', 'N'),
2389
                             (_end_repl, 'GNE', 'N'),
2390
                             (_all_repl, 'GHNE', 'NE'),
2391
                             (_end_repl, 'GNES', 'NS'),
2392
                             (_start_repl, 'GN', 'N'),
2393
                             (_mid_repl, 'GN', 'N', None, _con),
2394
                             (_end_repl, 'GN', 'N'),
2395
                             (_start_repl, 'PS', 'S'),
2396
                             (_start_repl, 'PT', 'T'),
2397
                             (_start_repl, 'CZ', 'C'),
2398
                             (_mid_repl, 'WZ', 'Z', _vow),
2399
                             (_mid_repl, 'CZ', 'CH'),
2400
                             (_all_repl, 'LZ', 'LSH'),
2401
                             (_all_repl, 'RZ', 'RSH'),
2402
                             (_mid_repl, 'Z', 'S', None, _vow),
2403
                             (_all_repl, 'ZZ', 'TS'),
2404
                             (_mid_repl, 'Z', 'TS', _con),
2405
                             (_all_repl, 'HROUG', 'REW'),
2406
                             (_all_repl, 'OUGH', 'OF'),
2407
                             (_mid_repl, 'Q', 'KW', _vow, _vow),
2408
                             (_mid_repl, 'J', 'Y', _vow, _vow),
2409
                             (_start_repl, 'YJ', 'Y', _vow),
2410
                             (_start_repl, 'GH', 'G'),
2411
                             (_end_repl, 'GH', 'E', _vow),
2412
                             (_start_repl, 'CY', 'S'),
2413
                             (_all_repl, 'NX', 'NKS'),
2414
                             (_start_repl, 'PF', 'F'),
2415
                             (_end_repl, 'DT', 'T'),
2416
                             (_end_repl, 'TL', 'TIL'),
2417
                             (_end_repl, 'DL', 'DIL'),
2418
                             (_all_repl, 'YTH', 'ITH'),
2419
                             (_start_repl, 'TJ', 'CH', _vow),
2420
                             (_start_repl, 'TSJ', 'CH', _vow),
2421
                             (_start_repl, 'TS', 'T', _vow),
2422
                             (_all_repl, 'TCH', 'CH'),
2423
                             (_mid_repl, 'WSK', 'VSKIE', _vow),
2424
                             (_end_repl, 'WSK', 'VSKIE', _vow),
2425
                             (_start_repl, 'MN', 'N', _vow),
2426
                             (_start_repl, 'PN', 'N', _vow),
2427
                             (_mid_repl, 'STL', 'SL', _vow),
2428
                             (_end_repl, 'STL', 'SL', _vow),
2429
                             (_end_repl, 'TNT', 'ENT'),
2430
                             (_end_repl, 'EAUX', 'OH'),
2431
                             (_all_repl, 'EXCI', 'ECS'),
2432
                             (_all_repl, 'X', 'ECS'),
2433
                             (_end_repl, 'NED', 'ND'),
2434
                             (_all_repl, 'JR', 'DR'),
2435
                             (_end_repl, 'EE', 'EA'),
2436
                             (_all_repl, 'ZS', 'S'),
2437
                             (_mid_repl, 'R', 'AH', _vow, _con),
2438
                             (_end_repl, 'R', 'AH', _vow),
2439
                             (_mid_repl, 'HR', 'AH', _vow, _con),
2440
                             (_end_repl, 'HR', 'AH', _vow),
2441
                             (_end_repl, 'HR', 'AH', _vow),
2442
                             (_end_repl, 'RE', 'AR'),
2443
                             (_end_repl, 'R', 'AH', _vow),
2444
                             (_all_repl, 'LLE', 'LE'),
2445
                             (_end_repl, 'LE', 'ILE', _con),
2446
                             (_end_repl, 'LES', 'ILES', _con),
2447
                             (_end_repl, 'E', ''),
2448
                             (_end_repl, 'ES', 'S'),
2449
                             (_end_repl, 'SS', 'AS', _vow),
2450
                             (_end_repl, 'MB', 'M', _vow),
2451
                             (_all_repl, 'MPTS', 'MPS'),
2452
                             (_all_repl, 'MPS', 'MS'),
2453
                             (_all_repl, 'MPT', 'MT'))
2454
2455
    _phonix_translation = dict(zip((ord(_) for _ in
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable _ does not seem to be defined.
Loading history...
2456
                                    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
2457
                                   '01230720022455012683070808'))
2458
2459
    sdx = ''
2460
2461
    word = normalize('NFKD', text_type(word.upper()))
2462
    word = word.replace('ß', 'SS')
2463
    word = ''.join(c for c in word if c in
2464
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
2465
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
2466
                    'Y', 'Z'})
2467
    if word:
2468
        for trans in _phonix_substitutions:
2469
            word = trans[0](word, *trans[1:])
2470
        if word[0] in {'A', 'E', 'I', 'O', 'U', 'Y'}:
2471
            sdx = 'v' + word[1:].translate(_phonix_translation)
2472
        else:
2473
            sdx = word[0] + word[1:].translate(_phonix_translation)
2474
        sdx = _delete_consecutive_repeats(sdx)
2475
        sdx = sdx.replace('0', '')
2476
2477
    # Clamp maxlength to [4, 64]
2478
    if maxlength is not None:
2479
        maxlength = min(max(4, maxlength), 64)
2480
    else:
2481
        maxlength = 64
2482
2483
    if zero_pad:
2484
        sdx += '0' * maxlength
2485
    if not sdx:
2486
        sdx = '0'
2487
    return sdx[:maxlength]
2488
2489
2490
def sfinxbis(word, maxlength=None):
    """Return the SfinxBis code for a word.

    SfinxBis is a Soundex-like algorithm defined in :cite:`Axelsson:2009`.

    This implementation follows the reference implementation:
    :cite:`Sjoo:2009`.

    SfinxBis is intended chiefly for Swedish names.

    The input is upper-cased, hyphens are treated as spaces, the listed
    nobility prefixes are removed, and the remainder is split on whitespace.
    Each resulting token is encoded separately, so the returned tuple holds
    one code per token. If no token remains, ``('',)`` is returned.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to
        unlimited)
    :returns: the SfinxBis value, one code per token of the input
    :rtype: tuple

    >>> sfinxbis('Christopher')
    ('K68376',)
    >>> sfinxbis('Niall')
    ('N4',)
    >>> sfinxbis('Smith')
    ('S53',)
    >>> sfinxbis('Schmidt')
    ('S53',)

    >>> sfinxbis('Johansson')
    ('J585',)
    >>> sfinxbis('Sjöberg')
    ('#162',)
    """
    # Nobility prefixes ("adelstitlar"); multi-word entries come first so
    # that they are removed before their single-word components.
    adelstitler = (' DE LA ', ' DE LAS ', ' DE LOS ', ' VAN DE ', ' VAN DEN ',
                   ' VAN DER ', ' VON DEM ', ' VON DER ',
                   ' AF ', ' AV ', ' DA ', ' DE ', ' DEL ', ' DEN ', ' DES ',
                   ' DI ', ' DO ', ' DON ', ' DOS ', ' DU ', ' E ', ' IN ',
                   ' LA ', ' LE ', ' MAC ', ' MC ', ' VAN ', ' VON ', ' Y ',
                   ' S:T ')

    # Fixed-order tuples drive the chained replacements below; the sets built
    # from them are used for membership tests only.
    _harde_vokaler_ordnade = ('A', 'O', 'U', 'Å')       # hard vowels
    _mjuka_vokaler_ordnade = ('E', 'I', 'Y', 'Ä', 'Ö')  # soft vowels
    _konsonanter_ordnade = ('B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M',
                            'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Z')

    _harde_vokaler = set(_harde_vokaler_ordnade)
    _mjuka_vokaler = set(_mjuka_vokaler_ordnade)
    _vokaler = _mjuka_vokaler | _harde_vokaler
    _konsonanter = set(_konsonanter_ordnade)
    _alfabet = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                'Y', 'Z', 'Ä', 'Å', 'Ö'}

    # Letter -> digit table for the remainder of each token; every vowel
    # maps to '9', which is stripped again in step 11.
    _sfinxbis_translation = dict(zip((ord(tecken) for tecken in
                                      'BCDFGHJKLMNPQRSTVZAOUÅEIYÄÖ'),
                                     '123729224551268378999999999'))

    # Folds W, Z and accented capitals onto letters of the Swedish alphabet.
    _sfinxbis_substitutions = dict(zip((ord(tecken) for tecken in
                                        'WZÀÁÂÃÆÇÈÉÊËÌÍÎÏÑÒÓÔÕØÙÚÛÜÝ'),
                                       'VSAAAAÄCEEEEIIIINOOOOÖUUUYY'))

    def _foersvensker(ordet):
        """Return the Swedish-ized form of the word."""
        ordet = ordet.replace('STIERN', 'STJÄRN')
        ordet = ordet.replace('HIE', 'HJ')
        ordet = ordet.replace('SIÖ', 'SJÖ')
        ordet = ordet.replace('SCH', 'SH')
        ordet = ordet.replace('QU', 'KV')
        ordet = ordet.replace('IO', 'JO')
        ordet = ordet.replace('PH', 'F')

        # Ü, Y and I following a vowel become J. The order is fixed (hard
        # vowels first, then soft ones) because I and Y are themselves soft
        # vowels: 'EIYJ' turns into 'EJYJ' when E is handled first but into
        # 'EJJJ' when I is. Iterating a set here would let string hash
        # randomization decide which of the two is produced.
        for vokal in _harde_vokaler_ordnade + _mjuka_vokaler_ordnade:
            for efter in ('Ü', 'Y', 'I'):
                ordet = ordet.replace(vokal + efter, vokal + 'J')

        # An H directly in front of a consonant is dropped.
        if 'H' in ordet:
            for konsonant in _konsonanter_ordnade:
                ordet = ordet.replace('H' + konsonant, konsonant)

        ordet = ordet.translate(_sfinxbis_substitutions)

        ordet = ordet.replace('Ð', 'ETH')
        ordet = ordet.replace('Þ', 'TH')
        ordet = ordet.replace('ß', 'SS')

        return ordet

    def _koda_foersta_ljudet(ordet):
        """Return the word with the first sound coded."""
        # Slices rather than indexing keep every test safe on short or empty
        # tokens: an empty slice is never a member of the sets or tuples.
        if ordet[0:1] in _vokaler:
            ordet = '$' + ordet[1:]
        elif ordet[0:2] in ('DJ', 'GJ', 'HJ', 'LJ'):
            ordet = 'J' + ordet[2:]
        elif ordet[0:1] == 'G' and ordet[1:2] in _mjuka_vokaler:
            ordet = 'J' + ordet[1:]
        elif ordet[0:1] == 'Q':
            ordet = 'K' + ordet[1:]
        elif ordet[0:2] == 'CH' and ordet[2:3] in _vokaler:
            ordet = '#' + ordet[2:]
        elif ordet[0:1] == 'C' and ordet[1:2] in _harde_vokaler:
            ordet = 'K' + ordet[1:]
        elif ordet[0:1] == 'C' and ordet[1:2] in _konsonanter:
            ordet = 'K' + ordet[1:]
        elif ordet[0:1] == 'X':
            ordet = 'S' + ordet[1:]
        elif ordet[0:1] == 'C' and ordet[1:2] in _mjuka_vokaler:
            ordet = 'S' + ordet[1:]
        elif ordet[0:3] in ('SKJ', 'STJ', 'SCH'):
            ordet = '#' + ordet[3:]
        elif ordet[0:2] in ('SH', 'KJ', 'TJ', 'SJ'):
            ordet = '#' + ordet[2:]
        elif ordet[0:2] == 'SK' and ordet[2:3] in _mjuka_vokaler:
            ordet = '#' + ordet[2:]
        elif ordet[0:1] == 'K' and ordet[1:2] in _mjuka_vokaler:
            ordet = '#' + ordet[1:]
        return ordet

    # Step 1: upper-case
    word = normalize('NFC', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = word.replace('-', ' ')

    # Step 2: remove nobility prefixes
    for adelstitel in adelstitler:
        # Repeat until stable: removing one occurrence can expose another.
        while adelstitel in word:
            word = word.replace(adelstitel, ' ')
        # At the very start of the input there is no leading space to match.
        if word.startswith(adelstitel[1:]):
            word = word[len(adelstitel)-1:]

    # Split word into tokens
    ordlista = word.split()

    # Step 3: remove doubled letters (the helper is applied to each whole
    # token)
    ordlista = [_delete_consecutive_repeats(ordet) for ordet in ordlista]
    if not ordlista:
        return ('',)

    # Step 4: Swedish-ize the spelling
    ordlista = [_foersvensker(ordet) for ordet in ordlista]

    # Step 5: drop every character that is not A-Ö (65-90, 196, 197, 214)
    ordlista = [''.join(c for c in ordet if c in _alfabet)
                for ordet in ordlista]

    # Step 6: encode the first sound
    ordlista = [_koda_foersta_ljudet(ordet) for ordet in ordlista]

    # Step 7: split the name into two parts (first character / remainder)
    rest = [ordet[1:] for ordet in ordlista]

    # Step 8: apply the phonetic transformation to the remainder
    rest = [ordet.replace('DT', 'T') for ordet in rest]
    rest = [ordet.replace('X', 'KS') for ordet in rest]

    # Step 9: encode the remainder as a digit code; C before a soft vowel is
    # coded 8 instead of the table's default for C
    for vokal in _mjuka_vokaler_ordnade:
        rest = [ordet.replace('C'+vokal, '8'+vokal) for ordet in rest]
    rest = [ordet.translate(_sfinxbis_translation) for ordet in rest]

    # Step 10: remove adjacent duplicates
    rest = [_delete_consecutive_repeats(ordet) for ordet in rest]

    # Step 11: remove every "9"
    rest = [ordet.replace('9', '') for ordet in rest]

    # Step 12: join the two parts back together
    ordlista = [ordet[0:1] + resten for ordet, resten in zip(ordlista, rest)]

    # truncate, if maxlength is set
    if maxlength and maxlength < _INFINITY:
        ordlista = [ordet[:maxlength] for ordet in ordlista]

    return tuple(ordlista)
2662
2663
2664
def phonet(word, mode=1, lang='de'):
2665
    """Return the phonet code for a word.
2666
2667
    phonet ("Hannoveraner Phonetik") was developed by Jörg Michael and
2668
    documented in :cite:`Michael:1999`.
2669
2670
    This is a port of Jesper Zedlitz's code, which is licensed LGPL
2671
    :cite:`Zedlitz:2015`.
2672
2673
    That is, in turn, based on Michael's C code, which is also licensed LGPL
2674
    :cite:`Michael:2007`.
2675
2676
    :param str word: the word to transform
2677
    :param int mode: the ponet variant to employ (1 or 2)
2678
    :param str lang: 'de' (default) for German
2679
            'none' for no language
2680
    :returns: the phonet value
2681
    :rtype: str
2682
2683
    >>> phonet('Christopher')
2684
    'KRISTOFA'
2685
    >>> phonet('Niall')
2686
    'NIAL'
2687
    >>> phonet('Smith')
2688
    'SMIT'
2689
    >>> phonet('Schmidt')
2690
    'SHMIT'
2691
2692
    >>> phonet('Christopher', mode=2)
2693
    'KRIZTUFA'
2694
    >>> phonet('Niall', mode=2)
2695
    'NIAL'
2696
    >>> phonet('Smith', mode=2)
2697
    'ZNIT'
2698
    >>> phonet('Schmidt', mode=2)
2699
    'ZNIT'
2700
2701
    >>> phonet('Christopher', lang='none')
2702
    'CHRISTOPHER'
2703
    >>> phonet('Niall', lang='none')
2704
    'NIAL'
2705
    >>> phonet('Smith', lang='none')
2706
    'SMITH'
2707
    >>> phonet('Schmidt', lang='none')
2708
    'SCHMIDT'
2709
    """
2710
    _phonet_rules_no_lang = (  # separator chars
2711
        '´', ' ', ' ',
2712
        '"', ' ', ' ',
2713
        '`$', '', '',
2714
        '\'', ' ', ' ',
2715
        ',', ',', ',',
2716
        ';', ',', ',',
2717
        '-', ' ', ' ',
2718
        ' ', ' ', ' ',
2719
        '.', '.', '.',
2720
        ':', '.', '.',
2721
        # German umlauts
2722
        'Ä', 'AE', 'AE',
2723
        'Ö', 'OE', 'OE',
2724
        'Ü', 'UE', 'UE',
2725
        'ß', 'S', 'S',
2726
        # international umlauts
2727
        'À', 'A', 'A',
2728
        'Á', 'A', 'A',
2729
        'Â', 'A', 'A',
2730
        'Ã', 'A', 'A',
2731
        'Å', 'A', 'A',
2732
        'Æ', 'AE', 'AE',
2733
        'Ç', 'C', 'C',
2734
        'Ð', 'DJ', 'DJ',
2735
        'È', 'E', 'E',
2736
        'É', 'E', 'E',
2737
        'Ê', 'E', 'E',
2738
        'Ë', 'E', 'E',
2739
        'Ì', 'I', 'I',
2740
        'Í', 'I', 'I',
2741
        'Î', 'I', 'I',
2742
        'Ï', 'I', 'I',
2743
        'Ñ', 'NH', 'NH',
2744
        'Ò', 'O', 'O',
2745
        'Ó', 'O', 'O',
2746
        'Ô', 'O', 'O',
2747
        'Õ', 'O', 'O',
2748
        'Œ', 'OE', 'OE',
2749
        'Ø', 'OE', 'OE',
2750
        'Š', 'SH', 'SH',
2751
        'Þ', 'TH', 'TH',
2752
        'Ù', 'U', 'U',
2753
        'Ú', 'U', 'U',
2754
        'Û', 'U', 'U',
2755
        'Ý', 'Y', 'Y',
2756
        'Ÿ', 'Y', 'Y',
2757
        # 'normal' letters (A-Z)
2758
        'MC^', 'MAC', 'MAC',
2759
        'MC^', 'MAC', 'MAC',
2760
        'M´^', 'MAC', 'MAC',
2761
        'M\'^', 'MAC', 'MAC',
2762
        'O´^', 'O', 'O',
2763
        'O\'^', 'O', 'O',
2764
        'VAN DEN ^', 'VANDEN', 'VANDEN',
2765
        None, None, None)
2766
2767
    _phonet_rules_german = (  # separator chars
2768
        '´', ' ', ' ',
2769
        '"', ' ', ' ',
2770
        '`$', '', '',
2771
        '\'', ' ', ' ',
2772
        ',', ' ', ' ',
2773
        ';', ' ', ' ',
2774
        '-', ' ', ' ',
2775
        ' ', ' ', ' ',
2776
        '.', '.', '.',
2777
        ':', '.', '.',
2778
        # German umlauts
2779
        'ÄE', 'E', 'E',
2780
        'ÄU<', 'EU', 'EU',
2781
        'ÄV(AEOU)-<', 'EW', None,
2782
        'Ä$', 'Ä', None,
2783
        'Ä<', None, 'E',
2784
        'Ä', 'E', None,
2785
        'ÖE', 'Ö', 'Ö',
2786
        'ÖU', 'Ö', 'Ö',
2787
        'ÖVER--<', 'ÖW', None,
2788
        'ÖV(AOU)-', 'ÖW', None,
2789
        'ÜBEL(GNRW)-^^', 'ÜBL ', 'IBL ',
2790
        'ÜBER^^', 'ÜBA', 'IBA',
2791
        'ÜE', 'Ü', 'I',
2792
        'ÜVER--<', 'ÜW', None,
2793
        'ÜV(AOU)-', 'ÜW', None,
2794
        'Ü', None, 'I',
2795
        'ßCH<', None, 'Z',
2796
        'ß<', 'S', 'Z',
2797
        # international umlauts
2798
        'À<', 'A', 'A',
2799
        'Á<', 'A', 'A',
2800
        'Â<', 'A', 'A',
2801
        'Ã<', 'A', 'A',
2802
        'Å<', 'A', 'A',
2803
        'ÆER-', 'E', 'E',
2804
        'ÆU<', 'EU', 'EU',
2805
        'ÆV(AEOU)-<', 'EW', None,
2806
        'Æ$', 'Ä', None,
2807
        'Æ<', None, 'E',
2808
        'Æ', 'E', None,
2809
        'Ç', 'Z', 'Z',
2810
        'ÐÐ-', '', '',
2811
        'Ð', 'DI', 'TI',
2812
        'È<', 'E', 'E',
2813
        'É<', 'E', 'E',
2814
        'Ê<', 'E', 'E',
2815
        'Ë', 'E', 'E',
2816
        'Ì<', 'I', 'I',
2817
        'Í<', 'I', 'I',
2818
        'Î<', 'I', 'I',
2819
        'Ï', 'I', 'I',
2820
        'ÑÑ-', '', '',
2821
        'Ñ', 'NI', 'NI',
2822
        'Ò<', 'O', 'U',
2823
        'Ó<', 'O', 'U',
2824
        'Ô<', 'O', 'U',
2825
        'Õ<', 'O', 'U',
2826
        'Œ<', 'Ö', 'Ö',
2827
        'Ø(IJY)-<', 'E', 'E',
2828
        'Ø<', 'Ö', 'Ö',
2829
        'Š', 'SH', 'Z',
2830
        'Þ', 'T', 'T',
2831
        'Ù<', 'U', 'U',
2832
        'Ú<', 'U', 'U',
2833
        'Û<', 'U', 'U',
2834
        'Ý<', 'I', 'I',
2835
        'Ÿ<', 'I', 'I',
2836
        # 'normal' letters (A-Z)
2837
        'ABELLE$', 'ABL', 'ABL',
2838
        'ABELL$', 'ABL', 'ABL',
2839
        'ABIENNE$', 'ABIN', 'ABIN',
2840
        'ACHME---^', 'ACH', 'AK',
2841
        'ACEY$', 'AZI', 'AZI',
2842
        'ADV', 'ATW', None,
2843
        'AEGL-', 'EK', None,
2844
        'AEU<', 'EU', 'EU',
2845
        'AE2', 'E', 'E',
2846
        'AFTRAUBEN------', 'AFT ', 'AFT ',
2847
        'AGL-1', 'AK', None,
2848
        'AGNI-^', 'AKN', 'AKN',
2849
        'AGNIE-', 'ANI', 'ANI',
2850
        'AGN(AEOU)-$', 'ANI', 'ANI',
2851
        'AH(AIOÖUÜY)-', 'AH', None,
2852
        'AIA2', 'AIA', 'AIA',
2853
        'AIE$', 'E', 'E',
2854
        'AILL(EOU)-', 'ALI', 'ALI',
2855
        'AINE$', 'EN', 'EN',
2856
        'AIRE$', 'ER', 'ER',
2857
        'AIR-', 'E', 'E',
2858
        'AISE$', 'ES', 'EZ',
2859
        'AISSANCE$', 'ESANS', 'EZANZ',
2860
        'AISSE$', 'ES', 'EZ',
2861
        'AIX$', 'EX', 'EX',
2862
        'AJ(AÄEÈÉÊIOÖUÜ)--', 'A', 'A',
2863
        'AKTIE', 'AXIE', 'AXIE',
2864
        'AKTUEL', 'AKTUEL', None,
2865
        'ALOI^', 'ALOI', 'ALUI',  # Don't merge these rules
2866
        'ALOY^', 'ALOI', 'ALUI',  # needed by 'check_rules'
2867
        'AMATEU(RS)-', 'AMATÖ', 'ANATÖ',
2868
        'ANCH(OEI)-', 'ANSH', 'ANZ',
2869
        'ANDERGEGANG----', 'ANDA GE', 'ANTA KE',
2870
        'ANDERGEHE----', 'ANDA ', 'ANTA ',
2871
        'ANDERGESETZ----', 'ANDA GE', 'ANTA KE',
2872
        'ANDERGING----', 'ANDA ', 'ANTA ',
2873
        'ANDERSETZ(ET)-----', 'ANDA ', 'ANTA ',
2874
        'ANDERZUGEHE----', 'ANDA ZU ', 'ANTA ZU ',
2875
        'ANDERZUSETZE-----', 'ANDA ZU ', 'ANTA ZU ',
2876
        'ANER(BKO)---^^', 'AN', None,
2877
        'ANHAND---^$', 'AN H', 'AN ',
2878
        'ANH(AÄEIOÖUÜY)--^^', 'AN', None,
2879
        'ANIELLE$', 'ANIEL', 'ANIL',
2880
        'ANIEL', 'ANIEL', None,
2881
        'ANSTELLE----^$', 'AN ST', 'AN ZT',
2882
        'ANTI^^', 'ANTI', 'ANTI',
2883
        'ANVER^^', 'ANFA', 'ANFA',
2884
        'ATIA$', 'ATIA', 'ATIA',
2885
        'ATIA(NS)--', 'ATI', 'ATI',
2886
        'ATI(AÄOÖUÜ)-', 'AZI', 'AZI',
2887
        'AUAU--', '', '',
2888
        'AUERE$', 'AUERE', None,
2889
        'AUERE(NS)-$', 'AUERE', None,
2890
        'AUERE(AIOUY)--', 'AUER', None,
2891
        'AUER(AÄIOÖUÜY)-', 'AUER', None,
2892
        'AUER<', 'AUA', 'AUA',
2893
        'AUF^^', 'AUF', 'AUF',
2894
        'AULT$', 'O', 'U',
2895
        'AUR(BCDFGKLMNQSTVWZ)-', 'AUA', 'AUA',
2896
        'AUR$', 'AUA', 'AUA',
2897
        'AUSSE$', 'OS', 'UZ',
2898
        'AUS(ST)-^', 'AUS', 'AUS',
2899
        'AUS^^', 'AUS', 'AUS',
2900
        'AUTOFAHR----', 'AUTO ', 'AUTU ',
2901
        'AUTO^^', 'AUTO', 'AUTU',
2902
        'AUX(IY)-', 'AUX', 'AUX',
2903
        'AUX', 'O', 'U',
2904
        'AU', 'AU', 'AU',
2905
        'AVER--<', 'AW', None,
2906
        'AVIER$', 'AWIE', 'AFIE',
2907
        'AV(EÈÉÊI)-^', 'AW', None,
2908
        'AV(AOU)-', 'AW', None,
2909
        'AYRE$', 'EIRE', 'EIRE',
2910
        'AYRE(NS)-$', 'EIRE', 'EIRE',
2911
        'AYRE(AIOUY)--', 'EIR', 'EIR',
2912
        'AYR(AÄIOÖUÜY)-', 'EIR', 'EIR',
2913
        'AYR<', 'EIA', 'EIA',
2914
        'AYER--<', 'EI', 'EI',
2915
        'AY(AÄEIOÖUÜY)--', 'A', 'A',
2916
        'AË', 'E', 'E',
2917
        'A(IJY)<', 'EI', 'EI',
2918
        'BABY^$', 'BEBI', 'BEBI',
2919
        'BAB(IY)^', 'BEBI', 'BEBI',
2920
        'BEAU^$', 'BO', None,
2921
        'BEA(BCMNRU)-^', 'BEA', 'BEA',
2922
        'BEAT(AEIMORU)-^', 'BEAT', 'BEAT',
2923
        'BEE$', 'BI', 'BI',
2924
        'BEIGE^$', 'BESH', 'BEZ',
2925
        'BENOIT--', 'BENO', 'BENU',
2926
        'BER(DT)-', 'BER', None,
2927
        'BERN(DT)-', 'BERN', None,
2928
        'BE(LMNRST)-^', 'BE', 'BE',
2929
        'BETTE$', 'BET', 'BET',
2930
        'BEVOR^$', 'BEFOR', None,
2931
        'BIC$', 'BIZ', 'BIZ',
2932
        'BOWL(EI)-', 'BOL', 'BUL',
2933
        'BP(AÄEÈÉÊIÌÍÎOÖRUÜY)-', 'B', 'B',
2934
        'BRINGEND-----^', 'BRI', 'BRI',
2935
        'BRINGEND-----', ' BRI', ' BRI',
2936
        'BROW(NS)-', 'BRAU', 'BRAU',
2937
        'BUDGET7', 'BÜGE', 'BIKE',
2938
        'BUFFET7', 'BÜFE', 'BIFE',
2939
        'BYLLE$', 'BILE', 'BILE',
2940
        'BYLL$', 'BIL', 'BIL',
2941
        'BYPA--^', 'BEI', 'BEI',
2942
        'BYTE<', 'BEIT', 'BEIT',
2943
        'BY9^', 'BÜ', None,
2944
        'B(SßZ)$', 'BS', None,
2945
        'CACH(EI)-^', 'KESH', 'KEZ',
2946
        'CAE--', 'Z', 'Z',
2947
        'CA(IY)$', 'ZEI', 'ZEI',
2948
        'CE(EIJUY)--', 'Z', 'Z',
2949
        'CENT<', 'ZENT', 'ZENT',
2950
        'CERST(EI)----^', 'KE', 'KE',
2951
        'CER$', 'ZA', 'ZA',
2952
        'CE3', 'ZE', 'ZE',
2953
        'CH\'S$', 'X', 'X',
2954
        'CH´S$', 'X', 'X',
2955
        'CHAO(ST)-', 'KAO', 'KAU',
2956
        'CHAMPIO-^', 'SHEMPI', 'ZENBI',
2957
        'CHAR(AI)-^', 'KAR', 'KAR',
2958
        'CHAU(CDFSVWXZ)-', 'SHO', 'ZU',
2959
        'CHÄ(CF)-', 'SHE', 'ZE',
2960
        'CHE(CF)-', 'SHE', 'ZE',
2961
        'CHEM-^', 'KE', 'KE',  # or: 'CHE', 'KE'
2962
        'CHEQUE<', 'SHEK', 'ZEK',
2963
        'CHI(CFGPVW)-', 'SHI', 'ZI',
2964
        'CH(AEUY)-<^', 'SH', 'Z',
2965
        'CHK-', '', '',
2966
        'CHO(CKPS)-^', 'SHO', 'ZU',
2967
        'CHRIS-', 'KRI', None,
2968
        'CHRO-', 'KR', None,
2969
        'CH(LOR)-<^', 'K', 'K',
2970
        'CHST-', 'X', 'X',
2971
        'CH(SßXZ)3', 'X', 'X',
2972
        'CHTNI-3', 'CHN', 'KN',
2973
        'CH^', 'K', 'K',  # or: 'CH', 'K'
2974
        'CH', 'CH', 'K',
2975
        'CIC$', 'ZIZ', 'ZIZ',
2976
        'CIENCEFICT----', 'EIENS ', 'EIENZ ',
2977
        'CIENCE$', 'EIENS', 'EIENZ',
2978
        'CIER$', 'ZIE', 'ZIE',
2979
        'CYB-^', 'ZEI', 'ZEI',
2980
        'CY9^', 'ZÜ', 'ZI',
2981
        'C(IJY)-<3', 'Z', 'Z',
2982
        'CLOWN-', 'KLAU', 'KLAU',
2983
        'CCH', 'Z', 'Z',
2984
        'CCE-', 'X', 'X',
2985
        'C(CK)-', '', '',
2986
        'CLAUDET---', 'KLO', 'KLU',
2987
        'CLAUDINE^$', 'KLODIN', 'KLUTIN',
2988
        'COACH', 'KOSH', 'KUZ',
2989
        'COLE$', 'KOL', 'KUL',
2990
        'COUCH', 'KAUSH', 'KAUZ',
2991
        'COW', 'KAU', 'KAU',
2992
        'CQUES$', 'K', 'K',
2993
        'CQUE', 'K', 'K',
2994
        'CRASH--9', 'KRE', 'KRE',
2995
        'CREAT-^', 'KREA', 'KREA',
2996
        'CST', 'XT', 'XT',
2997
        'CS<^', 'Z', 'Z',
2998
        'C(SßX)', 'X', 'X',
2999
        'CT\'S$', 'X', 'X',
3000
        'CT(SßXZ)', 'X', 'X',
3001
        'CZ<', 'Z', 'Z',
3002
        'C(ÈÉÊÌÍÎÝ)3', 'Z', 'Z',
3003
        'C.^', 'C.', 'C.',
3004
        'CÄ-', 'Z', 'Z',
3005
        'CÜ$', 'ZÜ', 'ZI',
3006
        'C\'S$', 'X', 'X',
3007
        'C<', 'K', 'K',
3008
        'DAHER^$', 'DAHER', None,
3009
        'DARAUFFOLGE-----', 'DARAUF ', 'TARAUF ',
3010
        'DAVO(NR)-^$', 'DAFO', 'TAFU',
3011
        'DD(SZ)--<', '', '',
3012
        'DD9', 'D', None,
3013
        'DEPOT7', 'DEPO', 'TEBU',
3014
        'DESIGN', 'DISEIN', 'TIZEIN',
3015
        'DE(LMNRST)-3^', 'DE', 'TE',
3016
        'DETTE$', 'DET', 'TET',
3017
        'DH$', 'T', None,
3018
        'DIC$', 'DIZ', 'TIZ',
3019
        'DIDR-^', 'DIT', None,
3020
        'DIEDR-^', 'DIT', None,
3021
        'DJ(AEIOU)-^', 'I', 'I',
3022
        'DMITR-^', 'DIMIT', 'TINIT',
3023
        'DRY9^', 'DRÜ', None,
3024
        'DT-', '', '',
3025
        'DUIS-^', 'DÜ', 'TI',
3026
        'DURCH^^', 'DURCH', 'TURK',
3027
        'DVA$', 'TWA', None,
3028
        'DY9^', 'DÜ', None,
3029
        'DYS$', 'DIS', None,
3030
        'DS(CH)--<', 'T', 'T',
3031
        'DST', 'ZT', 'ZT',
3032
        'DZS(CH)--', 'T', 'T',
3033
        'D(SßZ)', 'Z', 'Z',
3034
        'D(AÄEIOÖRUÜY)-', 'D', None,
3035
        'D(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'D', None,
3036
        'D\'H^', 'D', 'T',
3037
        'D´H^', 'D', 'T',
3038
        'D`H^', 'D', 'T',
3039
        'D\'S3$', 'Z', 'Z',
3040
        'D´S3$', 'Z', 'Z',
3041
        'D^', 'D', None,
3042
        'D', 'T', 'T',
3043
        'EAULT$', 'O', 'U',
3044
        'EAUX$', 'O', 'U',
3045
        'EAU', 'O', 'U',
3046
        'EAV', 'IW', 'IF',
3047
        'EAS3$', 'EAS', None,
3048
        'EA(AÄEIOÖÜY)-3', 'EA', 'EA',
3049
        'EA3$', 'EA', 'EA',
3050
        'EA3', 'I', 'I',
3051
        'EBENSO^$', 'EBNSO', 'EBNZU',
3052
        'EBENSO^^', 'EBNSO ', 'EBNZU ',
3053
        'EBEN^^', 'EBN', 'EBN',
3054
        'EE9', 'E', 'E',
3055
        'EGL-1', 'EK', None,
3056
        'EHE(IUY)--1', 'EH', None,
3057
        'EHUNG---1', 'E', None,
3058
        'EH(AÄIOÖUÜY)-1', 'EH', None,
3059
        'EIEI--', '', '',
3060
        'EIERE^$', 'EIERE', None,
3061
        'EIERE$', 'EIERE', None,
3062
        'EIERE(NS)-$', 'EIERE', None,
3063
        'EIERE(AIOUY)--', 'EIER', None,
3064
        'EIER(AÄIOÖUÜY)-', 'EIER', None,
3065
        'EIER<', 'EIA', None,
3066
        'EIGL-1', 'EIK', None,
3067
        'EIGH$', 'EI', 'EI',
3068
        'EIH--', 'E', 'E',
3069
        'EILLE$', 'EI', 'EI',
3070
        'EIR(BCDFGKLMNQSTVWZ)-', 'EIA', 'EIA',
3071
        'EIR$', 'EIA', 'EIA',
3072
        'EITRAUBEN------', 'EIT ', 'EIT ',
3073
        'EI', 'EI', 'EI',
3074
        'EJ$', 'EI', 'EI',
3075
        'ELIZ^', 'ELIS', None,
3076
        'ELZ^', 'ELS', None,
3077
        'EL-^', 'E', 'E',
3078
        'ELANG----1', 'E', 'E',
3079
        'EL(DKL)--1', 'E', 'E',
3080
        'EL(MNT)--1$', 'E', 'E',
3081
        'ELYNE$', 'ELINE', 'ELINE',
3082
        'ELYN$', 'ELIN', 'ELIN',
3083
        'EL(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'EL', 'EL',
3084
        'EL-1', 'L', 'L',
3085
        'EM-^', None, 'E',
3086
        'EM(DFKMPQT)--1', None, 'E',
3087
        'EM(AÄEÈÉÊIÌÍÎOÖUÜY)--1', None, 'E',
3088
        'EM-1', None, 'N',
3089
        'ENGAG-^', 'ANGA', 'ANKA',
3090
        'EN-^', 'E', 'E',
3091
        'ENTUEL', 'ENTUEL', None,
3092
        'EN(CDGKQSTZ)--1', 'E', 'E',
3093
        'EN(AÄEÈÉÊIÌÍÎNOÖUÜY)-1', 'EN', 'EN',
3094
        'EN-1', '', '',
3095
        'ERH(AÄEIOÖUÜ)-^', 'ERH', 'ER',
3096
        'ER-^', 'E', 'E',
3097
        'ERREGEND-----', ' ER', ' ER',
3098
        'ERT1$', 'AT', None,
3099
        'ER(DGLKMNRQTZß)-1', 'ER', None,
3100
        'ER(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'ER', 'A',
3101
        'ER1$', 'A', 'A',
3102
        'ER<1', 'A', 'A',
3103
        'ETAT7', 'ETA', 'ETA',
3104
        'ETI(AÄOÖÜU)-', 'EZI', 'EZI',
3105
        'EUERE$', 'EUERE', None,
3106
        'EUERE(NS)-$', 'EUERE', None,
3107
        'EUERE(AIOUY)--', 'EUER', None,
3108
        'EUER(AÄIOÖUÜY)-', 'EUER', None,
3109
        'EUER<', 'EUA', None,
3110
        'EUEU--', '', '',
3111
        'EUILLE$', 'Ö', 'Ö',
3112
        'EUR$', 'ÖR', 'ÖR',
3113
        'EUX', 'Ö', 'Ö',
3114
        'EUSZ$', 'EUS', None,
3115
        'EUTZ$', 'EUS', None,
3116
        'EUYS$', 'EUS', 'EUZ',
3117
        'EUZ$', 'EUS', None,
3118
        'EU', 'EU', 'EU',
3119
        'EVER--<1', 'EW', None,
3120
        'EV(ÄOÖUÜ)-1', 'EW', None,
3121
        'EYER<', 'EIA', 'EIA',
3122
        'EY<', 'EI', 'EI',
3123
        'FACETTE', 'FASET', 'FAZET',
3124
        'FANS--^$', 'FE', 'FE',
3125
        'FAN-^$', 'FE', 'FE',
3126
        'FAULT-', 'FOL', 'FUL',
3127
        'FEE(DL)-', 'FI', 'FI',
3128
        'FEHLER', 'FELA', 'FELA',
3129
        'FE(LMNRST)-3^', 'FE', 'FE',
3130
        'FOERDERN---^', 'FÖRD', 'FÖRT',
3131
        'FOERDERN---', ' FÖRD', ' FÖRT',
3132
        'FOND7', 'FON', 'FUN',
3133
        'FRAIN$', 'FRA', 'FRA',
3134
        'FRISEU(RS)-', 'FRISÖ', 'FRIZÖ',
3135
        'FY9^', 'FÜ', None,
3136
        'FÖRDERN---^', 'FÖRD', 'FÖRT',
3137
        'FÖRDERN---', ' FÖRD', ' FÖRT',
3138
        'GAGS^$', 'GEX', 'KEX',
3139
        'GAG^$', 'GEK', 'KEK',
3140
        'GD', 'KT', 'KT',
3141
        'GEGEN^^', 'GEGN', 'KEKN',
3142
        'GEGENGEKOM-----', 'GEGN ', 'KEKN ',
3143
        'GEGENGESET-----', 'GEGN ', 'KEKN ',
3144
        'GEGENKOMME-----', 'GEGN ', 'KEKN ',
3145
        'GEGENZUKOM---', 'GEGN ZU ', 'KEKN ZU ',
3146
        'GENDETWAS-----$', 'GENT ', 'KENT ',
3147
        'GENRE', 'IORE', 'IURE',
3148
        'GE(LMNRST)-3^', 'GE', 'KE',
3149
        'GER(DKT)-', 'GER', None,
3150
        'GETTE$', 'GET', 'KET',
3151
        'GGF.', 'GF.', None,
3152
        'GG-', '', '',
3153
        'GH', 'G', None,
3154
        'GI(AOU)-^', 'I', 'I',
3155
        'GION-3', 'KIO', 'KIU',
3156
        'G(CK)-', '', '',
3157
        'GJ(AEIOU)-^', 'I', 'I',
3158
        'GMBH^$', 'GMBH', 'GMBH',
3159
        'GNAC$', 'NIAK', 'NIAK',
3160
        'GNON$', 'NION', 'NIUN',
3161
        'GN$', 'N', 'N',
3162
        'GONCAL-^', 'GONZA', 'KUNZA',
3163
        'GRY9^', 'GRÜ', None,
3164
        'G(SßXZ)-<', 'K', 'K',
3165
        'GUCK-', 'KU', 'KU',
3166
        'GUISEP-^', 'IUSE', 'IUZE',
3167
        'GUI-^', 'G', 'K',
3168
        'GUTAUSSEH------^', 'GUT ', 'KUT ',
3169
        'GUTGEHEND------^', 'GUT ', 'KUT ',
3170
        'GY9^', 'GÜ', None,
3171
        'G(AÄEILOÖRUÜY)-', 'G', None,
3172
        'G(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'G', None,
3173
        'G\'S$', 'X', 'X',
3174
        'G´S$', 'X', 'X',
3175
        'G^', 'G', None,
3176
        'G', 'K', 'K',
3177
        'HA(HIUY)--1', 'H', None,
3178
        'HANDVOL---^', 'HANT ', 'ANT ',
3179
        'HANNOVE-^', 'HANOF', None,
3180
        'HAVEN7$', 'HAFN', None,
3181
        'HEAD-', 'HE', 'E',
3182
        'HELIEGEN------', 'E ', 'E ',
3183
        'HESTEHEN------', 'E ', 'E ',
3184
        'HE(LMNRST)-3^', 'HE', 'E',
3185
        'HE(LMN)-1', 'E', 'E',
3186
        'HEUR1$', 'ÖR', 'ÖR',
3187
        'HE(HIUY)--1', 'H', None,
3188
        'HIH(AÄEIOÖUÜY)-1', 'IH', None,
3189
        'HLH(AÄEIOÖUÜY)-1', 'LH', None,
3190
        'HMH(AÄEIOÖUÜY)-1', 'MH', None,
3191
        'HNH(AÄEIOÖUÜY)-1', 'NH', None,
3192
        'HOBBY9^', 'HOBI', None,
3193
        'HOCHBEGAB-----^', 'HOCH ', 'UK ',
3194
        'HOCHTALEN-----^', 'HOCH ', 'UK ',
3195
        'HOCHZUFRI-----^', 'HOCH ', 'UK ',
3196
        'HO(HIY)--1', 'H', None,
3197
        'HRH(AÄEIOÖUÜY)-1', 'RH', None,
3198
        'HUH(AÄEIOÖUÜY)-1', 'UH', None,
3199
        'HUIS^^', 'HÜS', 'IZ',
3200
        'HUIS$', 'ÜS', 'IZ',
3201
        'HUI--1', 'H', None,
3202
        'HYGIEN^', 'HÜKIEN', None,
3203
        'HY9^', 'HÜ', None,
3204
        'HY(BDGMNPST)-', 'Ü', None,
3205
        'H.^', None, 'H.',
3206
        'HÄU--1', 'H', None,
3207
        'H^', 'H', '',
3208
        'H', '', '',
3209
        'ICHELL---', 'ISH', 'IZ',
3210
        'ICHI$', 'ISHI', 'IZI',
3211
        'IEC$', 'IZ', 'IZ',
3212
        'IEDENSTELLE------', 'IDN ', 'ITN ',
3213
        'IEI-3', '', '',
3214
        'IELL3', 'IEL', 'IEL',
3215
        'IENNE$', 'IN', 'IN',
3216
        'IERRE$', 'IER', 'IER',
3217
        'IERZULAN---', 'IR ZU ', 'IR ZU ',
3218
        'IETTE$', 'IT', 'IT',
3219
        'IEU', 'IÖ', 'IÖ',
3220
        'IE<4', 'I', 'I',
3221
        'IGL-1', 'IK', None,
3222
        'IGHT3$', 'EIT', 'EIT',
3223
        'IGNI(EO)-', 'INI', 'INI',
3224
        'IGN(AEOU)-$', 'INI', 'INI',
3225
        'IHER(DGLKRT)--1', 'IHE', None,
3226
        'IHE(IUY)--', 'IH', None,
3227
        'IH(AIOÖUÜY)-', 'IH', None,
3228
        'IJ(AOU)-', 'I', 'I',
3229
        'IJ$', 'I', 'I',
3230
        'IJ<', 'EI', 'EI',
3231
        'IKOLE$', 'IKOL', 'IKUL',
3232
        'ILLAN(STZ)--4', 'ILIA', 'ILIA',
3233
        'ILLAR(DT)--4', 'ILIA', 'ILIA',
3234
        'IMSTAN----^', 'IM ', 'IN ',
3235
        'INDELERREGE------', 'INDL ', 'INTL ',
3236
        'INFRAGE-----^$', 'IN ', 'IN ',
3237
        'INTERN(AOU)-^', 'INTAN', 'INTAN',
3238
        'INVER-', 'INWE', 'INFE',
3239
        'ITI(AÄIOÖUÜ)-', 'IZI', 'IZI',
3240
        'IUSZ$', 'IUS', None,
3241
        'IUTZ$', 'IUS', None,
3242
        'IUZ$', 'IUS', None,
3243
        'IVER--<', 'IW', None,
3244
        'IVIER$', 'IWIE', 'IFIE',
3245
        'IV(ÄOÖUÜ)-', 'IW', None,
3246
        'IV<3', 'IW', None,
3247
        'IY2', 'I', None,
3248
        'I(ÈÉÊ)<4', 'I', 'I',
3249
        'JAVIE---<^', 'ZA', 'ZA',
3250
        'JEANS^$', 'JINS', 'INZ',
3251
        'JEANNE^$', 'IAN', 'IAN',
3252
        'JEAN-^', 'IA', 'IA',
3253
        'JER-^', 'IE', 'IE',
3254
        'JE(LMNST)-', 'IE', 'IE',
3255
        'JI^', 'JI', None,
3256
        'JOR(GK)^$', 'IÖRK', 'IÖRK',
3257
        'J', 'I', 'I',
3258
        'KC(ÄEIJ)-', 'X', 'X',
3259
        'KD', 'KT', None,
3260
        'KE(LMNRST)-3^', 'KE', 'KE',
3261
        'KG(AÄEILOÖRUÜY)-', 'K', None,
3262
        'KH<^', 'K', 'K',
3263
        'KIC$', 'KIZ', 'KIZ',
3264
        'KLE(LMNRST)-3^', 'KLE', 'KLE',
3265
        'KOTELE-^', 'KOTL', 'KUTL',
3266
        'KREAT-^', 'KREA', 'KREA',
3267
        'KRÜS(TZ)--^', 'KRI', None,
3268
        'KRYS(TZ)--^', 'KRI', None,
3269
        'KRY9^', 'KRÜ', None,
3270
        'KSCH---', 'K', 'K',
3271
        'KSH--', 'K', 'K',
3272
        'K(SßXZ)7', 'X', 'X',  # implies 'KST' -> 'XT'
3273
        'KT\'S$', 'X', 'X',
3274
        'KTI(AIOU)-3', 'XI', 'XI',
3275
        'KT(SßXZ)', 'X', 'X',
3276
        'KY9^', 'KÜ', None,
3277
        'K\'S$', 'X', 'X',
3278
        'K´S$', 'X', 'X',
3279
        'LANGES$', ' LANGES', ' LANKEZ',
3280
        'LANGE$', ' LANGE', ' LANKE',
3281
        'LANG$', ' LANK', ' LANK',
3282
        'LARVE-', 'LARF', 'LARF',
3283
        'LD(SßZ)$', 'LS', 'LZ',
3284
        'LD\'S$', 'LS', 'LZ',
3285
        'LD´S$', 'LS', 'LZ',
3286
        'LEAND-^', 'LEAN', 'LEAN',
3287
        'LEERSTEHE-----^', 'LER ', 'LER ',
3288
        'LEICHBLEIB-----', 'LEICH ', 'LEIK ',
3289
        'LEICHLAUTE-----', 'LEICH ', 'LEIK ',
3290
        'LEIDERREGE------', 'LEIT ', 'LEIT ',
3291
        'LEIDGEPR----^', 'LEIT ', 'LEIT ',
3292
        'LEINSTEHE-----', 'LEIN ', 'LEIN ',
3293
        'LEL-', 'LE', 'LE',
3294
        'LE(MNRST)-3^', 'LE', 'LE',
3295
        'LETTE$', 'LET', 'LET',
3296
        'LFGNAG-', 'LFGAN', 'LFKAN',
3297
        'LICHERWEIS----', 'LICHA ', 'LIKA ',
3298
        'LIC$', 'LIZ', 'LIZ',
3299
        'LIVE^$', 'LEIF', 'LEIF',
3300
        'LT(SßZ)$', 'LS', 'LZ',
3301
        'LT\'S$', 'LS', 'LZ',
3302
        'LT´S$', 'LS', 'LZ',
3303
        'LUI(GS)--', 'LU', 'LU',
3304
        'LV(AIO)-', 'LW', None,
3305
        'LY9^', 'LÜ', None,
3306
        'LSTS$', 'LS', 'LZ',
3307
        'LZ(BDFGKLMNPQRSTVWX)-', 'LS', None,
3308
        'L(SßZ)$', 'LS', None,
3309
        'MAIR-<', 'MEI', 'NEI',
3310
        'MANAG-', 'MENE', 'NENE',
3311
        'MANUEL', 'MANUEL', None,
3312
        'MASSEU(RS)-', 'MASÖ', 'NAZÖ',
3313
        'MATCH', 'MESH', 'NEZ',
3314
        'MAURICE', 'MORIS', 'NURIZ',
3315
        'MBH^$', 'MBH', 'MBH',
3316
        'MB(ßZ)$', 'MS', None,
3317
        'MB(SßTZ)-', 'M', 'N',
3318
        'MCG9^', 'MAK', 'NAK',
3319
        'MC9^', 'MAK', 'NAK',
3320
        'MEMOIR-^', 'MEMOA', 'NENUA',
3321
        'MERHAVEN$', 'MAHAFN', None,
3322
        'ME(LMNRST)-3^', 'ME', 'NE',
3323
        'MEN(STZ)--3', 'ME', None,
3324
        'MEN$', 'MEN', None,
3325
        'MIGUEL-', 'MIGE', 'NIKE',
3326
        'MIKE^$', 'MEIK', 'NEIK',
3327
        'MITHILFE----^$', 'MIT H', 'NIT ',
3328
        'MN$', 'M', None,
3329
        'MN', 'N', 'N',
3330
        'MPJUTE-', 'MPUT', 'NBUT',
3331
        'MP(ßZ)$', 'MS', None,
3332
        'MP(SßTZ)-', 'M', 'N',
3333
        'MP(BDJLMNPQVW)-', 'MB', 'NB',
3334
        'MY9^', 'MÜ', None,
3335
        'M(ßZ)$', 'MS', None,
3336
        'M´G7^', 'MAK', 'NAK',
3337
        'M\'G7^', 'MAK', 'NAK',
3338
        'M´^', 'MAK', 'NAK',
3339
        'M\'^', 'MAK', 'NAK',
3340
        'M', None, 'N',
3341
        'NACH^^', 'NACH', 'NAK',
3342
        'NADINE', 'NADIN', 'NATIN',
3343
        'NAIV--', 'NA', 'NA',
3344
        'NAISE$', 'NESE', 'NEZE',
3345
        'NAUGENOMM------', 'NAU ', 'NAU ',
3346
        'NAUSOGUT$', 'NAUSO GUT', 'NAUZU KUT',
3347
        'NCH$', 'NSH', 'NZ',
3348
        'NCOISE$', 'SOA', 'ZUA',
3349
        'NCOIS$', 'SOA', 'ZUA',
3350
        'NDAR$', 'NDA', 'NTA',
3351
        'NDERINGEN------', 'NDE ', 'NTE ',
3352
        'NDRO(CDKTZ)-', 'NTRO', None,
3353
        'ND(BFGJLMNPQVW)-', 'NT', None,
3354
        'ND(SßZ)$', 'NS', 'NZ',
3355
        'ND\'S$', 'NS', 'NZ',
3356
        'ND´S$', 'NS', 'NZ',
3357
        'NEBEN^^', 'NEBN', 'NEBN',
3358
        'NENGELERN------', 'NEN ', 'NEN ',
3359
        'NENLERN(ET)---', 'NEN LE', 'NEN LE',
3360
        'NENZULERNE---', 'NEN ZU LE', 'NEN ZU LE',
3361
        'NE(LMNRST)-3^', 'NE', 'NE',
3362
        'NEN-3', 'NE', 'NE',
3363
        'NETTE$', 'NET', 'NET',
3364
        'NGU^^', 'NU', 'NU',
3365
        'NG(BDFJLMNPQRTVW)-', 'NK', 'NK',
3366
        'NH(AUO)-$', 'NI', 'NI',
3367
        'NICHTSAHNEN-----', 'NIX ', 'NIX ',
3368
        'NICHTSSAGE----', 'NIX ', 'NIX ',
3369
        'NICHTS^^', 'NIX', 'NIX',
3370
        'NICHT^^', 'NICHT', 'NIKT',
3371
        'NINE$', 'NIN', 'NIN',
3372
        'NON^^', 'NON', 'NUN',
3373
        'NOTLEIDE-----^', 'NOT ', 'NUT ',
3374
        'NOT^^', 'NOT', 'NUT',
3375
        'NTI(AIOU)-3', 'NZI', 'NZI',
3376
        'NTIEL--3', 'NZI', 'NZI',
3377
        'NT(SßZ)$', 'NS', 'NZ',
3378
        'NT\'S$', 'NS', 'NZ',
3379
        'NT´S$', 'NS', 'NZ',
3380
        'NYLON', 'NEILON', 'NEILUN',
3381
        'NY9^', 'NÜ', None,
3382
        'NSTZUNEH---', 'NST ZU ', 'NZT ZU ',
3383
        'NSZ-', 'NS', None,
3384
        'NSTS$', 'NS', 'NZ',
3385
        'NZ(BDFGKLMNPQRSTVWX)-', 'NS', None,
3386
        'N(SßZ)$', 'NS', None,
3387
        'OBERE-', 'OBER', None,
3388
        'OBER^^', 'OBA', 'UBA',
3389
        'OEU2', 'Ö', 'Ö',
3390
        'OE<2', 'Ö', 'Ö',
3391
        'OGL-', 'OK', None,
3392
        'OGNIE-', 'ONI', 'UNI',
3393
        'OGN(AEOU)-$', 'ONI', 'UNI',
3394
        'OH(AIOÖUÜY)-', 'OH', None,
3395
        'OIE$', 'Ö', 'Ö',
3396
        'OIRE$', 'OA', 'UA',
3397
        'OIR$', 'OA', 'UA',
3398
        'OIX', 'OA', 'UA',
3399
        'OI<3', 'EU', 'EU',
3400
        'OKAY^$', 'OKE', 'UKE',
3401
        'OLYN$', 'OLIN', 'ULIN',
3402
        'OO(DLMZ)-', 'U', None,
3403
        'OO$', 'U', None,
3404
        'OO-', '', '',
3405
        'ORGINAL-----', 'ORI', 'URI',
3406
        'OTI(AÄOÖUÜ)-', 'OZI', 'UZI',
3407
        'OUI^', 'WI', 'FI',
3408
        'OUILLE$', 'ULIE', 'ULIE',
3409
        'OU(DT)-^', 'AU', 'AU',
3410
        'OUSE$', 'AUS', 'AUZ',
3411
        'OUT-', 'AU', 'AU',
3412
        'OU', 'U', 'U',
3413
        'O(FV)$', 'AU', 'AU',  # due to 'OW$' -> 'AU'
3414
        'OVER--<', 'OW', None,
3415
        'OV(AOU)-', 'OW', None,
3416
        'OW$', 'AU', 'AU',
3417
        'OWS$', 'OS', 'UZ',
3418
        'OJ(AÄEIOÖUÜ)--', 'O', 'U',
3419
        'OYER', 'OIA', None,
3420
        'OY(AÄEIOÖUÜ)--', 'O', 'U',
3421
        'O(JY)<', 'EU', 'EU',
3422
        'OZ$', 'OS', None,
3423
        'O´^', 'O', 'U',
3424
        'O\'^', 'O', 'U',
3425
        'O', None, 'U',
3426
        'PATIEN--^', 'PAZI', 'PAZI',
3427
        'PENSIO-^', 'PANSI', 'PANZI',
3428
        'PE(LMNRST)-3^', 'PE', 'PE',
3429
        'PFER-^', 'FE', 'FE',
3430
        'P(FH)<', 'F', 'F',
3431
        'PIC^$', 'PIK', 'PIK',
3432
        'PIC$', 'PIZ', 'PIZ',
3433
        'PIPELINE', 'PEIBLEIN', 'PEIBLEIN',
3434
        'POLYP-', 'POLÜ', None,
3435
        'POLY^^', 'POLI', 'PULI',
3436
        'PORTRAIT7', 'PORTRE', 'PURTRE',
3437
        'POWER7', 'PAUA', 'PAUA',
3438
        'PP(FH)--<', 'B', 'B',
3439
        'PP-', '', '',
3440
        'PRODUZ-^', 'PRODU', 'BRUTU',
3441
        'PRODUZI--', ' PRODU', ' BRUTU',
3442
        'PRIX^$', 'PRI', 'PRI',
3443
        'PS-^^', 'P', None,
3444
        'P(SßZ)^', None, 'Z',
3445
        'P(SßZ)$', 'BS', None,
3446
        'PT-^', '', '',
3447
        'PTI(AÄOÖUÜ)-3', 'BZI', 'BZI',
3448
        'PY9^', 'PÜ', None,
3449
        'P(AÄEIOÖRUÜY)-', 'P', 'P',
3450
        'P(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'P', None,
3451
        'P.^', None, 'P.',
3452
        'P^', 'P', None,
3453
        'P', 'B', 'B',
3454
        'QI-', 'Z', 'Z',
3455
        'QUARANT--', 'KARA', 'KARA',
3456
        'QUE(LMNRST)-3', 'KWE', 'KFE',
3457
        'QUE$', 'K', 'K',
3458
        'QUI(NS)$', 'KI', 'KI',
3459
        'QUIZ7', 'KWIS', None,
3460
        'Q(UV)7', 'KW', 'KF',
3461
        'Q<', 'K', 'K',
3462
        'RADFAHR----', 'RAT ', 'RAT ',
3463
        'RAEFTEZEHRE-----', 'REFTE ', 'REFTE ',
3464
        'RCH', 'RCH', 'RK',
3465
        'REA(DU)---3^', 'R', None,
3466
        'REBSERZEUG------', 'REBS ', 'REBZ ',
3467
        'RECHERCH^', 'RESHASH', 'REZAZ',
3468
        'RECYCL--', 'RIZEI', 'RIZEI',
3469
        'RE(ALST)-3^', 'RE', None,
3470
        'REE$', 'RI', 'RI',
3471
        'RER$', 'RA', 'RA',
3472
        'RE(MNR)-4', 'RE', 'RE',
3473
        'RETTE$', 'RET', 'RET',
3474
        'REUZ$', 'REUZ', None,
3475
        'REW$', 'RU', 'RU',
3476
        'RH<^', 'R', 'R',
3477
        'RJA(MN)--', 'RI', 'RI',
3478
        'ROWD-^', 'RAU', 'RAU',
3479
        'RTEMONNAIE-', 'RTMON', 'RTNUN',
3480
        'RTI(AÄOÖUÜ)-3', 'RZI', 'RZI',
3481
        'RTIEL--3', 'RZI', 'RZI',
3482
        'RV(AEOU)-3', 'RW', None,
3483
        'RY(KN)-$', 'RI', 'RI',
3484
        'RY9^', 'RÜ', None,
3485
        'RÄFTEZEHRE-----', 'REFTE ', 'REFTE ',
3486
        'SAISO-^', 'SES', 'ZEZ',
3487
        'SAFE^$', 'SEIF', 'ZEIF',
3488
        'SAUCE-^', 'SOS', 'ZUZ',
3489
        'SCHLAGGEBEN-----<', 'SHLAK ', 'ZLAK ',
3490
        'SCHSCH---7', '', '',
3491
        'SCHTSCH', 'SH', 'Z',
3492
        'SC(HZ)<', 'SH', 'Z',
3493
        'SC', 'SK', 'ZK',
3494
        'SELBSTST--7^^', 'SELB', 'ZELB',
3495
        'SELBST7^^', 'SELBST', 'ZELBZT',
3496
        'SERVICE7^', 'SÖRWIS', 'ZÖRFIZ',
3497
        'SERVI-^', 'SERW', None,
3498
        'SE(LMNRST)-3^', 'SE', 'ZE',
3499
        'SETTE$', 'SET', 'ZET',
3500
        'SHP-^', 'S', 'Z',
3501
        'SHST', 'SHT', 'ZT',
3502
        'SHTSH', 'SH', 'Z',
3503
        'SHT', 'ST', 'Z',
3504
        'SHY9^', 'SHÜ', None,
3505
        'SH^^', 'SH', None,
3506
        'SH3', 'SH', 'Z',
3507
        'SICHERGEGAN-----^', 'SICHA ', 'ZIKA ',
3508
        'SICHERGEHE----^', 'SICHA ', 'ZIKA ',
3509
        'SICHERGESTEL------^', 'SICHA ', 'ZIKA ',
3510
        'SICHERSTELL-----^', 'SICHA ', 'ZIKA ',
3511
        'SICHERZU(GS)--^', 'SICHA ZU ', 'ZIKA ZU ',
3512
        'SIEGLI-^', 'SIKL', 'ZIKL',
3513
        'SIGLI-^', 'SIKL', 'ZIKL',
3514
        'SIGHT', 'SEIT', 'ZEIT',
3515
        'SIGN', 'SEIN', 'ZEIN',
3516
        'SKI(NPZ)-', 'SKI', 'ZKI',
3517
        'SKI<^', 'SHI', 'ZI',
3518
        'SODASS^$', 'SO DAS', 'ZU TAZ',
3519
        'SODAß^$', 'SO DAS', 'ZU TAZ',
3520
        'SOGENAN--^', 'SO GEN', 'ZU KEN',
3521
        'SOUND-', 'SAUN', 'ZAUN',
3522
        'STAATS^^', 'STAZ', 'ZTAZ',
3523
        'STADT^^', 'STAT', 'ZTAT',
3524
        'STANDE$', ' STANDE', ' ZTANTE',
3525
        'START^^', 'START', 'ZTART',
3526
        'STAURANT7', 'STORAN', 'ZTURAN',
3527
        'STEAK-', 'STE', 'ZTE',
3528
        'STEPHEN-^$', 'STEW', None,
3529
        'STERN', 'STERN', None,
3530
        'STRAF^^', 'STRAF', 'ZTRAF',
3531
        'ST\'S$', 'Z', 'Z',
3532
        'ST´S$', 'Z', 'Z',
3533
        'STST--', '', '',
3534
        'STS(ACEÈÉÊHIÌÍÎOUÄÜÖ)--', 'ST', 'ZT',
3535
        'ST(SZ)', 'Z', 'Z',
3536
        'SPAREN---^', 'SPA', 'ZPA',
3537
        'SPAREND----', ' SPA', ' ZPA',
3538
        'S(PTW)-^^', 'S', None,
3539
        'SP', 'SP', None,
3540
        'STYN(AE)-$', 'STIN', 'ZTIN',
3541
        'ST', 'ST', 'ZT',
3542
        'SUITE<', 'SIUT', 'ZIUT',
3543
        'SUKE--$', 'S', 'Z',
3544
        'SURF(EI)-', 'SÖRF', 'ZÖRF',
3545
        'SV(AEÈÉÊIÌÍÎOU)-<^', 'SW', None,
3546
        'SYB(IY)--^', 'SIB', None,
3547
        'SYL(KVW)--^', 'SI', None,
3548
        'SY9^', 'SÜ', None,
3549
        'SZE(NPT)-^', 'ZE', 'ZE',
3550
        'SZI(ELN)-^', 'ZI', 'ZI',
3551
        'SZCZ<', 'SH', 'Z',
3552
        'SZT<', 'ST', 'ZT',
3553
        'SZ<3', 'SH', 'Z',
3554
        'SÜL(KVW)--^', 'SI', None,
3555
        'S', None, 'Z',
3556
        'TCH', 'SH', 'Z',
3557
        'TD(AÄEIOÖRUÜY)-', 'T', None,
3558
        'TD(ÀÁÂÃÅÈÉÊËÌÍÎÏÒÓÔÕØÙÚÛÝŸ)-', 'T', None,
3559
        'TEAT-^', 'TEA', 'TEA',
3560
        'TERRAI7^', 'TERA', 'TERA',
3561
        'TE(LMNRST)-3^', 'TE', 'TE',
3562
        'TH<', 'T', 'T',
3563
        'TICHT-', 'TIK', 'TIK',
3564
        'TICH$', 'TIK', 'TIK',
3565
        'TIC$', 'TIZ', 'TIZ',
3566
        'TIGGESTELL-------', 'TIK ', 'TIK ',
3567
        'TIGSTELL-----', 'TIK ', 'TIK ',
3568
        'TOAS-^', 'TO', 'TU',
3569
        'TOILET-', 'TOLE', 'TULE',
3570
        'TOIN-', 'TOA', 'TUA',
3571
        'TRAECHTI-^', 'TRECHT', 'TREKT',
3572
        'TRAECHTIG--', ' TRECHT', ' TREKT',
3573
        'TRAINI-', 'TREN', 'TREN',
3574
        'TRÄCHTI-^', 'TRECHT', 'TREKT',
3575
        'TRÄCHTIG--', ' TRECHT', ' TREKT',
3576
        'TSCH', 'SH', 'Z',
3577
        'TSH', 'SH', 'Z',
3578
        'TST', 'ZT', 'ZT',
3579
        'T(Sß)', 'Z', 'Z',
3580
        'TT(SZ)--<', '', '',
3581
        'TT9', 'T', 'T',
3582
        'TV^$', 'TV', 'TV',
3583
        'TX(AEIOU)-3', 'SH', 'Z',
3584
        'TY9^', 'TÜ', None,
3585
        'TZ-', '', '',
3586
        'T\'S3$', 'Z', 'Z',
3587
        'T´S3$', 'Z', 'Z',
3588
        'UEBEL(GNRW)-^^', 'ÜBL ', 'IBL ',
3589
        'UEBER^^', 'ÜBA', 'IBA',
3590
        'UE2', 'Ü', 'I',
3591
        'UGL-', 'UK', None,
3592
        'UH(AOÖUÜY)-', 'UH', None,
3593
        'UIE$', 'Ü', 'I',
3594
        'UM^^', 'UM', 'UN',
3595
        'UNTERE--3', 'UNTE', 'UNTE',
3596
        'UNTER^^', 'UNTA', 'UNTA',
3597
        'UNVER^^', 'UNFA', 'UNFA',
3598
        'UN^^', 'UN', 'UN',
3599
        'UTI(AÄOÖUÜ)-', 'UZI', 'UZI',
3600
        'UVE-4', 'UW', None,
3601
        'UY2', 'UI', None,
3602
        'UZZ', 'AS', 'AZ',
3603
        'VACL-^', 'WAZ', 'FAZ',
3604
        'VAC$', 'WAZ', 'FAZ',
3605
        'VAN DEN ^', 'FANDN', 'FANTN',
3606
        'VANES-^', 'WANE', None,
3607
        'VATRO-', 'WATR', None,
3608
        'VA(DHJNT)--^', 'F', None,
3609
        'VEDD-^', 'FE', 'FE',
3610
        'VE(BEHIU)--^', 'F', None,
3611
        'VEL(BDLMNT)-^', 'FEL', None,
3612
        'VENTZ-^', 'FEN', None,
3613
        'VEN(NRSZ)-^', 'FEN', None,
3614
        'VER(AB)-^$', 'WER', None,
3615
        'VERBAL^$', 'WERBAL', None,
3616
        'VERBAL(EINS)-^', 'WERBAL', None,
3617
        'VERTEBR--', 'WERTE', None,
3618
        'VEREIN-----', 'F', None,
3619
        'VEREN(AEIOU)-^', 'WEREN', None,
3620
        'VERIFI', 'WERIFI', None,
3621
        'VERON(AEIOU)-^', 'WERON', None,
3622
        'VERSEN^', 'FERSN', 'FAZN',
3623
        'VERSIERT--^', 'WERSI', None,
3624
        'VERSIO--^', 'WERS', None,
3625
        'VERSUS', 'WERSUS', None,
3626
        'VERTI(GK)-', 'WERTI', None,
3627
        'VER^^', 'FER', 'FA',
3628
        'VERSPRECHE-------', ' FER', ' FA',
3629
        'VER$', 'WA', None,
3630
        'VER', 'FA', 'FA',
3631
        'VET(HT)-^', 'FET', 'FET',
3632
        'VETTE$', 'WET', 'FET',
3633
        'VE^', 'WE', None,
3634
        'VIC$', 'WIZ', 'FIZ',
3635
        'VIELSAGE----', 'FIL ', 'FIL ',
3636
        'VIEL', 'FIL', 'FIL',
3637
        'VIEW', 'WIU', 'FIU',
3638
        'VILL(AE)-', 'WIL', None,
3639
        'VIS(ACEIKUVWZ)-<^', 'WIS', None,
3640
        'VI(ELS)--^', 'F', None,
3641
        'VILLON--', 'WILI', 'FILI',
3642
        'VIZE^^', 'FIZE', 'FIZE',
3643
        'VLIE--^', 'FL', None,
3644
        'VL(AEIOU)--', 'W', None,
3645
        'VOKA-^', 'WOK', None,
3646
        'VOL(ATUVW)--^', 'WO', None,
3647
        'VOR^^', 'FOR', 'FUR',
3648
        'VR(AEIOU)--', 'W', None,
3649
        'VV9', 'W', None,
3650
        'VY9^', 'WÜ', 'FI',
3651
        'V(ÜY)-', 'W', None,
3652
        'V(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'W', None,
3653
        'V(AEIJLRU)-<', 'W', None,
3654
        'V.^', 'V.', None,
3655
        'V<', 'F', 'F',
3656
        'WEITERENTWI-----^', 'WEITA ', 'FEITA ',
3657
        'WEITREICH-----^', 'WEIT ', 'FEIT ',
3658
        'WEITVER^', 'WEIT FER', 'FEIT FA',
3659
        'WE(LMNRST)-3^', 'WE', 'FE',
3660
        'WER(DST)-', 'WER', None,
3661
        'WIC$', 'WIZ', 'FIZ',
3662
        'WIEDERU--', 'WIDE', 'FITE',
3663
        'WIEDER^$', 'WIDA', 'FITA',
3664
        'WIEDER^^', 'WIDA ', 'FITA ',
3665
        'WIEVIEL', 'WI FIL', 'FI FIL',
3666
        'WISUEL', 'WISUEL', None,
3667
        'WR-^', 'W', None,
3668
        'WY9^', 'WÜ', 'FI',
3669
        'W(BDFGJKLMNPQRSTZ)-', 'F', None,
3670
        'W$', 'F', None,
3671
        'W', None, 'F',
3672
        'X<^', 'Z', 'Z',
3673
        'XHAVEN$', 'XAFN', None,
3674
        'X(CSZ)', 'X', 'X',
3675
        'XTS(CH)--', 'XT', 'XT',
3676
        'XT(SZ)', 'Z', 'Z',
3677
        'YE(LMNRST)-3^', 'IE', 'IE',
3678
        'YE-3', 'I', 'I',
3679
        'YOR(GK)^$', 'IÖRK', 'IÖRK',
3680
        'Y(AOU)-<7', 'I', 'I',
3681
        'Y(BKLMNPRSTX)-1', 'Ü', None,
3682
        'YVES^$', 'IF', 'IF',
3683
        'YVONNE^$', 'IWON', 'IFUN',
3684
        'Y.^', 'Y.', None,
3685
        'Y', 'I', 'I',
3686
        'ZC(AOU)-', 'SK', 'ZK',
3687
        'ZE(LMNRST)-3^', 'ZE', 'ZE',
3688
        'ZIEJ$', 'ZI', 'ZI',
3689
        'ZIGERJA(HR)-3', 'ZIGA IA', 'ZIKA IA',
3690
        'ZL(AEIOU)-', 'SL', None,
3691
        'ZS(CHT)--', '', '',
3692
        'ZS', 'SH', 'Z',
3693
        'ZUERST', 'ZUERST', 'ZUERST',
3694
        'ZUGRUNDE^$', 'ZU GRUNDE', 'ZU KRUNTE',
3695
        'ZUGRUNDE', 'ZU GRUNDE ', 'ZU KRUNTE ',
3696
        'ZUGUNSTEN', 'ZU GUNSTN', 'ZU KUNZTN',
3697
        'ZUHAUSE-', 'ZU HAUS', 'ZU AUZ',
3698
        'ZULASTEN^$', 'ZU LASTN', 'ZU LAZTN',
3699
        'ZURUECK^^', 'ZURÜK', 'ZURIK',
3700
        'ZURZEIT', 'ZUR ZEIT', 'ZUR ZEIT',
3701
        'ZURÜCK^^', 'ZURÜK', 'ZURIK',
3702
        'ZUSTANDE', 'ZU STANDE', 'ZU ZTANTE',
3703
        'ZUTAGE', 'ZU TAGE', 'ZU TAKE',
3704
        'ZUVER^^', 'ZUFA', 'ZUFA',
3705
        'ZUVIEL', 'ZU FIL', 'ZU FIL',
3706
        'ZUWENIG', 'ZU WENIK', 'ZU FENIK',
3707
        'ZY9^', 'ZÜ', None,
3708
        'ZYK3$', 'ZIK', None,
3709
        'Z(VW)7^', 'SW', None,
3710
        None, None, None)
3711
3712
    # Lookup tables shared by the nested helpers; they are populated by
    # _initialize_phonet(). Counters are used so that reading a key that
    # was never assigned yields 0 instead of raising KeyError.

    # character -> index of the first rule starting with that character
    phonet_hash = Counter()
    # character -> alphabet class (1 for umlauts etc., 2-27 for 'A'-'Z')
    alpha_pos = Counter()

    # (letter index, class of the following character) -> index of the
    # first (phonet_hash_1) and last (phonet_hash_2) candidate rule
    phonet_hash_1 = Counter()
    phonet_hash_2 = Counter()

    # Translation table for str.translate(): maps the code point of every
    # supported lower-case letter to its upper-case counterpart ('ß' has no
    # counterpart in the table and maps to itself).
    _phonet_upper_translation = {
        ord(lower): upper for lower, upper in
        zip('abcdefghijklmnopqrstuvwxyzàáâãåäæ' +
            'çðèéêëìíîïñòóôõöøœšßþùúûüýÿ',
            'ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÅÄÆ' +
            'ÇÐÈÉÊËÌÍÎÏÑÒÓÔÕÖØŒŠßÞÙÚÛÜÝŸ')}
3723
3724
    def _initialize_phonet(lang):
        """Initialize the phonet lookup tables.

        Fills ``alpha_pos``, ``phonet_hash``, ``phonet_hash_1`` and
        ``phonet_hash_2`` (all defined in the enclosing scope) from the
        selected rule table. The rule table is a flat sequence of triples:
        a rule pattern followed by two replacement entries, so only every
        third element is a pattern.

        After the call:

        - ``alpha_pos[c]`` is 1 for the listed umlauts/special letters and
          2-27 for 'A'-'Z'
        - ``phonet_hash[c]`` is the index of the first rule starting with
          ``c`` that has a non-empty replacement (only assigned for
          characters whose entry was preset to -1)
        - ``phonet_hash_1[j, k]`` / ``phonet_hash_2[j, k]`` are the first
          and last rule index for first letter ``j`` (0 for 'A') and
          second-character class ``k``; column 0 collects the rules that
          apply regardless of the second character

        :param str lang: ``'none'`` selects ``_phonet_rules_no_lang``; any
            other value selects ``_phonet_rules_german``
        """
        if lang == 'none':
            _phonet_rules = _phonet_rules_no_lang
        else:
            _phonet_rules = _phonet_rules_german

        phonet_hash[''] = -1

        # German and international umlauts
        for j in {'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë',
                  'Ì', 'Í', 'Î', 'Ï', 'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', 'Ø',
                  'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß', 'Œ', 'Š', 'Ÿ'}:
            alpha_pos[j] = 1
            phonet_hash[j] = -1

        # "normal" letters ('A'-'Z')
        for i, j in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ'):
            alpha_pos[j] = i + 2
            phonet_hash[j] = -1

        # mark every (letter, second-character class) slot as "no rule yet"
        for i in range(26):
            for j in range(28):
                phonet_hash_1[i, j] = -1
                phonet_hash_2[i, j] = -1

        # for each phonetic rule (patterns sit at every third index)
        for i in range(0, len(_phonet_rules), 3):
            rule = _phonet_rules[i]

            if not rule:
                # None (table terminator) or an empty pattern
                continue

            # calculate first hash value
            k = rule[0]

            if phonet_hash[k] < 0 and (_phonet_rules[i+1] or
                                       _phonet_rules[i+2]):
                phonet_hash[k] = i

            # calculate second hash values
            if k and alpha_pos[k] >= 2:
                k = alpha_pos[k]

                j = k-2
                rule = rule[1:]

                # Reduce the pattern to the characters that may follow the
                # first letter: a blank if there are none, the members of
                # a leading '(...)' group, or the single next character.
                if not rule:
                    rule = ' '
                elif rule[0] == '(':
                    rule = rule[1:]
                else:
                    rule = rule[0]

                while rule and (rule[0] != ')'):
                    k = alpha_pos[rule[0]]

                    if k > 0:
                        # add hash value for this letter
                        if phonet_hash_1[j, k] < 0:
                            phonet_hash_1[j, k] = i
                            phonet_hash_2[j, k] = i

                        if phonet_hash_2[j, k] >= (i-30):
                            phonet_hash_2[j, k] = i
                        else:
                            # more than 30 entries past the current range:
                            # fall back to the "all letters" column instead
                            k = -1

                    if k <= 0:
                        # add hash value for all letters
                        if phonet_hash_1[j, 0] < 0:
                            phonet_hash_1[j, 0] = i

                        phonet_hash_2[j, 0] = i

                    rule = rule[1:]
3798
3799
    def _phonet(term, mode, lang):
3800
        """Return the phonet coded form of a term."""
3801
        if lang == 'none':
3802
            _phonet_rules = _phonet_rules_no_lang
3803
        else:
3804
            _phonet_rules = _phonet_rules_german
3805
3806
        char0 = ''
3807
        dest = term
3808
3809
        if not term:
3810
            return ''
3811
3812
        term_length = len(term)
3813
3814
        # convert input string to upper-case
3815
        src = term.translate(_phonet_upper_translation)
3816
3817
        # check "src"
3818
        i = 0
3819
        j = 0
3820
        zeta = 0
3821
3822
        while i < len(src):
3823
            char = src[i]
3824
3825
            pos = alpha_pos[char]
3826
3827
            if pos >= 2:
3828
                xpos = pos-2
3829
3830
                if i+1 == len(src):
3831
                    pos = alpha_pos['']
3832
                else:
3833
                    pos = alpha_pos[src[i+1]]
3834
3835
                start1 = phonet_hash_1[xpos, pos]
3836
                start2 = phonet_hash_1[xpos, 0]
3837
                end1 = phonet_hash_2[xpos, pos]
3838
                end2 = phonet_hash_2[xpos, 0]
3839
3840
                # preserve rule priorities
3841
                if (start2 >= 0) and ((start1 < 0) or (start2 < start1)):
3842
                    pos = start1
3843
                    start1 = start2
3844
                    start2 = pos
3845
                    pos = end1
3846
                    end1 = end2
3847
                    end2 = pos
3848
3849
                if (end1 >= start2) and (start2 >= 0):
3850
                    if end2 > end1:
3851
                        end1 = end2
3852
3853
                    start2 = -1
3854
                    end2 = -1
3855
            else:
3856
                pos = phonet_hash[char]
3857
                start1 = pos
3858
                end1 = 10000
3859
                start2 = -1
3860
                end2 = -1
3861
3862
            pos = start1
3863
            zeta0 = 0
3864
3865
            if pos >= 0:
3866
                # check rules for this char
3867
                while ((_phonet_rules[pos] is None) or
3868
                       (_phonet_rules[pos][0] == char)):
3869
                    if pos > end1:
3870
                        if start2 > 0:
3871
                            pos = start2
3872
                            start1 = start2
3873
                            start2 = -1
3874
                            end1 = end2
3875
                            end2 = -1
3876
                            continue
3877
3878
                        break
3879
3880
                    if (((_phonet_rules[pos] is None) or
3881
                         (_phonet_rules[pos + mode] is None))):
3882
                        # no conversion rule available
3883
                        pos += 3
3884
                        continue
3885
3886
                    # check whole string
3887
                    matches = 1  # number of matching letters
3888
                    priority = 5  # default priority
3889
                    rule = _phonet_rules[pos]
3890
                    rule = rule[1:]
3891
3892
                    while (rule and
3893
                           (len(src) > (i + matches)) and
3894
                           (src[i + matches] == rule[0]) and
3895
                           not rule[0].isdigit() and
3896
                           (rule not in '(-<^$')):
3897
                        matches += 1
3898
                        rule = rule[1:]
3899
3900
                    if rule and (rule[0] == '('):
3901
                        # check an array of letters
3902
                        if (((len(src) > (i + matches)) and
3903
                             src[i + matches].isalpha() and
3904
                             (src[i + matches] in rule[1:]))):
3905
                            matches += 1
3906
3907
                            while rule and rule[0] != ')':
3908
                                rule = rule[1:]
3909
3910
                            # if rule[0] == ')':
3911
                            rule = rule[1:]
3912
3913
                    if rule:
3914
                        priority0 = ord(rule[0])
3915
                    else:
3916
                        priority0 = 0
3917
3918
                    matches0 = matches
3919
3920
                    while rule and rule[0] == '-' and matches > 1:
3921
                        matches -= 1
3922
                        rule = rule[1:]
3923
3924
                    if rule and rule[0] == '<':
3925
                        rule = rule[1:]
3926
3927
                    if rule and rule[0].isdigit():
3928
                        # read priority
3929
                        priority = int(rule[0])
3930
                        rule = rule[1:]
3931
3932
                    if rule and rule[0:2] == '^^':
3933
                        rule = rule[1:]
3934
3935
                    if (not rule or
3936
                            ((rule[0] == '^') and
3937
                             ((i == 0) or not src[i-1].isalpha()) and
3938
                             ((rule[1:2] != '$') or
3939
                              (not (src[i+matches0:i+matches0+1].isalpha()) and
3940
                               (src[i+matches0:i+matches0+1] != '.')))) or
3941
                            ((rule[0] == '$') and (i > 0) and
3942
                             src[i-1].isalpha() and
3943
                             ((not src[i+matches0:i+matches0+1].isalpha()) and
3944
                              (src[i+matches0:i+matches0+1] != '.')))):
3945
                        # look for continuation, if:
3946
                        # matches > 1 und NO '-' in first string */
3947
                        pos0 = -1
3948
3949
                        start3 = 0
3950
                        start4 = 0
3951
                        end3 = 0
3952
                        end4 = 0
3953
3954
                        if (((matches > 1) and
3955
                             src[i+matches:i+matches+1] and
3956
                             (priority0 != ord('-')))):
3957
                            char0 = src[i+matches-1]
3958
                            pos0 = alpha_pos[char0]
3959
3960
                            if pos0 >= 2 and src[i+matches]:
3961
                                xpos = pos0 - 2
3962
                                pos0 = alpha_pos[src[i+matches]]
3963
                                start3 = phonet_hash_1[xpos, pos0]
3964
                                start4 = phonet_hash_1[xpos, 0]
3965
                                end3 = phonet_hash_2[xpos, pos0]
3966
                                end4 = phonet_hash_2[xpos, 0]
3967
3968
                                # preserve rule priorities
3969
                                if (((start4 >= 0) and
3970
                                     ((start3 < 0) or (start4 < start3)))):
3971
                                    pos0 = start3
3972
                                    start3 = start4
3973
                                    start4 = pos0
3974
                                    pos0 = end3
3975
                                    end3 = end4
3976
                                    end4 = pos0
3977
3978
                                if (end3 >= start4) and (start4 >= 0):
3979
                                    if end4 > end3:
3980
                                        end3 = end4
3981
3982
                                    start4 = -1
3983
                                    end4 = -1
3984
                            else:
3985
                                pos0 = phonet_hash[char0]
3986
                                start3 = pos0
3987
                                end3 = 10000
3988
                                start4 = -1
3989
                                end4 = -1
3990
3991
                            pos0 = start3
3992
3993
                        # check continuation rules for src[i+matches]
3994
                        if pos0 >= 0:
3995
                            while ((_phonet_rules[pos0] is None) or
3996
                                   (_phonet_rules[pos0][0] == char0)):
3997
                                if pos0 > end3:
3998
                                    if start4 > 0:
3999
                                        pos0 = start4
4000
                                        start3 = start4
4001
                                        start4 = -1
4002
                                        end3 = end4
4003
                                        end4 = -1
4004
                                        continue
4005
4006
                                    priority0 = -1
4007
4008
                                    # important
4009
                                    break
4010
4011
                                if (((_phonet_rules[pos0] is None) or
4012
                                     (_phonet_rules[pos0 + mode] is None))):
4013
                                    # no conversion rule available
4014
                                    pos0 += 3
4015
                                    continue
4016
4017
                                # check whole string
4018
                                matches0 = matches
4019
                                priority0 = 5
4020
                                rule = _phonet_rules[pos0]
4021
                                rule = rule[1:]
4022
4023
                                while (rule and
4024
                                       (src[i+matches0:i+matches0+1] ==
4025
                                        rule[0]) and
4026
                                       (not rule[0].isdigit() or
4027
                                        (rule in '(-<^$'))):
4028
                                    matches0 += 1
4029
                                    rule = rule[1:]
4030
4031
                                if rule and rule[0] == '(':
4032
                                    # check an array of letters
4033
                                    if ((src[i+matches0:i+matches0+1]
4034
                                         .isalpha() and
4035
                                         (src[i+matches0] in rule[1:]))):
4036
                                        matches0 += 1
4037
4038
                                        while rule and rule[0] != ')':
4039
                                            rule = rule[1:]
4040
4041
                                        # if rule[0] == ')':
4042
                                        rule = rule[1:]
4043
4044
                                while rule and rule[0] == '-':
4045
                                    # "matches0" is NOT decremented
4046
                                    # because of  "if (matches0 == matches)"
4047
                                    rule = rule[1:]
4048
4049
                                if rule and rule[0] == '<':
4050
                                    rule = rule[1:]
4051
4052
                                if rule and rule[0].isdigit():
4053
                                    priority0 = int(rule[0])
4054
                                    rule = rule[1:]
4055
4056
                                if (not rule or
4057
                                        # rule == '^' is not possible here
4058
                                        ((rule[0] == '$') and not
4059
                                         src[i+matches0:i+matches0+1]
4060
                                         .isalpha() and
4061
                                         (src[i+matches0:i+matches0+1]
4062
                                          != '.'))):
4063
                                    if matches0 == matches:
4064
                                        # this is only a partial string
4065
                                        pos0 += 3
4066
                                        continue
4067
4068
                                    if priority0 < priority:
4069
                                        # priority is too low
4070
                                        pos0 += 3
4071
                                        continue
4072
4073
                                    # continuation rule found
4074
                                    break
4075
4076
                                pos0 += 3
4077
4078
                            # end of "while"
4079
                            if ((priority0 >= priority) and
4080
                                    ((_phonet_rules[pos0] is not None) and
4081
                                     (_phonet_rules[pos0][0] == char0))):
4082
4083
                                pos += 3
4084
                                continue
4085
4086
                        # replace string
4087
                        if ((_phonet_rules[pos] and
4088
                             ('<' in _phonet_rules[pos][1:]))):
4089
                            priority0 = 1
4090
                        else:
4091
                            priority0 = 0
4092
4093
                        rule = _phonet_rules[pos + mode]
4094
4095
                        if (priority0 == 1) and (zeta == 0):
4096
                            # rule with '<' is applied
4097
                            if ((j > 0) and rule and
4098
                                    ((dest[j-1] == char) or
4099
                                     (dest[j-1] == rule[0]))):
4100
                                j -= 1
4101
4102
                            zeta0 = 1
4103
                            zeta += 1
4104
                            matches0 = 0
4105
4106
                            while rule and src[i+matches0]:
4107
                                src = (src[0:i+matches0] + rule[0] +
4108
                                       src[i+matches0+1:])
4109
                                matches0 += 1
4110
                                rule = rule[1:]
4111
4112
                            if matches0 < matches:
4113
                                src = (src[0:i+matches0] +
4114
                                       src[i+matches:])
4115
4116
                            char = src[i]
4117
                        else:
4118
                            i = i + matches - 1
4119
                            zeta = 0
4120
4121
                            while len(rule) > 1:
4122
                                if (j == 0) or (dest[j - 1] != rule[0]):
4123
                                    dest = (dest[0:j] + rule[0] +
4124
                                            dest[min(len(dest), j+1):])
4125
                                    j += 1
4126
4127
                                rule = rule[1:]
4128
4129
                            # new "current char"
4130
                            if not rule:
4131
                                rule = ''
4132
                                char = ''
4133
                            else:
4134
                                char = rule[0]
4135
4136
                            if ((_phonet_rules[pos] and
4137
                                 '^^' in _phonet_rules[pos][1:])):
4138
                                if char:
4139
                                    dest = (dest[0:j] + char +
4140
                                            dest[min(len(dest), j + 1):])
4141
                                    j += 1
4142
4143
                                src = src[i + 1:]
4144
                                i = 0
4145
                                zeta0 = 1
4146
4147
                        break
4148
4149
                    pos += 3
4150
4151
                    if pos > end1 and start2 > 0:
4152
                        pos = start2
4153
                        start1 = start2
4154
                        end1 = end2
4155
                        start2 = -1
4156
                        end2 = -1
4157
4158
            if zeta0 == 0:
4159
                if char and ((j == 0) or (dest[j-1] != char)):
4160
                    # delete multiple letters only
4161
                    dest = dest[0:j] + char + dest[min(j+1, term_length):]
4162
                    j += 1
4163
4164
                i += 1
4165
                zeta = 0
4166
4167
        dest = dest[0:j]
4168
4169
        return dest
4170
4171
    _initialize_phonet(lang)
4172
4173
    word = normalize('NFKC', text_type(word))
4174
    return _phonet(word, mode, lang)
4175
4176
4177
def spfc(word):
    """Return the Standardized Phonetic Frequency Code (SPFC) of a word.

    Standardized Phonetic Frequency Code is roughly Soundex-like.
    This implementation is based on page 19-21 of :cite:`Moore:1977`.

    The input has to supply a first name and a last name, either as one
    string in which the two are divided by a period (or, if there is no
    period, by a space), or as a two-element tuple/list ``(first, last)``.

    :param word: the name to transform (a str, or a tuple/list of two strs)
    :returns: the SPFC value, or an empty string if word is empty
    :rtype: str
    :raises AttributeError: if word is a string without a period or space,
        an iterable whose length is not 2, or any other kind of object

    >>> spfc('Christopher Smith')
    '01160'
    >>> spfc('Christopher Schmidt')
    '01160'
    >>> spfc('Niall Smith')
    '01660'
    >>> spfc('Niall Schmidt')
    '01660'

    >>> spfc('L.Smith')
    '01960'
    >>> spfc('R.Miller')
    '65490'

    >>> spfc(('L', 'Smith'))
    '01960'
    >>> spfc(('R', 'Miller'))
    '65490'
    """
    if not word:
        return ''

    def _table(letters, digits):
        """Build a str.translate() table mapping each letter to a digit."""
        return {ord(letter): digit for letter, digit in zip(letters, digits)}

    pf1 = _table('SZCKQVFPUWABLORDHIEMNXGJT', '0011112222334445556666777')
    pf2 = _table('SZCKQFPXABORDHIMNGJTUVWEL', '0011122233445556677788899')
    pf3 = _table('BCKQVDTFLPGJXMNRSZAEHIOUWY', '00000112223334456677777777')

    digraph_substitutions = (('DK', 'K'), ('DT', 'T'), ('SC', 'S'),
                             ('KN', 'N'), ('MN', 'N'))

    # Multi-letter endings recognised in step 5, in the order in which they
    # are tried; the first ending that matches wins.
    suffix_digits = (('STN', '8'), ('PRS', '8'), ('SN', '8'),
                     ('STR', '9'), ('SR', '9'), ('TN', '9'), ('TD', '9'),
                     ('DRS', '7'), ('TR', '7'), ('MN', '7'))

    uppercase_letters = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    dropped_letters = frozenset('AEHIOUWY')

    bad_word_msg = ('word attribute must be a string with a space '
                    'or period dividing the first and last names '
                    'or a tuple/list consisting of the first and '
                    'last names')

    def _steps_one_to_three(name):
        """Perform the first three steps of SPFC on one name."""
        # keep A-Z only
        name = ''.join(char for char in name if char in uppercase_letters)

        # 1. In the field, convert DK to K, DT to T, SC to S, KN to N,
        # and MN to N
        for old, new in digraph_substitutions:
            name = name.replace(old, new)

        # 2. In the name field, replace multiple letters with a single letter
        name = _delete_consecutive_repeats(name)

        # 3. Remove vowels, W, H, and Y, but keep the first letter in the name
        # field.
        if name:
            name = name[0] + ''.join(char for char in name[1:]
                                     if char not in dropped_letters)
        return name

    # Divide the input into its first-name and last-name parts.
    if isinstance(word, (str, text_type)):
        # A period is the preferred divider; a space is the fallback.
        parts = word.split('.', 1)
        if len(parts) != 2:
            parts = word.split(' ', 1)
        if len(parts) != 2:
            raise AttributeError(bad_word_msg)
    elif hasattr(word, '__iter__') and len(word) == 2:
        parts = word
    else:
        raise AttributeError(bad_word_msg)

    first, last = [
        _steps_one_to_three(
            normalize('NFKD', text_type(part.strip()
                                        .replace('ß', 'SS')
                                        .upper())))
        for part in parts]

    digits = []

    # 4. The first digit of the code is obtained using PF1 and the first letter
    # of the name field. Remove this letter after coding.
    if last:
        digits.append(last[0].translate(pf1))
        last = last[1:]

    # 5. Using the last letters of the name, use Table PF3 to obtain the
    # second digit of the code. Use as many letters as possible and remove
    # after coding.
    if last:
        for suffix, digit in suffix_digits:
            if last.endswith(suffix):
                digits.append(digit)
                last = last[:-len(suffix)]
                break
        else:
            # no multi-letter ending applies: code the final letter alone
            digits.append(last[-1].translate(pf3))
            last = last[:-1]

    # 6. The third digit is found using Table PF2 and the first character of
    # the first name.
    if first:
        digits.append(first[0].translate(pf2))

    # 7. The fourth digit is found using Table PF2 and the first character of
    # the name field. If no letters remain use zero.
    # 8. The fifth digit is found in the same manner as the fourth using the
    # remaining characters of the name field if any.
    for letter in (last[0:1], last[1:2]):
        digits.append(letter.translate(pf2) if letter else '0')

    return ''.join(digits)
4323
4324
4325
def statistics_canada(word, maxlength=4):
    """Return the Statistics Canada code for a word.

    The original description of this algorithm could not be located, and
    may only have been specified in an unpublished TR. The coding does not
    appear to be in use by Statistics Canada any longer. In its place, this is
    an implementation of the "Census modified Statistics Canada name coding
    procedure".

    The modified version of this algorithm is described in Appendix B of
    :cite:`Moore:1977`.

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 4) of the code to return
    :returns: the Statistics Canada name code value (an empty string if the
        word contains no letters A-Z)
    :rtype: str

    >>> statistics_canada('Christopher')
    'CHRS'
    >>> statistics_canada('Niall')
    'NL'
    >>> statistics_canada('Smith')
    'SMTH'
    >>> statistics_canada('Schmidt')
    'SCHM'
    """
    uppercase_letters = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    vowels = frozenset('AEIOUY')

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = normalize('NFKD', text_type(word.upper())).replace('ß', 'SS')
    word = ''.join(char for char in word if char in uppercase_letters)
    if not word:
        return ''

    # The first letter is always kept; vowels (and Y) are dropped after it.
    head, tail = word[0], word[1:]
    code = head + ''.join(char for char in tail if char not in vowels)
    code = _delete_consecutive_repeats(code)
    code = code.replace(' ', '')

    return code[:maxlength]
4370
4371
4372
def lein(word, maxlength=4, zero_pad=True):
    """Return the Lein code for a word.

    This is Lein name coding, described in :cite:`Moore:1977`.

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 4) of the code to return
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Lein code (an empty string if the word contains no letters
        A-Z)
    :rtype: str

    >>> lein('Christopher')
    'C351'
    >>> lein('Niall')
    'N300'
    >>> lein('Smith')
    'S210'
    >>> lein('Schmidt')
    'S521'
    """
    uppercase_letters = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    # Rule 2 table: deletes the space, the vowels, and H, W & Y
    deletions = dict.fromkeys(map(ord, ' AEHIOUWY'))
    # Rule 4 table: maps each remaining consonant to its digit
    consonant_digits = {ord(letter): digit for letter, digit in
                        zip('BCDFGJKLMNPQRSTVXZ', '451455532245351455')}

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = normalize('NFKD', text_type(word.upper())).replace('ß', 'SS')
    word = ''.join(char for char in word if char in uppercase_letters)

    if not word:
        return ''

    initial = word[0]  # Rule 1
    remainder = word[1:].translate(deletions)  # Rule 2
    remainder = _delete_consecutive_repeats(remainder)  # Rule 3
    code = initial + remainder.translate(consonant_digits)  # Rule 4

    if zero_pad:
        code += '0' * maxlength  # Rule 4

    return code[:maxlength]
4419
4420
4421
def roger_root(word, maxlength=5, zero_pad=True):
    """Return the Roger Root code for a word.

    This is Roger Root name coding, described in :cite:`Moore:1977`.

    The start of the word is coded with a table of initial patterns, and the
    rest of the word with a table of medial patterns. At every position the
    longest pattern (4 letters down to 1) that matches is used.

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 5) of the code to return
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Roger Root code
    :rtype: str

    >>> roger_root('Christopher')
    '06401'
    >>> roger_root('Niall')
    '02500'
    >>> roger_root('Smith')
    '00310'
    >>> roger_root('Schmidt')
    '06310'
    """
    uppercase_letters = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    # Patterns applied at the start of the word, keyed by pattern length.
    # '*' is used to prevent combining by _delete_consecutive_repeats()
    initial_patterns = {
        4: {'TSCH': '06'},
        3: {'TSH': '06', 'SCH': '06'},
        2: {'CE': '0*0', 'CH': '06', 'CI': '0*0', 'CY': '0*0',
            'DG': '07', 'GF': '08', 'GM': '03', 'GN': '02',
            'KN': '02', 'PF': '08', 'PH': '08', 'PN': '02',
            'SH': '06', 'TS': '0*0', 'WR': '04'},
        1: {'A': '1', 'B': '09', 'C': '07', 'D': '01', 'E': '1',
            'F': '08', 'G': '07', 'H': '2', 'I': '1', 'J': '3',
            'K': '07', 'L': '05', 'M': '03', 'N': '02', 'O': '1',
            'P': '09', 'Q': '07', 'R': '04', 'S': '0*0',
            'T': '01', 'U': '1', 'V': '08', 'W': '4', 'X': '07',
            'Y': '5', 'Z': '0*0'},
    }

    # Patterns applied to the remainder of the word, keyed by pattern length.
    medial_patterns = {
        4: {'TSCH': '6'},
        3: {'TSH': '6', 'SCH': '6'},
        2: {'CE': '0', 'CH': '6', 'CI': '0', 'CY': '0', 'DG': '7',
            'PH': '8', 'SH': '6', 'TS': '0'},
        1: {'B': '9', 'C': '7', 'D': '1', 'F': '8', 'G': '7',
            'J': '6', 'K': '7', 'L': '5', 'M': '3', 'N': '2',
            'P': '9', 'Q': '7', 'R': '4', 'S': '0', 'T': '1',
            'V': '8', 'X': '7', 'Z': '0',
            'A': '*', 'E': '*', 'H': '*', 'I': '*', 'O': '*',
            'U': '*', 'W': '*', 'Y': '*'},
    }

    # longest patterns are tried first
    pattern_sizes = (4, 3, 2, 1)

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = normalize('NFKD', text_type(word.upper())).replace('ß', 'SS')
    word = ''.join(char for char in word if char in uppercase_letters)

    code = ''
    pos = 0

    # Do first digit(s) first
    for size in pattern_sizes:
        digits = initial_patterns[size].get(word[:size])
        if digits is not None:
            code = digits
            pos = size
            break

    # Then code subsequent digits
    while pos < len(word):
        for size in pattern_sizes:
            digits = medial_patterns[size].get(word[pos:pos + size])
            if digits is not None:
                code += digits
                pos += size
                break

    # Collapse runs of equal symbols, then drop the '*' separators.
    code = _delete_consecutive_repeats(code).replace('*', '')

    if zero_pad:
        code += '0' * maxlength

    return code[:maxlength]
4500
4501
4502
def onca(word, maxlength=4, zero_pad=True):
    """Return the Oxford Name Compression Algorithm (ONCA) code for a word.

    This is the Oxford Name Compression Algorithm, based on :cite:`Gill:1997`.

    I can find no complete description of the "anglicised version of the NYSIIS
    method" identified as the first step in this algorithm, so this is likely
    not a precisely correct implementation, in that it employs the standard
    NYSIIS algorithm.

    The word is first passed through nysiis(), and the result is then passed
    through soundex().

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 4) of the code to return
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the ONCA code
    :rtype: str

    >>> onca('Christopher')
    'C623'
    >>> onca('Niall')
    'N400'
    >>> onca('Smith')
    'S530'
    >>> onca('Schmidt')
    'S530'
    """
    # In the most extreme case, 3 characters of NYSIIS input can be compressed
    # to one character of output, so give it triple the maxlength.
    nysiis_code = nysiis(word, maxlength=maxlength * 3)
    return soundex(nysiis_code, maxlength, zero_pad=zero_pad)
4532
4533
4534
def eudex(word, maxlength=8):
    """Return the eudex phonetic hash of a word.

    This implementation of eudex phonetic hashing is based on the specification
    (not the reference implementation) at :cite:`Ticki:2016`.

    Further details can be found at :cite:`Ticki:2016b`.

    Every known character is mapped to an 8-bit value (the first character
    through one table, all following characters through another), and the
    values are packed, most significant first, into a single integer.

    :param str word: the word to transform
    :param int maxlength: the number of 8-bit values packed into the hash
        (defaults to 8)
    :returns: the eudex hash
    :rtype: int

    >>> eudex('Christopher')
    433648490138894409
    >>> eudex('Niall')
    648518346341351840
    >>> eudex('Smith')
    720575940412906756
    """
    # Values for every character after the first one
    trailing_phones = {
        'a': 0,
        'b': 0b01001000,
        'c': 0b00001100,
        'd': 0b00011000,
        'e': 0,
        'f': 0b01000100,
        'g': 0b00001000,
        'h': 0b00000100,
        'i': 1,
        'j': 0b00000101,
        'k': 0b00001001,
        'l': 0b10100000,
        'm': 0b00000010,
        'n': 0b00010010,
        'o': 0,
        'p': 0b01001001,
        'q': 0b10101000,
        'r': 0b10100001,
        's': 0b00010100,
        't': 0b00011101,
        'u': 1,
        'v': 0b01000101,
        'w': 0b00000000,
        'x': 0b10000100,
        'y': 1,
        'z': 0b10010100,

        'ß': 0b00010101,
        'à': 0,
        'á': 0,
        'â': 0,
        'ã': 0,
        'ä': 0,  # [æ]
        'å': 1,  # [oː]
        'æ': 0,  # [æ]
        'ç': 0b10010101,  # [t͡ʃ]
        'è': 1,
        'é': 1,
        'ê': 1,
        'ë': 1,
        'ì': 1,
        'í': 1,
        'î': 1,
        'ï': 1,
        'ð': 0b00010101,  # [ð̠] (represented as a non-plosive T)
        'ñ': 0b00010111,  # [nj] (represented as a combination of n and j)
        'ò': 0,
        'ó': 0,
        'ô': 0,
        'õ': 0,
        'ö': 1,  # [ø]
        '÷': 0b11111111,
        'ø': 1,  # [ø]
        'ù': 1,
        'ú': 1,
        'û': 1,
        'ü': 1,
        'ý': 1,
        'þ': 0b00010101,  # [ð̠] (represented as a non-plosive T)
        'ÿ': 1,
    }

    # Values for the first character; its keys also define which characters
    # are kept at all
    initial_phones = {
        'a': 0b10000100,  # a*
        'b': 0b00100100,
        'c': 0b00000110,
        'd': 0b00001100,
        'e': 0b11011000,  # e*
        'f': 0b00100010,
        'g': 0b00000100,
        'h': 0b00000010,
        'i': 0b11111000,  # i*
        'j': 0b00000011,
        'k': 0b00000101,
        'l': 0b01010000,
        'm': 0b00000001,
        'n': 0b00001001,
        'o': 0b10010100,  # o*
        'p': 0b00100101,
        'q': 0b01010100,
        'r': 0b01010001,
        's': 0b00001010,
        't': 0b00001110,
        'u': 0b11100000,  # u*
        'v': 0b00100011,
        'w': 0b00000000,
        'x': 0b01000010,
        'y': 0b11100100,  # y*
        'z': 0b01001010,

        'ß': 0b00001011,
        'à': 0b10000101,
        'á': 0b10000101,
        'â': 0b10000000,
        'ã': 0b10000110,
        'ä': 0b10100110,  # [æ]
        'å': 0b11000010,  # [oː]
        'æ': 0b10100111,  # [æ]
        'ç': 0b01010100,  # [t͡ʃ]
        'è': 0b11011001,
        'é': 0b11011001,
        'ê': 0b11011001,
        'ë': 0b11000110,  # [ə] or [œ]
        'ì': 0b11111001,
        'í': 0b11111001,
        'î': 0b11111001,
        'ï': 0b11111001,
        'ð': 0b00001011,  # [ð̠] (represented as a non-plosive T)
        'ñ': 0b00001011,  # [nj] (represented as a combination of n and j)
        'ò': 0b10010101,
        'ó': 0b10010101,
        'ô': 0b10010101,
        'õ': 0b10010101,
        'ö': 0b11011100,  # [œ] or [ø]
        '÷': 0b11111111,
        'ø': 0b11011101,  # [œ] or [ø]
        'ù': 0b11100001,
        'ú': 0b11100001,
        'û': 0b11100001,
        'ü': 0b11100101,
        'ý': 0b11100101,
        'þ': 0b00001011,  # [ð̠] (represented as a non-plosive T)
        'ÿ': 0b11100101,
    }

    # Lowercase the input and keep known characters only; if nothing is left,
    # the word is treated as the single character '÷'.
    letters = [char for char in word.lower() if char in initial_phones]
    if not letters:
        letters = ['÷']

    # Code the first character with the initial table, the rest with the
    # trailing table.
    phones = [initial_phones[letters[0]]]
    phones.extend(trailing_phones[char] for char in letters[1:])

    # Skip a value when it equals the value directly before it in every bit
    # except the lowest one (i.e. when both are equal after a right-shift by
    # one).
    condensed = phones[:1]
    for previous, current in zip(phones, phones[1:]):
        if current >> 1 != previous >> 1:
            condensed.append(current)

    # Add padding after first character & trim beyond maxlength
    padding = [0] * max(0, maxlength - len(condensed))
    packed = condensed[:1] + padding + condensed[1:maxlength]

    # Combine individual character values into eudex hash
    hash_value = 0
    for phone in packed:
        hash_value = (hash_value << 8) | phone

    return hash_value
4700
4701
4702
def haase_phonetik(word, primary_only=False):
    """Return the Haase Phonetik (numeric output) code for a word.

    Based on the algorithm described at :cite:`Prante:2015`.

    Based on the original :cite:`Haase:2000`.

    While the output code is numeric, it is nevertheless a str.

    :param str word: the word to transform
    :param bool primary_only: if True, only the word itself is encoded; if
        False (the default), alternative spellings of the word are generated
        and each distinct resulting code is returned
    :returns: the Haase Phonetik value(s) as numeric strings, without
        duplicates, in the order in which they were generated
    :rtype: tuple
    """
    def _after(word, i, letters):
        """Return True if word[i] follows one of the supplied letters."""
        return i > 0 and word[i-1] in letters

    def _before(word, i, letters):
        """Return True if word[i] precedes one of the supplied letters."""
        return i+1 < len(word) and word[i+1] in letters

    _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}

    # The umlaut transliteration has to happen BEFORE compatibility
    # decomposition: NFKD splits 'Ä' into 'A' + U+0308, after which a
    # replacement keyed on the precomposed letter can never match. Composing
    # with NFC first makes the replacement work for both precomposed and
    # already-decomposed input.
    word = normalize('NFC', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    # NOTE: the three literals below must be the precomposed code points
    # (U+00C4, U+00D6, U+00DC).
    word = word.replace('Ä', 'AE')
    word = word.replace('Ö', 'OE')
    word = word.replace('Ü', 'UE')

    # decompose whatever is left and filter non-A-Z out
    word = normalize('NFKD', word)
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    variants = []
    if primary_only:
        variants = [word]
    else:
        # Split the word into segments; each segment is a tuple holding the
        # original letters followed by any alternative spelling of them.
        pos = 0
        if word[:2] == 'CH':
            variants.append(('CH', 'SCH'))
            pos += 2
        len_3_vars = {'OWN': 'AUN', 'WSK': 'RSK', 'SCH': 'CH', 'GLI': 'LI',
                      'AUX': 'O', 'EUX': 'O'}
        while pos < len(word):
            if word[pos:pos+4] == 'ILLE':
                variants.append(('ILLE', 'I'))
                pos += 4
            elif word[pos:pos+3] in len_3_vars:
                variants.append((word[pos:pos+3], len_3_vars[word[pos:pos+3]]))
                pos += 3
            elif word[pos:pos+2] == 'RB':
                variants.append(('RB', 'RW'))
                pos += 2
            elif len(word[pos:]) == 3 and word[pos:] == 'EAU':
                # only at the very end of the word
                variants.append(('EAU', 'O'))
                pos += 3
            elif len(word[pos:]) == 1 and word[pos:] in {'A', 'O'}:
                # only as the very last letter of the word
                if word[pos:] == 'O':
                    variants.append(('O', 'OW'))
                else:
                    variants.append(('A', 'AR'))
                pos += 1
            else:
                variants.append((word[pos],))
                pos += 1

        # Every combination of segment spellings is one variant of the word.
        variants = [''.join(letters) for letters in product(*variants)]

    def _haase_code(word):
        """Return the Haase code of a single spelling."""
        sdx = ''
        for i, char in enumerate(word):
            if char in _vowels:
                sdx += '9'
            elif char == 'B':
                sdx += '1'
            elif char == 'P':
                if _before(word, i, {'H'}):
                    sdx += '3'
                else:
                    sdx += '1'
            elif char in {'D', 'T'}:
                if _before(word, i, {'C', 'S', 'Z'}):
                    sdx += '8'
                else:
                    sdx += '2'
            elif char in {'F', 'V', 'W'}:
                sdx += '3'
            elif char in {'G', 'K', 'Q'}:
                sdx += '4'
            elif char == 'C':
                if _after(word, i, {'S', 'Z'}):
                    sdx += '8'
                elif i == 0:
                    # word-initial C additionally codes as 4 before L and R
                    if _before(word, i, {'A', 'H', 'K', 'L', 'O', 'Q', 'R',
                                         'U', 'X'}):
                        sdx += '4'
                    else:
                        sdx += '8'
                elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}):
                    sdx += '4'
                else:
                    sdx += '8'
            elif char == 'X':
                if _after(word, i, {'C', 'K', 'Q'}):
                    sdx += '8'
                else:
                    sdx += '48'
            elif char == 'L':
                sdx += '5'
            elif char in {'M', 'N'}:
                sdx += '6'
            elif char == 'R':
                sdx += '7'
            elif char in {'S', 'Z'}:
                sdx += '8'
            # any other letter (H) contributes nothing to the code

        sdx = _delete_consecutive_repeats(sdx)

        return sdx

    # Encode every variant, keeping only the first occurrence of each code.
    seen = set()
    unique_codes = []
    for variant in variants:
        code = _haase_code(variant)
        if code not in seen:
            seen.add(code)
            unique_codes.append(code)

    return tuple(unique_codes)
4838
4839
4840
def reth_schek_phonetik(word):
    """Return Reth-Schek Phonetik code for a word.

    This algorithm is proposed in :cite:`Reth:1977`.

    Since I couldn't secure a copy of that document (maybe I'll look for it
    next time I'm in Germany), this implementation is based on what I could
    glean from the implementations published by German Record Linkage
    Center (www.record-linkage.de):

    - Privacy-preserving Record Linkage (PPRL) (in R) :cite:`Rukasz:2018`
    - Merge ToolBox (in Java) :cite:`Schnell:2004`

    Rules that are unclear:

    - Should 'C' become 'G' or 'Z'? (PPRL has both, 'Z' rule blocked)
    - Should 'CC' become 'G'? (PPRL has blocked 'CK' that may be typo)
    - Should 'TUI' -> 'ZUI' rule exist? (PPRL has rule, but I can't
      think of a German word with '-tui-' in it.)
    - Should we really change 'SCH' -> 'CH' and then 'CH' -> 'SCH'?

    The word is upper-cased, umlauts and eszett are expanded, the
    substitution tables are applied left to right (longest match first),
    every 'CH' is rewritten to 'SCH', and finally a trailing 'ER', 'EL' or
    'H' is reduced.

    :param str word: the word to transform
    :returns: the Reth-Schek Phonetik code
    :rtype: str
    """
    # Substitution tables, ordered so that the longest match is tried first.
    rules_by_width = (
        (3, {'AEH': 'E', 'IEH': 'I', 'OEH': 'OE', 'UEH': 'UE',
             'SCH': 'CH', 'ZIO': 'TIO', 'TIU': 'TIO', 'ZIU': 'TIO',
             'CHS': 'X', 'CKS': 'X', 'AEU': 'OI'}),
        (2, {'LL': 'L', 'AA': 'A', 'AH': 'A', 'BB': 'B', 'PP': 'B',
             'BP': 'B', 'PB': 'B', 'DD': 'D', 'DT': 'D', 'TT': 'D',
             'TH': 'D', 'EE': 'E', 'EH': 'E', 'AE': 'E', 'FF': 'F',
             'PH': 'F', 'KK': 'K', 'GG': 'G', 'GK': 'G', 'KG': 'G',
             'CK': 'G', 'CC': 'C', 'IE': 'I', 'IH': 'I', 'MM': 'M',
             'NN': 'N', 'OO': 'O', 'OH': 'O', 'SZ': 'S', 'UH': 'U',
             'GS': 'X', 'KS': 'X', 'TZ': 'Z', 'AY': 'AI',
             'EI': 'AI', 'EY': 'AI', 'EU': 'OI', 'RR': 'R',
             'SS': 'S', 'KW': 'QU'}),
        (1, {'P': 'B', 'T': 'D', 'V': 'F', 'W': 'F', 'C': 'G',
             'K': 'G', 'Y': 'I'}),
    )

    word = word.upper()

    # Expand umlauts and eszett to their ASCII spellings.
    for special, expansion in (('Ä', 'AE'), ('Ö', 'OE'),
                               ('Ü', 'UE'), ('ß', 'SS')):
        word = word.replace(special, expansion)

    # Walk the word one position at a time. The cursor always advances by
    # exactly one character, whether or not a rule fired, so the tail of a
    # multi-character replacement is itself examined on the next step.
    cursor = 0
    while cursor < len(word):
        for width, table in rules_by_width:
            chunk = word[cursor:cursor+width]
            if chunk in table:
                word = ''.join((word[:cursor], table[chunk],
                                word[cursor+width:]))
                break
        cursor += 1

    # Change 'CH' back(?) to 'SCH'
    word = word.replace('CH', 'SCH')

    # Reduce final sequences; only the first matching rule applies.
    if word.endswith('ER'):
        word = word[:-2] + 'R'
    elif word.endswith('EL'):
        word = word[:-2] + 'L'
    elif word.endswith('H'):
        word = word[:-1]

    return word
4912
4913
4914
def fonem(word):
    """Return the FONEM code of a word.

    FONEM is a phonetic algorithm designed for French (particularly surnames in
    Saguenay, Canada), defined in :cite:`Bouchard:1981`.

    Guillaume Plique's Javascript implementation :cite:`Plique:2018` at
    https://github.com/Yomguithereal/talisman/blob/master/src/phonetics/french/fonem.js
    was also consulted for this implementation.

    The word is upper-cased, NFKD-normalized and stripped of everything
    except A-Z and the hyphen; the rewrite rules below are then applied one
    after another in a fixed order.

    :param str word: the word to transform
    :returns: the FONEM code
    :rtype: str
    """
    def _keep_captured(match):
        """Return whichever capture group of rule C-29 matched."""
        return (match.group(1) or '') + (match.group(2) or '')

    # I don't see a sane way of doing this without regexps :(
    # Each entry is (pattern, replacement); a plain string pattern is applied
    # with str.replace, a compiled pattern with its sub() method.
    rule_table = {
        # Vowels & groups of vowels
        'V-1': (re_compile('E?AU'), 'O'),
        'V-2,5': (re_compile('(E?AU|O)L[TX]$'), 'O'),
        'V-3,4': (re_compile('E?AU[TX]$'), 'O'),
        'V-6': (re_compile('E?AUL?D$'), 'O'),
        'V-7': (re_compile(r'(?<!G)AY$'), 'E'),
        'V-8': (re_compile('EUX$'), 'EU'),
        'V-9': (re_compile('EY(?=$|[BCDFGHJKLMNPQRSTVWXZ])'), 'E'),
        'V-10': ('Y', 'I'),
        'V-11': (re_compile('(?<=[AEIOUY])I(?=[AEIOUY])'), 'Y'),
        'V-12': (re_compile('(?<=[AEIOUY])ILL'), 'Y'),
        'V-13': (re_compile('OU(?=[AEOU]|I(?!LL))'), 'W'),
        'V-14': (re_compile(r'([AEIOUY])(?=\1)'), ''),
        # Nasal vowels
        'V-15': (re_compile('[AE]M(?=[BCDFGHJKLMPQRSTVWXZ])(?!$)'), 'EN'),
        'V-16': (re_compile('OM(?=[BCDFGHJKLMPQRSTVWXZ])'), 'ON'),
        'V-17': (re_compile('AN(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'EN'),
        'V-18': (re_compile('(AI[MN]|EIN)(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'),
                 'IN'),
        'V-19': (re_compile('B(O|U|OU)RNE?$'), 'BURN'),
        'V-20': (re_compile('(^IM|(?<=[BCDFGHJKLMNPQRSTVWXZ])'
                            'IM(?=[BCDFGHJKLMPQRSTVWXZ]))'), 'IN'),
        # Consonants and groups of consonants
        'C-1': ('BV', 'V'),
        'C-2': (re_compile('(?<=[AEIOUY])C(?=[EIY])'), 'SS'),
        'C-3': (re_compile('(?<=[BDFGHJKLMNPQRSTVWZ])C(?=[EIY])'), 'S'),
        'C-4': (re_compile('^C(?=[EIY])'), 'S'),
        'C-5': (re_compile('^C(?=[OUA])'), 'K'),
        'C-6': (re_compile('(?<=[AEIOUY])C$'), 'K'),
        'C-7': (re_compile('C(?=[BDFGJKLMNPQRSTVWXZ])'), 'K'),
        'C-8': (re_compile('CC(?=[AOU])'), 'K'),
        'C-9': (re_compile('CC(?=[EIY])'), 'X'),
        'C-10': (re_compile('G(?=[EIY])'), 'J'),
        'C-11': (re_compile('GA(?=I?[MN])'), 'G#'),
        'C-12': (re_compile('GE(O|AU)'), 'JO'),
        'C-13': (re_compile('GNI(?=[AEIOUY])'), 'GN'),
        'C-14': (re_compile('(?<![PCS])H'), ''),
        'C-15': ('JEA', 'JA'),
        'C-16': (re_compile('^MAC(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'MA#'),
        'C-17': (re_compile('^MC'), 'MA#'),
        'C-18': ('PH', 'F'),
        'C-19': ('QU', 'K'),
        'C-20': (re_compile('^SC(?=[EIY])'), 'S'),
        'C-21': (re_compile('(?<=.)SC(?=[EIY])'), 'SS'),
        'C-22': (re_compile('(?<=.)SC(?=[AOU])'), 'SK'),
        'C-23': ('SH', 'CH'),
        'C-24': (re_compile('TIA$'), 'SSIA'),
        'C-25': (re_compile('(?<=[AIOUY])W'), ''),
        'C-26': (re_compile('X[CSZ]'), 'X'),
        'C-27': (re_compile('(?<=[AEIOUY])Z|(?<=[BCDFGHJKLMNPQRSTVWXZ])'
                            'Z(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'S'),
        'C-28': (re_compile(r'([BDFGHJKMNPQRTVWXZ])\1'), r'\1'),
        'C-28a': (re_compile('CC(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'C'),
        'C-28b': (re_compile('((?<=[BCDFGHJKLMNPQRSTVWXZ])|^)SS'), 'S'),
        'C-28bb': (re_compile('SS(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'S'),
        'C-28c': (re_compile('((?<=[^I])|^)LL'), 'L'),
        'C-28d': (re_compile('ILE$'), 'ILLE'),
        'C-29': (re_compile('(ILS|[CS]H|[MN]P|R[CFKLNSX])$|([BCDFGHJKL'
                            'MNPQRSTVWXZ])[BCDFGHJKLMNPQRSTVWXZ]$'),
                 _keep_captured),
        'C-30,32': (re_compile('^(SA?INT?|SEI[NM]|CINQ?|ST)(?!E)-?'), 'ST-'),
        'C-31,33': (re_compile('^(SAINTE|STE)-?'), 'STE-'),
        # Rules to undo rule bleeding prevention in C-11, C-16, C-17
        'C-34': ('G#', 'GA'),
        'C-35': ('MA#', 'MAC'),
    }

    # The doubled-letter rules run both before and after the main rule set.
    degeminate = ('V-14', 'C-28', 'C-28a', 'C-28b', 'C-28bb', 'C-28c',
                  'C-28d')
    rule_order = degeminate + (
        'C-12',
        'C-8', 'C-9', 'C-10',
        'C-16', 'C-17', 'C-2', 'C-3', 'C-7',
        'V-2,5', 'V-3,4', 'V-6',
        'V-1', 'C-14',
        'C-31,33', 'C-30,32',
        'C-11', 'V-15', 'V-17', 'V-18',
        'V-7', 'V-8', 'V-9', 'V-10', 'V-11', 'V-12', 'V-13', 'V-16',
        'V-19', 'V-20',
        'C-1', 'C-4', 'C-5', 'C-6', 'C-13', 'C-15',
        'C-18', 'C-19', 'C-20', 'C-21', 'C-22', 'C-23', 'C-24',
        'C-25', 'C-26', 'C-27',
        'C-29',
    ) + degeminate + ('C-34', 'C-35')

    # normalize, upper-case, and filter non-French letters
    permitted = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ-')
    word = normalize('NFKD', text_type(word.upper()))
    word = word.translate({198: 'AE', 338: 'OE'})
    word = ''.join(letter for letter in word if letter in permitted)

    for rule_name in rule_order:
        pattern, replacement = rule_table[rule_name]
        if not isinstance(pattern, text_type):
            word = pattern.sub(replacement, word)
        else:
            word = word.replace(pattern, replacement)

    return word
5031
5032
5033
def parmar_kumbharana(word):
    """Return the Parmar-Kumbharana encoding of a word.

    This is based on the phonetic algorithm proposed in :cite:`Parmar:2014`.

    The word is upper-cased (Rule 3), consecutive repeated characters are
    collapsed (Rule 4), the multi-letter substitutions in ``rule_table`` are
    applied left to right with the longest match first (Rule 5), and finally
    every vowel (A, E, I, O, U, Y) except a leading one is removed (Rule 6).

    :param str word: the word to transform
    :returns: the Parmar-Kumbharana encoding; an empty word yields an empty
        string
    :rtype: str
    """
    rule_table = {4: {'OUGH': 'F'},
                  3: {'DGE': 'J',
                      'OUL': 'U',
                      'GHT': 'T'},
                  2: {'CE': 'S', 'CI': 'S', 'CY': 'S',
                      'GE': 'J', 'GI': 'J', 'GY': 'J',
                      'WR': 'R',
                      'GN': 'N', 'KN': 'N', 'PN': 'N',
                      'CK': 'K',
                      'SH': 'S'}}
    # Code points of A, E, I, O, U, Y mapped to '' so translate() drops them.
    vowel_trans = {65: '', 69: '', 73: '', 79: '', 85: '', 89: ''}

    word = word.upper()  # Rule 3
    word = _delete_consecutive_repeats(word)  # Rule 4

    # Rule 5
    i = 0
    while i < len(word):
        for match_len in range(4, 1, -1):
            if word[i:i+match_len] in rule_table[match_len]:
                repl = rule_table[match_len][word[i:i+match_len]]
                word = (word[:i] + repl + word[i+match_len:])
                # Resume scanning just after the inserted replacement.
                i += len(repl)
                break
        else:
            i += 1

    # Rule 6. Slice rather than index the first character: word[0] raised
    # IndexError when the input was empty.
    word = word[:1] + word[1:].translate(vowel_trans)
    return word
5070
5071
5072
def davidson(lname, fname='.', omit_fname=False):
    """Return Davidson's Consonant Code.

    This is based on the name compression system described in
    :cite:`Davidson:1962`.

    :cite:`Dolby:1970` identifies this as having been the name compression
    algorithm used by SABRE.

    The last name is upper-cased; after its first character every A, E, I,
    O, U, H, W and Y is removed, consecutive repeats are collapsed, and the
    result is cut or space-padded to exactly four characters. Unless
    ``omit_fname`` is set, the upper-cased first character of ``fname`` is
    appended.

    :param str lname: Last name (or word) to be encoded
    :param str fname: First name (optional), of which the first character is
        included in the code.
    :param bool omit_fname: Set to True to completely omit the first character
        of the first name
    :return: Davidson's Consonant Code
    :rtype: str
    """
    # Characters removed everywhere except in the leading position.
    removable = {ord(letter): '' for letter in 'AEIOUHWY'}

    surname = text_type(lname.upper())
    head, tail = surname[:1], surname[1:]
    squeezed = _delete_consecutive_repeats(head + tail.translate(removable))

    # Exactly four characters: truncate long codes, space-pad short ones.
    code = squeezed[:4].ljust(4)

    if omit_fname:
        return code
    return code + fname[:1].upper()
5098
5099
5100
def sound_d(word, maxlength=4):
    """Return the SoundD code.

    SoundD is defined in :cite:`Varol:2012`.

    The word is upper-cased, NFKD-normalized and reduced to the letters A-Z.
    A few initial clusters are simplified, 'DGE'/'DGI'/'GH' are pre-coded,
    the remaining letters are mapped to digits, consecutive repeats are
    collapsed and all zeros are removed. Unlike classic Soundex, the result
    consists of digits only (the initial letter is encoded as well).

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 4);
        shorter codes are right-padded with '0' and longer ones truncated.
        Pass None to return the code at its natural length.
    :return: the SoundD code
    :rtype: str
    """
    digit_of = {ord(letter): digit for letter, digit in
                zip('ABCDEFGHIJKLMNOPQRSTUVWXYZ',
                    '01230120022455012623010202')}
    alphabet = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(letter for letter in word if letter in alphabet)

    # Initial-cluster simplification; only the first matching rule applies.
    if word.startswith(('KN', 'GN', 'PN', 'AC', 'WR')):
        word = word[1:]
    elif word.startswith('X'):
        word = 'S' + word[1:]
    elif word.startswith('WH'):
        word = 'W' + word[2:]

    # Pre-code these clusters as digits before the letter translation.
    for cluster, digits in (('DGE', '20'), ('DGI', '20'), ('GH', '0')):
        word = word.replace(cluster, digits)

    word = word.translate(digit_of)
    word = _delete_consecutive_repeats(word)
    word = word.replace('0', '')

    if maxlength is not None:
        # Truncate to maxlength, then zero-pad up to it.
        word = word[:maxlength].ljust(maxlength, '0')

    return word
5140
5141
5142
def pshp_soundex_last(lname, maxlength=4, german=False):
    """Calculate the PSHP Soundex/Viewex Coding of a last name.

    This coding is based on :cite:`Hershberg:1976`.

    Reference was also made to the German version of the same:
    :cite:`Hershberg:1979`.

    A separate function, pshp_soundex_first() is used for first names.

    :param str lname: the last name to encode
    :param int maxlength: the length of the code returned (defaults to 4);
        shorter codes are right-padded with '0' and longer ones truncated.
        Pass None to return the code at its natural length.
    :param bool german: set to True if the name is German (different rules
        apply)
    :return: the PSHP Soundex/Viewex Coding
    :rtype: str
    """
    alphabet = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    lname = normalize('NFKD', text_type(lname.upper()))
    lname = lname.replace('ß', 'SS')
    lname = ''.join(letter for letter in lname if letter in alphabet)

    # A. Prefix treatment
    if lname.startswith(('VON', 'VAN')):
        lname = lname[3:].strip()

    # The rule implemented below says "MC, MAC become 1". I believe it meant to
    # say they become M except in German data (where superscripted 1 indicates
    # "except in German data"). It doesn't make sense for them to become 1
    # (BPFV -> 1) or to apply outside German. Unfortunately, both articles have
    # this error(?).
    if not german:
        for clan_prefix in ('MAC', 'MC'):
            if lname.startswith(clan_prefix):
                lname = 'M' + lname[len(clan_prefix):]
                break

    # The non-German-only rule to strip ' is unnecessary due to filtering

    # Recode the initial letter; only the first matching rule applies.
    if lname.startswith(('E', 'I', 'O', 'U')):
        lname = 'A' + lname[1:]
    elif lname.startswith(('GE', 'GI', 'GY')):
        lname = 'J' + lname[1:]
    elif lname.startswith(('CE', 'CI', 'CY')):
        lname = 'S' + lname[1:]
    elif lname.startswith('C') and (lname.startswith('CHR') or
                                    not lname.startswith('CH')):
        # 'CHR...' and any 'C' not followed by 'H' become 'K'.
        lname = 'K' + lname[1:]

    if lname.startswith('KN'):
        lname = 'N' + lname[1:]
    elif lname.startswith('PH'):
        lname = 'F' + lname[1:]
    elif lname.startswith(('WIE', 'WEI')):
        lname = 'V' + lname[1:]

    if german:
        german_initials = {'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}
        initial = lname[:1]
        lname = german_initials.get(initial, initial) + lname[1:]

    # The leading letter of the code is fixed before any suffix handling.
    code = lname[:1]

    # B. Postfix treatment
    if german:  # moved from end of postfix treatment due to blocking
        # Three independent steps; in each, the first suffix wins.
        for preferred, fallback in (('TES', 'TS'), ('TZE', 'ZE'),
                                    ('Z', 'TE')):
            if lname.endswith(preferred):
                lname = lname[:-len(preferred)]
            elif lname.endswith(fallback):
                lname = lname[:-len(fallback)]

    if lname.endswith('R'):
        lname = lname[:-1] + 'N'
    elif lname.endswith(('SE', 'CE')):
        lname = lname[:-2]
    if lname.endswith('SS'):
        lname = lname[:-2]
    elif lname.endswith('S'):
        lname = lname[:-1]

    if not german:
        # Longer endings take precedence; at most one replacement is made.
        ending_tables = (
            (5, {'STOWN': 'SAWON', 'MPSON': 'MASON'}),
            (4, {'NSEN': 'ASEN', 'MSON': 'ASON', 'STEN': 'SAEN',
                 'STON': 'SAON'}),
        )
        for width, table in ending_tables:
            ending = lname[-width:]
            if ending in table:
                lname = lname[:-width] + table[ending]
                break

    if lname.endswith(('NG', 'ND')):
        lname = lname[:-1]
    if not german and lname.endswith(('GAN', 'GEN')):
        lname = lname[:-3] + 'A' + lname[-2:]

    # C. Infix Treatment (order matters: each replacement sees the previous)
    for cluster, reduced in (('CK', 'C'), ('SCH', 'S'), ('DT', 'T'),
                             ('ND', 'N'), ('NG', 'N'), ('LM', 'M'),
                             ('MN', 'M'), ('WIE', 'VIE'), ('WEI', 'VEI')):
        lname = lname.replace(cluster, reduced)

    # D. Soundexing
    # code for X & Y are unspecified, but presumably are 2 & 0
    digit_of = {ord(letter): digit for letter, digit in
                zip('ABCDEFGHIJKLMNOPQRSTUVWXYZ',
                    '01230120022455012523010202')}

    lname = lname.translate(digit_of)
    lname = _delete_consecutive_repeats(lname)

    code += lname[1:]
    code = code.replace('0', '')  # rule 1

    if maxlength is not None:
        # Truncate to maxlength, then zero-pad up to it.
        code = code[:maxlength].ljust(maxlength, '0')

    return code
5271
5272
5273
def pshp_soundex_first(fname, maxlength=4, german=False):
    """Calculate the PSHP Soundex/Viewex Coding of a first name.

    This coding is based on :cite:`Hershberg:1976`.

    Reference was also made to the German version of the same:
    :cite:`Hershberg:1979`.

    A separate function, pshp_soundex_last() is used for last names.

    :param str fname: the first name to encode
    :param int maxlength: the length of the code returned (defaults to 4);
        shorter codes are right-padded with '0' and longer ones truncated.
        Pass None to return the code at its natural length.
    :param bool german: set to True if the name is German (different rules
        apply)
    :return: the PSHP Soundex/Viewex Coding
    :rtype: str
    """
    alphabet = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    # special rules: these names have fixed codes
    fixed_codes = {'JAMES': 'J7', 'PAT': 'P7'}

    fname = normalize('NFKD', text_type(fname.upper()))
    fname = fname.replace('ß', 'SS')
    fname = ''.join(letter for letter in fname if letter in alphabet)

    if fname in fixed_codes:
        code = fixed_codes[fname]
    else:
        # A. Prefix treatment; in each chain the first matching rule applies.
        if fname.startswith(('GE', 'GI', 'GY')):
            fname = 'J' + fname[1:]
        elif fname.startswith(('CE', 'CI', 'CY')):
            fname = 'S' + fname[1:]
        elif fname.startswith('C') and (fname.startswith('CHR') or
                                        not fname.startswith('CH')):
            # 'CHR...' and any 'C' not followed by 'H' become 'K'.
            fname = 'K' + fname[1:]

        if fname.startswith('KN'):
            fname = 'N' + fname[1:]
        elif fname.startswith('PH'):
            fname = 'F' + fname[1:]
        elif fname.startswith(('WIE', 'WEI')):
            fname = 'V' + fname[1:]

        if german:
            german_initials = {'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}
            initial = fname[:1]
            fname = german_initials.get(initial, initial) + fname[1:]

        code = fname[:1]

        # B. Soundex coding
        # code for Y unspecified, but presumably is 0
        digit_of = {ord(letter): digit for letter, digit in
                    zip('ABCDEFGHIJKLMNOPQRSTUVWXYZ',
                        '01230120022455012523010202')}

        fname = fname.translate(digit_of)
        fname = _delete_consecutive_repeats(fname)

        code += fname[1:]

        # first_zero is an absolute index into code; next_zero is an offset
        # relative to the character after first_zero. The code is cut to
        # first_zero + 2 characters when both zeros exist and the offset is
        # not smaller than first_zero. This is the same test as the
        # three-clause form "a != -1 and b != -1 and b - a > -1", since
        # str.find returns -1 or a non-negative index.
        first_zero = code.find('0')
        next_zero = code[first_zero + 1:].find('0')
        if 0 <= first_zero <= next_zero:
            code = code[:first_zero + 2]

        code = code.replace('0', '')  # rule 1

    if maxlength is not None:
        # Truncate to maxlength, then zero-pad up to it.
        code = code[:maxlength].ljust(maxlength, '0')

    return code
5348
5349
5350
def henry_early(word, maxlength=3):
    """Calculate the early version of the Henry code for a word.

    The early version of Henry coding is given in :cite:`Legare:1972`. This is
    different from the later version defined in :cite:`Henry:1976`.

    The word is upper-cased, NFKD-normalized and reduced to the letters A-Z.
    An initial vowel is normalized (Rule Ib), the word is recoded letter by
    letter (Rule II), certain endings are trimmed (Rule IIe), and all vowels
    other than a leading one are dropped.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 3);
        longer codes are truncated, shorter codes are not padded. Pass None
        to disable truncation.
    :return: the early Henry code (empty if the word contains no letters)
    :rtype: str
    """
    consonants = {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P',
                  'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Z'}
    vowels = {'A', 'E', 'I', 'O', 'U', 'Y'}
    diphthongs = {'AI': 'E', 'AY': 'E', 'EI': 'E', 'AU': 'O', 'OI': 'O',
                  'OU': 'O', 'EU': 'U'}
    # Letters B, D, F, J, K, L, M, N, R, T, V are left unaltered.
    simple = {'W': 'V', 'X': 'S', 'Z': 'S'}
    nasals = {'M', 'N'}
    # Derived sets, computed once instead of on every loop iteration.
    non_nasal = consonants - nasals
    non_liquid = consonants - {'L', 'R'}
    alphabet = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    word = normalize('NFKD', text_type(word.upper()))
    word = ''.join(letter for letter in word if letter in alphabet)

    if not word:
        return ''

    # Rule Ia seems to be covered entirely in II

    # Rule Ib: normalize an initial vowel
    first, second, third = word[0], word[1:2], word[2:3]
    if first in vowels:
        # Ib1
        if ((second in non_nasal and third in consonants) or
                (second in consonants and third not in consonants)):
            if first == 'Y':
                word = 'I' + word[1:]
        # Ib2
        elif second in nasals and third in consonants:
            if first == 'E':
                word = 'A' + word[1:]
            elif first in {'I', 'U', 'Y'}:
                word = 'E' + word[1:]
        # Ib3
        elif word[:2] in diphthongs:
            word = diphthongs[word[:2]] + word[2:]
        # Ib4
        elif second in vowels and first == 'Y':
            word = 'I' + word[1:]

    code = ''
    skip = 0

    # Rule II
    for pos, char in enumerate(word):
        nxch = word[pos+1:pos+2]
        prev = word[pos-1:pos]  # empty at pos 0

        if skip:
            skip -= 1
        elif char in vowels:
            code += char
        # IIc: a doubled consonant is emitted once
        elif char == nxch:
            skip = 1
            code += char
        elif word[pos:pos+2] in {'CQ', 'DT', 'SC'}:
            continue
        # IIb
        elif char in simple:
            code += simple[char]
        elif char == 'C':
            if nxch in {'A', 'O', 'U', 'L', 'R'}:
                code += 'K'
            elif nxch in {'E', 'I', 'Y'}:
                code += 'S'
            elif nxch == 'H':
                # CH + vowel stays C; CHR, CHL, etc. become K
                code += 'C' if word[pos+2:pos+3] in vowels else 'K'
            else:
                code += 'C'
        elif char == 'G':
            # G before anything else contributes nothing
            if nxch in {'A', 'O', 'U', 'L', 'R'}:
                code += 'G'
            elif nxch in {'E', 'I', 'Y'}:
                code += 'J'
            elif nxch == 'N':
                code += 'N'
        elif char == 'P':
            code += 'F' if nxch == 'H' else 'P'
        elif char == 'Q':
            # QUE, QUI, QUY -> G; QUA, QUO, etc. -> K
            code += 'G' if word[pos+1:pos+3] in {'UE', 'UI', 'UY'} else 'K'
        elif char == 'S':
            # Longest "saint" spelling first; skip counts the letters
            # that follow the S and are consumed with it.
            for token, consumed in (('SAINTE', 5), ('SAINT', 4),
                                    ('STE', 2), ('ST', 1)):
                if word.startswith(token, pos):
                    code += 'X'
                    skip = consumed
                    break
            else:
                if nxch not in consonants:
                    code += 'S'
        # IId: consonants that are dropped
        elif ((char == 'H' and prev in consonants) or
              (char in non_liquid and nxch in non_liquid) or
              (char == 'L' and nxch in nasals) or
              (char in nasals and prev in vowels and nxch in consonants)):
            continue
        # IIa
        else:
            code += char

    # IIe1 / IIe2: trim endings; only the first matching rule applies.
    # (The vowel + MPS and vowel + MB/MP/ND/NS/NT ending rules are blocked
    # by the rules above, so they are not implemented.)
    penult, last = code[-2:-1], code[-1:]
    if code.endswith(('AULT', 'EULT', 'OULT')):
        code = code[:-2]
    elif ((penult == 'R' and last in consonants) or
          (penult in vowels and last in {'D', 'M', 'N', 'S', 'T'}) or
          code.endswith('ER')):
        code = code[:-1]

    # Drop non-initial vowels
    code = code[:1] + ''.join(letter for letter in code[1:]
                              if letter not in vowels)

    if maxlength is not None:
        code = code[:maxlength]

    return code
5504
5505
5506
def norphone(word):
    """Return the Norphone code.

    The reference implementation by Lars Marius Garshol is available in
    :cite:`Garshol:2015`.

    Norphone was designed for Norwegian, but this implementation has been
    extended to support Swedish vowels as well. This function incorporates
    the "not implemented" rules from the above file's rule set.

    The word is upper-cased, a special initial cluster (if any) is encoded,
    a final 'DT' or vowel + 'D' is reduced, the remaining letters are
    rewritten with the longest matching replacement, non-initial vowels are
    dropped, and consecutive repeats in the code are collapsed.

    :param str word: the word to transform
    :return: the Norphone code
    :rtype: str
    """
    vowels = {'A', 'E', 'I', 'O', 'U', 'Y', 'Å', 'Æ', 'Ø', 'Ä', 'Ö'}

    # Word-initial clusters and their codes, in priority order. A match
    # consumes as many characters as the cluster is long.
    initial_rules = (('AA', 'Å'), ('GI', 'J'), ('SKY', 'X'), ('EI', 'Æ'),
                     ('KY', 'X'), ('C', 'K'), ('Ä', 'Æ'), ('Ö', 'Ø'))

    # General replacements, tried longest first at every position.
    replacements = (
        (4, {'SKEI': 'X'}),
        (3, {'SKJ': 'X', 'KEI': 'X'}),
        (2, {'CH': 'K', 'CK': 'K', 'GJ': 'J', 'GH': 'K', 'HG': 'K',
             'HJ': 'J', 'HL': 'L', 'HR': 'R', 'KJ': 'X', 'KI': 'X',
             'LD': 'L', 'ND': 'N', 'PH': 'F', 'TH': 'T', 'SJ': 'X'}),
        (1, {'W': 'V', 'X': 'KS', 'Z': 'S', 'D': 'T', 'G': 'K'}),
    )

    word = word.upper()

    code = ''
    skip = 0

    for cluster, encoded in initial_rules:
        if word.startswith(cluster):
            code = encoded
            skip = len(cluster)
            break

    if word.endswith('DT'):
        word = word[:-2] + 'T'
    # Though the rules indicate this rule applies in all positions, the
    # reference implementation indicates it applies only in final position.
    elif word[-2:-1] in vowels and word[-1:] == 'D':
        word = word[:-2]

    for pos, char in enumerate(word):
        if skip:
            skip -= 1
            continue
        for width, table in replacements:
            chunk = word[pos:pos+width]
            if chunk in table:
                code += table[chunk]
                skip = width - 1
                break
        else:
            # No rule matched: keep the letter unless it is a
            # non-initial vowel.
            if pos == 0 or char not in vowels:
                code += char

    code = _delete_consecutive_repeats(code)

    return code
5581
5582
5583
def dolby(word, maxlength=None, keep_vowels=False, vowel_char='*'):
    r"""Return the Dolby Code of a name.

    This follows "A Spelling Equivalent Abbreviation Algorithm For Personal
    Names" from :cite:`Dolby:1970` and :cite:`Cunningham:1969`.

    :param word: the word to encode
    :param maxlength: maximum length of the returned Dolby code -- this also
        activates the fixed-length code mode
    :param keep_vowels: if True, retains all vowel markers
    :param vowel_char: the vowel marker character (default to \*)
    :return: the Dolby code; in fixed-length mode it is truncated or
        space-padded to exactly ``maxlength`` characters
    """
    vowel_set = {'A', 'E', 'I', 'O', 'U', 'Y'}
    alphabet = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                'Y', 'Z'}
    # second letter of each of these pairs is dropped by Rule 2
    reducible_pairs = {'DT', 'LD', 'ND', 'NT', 'RC', 'RD', 'RT', 'SC', 'SK',
                       'ST'}
    # letters that may stand directly before a K without being dropped
    allowed_before_k = vowel_set | {'L', 'N', 'R'}

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(letter for letter in word if letter in alphabet)

    # Rule 1 (FL2)
    if word.startswith(('MCG', 'MAG', 'MAC')):
        word = 'MK' + word[3:]
    elif word.startswith('MC'):
        word = 'MK' + word[2:]

    # Rule 2 (FL3): scan right-to-left; after a deletion the same position
    # is examined again because a new pair may have formed there
    idx = len(word) - 2
    while idx >= 0:
        if word[idx:idx+2] in reducible_pairs:
            word = word[:idx+1] + word[idx+2:]
        else:
            idx -= 1

    # Rule 3 (FL4)
    # Although the rule indicates "after the first letter", the test cases make
    # it clear that these apply to the first letter also.
    # The final TCH -> CH entry is not in the rule set, but they seem to have
    # intended it.
    for src, tar in (('X', 'KS'), ('CE', 'SE'), ('CI', 'SI'), ('CY', 'SI'),
                     ('TCH', 'CH')):
        word = word.replace(src, tar)

    # non-initial CH that does not follow a vowel becomes SH
    idx = word.find('CH', 1)
    while idx != -1:
        if word[idx-1] not in vowel_set:
            word = word[:idx] + 'S' + word[idx+1:]
        idx = word.find('CH', idx+1)

    # order matters: each substitution sees the output of the previous one
    for src, tar in (('C', 'K'), ('Z', 'S'), ('WR', 'R'), ('DG', 'G'),
                     ('QU', 'K'), ('T', 'D'), ('PH', 'F')):
        word = word.replace(src, tar)

    # Rule 4 (FL5)
    # Although the rule indicates "after the first letter", the test cases make
    # it clear that these apply to the first letter also.
    idx = word.find('K', 0)
    while idx != -1:
        if idx > 1 and word[idx-1] not in allowed_before_k:
            # drop the letter in front of the K; the K shifts left by one
            word = word[:idx-1] + word[idx:]
            idx -= 1
        idx = word.find('K', idx+1)

    # Rule FL6
    if maxlength and word.endswith('E'):
        word = word[:-1]

    # Rule 5 (FL7)
    word = _delete_consecutive_repeats(word)

    # Rule 6 (FL8)
    if word.startswith('PF'):
        word = word[1:]
    if word.endswith('PF'):
        word = word[:-1]
    elif word.endswith('GH'):
        word = word[:-2] + ('F' if word[-3:-2] in vowel_set else 'G')
    word = word.replace('GH', '')

    # Rule FL9
    if maxlength:
        word = word.replace('V', 'F')

    # Rules 7-9 (FL10-FL12): the first vowel (first two in fixed-length mode)
    # becomes a vowel marker, later vowels and non-initial W/H are dropped
    markers_left = 2 if maxlength else 1
    pieces = []
    for idx, letter in enumerate(word):
        if letter in vowel_set:
            if markers_left or keep_vowels:
                pieces.append(vowel_char)
                markers_left -= 1
        elif idx == 0 or letter not in {'W', 'H'}:
            pieces.append(letter)
    code = ''.join(pieces)

    if maxlength:
        # Rule FL13
        if len(code) > maxlength and code.endswith('S'):
            code = code[:-1]
        if keep_vowels:
            code = code[:maxlength]
        else:
            # Rule FL14
            code = code[:maxlength + 2]
            # Rule FL15
            while len(code) > maxlength:
                keep_markers = len(code) - maxlength
                excess = keep_markers - 1
                previous = code
                code = ''
                for symbol in previous:
                    if symbol != vowel_char:
                        code += symbol
                    elif keep_markers:
                        code += symbol
                        keep_markers -= 1
                code = code[:maxlength + excess]

        # Rule FL16
        code += ' ' * (maxlength - len(code))

    return code
5723
5724
5725
def phonetic_spanish(word, maxlength=None):
    """Return the PhoneticSpanish coding of word.

    This follows the coding described in :cite:`Amon:2012` and
    :cite:`delPilarAngeles:2015`.

    The word is uppercased and decomposed, everything except the coded
    consonants is discarded, doubled L and doubled R are each reduced to a
    single letter, and every remaining consonant is replaced by its digit.

    :param word: the word to encode
    :param maxlength: if set (and non-zero), the code is truncated to this
        many digits
    :return: the PhoneticSpanish code, a string of digits
    """
    _es_soundex_translation = dict(zip((ord(_) for _ in
                                        'BCDFGHJKLMNPQRSTVXYZ'),
                                       '14328287566079431454'))

    # uppercase, normalize, and decompose, filter to A-Z minus vowels & W
    word = normalize('NFKD', text_type(word.upper()))
    word = ''.join(c for c in word if c in
                   {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N',
                    'P', 'Q', 'R', 'S', 'T', 'V', 'X', 'Y', 'Z'})

    # merge repeated Ls & Rs
    # (the R case previously replaced 'R' with 'R', which did nothing)
    word = word.replace('LL', 'L')
    word = word.replace('RR', 'R')

    # apply the Soundex algorithm
    sdx = word.translate(_es_soundex_translation)

    if maxlength:
        sdx = sdx[:maxlength]

    return sdx
5755
5756
5757
def spanish_metaphone(word, maxlength=6, modified=False):
    """Return the Spanish Metaphone of a word.

    This is a quick rewrite of the Spanish Metaphone Algorithm, as presented at
    https://github.com/amsqr/Spanish-Metaphone and discussed in
    :cite:`Mosquera:2012`.

    Modified version based on :cite:`delPilarAngeles:2016`.

    :param word: the word to encode
    :param maxlength: encoding stops once the key has reached this length
    :param modified: Set to True to use del Pilar Angeles & Bailón-Miguel's
        modified version of the algorithm
    :return: the Spanish Metaphone key
    """
    vowels = {'A', 'E', 'I', 'O', 'U'}
    soft_vowels = {'E', 'I'}
    # consonants that are copied to the key unchanged
    plain_consonants = {'D', 'F', 'J', 'K', 'M', 'N', 'P', 'T', 'V', 'L',
                        'Y'}

    word = normalize('NFC', text_type(word.upper()))

    # do some replacements for the modified version
    if modified:
        for old, new in (('MB', 'NB'), ('MP', 'NP'), ('BS', 'S')):
            word = word.replace(old, new)
        if word.startswith('PS'):
            word = word[1:]

    # simple replacements, applied in this exact order
    for old, new in (('Á', 'A'), ('CH', 'X'), ('Ç', 'S'), ('É', 'E'),
                     ('Í', 'I'), ('Ó', 'O'), ('Ú', 'U'), ('Ñ', 'NY'),
                     ('GÜ', 'W'), ('Ü', 'U'), ('B', 'V'), ('LL', 'Y')):
        word = word.replace(old, new)

    key = ''
    idx = 0
    length = len(word)

    while len(key) < maxlength and idx < length:
        letter = word[idx]
        # the next character, or '' at the end of the word
        following = word[idx+1:idx+2]
        # how many characters this iteration consumes
        step = 1

        if idx == 0 and letter in vowels:
            # a vowel is only kept when it starts the word
            key += letter
        elif letter in plain_consonants:
            key += letter
            # skip doubled consonants
            if following == letter:
                step = 2
        elif letter == 'C':
            if following == 'C':
                # special case 'acción', 'reacción', etc.
                key += 'X'
                step = 2
            elif following in soft_vowels:
                # special case 'cesar', 'cien', 'cid', 'conciencia'
                key += 'Z'
                step = 2
            else:
                key += 'K'
        elif letter == 'G':
            if following in soft_vowels:
                # special case 'gente', 'ecologia', etc.
                key += 'J'
                step = 2
            else:
                key += 'G'
        elif letter == 'H':
            # since the letter 'H' is silent in Spanish, the key takes the
            # vowel after the 'H' instead
            if following in vowels:
                key += following
                step = 2
            else:
                key += 'H'
        elif letter == 'Q':
            key += 'K'
            if following == 'U':
                step = 2
        elif letter == 'W':
            key += 'U'
        elif letter in {'R', 'Z'}:
            key += letter
        elif letter == 'S':
            # word-initial S before a non-vowel gets a leading E
            if idx == 0 and following not in vowels:
                key += 'ES'
            else:
                key += 'S'
        elif letter == 'X':
            # word-initial X before a non-vowel gets a leading E
            if length > 1 and idx == 0 and following not in vowels:
                key += 'EX'
            else:
                key += 'X'
        # anything else (non-initial vowels, other symbols) adds nothing

        idx += step

    # Final change from S to Z in modified version
    if modified:
        key = key.replace('S', 'Z')

    return key
5896
5897
5898
def metasoundex(word, language='en'):
    """Return the MetaSoundex code for a word.

    This is based on :cite:`Koneru:2017`.

    For Spanish the result is the PhoneticSpanish code of the word's Spanish
    Metaphone. For any other language value the result is the Soundex code of
    the word's Metaphone, with its first character translated to a digit.

    :param word: the word to encode
    :param language: either 'en' for English or 'es' for Spanish
    :return: the MetaSoundex code
    """
    if language == 'es':
        return phonetic_spanish(spanish_metaphone(word))

    # digit assigned to each possible leading letter of the Soundex code
    leading_letter_codes = {ord(letter): digit for letter, digit in
                            zip('ABCDEFGHIJKLMNOPQRSTUVWXYZ',
                                '07430755015866075943077514')}

    encoded = soundex(metaphone(word))
    return encoded[0].translate(leading_letter_codes) + encoded[1:]
5918
5919
5920
def soundex_br(word, maxlength=4, zero_pad=True):
    """Return the SoundexBR encoding of a word.

    This is based on :cite:`Marcelino:2015`.

    :param word: the word to encode
    :param maxlength: the length the returned code is truncated to
    :param zero_pad: if True, the code is right-padded with zeros before
        truncation
    :return: the SoundexBR code
    """
    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    digit_for = {ord(letter): digit for letter, digit in
                 zip(alphabet, '01230120022455012623010202')}
    letters = set(alphabet)

    # uppercase, decompose, and keep only A-Z
    word = normalize('NFKD', text_type(word.upper()))
    word = ''.join(char for char in word if char in letters)

    # choose the leading character of the code from the first two letters
    head = word[:1]
    second = word[1:2]
    if word[:2] == 'WA':
        first = 'V'
    elif head == 'K' and second in {'A', 'O', 'U'}:
        first = 'C'
    elif head == 'C' and second in {'I', 'E'}:
        first = 'S'
    elif head == 'G' and second in {'E', 'I'}:
        first = 'J'
    elif head == 'Y':
        first = 'I'
    elif head == 'H':
        # drop the initial H and lead with the letter that follows it
        first = second
        word = word[1:]
    else:
        first = head

    # everything after the first letter is translated to digits
    sdx = first + word[1:].translate(digit_for)
    sdx = _delete_consecutive_repeats(sdx)
    sdx = sdx.replace('0', '')

    if zero_pad:
        sdx += '0' * maxlength

    return sdx[:maxlength]
5962
5963
5964
def nrl(word):
    """Return the Naval Research Laboratory phonetic encoding of a word.

    This is defined by :cite:`Elovitz:1976`.

    The word is uppercased and scanned left to right. At each position the
    rules for the current letter are tried in order; a rule is a
    ``(left, match, right, out)`` tuple and applies when the text at the
    current position starts with ``match``, the text before it satisfies the
    ``left`` context and the text after ``match`` satisfies the ``right``
    context. The first applicable rule contributes ``out`` to the result and
    consumes ``match``. Characters with no rule set of their own use the
    rules stored under the space key; if no rule applies at all, the
    character is copied through unchanged.

    :param word: the word to transform
    :return: the NRL phonetic encoding of the word
    """
    def to_regex(pattern, left=True):
        """Convert a rule context in NRL notation to a regular expression.

        The wildcard symbols are: ``#`` one or more vowels, ``:`` zero or
        more consonants, ``^`` one consonant, ``.`` one of BDVGJLMNTWZ,
        ``%`` one of the suffixes ER, E, ES, ED, ING, ELY, ``+`` one of E, I,
        Y, and a space for the word boundary. Any other character stands for
        itself.

        :param pattern: the context in NRL rule notation
        :param left: True for a left context (matched against the text
            before the current position), False for a right context
        :return: a regular expression string, intended for ``re.match``
        """
        new_pattern = ''
        replacements = {'#': '[AEIOU]+',
                        ':': '[BCDFGHJKLMNPQRSTVWXYZ]*',
                        '^': '[BCDFGHJKLMNPQRSTVWXYZ]',
                        '.': '[BDVGJLMNTWZ]',
                        '%': '(ER|E|ES|ED|ING|ELY)',
                        '+': '[EIY]',
                        ' ': '^'}
        for char in pattern:
            new_pattern += (replacements[char] if char in replacements
                            else char)

        if left:
            # a left context must end exactly at the current position
            new_pattern += '$'
            # Unless the context is tied to the start of the word (a space
            # in the rule notation), it may be preceded by anything. The
            # test must look for the space marker: '^' in rule notation is
            # the one-consonant wildcard, not an anchor.
            if ' ' not in pattern:
                new_pattern = '^.*' + new_pattern
        else:
            # in a right context the word-boundary marker is the end of word
            new_pattern = '^' + new_pattern.replace('^', '$')
            if '$' not in new_pattern:
                new_pattern += '.*$'

        return new_pattern

    rules = {' ': (('', ' ', '', ' '),
                   ('', '-', '', ''),
                   ('.', '\'S', '', 'z'),
                   ('#:.E', '\'S', '', 'z'),
                   ('#', '\'S', '', 'z'),
                   ('', '\'', '', ''),
                   ('', ',', '', ' '),
                   ('', '.', '', ' '),
                   ('', '?', '', ' '),
                   ('', '!', '', ' ')),
             'A': (('', 'A', ' ', 'AX'),
                   (' ', 'ARE', ' ', 'AAr'),
                   (' ', 'AR', 'O', 'AXr'),
                   ('', 'AR', '#', 'EHr'),
                   ('^', 'AS', '#', 'EYs'),
                   ('', 'A', 'WA', 'AX'),
                   ('', 'AW', '', 'AO'),
                   (' :', 'ANY', '', 'EHnIY'),
                   ('', 'A', '^+#', 'EY'),
                   ('#:', 'ALLY', '', 'AXlIY'),
                   (' ', 'AL', '#', 'AXl'),
                   ('', 'AGAIN', '', 'AXgEHn'),
                   ('#:', 'AG', 'E', 'IHj'),
                   ('', 'A', '^+:#', 'AE'),
                   (' :', 'A', '^+ ', 'EY'),
                   ('', 'A', '^%', 'EY'),
                   (' ', 'ARR', '', 'AXr'),
                   ('', 'ARR', '', 'AEr'),
                   (' :', 'AR', ' ', 'AAr'),
                   ('', 'AR', ' ', 'ER'),
                   ('', 'AR', '', 'AAr'),
                   ('', 'AIR', '', 'EHr'),
                   ('', 'AI', '', 'EY'),
                   ('', 'AY', '', 'EY'),
                   ('', 'AU', '', 'AO'),
                   ('#:', 'AL', ' ', 'AXl'),
                   ('#:', 'ALS', ' ', 'AXlz'),
                   ('', 'ALK', '', 'AOk'),
                   ('', 'AL', '^', 'AOl'),
                   (' :', 'ABLE', '', 'EYbAXl'),
                   ('', 'ABLE', '', 'AXbAXl'),
                   ('', 'ANG', '+', 'EYnj'),
                   ('', 'A', '', 'AE')),
             'B': ((' ', 'BE', '^#', 'bIH'),
                   ('', 'BEING', '', 'bIYIHNG'),
                   (' ', 'BOTH', ' ', 'bOWTH'),
                   (' ', 'BUS', '#', 'bIHz'),
                   ('', 'BUIL', '', 'bIHl'),
                   ('', 'B', '', 'b')),
             'C': ((' ', 'CH', '^', 'k'),
                   ('^E', 'CH', '', 'k'),
                   ('', 'CH', '', 'CH'),
                   (' S', 'CI', '#', 'sAY'),
                   ('', 'CI', 'A', 'SH'),
                   ('', 'CI', 'O', 'SH'),
                   ('', 'CI', 'EN', 'SH'),
                   ('', 'C', '+', 's'),
                   ('', 'CK', '', 'k'),
                   ('', 'COM', '%', 'kAHm'),
                   ('', 'C', '', 'k')),
             'D': (('#:', 'DED', ' ', 'dIHd'),
                   ('.E', 'D', ' ', 'd'),
                   ('#:^E', 'D', ' ', 't'),
                   (' ', 'DE', '^#', 'dIH'),
                   (' ', 'DO', ' ', 'dUW'),
                   (' ', 'DOES', '', 'dAHz'),
                   (' ', 'DOING', '', 'dUWIHNG'),
                   (' ', 'DOW', '', 'dAW'),
                   ('', 'DU', 'A', 'jUW'),
                   ('', 'D', '', 'd')),
             'E': (('#:', 'E', ' ', ''),
                   ('\':^', 'E', ' ', ''),
                   (' :', 'E', ' ', 'IY'),
                   ('#', 'ED', ' ', 'd'),
                   ('#:', 'E', 'D ', ''),
                   ('', 'EV', 'ER', 'EHv'),
                   ('', 'E', '^%', 'IY'),
                   ('', 'ERI', '#', 'IYrIY'),
                   ('', 'ERI', '', 'EHrIH'),
                   ('#:', 'ER', '#', 'ER'),
                   ('', 'ER', '#', 'EHr'),
                   ('', 'ER', '', 'ER'),
                   (' ', 'EVEN', '', 'IYvEHn'),
                   ('#:', 'E', 'W', ''),
                   ('T', 'EW', '', 'UW'),
                   ('S', 'EW', '', 'UW'),
                   ('R', 'EW', '', 'UW'),
                   ('D', 'EW', '', 'UW'),
                   ('L', 'EW', '', 'UW'),
                   ('Z', 'EW', '', 'UW'),
                   ('N', 'EW', '', 'UW'),
                   ('J', 'EW', '', 'UW'),
                   ('TH', 'EW', '', 'UW'),
                   ('CH', 'EW', '', 'UW'),
                   ('SH', 'EW', '', 'UW'),
                   ('', 'EW', '', 'yUW'),
                   ('', 'E', 'O', 'IY'),
                   ('#:S', 'ES', ' ', 'IHz'),
                   ('#:C', 'ES', ' ', 'IHz'),
                   ('#:G', 'ES', ' ', 'IHz'),
                   ('#:Z', 'ES', ' ', 'IHz'),
                   ('#:X', 'ES', ' ', 'IHz'),
                   ('#:J', 'ES', ' ', 'IHz'),
                   ('#:CH', 'ES', ' ', 'IHz'),
                   ('#:SH', 'ES', ' ', 'IHz'),
                   ('#:', 'E', 'S ', ''),
                   ('#:', 'ELY', ' ', 'lIY'),
                   ('#:', 'EMENT', '', 'mEHnt'),
                   ('', 'EFUL', '', 'fUHl'),
                   ('', 'EE', '', 'IY'),
                   ('', 'EARN', '', 'ERn'),
                   (' ', 'EAR', '^', 'ER'),
                   ('', 'EAD', '', 'EHd'),
                   ('#:', 'EA', ' ', 'IYAX'),
                   ('', 'EA', 'SU', 'EH'),
                   ('', 'EA', '', 'IY'),
                   ('', 'EIGH', '', 'EY'),
                   ('', 'EI', '', 'IY'),
                   (' ', 'EYE', '', 'AY'),
                   ('', 'EY', '', 'IY'),
                   ('', 'EU', '', 'yUW'),
                   ('', 'E', '', 'EH')),
             'F': (('', 'FUL', '', 'fUHl'),
                   ('', 'F', '', 'f')),
             'G': (('', 'GIV', '', 'gIHv'),
                   (' ', 'G', 'I^', 'g'),
                   ('', 'GE', 'T', 'gEH'),
                   ('SU', 'GGES', '', 'gjEHs'),
                   ('', 'GG', '', 'g'),
                   (' B#', 'G', '', 'g'),
                   ('', 'G', '+', 'j'),
                   ('', 'GREAT', '', 'grEYt'),
                   ('#', 'GH', '', ''),
                   ('', 'G', '', 'g')),
             'H': ((' ', 'HAV', '', 'hAEv'),
                   (' ', 'HERE', '', 'hIYr'),
                   (' ', 'HOUR', '', 'AWER'),
                   ('', 'HOW', '', 'hAW'),
                   ('', 'H', '#', 'h'),
                   ('', 'H', '', '')),
             'I': ((' ', 'IN', '', 'IHn'),
                   (' ', 'I', ' ', 'AY'),
                   ('', 'IN', 'D', 'AYn'),
                   ('', 'IER', '', 'IYER'),
                   ('#:R', 'IED', '', 'IYd'),
                   ('', 'IED', ' ', 'AYd'),
                   ('', 'IEN', '', 'IYEHn'),
                   ('', 'IE', 'T', 'AYEH'),
                   (' :', 'I', '%', 'AY'),
                   ('', 'I', '%', 'IY'),
                   ('', 'IE', '', 'IY'),
                   ('', 'I', '^+:#', 'IH'),
                   ('', 'IR', '#', 'AYr'),
                   ('', 'IZ', '%', 'AYz'),
                   ('', 'IS', '%', 'AYz'),
                   ('', 'I', 'D%', 'AY'),
                   ('+^', 'I', '^+', 'IH'),
                   ('', 'I', 'T%', 'AY'),
                   ('#:^', 'I', '^+', 'IH'),
                   ('', 'I', '^+', 'AY'),
                   ('', 'IR', '', 'ER'),
                   ('', 'IGH', '', 'AY'),
                   ('', 'ILD', '', 'AYld'),
                   ('', 'IGN', ' ', 'AYn'),
                   ('', 'IGN', '^', 'AYn'),
                   ('', 'IGN', '%', 'AYn'),
                   ('', 'IQUE', '', 'IYk'),
                   ('', 'I', '', 'IH')),
             'J': (('', 'J', '', 'j'),),
             'K': ((' ', 'K', 'N', ''),
                   ('', 'K', '', 'k')),
             'L': (('', 'LO', 'C#', 'lOW'),
                   ('L', 'L', '', ''),
                   ('#:^', 'L', '%', 'AXl'),
                   ('', 'LEAD', '', 'lIYd'),
                   ('', 'L', '', 'l')),
             'M': (('', 'MOV', '', 'mUWv'),
                   ('', 'M', '', 'm')),
             'N': (('E', 'NG', '+', 'nj'),
                   ('', 'NG', 'R', 'NGg'),
                   ('', 'NG', '#', 'NGg'),
                   ('', 'NGL', '%', 'NGgAXl'),
                   ('', 'NG', '', 'NG'),
                   ('', 'NK', '', 'NGk'),
                   (' ', 'NOW', ' ', 'nAW'),
                   ('', 'N', '', 'n')),
             'O': (('', 'OF', ' ', 'AXv'),
                   ('', 'OROUGH', '', 'EROW'),
                   ('#:', 'OR', ' ', 'ER'),
                   ('#:', 'ORS', ' ', 'ERz'),
                   ('', 'OR', '', 'AOr'),
                   (' ', 'ONE', '', 'wAHn'),
                   ('', 'OW', '', 'OW'),
                   (' ', 'OVER', '', 'OWvER'),
                   ('', 'OV', '', 'AHv'),
                   ('', 'O', '^%', 'OW'),
                   ('', 'O', '^EN', 'OW'),
                   ('', 'O', '^I#', 'OW'),
                   ('', 'OL', 'D', 'OWl'),
                   ('', 'OUGHT', '', 'AOt'),
                   ('', 'OUGH', '', 'AHf'),
                   (' ', 'OU', '', 'AW'),
                   ('H', 'OU', 'S#', 'AW'),
                   ('', 'OUS', '', 'AXs'),
                   ('', 'OUR', '', 'AOr'),
                   ('', 'OULD', '', 'UHd'),
                   ('^', 'OU', '^L', 'AH'),
                   ('', 'OUP', '', 'UWp'),
                   ('', 'OU', '', 'AW'),
                   ('', 'OY', '', 'OY'),
                   ('', 'OING', '', 'OWIHNG'),
                   ('', 'OI', '', 'OY'),
                   ('', 'OOR', '', 'AOr'),
                   ('', 'OOK', '', 'UHk'),
                   ('', 'OOD', '', 'UHd'),
                   ('', 'OO', '', 'UW'),
                   ('', 'O', 'E', 'OW'),
                   ('', 'O', ' ', 'OW'),
                   ('', 'OA', '', 'OW'),
                   (' ', 'ONLY', '', 'OWnlIY'),
                   (' ', 'ONCE', '', 'wAHns'),
                   ('', 'ON\'T', '', 'OWnt'),
                   ('C', 'O', 'N', 'AA'),
                   ('', 'O', 'NG', 'AO'),
                   (' :^', 'O', 'N', 'AH'),
                   ('I', 'ON', '', 'AXn'),
                   ('#:', 'ON', ' ', 'AXn'),
                   ('#^', 'ON', '', 'AXn'),
                   ('', 'O', 'ST ', 'OW'),
                   ('', 'OF', '^', 'AOf'),
                   ('', 'OTHER', '', 'AHDHER'),
                   ('', 'OSS', ' ', 'AOs'),
                   ('#:^', 'OM', '', 'AHm'),
                   ('', 'O', '', 'AA')),
             'P': (('', 'PH', '', 'f'),
                   ('', 'PEOP', '', 'pIYp'),
                   ('', 'POW', '', 'pAW'),
                   ('', 'PUT', ' ', 'pUHt'),
                   ('', 'P', '', 'p')),
             'Q': (('', 'QUAR', '', 'kwAOr'),
                   ('', 'QU', '', 'kw'),
                   ('', 'Q', '', 'k')),
             'R': ((' ', 'RE', '^#', 'rIY'),
                   ('', 'R', '', 'r')),
             'S': (('', 'SH', '', 'SH'),
                   ('#', 'SION', '', 'ZHAXn'),
                   ('', 'SOME', '', 'sAHm'),
                   ('#', 'SUR', '#', 'ZHER'),
                   ('', 'SUR', '#', 'SHER'),
                   ('#', 'SU', '#', 'ZHUW'),
                   ('#', 'SSU', '#', 'SHUW'),
                   ('#', 'SED', ' ', 'zd'),
                   ('#', 'S', '#', 'z'),
                   ('', 'SAID', '', 'sEHd'),
                   ('^', 'SION', '', 'SHAXn'),
                   ('', 'S', 'S', ''),
                   ('.', 'S', ' ', 'z'),
                   ('#:.E', 'S', ' ', 'z'),
                   ('#:^##', 'S', ' ', 'z'),
                   ('#:^#', 'S', ' ', 's'),
                   ('U', 'S', ' ', 's'),
                   (' :#', 'S', ' ', 'z'),
                   (' ', 'SCH', '', 'sk'),
                   ('', 'S', 'C+', ''),
                   ('#', 'SM', '', 'zm'),
                   ('#', 'SN', '\'', 'zAXn'),
                   ('', 'S', '', 's')),
             'T': ((' ', 'THE', ' ', 'DHAX'),
                   ('', 'TO', ' ', 'tUW'),
                   ('', 'THAT', ' ', 'DHAEt'),
                   (' ', 'THIS', ' ', 'DHIHs'),
                   (' ', 'THEY', '', 'DHEY'),
                   (' ', 'THERE', '', 'DHEHr'),
                   ('', 'THER', '', 'DHER'),
                   ('', 'THEIR', '', 'DHEHr'),
                   (' ', 'THAN', ' ', 'DHAEn'),
                   (' ', 'THEM', ' ', 'DHEHm'),
                   ('', 'THESE', ' ', 'DHIYz'),
                   (' ', 'THEN', '', 'DHEHn'),
                   ('', 'THROUGH', '', 'THrUW'),
                   ('', 'THOSE', '', 'DHOWz'),
                   ('', 'THOUGH', ' ', 'DHOW'),
                   (' ', 'THUS', '', 'DHAHs'),
                   ('', 'TH', '', 'TH'),
                   ('#:', 'TED', ' ', 'tIHd'),
                   ('S', 'TI', '#N', 'CH'),
                   ('', 'TI', 'O', 'SH'),
                   ('', 'TI', 'A', 'SH'),
                   ('', 'TIEN', '', 'SHAXn'),
                   ('', 'TUR', '#', 'CHER'),
                   ('', 'TU', 'A', 'CHUW'),
                   (' ', 'TWO', '', 'tUW'),
                   ('', 'T', '', 't')),
             'U': ((' ', 'UN', 'I', 'yUWn'),
                   (' ', 'UN', '', 'AHn'),
                   (' ', 'UPON', '', 'AXpAOn'),
                   ('T', 'UR', '#', 'UHr'),
                   ('S', 'UR', '#', 'UHr'),
                   ('R', 'UR', '#', 'UHr'),
                   ('D', 'UR', '#', 'UHr'),
                   ('L', 'UR', '#', 'UHr'),
                   ('Z', 'UR', '#', 'UHr'),
                   ('N', 'UR', '#', 'UHr'),
                   ('J', 'UR', '#', 'UHr'),
                   ('TH', 'UR', '#', 'UHr'),
                   ('CH', 'UR', '#', 'UHr'),
                   ('SH', 'UR', '#', 'UHr'),
                   ('', 'UR', '#', 'yUHr'),
                   ('', 'UR', '', 'ER'),
                   ('', 'U', '^ ', 'AH'),
                   ('', 'U', '^^', 'AH'),
                   ('', 'UY', '', 'AY'),
                   (' G', 'U', '#', ''),
                   ('G', 'U', '%', ''),
                   ('G', 'U', '#', 'w'),
                   ('#N', 'U', '', 'yUW'),
                   ('T', 'U', '', 'UW'),
                   ('S', 'U', '', 'UW'),
                   ('R', 'U', '', 'UW'),
                   ('D', 'U', '', 'UW'),
                   ('L', 'U', '', 'UW'),
                   ('Z', 'U', '', 'UW'),
                   ('N', 'U', '', 'UW'),
                   ('J', 'U', '', 'UW'),
                   ('TH', 'U', '', 'UW'),
                   ('CH', 'U', '', 'UW'),
                   ('SH', 'U', '', 'UW'),
                   ('', 'U', '', 'yUW')),
             'V': (('', 'VIEW', '', 'vyUW'),
                   ('', 'V', '', 'v')),
             'W': ((' ', 'WERE', '', 'wER'),
                   ('', 'WA', 'S', 'wAA'),
                   ('', 'WA', 'T', 'wAA'),
                   ('', 'WHERE', '', 'WHEHr'),
                   ('', 'WHAT', '', 'WHAAt'),
                   ('', 'WHOL', '', 'hOWl'),
                   ('', 'WHO', '', 'hUW'),
                   ('', 'WH', '', 'WH'),
                   ('', 'WAR', '', 'wAOr'),
                   ('', 'WOR', '^', 'wER'),
                   ('', 'WR', '', 'r'),
                   ('', 'W', '', 'w')),
             'X': (('', 'X', '', 'ks'),),
             'Y': (('', 'YOUNG', '', 'yAHNG'),
                   (' ', 'YOU', '', 'yUW'),
                   (' ', 'YES', '', 'yEHs'),
                   (' ', 'Y', '', 'y'),
                   ('#:^', 'Y', ' ', 'IY'),
                   ('#:^', 'Y', 'I', 'IY'),
                   (' :', 'Y', ' ', 'AY'),
                   (' :', 'Y', '#', 'AY'),
                   (' :', 'Y', '^+:#', 'IH'),
                   (' :', 'Y', '^#', 'AY'),
                   ('', 'Y', '', 'IH')),
             'Z': (('', 'Z', '', 'z'),)}

    word = word.upper()

    pron = ''
    pos = 0
    while pos < len(word):
        left_orig = word[:pos]
        right_orig = word[pos:]
        # characters without their own rule set use the rules under ' '
        first = word[pos] if word[pos] in rules else ' '
        for left, match, right, out in rules[first]:
            if not right_orig.startswith(match):
                continue
            # an empty context places no constraint on that side
            if left and not re_match(to_regex(left, left=True), left_orig):
                continue
            if right and not re_match(to_regex(right, left=False),
                                      right_orig[len(match):]):
                continue
            pron += out
            pos += len(match)
            break
        else:
            # no rule applied: copy the character through unchanged
            pron += word[pos]
            pos += 1

    return pron
6378
6379
6380
def bmpm(word, language_arg=0, name_mode='gen', match_mode='approx',
         concat=False, filter_langs=False):
    """Return the Beider-Morse Phonetic Matching algorithm code for a word.

    This function is a thin public wrapper: all arguments are forwarded,
    unchanged and in their declared order, to ``_bmpm``, and whatever that
    call returns is handed back to the caller.

    The Beider-Morse Phonetic Matching algorithm is described in
    :cite:`Beider:2008`.
    The reference implementation is licensed under GPLv3.

    :param str word: the word to transform
    :param str language_arg: the language of the term (the default, 0, is
        passed through to ``_bmpm`` as-is); supported values include:

            - 'any'
            - 'arabic'
            - 'cyrillic'
            - 'czech'
            - 'dutch'
            - 'english'
            - 'french'
            - 'german'
            - 'greek'
            - 'greeklatin'
            - 'hebrew'
            - 'hungarian'
            - 'italian'
            - 'latvian'
            - 'polish'
            - 'portuguese'
            - 'romanian'
            - 'russian'
            - 'spanish'
            - 'turkish'

    :param str name_mode: the name mode of the algorithm:

            - 'gen' -- general (default)
            - 'ash' -- Ashkenazi
            - 'sep' -- Sephardic

    :param str match_mode: matching mode: 'approx' (default) or 'exact'
    :param bool concat: concatenation mode (default False)
    :param bool filter_langs: filter out incompatible languages
        (default False)
    :returns: the BMPM value(s), exactly as produced by ``_bmpm``
    :rtype: str (per the examples below, a single space-separated string;
        TODO confirm against ``_bmpm``)

    >>> bmpm('Christopher')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir xristopi xritopir xritopi xristofi xritofir xritofi tzristopir
    tzristofir zristopir zristopi zritopir zritopi zristofir zristofi zritofir
    zritofi'
    >>> bmpm('Niall')
    'nial niol'
    >>> bmpm('Smith')
    'zmit'
    >>> bmpm('Schmidt')
    'zmit stzmit'

    >>> bmpm('Christopher', language_arg='German')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir'
    >>> bmpm('Christopher', language_arg='English')
    'tzristofir tzrQstofir tzristafir tzrQstafir xristofir xrQstofir xristafir
    xrQstafir'
    >>> bmpm('Christopher', language_arg='German', name_mode='ash')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir'

    >>> bmpm('Christopher', language_arg='German', match_mode='exact')
    'xriStopher xriStofer xristopher xristofer'
    """
    # Keep the options positional and in signature order: the parameter
    # names of ``_bmpm`` are not visible here, so keywords are not assumed.
    forwarded = (language_arg, name_mode, match_mode, concat, filter_langs)
    return _bmpm(word, *forwarded)
6452
6453
6454
if __name__ == '__main__':
    # When run as a script (not imported), execute the doctest examples
    # embedded in this module's docstrings.
    from doctest import testmod
    testmod()
6457