Completed
Push — master ( 7c68d6...c05634 )
by Chris
12:01
created

abydos.phonetic.parmar_kumbharana()   B

Complexity

Conditions 5

Size

Total Lines 41
Code Lines 24

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 5
eloc 24
nop 1
dl 0
loc 41
rs 8.8373
c 0
b 0
f 0
1
# -*- coding: utf-8 -*-
0 ignored issues
show
coding-style introduced by
Too many lines in module (5244/1000)
Loading history...
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19
"""abydos.phonetic.
20
21
The phonetic module implements phonetic algorithms including:
22
23
    - Robert C. Russell's Index
24
    - American Soundex
25
    - Refined Soundex
26
    - Daitch-Mokotoff Soundex
27
    - Kölner Phonetik
28
    - NYSIIS
29
    - Match Rating Algorithm
30
    - Metaphone
31
    - Double Metaphone
32
    - Caverphone
33
    - Alpha Search Inquiry System
34
    - Fuzzy Soundex
35
    - Phonex
36
    - Phonem
37
    - Phonix
38
    - SfinxBis
39
    - phonet
40
    - Standardized Phonetic Frequency Code
41
    - Statistics Canada
42
    - Lein
43
    - Roger Root
44
    - Oxford Name Compression Algorithm (ONCA)
45
    - Eudex phonetic hash
46
    - Haase Phonetik
47
    - Reth-Schek Phonetik
48
    - FONEM
49
    - Parmar-Kumbharana
50
    - Beider-Morse Phonetic Matching
51
"""
52
53
from __future__ import division, unicode_literals
54
55
import re
56
import unicodedata
57
from collections import Counter
58
from itertools import groupby, product
59
60
from six import text_type
61
from six.moves import range
62
63
from ._bm import _bmpm
64
65
_INFINITY = float('inf')
66
67
68
def _delete_consecutive_repeats(word):
69
    """Delete consecutive repeated characters in a word.
70
71
    :param str word: the word to transform
72
    :returns: word with consecutive repeating characters collapsed to
73
        a single instance
74
    :rtype: str
75
    """
76
    return ''.join(char for char, _ in groupby(word))
77
78
79
def russell_index(word):
    """Return the Russell Index (integer output) of a word.

    This follows Robert C. Russell's Index algorithm, as described in
    US Patent 1,261,167 (1917)

    :param str word: the word to transform
    :returns: the Russell Index value; NaN (a float) is returned when no
        codable letters remain in the word
    :rtype: int

    >>> russell_index('Christopher')
    3813428
    >>> russell_index('Niall')
    715
    >>> russell_index('Smith')
    3614
    >>> russell_index('Schmidt')
    3614
    """
    # Maps the code point of each codable letter to its Russell digit.
    letter_to_digit = {ord(letter): digit for letter, digit in
                       zip('ABCDEFGIKLMNOPQRSTUVXYZ',
                           '12341231356712383412313')}

    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = word.replace('GH', '')  # discard gh (rule 3)
    word = word.rstrip('SZ')  # discard /[sz]$/ (rule 3)

    # keep only the letters Russell's mapping covers, then translate them
    word = ''.join(c for c in word if ord(c) in letter_to_digit)
    sdx = word.translate(letter_to_digit)

    # keep the first 1, but drop every 1 that follows it
    head, first_one, tail = sdx.partition('1')
    if first_one:
        sdx = head + first_one + tail.replace('1', '')

    # remove repeating characters
    sdx = _delete_consecutive_repeats(sdx)

    # return as an int (NaN when nothing could be coded)
    if not sdx:
        return float('NaN')
    return int(sdx)
123
124
125
def russell_index_num_to_alpha(num):
    """Convert the Russell Index integer to an alphabetic string.

    This follows Robert C. Russell's Index algorithm, as described in
    US Patent 1,261,167 (1917)

    :param int num: a Russell Index integer value
    :returns: the Russell Index as an alphabetic string; characters of the
        input other than the digits 1-8 are ignored
    :rtype: str

    >>> russell_index_num_to_alpha(3813428)
    'CRACDBR'
    >>> russell_index_num_to_alpha(715)
    'NAL'
    >>> russell_index_num_to_alpha(3614)
    'CMAD'
    """
    digit_to_letter = dict(zip('12345678', 'ABCDLMNR'))
    # Anything that is not one of the eight code digits is skipped, so an
    # input without any of them produces the empty string.
    return ''.join(digit_to_letter[digit] for digit in text_type(num)
                   if digit in digit_to_letter)
149
150
151
def russell_index_alpha(word):
    """Return the Russell Index (alphabetic output) for the word.

    This follows Robert C. Russell's Index algorithm, as described in
    US Patent 1,261,167 (1917)

    :param str word: the word to transform
    :returns: the Russell Index value as an alphabetic string (empty for
        an empty word)
    :rtype: str

    >>> russell_index_alpha('Christopher')
    'CRACDBR'
    >>> russell_index_alpha('Niall')
    'NAL'
    >>> russell_index_alpha('Smith')
    'CMAD'
    >>> russell_index_alpha('Schmidt')
    'CMAD'
    """
    if not word:
        return ''
    numeric_index = russell_index(word)
    return russell_index_num_to_alpha(numeric_index)
173
174
175
def soundex(word, maxlength=4, var='American', reverse=False, zero_pad=True):
    """Return the Soundex code for a word.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 4);
        values are clamped to the range 4-64 and None is treated as 64
    :param str var: the variant of the algorithm to employ (defaults to
        'American'):

        - 'American' follows the American Soundex algorithm, as described at
          http://www.archives.gov/publications/general-info-leaflets/55-census.html
          and in Knuth(1998:394); this is also called Miracode
        - 'special' follows the rules from the 1880-1910 US Census
          retrospective re-analysis, in which h & w are not treated as blocking
          consonants but as vowels.
          Cf. http://creativyst.com/Doc/Articles/SoundEx1/SoundEx1.htm
        - 'Census' follows the rules laid out in GIL 55 by the US Census,
          including coding prefixed and unprefixed versions of some names;
          for such names a 2-tuple (prefixed code, unprefixed code) is
          returned instead of a single string

    :param bool reverse: reverse the word before computing the selected Soundex
        (defaults to False); This results in "Reverse Soundex"
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Soundex value
    :rtype: str

    >>> soundex("Christopher")
    'C623'
    >>> soundex("Niall")
    'N400'
    >>> soundex('Smith')
    'S530'
    >>> soundex('Schmidt')
    'S530'


    >>> soundex('Christopher', maxlength=_INFINITY)
    'C623160000000000000000000000000000000000000000000000000000000000'
    >>> soundex('Christopher', maxlength=_INFINITY, zero_pad=False)
    'C62316'

    >>> soundex('Christopher', reverse=True)
    'R132'

    >>> soundex('Ashcroft')
    'A261'
    >>> soundex('Asicroft')
    'A226'
    >>> soundex('Ashcroft', var='special')
    'A226'
    >>> soundex('Asicroft', var='special')
    'A226'
    """
    # Code point of each letter A-Z -> Soundex digit; 0 marks vowels and 9
    # marks H and W, which get special handling below.
    letter_to_digit = {ord(letter): digit for letter, digit in
                       zip('ABCDEFGHIJKLMNOPQRSTUVWXYZ',
                           '01230129022455012623019202')}

    # Require a maxlength of at least 4 and not more than 64
    maxlength = 64 if maxlength is None else min(max(4, maxlength), 64)

    # uppercase, normalize and decompose
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    if var == 'Census':
        # Should these prefixes be supplemented? (VANDE, DELA, VON)
        # A prefixed name must have at least two characters after the prefix.
        for prefix_len, prefixes in ((3, {'VAN', 'CON'}),
                                     (2, {'DE', 'DI', 'LA', 'LE'})):
            if word[:prefix_len] in prefixes and len(word) > prefix_len + 1:
                with_prefix = soundex(word, maxlength, 'American', reverse,
                                      zero_pad)
                without_prefix = soundex(word[prefix_len:], maxlength,
                                         'American', reverse, zero_pad)
                return (with_prefix, without_prefix)
        # Otherwise, proceed as usual (var='American' mode, ostensibly)

    # filter non-A-Z out
    word = ''.join(c for c in word if ord(c) in letter_to_digit)

    # Nothing to convert, return base case
    if not word:
        return '0' * maxlength if zero_pad else '0'

    # Reverse word if computing Reverse Soundex
    if reverse:
        word = word[::-1]

    # apply the Soundex algorithm
    sdx = word.translate(letter_to_digit)

    # In the 1880-1910 census variant H & W act like vowels (0); otherwise
    # they are dropped before repeats are collapsed (rule 1).
    sdx = sdx.replace('9', '0' if var == 'special' else '')
    sdx = _delete_consecutive_repeats(sdx)  # rule 3

    # The code starts with the first letter itself. Unless that letter is
    # H or W, the leading digit is the first letter's own code and is
    # replaced by the letter.
    initial = word[0]
    if initial not in 'HW':
        sdx = sdx[1:]
    sdx = (initial + sdx).replace('0', '')  # rule 1

    if zero_pad:
        sdx += '0' * maxlength  # rule 4

    return sdx[:maxlength]
287
288
289
def refined_soundex(word, maxlength=_INFINITY, reverse=False, zero_pad=False,
                    retain_vowels=False):
    """Return the Refined Soundex code for a word.

    This is Soundex, but with more character classes. It was defined by
    Carolyn B. Boyce:
    https://web.archive.org/web/20010513121003/http://www.bluepoof.com:80/Soundex/info2.html

    :param word: the word to transform
    :param maxlength: the length of the code returned (defaults to unlimited);
        None is also treated as unlimited
    :param reverse: reverse the word before computing the selected Soundex
        (defaults to False); This results in "Reverse Soundex"
    :param zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string (only applied when maxlength is finite)
    :param retain_vowels: retain vowels (as 0) in the resulting code
    :returns: the Refined Soundex value; for a word without any A-Z letters
        this is the empty string (or only the zero padding, if requested)
    :rtype: str

    >>> refined_soundex('Christopher')
    'C3090360109'
    >>> refined_soundex('Niall')
    'N807'
    >>> refined_soundex('Smith')
    'S38060'
    >>> refined_soundex('Schmidt')
    'S30806'
    """
    _ref_soundex_translation = dict(zip((ord(_) for _ in
                                         'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                        '01360240043788015936020505'))

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    # Reverse word if computing Reverse Soundex
    if reverse:
        word = word[::-1]

    # apply the Soundex algorithm
    # word[:1] rather than word[0]: the word is empty when the input had no
    # A-Z letters, and indexing would raise IndexError.
    sdx = word[:1] + word.translate(_ref_soundex_translation)
    sdx = _delete_consecutive_repeats(sdx)
    if not retain_vowels:
        sdx = sdx.replace('0', '')  # Delete vowels, H, W, Y

    # None means "no limit", exactly like the default of infinity; comparing
    # None with a float would raise TypeError on Python 3.
    if maxlength is not None and maxlength < _INFINITY:
        if zero_pad:
            sdx += ('0' * maxlength)
        if maxlength:
            sdx = sdx[:maxlength]

    return sdx
345
346
347
def dm_soundex(word, maxlength=6, reverse=False, zero_pad=True):
    """Return the Daitch-Mokotoff Soundex code for a word.

    Returns values of a word as a set. A collection is necessary since there
    can be multiple values for a single word.

    :param word: the word to transform
    :param maxlength: the length of the code returned (defaults to 6);
        values are clamped to the range 6-64 and None is treated as 64
    :param reverse: reverse the word before computing the selected Soundex
        (defaults to False); This results in "Reverse Soundex"
    :param zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Daitch-Mokotoff Soundex value(s)
    :rtype: set

    >>> dm_soundex('Christopher')
    {'494379', '594379'}
    >>> dm_soundex('Niall')
    {'680000'}
    >>> dm_soundex('Smith')
    {'463000'}
    >>> dm_soundex('Schmidt')
    {'463000'}

    >>> dm_soundex('The quick brown fox', maxlength=20, zero_pad=False)
    {'35457976754', '3557976754'}
    """
    # substring -> (code at word start, code before a vowel, code elsewhere).
    # A tuple in a code slot means the substring has two alternative codes;
    # '_' is a placeholder that is removed at the end.
    _dms_table = {'STCH': (2, 4, 4), 'DRZ': (4, 4, 4), 'ZH': (4, 4, 4),
                  'ZHDZH': (2, 4, 4), 'DZH': (4, 4, 4), 'DRS': (4, 4, 4),
                  'DZS': (4, 4, 4), 'SCHTCH': (2, 4, 4), 'SHTSH': (2, 4, 4),
                  'SZCZ': (2, 4, 4), 'TZS': (4, 4, 4), 'SZCS': (2, 4, 4),
                  'STSH': (2, 4, 4), 'SHCH': (2, 4, 4), 'D': (3, 3, 3),
                  'H': (5, 5, '_'), 'TTSCH': (4, 4, 4), 'THS': (4, 4, 4),
                  'L': (8, 8, 8), 'P': (7, 7, 7), 'CHS': (5, 54, 54),
                  'T': (3, 3, 3), 'X': (5, 54, 54), 'OJ': (0, 1, '_'),
                  'OI': (0, 1, '_'), 'SCHTSH': (2, 4, 4), 'OY': (0, 1, '_'),
                  'Y': (1, '_', '_'), 'TSH': (4, 4, 4), 'ZDZ': (2, 4, 4),
                  'TSZ': (4, 4, 4), 'SHT': (2, 43, 43), 'SCHTSCH': (2, 4, 4),
                  'TTSZ': (4, 4, 4), 'TTZ': (4, 4, 4), 'SCH': (4, 4, 4),
                  'TTS': (4, 4, 4), 'SZD': (2, 43, 43), 'AI': (0, 1, '_'),
                  'PF': (7, 7, 7), 'TCH': (4, 4, 4), 'PH': (7, 7, 7),
                  'TTCH': (4, 4, 4), 'SZT': (2, 43, 43), 'ZDZH': (2, 4, 4),
                  'EI': (0, 1, '_'), 'G': (5, 5, 5), 'EJ': (0, 1, '_'),
                  'ZD': (2, 43, 43), 'IU': (1, '_', '_'), 'K': (5, 5, 5),
                  'O': (0, '_', '_'), 'SHTCH': (2, 4, 4), 'S': (4, 4, 4),
                  'TRZ': (4, 4, 4), 'SHD': (2, 43, 43), 'DSH': (4, 4, 4),
                  'CSZ': (4, 4, 4), 'EU': (1, 1, '_'), 'TRS': (4, 4, 4),
                  'ZS': (4, 4, 4), 'STRZ': (2, 4, 4), 'UY': (0, 1, '_'),
                  'STRS': (2, 4, 4), 'CZS': (4, 4, 4),
                  'MN': ('6_6', '6_6', '6_6'), 'UI': (0, 1, '_'),
                  'UJ': (0, 1, '_'), 'UE': (0, '_', '_'), 'EY': (0, 1, '_'),
                  'W': (7, 7, 7), 'IA': (1, '_', '_'), 'FB': (7, 7, 7),
                  'STSCH': (2, 4, 4), 'SCHT': (2, 43, 43),
                  'NM': ('6_6', '6_6', '6_6'), 'SCHD': (2, 43, 43),
                  'B': (7, 7, 7), 'DSZ': (4, 4, 4), 'F': (7, 7, 7),
                  'N': (6, 6, 6), 'CZ': (4, 4, 4), 'R': (9, 9, 9),
                  'U': (0, '_', '_'), 'V': (7, 7, 7), 'CS': (4, 4, 4),
                  'Z': (4, 4, 4), 'SZ': (4, 4, 4), 'TSCH': (4, 4, 4),
                  'KH': (5, 5, 5), 'ST': (2, 43, 43), 'KS': (5, 54, 54),
                  'SH': (4, 4, 4), 'SC': (2, 4, 4), 'SD': (2, 43, 43),
                  'DZ': (4, 4, 4), 'ZHD': (2, 43, 43), 'DT': (3, 3, 3),
                  'ZSH': (4, 4, 4), 'DS': (4, 4, 4), 'TZ': (4, 4, 4),
                  'TS': (4, 4, 4), 'TH': (3, 3, 3), 'TC': (4, 4, 4),
                  'A': (0, '_', '_'), 'E': (0, '_', '_'), 'I': (0, '_', '_'),
                  'AJ': (0, 1, '_'), 'M': (6, 6, 6), 'Q': (5, 5, 5),
                  'AU': (0, 7, '_'), 'IO': (1, '_', '_'), 'AY': (0, 1, '_'),
                  'IE': (1, '_', '_'), 'ZSCH': (4, 4, 4),
                  'CH': ((5, 4), (5, 4), (5, 4)),
                  'CK': ((5, 45), (5, 45), (5, 45)),
                  'C': ((5, 4), (5, 4), (5, 4)),
                  'J': ((1, 4), ('_', 4), ('_', 4)),
                  'RZ': ((94, 4), (94, 4), (94, 4)),
                  'RS': ((94, 4), (94, 4), (94, 4))}

    # For each initial letter, the codable substrings in the order they are
    # tried; every list ends with the bare letter, so a match always exists.
    _dms_order = {'A': ('AI', 'AJ', 'AU', 'AY', 'A'),
                  'B': ('B',),
                  'C': ('CHS', 'CSZ', 'CZS', 'CH', 'CK', 'CS', 'CZ', 'C'),
                  'D': ('DRS', 'DRZ', 'DSH', 'DSZ', 'DZH', 'DZS', 'DS', 'DT',
                        'DZ', 'D'),
                  'E': ('EI', 'EJ', 'EU', 'EY', 'E'),
                  'F': ('FB', 'F'),
                  'G': ('G',),
                  'H': ('H',),
                  'I': ('IA', 'IE', 'IO', 'IU', 'I'),
                  'J': ('J',),
                  'K': ('KH', 'KS', 'K'),
                  'L': ('L',),
                  'M': ('MN', 'M'),
                  'N': ('NM', 'N'),
                  'O': ('OI', 'OJ', 'OY', 'O'),
                  'P': ('PF', 'PH', 'P'),
                  'Q': ('Q',),
                  'R': ('RS', 'RZ', 'R'),
                  'S': ('SCHTSCH', 'SCHTCH', 'SCHTSH', 'SHTCH', 'SHTSH',
                        'STSCH', 'SCHD', 'SCHT', 'SHCH', 'STCH', 'STRS',
                        'STRZ', 'STSH', 'SZCS', 'SZCZ', 'SCH', 'SHD', 'SHT',
                        'SZD', 'SZT', 'SC', 'SD', 'SH', 'ST', 'SZ', 'S'),
                  'T': ('TTSCH', 'TSCH', 'TTCH', 'TTSZ', 'TCH', 'THS', 'TRS',
                        'TRZ', 'TSH', 'TSZ', 'TTS', 'TTZ', 'TZS', 'TC', 'TH',
                        'TS', 'TZ', 'T'),
                  'U': ('UE', 'UI', 'UJ', 'UY', 'U'),
                  'V': ('V',),
                  'W': ('W',),
                  'X': ('X',),
                  'Y': ('Y',),
                  'Z': ('ZHDZH', 'ZDZH', 'ZSCH', 'ZDZ', 'ZHD', 'ZSH', 'ZD',
                        'ZH', 'ZS', 'Z')}

    _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}

    # Require a maxlength of at least 6 and not more than 64
    maxlength = 64 if maxlength is None else min(max(6, maxlength), 64)

    # uppercase, normalize, decompose, and filter non-A-Z
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in _dms_order)

    # Nothing to convert, return base case
    if not word:
        return {'0' * maxlength} if zero_pad else {'0'}

    # Reverse word if computing Reverse Soundex
    if reverse:
        word = word[::-1]

    partial_codes = ['']  # every code variant built so far
    pos = 0
    while pos < len(word):
        # Try the candidate substrings for the current letter in order and
        # take the first one that occurs at this position.
        for sstr in _dms_order[word[pos]]:  # pragma: no branch
            if not word.startswith(sstr, pos):
                continue
            next_pos = pos + len(sstr)

            # Having retrieved the code (triple), determine the correct
            # positional variant (first, pre-vocalic, elsewhere)
            if pos == 0:
                variant = 0
            elif next_pos < len(word) and word[next_pos] in _vowels:
                variant = 1
            else:
                variant = 2
            dm_val = _dms_table[sstr][variant]

            # Build the code strings; a tuple value forks every partial
            # code into two alternatives.
            if isinstance(dm_val, tuple):
                partial_codes = [code + text_type(alternative)
                                 for alternative in dm_val[:2]
                                 for code in partial_codes]
            else:
                partial_codes = [code + text_type(dm_val)
                                 for code in partial_codes]
            pos = next_pos
            break

    result = set()
    for raw_code in partial_codes:
        # Filter out double letters and _ placeholders
        code = ''.join(c for c in _delete_consecutive_repeats(raw_code)
                       if c != '_')
        # Pad (if requested) and trim to the requested length
        if zero_pad:
            code += '0' * maxlength
        result.add(code[:maxlength])
    return result
520
521
522
def koelner_phonetik(word):
    """Return the Kölner Phonetik (numeric output) code for a word.

    Based on the algorithm described at
    https://de.wikipedia.org/wiki/Kölner_Phonetik

    While the output code is numeric, it is still a str because 0s can lead
    the code.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as a numeric string
    :rtype: str

    >>> koelner_phonetik('Christopher')
    '478237'
    >>> koelner_phonetik('Niall')
    '65'
    >>> koelner_phonetik('Smith')
    '862'
    >>> koelner_phonetik('Schmidt')
    '862'
    >>> koelner_phonetik('Müller')
    '657'
    >>> koelner_phonetik('Zimmermann')
    '86766'
    """
    # Letters whose code does not depend on their neighbours. The vowel
    # class (code 0) includes J and Y.
    context_free = {'A': '0', 'E': '0', 'I': '0', 'J': '0', 'O': '0',
                    'U': '0', 'Y': '0',
                    'B': '1',
                    'F': '3', 'V': '3', 'W': '3',
                    'G': '4', 'K': '4', 'Q': '4',
                    'L': '5',
                    'M': '6', 'N': '6',
                    'R': '7',
                    'S': '8', 'Z': '8'}

    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    word = word.replace('Ä', 'AE')
    word = word.replace('Ö', 'OE')
    word = word.replace('Ü', 'UE')
    word = ''.join(c for c in word if c in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    # Nothing to convert, return base case
    if not word:
        return ''

    digits = []
    for i, char in enumerate(word):
        # Neighbouring letters; '' stands in at either edge of the word and
        # is never a member of the letter sets tested below.
        previous = word[i - 1] if i else ''
        following = word[i + 1:i + 2]

        if char in context_free:
            digits.append(context_free[char])
        elif char == 'P':
            digits.append('3' if following == 'H' else '1')
        elif char in 'DT':
            digits.append('8' if following in {'C', 'S', 'Z'} else '2')
        elif char == 'C':
            if i == 0:
                hard_before = {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'}
            else:
                hard_before = {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}
            # A preceding S or Z takes precedence over the following letter.
            if previous not in {'S', 'Z'} and following in hard_before:
                digits.append('4')
            else:
                digits.append('8')
        elif char == 'X':
            digits.append('8' if previous in {'C', 'K', 'Q'} else '48')
        # H receives no code

    sdx = _delete_consecutive_repeats(''.join(digits))

    # 0 is kept only as the very first digit of the code
    return sdx[:1] + sdx[1:].replace('0', '')
632
633
634
def koelner_phonetik_num_to_alpha(num):
    """Convert a Kölner Phonetik code from numeric to alphabetic.

    :param str num: a numeric Kölner Phonetik representation
    :returns: an alphabetic representation of the same word; characters of
        the input other than the digits 0-8 are ignored
    :rtype: str

    >>> koelner_phonetik_num_to_alpha(862)
    'SNT'
    >>> koelner_phonetik_num_to_alpha(657)
    'NLR'
    >>> koelner_phonetik_num_to_alpha(86766)
    'SNRNN'
    """
    digit_to_letter = dict(zip('012345678', 'APTFKLNRS'))
    return ''.join(digit_to_letter[digit] for digit in text_type(num)
                   if digit in digit_to_letter)
653
654
655
def koelner_phonetik_alpha(word):
    """Return the Kölner Phonetik (alphabetic output) code for a word.

    The numeric code is computed first and then mapped to letters.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as an alphabetic string
    :rtype: str

    >>> koelner_phonetik_alpha('Smith')
    'SNT'
    >>> koelner_phonetik_alpha('Schmidt')
    'SNT'
    >>> koelner_phonetik_alpha('Müller')
    'NLR'
    >>> koelner_phonetik_alpha('Zimmermann')
    'SNRNN'
    """
    numeric_code = koelner_phonetik(word)
    return koelner_phonetik_num_to_alpha(numeric_code)
672
673
674
def nysiis(word, maxlength=6, modified=False):
    """Return the NYSIIS code for a word.

    A description of the New York State Identification and Intelligence System
    algorithm can be found at
    https://en.wikipedia.org/wiki/New_York_State_Identification_and_Intelligence_System

    The modified version of this algorithm is described in Appendix B of
    Lynch, Billy T. and William L. Arends. `Selection of a Surname Coding
    Procedure for the SRS Record Linkage System.` Statistical Reporting
    Service, U.S. Department of Agriculture, Washington, D.C. February 1977.
    https://naldc.nal.usda.gov/download/27833/PDF

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 6) of the code to return;
        values below 6 are raised to 6, and a false value (0 or None) or
        infinity disables trimming
    :param bool modified: indicates whether to use USDA modified NYSIIS
    :returns: the NYSIIS value; the empty string when nothing remains to be
        coded, or the literal string 'ERROR' when, in modified mode, the word
        ends in JR or SR
    :rtype: str

    >>> nysiis('Christopher')
    'CRASTA'
    >>> nysiis('Niall')
    'NAL'
    >>> nysiis('Smith')
    'SNAT'
    >>> nysiis('Schmidt')
    'SNAD'

    >>> nysiis('Christopher', maxlength=_INFINITY)
    'CRASTAFAR'

    >>> nysiis('Christopher', maxlength=8, modified=True)
    'CRASTAFA'
    >>> nysiis('Niall', maxlength=8, modified=True)
    'NAL'
    >>> nysiis('Smith', maxlength=8, modified=True)
    'SNAT'
    >>> nysiis('Schmidt', maxlength=8, modified=True)
    'SNAD'
    """
    # Require a maxlength of at least 6
    if maxlength:
        maxlength = max(6, maxlength)

    _vowels = {'A', 'E', 'I', 'O', 'U'}

    word = ''.join(c for c in word.upper() if c.isalpha())
    word = word.replace('ß', 'SS')

    # exit early if there are no alphas
    if not word:
        return ''

    if modified:
        # remembered so that a leading vowel, which is coded as A, can be
        # restored at the very end
        original_first_char = word[0]

    # transcode the start of the word
    if word[:3] == 'MAC':
        word = 'MCC'+word[3:]
    elif word[:2] == 'KN':
        word = 'NN'+word[2:]
    elif word[:1] == 'K':
        word = 'C'+word[1:]
    elif word[:2] in {'PH', 'PF'}:
        word = 'FF'+word[2:]
    elif word[:3] == 'SCH':
        word = 'SSS'+word[3:]
    elif modified:
        if word[:2] == 'WR':
            word = 'RR'+word[2:]
        elif word[:2] == 'RH':
            word = 'RR'+word[2:]
        elif word[:2] == 'DG':
            word = 'GG'+word[2:]
        elif word[:1] in _vowels:
            word = 'A'+word[1:]

    if modified and word[-1] in {'S', 'Z'}:
        word = word[:-1]
        # A word consisting solely of that final S/Z is now empty; return
        # the empty code instead of failing on word[0] below.
        if not word:
            return ''

    # transcode the end of the word
    if word[-2:] == 'EE' or word[-2:] == 'IE' or (modified and
                                                  word[-2:] == 'YE'):
        word = word[:-2]+'Y'
    elif word[-2:] in {'DT', 'RT', 'RD'}:
        word = word[:-2]+'D'
    elif word[-2:] in {'NT', 'ND'}:
        word = word[:-2]+('N' if modified else 'D')
    elif modified:
        if word[-2:] == 'IX':
            word = word[:-2]+'ICK'
        elif word[-2:] == 'EX':
            word = word[:-2]+'ECK'
        elif word[-2:] in {'JR', 'SR'}:
            return 'ERROR'  # TODO: decide how best to return an error

    key = word[0]

    # skip counts the positions already rewritten by a multi-letter rule
    skip = 0
    for i in range(1, len(word)):
        # the KN rule below shortens word, so i can run past its end
        if i >= len(word):
            continue
        elif skip:
            skip -= 1
            continue
        elif word[i:i+2] == 'EV':
            word = word[:i] + 'AF' + word[i+2:]
            skip = 1
        elif word[i] in _vowels:
            word = word[:i] + 'A' + word[i+1:]
        elif modified and i != len(word)-1 and word[i] == 'Y':
            word = word[:i] + 'A' + word[i+1:]
        elif word[i] == 'Q':
            word = word[:i] + 'G' + word[i+1:]
        elif word[i] == 'Z':
            word = word[:i] + 'S' + word[i+1:]
        elif word[i] == 'M':
            word = word[:i] + 'N' + word[i+1:]
        elif word[i:i+2] == 'KN':
            word = word[:i] + 'N' + word[i+2:]
        elif word[i] == 'K':
            word = word[:i] + 'C' + word[i+1:]
        elif modified and i == len(word)-3 and word[i:i+3] == 'SCH':
            word = word[:i] + 'SSA'
            skip = 2
        elif word[i:i+3] == 'SCH':
            word = word[:i] + 'SSS' + word[i+3:]
            skip = 2
        elif modified and i == len(word)-2 and word[i:i+2] == 'SH':
            word = word[:i] + 'SA'
            skip = 1
        elif word[i:i+2] == 'SH':
            word = word[:i] + 'SS' + word[i+2:]
            skip = 1
        elif word[i:i+2] == 'PH':
            word = word[:i] + 'FF' + word[i+2:]
            skip = 1
        elif modified and word[i:i+3] == 'GHT':
            word = word[:i] + 'TTT' + word[i+3:]
            skip = 2
        elif modified and word[i:i+2] == 'DG':
            word = word[:i] + 'GG' + word[i+2:]
            skip = 1
        elif modified and word[i:i+2] == 'WR':
            word = word[:i] + 'RR' + word[i+2:]
            skip = 1
        elif word[i] == 'H' and (word[i-1] not in _vowels or
                                 word[i+1:i+2] not in _vowels):
            # H not flanked by vowels on both sides repeats the previous
            # letter (the slice is '' at the end of the word)
            word = word[:i] + word[i-1] + word[i+1:]
        elif word[i] == 'W' and word[i-1] in _vowels:
            word = word[:i] + word[i-1] + word[i+1:]

        # append the (possibly multi-letter) transcoded piece unless it
        # merely repeats the last key character
        if word[i:i+skip+1] != key[-1:]:
            key += word[i:i+skip+1]

    key = _delete_consecutive_repeats(key)

    # The suffix clean-up below can empty the key (e.g. for 'S' or 'A'), so
    # every check uses a slice, which is safe on an empty string.
    if key[-1:] == 'S':
        key = key[:-1]
    if key[-2:] == 'AY':
        key = key[:-2] + 'Y'
    if key[-1:] == 'A':
        key = key[:-1]
    if modified and key[:1] == 'A':
        key = original_first_char + key[1:]

    if maxlength and maxlength < _INFINITY:
        key = key[:maxlength]

    return key
842
843
844
def mra(word):
    """Return the MRA personal numeric identifier (PNI) for a word.

    The Western Airlines Surname Match Rating Algorithm is described on
    page 18 of
    https://archive.org/details/accessingindivid00moor

    The word is upper-cased, every A/E/I/O/U after the first character is
    dropped, consecutive repeats are collapsed, and anything longer than six
    characters is reduced to its first three plus its last three characters.

    :param str word: the word to transform
    :returns: the MRA PNI (a falsy input is returned unchanged)
    :rtype: str

    >>> mra('Christopher')
    'CHRPHR'
    >>> mra('Niall')
    'NL'
    >>> mra('Smith')
    'SMTH'
    >>> mra('Schmidt')
    'SCHMDT'
    """
    if not word:
        return word

    upper = word.upper().replace('ß', 'SS')

    # The leading character is always kept, vowel or not
    head, tail = upper[0], upper[1:]
    stripped = head + ''.join([ch for ch in tail if ch not in 'AEIOU'])

    pni = _delete_consecutive_repeats(stripped)

    # Codes longer than six characters keep only their two ends
    return pni if len(pni) <= 6 else pni[:3] + pni[-3:]
874
875
876
def metaphone(word, maxlength=_INFINITY):
    """Return the Metaphone code for a word.

    Based on Lawrence Philips' Pick BASIC code from 1990:
    http://aspell.net/metaphone/metaphone.basic
    This incorporates some corrections to the above code, particularly
    some of those suggested by Michael Kuhn in:
    http://aspell.net/metaphone/metaphone-kuhn.txt

    :param str word: the word to transform
    :param int maxlength: the maximum length of the returned Metaphone code
        (defaults to unlimited, but in Philips' original implementation
        this was 4); values below 4 are raised to 4 and None is treated
        as 64. The limit is checked before each input letter is encoded,
        so a final 'X' (encoded as 'KS') may push the result one character
        past the limit.
    :returns: the Metaphone value
    :rtype: str


    >>> metaphone('Christopher')
    'KRSTFR'
    >>> metaphone('Niall')
    'NL'
    >>> metaphone('Smith')
    'SM0'
    >>> metaphone('Schmidt')
    'SKMTT'
    """
    # pylint: disable=too-many-branches
    vowels = {'A', 'E', 'I', 'O', 'U'}
    front_vowels = {'E', 'I', 'Y'}
    # "Variable sound" consonants--those modified by adding an "h"
    h_modified = {'C', 'G', 'P', 'S', 'T'}
    # Consonants that are copied to the code unchanged
    verbatim = {'F', 'J', 'L', 'M', 'N', 'R'}

    # Require a maxlength of at least 4
    maxlength = 64 if maxlength is None else max(4, maxlength)

    # Make all caps and delete nonalphanumeric characters
    name = ''.join([ch for ch in word.upper() if ch.isalnum()])
    name = name.replace('ß', 'SS')
    if not name:
        return ''

    # Simplify special initial letter combinations
    if name[:2] in {'PN', 'AE', 'KN', 'GN', 'WR'}:
        name = name[1:]
    elif name[0] == 'X':
        name = 'S' + name[1:]
    elif name[:2] == 'WH':
        name = 'W' + name[2:]

    # Convert to the Metaphone code, one input letter at a time
    last = len(name) - 1
    code = ''
    for i, char in enumerate(name):
        if len(code) >= maxlength:
            break

        # A letter repeating its predecessor is skipped, except G and T
        if i > 0 and char not in {'G', 'T'} and name[i-1] == char:
            continue

        # Look-ahead slices; these are '' when past the end of the name
        nxt = name[i+1:i+2]
        after = name[i+2:i+3]

        if char in vowels and i == 0:
            code = char

        elif char == 'B':
            # Dropped only when it is the final letter and follows M
            if not (i == last and name[i-1] == 'M'):
                code += char

        elif char == 'C':
            if i > 0 and name[i-1] == 'S' and nxt in front_vowels:
                pass  # nothing is emitted for C in SCE/SCI/SCY
            elif name[i+1:i+3] == 'IA':
                code += 'X'
            elif nxt in front_vowels:
                code += 'S'
            elif i > 0 and name[i-1:i+2] == 'SCH':
                code += 'K'
            elif nxt == 'H':
                hard = i == 0 and i+1 < last and after not in vowels
                code += 'K' if hard else 'X'
            else:
                code += 'K'

        elif char == 'D':
            code += ('J' if nxt == 'G' and after in front_vowels else 'T')

        elif char == 'G':
            silent = (
                # GH followed by a vowel
                (nxt == 'H' and i+1 != last and after in vowels) or
                # non-initial G in a final GN or GNED
                (i > 0 and ((i+1 == last and nxt == 'N') or
                            (i+3 == last and name[i+1:i+4] == 'NED'))) or
                # DG before a front vowel, D not being the first letter
                (i > 1 and i < last and name[i-1] == 'D' and
                 nxt in front_vowels) or
                # first G of a GG pair
                nxt == 'G'
            )
            if not silent:
                if nxt in front_vowels and (i == 0 or name[i-1] != 'G'):
                    code += 'J'
                else:
                    code += 'K'

        elif char == 'H':
            prev = name[i-1] if i > 0 else ''
            after_vowel = prev in vowels and nxt not in vowels
            # Kept unless it follows a vowel without preceding one, or
            # follows one of the "h"-modified consonants
            if not (after_vowel or prev in h_modified):
                code += 'H'

        elif char in verbatim:
            code += char

        elif char == 'K':
            # Skipped after C
            if not (i > 0 and name[i-1] == 'C'):
                code += 'K'

        elif char == 'P':
            code += ('F' if nxt == 'H' else 'P')

        elif char == 'Q':
            code += 'K'

        elif char == 'S':
            if ((0 < i <= last - 2 and name[i+1:i+3] in {'IO', 'IA'}) or
                    nxt == 'H'):
                code += 'X'
            else:
                code += 'S'

        elif char == 'T':
            if 0 < i <= last - 2 and name[i+1:i+3] in {'IA', 'IO'}:
                code += 'X'
            elif nxt == 'H':
                code += '0'
            elif name[i+1:i+3] != 'CH' and name[i-1:i] != 'T':
                code += 'T'

        elif char == 'V':
            code += 'F'

        elif char in {'W', 'Y'}:
            # Kept only when followed by a vowel
            if nxt in vowels:
                code += char

        elif char == 'X':
            code += 'KS'

        elif char == 'Z':
            code += 'S'

    return code
1046
1047
1048
def double_metaphone(word, maxlength=_INFINITY):
1049
    """Return the Double Metaphone code for a word.
1050
1051
    Based on Lawrence Philips' (Visual) C++ code from 1999:
1052
    http://aspell.net/metaphone/dmetaph.cpp
1053
1054
    :param word: the word to transform
1055
    :param maxlength: the maximum length of the returned Double Metaphone codes
1056
        (defaults to unlimited, but in Philips' original implementation this
1057
        was 4)
1058
    :returns: the Double Metaphone value(s)
1059
    :rtype: tuple
1060
1061
    >>> double_metaphone('Christopher')
1062
    ('KRSTFR', '')
1063
    >>> double_metaphone('Niall')
1064
    ('NL', '')
1065
    >>> double_metaphone('Smith')
1066
    ('SM0', 'XMT')
1067
    >>> double_metaphone('Schmidt')
1068
    ('XMT', 'SMT')
1069
    """
1070
    # pylint: disable=too-many-branches
1071
    # Require a maxlength of at least 4
1072
    if maxlength is not None:
1073
        maxlength = max(4, maxlength)
1074
    else:
1075
        maxlength = 64
1076
1077
    primary = ''
1078
    secondary = ''
1079
1080
    def _slavo_germanic():
1081
        """Return True if the word appears to be Slavic or Germanic."""
1082
        if 'W' in word or 'K' in word or 'CZ' in word:
1083
            return True
1084
        return False
1085
1086
    def _metaph_add(pri, sec=''):
1087
        """Return a new metaphone tuple with the supplied elements."""
1088
        newpri = primary
1089
        newsec = secondary
1090
        if pri:
1091
            newpri += pri
1092
        if sec:
1093
            if sec != ' ':
1094
                newsec += sec
1095
        else:
1096
            newsec += pri
1097
        return (newpri, newsec)
1098
1099
    def _is_vowel(pos):
1100
        """Return True if the character at word[pos] is a vowel."""
1101
        if pos >= 0 and word[pos] in {'A', 'E', 'I', 'O', 'U', 'Y'}:
1102
            return True
1103
        return False
1104
1105
    def _get_at(pos):
1106
        """Return the character at word[pos]."""
1107
        return word[pos]
1108
1109
    def _string_at(pos, slen, substrings):
1110
        """Return True if word[pos:pos+slen] is in substrings."""
1111
        if pos < 0:
1112
            return False
1113
        return word[pos:pos+slen] in substrings
1114
1115
    current = 0
1116
    length = len(word)
1117
    if length < 1:
1118
        return ('', '')
1119
    last = length - 1
1120
1121
    word = word.upper()
1122
    word = word.replace('ß', 'SS')
1123
1124
    # Pad the original string so that we can index beyond the edge of the world
1125
    word += '     '
1126
1127
    # Skip these when at start of word
1128
    if word[0:2] in {'GN', 'KN', 'PN', 'WR', 'PS'}:
1129
        current += 1
1130
1131
    # Initial 'X' is pronounced 'Z' e.g. 'Xavier'
1132
    if _get_at(0) == 'X':
1133
        (primary, secondary) = _metaph_add('S')  # 'Z' maps to 'S'
1134
        current += 1
1135
1136
    # Main loop
1137
    while True:
0 ignored issues
show
unused-code introduced by
Too many nested blocks (6/5)
Loading history...
1138
        if current >= length:
1139
            break
1140
1141
        if _get_at(current) in {'A', 'E', 'I', 'O', 'U', 'Y'}:
1142
            if current == 0:
1143
                # All init vowels now map to 'A'
1144
                (primary, secondary) = _metaph_add('A')
1145
            current += 1
1146
            continue
1147
1148
        elif _get_at(current) == 'B':
1149
            # "-mb", e.g", "dumb", already skipped over...
1150
            (primary, secondary) = _metaph_add('P')
1151
            if _get_at(current + 1) == 'B':
1152
                current += 2
1153
            else:
1154
                current += 1
1155
            continue
1156
1157
        elif _get_at(current) == 'Ç':
1158
            (primary, secondary) = _metaph_add('S')
1159
            current += 1
1160
            continue
1161
1162
        elif _get_at(current) == 'C':
1163
            # Various Germanic
1164
            if (current > 1 and not _is_vowel(current - 2) and
0 ignored issues
show
best-practice introduced by
Too many boolean expressions in if statement (6/5)
Loading history...
1165
                    _string_at((current - 1), 3, {'ACH'}) and
1166
                    ((_get_at(current + 2) != 'I') and
1167
                     ((_get_at(current + 2) != 'E') or
1168
                      _string_at((current - 2), 6,
1169
                                 {'BACHER', 'MACHER'})))):
1170
                (primary, secondary) = _metaph_add('K')
1171
                current += 2
1172
                continue
1173
1174
            # Special case 'caesar'
1175
            elif current == 0 and _string_at(current, 6, {'CAESAR'}):
1176
                (primary, secondary) = _metaph_add('S')
1177
                current += 2
1178
                continue
1179
1180
            # Italian 'chianti'
1181
            elif _string_at(current, 4, {'CHIA'}):
1182
                (primary, secondary) = _metaph_add('K')
1183
                current += 2
1184
                continue
1185
1186
            elif _string_at(current, 2, {'CH'}):
1187
                # Find 'Michael'
1188
                if current > 0 and _string_at(current, 4, {'CHAE'}):
1189
                    (primary, secondary) = _metaph_add('K', 'X')
1190
                    current += 2
1191
                    continue
1192
1193
                # Greek roots e.g. 'chemistry', 'chorus'
1194
                elif (current == 0 and
1195
                      (_string_at((current + 1), 5,
1196
                                  {'HARAC', 'HARIS'}) or
1197
                       _string_at((current + 1), 3,
1198
                                  {'HOR', 'HYM', 'HIA', 'HEM'})) and
1199
                      not _string_at(0, 5, {'CHORE'})):
1200
                    (primary, secondary) = _metaph_add('K')
1201
                    current += 2
1202
                    continue
1203
1204
                # Germanic, Greek, or otherwise 'ch' for 'kh' sound
1205
                elif ((_string_at(0, 4, {'VAN ', 'VON '}) or
0 ignored issues
show
best-practice introduced by
Too many boolean expressions in if statement (7/5)
Loading history...
1206
                       _string_at(0, 3, {'SCH'})) or
1207
                      # 'architect but not 'arch', 'orchestra', 'orchid'
1208
                      _string_at((current - 2), 6,
1209
                                 {'ORCHES', 'ARCHIT', 'ORCHID'}) or
1210
                      _string_at((current + 2), 1, {'T', 'S'}) or
1211
                      ((_string_at((current - 1), 1,
1212
                                   {'A', 'O', 'U', 'E'}) or
1213
                        (current == 0)) and
1214
                       # e.g., 'wachtler', 'wechsler', but not 'tichner'
1215
                       _string_at((current + 2), 1,
1216
                                  {'L', 'R', 'N', 'M', 'B', 'H', 'F', 'V', 'W',
1217
                                   ' '}))):
1218
                    (primary, secondary) = _metaph_add('K')
1219
1220
                else:
1221
                    if current > 0:
1222
                        if _string_at(0, 2, {'MC'}):
1223
                            # e.g., "McHugh"
1224
                            (primary, secondary) = _metaph_add('K')
1225
                        else:
1226
                            (primary, secondary) = _metaph_add('X', 'K')
1227
                    else:
1228
                        (primary, secondary) = _metaph_add('X')
1229
1230
                current += 2
1231
                continue
1232
1233
            # e.g, 'czerny'
1234
            elif (_string_at(current, 2, {'CZ'}) and
1235
                  not _string_at((current - 2), 4, {'WICZ'})):
1236
                (primary, secondary) = _metaph_add('S', 'X')
1237
                current += 2
1238
                continue
1239
1240
            # e.g., 'focaccia'
1241
            elif _string_at((current + 1), 3, {'CIA'}):
1242
                (primary, secondary) = _metaph_add('X')
1243
                current += 3
1244
1245
            # double 'C', but not if e.g. 'McClellan'
1246
            elif (_string_at(current, 2, {'CC'}) and
1247
                  not ((current == 1) and (_get_at(0) == 'M'))):
1248
                # 'bellocchio' but not 'bacchus'
1249
                if ((_string_at((current + 2), 1,
1250
                                {'I', 'E', 'H'}) and
1251
                     not _string_at((current + 2), 2, ['HU']))):
1252
                    # 'accident', 'accede' 'succeed'
1253
                    if ((((current == 1) and _get_at(current - 1) == 'A') or
1254
                         _string_at((current - 1), 5,
1255
                                    {'UCCEE', 'UCCES'}))):
1256
                        (primary, secondary) = _metaph_add('KS')
1257
                    # 'bacci', 'bertucci', other italian
1258
                    else:
1259
                        (primary, secondary) = _metaph_add('X')
1260
                    current += 3
1261
                    continue
1262
                else:  # Pierce's rule
1263
                    (primary, secondary) = _metaph_add('K')
1264
                    current += 2
1265
                    continue
1266
1267
            elif _string_at(current, 2, {'CK', 'CG', 'CQ'}):
1268
                (primary, secondary) = _metaph_add('K')
1269
                current += 2
1270
                continue
1271
1272
            elif _string_at(current, 2, {'CI', 'CE', 'CY'}):
1273
                # Italian vs. English
1274
                if _string_at(current, 3, {'CIO', 'CIE', 'CIA'}):
1275
                    (primary, secondary) = _metaph_add('S', 'X')
1276
                else:
1277
                    (primary, secondary) = _metaph_add('S')
1278
                current += 2
1279
                continue
1280
1281
            # else
1282
            else:
1283
                (primary, secondary) = _metaph_add('K')
1284
1285
                # name sent in 'mac caffrey', 'mac gregor
1286
                if _string_at((current + 1), 2, {' C', ' Q', ' G'}):
1287
                    current += 3
1288
                elif (_string_at((current + 1), 1,
1289
                                 {'C', 'K', 'Q'}) and
1290
                      not _string_at((current + 1), 2, {'CE', 'CI'})):
1291
                    current += 2
1292
                else:
1293
                    current += 1
1294
                continue
1295
1296
        elif _get_at(current) == 'D':
1297
            if _string_at(current, 2, {'DG'}):
1298
                if _string_at((current + 2), 1, {'I', 'E', 'Y'}):
1299
                    # e.g. 'edge'
1300
                    (primary, secondary) = _metaph_add('J')
1301
                    current += 3
1302
                    continue
1303
                else:
1304
                    # e.g. 'edgar'
1305
                    (primary, secondary) = _metaph_add('TK')
1306
                    current += 2
1307
                    continue
1308
1309
            elif _string_at(current, 2, {'DT', 'DD'}):
1310
                (primary, secondary) = _metaph_add('T')
1311
                current += 2
1312
                continue
1313
1314
            # else
1315
            else:
1316
                (primary, secondary) = _metaph_add('T')
1317
                current += 1
1318
                continue
1319
1320
        elif _get_at(current) == 'F':
1321
            if _get_at(current + 1) == 'F':
1322
                current += 2
1323
            else:
1324
                current += 1
1325
            (primary, secondary) = _metaph_add('F')
1326
            continue
1327
1328
        elif _get_at(current) == 'G':
1329
            if _get_at(current + 1) == 'H':
1330
                if (current > 0) and not _is_vowel(current - 1):
1331
                    (primary, secondary) = _metaph_add('K')
1332
                    current += 2
1333
                    continue
1334
1335
                # 'ghislane', ghiradelli
1336
                elif current == 0:
1337
                    if _get_at(current + 2) == 'I':
1338
                        (primary, secondary) = _metaph_add('J')
1339
                    else:
1340
                        (primary, secondary) = _metaph_add('K')
1341
                    current += 2
1342
                    continue
1343
1344
                # Parker's rule (with some further refinements) - e.g., 'hugh'
1345
                elif (((current > 1) and
0 ignored issues
show
best-practice introduced by
Too many boolean expressions in if statement (6/5)
Loading history...
1346
                       _string_at((current - 2), 1, {'B', 'H', 'D'})) or
1347
                      # e.g., 'bough'
1348
                      ((current > 2) and
1349
                       _string_at((current - 3), 1, {'B', 'H', 'D'})) or
1350
                      # e.g., 'broughton'
1351
                      ((current > 3) and
1352
                       _string_at((current - 4), 1, {'B', 'H'}))):
1353
                    current += 2
1354
                    continue
1355
                else:
1356
                    # e.g. 'laugh', 'McLaughlin', 'cough',
1357
                    #      'gough', 'rough', 'tough'
1358
                    if ((current > 2) and
1359
                            (_get_at(current - 1) == 'U') and
1360
                            (_string_at((current - 3), 1,
1361
                                        {'C', 'G', 'L', 'R', 'T'}))):
1362
                        (primary, secondary) = _metaph_add('F')
1363
                    elif (current > 0) and _get_at(current - 1) != 'I':
1364
                        (primary, secondary) = _metaph_add('K')
1365
                    current += 2
1366
                    continue
1367
1368
            elif _get_at(current + 1) == 'N':
1369
                if (current == 1) and _is_vowel(0) and not _slavo_germanic():
1370
                    (primary, secondary) = _metaph_add('KN', 'N')
1371
                # not e.g. 'cagney'
1372
                elif (not _string_at((current + 2), 2, {'EY'}) and
1373
                      (_get_at(current + 1) != 'Y') and
1374
                      not _slavo_germanic()):
1375
                    (primary, secondary) = _metaph_add('N', 'KN')
1376
                else:
1377
                    (primary, secondary) = _metaph_add('KN')
1378
                current += 2
1379
                continue
1380
1381
            # 'tagliaro'
1382
            elif (_string_at((current + 1), 2, {'LI'}) and
1383
                  not _slavo_germanic()):
1384
                (primary, secondary) = _metaph_add('KL', 'L')
1385
                current += 2
1386
                continue
1387
1388
            # -ges-, -gep-, -gel-, -gie- at beginning
1389
            elif ((current == 0) and
1390
                  ((_get_at(current + 1) == 'Y') or
1391
                   _string_at((current + 1), 2, {'ES', 'EP', 'EB', 'EL', 'EY',
1392
                                                 'IB', 'IL', 'IN', 'IE', 'EI',
1393
                                                 'ER'}))):
1394
                (primary, secondary) = _metaph_add('K', 'J')
1395
                current += 2
1396
                continue
1397
1398
            #  -ger-,  -gy-
1399
            elif ((_string_at((current + 1), 2, {'ER'}) or
1400
                   (_get_at(current + 1) == 'Y')) and not
1401
                  _string_at(0, 6, {'DANGER', 'RANGER', 'MANGER'}) and not
1402
                  _string_at((current - 1), 1, {'E', 'I'}) and not
1403
                  _string_at((current - 1), 3, {'RGY', 'OGY'})):
1404
                (primary, secondary) = _metaph_add('K', 'J')
1405
                current += 2
1406
                continue
1407
1408
            #  italian e.g, 'biaggi'
1409
            elif (_string_at((current + 1), 1, {'E', 'I', 'Y'}) or
1410
                  _string_at((current - 1), 4, {'AGGI', 'OGGI'})):
1411
                # obvious germanic
1412
                if (((_string_at(0, 4, {'VAN ', 'VON '}) or
1413
                      _string_at(0, 3, {'SCH'})) or
1414
                     _string_at((current + 1), 2, {'ET'}))):
1415
                    (primary, secondary) = _metaph_add('K')
1416
                elif _string_at((current + 1), 4, {'IER '}):
1417
                    (primary, secondary) = _metaph_add('J')
1418
                else:
1419
                    (primary, secondary) = _metaph_add('J', 'K')
1420
                current += 2
1421
                continue
1422
1423
            else:
1424
                if _get_at(current + 1) == 'G':
1425
                    current += 2
1426
                else:
1427
                    current += 1
1428
                (primary, secondary) = _metaph_add('K')
1429
                continue
1430
1431
        elif _get_at(current) == 'H':
1432
            # only keep if first & before vowel or btw. 2 vowels
1433
            if ((((current == 0) or _is_vowel(current - 1)) and
1434
                 _is_vowel(current + 1))):
1435
                (primary, secondary) = _metaph_add('H')
1436
                current += 2
1437
            else:  # also takes care of 'HH'
1438
                current += 1
1439
            continue
1440
1441
        elif _get_at(current) == 'J':
1442
            # obvious spanish, 'jose', 'san jacinto'
1443
            if _string_at(current, 4, ['JOSE']) or _string_at(0, 4, {'SAN '}):
1444
                if ((((current == 0) and (_get_at(current + 4) == ' ')) or
1445
                     _string_at(0, 4, ['SAN ']))):
1446
                    (primary, secondary) = _metaph_add('H')
1447
                else:
1448
                    (primary, secondary) = _metaph_add('J', 'H')
1449
                current += 1
1450
                continue
1451
1452
            elif (current == 0) and not _string_at(current, 4, {'JOSE'}):
1453
                # Yankelovich/Jankelowicz
1454
                (primary, secondary) = _metaph_add('J', 'A')
1455
            # Spanish pron. of e.g. 'bajador'
1456
            elif (_is_vowel(current - 1) and
1457
                  not _slavo_germanic() and
1458
                  ((_get_at(current + 1) == 'A') or
1459
                   (_get_at(current + 1) == 'O'))):
1460
                (primary, secondary) = _metaph_add('J', 'H')
1461
            elif current == last:
1462
                (primary, secondary) = _metaph_add('J', ' ')
1463
            elif (not _string_at((current + 1), 1,
1464
                                 {'L', 'T', 'K', 'S', 'N', 'M', 'B', 'Z'}) and
1465
                  not _string_at((current - 1), 1, {'S', 'K', 'L'})):
1466
                (primary, secondary) = _metaph_add('J')
1467
1468
            if _get_at(current + 1) == 'J':  # it could happen!
1469
                current += 2
1470
            else:
1471
                current += 1
1472
            continue
1473
1474
        elif _get_at(current) == 'K':
1475
            if _get_at(current + 1) == 'K':
1476
                current += 2
1477
            else:
1478
                current += 1
1479
            (primary, secondary) = _metaph_add('K')
1480
            continue
1481
1482
        elif _get_at(current) == 'L':
1483
            if _get_at(current + 1) == 'L':
1484
                # Spanish e.g. 'cabrillo', 'gallegos'
1485
                if (((current == (length - 3)) and
1486
                     _string_at((current - 1), 4, {'ILLO', 'ILLA', 'ALLE'})) or
1487
                        ((_string_at((last - 1), 2, {'AS', 'OS'}) or
1488
                          _string_at(last, 1, {'A', 'O'})) and
1489
                         _string_at((current - 1), 4, {'ALLE'}))):
1490
                    (primary, secondary) = _metaph_add('L', ' ')
1491
                    current += 2
1492
                    continue
1493
                current += 2
1494
            else:
1495
                current += 1
1496
            (primary, secondary) = _metaph_add('L')
1497
            continue
1498
1499
        elif _get_at(current) == 'M':
1500
            if (((_string_at((current - 1), 3, {'UMB'}) and
1501
                  (((current + 1) == last) or
1502
                   _string_at((current + 2), 2, {'ER'}))) or
1503
                 # 'dumb', 'thumb'
1504
                 (_get_at(current + 1) == 'M'))):
1505
                current += 2
1506
            else:
1507
                current += 1
1508
            (primary, secondary) = _metaph_add('M')
1509
            continue
1510
1511
        elif _get_at(current) == 'N':
1512
            if _get_at(current + 1) == 'N':
1513
                current += 2
1514
            else:
1515
                current += 1
1516
            (primary, secondary) = _metaph_add('N')
1517
            continue
1518
1519
        elif _get_at(current) == 'Ñ':
1520
            current += 1
1521
            (primary, secondary) = _metaph_add('N')
1522
            continue
1523
1524
        elif _get_at(current) == 'P':
1525
            if _get_at(current + 1) == 'H':
1526
                (primary, secondary) = _metaph_add('F')
1527
                current += 2
1528
                continue
1529
1530
            # also account for "campbell", "raspberry"
1531
            elif _string_at((current + 1), 1, {'P', 'B'}):
1532
                current += 2
1533
            else:
1534
                current += 1
1535
            (primary, secondary) = _metaph_add('P')
1536
            continue
1537
1538
        elif _get_at(current) == 'Q':
1539
            if _get_at(current + 1) == 'Q':
1540
                current += 2
1541
            else:
1542
                current += 1
1543
            (primary, secondary) = _metaph_add('K')
1544
            continue
1545
1546
        elif _get_at(current) == 'R':
1547
            # french e.g. 'rogier', but exclude 'hochmeier'
1548
            if (((current == last) and
1549
                 not _slavo_germanic() and
1550
                 _string_at((current - 2), 2, {'IE'}) and
1551
                 not _string_at((current - 4), 2, {'ME', 'MA'}))):
1552
                (primary, secondary) = _metaph_add('', 'R')
1553
            else:
1554
                (primary, secondary) = _metaph_add('R')
1555
1556
            if _get_at(current + 1) == 'R':
1557
                current += 2
1558
            else:
1559
                current += 1
1560
            continue
1561
1562
        elif _get_at(current) == 'S':
1563
            # special cases 'island', 'isle', 'carlisle', 'carlysle'
1564
            if _string_at((current - 1), 3, {'ISL', 'YSL'}):
1565
                current += 1
1566
                continue
1567
1568
            # special case 'sugar-'
1569
            elif (current == 0) and _string_at(current, 5, {'SUGAR'}):
1570
                (primary, secondary) = _metaph_add('X', 'S')
1571
                current += 1
1572
                continue
1573
1574
            elif _string_at(current, 2, {'SH'}):
1575
                # Germanic
1576
                if _string_at((current + 1), 4,
1577
                              {'HEIM', 'HOEK', 'HOLM', 'HOLZ'}):
1578
                    (primary, secondary) = _metaph_add('S')
1579
                else:
1580
                    (primary, secondary) = _metaph_add('X')
1581
                current += 2
1582
                continue
1583
1584
            # Italian & Armenian
1585
            elif (_string_at(current, 3, {'SIO', 'SIA'}) or
1586
                  _string_at(current, 4, {'SIAN'})):
1587
                if not _slavo_germanic():
1588
                    (primary, secondary) = _metaph_add('S', 'X')
1589
                else:
1590
                    (primary, secondary) = _metaph_add('S')
1591
                current += 3
1592
                continue
1593
1594
            # German & anglicisations, e.g. 'smith' match 'schmidt',
1595
            #                               'snider' match 'schneider'
1596
            # also, -sz- in Slavic language although in Hungarian it is
1597
            #       pronounced 's'
1598
            elif (((current == 0) and
1599
                   _string_at((current + 1), 1, {'M', 'N', 'L', 'W'})) or
1600
                  _string_at((current + 1), 1, {'Z'})):
1601
                (primary, secondary) = _metaph_add('S', 'X')
1602
                if _string_at((current + 1), 1, {'Z'}):
1603
                    current += 2
1604
                else:
1605
                    current += 1
1606
                continue
1607
1608
            elif _string_at(current, 2, {'SC'}):
1609
                # Schlesinger's rule
1610
                if _get_at(current + 2) == 'H':
1611
                    # dutch origin, e.g. 'school', 'schooner'
1612
                    if _string_at((current + 3), 2,
1613
                                  {'OO', 'ER', 'EN', 'UY', 'ED', 'EM'}):
1614
                        # 'schermerhorn', 'schenker'
1615
                        if _string_at((current + 3), 2, {'ER', 'EN'}):
1616
                            (primary, secondary) = _metaph_add('X', 'SK')
1617
                        else:
1618
                            (primary, secondary) = _metaph_add('SK')
1619
                        current += 3
1620
                        continue
1621
                    else:
1622
                        if (((current == 0) and not _is_vowel(3) and
1623
                             (_get_at(3) != 'W'))):
1624
                            (primary, secondary) = _metaph_add('X', 'S')
1625
                        else:
1626
                            (primary, secondary) = _metaph_add('X')
1627
                        current += 3
1628
                        continue
1629
1630
                elif _string_at((current + 2), 1, {'I', 'E', 'Y'}):
1631
                    (primary, secondary) = _metaph_add('S')
1632
                    current += 3
1633
                    continue
1634
1635
                # else
1636
                else:
1637
                    (primary, secondary) = _metaph_add('SK')
1638
                    current += 3
1639
                    continue
1640
1641
            else:
1642
                # french e.g. 'resnais', 'artois'
1643
                if (current == last) and _string_at((current - 2), 2,
1644
                                                    {'AI', 'OI'}):
1645
                    (primary, secondary) = _metaph_add('', 'S')
1646
                else:
1647
                    (primary, secondary) = _metaph_add('S')
1648
1649
                if _string_at((current + 1), 1, {'S', 'Z'}):
1650
                    current += 2
1651
                else:
1652
                    current += 1
1653
                continue
1654
1655
        elif _get_at(current) == 'T':
1656
            if _string_at(current, 4, {'TION'}):
1657
                (primary, secondary) = _metaph_add('X')
1658
                current += 3
1659
                continue
1660
1661
            elif _string_at(current, 3, {'TIA', 'TCH'}):
1662
                (primary, secondary) = _metaph_add('X')
1663
                current += 3
1664
                continue
1665
1666
            elif (_string_at(current, 2, {'TH'}) or
1667
                  _string_at(current, 3, {'TTH'})):
1668
                # special case 'thomas', 'thames' or germanic
1669
                if ((_string_at((current + 2), 2, {'OM', 'AM'}) or
1670
                     _string_at(0, 4, {'VAN ', 'VON '}) or
1671
                     _string_at(0, 3, {'SCH'}))):
1672
                    (primary, secondary) = _metaph_add('T')
1673
                else:
1674
                    (primary, secondary) = _metaph_add('0', 'T')
1675
                current += 2
1676
                continue
1677
1678
            elif _string_at((current + 1), 1, {'T', 'D'}):
1679
                current += 2
1680
            else:
1681
                current += 1
1682
            (primary, secondary) = _metaph_add('T')
1683
            continue
1684
1685
        elif _get_at(current) == 'V':
1686
            if _get_at(current + 1) == 'V':
1687
                current += 2
1688
            else:
1689
                current += 1
1690
            (primary, secondary) = _metaph_add('F')
1691
            continue
1692
1693
        elif _get_at(current) == 'W':
1694
            # can also be in middle of word
1695
            if _string_at(current, 2, {'WR'}):
1696
                (primary, secondary) = _metaph_add('R')
1697
                current += 2
1698
                continue
1699
            elif ((current == 0) and
1700
                  (_is_vowel(current + 1) or _string_at(current, 2, {'WH'}))):
1701
                # Wasserman should match Vasserman
1702
                if _is_vowel(current + 1):
1703
                    (primary, secondary) = _metaph_add('A', 'F')
1704
                else:
1705
                    # need Uomo to match Womo
1706
                    (primary, secondary) = _metaph_add('A')
1707
1708
            # Arnow should match Arnoff
1709
            if ((((current == last) and _is_vowel(current - 1)) or
1710
                 _string_at((current - 1), 5,
1711
                            {'EWSKI', 'EWSKY', 'OWSKI', 'OWSKY'}) or
1712
                 _string_at(0, 3, ['SCH']))):
1713
                (primary, secondary) = _metaph_add('', 'F')
1714
                current += 1
1715
                continue
1716
            # Polish e.g. 'filipowicz'
1717
            elif _string_at(current, 4, {'WICZ', 'WITZ'}):
1718
                (primary, secondary) = _metaph_add('TS', 'FX')
1719
                current += 4
1720
                continue
1721
            # else skip it
1722
            else:
1723
                current += 1
1724
                continue
1725
1726
        elif _get_at(current) == 'X':
1727
            # French e.g. breaux
1728
            if (not ((current == last) and
1729
                     (_string_at((current - 3), 3, {'IAU', 'EAU'}) or
1730
                      _string_at((current - 2), 2, {'AU', 'OU'})))):
1731
                (primary, secondary) = _metaph_add('KS')
1732
1733
            if _string_at((current + 1), 1, {'C', 'X'}):
1734
                current += 2
1735
            else:
1736
                current += 1
1737
            continue
1738
1739
        elif _get_at(current) == 'Z':
1740
            # Chinese Pinyin e.g. 'zhao'
1741
            if _get_at(current + 1) == 'H':
1742
                (primary, secondary) = _metaph_add('J')
1743
                current += 2
1744
                continue
1745
            elif (_string_at((current + 1), 2, {'ZO', 'ZI', 'ZA'}) or
1746
                  (_slavo_germanic() and ((current > 0) and
1747
                                          _get_at(current - 1) != 'T'))):
1748
                (primary, secondary) = _metaph_add('S', 'TS')
1749
            else:
1750
                (primary, secondary) = _metaph_add('S')
1751
1752
            if _get_at(current + 1) == 'Z':
1753
                current += 2
1754
            else:
1755
                current += 1
1756
            continue
1757
1758
        else:
1759
            current += 1
1760
1761
    if maxlength and maxlength < _INFINITY:
1762
        primary = primary[:maxlength]
1763
        secondary = secondary[:maxlength]
1764
    if primary == secondary:
1765
        secondary = ''
1766
1767
    return (primary, secondary)
1768
1769
1770
def caverphone(word, version=2):
    """Return the Caverphone code for a word.

    A description of version 1 of the algorithm can be found at:
    http://caversham.otago.ac.nz/files/working/ctp060902.pdf

    A description of version 2 of the algorithm can be found at:
    http://caversham.otago.ac.nz/files/working/ctp150804.pdf

    The word is lower-cased and stripped of everything but a-z, then run
    through an ordered series of textual substitutions. The digits '2' and
    '3' are used as temporary placeholders (silent consonant and vowel
    respectively) and are deleted at the end. The result is right-padded
    with '1' and cut to 6 characters (version 1) or 10 characters (any other
    version value).

    :param str word: the word to transform
    :param int version: the version of Caverphone to employ for encoding
        (defaults to 2); any value other than 1 selects the version 2 rules
    :returns: the Caverphone value
    :rtype: str

    >>> caverphone('Christopher')
    'KRSTFA1111'
    >>> caverphone('Niall')
    'NA11111111'
    >>> caverphone('Smith')
    'SMT1111111'
    >>> caverphone('Schmidt')
    'SKMT111111'

    >>> caverphone('Christopher', 1)
    'KRSTF1'
    >>> caverphone('Niall', 1)
    'N11111'
    >>> caverphone('Smith', 1)
    'SMT111'
    >>> caverphone('Schmidt', 1)
    'SKMT11'
    """
    vowels = frozenset('aeiou')
    letters = frozenset('abcdefghijklmnopqrstuvwxyz')
    is_v1 = version == 1

    def _replace_all(text, pairs):
        """Apply each (old, new) replacement to text, in the given order."""
        for old, new in pairs:
            text = text.replace(old, new)
        return text

    def _collapse_runs(text, char, new_char):
        """Convert each run of char in text to one instance of new_char."""
        doubled = char * 2
        while doubled in text:
            text = text.replace(doubled, char)
        return text.replace(char, new_char)

    word = ''.join(char for char in word.lower() if char in letters)

    # Version 2 discards a final 'e' before any other rule is applied
    if not is_v1 and word.endswith('e'):
        word = word[:-1]

    if word:
        # Word-initial '-ough' words: 'gh' becomes '2f'
        ough_prefixes = ['cough', 'rough', 'tough', 'enough']
        if not is_v1:
            ough_prefixes.append('trough')
        for prefix in ough_prefixes:
            if word.startswith(prefix):
                word = prefix[:-2] + '2f' + word[len(prefix):]

        if word.startswith('gn'):
            word = '2n' + word[2:]
        if word.endswith('mb'):
            word = word[:-1] + '2'

        # Order matters here: the multi-letter rules must run before the
        # single-letter rules that would otherwise consume their input.
        word = _replace_all(word, (
            ('cq', '2q'), ('ci', 'si'), ('ce', 'se'), ('cy', 'sy'),
            ('tch', '2ch'), ('c', 'k'), ('q', 'k'), ('x', 'k'), ('v', 'f'),
            ('dg', '2g'), ('tio', 'sio'), ('tia', 'sia'), ('d', 't'),
            ('ph', 'fh'), ('b', 'p'), ('sh', 's2'), ('z', 's'),
        ))

        # An initial vowel is kept as 'A'; every other vowel becomes '3'
        if word[0] in vowels:
            word = 'A' + word[1:]
        for vowel in 'aeiou':
            word = word.replace(vowel, '3')

        if not is_v1:
            word = word.replace('j', 'y')
            if word.startswith('y3'):
                word = 'Y3' + word[2:]
            if word.startswith('y'):
                word = 'A' + word[1:]
            word = word.replace('y', '3')

        word = _replace_all(word, (('3gh3', '3kh3'), ('gh', '22'),
                                   ('g', 'k')))

        for consonant in 'stpkfmn':
            word = _collapse_runs(word, consonant, consonant.upper())

        # w: kept (as 'W') before a vowel, otherwise silent
        word = word.replace('w3', 'W3')
        if is_v1:
            word = word.replace('wy', 'Wy')
        word = word.replace('wh3', 'Wh3')
        if is_v1:
            word = word.replace('why', 'Why')
        elif word.endswith('w'):
            word = word[:-1] + '3'
        word = word.replace('w', '2')

        # h: an initial h counts as a vowel, any other h is silent
        if word.startswith('h'):
            word = 'A' + word[1:]
        word = word.replace('h', '2')

        # r and l follow the same pattern as each other
        for liquid, kept in (('r', 'R'), ('l', 'L')):
            word = word.replace(liquid + '3', kept + '3')
            if is_v1:
                word = word.replace(liquid + 'y', kept + 'y')
            elif word.endswith(liquid):
                word = word[:-1] + '3'
            word = word.replace(liquid, '2')

        if is_v1:
            word = _replace_all(word, (('j', 'y'), ('y3', 'Y3'),
                                       ('y', '2')))

        # Drop the placeholders; version 2 keeps a final vowel as 'A'
        word = word.replace('2', '')
        if not is_v1 and word.endswith('3'):
            word = word[:-1] + 'A'
        word = word.replace('3', '')

    # pad with 1s, then extract the necessary length of code
    code_length = 6 if is_v1 else 10
    return (word + '1' * 10)[:code_length]
1919
1920
1921
def alpha_sis(word, maxlength=14):
    """Return the IBM Alpha Search Inquiry System code for a word.

    Based on the algorithm described in "Accessing individual records from
    personal data files using non-unique identifiers" / Gwendolyn B. Moore,
    et al.; prepared for the Institute for Computer Sciences and Technology,
    National Bureau of Standards, Washington, D.C (1977):
    https://archive.org/stream/accessingindivid00moor#page/15/mode/1up

    Some letter groups have several alternative encodings, so a word can
    produce more than one code. The codes are returned in a tuple because
    order is significant: the first value is the primary coding.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 14);
        clamped to the range [4, 64], with None treated as 64
    :returns: the Alpha SIS value
    :rtype: tuple

    >>> alpha_sis('Christopher')
    ('06401840000000', '07040184000000', '04018400000000')
    >>> alpha_sis('Niall')
    ('02500000000000',)
    >>> alpha_sis('Smith')
    ('03100000000000',)
    >>> alpha_sis('Schmidt')
    ('06310000000000',)
    """
    # (prefix, code) pairs tried in order against the start of the word
    initial_rules = (
        ('GF', '08'), ('GM', '03'), ('GN', '02'), ('KN', '02'),
        ('PF', '08'), ('PN', '02'), ('PS', '00'), ('WR', '04'),
        ('A', '1'), ('E', '1'), ('H', '2'), ('I', '1'), ('J', '3'),
        ('O', '1'), ('U', '1'), ('W', '4'), ('Y', '5'),
    )
    # (letters, alternative codes) pairs tried in order at each position;
    # longer letter groups come first so that they win over single letters
    basic_rules = (
        ('SCH', ('6',)), ('CZ', ('70', '6', '0')), ('CH', ('6', '70', '0')),
        ('CK', ('7', '6')), ('DS', ('0', '10')), ('DZ', ('0', '10')),
        ('TS', ('0', '10')), ('TZ', ('0', '10')), ('CI', ('0',)),
        ('CY', ('0',)), ('CE', ('0',)), ('SH', ('6',)), ('DG', ('7',)),
        ('PH', ('8',)), ('C', ('7', '6')), ('K', ('7', '6')), ('Z', ('0',)),
        ('S', ('0',)), ('D', ('1',)), ('T', ('1',)), ('N', ('2',)),
        ('M', ('3',)), ('R', ('4',)), ('L', ('5',)), ('J', ('6',)),
        ('G', ('7',)), ('Q', ('7',)), ('X', ('7',)), ('F', ('8',)),
        ('V', ('8',)), ('B', ('9',)), ('P', ('9',)),
    )
    uppercase = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    def _trim_doublets(code):
        """Drop a character that repeats its predecessor, in a single pass."""
        idx = 1
        while idx < len(code):
            if code[idx] == code[idx - 1]:
                code = code[:idx] + code[idx + 1:]
            # The index advances even after a deletion, so only one
            # character is removed per matching pair encountered.
            idx += 1
        return code

    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(char for char in word if char in uppercase)

    # Clamp maxlength to [4, 64]
    maxlength = 64 if maxlength is None else min(max(4, maxlength), 64)

    # Encode a special initial substring if there is one, otherwise every
    # code starts with '0'
    codes = ['0']
    pos = 0
    for prefix, initial_code in initial_rules:
        if word.startswith(prefix):
            codes = [initial_code]
            pos = len(prefix)
            break

    # Main encoding loop over the remainder of the word
    while pos < len(word):
        for letters, alternatives in basic_rules:
            if word.startswith(letters, pos):
                # Fan out: every existing code gets every alternative, with
                # all codes for the first alternative listed first
                codes = [code + alternative
                         for alternative in alternatives
                         for code in codes]
                pos += len(letters)
                break
        else:
            # Unencoded letter: leave a placeholder so that equal codes on
            # either side of it are not treated as adjacent
            codes = [code + '_' for code in codes]
            pos += 1

    # Trim doublets and placeholders, then pad/trim each code to length
    padding = '0' * maxlength
    return tuple((_trim_doublets(code).replace('_', '') + padding)[:maxlength]
                 for code in codes)
2026
2027
2028
def fuzzy_soundex(word, maxlength=5, zero_pad=True):
    """Return the Fuzzy Soundex code for a word.

    Fuzzy Soundex is an algorithm derived from Soundex, defined in:
    Holmes, David and M. Catherine McCabe. "Improving Precision and Recall for
    Soundex Retrieval."
    http://wayback.archive.org/web/20100629121128/http://www.ir.iit.edu/publications/downloads/IEEESoundexV5.pdf

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 5);
        clamped to the range [4, 64], with None treated as 64
    :param bool zero_pad: pad the end of the return value with 0s to achieve
        a maxlength string
    :returns: the Fuzzy Soundex value
    :rtype: str

    >>> fuzzy_soundex('Christopher')
    'K6931'
    >>> fuzzy_soundex('Niall')
    'N4000'
    >>> fuzzy_soundex('Smith')
    'S5300'
    """
    # Letter -> code table; '-' marks letters that are dropped (H, W, Y)
    letter_codes = {ord(letter): code for letter, code in
                    zip('ABCDEFGHIJKLMNOPQRSTUVWXYZ',
                        '0193017-07745501769301-7-9')}
    # Rewrites for the first two letters of the word
    initial_rewrites = {'CS': 'SS', 'CZ': 'SS', 'TS': 'SS', 'TZ': 'SS',
                        'GN': 'NN', 'HR': 'RR', 'WR': 'RR', 'HW': 'WW',
                        'KN': 'NN', 'NG': 'NN'}
    # Rewrites for the end of the word; only the first match is applied
    final_rewrites = (('CH', 'KK'), ('NT', 'TT'), ('RT', 'RR'),
                      ('RDT', 'RR'))
    # Substitutions applied everywhere in the word, in this order
    general_rewrites = (
        ('CA', 'KA'), ('CC', 'KK'), ('CK', 'KK'), ('CE', 'SE'),
        ('CHL', 'KL'), ('CL', 'KL'), ('CHR', 'KR'), ('CR', 'KR'),
        ('CI', 'SI'), ('CO', 'KO'), ('CU', 'KU'), ('CY', 'SY'),
        ('DG', 'GG'), ('GH', 'HH'), ('MAC', 'MK'), ('MC', 'MK'),
        ('NST', 'NSS'), ('PF', 'FF'), ('PH', 'FF'), ('SCH', 'SSS'),
        ('TIO', 'SIO'), ('TIA', 'SIO'), ('TCH', 'CHH'),
    )

    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    # Clamp maxlength to [4, 64]
    maxlength = 64 if maxlength is None else min(max(4, maxlength), 64)

    if not word:
        return '0' * maxlength if zero_pad else '0'

    head = word[:2]
    if head in initial_rewrites:
        word = initial_rewrites[head] + word[2:]

    for ending, replacement in final_rewrites:
        if word.endswith(ending):
            word = word[:-len(ending)] + replacement
            break

    for old, new in general_rewrites:
        word = word.replace(old, new)

    sdx = word.translate(letter_codes).replace('-', '')

    # remove repeating characters
    sdx = _delete_consecutive_repeats(sdx)

    # The code always starts with the (rewritten) first letter. H, W and Y
    # were dropped from sdx above, so for them there is no leading code to
    # replace and the letter is simply prepended.
    first_letter = word[0]
    if first_letter not in {'H', 'W', 'Y'}:
        sdx = sdx[1:]
    sdx = (first_letter + sdx).replace('0', '')

    if zero_pad:
        sdx += '0' * maxlength

    return sdx[:maxlength]
2131
2132
2133
def phonex(word, maxlength=4, zero_pad=True):
    """Return the Phonex code for a word.

    Phonex is an algorithm derived from Soundex, defined in:
    Lait, A. J. and B. Randell. "An Assessment of Name Matching Algorithms".
    http://homepages.cs.ncl.ac.uk/brian.randell/Genealogy/NameMatching.pdf

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 4);
        clamped to the range [4, 64], with None treated as 64
    :param bool zero_pad: pad the end of the return value with 0s to achieve
        a maxlength string
    :returns: the Phonex value ('0' for an empty result when zero_pad is
        False)
    :rtype: str

    >>> phonex('Christopher')
    'C623'
    >>> phonex('Niall')
    'N400'
    >>> phonex('Schmidt')
    'S253'
    >>> phonex('Smith')
    'S530'
    """
    vowels = {'A', 'E', 'I', 'O', 'U', 'Y'}
    # Phonetic equivalents of the first two characters
    initial_digraphs = (('KN', 'N'), ('PH', 'F'), ('WR', 'R'))
    # Phonetic equivalence classes for the first character
    initial_letters = {'A': 'A', 'E': 'A', 'I': 'A', 'O': 'A', 'U': 'A',
                       'Y': 'A', 'B': 'B', 'P': 'B', 'V': 'F', 'F': 'F',
                       'C': 'C', 'K': 'C', 'Q': 'C', 'G': 'G', 'J': 'G',
                       'S': 'S', 'Z': 'S'}

    name = unicodedata.normalize('NFKD', text_type(word.upper()))
    name = name.replace('ß', 'SS')

    # Clamp maxlength to [4, 64]
    maxlength = 64 if maxlength is None else min(max(4, maxlength), 64)

    # Remove any trailing Ss
    name = name.rstrip('S')

    for digraph, replacement in initial_digraphs:
        if name.startswith(digraph):
            name = replacement + name[2:]
            break

    # Special case: ignore H as the first letter
    if name.startswith('H'):
        name = name[1:]

    name_code = last = ''
    if name:
        name = initial_letters.get(name[0], name[0]) + name[1:]
        name_code = last = name[0]

    # MODIFIED SOUNDEX CODE
    for idx in range(1, len(name)):
        letter = name[idx]
        following = name[idx + 1:idx + 2]
        is_final = idx + 1 == len(name)

        code = '0'
        if letter in {'B', 'F', 'P', 'V'}:
            code = '1'
        elif letter in {'C', 'G', 'J', 'K', 'Q', 'S', 'X', 'Z'}:
            code = '2'
        elif letter in {'D', 'T'}:
            # D/T directly before C is not encoded
            if following != 'C':
                code = '3'
        elif letter == 'L':
            # L is only encoded before a vowel or at the end of the name
            if following in vowels or is_final:
                code = '4'
        elif letter in {'M', 'N'}:
            # A following D or G is overwritten with this letter, so that
            # it yields the same code on the next pass and is skipped as a
            # duplicate
            if following in {'D', 'G'}:
                name = name[:idx + 1] + letter + name[idx + 2:]
            code = '5'
        elif letter == 'R':
            # R is only encoded before a vowel or at the end of the name
            if following in vowels or is_final:
                code = '6'

        if code != '0' and code != last:
            name_code += code

        # Track the last emitted symbol; uncoded letters do not reset it
        last = name_code[-1]

    if zero_pad:
        name_code += '0' * maxlength
    if not name_code:
        name_code = '0'
    return name_code[:maxlength]
2240
2241
2242
def phonem(word):
2243
    """Return the Phonem code for a word.
2244
2245
    Phonem is defined in Wilde, Georg and Carsten Meyer. 1999. "Doppelgaenger
2246
    gesucht - Ein Programm fuer kontextsensitive phonetische Textumwandlung."
2247
    ct Magazin fuer Computer & Technik 25/1999.
2248
2249
    This version is based on the Perl implementation documented at:
2250
    http://phonetik.phil-fak.uni-koeln.de/fileadmin/home/ritters/Allgemeine_Dateien/Martin_Wilz.pdf
2251
    It includes some enhancements presented in the Java port at:
2252
    https://github.com/dcm4che/dcm4che/blob/master/dcm4che-soundex/src/main/java/org/dcm4che3/soundex/Phonem.java
2253
2254
    Phonem is intended chiefly for German names/words.
2255
2256
    :param str word: the word to transform
2257
    :returns: the Phonem value
2258
    :rtype: str
2259
2260
    >>> phonem('Christopher')
2261
    'CRYSDOVR'
2262
    >>> phonem('Niall')
2263
    'NYAL'
2264
    >>> phonem('Smith')
2265
    'SMYD'
2266
    >>> phonem('Schmidt')
2267
    'CMYD'
2268
    """
2269
    _phonem_substitutions = (('SC', 'C'), ('SZ', 'C'), ('CZ', 'C'),
2270
                             ('TZ', 'C'), ('TS', 'C'), ('KS', 'X'),
2271
                             ('PF', 'V'), ('QU', 'KW'), ('PH', 'V'),
2272
                             ('UE', 'Y'), ('AE', 'E'), ('OE', 'Ö'),
2273
                             ('EI', 'AY'), ('EY', 'AY'), ('EU', 'OY'),
2274
                             ('AU', 'A§'), ('OU', '§'))
2275
    _phonem_translation = dict(zip((ord(_) for _ in
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable _ does not seem to be defined.
Loading history...
2276
                                    'ZKGQÇÑßFWPTÁÀÂÃÅÄÆÉÈÊËIJÌÍÎÏÜݧÚÙÛÔÒÓÕØ'),
2277
                                   'CCCCCNSVVBDAAAAAEEEEEEYYYYYYYYUUUUOOOOÖ'))
2278
2279
    word = unicodedata.normalize('NFC', text_type(word.upper()))
2280
    for i, j in _phonem_substitutions:
2281
        word = word.replace(i, j)
2282
    word = word.translate(_phonem_translation)
2283
2284
    return ''.join(c for c in _delete_consecutive_repeats(word)
2285
                   if c in {'A', 'B', 'C', 'D', 'L', 'M', 'N', 'O', 'R', 'S',
2286
                            'U', 'V', 'W', 'X', 'Y', 'Ö'})
2287
2288
2289
def phonix(word, maxlength=4, zero_pad=True):
    """Return the Phonix code for a word.

    Phonix is a Soundex-like algorithm defined in:
    T.N. Gadd: PHONIX --- The Algorithm, Program 24/4, 1990, p.363-366.

    This implementation is based on
    http://cpansearch.perl.org/src/ULPFR/WAIT-1.800/soundex.c
    http://cs.anu.edu.au/people/Peter.Christen/Febrl/febrl-0.4.01/encode.py
    and
    https://metacpan.org/pod/Text::Phonetic::Phonix

    The word is upper-cased, NFKD-normalized and reduced to the letters A-Z.
    An ordered table of context-sensitive substitutions is then applied, the
    first letter is kept (or replaced by 'v' if it is A, E, I, O, U or Y),
    the remaining letters are mapped to digits, consecutive repeats are
    collapsed and all '0' digits are dropped.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 4);
        the value is clamped to the range [4, 64] and None is treated as 64
    :param bool zero_pad: pad the end of the return value with 0s to achieve
        a maxlength string
    :returns: the Phonix value
    :rtype: str

    >>> phonix('Christopher')
    'K683'
    >>> phonix('Niall')
    'N400'
    >>> phonix('Smith')
    'S530'
    >>> phonix('Schmidt')
    'S530'
    """
    # pylint: disable=too-many-branches
    def _start_repl(word, src, tar, post=None):
        r"""Replace src with tar at the start of word.

        If post is given, src must be followed by one of its members.
        """
        if post:
            for i in post:
                if word.startswith(src+i):
                    return tar + word[len(src):]
        elif word.startswith(src):
            return tar + word[len(src):]
        return word

    def _end_repl(word, src, tar, pre=None):
        r"""Replace src with tar at the end of word.

        If pre is given, src must be preceded by one of its members.
        """
        if pre:
            for i in pre:
                if word.endswith(i+src):
                    return word[:-len(src)] + tar
        elif word.endswith(src):
            return word[:-len(src)] + tar
        return word

    def _mid_repl(word, src, tar, pre=None, post=None):
        r"""Replace src with tar in the middle of word.

        The first and/or last letter is held back from the replacement
        unless a pre/post context on that side already requires a
        neighbouring letter.
        """
        if pre or post:
            if not pre:
                return word[0] + _all_repl(word[1:], src, tar, pre, post)
            elif not post:
                return _all_repl(word[:-1], src, tar, pre, post) + word[-1]
            return _all_repl(word, src, tar, pre, post)
        if len(word) < 3:
            # There is no interior to rewrite.  Building the result as
            # word[0] + '' + word[-1] would turn a one-letter word such as
            # 'B' into 'BB' (and so into the code 'B100' instead of 'B000').
            return word
        return word[0] + _all_repl(word[1:-1], src, tar) + word[-1]

    def _all_repl(word, src, tar, pre=None, post=None):
        r"""Replace src with tar anywhere in word.

        If pre and/or post are given, only occurrences of src preceded by a
        member of pre and/or followed by a member of post are replaced.
        """
        if pre or post:
            # An absent context behaves like "the empty string matches"
            pre = pre or ('',)
            post = post or ('',)
            for i in pre:
                for j in post:
                    word = word.replace(i+src+j, i+tar+j)
            return word
        return word.replace(src, tar)

    _vow = {'A', 'E', 'I', 'O', 'U'}
    _con = {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q',
            'R', 'S', 'T', 'V', 'W', 'X', 'Y', 'Z'}

    # Each rule is (function, src, tar[, context...]); the order of the rules
    # is significant because later rules see the output of earlier ones.
    _phonix_substitutions = ((_all_repl, 'DG', 'G'),
                             (_all_repl, 'CO', 'KO'),
                             (_all_repl, 'CA', 'KA'),
                             (_all_repl, 'CU', 'KU'),
                             (_all_repl, 'CY', 'SI'),
                             (_all_repl, 'CI', 'SI'),
                             (_all_repl, 'CE', 'SE'),
                             (_start_repl, 'CL', 'KL', _vow),
                             (_all_repl, 'CK', 'K'),
                             (_end_repl, 'GC', 'K'),
                             (_end_repl, 'JC', 'K'),
                             (_start_repl, 'CHR', 'KR', _vow),
                             (_start_repl, 'CR', 'KR', _vow),
                             (_start_repl, 'WR', 'R'),
                             (_all_repl, 'NC', 'NK'),
                             (_all_repl, 'CT', 'KT'),
                             (_all_repl, 'PH', 'F'),
                             (_all_repl, 'AA', 'AR'),
                             (_all_repl, 'SCH', 'SH'),
                             (_all_repl, 'BTL', 'TL'),
                             (_all_repl, 'GHT', 'T'),
                             (_all_repl, 'AUGH', 'ARF'),
                             (_mid_repl, 'LJ', 'LD', _vow, _vow),
                             (_all_repl, 'LOUGH', 'LOW'),
                             (_start_repl, 'Q', 'KW'),
                             (_start_repl, 'KN', 'N'),
                             (_end_repl, 'GN', 'N'),
                             (_all_repl, 'GHN', 'N'),
                             (_end_repl, 'GNE', 'N'),
                             (_all_repl, 'GHNE', 'NE'),
                             (_end_repl, 'GNES', 'NS'),
                             (_start_repl, 'GN', 'N'),
                             (_mid_repl, 'GN', 'N', None, _con),
                             (_end_repl, 'GN', 'N'),
                             (_start_repl, 'PS', 'S'),
                             (_start_repl, 'PT', 'T'),
                             (_start_repl, 'CZ', 'C'),
                             (_mid_repl, 'WZ', 'Z', _vow),
                             (_mid_repl, 'CZ', 'CH'),
                             (_all_repl, 'LZ', 'LSH'),
                             (_all_repl, 'RZ', 'RSH'),
                             (_mid_repl, 'Z', 'S', None, _vow),
                             (_all_repl, 'ZZ', 'TS'),
                             (_mid_repl, 'Z', 'TS', _con),
                             (_all_repl, 'HROUG', 'REW'),
                             (_all_repl, 'OUGH', 'OF'),
                             (_mid_repl, 'Q', 'KW', _vow, _vow),
                             (_mid_repl, 'J', 'Y', _vow, _vow),
                             (_start_repl, 'YJ', 'Y', _vow),
                             (_start_repl, 'GH', 'G'),
                             (_end_repl, 'GH', 'E', _vow),
                             (_start_repl, 'CY', 'S'),
                             (_all_repl, 'NX', 'NKS'),
                             (_start_repl, 'PF', 'F'),
                             (_end_repl, 'DT', 'T'),
                             (_end_repl, 'TL', 'TIL'),
                             (_end_repl, 'DL', 'DIL'),
                             (_all_repl, 'YTH', 'ITH'),
                             (_start_repl, 'TJ', 'CH', _vow),
                             (_start_repl, 'TSJ', 'CH', _vow),
                             (_start_repl, 'TS', 'T', _vow),
                             (_all_repl, 'TCH', 'CH'),
                             (_mid_repl, 'WSK', 'VSKIE', _vow),
                             (_end_repl, 'WSK', 'VSKIE', _vow),
                             (_start_repl, 'MN', 'N', _vow),
                             (_start_repl, 'PN', 'N', _vow),
                             (_mid_repl, 'STL', 'SL', _vow),
                             (_end_repl, 'STL', 'SL', _vow),
                             (_end_repl, 'TNT', 'ENT'),
                             (_end_repl, 'EAUX', 'OH'),
                             (_all_repl, 'EXCI', 'ECS'),
                             (_all_repl, 'X', 'ECS'),
                             (_end_repl, 'NED', 'ND'),
                             (_all_repl, 'JR', 'DR'),
                             (_end_repl, 'EE', 'EA'),
                             (_all_repl, 'ZS', 'S'),
                             (_mid_repl, 'R', 'AH', _vow, _con),
                             (_end_repl, 'R', 'AH', _vow),
                             (_mid_repl, 'HR', 'AH', _vow, _con),
                             (_end_repl, 'HR', 'AH', _vow),
                             (_end_repl, 'HR', 'AH', _vow),
                             (_end_repl, 'RE', 'AR'),
                             (_end_repl, 'R', 'AH', _vow),
                             (_all_repl, 'LLE', 'LE'),
                             (_end_repl, 'LE', 'ILE', _con),
                             (_end_repl, 'LES', 'ILES', _con),
                             (_end_repl, 'E', ''),
                             (_end_repl, 'ES', 'S'),
                             (_end_repl, 'SS', 'AS', _vow),
                             (_end_repl, 'MB', 'M', _vow),
                             (_all_repl, 'MPTS', 'MPS'),
                             (_all_repl, 'MPS', 'MS'),
                             (_all_repl, 'MPT', 'MT'))

    # Letter -> digit table applied to everything after the first letter
    _phonix_translation = dict(zip((ord(_) for _ in
                                    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                   '01230720022455012683070808'))

    sdx = ''

    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})
    if word:
        for rule in _phonix_substitutions:
            # A rule must never erase the whole word: the final-E deletion
            # applied to the lone word 'E' would otherwise leave nothing for
            # word[0] below (IndexError).
            word = rule[0](word, *rule[1:]) or word
        if word[0] in {'A', 'E', 'I', 'O', 'U', 'Y'}:
            # All initial vowels (and Y) share the single marker 'v'
            sdx = 'v' + word[1:].translate(_phonix_translation)
        else:
            sdx = word[0] + word[1:].translate(_phonix_translation)
        sdx = _delete_consecutive_repeats(sdx)
        sdx = sdx.replace('0', '')

    # Clamp maxlength to [4, 64]
    if maxlength is not None:
        maxlength = min(max(4, maxlength), 64)
    else:
        maxlength = 64

    if zero_pad:
        sdx += '0' * maxlength
    if not sdx:
        # Only reachable without zero padding and with no encodable letters
        sdx = '0'
    return sdx[:maxlength]
2498
2499
2500
def sfinxbis(word, maxlength=None):
    """Return the SfinxBis code for a word.

    SfinxBis is a Soundex-like algorithm defined in:
    http://www.swami.se/download/18.248ad5af12aa8136533800091/SfinxBis.pdf

    This implementation follows the reference implementation:
    http://www.swami.se/download/18.248ad5af12aa8136533800093/swamiSfinxBis.java.txt

    SfinxBis is intended chiefly for Swedish names.

    The input is split on whitespace and hyphens after nobiliary particles
    have been removed; one code is produced per remaining name part.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to
        unlimited)
    :returns: the SfinxBis value, one code per name part
    :rtype: tuple

    >>> sfinxbis('Christopher')
    ('K68376',)
    >>> sfinxbis('Niall')
    ('N4',)
    >>> sfinxbis('Smith')
    ('S53',)
    >>> sfinxbis('Schmidt')
    ('S53',)

    >>> sfinxbis('Johansson')
    ('J585',)
    >>> sfinxbis('Sjöberg')
    ('#162',)
    """
    # Nobiliary particles; multi-word particles come first
    _particles = (' DE LA ', ' DE LAS ', ' DE LOS ', ' VAN DE ', ' VAN DEN ',
                  ' VAN DER ', ' VON DEM ', ' VON DER ',
                  ' AF ', ' AV ', ' DA ', ' DE ', ' DEL ', ' DEN ', ' DES ',
                  ' DI ', ' DO ', ' DON ', ' DOS ', ' DU ', ' E ', ' IN ',
                  ' LA ', ' LE ', ' MAC ', ' MC ', ' VAN ', ' VON ', ' Y ',
                  ' S:T ')

    _hard_vowels = {'A', 'O', 'U', 'Å'}
    _soft_vowels = {'E', 'I', 'Y', 'Ä', 'Ö'}
    _consonants = {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P',
                   'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Z'}
    _alphabet = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                 'Y', 'Z', 'Ä', 'Å', 'Ö'}

    # Letter -> digit table for everything after the first sound
    _digit_for = {ord(letter): digit for letter, digit in
                  zip('BCDFGHJKLMNPQRSTVZAOUÅEIYÄÖ',
                      '123729224551268378999999999')}

    # Foreign letter -> Swedish letter table
    _swedish_for = {ord(letter): repl for letter, repl in
                    zip('WZÀÁÂÃÆÇÈÉÊËÌÍÎÏÑÒÓÔÕØÙÚÛÜÝ',
                        'VSAAAAÄCEEEEIIIINOOOOÖUUUYY')}

    def _swedishize(name):
        """Return the Swedish-ized form of the name part."""
        for old, new in (('STIERN', 'STJÄRN'), ('HIE', 'HJ'), ('SIÖ', 'SJÖ'),
                         ('SCH', 'SH'), ('QU', 'KV'), ('IO', 'JO'),
                         ('PH', 'F')):
            name = name.replace(old, new)

        # Ü, Y or I directly after a vowel becomes J (hard vowels first)
        for vowels in (_hard_vowels, _soft_vowels):
            for vowel in vowels:
                for glide in ('Ü', 'Y', 'I'):
                    name = name.replace(vowel+glide, vowel+'J')

        # H directly before a consonant is dropped
        if 'H' in name:
            for consonant in _consonants:
                name = name.replace('H'+consonant, consonant)

        name = name.translate(_swedish_for)

        for old, new in (('Ð', 'ETH'), ('Þ', 'TH'), ('ß', 'SS')):
            name = name.replace(old, new)
        return name

    def _code_first_sound(name):
        """Return the name part with its first sound coded."""
        first, second, third = name[0:1], name[1:2], name[2:3]
        pair, triple = name[0:2], name[0:3]

        if first in _soft_vowels or first in _hard_vowels:
            return '$' + name[1:]
        if pair in ('DJ', 'GJ', 'HJ', 'LJ'):
            return 'J' + name[2:]
        if first == 'G' and second in _soft_vowels:
            return 'J' + name[1:]
        if first == 'Q':
            return 'K' + name[1:]
        if pair == 'CH' and (third in _soft_vowels or
                             third in _hard_vowels):
            return '#' + name[2:]
        if first == 'C' and (second in _hard_vowels or
                             second in _consonants):
            return 'K' + name[1:]
        if first == 'X':
            return 'S' + name[1:]
        if first == 'C' and second in _soft_vowels:
            return 'S' + name[1:]
        if triple in ('SKJ', 'STJ', 'SCH'):
            return '#' + name[3:]
        if pair in ('SH', 'KJ', 'TJ', 'SJ'):
            return '#' + name[2:]
        if pair == 'SK' and third in _soft_vowels:
            return '#' + name[2:]
        if first == 'K' and second in _soft_vowels:
            return '#' + name[1:]
        return name

    # Step 1: upper case
    word = unicodedata.normalize('NFC', text_type(word.upper()))
    word = word.replace('ß', 'SS').replace('-', ' ')

    # Step 2: remove nobiliary particles, both inside the name and (without
    # the leading space) at its very start
    for particle in _particles:
        while particle in word:
            word = word.replace(particle, ' ')
        leading = particle[1:]
        if word.startswith(leading):
            word = word[len(leading):]

    # Split the name into its parts
    parts = word.split()
    if not parts:
        return ('',)

    codes = []
    for part in parts:
        # Step 3: remove doubled letters
        part = _delete_consecutive_repeats(part)
        # Step 4: Swedish-ization
        part = _swedishize(part)
        # Step 5: remove every character that is not A-Ö (65-90,196,197,214)
        part = ''.join(c for c in part if c in _alphabet)
        # Step 6: code the first sound
        part = _code_first_sound(part)

        # Step 7: split the name part into first sound and remainder
        head, tail = part[0:1], part[1:]

        # Step 8: phonetic transformation of the remainder
        tail = tail.replace('DT', 'T').replace('X', 'KS')

        # Step 9: code the remainder as digits (C before a soft vowel is 8)
        for vowel in _soft_vowels:
            tail = tail.replace('C'+vowel, '8'+vowel)
        tail = tail.translate(_digit_for)

        # Step 10: remove adjacent duplicates
        tail = _delete_consecutive_repeats(tail)

        # Step 11: remove every "9"
        tail = tail.replace('9', '')

        # Step 12: put the two pieces back together
        codes.append(head + tail)

    # truncate, if maxlength is set
    if maxlength and maxlength < _INFINITY:
        codes = [code[:maxlength] for code in codes]

    return tuple(codes)
2673
2674
2675
def phonet(word, mode=1, lang='de', trace=False):
2676
    """Return the phonet code for a word.
2677
2678
    phonet ("Hannoveraner Phonetik") was developed by Jörg Michael and
2679
    documented in c't magazine vol. 25/1999, p. 252. It is a phonetic
2680
    algorithm designed primarily for German.
2681
    Cf. http://www.heise.de/ct/ftp/99/25/252/
2682
2683
    This is a port of Jesper Zedlitz's code, which is licensed LGPL:
2684
    https://code.google.com/p/phonet4java/source/browse/trunk/src/main/java/com/googlecode/phonet4java/Phonet.java
2685
2686
    That is, in turn, based on Michael's C code, which is also licensed LGPL:
2687
    ftp://ftp.heise.de/pub/ct/listings/phonet.zip
2688
2689
    :param str word: the word to transform
2690
    :param int mode: the phonet variant to employ (1 or 2)
2691
    :param str lang: 'de' (default) for German
2692
            'none' for no language
2693
    :param bool trace: prints debugging info if True
2694
    :returns: the phonet value
2695
    :rtype: str
2696
2697
    >>> phonet('Christopher')
2698
    'KRISTOFA'
2699
    >>> phonet('Niall')
2700
    'NIAL'
2701
    >>> phonet('Smith')
2702
    'SMIT'
2703
    >>> phonet('Schmidt')
2704
    'SHMIT'
2705
2706
    >>> phonet('Christopher', mode=2)
2707
    'KRIZTUFA'
2708
    >>> phonet('Niall', mode=2)
2709
    'NIAL'
2710
    >>> phonet('Smith', mode=2)
2711
    'ZNIT'
2712
    >>> phonet('Schmidt', mode=2)
2713
    'ZNIT'
2714
2715
    >>> phonet('Christopher', lang='none')
2716
    'CHRISTOPHER'
2717
    >>> phonet('Niall', lang='none')
2718
    'NIAL'
2719
    >>> phonet('Smith', lang='none')
2720
    'SMITH'
2721
    >>> phonet('Schmidt', lang='none')
2722
    'SCHMIDT'
2723
    """
2724
    # pylint: disable=too-many-branches
2725
2726
    _phonet_rules_no_lang = (  # separator chars
2727
        '´', ' ', ' ',
2728
        '"', ' ', ' ',
2729
        '`$', '', '',
2730
        '\'', ' ', ' ',
2731
        ',', ',', ',',
2732
        ';', ',', ',',
2733
        '-', ' ', ' ',
2734
        ' ', ' ', ' ',
2735
        '.', '.', '.',
2736
        ':', '.', '.',
2737
        # German umlauts
2738
        'Ä', 'AE', 'AE',
2739
        'Ö', 'OE', 'OE',
2740
        'Ü', 'UE', 'UE',
2741
        'ß', 'S', 'S',
2742
        # international umlauts
2743
        'À', 'A', 'A',
2744
        'Á', 'A', 'A',
2745
        'Â', 'A', 'A',
2746
        'Ã', 'A', 'A',
2747
        'Å', 'A', 'A',
2748
        'Æ', 'AE', 'AE',
2749
        'Ç', 'C', 'C',
2750
        'Ð', 'DJ', 'DJ',
2751
        'È', 'E', 'E',
2752
        'É', 'E', 'E',
2753
        'Ê', 'E', 'E',
2754
        'Ë', 'E', 'E',
2755
        'Ì', 'I', 'I',
2756
        'Í', 'I', 'I',
2757
        'Î', 'I', 'I',
2758
        'Ï', 'I', 'I',
2759
        'Ñ', 'NH', 'NH',
2760
        'Ò', 'O', 'O',
2761
        'Ó', 'O', 'O',
2762
        'Ô', 'O', 'O',
2763
        'Õ', 'O', 'O',
2764
        'Œ', 'OE', 'OE',
2765
        'Ø', 'OE', 'OE',
2766
        'Š', 'SH', 'SH',
2767
        'Þ', 'TH', 'TH',
2768
        'Ù', 'U', 'U',
2769
        'Ú', 'U', 'U',
2770
        'Û', 'U', 'U',
2771
        'Ý', 'Y', 'Y',
2772
        'Ÿ', 'Y', 'Y',
2773
        # 'normal' letters (A-Z)
2774
        'MC^', 'MAC', 'MAC',
2775
        'MC^', 'MAC', 'MAC',
2776
        'M´^', 'MAC', 'MAC',
2777
        'M\'^', 'MAC', 'MAC',
2778
        'O´^', 'O', 'O',
2779
        'O\'^', 'O', 'O',
2780
        'VAN DEN ^', 'VANDEN', 'VANDEN',
2781
        None, None, None)
2782
2783
    _phonet_rules_german = (  # separator chars
2784
        '´', ' ', ' ',
2785
        '"', ' ', ' ',
2786
        '`$', '', '',
2787
        '\'', ' ', ' ',
2788
        ',', ' ', ' ',
2789
        ';', ' ', ' ',
2790
        '-', ' ', ' ',
2791
        ' ', ' ', ' ',
2792
        '.', '.', '.',
2793
        ':', '.', '.',
2794
        # German umlauts
2795
        'ÄE', 'E', 'E',
2796
        'ÄU<', 'EU', 'EU',
2797
        'ÄV(AEOU)-<', 'EW', None,
2798
        'Ä$', 'Ä', None,
2799
        'Ä<', None, 'E',
2800
        'Ä', 'E', None,
2801
        'ÖE', 'Ö', 'Ö',
2802
        'ÖU', 'Ö', 'Ö',
2803
        'ÖVER--<', 'ÖW', None,
2804
        'ÖV(AOU)-', 'ÖW', None,
2805
        'ÜBEL(GNRW)-^^', 'ÜBL ', 'IBL ',
2806
        'ÜBER^^', 'ÜBA', 'IBA',
2807
        'ÜE', 'Ü', 'I',
2808
        'ÜVER--<', 'ÜW', None,
2809
        'ÜV(AOU)-', 'ÜW', None,
2810
        'Ü', None, 'I',
2811
        'ßCH<', None, 'Z',
2812
        'ß<', 'S', 'Z',
2813
        # international umlauts
2814
        'À<', 'A', 'A',
2815
        'Á<', 'A', 'A',
2816
        'Â<', 'A', 'A',
2817
        'Ã<', 'A', 'A',
2818
        'Å<', 'A', 'A',
2819
        'ÆER-', 'E', 'E',
2820
        'ÆU<', 'EU', 'EU',
2821
        'ÆV(AEOU)-<', 'EW', None,
2822
        'Æ$', 'Ä', None,
2823
        'Æ<', None, 'E',
2824
        'Æ', 'E', None,
2825
        'Ç', 'Z', 'Z',
2826
        'ÐÐ-', '', '',
2827
        'Ð', 'DI', 'TI',
2828
        'È<', 'E', 'E',
2829
        'É<', 'E', 'E',
2830
        'Ê<', 'E', 'E',
2831
        'Ë', 'E', 'E',
2832
        'Ì<', 'I', 'I',
2833
        'Í<', 'I', 'I',
2834
        'Î<', 'I', 'I',
2835
        'Ï', 'I', 'I',
2836
        'ÑÑ-', '', '',
2837
        'Ñ', 'NI', 'NI',
2838
        'Ò<', 'O', 'U',
2839
        'Ó<', 'O', 'U',
2840
        'Ô<', 'O', 'U',
2841
        'Õ<', 'O', 'U',
2842
        'Œ<', 'Ö', 'Ö',
2843
        'Ø(IJY)-<', 'E', 'E',
2844
        'Ø<', 'Ö', 'Ö',
2845
        'Š', 'SH', 'Z',
2846
        'Þ', 'T', 'T',
2847
        'Ù<', 'U', 'U',
2848
        'Ú<', 'U', 'U',
2849
        'Û<', 'U', 'U',
2850
        'Ý<', 'I', 'I',
2851
        'Ÿ<', 'I', 'I',
2852
        # 'normal' letters (A-Z)
2853
        'ABELLE$', 'ABL', 'ABL',
2854
        'ABELL$', 'ABL', 'ABL',
2855
        'ABIENNE$', 'ABIN', 'ABIN',
2856
        'ACHME---^', 'ACH', 'AK',
2857
        'ACEY$', 'AZI', 'AZI',
2858
        'ADV', 'ATW', None,
2859
        'AEGL-', 'EK', None,
2860
        'AEU<', 'EU', 'EU',
2861
        'AE2', 'E', 'E',
2862
        'AFTRAUBEN------', 'AFT ', 'AFT ',
2863
        'AGL-1', 'AK', None,
2864
        'AGNI-^', 'AKN', 'AKN',
2865
        'AGNIE-', 'ANI', 'ANI',
2866
        'AGN(AEOU)-$', 'ANI', 'ANI',
2867
        'AH(AIOÖUÜY)-', 'AH', None,
2868
        'AIA2', 'AIA', 'AIA',
2869
        'AIE$', 'E', 'E',
2870
        'AILL(EOU)-', 'ALI', 'ALI',
2871
        'AINE$', 'EN', 'EN',
2872
        'AIRE$', 'ER', 'ER',
2873
        'AIR-', 'E', 'E',
2874
        'AISE$', 'ES', 'EZ',
2875
        'AISSANCE$', 'ESANS', 'EZANZ',
2876
        'AISSE$', 'ES', 'EZ',
2877
        'AIX$', 'EX', 'EX',
2878
        'AJ(AÄEÈÉÊIOÖUÜ)--', 'A', 'A',
2879
        'AKTIE', 'AXIE', 'AXIE',
2880
        'AKTUEL', 'AKTUEL', None,
2881
        'ALOI^', 'ALOI', 'ALUI',  # Don't merge these rules
2882
        'ALOY^', 'ALOI', 'ALUI',  # needed by 'check_rules'
2883
        'AMATEU(RS)-', 'AMATÖ', 'ANATÖ',
2884
        'ANCH(OEI)-', 'ANSH', 'ANZ',
2885
        'ANDERGEGANG----', 'ANDA GE', 'ANTA KE',
2886
        'ANDERGEHE----', 'ANDA ', 'ANTA ',
2887
        'ANDERGESETZ----', 'ANDA GE', 'ANTA KE',
2888
        'ANDERGING----', 'ANDA ', 'ANTA ',
2889
        'ANDERSETZ(ET)-----', 'ANDA ', 'ANTA ',
2890
        'ANDERZUGEHE----', 'ANDA ZU ', 'ANTA ZU ',
2891
        'ANDERZUSETZE-----', 'ANDA ZU ', 'ANTA ZU ',
2892
        'ANER(BKO)---^^', 'AN', None,
2893
        'ANHAND---^$', 'AN H', 'AN ',
2894
        'ANH(AÄEIOÖUÜY)--^^', 'AN', None,
2895
        'ANIELLE$', 'ANIEL', 'ANIL',
2896
        'ANIEL', 'ANIEL', None,
2897
        'ANSTELLE----^$', 'AN ST', 'AN ZT',
2898
        'ANTI^^', 'ANTI', 'ANTI',
2899
        'ANVER^^', 'ANFA', 'ANFA',
2900
        'ATIA$', 'ATIA', 'ATIA',
2901
        'ATIA(NS)--', 'ATI', 'ATI',
2902
        'ATI(AÄOÖUÜ)-', 'AZI', 'AZI',
2903
        'AUAU--', '', '',
2904
        'AUERE$', 'AUERE', None,
2905
        'AUERE(NS)-$', 'AUERE', None,
2906
        'AUERE(AIOUY)--', 'AUER', None,
2907
        'AUER(AÄIOÖUÜY)-', 'AUER', None,
2908
        'AUER<', 'AUA', 'AUA',
2909
        'AUF^^', 'AUF', 'AUF',
2910
        'AULT$', 'O', 'U',
2911
        'AUR(BCDFGKLMNQSTVWZ)-', 'AUA', 'AUA',
2912
        'AUR$', 'AUA', 'AUA',
2913
        'AUSSE$', 'OS', 'UZ',
2914
        'AUS(ST)-^', 'AUS', 'AUS',
2915
        'AUS^^', 'AUS', 'AUS',
2916
        'AUTOFAHR----', 'AUTO ', 'AUTU ',
2917
        'AUTO^^', 'AUTO', 'AUTU',
2918
        'AUX(IY)-', 'AUX', 'AUX',
2919
        'AUX', 'O', 'U',
2920
        'AU', 'AU', 'AU',
2921
        'AVER--<', 'AW', None,
2922
        'AVIER$', 'AWIE', 'AFIE',
2923
        'AV(EÈÉÊI)-^', 'AW', None,
2924
        'AV(AOU)-', 'AW', None,
2925
        'AYRE$', 'EIRE', 'EIRE',
2926
        'AYRE(NS)-$', 'EIRE', 'EIRE',
2927
        'AYRE(AIOUY)--', 'EIR', 'EIR',
2928
        'AYR(AÄIOÖUÜY)-', 'EIR', 'EIR',
2929
        'AYR<', 'EIA', 'EIA',
2930
        'AYER--<', 'EI', 'EI',
2931
        'AY(AÄEIOÖUÜY)--', 'A', 'A',
2932
        'AË', 'E', 'E',
2933
        'A(IJY)<', 'EI', 'EI',
2934
        'BABY^$', 'BEBI', 'BEBI',
2935
        'BAB(IY)^', 'BEBI', 'BEBI',
2936
        'BEAU^$', 'BO', None,
2937
        'BEA(BCMNRU)-^', 'BEA', 'BEA',
2938
        'BEAT(AEIMORU)-^', 'BEAT', 'BEAT',
2939
        'BEE$', 'BI', 'BI',
2940
        'BEIGE^$', 'BESH', 'BEZ',
2941
        'BENOIT--', 'BENO', 'BENU',
2942
        'BER(DT)-', 'BER', None,
2943
        'BERN(DT)-', 'BERN', None,
2944
        'BE(LMNRST)-^', 'BE', 'BE',
2945
        'BETTE$', 'BET', 'BET',
2946
        'BEVOR^$', 'BEFOR', None,
2947
        'BIC$', 'BIZ', 'BIZ',
2948
        'BOWL(EI)-', 'BOL', 'BUL',
2949
        'BP(AÄEÈÉÊIÌÍÎOÖRUÜY)-', 'B', 'B',
2950
        'BRINGEND-----^', 'BRI', 'BRI',
2951
        'BRINGEND-----', ' BRI', ' BRI',
2952
        'BROW(NS)-', 'BRAU', 'BRAU',
2953
        'BUDGET7', 'BÜGE', 'BIKE',
2954
        'BUFFET7', 'BÜFE', 'BIFE',
2955
        'BYLLE$', 'BILE', 'BILE',
2956
        'BYLL$', 'BIL', 'BIL',
2957
        'BYPA--^', 'BEI', 'BEI',
2958
        'BYTE<', 'BEIT', 'BEIT',
2959
        'BY9^', 'BÜ', None,
2960
        'B(SßZ)$', 'BS', None,
2961
        'CACH(EI)-^', 'KESH', 'KEZ',
2962
        'CAE--', 'Z', 'Z',
2963
        'CA(IY)$', 'ZEI', 'ZEI',
2964
        'CE(EIJUY)--', 'Z', 'Z',
2965
        'CENT<', 'ZENT', 'ZENT',
2966
        'CERST(EI)----^', 'KE', 'KE',
2967
        'CER$', 'ZA', 'ZA',
2968
        'CE3', 'ZE', 'ZE',
2969
        'CH\'S$', 'X', 'X',
2970
        'CH´S$', 'X', 'X',
2971
        'CHAO(ST)-', 'KAO', 'KAU',
2972
        'CHAMPIO-^', 'SHEMPI', 'ZENBI',
2973
        'CHAR(AI)-^', 'KAR', 'KAR',
2974
        'CHAU(CDFSVWXZ)-', 'SHO', 'ZU',
2975
        'CHÄ(CF)-', 'SHE', 'ZE',
2976
        'CHE(CF)-', 'SHE', 'ZE',
2977
        'CHEM-^', 'KE', 'KE',  # or: 'CHE', 'KE'
2978
        'CHEQUE<', 'SHEK', 'ZEK',
2979
        'CHI(CFGPVW)-', 'SHI', 'ZI',
2980
        'CH(AEUY)-<^', 'SH', 'Z',
2981
        'CHK-', '', '',
2982
        'CHO(CKPS)-^', 'SHO', 'ZU',
2983
        'CHRIS-', 'KRI', None,
2984
        'CHRO-', 'KR', None,
2985
        'CH(LOR)-<^', 'K', 'K',
2986
        'CHST-', 'X', 'X',
2987
        'CH(SßXZ)3', 'X', 'X',
2988
        'CHTNI-3', 'CHN', 'KN',
2989
        'CH^', 'K', 'K',  # or: 'CH', 'K'
2990
        'CH', 'CH', 'K',
2991
        'CIC$', 'ZIZ', 'ZIZ',
2992
        'CIENCEFICT----', 'EIENS ', 'EIENZ ',
2993
        'CIENCE$', 'EIENS', 'EIENZ',
2994
        'CIER$', 'ZIE', 'ZIE',
2995
        'CYB-^', 'ZEI', 'ZEI',
2996
        'CY9^', 'ZÜ', 'ZI',
2997
        'C(IJY)-<3', 'Z', 'Z',
2998
        'CLOWN-', 'KLAU', 'KLAU',
2999
        'CCH', 'Z', 'Z',
3000
        'CCE-', 'X', 'X',
3001
        'C(CK)-', '', '',
3002
        'CLAUDET---', 'KLO', 'KLU',
3003
        'CLAUDINE^$', 'KLODIN', 'KLUTIN',
3004
        'COACH', 'KOSH', 'KUZ',
3005
        'COLE$', 'KOL', 'KUL',
3006
        'COUCH', 'KAUSH', 'KAUZ',
3007
        'COW', 'KAU', 'KAU',
3008
        'CQUES$', 'K', 'K',
3009
        'CQUE', 'K', 'K',
3010
        'CRASH--9', 'KRE', 'KRE',
3011
        'CREAT-^', 'KREA', 'KREA',
3012
        'CST', 'XT', 'XT',
3013
        'CS<^', 'Z', 'Z',
3014
        'C(SßX)', 'X', 'X',
3015
        'CT\'S$', 'X', 'X',
3016
        'CT(SßXZ)', 'X', 'X',
3017
        'CZ<', 'Z', 'Z',
3018
        'C(ÈÉÊÌÍÎÝ)3', 'Z', 'Z',
3019
        'C.^', 'C.', 'C.',
3020
        'CÄ-', 'Z', 'Z',
3021
        'CÜ$', 'ZÜ', 'ZI',
3022
        'C\'S$', 'X', 'X',
3023
        'C<', 'K', 'K',
3024
        'DAHER^$', 'DAHER', None,
3025
        'DARAUFFOLGE-----', 'DARAUF ', 'TARAUF ',
3026
        'DAVO(NR)-^$', 'DAFO', 'TAFU',
3027
        'DD(SZ)--<', '', '',
3028
        'DD9', 'D', None,
3029
        'DEPOT7', 'DEPO', 'TEBU',
3030
        'DESIGN', 'DISEIN', 'TIZEIN',
3031
        'DE(LMNRST)-3^', 'DE', 'TE',
3032
        'DETTE$', 'DET', 'TET',
3033
        'DH$', 'T', None,
3034
        'DIC$', 'DIZ', 'TIZ',
3035
        'DIDR-^', 'DIT', None,
3036
        'DIEDR-^', 'DIT', None,
3037
        'DJ(AEIOU)-^', 'I', 'I',
3038
        'DMITR-^', 'DIMIT', 'TINIT',
3039
        'DRY9^', 'DRÜ', None,
3040
        'DT-', '', '',
3041
        'DUIS-^', 'DÜ', 'TI',
3042
        'DURCH^^', 'DURCH', 'TURK',
3043
        'DVA$', 'TWA', None,
3044
        'DY9^', 'DÜ', None,
3045
        'DYS$', 'DIS', None,
3046
        'DS(CH)--<', 'T', 'T',
3047
        'DST', 'ZT', 'ZT',
3048
        'DZS(CH)--', 'T', 'T',
3049
        'D(SßZ)', 'Z', 'Z',
3050
        'D(AÄEIOÖRUÜY)-', 'D', None,
3051
        'D(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'D', None,
3052
        'D\'H^', 'D', 'T',
3053
        'D´H^', 'D', 'T',
3054
        'D`H^', 'D', 'T',
3055
        'D\'S3$', 'Z', 'Z',
3056
        'D´S3$', 'Z', 'Z',
3057
        'D^', 'D', None,
3058
        'D', 'T', 'T',
3059
        'EAULT$', 'O', 'U',
3060
        'EAUX$', 'O', 'U',
3061
        'EAU', 'O', 'U',
3062
        'EAV', 'IW', 'IF',
3063
        'EAS3$', 'EAS', None,
3064
        'EA(AÄEIOÖÜY)-3', 'EA', 'EA',
3065
        'EA3$', 'EA', 'EA',
3066
        'EA3', 'I', 'I',
3067
        'EBENSO^$', 'EBNSO', 'EBNZU',
3068
        'EBENSO^^', 'EBNSO ', 'EBNZU ',
3069
        'EBEN^^', 'EBN', 'EBN',
3070
        'EE9', 'E', 'E',
3071
        'EGL-1', 'EK', None,
3072
        'EHE(IUY)--1', 'EH', None,
3073
        'EHUNG---1', 'E', None,
3074
        'EH(AÄIOÖUÜY)-1', 'EH', None,
3075
        'EIEI--', '', '',
3076
        'EIERE^$', 'EIERE', None,
3077
        'EIERE$', 'EIERE', None,
3078
        'EIERE(NS)-$', 'EIERE', None,
3079
        'EIERE(AIOUY)--', 'EIER', None,
3080
        'EIER(AÄIOÖUÜY)-', 'EIER', None,
3081
        'EIER<', 'EIA', None,
3082
        'EIGL-1', 'EIK', None,
3083
        'EIGH$', 'EI', 'EI',
3084
        'EIH--', 'E', 'E',
3085
        'EILLE$', 'EI', 'EI',
3086
        'EIR(BCDFGKLMNQSTVWZ)-', 'EIA', 'EIA',
3087
        'EIR$', 'EIA', 'EIA',
3088
        'EITRAUBEN------', 'EIT ', 'EIT ',
3089
        'EI', 'EI', 'EI',
3090
        'EJ$', 'EI', 'EI',
3091
        'ELIZ^', 'ELIS', None,
3092
        'ELZ^', 'ELS', None,
3093
        'EL-^', 'E', 'E',
3094
        'ELANG----1', 'E', 'E',
3095
        'EL(DKL)--1', 'E', 'E',
3096
        'EL(MNT)--1$', 'E', 'E',
3097
        'ELYNE$', 'ELINE', 'ELINE',
3098
        'ELYN$', 'ELIN', 'ELIN',
3099
        'EL(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'EL', 'EL',
3100
        'EL-1', 'L', 'L',
3101
        'EM-^', None, 'E',
3102
        'EM(DFKMPQT)--1', None, 'E',
3103
        'EM(AÄEÈÉÊIÌÍÎOÖUÜY)--1', None, 'E',
3104
        'EM-1', None, 'N',
3105
        'ENGAG-^', 'ANGA', 'ANKA',
3106
        'EN-^', 'E', 'E',
3107
        'ENTUEL', 'ENTUEL', None,
3108
        'EN(CDGKQSTZ)--1', 'E', 'E',
3109
        'EN(AÄEÈÉÊIÌÍÎNOÖUÜY)-1', 'EN', 'EN',
3110
        'EN-1', '', '',
3111
        'ERH(AÄEIOÖUÜ)-^', 'ERH', 'ER',
3112
        'ER-^', 'E', 'E',
3113
        'ERREGEND-----', ' ER', ' ER',
3114
        'ERT1$', 'AT', None,
3115
        'ER(DGLKMNRQTZß)-1', 'ER', None,
3116
        'ER(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'ER', 'A',
3117
        'ER1$', 'A', 'A',
3118
        'ER<1', 'A', 'A',
3119
        'ETAT7', 'ETA', 'ETA',
3120
        'ETI(AÄOÖÜU)-', 'EZI', 'EZI',
3121
        'EUERE$', 'EUERE', None,
3122
        'EUERE(NS)-$', 'EUERE', None,
3123
        'EUERE(AIOUY)--', 'EUER', None,
3124
        'EUER(AÄIOÖUÜY)-', 'EUER', None,
3125
        'EUER<', 'EUA', None,
3126
        'EUEU--', '', '',
3127
        'EUILLE$', 'Ö', 'Ö',
3128
        'EUR$', 'ÖR', 'ÖR',
3129
        'EUX', 'Ö', 'Ö',
3130
        'EUSZ$', 'EUS', None,
3131
        'EUTZ$', 'EUS', None,
3132
        'EUYS$', 'EUS', 'EUZ',
3133
        'EUZ$', 'EUS', None,
3134
        'EU', 'EU', 'EU',
3135
        'EVER--<1', 'EW', None,
3136
        'EV(ÄOÖUÜ)-1', 'EW', None,
3137
        'EYER<', 'EIA', 'EIA',
3138
        'EY<', 'EI', 'EI',
3139
        'FACETTE', 'FASET', 'FAZET',
3140
        'FANS--^$', 'FE', 'FE',
3141
        'FAN-^$', 'FE', 'FE',
3142
        'FAULT-', 'FOL', 'FUL',
3143
        'FEE(DL)-', 'FI', 'FI',
3144
        'FEHLER', 'FELA', 'FELA',
3145
        'FE(LMNRST)-3^', 'FE', 'FE',
3146
        'FOERDERN---^', 'FÖRD', 'FÖRT',
3147
        'FOERDERN---', ' FÖRD', ' FÖRT',
3148
        'FOND7', 'FON', 'FUN',
3149
        'FRAIN$', 'FRA', 'FRA',
3150
        'FRISEU(RS)-', 'FRISÖ', 'FRIZÖ',
3151
        'FY9^', 'FÜ', None,
3152
        'FÖRDERN---^', 'FÖRD', 'FÖRT',
3153
        'FÖRDERN---', ' FÖRD', ' FÖRT',
3154
        'GAGS^$', 'GEX', 'KEX',
3155
        'GAG^$', 'GEK', 'KEK',
3156
        'GD', 'KT', 'KT',
3157
        'GEGEN^^', 'GEGN', 'KEKN',
3158
        'GEGENGEKOM-----', 'GEGN ', 'KEKN ',
3159
        'GEGENGESET-----', 'GEGN ', 'KEKN ',
3160
        'GEGENKOMME-----', 'GEGN ', 'KEKN ',
3161
        'GEGENZUKOM---', 'GEGN ZU ', 'KEKN ZU ',
3162
        'GENDETWAS-----$', 'GENT ', 'KENT ',
3163
        'GENRE', 'IORE', 'IURE',
3164
        'GE(LMNRST)-3^', 'GE', 'KE',
3165
        'GER(DKT)-', 'GER', None,
3166
        'GETTE$', 'GET', 'KET',
3167
        'GGF.', 'GF.', None,
3168
        'GG-', '', '',
3169
        'GH', 'G', None,
3170
        'GI(AOU)-^', 'I', 'I',
3171
        'GION-3', 'KIO', 'KIU',
3172
        'G(CK)-', '', '',
3173
        'GJ(AEIOU)-^', 'I', 'I',
3174
        'GMBH^$', 'GMBH', 'GMBH',
3175
        'GNAC$', 'NIAK', 'NIAK',
3176
        'GNON$', 'NION', 'NIUN',
3177
        'GN$', 'N', 'N',
3178
        'GONCAL-^', 'GONZA', 'KUNZA',
3179
        'GRY9^', 'GRÜ', None,
3180
        'G(SßXZ)-<', 'K', 'K',
3181
        'GUCK-', 'KU', 'KU',
3182
        'GUISEP-^', 'IUSE', 'IUZE',
3183
        'GUI-^', 'G', 'K',
3184
        'GUTAUSSEH------^', 'GUT ', 'KUT ',
3185
        'GUTGEHEND------^', 'GUT ', 'KUT ',
3186
        'GY9^', 'GÜ', None,
3187
        'G(AÄEILOÖRUÜY)-', 'G', None,
3188
        'G(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'G', None,
3189
        'G\'S$', 'X', 'X',
3190
        'G´S$', 'X', 'X',
3191
        'G^', 'G', None,
3192
        'G', 'K', 'K',
3193
        'HA(HIUY)--1', 'H', None,
3194
        'HANDVOL---^', 'HANT ', 'ANT ',
3195
        'HANNOVE-^', 'HANOF', None,
3196
        'HAVEN7$', 'HAFN', None,
3197
        'HEAD-', 'HE', 'E',
3198
        'HELIEGEN------', 'E ', 'E ',
3199
        'HESTEHEN------', 'E ', 'E ',
3200
        'HE(LMNRST)-3^', 'HE', 'E',
3201
        'HE(LMN)-1', 'E', 'E',
3202
        'HEUR1$', 'ÖR', 'ÖR',
3203
        'HE(HIUY)--1', 'H', None,
3204
        'HIH(AÄEIOÖUÜY)-1', 'IH', None,
3205
        'HLH(AÄEIOÖUÜY)-1', 'LH', None,
3206
        'HMH(AÄEIOÖUÜY)-1', 'MH', None,
3207
        'HNH(AÄEIOÖUÜY)-1', 'NH', None,
3208
        'HOBBY9^', 'HOBI', None,
3209
        'HOCHBEGAB-----^', 'HOCH ', 'UK ',
3210
        'HOCHTALEN-----^', 'HOCH ', 'UK ',
3211
        'HOCHZUFRI-----^', 'HOCH ', 'UK ',
3212
        'HO(HIY)--1', 'H', None,
3213
        'HRH(AÄEIOÖUÜY)-1', 'RH', None,
3214
        'HUH(AÄEIOÖUÜY)-1', 'UH', None,
3215
        'HUIS^^', 'HÜS', 'IZ',
3216
        'HUIS$', 'ÜS', 'IZ',
3217
        'HUI--1', 'H', None,
3218
        'HYGIEN^', 'HÜKIEN', None,
3219
        'HY9^', 'HÜ', None,
3220
        'HY(BDGMNPST)-', 'Ü', None,
3221
        'H.^', None, 'H.',
3222
        'HÄU--1', 'H', None,
3223
        'H^', 'H', '',
3224
        'H', '', '',
3225
        'ICHELL---', 'ISH', 'IZ',
3226
        'ICHI$', 'ISHI', 'IZI',
3227
        'IEC$', 'IZ', 'IZ',
3228
        'IEDENSTELLE------', 'IDN ', 'ITN ',
3229
        'IEI-3', '', '',
3230
        'IELL3', 'IEL', 'IEL',
3231
        'IENNE$', 'IN', 'IN',
3232
        'IERRE$', 'IER', 'IER',
3233
        'IERZULAN---', 'IR ZU ', 'IR ZU ',
3234
        'IETTE$', 'IT', 'IT',
3235
        'IEU', 'IÖ', 'IÖ',
3236
        'IE<4', 'I', 'I',
3237
        'IGL-1', 'IK', None,
3238
        'IGHT3$', 'EIT', 'EIT',
3239
        'IGNI(EO)-', 'INI', 'INI',
3240
        'IGN(AEOU)-$', 'INI', 'INI',
3241
        'IHER(DGLKRT)--1', 'IHE', None,
3242
        'IHE(IUY)--', 'IH', None,
3243
        'IH(AIOÖUÜY)-', 'IH', None,
3244
        'IJ(AOU)-', 'I', 'I',
3245
        'IJ$', 'I', 'I',
3246
        'IJ<', 'EI', 'EI',
3247
        'IKOLE$', 'IKOL', 'IKUL',
3248
        'ILLAN(STZ)--4', 'ILIA', 'ILIA',
3249
        'ILLAR(DT)--4', 'ILIA', 'ILIA',
3250
        'IMSTAN----^', 'IM ', 'IN ',
3251
        'INDELERREGE------', 'INDL ', 'INTL ',
3252
        'INFRAGE-----^$', 'IN ', 'IN ',
3253
        'INTERN(AOU)-^', 'INTAN', 'INTAN',
3254
        'INVER-', 'INWE', 'INFE',
3255
        'ITI(AÄIOÖUÜ)-', 'IZI', 'IZI',
3256
        'IUSZ$', 'IUS', None,
3257
        'IUTZ$', 'IUS', None,
3258
        'IUZ$', 'IUS', None,
3259
        'IVER--<', 'IW', None,
3260
        'IVIER$', 'IWIE', 'IFIE',
3261
        'IV(ÄOÖUÜ)-', 'IW', None,
3262
        'IV<3', 'IW', None,
3263
        'IY2', 'I', None,
3264
        'I(ÈÉÊ)<4', 'I', 'I',
3265
        'JAVIE---<^', 'ZA', 'ZA',
3266
        'JEANS^$', 'JINS', 'INZ',
3267
        'JEANNE^$', 'IAN', 'IAN',
3268
        'JEAN-^', 'IA', 'IA',
3269
        'JER-^', 'IE', 'IE',
3270
        'JE(LMNST)-', 'IE', 'IE',
3271
        'JI^', 'JI', None,
3272
        'JOR(GK)^$', 'IÖRK', 'IÖRK',
3273
        'J', 'I', 'I',
3274
        'KC(ÄEIJ)-', 'X', 'X',
3275
        'KD', 'KT', None,
3276
        'KE(LMNRST)-3^', 'KE', 'KE',
3277
        'KG(AÄEILOÖRUÜY)-', 'K', None,
3278
        'KH<^', 'K', 'K',
3279
        'KIC$', 'KIZ', 'KIZ',
3280
        'KLE(LMNRST)-3^', 'KLE', 'KLE',
3281
        'KOTELE-^', 'KOTL', 'KUTL',
3282
        'KREAT-^', 'KREA', 'KREA',
3283
        'KRÜS(TZ)--^', 'KRI', None,
3284
        'KRYS(TZ)--^', 'KRI', None,
3285
        'KRY9^', 'KRÜ', None,
3286
        'KSCH---', 'K', 'K',
3287
        'KSH--', 'K', 'K',
3288
        'K(SßXZ)7', 'X', 'X',  # implies 'KST' -> 'XT'
3289
        'KT\'S$', 'X', 'X',
3290
        'KTI(AIOU)-3', 'XI', 'XI',
3291
        'KT(SßXZ)', 'X', 'X',
3292
        'KY9^', 'KÜ', None,
3293
        'K\'S$', 'X', 'X',
3294
        'K´S$', 'X', 'X',
3295
        'LANGES$', ' LANGES', ' LANKEZ',
3296
        'LANGE$', ' LANGE', ' LANKE',
3297
        'LANG$', ' LANK', ' LANK',
3298
        'LARVE-', 'LARF', 'LARF',
3299
        'LD(SßZ)$', 'LS', 'LZ',
3300
        'LD\'S$', 'LS', 'LZ',
3301
        'LD´S$', 'LS', 'LZ',
3302
        'LEAND-^', 'LEAN', 'LEAN',
3303
        'LEERSTEHE-----^', 'LER ', 'LER ',
3304
        'LEICHBLEIB-----', 'LEICH ', 'LEIK ',
3305
        'LEICHLAUTE-----', 'LEICH ', 'LEIK ',
3306
        'LEIDERREGE------', 'LEIT ', 'LEIT ',
3307
        'LEIDGEPR----^', 'LEIT ', 'LEIT ',
3308
        'LEINSTEHE-----', 'LEIN ', 'LEIN ',
3309
        'LEL-', 'LE', 'LE',
3310
        'LE(MNRST)-3^', 'LE', 'LE',
3311
        'LETTE$', 'LET', 'LET',
3312
        'LFGNAG-', 'LFGAN', 'LFKAN',
3313
        'LICHERWEIS----', 'LICHA ', 'LIKA ',
3314
        'LIC$', 'LIZ', 'LIZ',
3315
        'LIVE^$', 'LEIF', 'LEIF',
3316
        'LT(SßZ)$', 'LS', 'LZ',
3317
        'LT\'S$', 'LS', 'LZ',
3318
        'LT´S$', 'LS', 'LZ',
3319
        'LUI(GS)--', 'LU', 'LU',
3320
        'LV(AIO)-', 'LW', None,
3321
        'LY9^', 'LÜ', None,
3322
        'LSTS$', 'LS', 'LZ',
3323
        'LZ(BDFGKLMNPQRSTVWX)-', 'LS', None,
3324
        'L(SßZ)$', 'LS', None,
3325
        'MAIR-<', 'MEI', 'NEI',
3326
        'MANAG-', 'MENE', 'NENE',
3327
        'MANUEL', 'MANUEL', None,
3328
        'MASSEU(RS)-', 'MASÖ', 'NAZÖ',
3329
        'MATCH', 'MESH', 'NEZ',
3330
        'MAURICE', 'MORIS', 'NURIZ',
3331
        'MBH^$', 'MBH', 'MBH',
3332
        'MB(ßZ)$', 'MS', None,
3333
        'MB(SßTZ)-', 'M', 'N',
3334
        'MCG9^', 'MAK', 'NAK',
3335
        'MC9^', 'MAK', 'NAK',
3336
        'MEMOIR-^', 'MEMOA', 'NENUA',
3337
        'MERHAVEN$', 'MAHAFN', None,
3338
        'ME(LMNRST)-3^', 'ME', 'NE',
3339
        'MEN(STZ)--3', 'ME', None,
3340
        'MEN$', 'MEN', None,
3341
        'MIGUEL-', 'MIGE', 'NIKE',
3342
        'MIKE^$', 'MEIK', 'NEIK',
3343
        'MITHILFE----^$', 'MIT H', 'NIT ',
3344
        'MN$', 'M', None,
3345
        'MN', 'N', 'N',
3346
        'MPJUTE-', 'MPUT', 'NBUT',
3347
        'MP(ßZ)$', 'MS', None,
3348
        'MP(SßTZ)-', 'M', 'N',
3349
        'MP(BDJLMNPQVW)-', 'MB', 'NB',
3350
        'MY9^', 'MÜ', None,
3351
        'M(ßZ)$', 'MS', None,
3352
        'M´G7^', 'MAK', 'NAK',
3353
        'M\'G7^', 'MAK', 'NAK',
3354
        'M´^', 'MAK', 'NAK',
3355
        'M\'^', 'MAK', 'NAK',
3356
        'M', None, 'N',
3357
        'NACH^^', 'NACH', 'NAK',
3358
        'NADINE', 'NADIN', 'NATIN',
3359
        'NAIV--', 'NA', 'NA',
3360
        'NAISE$', 'NESE', 'NEZE',
3361
        'NAUGENOMM------', 'NAU ', 'NAU ',
3362
        'NAUSOGUT$', 'NAUSO GUT', 'NAUZU KUT',
3363
        'NCH$', 'NSH', 'NZ',
3364
        'NCOISE$', 'SOA', 'ZUA',
3365
        'NCOIS$', 'SOA', 'ZUA',
3366
        'NDAR$', 'NDA', 'NTA',
3367
        'NDERINGEN------', 'NDE ', 'NTE ',
3368
        'NDRO(CDKTZ)-', 'NTRO', None,
3369
        'ND(BFGJLMNPQVW)-', 'NT', None,
3370
        'ND(SßZ)$', 'NS', 'NZ',
3371
        'ND\'S$', 'NS', 'NZ',
3372
        'ND´S$', 'NS', 'NZ',
3373
        'NEBEN^^', 'NEBN', 'NEBN',
3374
        'NENGELERN------', 'NEN ', 'NEN ',
3375
        'NENLERN(ET)---', 'NEN LE', 'NEN LE',
3376
        'NENZULERNE---', 'NEN ZU LE', 'NEN ZU LE',
3377
        'NE(LMNRST)-3^', 'NE', 'NE',
3378
        'NEN-3', 'NE', 'NE',
3379
        'NETTE$', 'NET', 'NET',
3380
        'NGU^^', 'NU', 'NU',
3381
        'NG(BDFJLMNPQRTVW)-', 'NK', 'NK',
3382
        'NH(AUO)-$', 'NI', 'NI',
3383
        'NICHTSAHNEN-----', 'NIX ', 'NIX ',
3384
        'NICHTSSAGE----', 'NIX ', 'NIX ',
3385
        'NICHTS^^', 'NIX', 'NIX',
3386
        'NICHT^^', 'NICHT', 'NIKT',
3387
        'NINE$', 'NIN', 'NIN',
3388
        'NON^^', 'NON', 'NUN',
3389
        'NOTLEIDE-----^', 'NOT ', 'NUT ',
3390
        'NOT^^', 'NOT', 'NUT',
3391
        'NTI(AIOU)-3', 'NZI', 'NZI',
3392
        'NTIEL--3', 'NZI', 'NZI',
3393
        'NT(SßZ)$', 'NS', 'NZ',
3394
        'NT\'S$', 'NS', 'NZ',
3395
        'NT´S$', 'NS', 'NZ',
3396
        'NYLON', 'NEILON', 'NEILUN',
3397
        'NY9^', 'NÜ', None,
3398
        'NSTZUNEH---', 'NST ZU ', 'NZT ZU ',
3399
        'NSZ-', 'NS', None,
3400
        'NSTS$', 'NS', 'NZ',
3401
        'NZ(BDFGKLMNPQRSTVWX)-', 'NS', None,
3402
        'N(SßZ)$', 'NS', None,
3403
        'OBERE-', 'OBER', None,
3404
        'OBER^^', 'OBA', 'UBA',
3405
        'OEU2', 'Ö', 'Ö',
3406
        'OE<2', 'Ö', 'Ö',
3407
        'OGL-', 'OK', None,
3408
        'OGNIE-', 'ONI', 'UNI',
3409
        'OGN(AEOU)-$', 'ONI', 'UNI',
3410
        'OH(AIOÖUÜY)-', 'OH', None,
3411
        'OIE$', 'Ö', 'Ö',
3412
        'OIRE$', 'OA', 'UA',
3413
        'OIR$', 'OA', 'UA',
3414
        'OIX', 'OA', 'UA',
3415
        'OI<3', 'EU', 'EU',
3416
        'OKAY^$', 'OKE', 'UKE',
3417
        'OLYN$', 'OLIN', 'ULIN',
3418
        'OO(DLMZ)-', 'U', None,
3419
        'OO$', 'U', None,
3420
        'OO-', '', '',
3421
        'ORGINAL-----', 'ORI', 'URI',
3422
        'OTI(AÄOÖUÜ)-', 'OZI', 'UZI',
3423
        'OUI^', 'WI', 'FI',
3424
        'OUILLE$', 'ULIE', 'ULIE',
3425
        'OU(DT)-^', 'AU', 'AU',
3426
        'OUSE$', 'AUS', 'AUZ',
3427
        'OUT-', 'AU', 'AU',
3428
        'OU', 'U', 'U',
3429
        'O(FV)$', 'AU', 'AU',  # due to 'OW$' -> 'AU'
3430
        'OVER--<', 'OW', None,
3431
        'OV(AOU)-', 'OW', None,
3432
        'OW$', 'AU', 'AU',
3433
        'OWS$', 'OS', 'UZ',
3434
        'OJ(AÄEIOÖUÜ)--', 'O', 'U',
3435
        'OYER', 'OIA', None,
3436
        'OY(AÄEIOÖUÜ)--', 'O', 'U',
3437
        'O(JY)<', 'EU', 'EU',
3438
        'OZ$', 'OS', None,
3439
        'O´^', 'O', 'U',
3440
        'O\'^', 'O', 'U',
3441
        'O', None, 'U',
3442
        'PATIEN--^', 'PAZI', 'PAZI',
3443
        'PENSIO-^', 'PANSI', 'PANZI',
3444
        'PE(LMNRST)-3^', 'PE', 'PE',
3445
        'PFER-^', 'FE', 'FE',
3446
        'P(FH)<', 'F', 'F',
3447
        'PIC^$', 'PIK', 'PIK',
3448
        'PIC$', 'PIZ', 'PIZ',
3449
        'PIPELINE', 'PEIBLEIN', 'PEIBLEIN',
3450
        'POLYP-', 'POLÜ', None,
3451
        'POLY^^', 'POLI', 'PULI',
3452
        'PORTRAIT7', 'PORTRE', 'PURTRE',
3453
        'POWER7', 'PAUA', 'PAUA',
3454
        'PP(FH)--<', 'B', 'B',
3455
        'PP-', '', '',
3456
        'PRODUZ-^', 'PRODU', 'BRUTU',
3457
        'PRODUZI--', ' PRODU', ' BRUTU',
3458
        'PRIX^$', 'PRI', 'PRI',
3459
        'PS-^^', 'P', None,
3460
        'P(SßZ)^', None, 'Z',
3461
        'P(SßZ)$', 'BS', None,
3462
        'PT-^', '', '',
3463
        'PTI(AÄOÖUÜ)-3', 'BZI', 'BZI',
3464
        'PY9^', 'PÜ', None,
3465
        'P(AÄEIOÖRUÜY)-', 'P', 'P',
3466
        'P(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'P', None,
3467
        'P.^', None, 'P.',
3468
        'P^', 'P', None,
3469
        'P', 'B', 'B',
3470
        'QI-', 'Z', 'Z',
3471
        'QUARANT--', 'KARA', 'KARA',
3472
        'QUE(LMNRST)-3', 'KWE', 'KFE',
3473
        'QUE$', 'K', 'K',
3474
        'QUI(NS)$', 'KI', 'KI',
3475
        'QUIZ7', 'KWIS', None,
3476
        'Q(UV)7', 'KW', 'KF',
3477
        'Q<', 'K', 'K',
3478
        'RADFAHR----', 'RAT ', 'RAT ',
3479
        'RAEFTEZEHRE-----', 'REFTE ', 'REFTE ',
3480
        'RCH', 'RCH', 'RK',
3481
        'REA(DU)---3^', 'R', None,
3482
        'REBSERZEUG------', 'REBS ', 'REBZ ',
3483
        'RECHERCH^', 'RESHASH', 'REZAZ',
3484
        'RECYCL--', 'RIZEI', 'RIZEI',
3485
        'RE(ALST)-3^', 'RE', None,
3486
        'REE$', 'RI', 'RI',
3487
        'RER$', 'RA', 'RA',
3488
        'RE(MNR)-4', 'RE', 'RE',
3489
        'RETTE$', 'RET', 'RET',
3490
        'REUZ$', 'REUZ', None,
3491
        'REW$', 'RU', 'RU',
3492
        'RH<^', 'R', 'R',
3493
        'RJA(MN)--', 'RI', 'RI',
3494
        'ROWD-^', 'RAU', 'RAU',
3495
        'RTEMONNAIE-', 'RTMON', 'RTNUN',
3496
        'RTI(AÄOÖUÜ)-3', 'RZI', 'RZI',
3497
        'RTIEL--3', 'RZI', 'RZI',
3498
        'RV(AEOU)-3', 'RW', None,
3499
        'RY(KN)-$', 'RI', 'RI',
3500
        'RY9^', 'RÜ', None,
3501
        'RÄFTEZEHRE-----', 'REFTE ', 'REFTE ',
3502
        'SAISO-^', 'SES', 'ZEZ',
3503
        'SAFE^$', 'SEIF', 'ZEIF',
3504
        'SAUCE-^', 'SOS', 'ZUZ',
3505
        'SCHLAGGEBEN-----<', 'SHLAK ', 'ZLAK ',
3506
        'SCHSCH---7', '', '',
3507
        'SCHTSCH', 'SH', 'Z',
3508
        'SC(HZ)<', 'SH', 'Z',
3509
        'SC', 'SK', 'ZK',
3510
        'SELBSTST--7^^', 'SELB', 'ZELB',
3511
        'SELBST7^^', 'SELBST', 'ZELBZT',
3512
        'SERVICE7^', 'SÖRWIS', 'ZÖRFIZ',
3513
        'SERVI-^', 'SERW', None,
3514
        'SE(LMNRST)-3^', 'SE', 'ZE',
3515
        'SETTE$', 'SET', 'ZET',
3516
        'SHP-^', 'S', 'Z',
3517
        'SHST', 'SHT', 'ZT',
3518
        'SHTSH', 'SH', 'Z',
3519
        'SHT', 'ST', 'Z',
3520
        'SHY9^', 'SHÜ', None,
3521
        'SH^^', 'SH', None,
3522
        'SH3', 'SH', 'Z',
3523
        'SICHERGEGAN-----^', 'SICHA ', 'ZIKA ',
3524
        'SICHERGEHE----^', 'SICHA ', 'ZIKA ',
3525
        'SICHERGESTEL------^', 'SICHA ', 'ZIKA ',
3526
        'SICHERSTELL-----^', 'SICHA ', 'ZIKA ',
3527
        'SICHERZU(GS)--^', 'SICHA ZU ', 'ZIKA ZU ',
3528
        'SIEGLI-^', 'SIKL', 'ZIKL',
3529
        'SIGLI-^', 'SIKL', 'ZIKL',
3530
        'SIGHT', 'SEIT', 'ZEIT',
3531
        'SIGN', 'SEIN', 'ZEIN',
3532
        'SKI(NPZ)-', 'SKI', 'ZKI',
3533
        'SKI<^', 'SHI', 'ZI',
3534
        'SODASS^$', 'SO DAS', 'ZU TAZ',
3535
        'SODAß^$', 'SO DAS', 'ZU TAZ',
3536
        'SOGENAN--^', 'SO GEN', 'ZU KEN',
3537
        'SOUND-', 'SAUN', 'ZAUN',
3538
        'STAATS^^', 'STAZ', 'ZTAZ',
3539
        'STADT^^', 'STAT', 'ZTAT',
3540
        'STANDE$', ' STANDE', ' ZTANTE',
3541
        'START^^', 'START', 'ZTART',
3542
        'STAURANT7', 'STORAN', 'ZTURAN',
3543
        'STEAK-', 'STE', 'ZTE',
3544
        'STEPHEN-^$', 'STEW', None,
3545
        'STERN', 'STERN', None,
3546
        'STRAF^^', 'STRAF', 'ZTRAF',
3547
        'ST\'S$', 'Z', 'Z',
3548
        'ST´S$', 'Z', 'Z',
3549
        'STST--', '', '',
3550
        'STS(ACEÈÉÊHIÌÍÎOUÄÜÖ)--', 'ST', 'ZT',
3551
        'ST(SZ)', 'Z', 'Z',
3552
        'SPAREN---^', 'SPA', 'ZPA',
3553
        'SPAREND----', ' SPA', ' ZPA',
3554
        'S(PTW)-^^', 'S', None,
3555
        'SP', 'SP', None,
3556
        'STYN(AE)-$', 'STIN', 'ZTIN',
3557
        'ST', 'ST', 'ZT',
3558
        'SUITE<', 'SIUT', 'ZIUT',
3559
        'SUKE--$', 'S', 'Z',
3560
        'SURF(EI)-', 'SÖRF', 'ZÖRF',
3561
        'SV(AEÈÉÊIÌÍÎOU)-<^', 'SW', None,
3562
        'SYB(IY)--^', 'SIB', None,
3563
        'SYL(KVW)--^', 'SI', None,
3564
        'SY9^', 'SÜ', None,
3565
        'SZE(NPT)-^', 'ZE', 'ZE',
3566
        'SZI(ELN)-^', 'ZI', 'ZI',
3567
        'SZCZ<', 'SH', 'Z',
3568
        'SZT<', 'ST', 'ZT',
3569
        'SZ<3', 'SH', 'Z',
3570
        'SÜL(KVW)--^', 'SI', None,
3571
        'S', None, 'Z',
3572
        'TCH', 'SH', 'Z',
3573
        'TD(AÄEIOÖRUÜY)-', 'T', None,
3574
        'TD(ÀÁÂÃÅÈÉÊËÌÍÎÏÒÓÔÕØÙÚÛÝŸ)-', 'T', None,
3575
        'TEAT-^', 'TEA', 'TEA',
3576
        'TERRAI7^', 'TERA', 'TERA',
3577
        'TE(LMNRST)-3^', 'TE', 'TE',
3578
        'TH<', 'T', 'T',
3579
        'TICHT-', 'TIK', 'TIK',
3580
        'TICH$', 'TIK', 'TIK',
3581
        'TIC$', 'TIZ', 'TIZ',
3582
        'TIGGESTELL-------', 'TIK ', 'TIK ',
3583
        'TIGSTELL-----', 'TIK ', 'TIK ',
3584
        'TOAS-^', 'TO', 'TU',
3585
        'TOILET-', 'TOLE', 'TULE',
3586
        'TOIN-', 'TOA', 'TUA',
3587
        'TRAECHTI-^', 'TRECHT', 'TREKT',
3588
        'TRAECHTIG--', ' TRECHT', ' TREKT',
3589
        'TRAINI-', 'TREN', 'TREN',
3590
        'TRÄCHTI-^', 'TRECHT', 'TREKT',
3591
        'TRÄCHTIG--', ' TRECHT', ' TREKT',
3592
        'TSCH', 'SH', 'Z',
3593
        'TSH', 'SH', 'Z',
3594
        'TST', 'ZT', 'ZT',
3595
        'T(Sß)', 'Z', 'Z',
3596
        'TT(SZ)--<', '', '',
3597
        'TT9', 'T', 'T',
3598
        'TV^$', 'TV', 'TV',
3599
        'TX(AEIOU)-3', 'SH', 'Z',
3600
        'TY9^', 'TÜ', None,
3601
        'TZ-', '', '',
3602
        'T\'S3$', 'Z', 'Z',
3603
        'T´S3$', 'Z', 'Z',
3604
        'UEBEL(GNRW)-^^', 'ÜBL ', 'IBL ',
3605
        'UEBER^^', 'ÜBA', 'IBA',
3606
        'UE2', 'Ü', 'I',
3607
        'UGL-', 'UK', None,
3608
        'UH(AOÖUÜY)-', 'UH', None,
3609
        'UIE$', 'Ü', 'I',
3610
        'UM^^', 'UM', 'UN',
3611
        'UNTERE--3', 'UNTE', 'UNTE',
3612
        'UNTER^^', 'UNTA', 'UNTA',
3613
        'UNVER^^', 'UNFA', 'UNFA',
3614
        'UN^^', 'UN', 'UN',
3615
        'UTI(AÄOÖUÜ)-', 'UZI', 'UZI',
3616
        'UVE-4', 'UW', None,
3617
        'UY2', 'UI', None,
3618
        'UZZ', 'AS', 'AZ',
3619
        'VACL-^', 'WAZ', 'FAZ',
3620
        'VAC$', 'WAZ', 'FAZ',
3621
        'VAN DEN ^', 'FANDN', 'FANTN',
3622
        'VANES-^', 'WANE', None,
3623
        'VATRO-', 'WATR', None,
3624
        'VA(DHJNT)--^', 'F', None,
3625
        'VEDD-^', 'FE', 'FE',
3626
        'VE(BEHIU)--^', 'F', None,
3627
        'VEL(BDLMNT)-^', 'FEL', None,
3628
        'VENTZ-^', 'FEN', None,
3629
        'VEN(NRSZ)-^', 'FEN', None,
3630
        'VER(AB)-^$', 'WER', None,
3631
        'VERBAL^$', 'WERBAL', None,
3632
        'VERBAL(EINS)-^', 'WERBAL', None,
3633
        'VERTEBR--', 'WERTE', None,
3634
        'VEREIN-----', 'F', None,
3635
        'VEREN(AEIOU)-^', 'WEREN', None,
3636
        'VERIFI', 'WERIFI', None,
3637
        'VERON(AEIOU)-^', 'WERON', None,
3638
        'VERSEN^', 'FERSN', 'FAZN',
3639
        'VERSIERT--^', 'WERSI', None,
3640
        'VERSIO--^', 'WERS', None,
3641
        'VERSUS', 'WERSUS', None,
3642
        'VERTI(GK)-', 'WERTI', None,
3643
        'VER^^', 'FER', 'FA',
3644
        'VERSPRECHE-------', ' FER', ' FA',
3645
        'VER$', 'WA', None,
3646
        'VER', 'FA', 'FA',
3647
        'VET(HT)-^', 'FET', 'FET',
3648
        'VETTE$', 'WET', 'FET',
3649
        'VE^', 'WE', None,
3650
        'VIC$', 'WIZ', 'FIZ',
3651
        'VIELSAGE----', 'FIL ', 'FIL ',
3652
        'VIEL', 'FIL', 'FIL',
3653
        'VIEW', 'WIU', 'FIU',
3654
        'VILL(AE)-', 'WIL', None,
3655
        'VIS(ACEIKUVWZ)-<^', 'WIS', None,
3656
        'VI(ELS)--^', 'F', None,
3657
        'VILLON--', 'WILI', 'FILI',
3658
        'VIZE^^', 'FIZE', 'FIZE',
3659
        'VLIE--^', 'FL', None,
3660
        'VL(AEIOU)--', 'W', None,
3661
        'VOKA-^', 'WOK', None,
3662
        'VOL(ATUVW)--^', 'WO', None,
3663
        'VOR^^', 'FOR', 'FUR',
3664
        'VR(AEIOU)--', 'W', None,
3665
        'VV9', 'W', None,
3666
        'VY9^', 'WÜ', 'FI',
3667
        'V(ÜY)-', 'W', None,
3668
        'V(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'W', None,
3669
        'V(AEIJLRU)-<', 'W', None,
3670
        'V.^', 'V.', None,
3671
        'V<', 'F', 'F',
3672
        'WEITERENTWI-----^', 'WEITA ', 'FEITA ',
3673
        'WEITREICH-----^', 'WEIT ', 'FEIT ',
3674
        'WEITVER^', 'WEIT FER', 'FEIT FA',
3675
        'WE(LMNRST)-3^', 'WE', 'FE',
3676
        'WER(DST)-', 'WER', None,
3677
        'WIC$', 'WIZ', 'FIZ',
3678
        'WIEDERU--', 'WIDE', 'FITE',
3679
        'WIEDER^$', 'WIDA', 'FITA',
3680
        'WIEDER^^', 'WIDA ', 'FITA ',
3681
        'WIEVIEL', 'WI FIL', 'FI FIL',
3682
        'WISUEL', 'WISUEL', None,
3683
        'WR-^', 'W', None,
3684
        'WY9^', 'WÜ', 'FI',
3685
        'W(BDFGJKLMNPQRSTZ)-', 'F', None,
3686
        'W$', 'F', None,
3687
        'W', None, 'F',
3688
        'X<^', 'Z', 'Z',
3689
        'XHAVEN$', 'XAFN', None,
3690
        'X(CSZ)', 'X', 'X',
3691
        'XTS(CH)--', 'XT', 'XT',
3692
        'XT(SZ)', 'Z', 'Z',
3693
        'YE(LMNRST)-3^', 'IE', 'IE',
3694
        'YE-3', 'I', 'I',
3695
        'YOR(GK)^$', 'IÖRK', 'IÖRK',
3696
        'Y(AOU)-<7', 'I', 'I',
3697
        'Y(BKLMNPRSTX)-1', 'Ü', None,
3698
        'YVES^$', 'IF', 'IF',
3699
        'YVONNE^$', 'IWON', 'IFUN',
3700
        'Y.^', 'Y.', None,
3701
        'Y', 'I', 'I',
3702
        'ZC(AOU)-', 'SK', 'ZK',
3703
        'ZE(LMNRST)-3^', 'ZE', 'ZE',
3704
        'ZIEJ$', 'ZI', 'ZI',
3705
        'ZIGERJA(HR)-3', 'ZIGA IA', 'ZIKA IA',
3706
        'ZL(AEIOU)-', 'SL', None,
3707
        'ZS(CHT)--', '', '',
3708
        'ZS', 'SH', 'Z',
3709
        'ZUERST', 'ZUERST', 'ZUERST',
3710
        'ZUGRUNDE^$', 'ZU GRUNDE', 'ZU KRUNTE',
3711
        'ZUGRUNDE', 'ZU GRUNDE ', 'ZU KRUNTE ',
3712
        'ZUGUNSTEN', 'ZU GUNSTN', 'ZU KUNZTN',
3713
        'ZUHAUSE-', 'ZU HAUS', 'ZU AUZ',
3714
        'ZULASTEN^$', 'ZU LASTN', 'ZU LAZTN',
3715
        'ZURUECK^^', 'ZURÜK', 'ZURIK',
3716
        'ZURZEIT', 'ZUR ZEIT', 'ZUR ZEIT',
3717
        'ZURÜCK^^', 'ZURÜK', 'ZURIK',
3718
        'ZUSTANDE', 'ZU STANDE', 'ZU ZTANTE',
3719
        'ZUTAGE', 'ZU TAGE', 'ZU TAKE',
3720
        'ZUVER^^', 'ZUFA', 'ZUFA',
3721
        'ZUVIEL', 'ZU FIL', 'ZU FIL',
3722
        'ZUWENIG', 'ZU WENIK', 'ZU FENIK',
3723
        'ZY9^', 'ZÜ', None,
3724
        'ZYK3$', 'ZIK', None,
3725
        'Z(VW)7^', 'SW', None,
3726
        None, None, None)
3727
3728
    # Lookup tables filled in by _initialize_phonet(). Counter is used so that
    # a key that was never assigned reads as 0 instead of raising KeyError.
    phonet_hash = Counter()
    alpha_pos = Counter()

    phonet_hash_1 = Counter()
    phonet_hash_2 = Counter()

    # Translation table (code point -> upper-case character) for str.translate.
    # The two strings are position-aligned: each lower-case character maps to
    # the character at the same index of the upper-case string.
    _phonet_lower = ('abcdefghijklmnopqrstuvwxyzàáâãåäæ'
                     'çðèéêëìíîïñòóôõöøœšßþùúûüýÿ')
    _phonet_upper = ('ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÅÄÆ'
                     'ÇÐÈÉÊËÌÍÎÏÑÒÓÔÕÖØŒŠßÞÙÚÛÜÝŸ')
    _phonet_upper_translation = {ord(low): up for low, up
                                 in zip(_phonet_lower, _phonet_upper)}
3739
3740
    def _trinfo(text, rule, err_text, lang):
        """Print one line of trace output describing a phonet rule.

        :param str text: label printed in front of the rule number
        :param int rule: index in the rule table of the rule's search
            pattern; its two replacement strings are read from ``rule + 1``
            and ``rule + 2``
        :param str err_text: message printed after the rule
        :param str lang: ``'none'`` selects the language-independent rule
            table, any other value selects the German one
        """
        if lang == 'none':
            _phonet_rules = _phonet_rules_no_lang
        else:
            _phonet_rules = _phonet_rules_german

        # A missing (None) entry is shown as '(NULL)'.
        from_rule = ('(NULL)' if _phonet_rules[rule] is None else
                     _phonet_rules[rule])
        to_rule1 = ('(NULL)' if (_phonet_rules[rule + 1] is None) else
                    _phonet_rules[rule + 1])
        to_rule2 = ('(NULL)' if (_phonet_rules[rule + 2] is None) else
                    _phonet_rules[rule + 2])

        # Each rule occupies three table slots, so the 1-based rule number is
        # rule // 3 + 1. Floor division is required because this module
        # enables true division, under which rule / 3 would print as a float
        # such as "2.0".
        print('"{} {}:  "{}"{}"{}" {}'.format(text, ((rule // 3) + 1),
                                              from_rule, to_rule1, to_rule2,
                                              err_text))
3756
3757
    def _initialize_phonet(lang):
        """Initialize the phonet lookup tables for the chosen rule set.

        Populates the enclosing scope's tables:

        - ``alpha_pos``: 1 for umlauts and other special letters, 2-27 for
          'A'-'Z'
        - ``phonet_hash``: for a character, the index of the first rule that
          starts with it and has at least one replacement defined
        - ``phonet_hash_1`` / ``phonet_hash_2``: keyed by
          ``(letter number, alpha_pos of the following character)``, the
          index of the first and of the last rule recorded for that pair;
          column 0 collects the rules not filed under a specific following
          character

        :param str lang: ``'none'`` selects the language-independent rule
            table, any other value selects the German one
        """
        if lang == 'none':
            _phonet_rules = _phonet_rules_no_lang
        else:
            _phonet_rules = _phonet_rules_german

        phonet_hash[''] = -1

        # German and international umlauts
        for j in {'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë',
                  'Ì', 'Í', 'Î', 'Ï', 'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', 'Ø',
                  'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß', 'Œ', 'Š', 'Ÿ'}:
            alpha_pos[j] = 1
            phonet_hash[j] = -1

        # "normal" letters ('A'-'Z')
        for i, j in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ'):
            alpha_pos[j] = i + 2
            phonet_hash[j] = -1

        # -1 marks "no rule recorded yet" for every (letter, class) pair
        for i in range(26):
            for j in range(28):
                phonet_hash_1[i, j] = -1
                phonet_hash_2[i, j] = -1

        # for each phonetic rule; the table is a flat sequence of triples
        # (search pattern, replacement 1, replacement 2), so patterns sit at
        # every third index
        for i in range(0, len(_phonet_rules), 3):
            rule = _phonet_rules[i]

            if not rule:
                continue

            first = rule[0]

            # calculate first hash value
            # Only None means "no replacement"; an empty string is a valid
            # replacement (it deletes the match), so test against None rather
            # than truthiness.
            if phonet_hash[first] < 0 and (
                    _phonet_rules[i+1] is not None or
                    _phonet_rules[i+2] is not None):
                phonet_hash[first] = i

            # calculate second hash values (plain letters 'A'-'Z' only)
            if alpha_pos[first] < 2:
                continue

            j = alpha_pos[first] - 2

            # Characters that may follow the first letter: the contents of a
            # leading "(...)" group, else the single next pattern character,
            # or a blank when the pattern is only one character long.
            followers = rule[1:]
            if not followers:
                followers = ' '
            elif followers[0] == '(':
                followers = followers[1:]
            else:
                followers = followers[0]

            for char in followers:
                if char == ')':
                    break

                k = alpha_pos[char]

                if k > 0:
                    # add hash value for this letter
                    if phonet_hash_1[j, k] < 0:
                        phonet_hash_1[j, k] = i
                        phonet_hash_2[j, k] = i

                    # Extend the recorded range only when this rule lies
                    # within 30 table slots of the last one recorded for the
                    # pair; otherwise file it under the catch-all column.
                    if phonet_hash_2[j, k] >= (i-30):
                        phonet_hash_2[j, k] = i
                    else:
                        k = -1

                if k <= 0:
                    # add hash value for all letters
                    if phonet_hash_1[j, 0] < 0:
                        phonet_hash_1[j, 0] = i

                    phonet_hash_2[j, 0] = i
3831
3832
    def _phonet(term, mode, lang, trace):
3833
        """Return the phonet coded form of a term.

        The upper-cased term (``src``) is scanned left to right. At each
        position the selected rule table is searched for an applicable rule
        and, if one is found, its replacement is written to ``dest``.
        Characters for which no rule applies are copied to ``dest`` unless
        they repeat the character written immediately before.

        :param str term: the string to encode
        :param int mode: offset added to a rule's index to select its
            replacement entry (presumably 1 or 2 -- confirm against caller)
        :param str lang: ``'none'`` selects ``_phonet_rules_no_lang``; any
            other value selects ``_phonet_rules_german``
        :param bool trace: if true, print diagnostics while encoding
        :returns: the phonet code (the empty string for an empty term)
        :rtype: str
        """
3834
        # pick the rule table for the requested language
        if lang == 'none':
3835
            _phonet_rules = _phonet_rules_no_lang
3836
        else:
3837
            _phonet_rules = _phonet_rules_german
3838
3839
        char0 = ''
3840
        dest = term
3841
3842
        if not term:
3843
            return ''
3844
3845
        term_length = len(term)
3846
3847
        # convert input string to upper-case
3848
        src = term.translate(_phonet_upper_translation)
3849
3850
        # check "src"
3851
        # i indexes src (input), j indexes dest (output written so far)
        i = 0
3852
        j = 0
3853
        # zeta is non-zero after a '<' rule rewrote src at the current
        # position; a further '<' rule is only applied while zeta == 0
        zeta = 0
3854
3855
        while i < len(src):
3856
            char = src[i]
3857
3858
            if trace:
3859
                print('\ncheck position {}:  src = "{}",  dest = "{}"'.format
3860
                      (j, src[i:], dest[:j]))
3861
3862
            pos = alpha_pos[char]
3863
3864
            # NOTE(review): positions >= 2 appear to mark characters that are
            # indexed in the two-level tables phonet_hash_1/phonet_hash_2;
            # everything else falls back to phonet_hash -- confirm against
            # _initialize_phonet
            if pos >= 2:
3865
                xpos = pos-2
3866
3867
                if i+1 == len(src):
3868
                    pos = alpha_pos['']
3869
                else:
3870
                    pos = alpha_pos[src[i+1]]
3871
3872
                # (start1, end1): rule range keyed on this and the next char;
                # (start2, end2): rule range keyed on this char alone
                start1 = phonet_hash_1[xpos, pos]
3873
                start2 = phonet_hash_1[xpos, 0]
3874
                end1 = phonet_hash_2[xpos, pos]
3875
                end2 = phonet_hash_2[xpos, 0]
3876
3877
                # preserve rule priorities
3878
                # (swap the ranges so the one starting earlier is tried first)
                if (start2 >= 0) and ((start1 < 0) or (start2 < start1)):
3879
                    pos = start1
3880
                    start1 = start2
3881
                    start2 = pos
3882
                    pos = end1
3883
                    end1 = end2
3884
                    end2 = pos
3885
3886
                # the two ranges overlap: merge them into a single range
                if (end1 >= start2) and (start2 >= 0):
3887
                    if end2 > end1:
3888
                        end1 = end2
3889
3890
                    start2 = -1
3891
                    end2 = -1
3892
            else:
3893
                pos = phonet_hash[char]
3894
                start1 = pos
3895
                end1 = 10000
3896
                start2 = -1
3897
                end2 = -1
3898
3899
            pos = start1
3900
            # zeta0 is set to 1 when a rule takes over output/advancing, which
            # suppresses the plain copy-and-advance step at the loop's end
            zeta0 = 0
3901
3902
            if pos >= 0:
3903
                # check rules for this char
3904
                while ((_phonet_rules[pos] is None) or
3905
                       (_phonet_rules[pos][0] == char)):
3906
                    if pos > end1:
3907
                        # first range exhausted: switch to the second, if any
                        if start2 > 0:
3908
                            pos = start2
3909
                            start1 = start2
3910
                            start2 = -1
3911
                            end1 = end2
3912
                            end2 = -1
3913
                            continue
3914
3915
                        break
3916
3917
                    if (((_phonet_rules[pos] is None) or
3918
                         (_phonet_rules[pos + mode] is None))):
3919
                        # no conversion rule available
3920
                        # (each rule occupies three table entries)
                        pos += 3
3921
                        continue
3922
3923
                    if trace:
3924
                        _trinfo('> rule no.', pos, 'is being checked', lang)
3925
3926
                    # check whole string
3927
                    matches = 1  # number of matching letters
3928
                    priority = 5  # default priority
3929
                    rule = _phonet_rules[pos]
3930
                    rule = rule[1:]
3931
3932
                    # consume literal pattern characters that match src
                    while (rule and
3933
                           (len(src) > (i + matches)) and
3934
                           (src[i + matches] == rule[0]) and
3935
                           not rule[0].isdigit() and
3936
                           (rule not in '(-<^$')):
3937
                        matches += 1
3938
                        rule = rule[1:]
3939
3940
                    if rule and (rule[0] == '('):
3941
                        # check an array of letters
3942
                        if (((len(src) > (i + matches)) and
3943
                             src[i + matches].isalpha() and
3944
                             (src[i + matches] in rule[1:]))):
3945
                            matches += 1
3946
3947
                            while rule and rule[0] != ')':
3948
                                rule = rule[1:]
3949
3950
                            # if rule[0] == ')':
3951
                            rule = rule[1:]
3952
3953
                    if rule:
3954
                        priority0 = ord(rule[0])
3955
                    else:
3956
                        priority0 = 0
3957
3958
                    matches0 = matches
3959
3960
                    # each '-' in the rule shortens the consumed match by one
                    while rule and rule[0] == '-' and matches > 1:
3961
                        matches -= 1
3962
                        rule = rule[1:]
3963
3964
                    if rule and rule[0] == '<':
3965
                        rule = rule[1:]
3966
3967
                    if rule and rule[0].isdigit():
3968
                        # read priority
3969
                        priority = int(rule[0])
3970
                        rule = rule[1:]
3971
3972
                    if rule and rule[0:2] == '^^':
3973
                        rule = rule[1:]
3974
3975
                    # the rule applies if its pattern is exhausted, or if a
                    # remaining '^' anchor (no letter before position i) or
                    # '$' anchor (no letter or '.' after the match) holds
                    if (not rule or
3976
                            ((rule[0] == '^') and
3977
                             ((i == 0) or not src[i-1].isalpha()) and
3978
                             ((rule[1:2] != '$') or
3979
                              (not (src[i+matches0:i+matches0+1].isalpha()) and
3980
                               (src[i+matches0:i+matches0+1] != '.')))) or
3981
                            ((rule[0] == '$') and (i > 0) and
3982
                             src[i-1].isalpha() and
3983
                             ((not src[i+matches0:i+matches0+1].isalpha()) and
3984
                              (src[i+matches0:i+matches0+1] != '.')))):
3985
                        # look for continuation, if:
3986
                        # matches > 1 and NO '-' in first string
3987
                        pos0 = -1
3988
3989
                        start3 = 0
3990
                        start4 = 0
3991
                        end3 = 0
3992
                        end4 = 0
3993
3994
                        if (((matches > 1) and
3995
                             src[i+matches:i+matches+1] and
3996
                             (priority0 != ord('-')))):
3997
                            # look up rules for the last matched character
                            # (same range selection as for the main rule)
                            char0 = src[i+matches-1]
3998
                            pos0 = alpha_pos[char0]
3999
4000
                            if pos0 >= 2 and src[i+matches]:
4001
                                xpos = pos0 - 2
4002
                                pos0 = alpha_pos[src[i+matches]]
4003
                                start3 = phonet_hash_1[xpos, pos0]
4004
                                start4 = phonet_hash_1[xpos, 0]
4005
                                end3 = phonet_hash_2[xpos, pos0]
4006
                                end4 = phonet_hash_2[xpos, 0]
4007
4008
                                # preserve rule priorities
4009
                                if (((start4 >= 0) and
4010
                                     ((start3 < 0) or (start4 < start3)))):
4011
                                    pos0 = start3
4012
                                    start3 = start4
4013
                                    start4 = pos0
4014
                                    pos0 = end3
4015
                                    end3 = end4
4016
                                    end4 = pos0
4017
4018
                                if (end3 >= start4) and (start4 >= 0):
4019
                                    if end4 > end3:
4020
                                        end3 = end4
4021
4022
                                    start4 = -1
4023
                                    end4 = -1
4024
                            else:
4025
                                pos0 = phonet_hash[char0]
4026
                                start3 = pos0
4027
                                end3 = 10000
4028
                                start4 = -1
4029
                                end4 = -1
4030
4031
                            pos0 = start3
4032
4033
                        # check continuation rules for src[i+matches]
4034
                        if pos0 >= 0:
4035
                            while ((_phonet_rules[pos0] is None) or
4036
                                   (_phonet_rules[pos0][0] == char0)):
4037
                                if pos0 > end3:
4038
                                    if start4 > 0:
4039
                                        pos0 = start4
4040
                                        start3 = start4
4041
                                        start4 = -1
4042
                                        end3 = end4
4043
                                        end4 = -1
4044
                                        continue
4045
4046
                                    # -1 can never satisfy the
                                    # "priority0 >= priority" test after the
                                    # loop, so no continuation is reported
                                    priority0 = -1
4047
4048
                                    # important
4049
                                    break
4050
4051
                                if (((_phonet_rules[pos0] is None) or
4052
                                     (_phonet_rules[pos0 + mode] is None))):
4053
                                    # no conversion rule available
4054
                                    pos0 += 3
4055
                                    continue
4056
4057
                                if trace:
4058
                                    _trinfo('> > continuation rule no.', pos0,
4059
                                            'is being checked', lang)
4060
4061
                                # check whole string
4062
                                matches0 = matches
4063
                                priority0 = 5
4064
                                rule = _phonet_rules[pos0]
4065
                                rule = rule[1:]
4066
4067
                                while (rule and
4068
                                       (src[i+matches0:i+matches0+1] ==
4069
                                        rule[0]) and
4070
                                       (not rule[0].isdigit() or
4071
                                        (rule in '(-<^$'))):
4072
                                    matches0 += 1
4073
                                    rule = rule[1:]
4074
4075
                                if rule and rule[0] == '(':
4076
                                    # check an array of letters
4077
                                    if ((src[i+matches0:i+matches0+1]
4078
                                         .isalpha() and
4079
                                         (src[i+matches0] in rule[1:]))):
4080
                                        matches0 += 1
4081
4082
                                        while rule and rule[0] != ')':
4083
                                            rule = rule[1:]
4084
4085
                                        # if rule[0] == ')':
4086
                                        rule = rule[1:]
4087
4088
                                while rule and rule[0] == '-':
4089
                                    # "matches0" is NOT decremented
4090
                                    # because of  "if (matches0 == matches)"
4091
                                    rule = rule[1:]
4092
4093
                                if rule and rule[0] == '<':
4094
                                    rule = rule[1:]
4095
4096
                                if rule and rule[0].isdigit():
4097
                                    priority0 = int(rule[0])
4098
                                    rule = rule[1:]
4099
4100
                                if (not rule or
4101
                                        # rule == '^' is not possible here
4102
                                        ((rule[0] == '$') and not
4103
                                         src[i+matches0:i+matches0+1]
4104
                                         .isalpha() and
4105
                                         (src[i+matches0:i+matches0+1]
4106
                                          != '.'))):
4107
                                    if matches0 == matches:
4108
                                        # this is only a partial string
4109
                                        if trace:
4110
                                            _trinfo('> > continuation ' +
4111
                                                    'rule no.',
4112
                                                    pos0,
4113
                                                    'not used (too short)',
4114
                                                    lang)
4115
4116
                                        pos0 += 3
4117
                                        continue
4118
4119
                                    if priority0 < priority:
4120
                                        # priority is too low
4121
                                        if trace:
4122
                                            _trinfo('> > continuation ' +
4123
                                                    'rule no.',
4124
                                                    pos0,
4125
                                                    'not used (priority)',
4126
                                                    lang)
4127
4128
                                        pos0 += 3
4129
                                        continue
4130
4131
                                    # continuation rule found
4132
                                    break
4133
4134
                                if trace:
4135
                                    _trinfo('> > continuation rule no.', pos0,
4136
                                            'not used', lang)
4137
4138
                                pos0 += 3
4139
4140
                            # end of "while"
4141
                            # a usable continuation rule was found: skip the
                            # current rule and try the next one instead
                            if ((priority0 >= priority) and
4142
                                    ((_phonet_rules[pos0] is not None) and
4143
                                     (_phonet_rules[pos0][0] == char0))):
4144
4145
                                if trace:
4146
                                    _trinfo('> rule no.', pos, '', lang)
4147
                                    _trinfo('> not used because of ' +
4148
                                            'continuation', pos0, '', lang)
4149
4150
                                pos += 3
4151
                                continue
4152
4153
                        # replace string
4154
                        if trace:
4155
                            _trinfo('Rule no.', pos, 'is applied', lang)
4156
4157
                        # from here on priority0 is reused as a flag:
                        # 1 if the rule's pattern contains '<', else 0
                        if ((_phonet_rules[pos] and
4158
                             ('<' in _phonet_rules[pos][1:]))):
4159
                            priority0 = 1
4160
                        else:
4161
                            priority0 = 0
4162
4163
                        # the replacement text for the selected mode
                        rule = _phonet_rules[pos + mode]
4164
4165
                        if (priority0 == 1) and (zeta == 0):
4166
                            # rule with '<' is applied
4167
                            # (the replacement is written back into src and
                            # scanning resumes at the same position i)
                            if ((j > 0) and rule and
4168
                                    ((dest[j-1] == char) or
4169
                                     (dest[j-1] == rule[0]))):
4170
                                j -= 1
4171
4172
                            zeta0 = 1
4173
                            zeta += 1
4174
                            matches0 = 0
4175
4176
                            # NOTE(review): src[i+matches0] raises IndexError
                            # (rather than testing false) if the index reaches
                            # len(src) -- confirm the rules cannot cause this
                            while rule and src[i+matches0]:
4177
                                src = (src[0:i+matches0] + rule[0] +
4178
                                       src[i+matches0+1:])
4179
                                matches0 += 1
4180
                                rule = rule[1:]
4181
4182
                            # replacement shorter than the match: drop the
                            # leftover matched characters from src
                            if matches0 < matches:
4183
                                src = (src[0:i+matches0] +
4184
                                       src[i+matches:])
4185
4186
                            char = src[i]
4187
                        else:
4188
                            # consume the match and write the replacement to
                            # dest, skipping a character that would repeat
                            # the one just written
                            i = i + matches - 1
4189
                            zeta = 0
4190
4191
                            while len(rule) > 1:
4192
                                if (j == 0) or (dest[j - 1] != rule[0]):
4193
                                    dest = (dest[0:j] + rule[0] +
4194
                                            dest[min(len(dest), j+1):])
4195
                                    j += 1
4196
4197
                                rule = rule[1:]
4198
4199
                            # new "current char"
4200
                            # (the replacement's last character is written by
                            # the copy step at the end of the outer loop)
                            if not rule:
4201
                                rule = ''
4202
                                char = ''
4203
                            else:
4204
                                char = rule[0]
4205
4206
                            # '^^' rule: write char immediately, then restart
                            # scanning on the remainder of src
                            if ((_phonet_rules[pos] and
4207
                                 '^^' in _phonet_rules[pos][1:])):
4208
                                if char:  # pragma: no branch
4209
                                    dest = (dest[0:j] + char +
4210
                                            dest[min(len(dest), j + 1):])
4211
                                    j += 1
4212
4213
                                src = src[i + 1:]
4214
                                i = 0
4215
                                zeta0 = 1
4216
4217
                        break
4218
4219
                    pos += 3
4220
4221
                    if pos > end1 and start2 > 0:
4222
                        pos = start2
4223
                        start1 = start2
4224
                        end1 = end2
4225
                        start2 = -1
4226
                        end2 = -1
4227
4228
            # no rule took over output: copy the current char and advance
            if zeta0 == 0:
4229
                if char and ((j == 0) or (dest[j-1] != char)):
4230
                    # delete multiple letters only
4231
                    dest = dest[0:j] + char + dest[min(j+1, term_length):]
4232
                    j += 1
4233
4234
                i += 1
4235
                zeta = 0
4236
4237
        # discard whatever remains of the original term beyond the output
        dest = dest[0:j]
4238
4239
        return dest
4240
4241
    _initialize_phonet(lang)
4242
4243
    word = unicodedata.normalize('NFKC', text_type(word))
4244
    return _phonet(word, mode, lang, trace)
4245
4246
4247
def spfc(word):
    """Return the Standardized Phonetic Frequency Code (SPFC) of a word.

    Standardized Phonetic Frequency Code is roughly Soundex-like.
    This implementation is based on page 19-21 of
    https://archive.org/stream/accessingindivid00moor#page/19/mode/1up

    :param word: the name to transform: either one string in which a period
        or a space divides the first name from the last name, or a
        two-element iterable of (first name, last name)
    :type word: str or tuple or list
    :returns: the SPFC value, or the empty string if word is empty
    :rtype: str
    :raises AttributeError: if word cannot be split into exactly two names

    >>> spfc('Christopher Smith')
    '01160'
    >>> spfc('Christopher Schmidt')
    '01160'
    >>> spfc('Niall Smith')
    '01660'
    >>> spfc('Niall Schmidt')
    '01660'

    >>> spfc('L.Smith')
    '01960'
    >>> spfc('R.Miller')
    '65490'

    >>> spfc(('L', 'Smith'))
    '01960'
    >>> spfc(('R', 'Miller'))
    '65490'
    """
    # Translation tables PF1, PF2 and PF3 (code point -> digit).
    _pf1 = dict(zip((ord(char) for char in 'SZCKQVFPUWABLORDHIEMNXGJT'),
                    '0011112222334445556666777'))
    _pf2 = dict(zip((ord(char) for char in 'SZCKQFPXABORDHIMNGJTUVWEL'),
                    '0011122233445556677788899'))
    _pf3 = dict(zip((ord(char) for char in 'BCKQVDTFLPGJXMNRSZAEHIOUWY'),
                    '00000112223334456677777777'))

    _substitutions = (('DK', 'K'), ('DT', 'T'), ('SC', 'S'), ('KN', 'N'),
                      ('MN', 'N'))

    # Step 5 endings, in the order they must be tried (e.g. 'STR' before
    # 'TR'). All endings within one group have the same length.
    _suffix_codes = ((('STN', 'PRS'), '8'),
                     (('SN',), '8'),
                     (('STR',), '9'),
                     (('SR', 'TN', 'TD'), '9'),
                     (('DRS',), '7'),
                     (('TR', 'MN'), '7'))

    _uppercase = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    _dropped = frozenset('AEHIOUWY')

    def _raise_word_ex():
        """Raise an AttributeError."""
        raise AttributeError('word attribute must be a string with a space ' +
                             'or period dividing the first and last names ' +
                             'or a tuple/list consisting of the first and ' +
                             'last names')

    if not word:
        return ''

    if isinstance(word, (str, text_type)):
        # A period takes precedence over a space as the divider.
        names = word.split('.', 1)
        if len(names) != 2:
            names = word.split(' ', 1)
            if len(names) != 2:
                _raise_word_ex()
    elif hasattr(word, '__iter__'):
        # Materialize first so that unsized iterables are accepted too.
        names = list(word)
        if len(names) != 2:
            _raise_word_ex()
    else:
        _raise_word_ex()

    names = [unicodedata.normalize('NFKD', text_type(name.strip()
                                                     .replace('ß', 'SS')
                                                     .upper()))
             for name in names]

    def steps_one_to_three(name):
        """Perform the first three steps of SPFC."""
        # filter out non A-Z
        name = ''.join(char for char in name if char in _uppercase)

        # 1. In the field, convert DK to K, DT to T, SC to S, KN to N,
        # and MN to N
        for old, new in _substitutions:
            name = name.replace(old, new)

        # 2. In the name field, replace multiple letters with a single letter
        name = _delete_consecutive_repeats(name)

        # 3. Remove vowels, W, H, and Y, but keep the first letter in the name
        # field.
        if name:
            name = name[0] + ''.join(char for char in name[1:]
                                     if char not in _dropped)
        return name

    first, last = (steps_one_to_three(name) for name in names)
    code = ''

    # 4. The first digit of the code is obtained using PF1 and the first letter
    # of the name field. Remove this letter after coding.
    if last:
        code += last[0].translate(_pf1)
        last = last[1:]

    # 5. Using the last letters of the name, use Table PF3 to obtain the
    # second digit of the code. Use as many letters as possible and remove
    # after coding.
    if last:
        for suffixes, digit in _suffix_codes:
            if last.endswith(suffixes):
                code += digit
                last = last[:-len(suffixes[0])]
                break
        else:
            code += last[-1].translate(_pf3)
            last = last[:-1]

    # 6. The third digit is found using Table PF2 and the first character of
    # the first name. Remove after coding.
    if first:
        code += first[0].translate(_pf2)

    # 7. The fourth digit is found using Table PF2 and the first character of
    # the name field. If no letters remain use zero. After coding remove the
    # letter.
    # 8. The fifth digit is found in the same manner as the fourth using the
    # remaining characters of the name field if any.
    for _ in range(2):
        if last:
            code += last[0].translate(_pf2)
            last = last[1:]
        else:
            code += '0'

    return code
4393
4394
4395
def statistics_canada(word, maxlength=4):
    """Return the Statistics Canada code for a word.

    The original description of this algorithm could not be located, and
    may only have been specified in an unpublished TR. The coding does not
    appear to be in use by Statistics Canada any longer. In its place, this is
    an implementation of the "Census modified Statistics Canada name coding
    procedure".

    The modified version of this algorithm is described in Appendix B of
    Lynch, Billy T. and William L. Arends. `Selection of a Surname Coding
    Procedure for the SRS Record Linkage System.` Statistical Reporting
    Service, U.S. Department of Agriculture, Washington, D.C. February 1977.
    https://naldc.nal.usda.gov/download/27833/PDF

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 4) of the code to return
    :returns: the Statistics Canada name code value (the empty string if word
        contains no letters A-Z after normalization)
    :rtype: str

    >>> statistics_canada('Christopher')
    'CHRS'
    >>> statistics_canada('Niall')
    'NL'
    >>> statistics_canada('Smith')
    'SMTH'
    >>> statistics_canada('Schmidt')
    'SCHM'
    """
    _uppercase = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    _vowels = frozenset('AEIOUY')

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in _uppercase)
    if not word:
        return ''

    # Keep the first letter as is; drop vowels (and Y) from the remainder.
    code = word[0] + ''.join(c for c in word[1:] if c not in _vowels)
    code = _delete_consecutive_repeats(code)
    # No space removal is needed here: only A-Z survived the filter above.

    return code[:maxlength]
4443
4444
4445
def lein(word, maxlength=4, zero_pad=True):
    """Return the Lein code for a word.

    Lein name coding follows the description in
    https://naldc.nal.usda.gov/download/27833/PDF

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 4) of the code to return
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Lein code
    :rtype: str

    >>> lein('Christopher')
    'C351'
    >>> lein('Niall')
    'N300'
    >>> lein('Smith')
    'S210'
    >>> lein('Schmidt')
    'S521'
    """
    digit_for = dict(zip('BCDFGJKLMNPQRSTVXZ', '451455532245351455'))
    uppercase = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    discarded = frozenset(' AEHIOUWY')

    # Upper-case, decompose, and keep only the letters A-Z.
    name = unicodedata.normalize('NFKD', text_type(word.upper()))
    name = name.replace('ß', 'SS')
    name = ''.join(ch for ch in name if ch in uppercase)

    if not name:
        return ''

    # Rule 1: the first letter is retained unchanged.
    initial, remainder = name[0], name[1:]
    # Rule 2: drop vowels, H, W and Y from the rest of the name.
    remainder = ''.join(ch for ch in remainder if ch not in discarded)
    # Rule 3: collapse runs of the same letter.
    remainder = _delete_consecutive_repeats(remainder)
    # Rule 4: translate the surviving consonants to digits ...
    code = initial + ''.join(digit_for.get(ch, ch) for ch in remainder)

    # ... and optionally pad with zeros before truncating.
    if zero_pad:
        code += '0' * maxlength

    return code[:maxlength]
4493
4494
4495
def roger_root(word, maxlength=5, zero_pad=True):
    """Return the Roger Root code for a word.

    Roger Root name coding follows the description in
    https://naldc.nal.usda.gov/download/27833/PDF

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 5) of the code to return
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Roger Root code
    :rtype: str

    >>> roger_root('Christopher')
    '06401'
    >>> roger_root('Niall')
    '02500'
    >>> roger_root('Smith')
    '00310'
    >>> roger_root('Schmidt')
    '06310'
    """
    uppercase = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    # Upper-case, decompose, and keep only the letters A-Z.
    name = unicodedata.normalize('NFKD', text_type(word.upper()))
    name = name.replace('ß', 'SS')
    name = ''.join(ch for ch in name if ch in uppercase)

    if not name:
        return ''

    # Both tables are keyed by pattern length, then by pattern.
    # '*' is used to prevent combining by _delete_consecutive_repeats()
    initial_codes = {
        4: {'TSCH': '06'},
        3: {'TSH': '06', 'SCH': '06'},
        2: {'CE': '0*0', 'CH': '06', 'CI': '0*0', 'CY': '0*0',
            'DG': '07', 'GF': '08', 'GM': '03', 'GN': '02',
            'KN': '02', 'PF': '08', 'PH': '08', 'PN': '02',
            'SH': '06', 'TS': '0*0', 'WR': '04'},
        1: {'A': '1', 'B': '09', 'C': '07', 'D': '01', 'E': '1',
            'F': '08', 'G': '07', 'H': '2', 'I': '1', 'J': '3',
            'K': '07', 'L': '05', 'M': '03', 'N': '02', 'O': '1',
            'P': '09', 'Q': '07', 'R': '04', 'S': '0*0',
            'T': '01', 'U': '1', 'V': '08', 'W': '4', 'X': '07',
            'Y': '5', 'Z': '0*0'},
    }

    medial_codes = {
        4: {'TSCH': '6'},
        3: {'TSH': '6', 'SCH': '6'},
        2: {'CE': '0', 'CH': '6', 'CI': '0', 'CY': '0', 'DG': '7',
            'PH': '8', 'SH': '6', 'TS': '0'},
        1: {'B': '9', 'C': '7', 'D': '1', 'F': '8', 'G': '7',
            'J': '6', 'K': '7', 'L': '5', 'M': '3', 'N': '2',
            'P': '9', 'Q': '7', 'R': '4', 'S': '0', 'T': '1',
            'V': '8', 'X': '7', 'Z': '0',
            'A': '*', 'E': '*', 'H': '*', 'I': '*', 'O': '*',
            'U': '*', 'W': '*', 'Y': '*'},
    }

    def _longest_match(table, start):
        """Return (digits, length) for the longest pattern found at start.

        If nothing is recognized, return no digits and a length of one so
        that the caller advances past the character.
        """
        for size in (4, 3, 2, 1):
            fragment = name[start:start+size]
            if fragment in table[size]:
                return table[size][fragment], size
        return '', 1

    # The leading pattern is coded from the initial table ...
    code, pos = _longest_match(initial_codes, 0)

    # ... and everything after it from the medial table.
    while pos < len(name):
        digits, consumed = _longest_match(medial_codes, pos)
        code += digits
        pos += consumed

    code = _delete_consecutive_repeats(code)
    code = code.replace('*', '')

    if zero_pad:
        code += '0' * maxlength

    return code[:maxlength]
4582
4583
4584
def onca(word, maxlength=4, zero_pad=True):
    """Return the Oxford Name Compression Algorithm (ONCA) code for a word.

    This is the Oxford Name Compression Algorithm, based on:
    Gill, Leicester E. 1997. "OX-LINK: The Oxford Medical Record Linkage
    System." In ``Record Linkage Techniques -- 1997``. Arlington, VA. March
    20--21, 1997.
    https://nces.ed.gov/FCSM/pdf/RLT97.pdf

    I can find no complete description of the "anglicised version of the NYSIIS
    method" identified as the first step in this algorithm, so this is likely
    not a correct implementation, in that it employs the standard NYSIIS
    algorithm.

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 4) of the code to return
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the ONCA code
    :rtype: str

    >>> onca('Christopher')
    'C623'
    >>> onca('Niall')
    'N400'
    >>> onca('Smith')
    'S530'
    >>> onca('Schmidt')
    'S530'
    """
    # In the most extreme case, 3 characters of NYSIIS input can be compressed
    # to one character of output, so give it triple the maxlength.
    nysiis_code = nysiis(word, maxlength=maxlength*3)
    return soundex(nysiis_code, maxlength, zero_pad=zero_pad)
4618
4619
4620
def eudex(word, maxlength=8):
    """Return the eudex phonetic hash of a word.

    This implementation of eudex phonetic hashing is based on the specification
    (not the reference implementation) at:
    Ticki. 2017. "Eudex: A blazingly fast phonetic reduction/hashing
    algorithm." https://docs.rs/crate/eudex

    Further details can be found at
    http://ticki.github.io/blog/the-eudex-algorithm/

    The word is lower-cased and every character without an entry in the
    tables below is dropped. The first remaining character is coded with the
    initial-phone table, all others with the trailing-phone table. Each code
    is one byte; the bytes are concatenated (first character in the most
    significant position) into a single integer.

    If no codable character remains (e.g. an empty string), the word is
    treated as the single character '÷', whose code is 0b11111111 in both
    tables, instead of raising an IndexError.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned, in bytes
        (defaults to 8)
    :returns: the eudex hash
    :rtype: int

    >>> eudex('Niall')
    648518346341351840
    >>> eudex('Smith')
    720575940412906756
    """
    _trailing_phones = {
        'a': 0,  # a
        'b': 0b01001000,  # b
        'c': 0b00001100,  # c
        'd': 0b00011000,  # d
        'e': 0,  # e
        'f': 0b01000100,  # f
        'g': 0b00001000,  # g
        'h': 0b00000100,  # h
        'i': 1,  # i
        'j': 0b00000101,  # j
        'k': 0b00001001,  # k
        'l': 0b10100000,  # l
        'm': 0b00000010,  # m
        'n': 0b00010010,  # n
        'o': 0,  # o
        'p': 0b01001001,  # p
        'q': 0b10101000,  # q
        'r': 0b10100001,  # r
        's': 0b00010100,  # s
        't': 0b00011101,  # t
        'u': 1,  # u
        'v': 0b01000101,  # v
        'w': 0b00000000,  # w
        'x': 0b10000100,  # x
        'y': 1,  # y
        'z': 0b10010100,  # z

        'ß': 0b00010101,  # ß
        'à': 0,  # à
        'á': 0,  # á
        'â': 0,  # â
        'ã': 0,  # ã
        'ä': 0,  # ä[æ]
        'å': 1,  # å[oː]
        'æ': 0,  # æ[æ]
        'ç': 0b10010101,  # ç[t͡ʃ]
        'è': 1,  # è
        'é': 1,  # é
        'ê': 1,  # ê
        'ë': 1,  # ë
        'ì': 1,  # ì
        'í': 1,  # í
        'î': 1,  # î
        'ï': 1,  # ï
        'ð': 0b00010101,  # ð[ð̠](represented as a non-plosive T)
        'ñ': 0b00010111,  # ñ[nj](represented as a combination of n and j)
        'ò': 0,  # ò
        'ó': 0,  # ó
        'ô': 0,  # ô
        'õ': 0,  # õ
        'ö': 1,  # ö[ø]
        '÷': 0b11111111,  # ÷
        'ø': 1,  # ø[ø]
        'ù': 1,  # ù
        'ú': 1,  # ú
        'û': 1,  # û
        'ü': 1,  # ü
        'ý': 1,  # ý
        'þ': 0b00010101,  # þ[ð̠](represented as a non-plosive T)
        'ÿ': 1,  # ÿ
    }

    _initial_phones = {
        'a': 0b10000100,  # a*
        'b': 0b00100100,  # b
        'c': 0b00000110,  # c
        'd': 0b00001100,  # d
        'e': 0b11011000,  # e*
        'f': 0b00100010,  # f
        'g': 0b00000100,  # g
        'h': 0b00000010,  # h
        'i': 0b11111000,  # i*
        'j': 0b00000011,  # j
        'k': 0b00000101,  # k
        'l': 0b01010000,  # l
        'm': 0b00000001,  # m
        'n': 0b00001001,  # n
        'o': 0b10010100,  # o*
        'p': 0b00100101,  # p
        'q': 0b01010100,  # q
        'r': 0b01010001,  # r
        's': 0b00001010,  # s
        't': 0b00001110,  # t
        'u': 0b11100000,  # u*
        'v': 0b00100011,  # v
        'w': 0b00000000,  # w
        'x': 0b01000010,  # x
        'y': 0b11100100,  # y*
        'z': 0b01001010,  # z

        'ß': 0b00001011,  # ß
        'à': 0b10000101,  # à
        'á': 0b10000101,  # á
        'â': 0b10000000,  # â
        'ã': 0b10000110,  # ã
        'ä': 0b10100110,  # ä [æ]
        'å': 0b11000010,  # å [oː]
        'æ': 0b10100111,  # æ [æ]
        'ç': 0b01010100,  # ç [t͡ʃ]
        'è': 0b11011001,  # è
        'é': 0b11011001,  # é
        'ê': 0b11011001,  # ê
        'ë': 0b11000110,  # ë [ə] or [œ]
        'ì': 0b11111001,  # ì
        'í': 0b11111001,  # í
        'î': 0b11111001,  # î
        'ï': 0b11111001,  # ï
        'ð': 0b00001011,  # ð [ð̠] (represented as a non-plosive T)
        'ñ': 0b00001011,  # ñ [nj] (represented as a combination of n and j)
        'ò': 0b10010101,  # ò
        'ó': 0b10010101,  # ó
        'ô': 0b10010101,  # ô
        'õ': 0b10010101,  # õ
        'ö': 0b11011100,  # ö [œ] or [ø]
        '÷': 0b11111111,  # ÷
        'ø': 0b11011101,  # ø [œ] or [ø]
        'ù': 0b11100001,  # ù
        'ú': 0b11100001,  # ú
        'û': 0b11100001,  # û
        'ü': 0b11100101,  # ü
        'ý': 0b11100101,  # ý
        'þ': 0b00001011,  # þ [ð̠] (represented as a non-plosive T)
        'ÿ': 0b11100101,  # ÿ
    }

    # Lowercase input & filter unknown characters
    word = ''.join(char for char in word.lower() if char in _initial_phones)

    # Nothing codable is left (empty or all-unknown input): substitute the
    # all-ones '÷' entry so that word[0] below cannot fail.
    if not word:
        word = '÷'

    # Perform initial eudex coding of each character
    values = [_initial_phones[word[0]]]
    values.extend(_trailing_phones[char] for char in word[1:])

    # Drop a code when it equals its predecessor after both are shifted right
    # by one bit, i.e. when the two differ at most in their lowest bit.
    condensed_values = values[:1]
    for previous, current in zip(values, values[1:]):
        if current >> 1 != previous >> 1:
            condensed_values.append(current)

    # Zero padding goes directly after the first byte; codes beyond
    # maxlength bytes are cut off.
    padding = [0] * max(0, maxlength - len(condensed_values))
    values = condensed_values[:1] + padding + condensed_values[1:maxlength]

    # Combine individual character values into eudex hash
    hash_value = 0
    for val in values:
        hash_value = (hash_value << 8) | val

    return hash_value
4786
4787
4788
def haase_phonetik(word, primary_only=False):
    """Return the Haase Phonetik (numeric output) code for a word.

    Based on the algorithm described at
    https://github.com/elastic/elasticsearch/blob/master/plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/phonetic/HaasePhonetik.java

    Based on the original
    Haase, Martin and Kai Heitmann. 2000. Die Erweiterte Kölner Phonetik.

    While the output code is numeric, it is still a str.

    Unless ``primary_only`` is set, a number of letter groups are expanded to
    alternative spellings and every combination of alternatives is encoded,
    so several codes may be returned for one word.

    :param str word: the word to transform
    :param bool primary_only: if True, only the word itself is encoded and no
        spelling variants are generated
    :returns: the Haase Phonetik value(s) as numeric strings, one per variant;
        an empty str if the word contains no letters A-Z after normalization
    :rtype: tuple
    """
    def _after(word, i, letters):
        """Return True if word[i] follows one of the supplied letters."""
        return i > 0 and word[i-1] in letters

    def _before(word, i, letters):
        """Return True if word[i] precedes one of the supplied letters."""
        return i+1 < len(word) and word[i+1] in letters

    _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}
    _letters = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                'Y', 'Z'}

    # Compose first, so that an umlaut supplied as base letter + combining
    # diaeresis is also a single character for the replacements below.
    word = unicodedata.normalize('NFC', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    # The umlauts must be expanded BEFORE the NFKD decomposition: afterwards
    # 'Ä' is 'A' + a combining mark and these replacements could never match.
    word = word.replace('Ä', 'AE')
    word = word.replace('Ö', 'OE')
    word = word.replace('Ü', 'UE')

    # Decompose the remaining accented letters; the filter then keeps only
    # the base letters and drops combining marks and everything else.
    word = unicodedata.normalize('NFKD', word)
    word = ''.join(c for c in word if c in _letters)

    # Nothing to convert, return base case
    if not word:
        return ''

    if primary_only:
        variants = [word]
    else:
        # Each entry is a tuple of alternative spellings for one segment of
        # the word; single letters without alternatives are 1-tuples.
        segments = []
        pos = 0
        if word[:2] == 'CH':
            segments.append(('CH', 'SCH'))
            pos += 2
        len_3_vars = {'OWN': 'AUN', 'WSK': 'RSK', 'SCH': 'CH', 'GLI': 'LI',
                      'AUX': 'O', 'EUX': 'O'}
        while pos < len(word):
            if word[pos:pos+4] == 'ILLE':
                segments.append(('ILLE', 'I'))
                pos += 4
            elif word[pos:pos+3] in len_3_vars:
                segments.append((word[pos:pos+3],
                                 len_3_vars[word[pos:pos+3]]))
                pos += 3
            elif word[pos:pos+2] == 'RB':
                segments.append(('RB', 'RW'))
                pos += 2
            elif len(word[pos:]) == 3 and word[pos:] == 'EAU':
                # Only at the end of the word
                segments.append(('EAU', 'O'))
                pos += 3
            elif len(word[pos:]) == 1 and word[pos:] in {'A', 'O'}:
                # Only as the last letter of the word
                if word[pos:] == 'O':
                    segments.append(('O', 'OW'))
                else:
                    segments.append(('A', 'AR'))
                pos += 1
            else:
                segments.append((word[pos],))
                pos += 1

        # Cartesian product of the alternatives gives all spelling variants
        variants = [''.join(letters) for letters in product(*segments)]

    def _haase_code(word):
        """Return the numeric code of a single (variant) spelling."""
        digits = []
        for i, letter in enumerate(word):
            if letter in _vowels:
                digits.append('9')
            elif letter == 'B':
                digits.append('1')
            elif letter == 'P':
                if _before(word, i, {'H'}):
                    digits.append('3')
                else:
                    digits.append('1')
            elif letter in {'D', 'T'}:
                if _before(word, i, {'C', 'S', 'Z'}):
                    digits.append('8')
                else:
                    digits.append('2')
            elif letter in {'F', 'V', 'W'}:
                digits.append('3')
            elif letter in {'G', 'K', 'Q'}:
                digits.append('4')
            elif letter == 'C':
                if _after(word, i, {'S', 'Z'}):
                    digits.append('8')
                elif i == 0:
                    # Word-initial C additionally hardens before L and R
                    if _before(word, i, {'A', 'H', 'K', 'L', 'O', 'Q', 'R',
                                         'U', 'X'}):
                        digits.append('4')
                    else:
                        digits.append('8')
                elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}):
                    digits.append('4')
                else:
                    digits.append('8')
            elif letter == 'X':
                if _after(word, i, {'C', 'K', 'Q'}):
                    digits.append('8')
                else:
                    digits.append('48')
            elif letter == 'L':
                digits.append('5')
            elif letter in {'M', 'N'}:
                digits.append('6')
            elif letter == 'R':
                digits.append('7')
            elif letter in {'S', 'Z'}:
                digits.append('8')
            # 'H' has no code of its own and is skipped

        # Vowel codes ('9') are kept in every position; only runs of
        # identical digits are collapsed.
        return _delete_consecutive_repeats(''.join(digits))

    return tuple(_haase_code(variant) for variant in variants)
4923
4924
4925
def reth_schek_phonetik(word):
    """Return Reth-Schek Phonetik code for a word.

    This algorithm is proposed in:
    von Reth, Hans-Peter and Schek, Hans-Jörg. 1977. "Eine Zugriffsmethode für
    die phonetische Ähnlichkeitssuche." Heidelberg Scientific Center technical
    reports 77.03.002. IBM Deutschland GmbH.

    Since I couldn't secure a copy of that document (maybe I'll look for it
    next time I'm in Germany), this implementation is based on what I could
    glean from the implementations published by German Record Linkage
    Center (www.record-linkage.de):
    - Privacy-preserving Record Linkage (PPRL) (in R)
    - Merge ToolBox (in Java)

    Rules that are unclear:
    - Should 'C' become 'G' or 'Z'? (PPRL has both, 'Z' rule blocked)
    - Should 'CC' become 'G'? (PPRL has blocked 'CK' that may be typo)
    - Should 'TUI' -> 'ZUI' rule exist? (PPRL has rule, but I can't
        think of a German word with '-tui-' in it.)
    - Should we really change 'SCH' -> 'CH' and then 'CH' -> 'SCH'?

    The word is upper-cased, umlauts and eszett are expanded, the
    replacement table is applied from left to right (longest match first),
    every remaining 'CH' becomes 'SCH', and finally a trailing 'ER'/'EL' is
    shortened to 'R'/'L' or, failing that, a trailing 'H' is removed.

    :param str word: the word to transform
    :returns: the Reth-Schek Phonetik code; an empty str for an empty word
    :rtype: str

    >>> reth_schek_phonetik('Smith')
    'SMID'
    >>> reth_schek_phonetik('Schmidt')
    'SCHMID'
    """
    replacements = {3: {'AEH': 'E', 'IEH': 'I', 'OEH': 'OE', 'UEH': 'UE',
                        'SCH': 'CH', 'ZIO': 'TIO', 'TIU': 'TIO', 'ZIU': 'TIO',
                        'CHS': 'X', 'CKS': 'X', 'AEU': 'OI'},
                    2: {'LL': 'L', 'AA': 'A', 'AH': 'A', 'BB': 'B', 'PP': 'B',
                        'BP': 'B', 'PB': 'B', 'DD': 'D', 'DT': 'D', 'TT': 'D',
                        'TH': 'D', 'EE': 'E', 'EH': 'E', 'AE': 'E', 'FF': 'F',
                        'PH': 'F', 'KK': 'K', 'GG': 'G', 'GK': 'G', 'KG': 'G',
                        'CK': 'G', 'CC': 'C', 'IE': 'I', 'IH': 'I', 'MM': 'M',
                        'NN': 'N', 'OO': 'O', 'OH': 'O', 'SZ': 'S', 'UH': 'U',
                        'GS': 'X', 'KS': 'X', 'TZ': 'Z', 'AY': 'AI',
                        'EI': 'AI', 'EY': 'AI', 'EU': 'OI', 'RR': 'R',
                        'SS': 'S', 'KW': 'QU'},
                    1: {'P': 'B', 'T': 'D', 'V': 'F', 'W': 'F', 'C': 'G',
                        'K': 'G', 'Y': 'I'}}

    # Uppercase
    word = word.upper()

    # Replace umlauts/eszett
    word = word.replace('Ä', 'AE')
    word = word.replace('Ö', 'OE')
    word = word.replace('Ü', 'UE')
    word = word.replace('ß', 'SS')

    # Main loop, using above replacements table
    pos = 0
    while pos < len(word):
        for num in range(3, 0, -1):
            chunk = word[pos:pos+num]
            if chunk in replacements[num]:
                word = word[:pos] + replacements[num][chunk] + word[pos+num:]
                break
        # Advance by exactly one position whether or not a rule fired, so
        # the tail of a multi-letter replacement is itself re-examined
        # (e.g. 'EIE' -> 'AIE' -> 'AI').
        pos += 1

    # Change 'CH' back(?) to 'SCH'
    word = word.replace('CH', 'SCH')

    # Replace final sequences; endswith() is also safe for an empty word,
    # where indexing word[-1] would raise IndexError.
    if word.endswith('ER'):
        word = word[:-2]+'R'
    elif word.endswith('EL'):
        word = word[:-2]+'L'
    elif word.endswith('H'):
        word = word[:-1]

    return word
4998
4999
5000
def fonem(word):
    """Return the FONEM code of a word.

    FONEM is a phonetic algorithm designed for French (particularly surnames in
    Saguenay, Canada), defined in:
    Bouchard, Gérard, Patrick Brard, and Yolande Lavoie. 1981. "FONEM: Un code
    de transcription phonétique pour la reconstitution automatique des
    familles saguenayennes." Population. 36(6). 1085--1103.
    https://doi.org/10.2307/1532326
    http://www.persee.fr/doc/pop_0032-4663_1981_num_36_6_17248

    Guillaume Plique's Javascript implementation at
    https://github.com/Yomguithereal/talisman/blob/master/src/phonetics/french/fonem.js
    was also consulted for this implementation.

    The word is upper-cased and NFKD-normalized, the ligatures Æ and Œ are
    expanded to AE and OE, and every character other than A-Z and the hyphen
    is discarded. The rewrite rules are then applied one after another in a
    fixed sequence; some rules occur twice in that sequence.

    :param str word: the word to transform
    :returns: the FONEM code
    :rtype: str
    """
    # Each rule is a (pattern, replacement) pair. A pattern is either a
    # compiled regular expression or a plain string that is replaced
    # literally. Keys follow the rule numbering used in the table itself
    # (V = vowel rules, C = consonant rules).
    rules = {
        # Vowels & groups of vowels
        'V-1': (re.compile('E?AU'), 'O'),
        'V-2,5': (re.compile('(E?AU|O)L[TX]$'), 'O'),
        'V-3,4': (re.compile('E?AU[TX]$'), 'O'),
        'V-6': (re.compile('E?AUL?D$'), 'O'),
        'V-7': (re.compile(r'(?<!G)AY$'), 'E'),
        'V-8': (re.compile('EUX$'), 'EU'),
        'V-9': (re.compile('EY(?=$|[BCDFGHJKLMNPQRSTVWXZ])'), 'E'),
        'V-10': ('Y', 'I'),
        'V-11': (re.compile('(?<=[AEIOUY])I(?=[AEIOUY])'), 'Y'),
        'V-12': (re.compile('(?<=[AEIOUY])ILL'), 'Y'),
        'V-13': (re.compile('OU(?=[AEOU]|I(?!LL))'), 'W'),
        'V-14': (re.compile(r'([AEIOUY])(?=\1)'), ''),
        # Nasal vowels
        'V-15': (re.compile('[AE]M(?=[BCDFGHJKLMPQRSTVWXZ])(?!$)'), 'EN'),
        'V-16': (re.compile('OM(?=[BCDFGHJKLMPQRSTVWXZ])'), 'ON'),
        'V-17': (re.compile('AN(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'EN'),
        'V-18': (
            re.compile('(AI[MN]|EIN)(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'),
            'IN',
        ),
        'V-19': (re.compile('B(O|U|OU)RNE?$'), 'BURN'),
        'V-20': (
            re.compile('(^IM|(?<=[BCDFGHJKLMNPQRSTVWXZ])IM(?=[BCDFGHJKLMPQRSTVWXZ]))'),
            'IN',
        ),
        # Consonants and groups of consonants
        'C-1': ('BV', 'V'),
        'C-2': (re.compile('(?<=[AEIOUY])C(?=[EIY])'), 'SS'),
        'C-3': (re.compile('(?<=[BDFGHJKLMNPQRSTVWZ])C(?=[EIY])'), 'S'),
        'C-4': (re.compile('^C(?=[EIY])'), 'S'),
        'C-5': (re.compile('^C(?=[OUA])'), 'K'),
        'C-6': (re.compile('(?<=[AEIOUY])C$'), 'K'),
        'C-7': (re.compile('C(?=[BDFGJKLMNPQRSTVWXZ])'), 'K'),
        'C-8': (re.compile('CC(?=[AOU])'), 'K'),
        'C-9': (re.compile('CC(?=[EIY])'), 'X'),
        'C-10': (re.compile('G(?=[EIY])'), 'J'),
        'C-11': (re.compile('GA(?=I?[MN])'), 'G#'),
        'C-12': (re.compile('GE(O|AU)'), 'JO'),
        'C-13': (re.compile('GNI(?=[AEIOUY])'), 'GN'),
        'C-14': (re.compile('(?<![PCS])H'), ''),
        'C-15': ('JEA', 'JA'),
        'C-16': (re.compile('^MAC(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'MA#'),
        'C-17': (re.compile('^MC'), 'MA#'),
        'C-18': ('PH', 'F'),
        'C-19': ('QU', 'K'),
        'C-20': (re.compile('^SC(?=[EIY])'), 'S'),
        'C-21': (re.compile('(?<=.)SC(?=[EIY])'), 'SS'),
        'C-22': (re.compile('(?<=.)SC(?=[AOU])'), 'SK'),
        'C-23': ('SH', 'CH'),
        'C-24': (re.compile('TIA$'), 'SSIA'),
        'C-25': (re.compile('(?<=[AIOUY])W'), ''),
        'C-26': (re.compile('X[CSZ]'), 'X'),
        'C-27': (
            re.compile('(?<=[AEIOUY])Z|(?<=[BCDFGHJKLMNPQRSTVWXZ])Z(?=[BCDFGHJKLMNPQRSTVWXZ])'),
            'S',
        ),
        'C-28': (re.compile(r'([BDFGHJKMNPQRTVWXZ])\1'), r'\1'),
        'C-28a': (re.compile('CC(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'C'),
        'C-28b': (re.compile('((?<=[BCDFGHJKLMNPQRSTVWXZ])|^)SS'), 'S'),
        'C-28bb': (re.compile('SS(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'S'),
        'C-28c': (re.compile('((?<=[^I])|^)LL'), 'L'),
        'C-28d': (re.compile('ILE$'), 'ILLE'),
        'C-29': (
            re.compile('(ILS|[CS]H|[MN]P|R[CFKLNSX])$|([BCDFGHJKLMNPQRSTVWXZ])[BCDFGHJKLMNPQRSTVWXZ]$'),
            r'\1\2',
        ),
        'C-30,32': (
            re.compile('^(SA?INT?|SEI[NM]|CINQ?|ST)(?!E)-?'),
            'ST-',
        ),
        'C-31,33': (re.compile('^(SAINTE|STE)-?'), 'STE-'),
        # Rules to undo rule bleeding prevention in C-11, C-16, C-17
        'C-34': ('G#', 'GA'),
        'C-35': ('MA#', 'MAC'),
    }

    # Order of application. The doubled-letter rules (V-14, C-28*) run both
    # before and after all the others.
    sequence = [
        'V-14', 'C-28', 'C-28a', 'C-28b', 'C-28bb', 'C-28c', 'C-28d',
        'C-12',
        'C-8', 'C-9', 'C-10',
        'C-16', 'C-17', 'C-2', 'C-3', 'C-7',
        'V-2,5', 'V-3,4', 'V-6',
        'V-1', 'C-14',
        'C-31,33', 'C-30,32',
        'C-11', 'V-15', 'V-17', 'V-18',
        'V-7', 'V-8', 'V-9', 'V-10', 'V-11', 'V-12', 'V-13', 'V-16',
        'V-19', 'V-20',
        'C-1', 'C-4', 'C-5', 'C-6', 'C-13', 'C-15',
        'C-18', 'C-19', 'C-20', 'C-21', 'C-22', 'C-23', 'C-24',
        'C-25', 'C-26', 'C-27',
        'C-29',
        'V-14', 'C-28', 'C-28a', 'C-28b', 'C-28bb', 'C-28c', 'C-28d',
        'C-34', 'C-35',
    ]

    permitted = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                 'Y', 'Z', '-'}

    # normalize, upper-case, and filter non-French letters
    normalized = unicodedata.normalize('NFKD', text_type(word.upper()))
    # 198 and 338 are the code points of the ligatures Æ and Œ
    normalized = normalized.translate({198: 'AE', 338: 'OE'})
    word = ''.join(char for char in normalized if char in permitted)

    for key in sequence:
        pattern, replacement = rules[key]
        if not isinstance(pattern, text_type):
            # compiled regular expression
            word = pattern.sub(replacement, word)
        else:
            # literal string substitution
            word = word.replace(pattern, replacement)

    return word
5120
5121
5122
def parmar_kumbharana(word):
    """Return the Parmar-Kumbharana encoding of a word.

    This is based on the phonetic algorithm proposed in
    Parmar, Vimal P. and CK Kumbharana. 2014. "Study Existing Various Phonetic
    Algorithms and Designing and Development of a working model for the New
    Developed Algorithm and Comparison by implementing it with Existing
    Algorithm(s)." International Journal of Computer Applications. 98(19).
    https://doi.org/10.5120/17295-7795

    The word is upper-cased, runs of repeated characters are collapsed,
    the letter groups of the rule table are substituted from left to right
    (longest group first), and finally the vowels A, E, I, O, U and Y are
    removed everywhere except in the first position.

    :param str word: the word to transform
    :returns: the Parmar-Kumbharana encoding; an empty str for an empty word
    :rtype: str
    """
    rule_table = {4: {'OUGH': 'F'},
                  3: {'DGE': 'J',
                      'OUL': 'U',
                      'GHT': 'T'},
                  2: {'CE': 'S', 'CI': 'S', 'CY': 'S',
                      'GE': 'J', 'GI': 'J', 'GY': 'J',
                      'WR': 'R',
                      'GN': 'N', 'KN': 'N', 'PN': 'N',
                      'CK': 'K',
                      'SH': 'S'}}
    # Code points of A, E, I, O, U, Y, each mapped to the empty string
    vowel_trans = {65: '', 69: '', 73: '', 79: '', 85: '', 89: ''}

    word = word.upper()  # Rule 3
    word = _delete_consecutive_repeats(word)  # Rule 4

    # Rule 5
    i = 0
    while i < len(word):
        for match_len in range(4, 1, -1):
            chunk = word[i:i+match_len]
            if chunk in rule_table[match_len]:
                repl = rule_table[match_len][chunk]
                word = word[:i] + repl + word[i+match_len:]
                i += len(repl)
                # Stop at the first (longest) match. Without this break the
                # loop's else-clause would also run and skip one character.
                break
        else:
            # No rule matched at this position
            i += 1

    # Rule 6 -- slicing with [:1] keeps an empty word from raising IndexError
    word = word[:1] + word[1:].translate(vowel_trans)
    return word
5163
5164
5165
def bmpm(word, language_arg=0, name_mode='gen', match_mode='approx',
         concat=False, filter_langs=False):
    """Return the Beider-Morse Phonetic Matching algorithm code for a word.

    The Beider-Morse Phonetic Matching algorithm is described at:
    http://stevemorse.org/phonetics/bmpm.htm
    The reference implementation is licensed under GPLv3 and available at:
    http://stevemorse.org/phoneticinfo.htm

    This function is a thin public wrapper: all arguments are handed on
    unchanged, in the same order, to the internal ``_bmpm`` implementation.

    :param str word: the word to transform
    :param str language_arg: the language of the term (the default is the
        integer 0); supported values include:

            - 'any'
            - 'arabic'
            - 'cyrillic'
            - 'czech'
            - 'dutch'
            - 'english'
            - 'french'
            - 'german'
            - 'greek'
            - 'greeklatin'
            - 'hebrew'
            - 'hungarian'
            - 'italian'
            - 'polish'
            - 'portuguese'
            - 'romanian'
            - 'russian'
            - 'spanish'
            - 'turkish'
            - 'germandjsg'
            - 'polishdjskp'
            - 'russiandjsre'

    :param str name_mode: the name mode of the algorithm:

            - 'gen' -- general (default)
            - 'ash' -- Ashkenazi
            - 'sep' -- Sephardic

    :param str match_mode: matching mode: 'approx' or 'exact'
    :param bool concat: concatenation mode
    :param bool filter_langs: filter out incompatible languages
    :returns: the BMPM value(s); the examples below show them as a single
        space-separated string
    :rtype: tuple

    >>> bmpm('Christopher')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir xristopi xritopir xritopi xristofi xritofir xritofi tzristopir
    tzristofir zristopir zristopi zritopir zritopi zristofir zristofi zritofir
    zritofi'
    >>> bmpm('Niall')
    'nial niol'
    >>> bmpm('Smith')
    'zmit'
    >>> bmpm('Schmidt')
    'zmit stzmit'

    >>> bmpm('Christopher', language_arg='German')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir'
    >>> bmpm('Christopher', language_arg='English')
    'tzristofir tzrQstofir tzristafir tzrQstafir xristofir xrQstofir xristafir
    xrQstafir'
    >>> bmpm('Christopher', language_arg='German', name_mode='ash')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir'

    >>> bmpm('Christopher', language_arg='German', match_mode='exact')
    'xriStopher xriStofer xristopher xristofer'
    """
    # Positional order expected by _bmpm mirrors this function's signature
    forwarded = (word, language_arg, name_mode, match_mode,
                 concat, filter_langs)
    return _bmpm(*forwarded)
5240
5241
5242
if __name__ == '__main__':
    # Executed as a script: run the examples embedded in the docstrings
    from doctest import testmod
    testmod()
5245