Completed
Push — master ( 4580fb...b7249c )
by Chris
16:36 queued 06:17
created

abydos.phonetic.russell_index_alpha()   A

Complexity

Conditions 2

Size

Total Lines 22
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
eloc 4
nop 1
dl 0
loc 22
rs 10
c 0
b 0
f 0
1
# -*- coding: utf-8 -*-
0 ignored issues
show
coding-style introduced by
Too many lines in module (5922/1000)
Loading history...
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19
"""abydos.phonetic.
20
21
The phonetic module implements phonetic algorithms including:
22
23
    - Robert C. Russell's Index
24
    - American Soundex
25
    - Refined Soundex
26
    - Daitch-Mokotoff Soundex
27
    - Kölner Phonetik
28
    - NYSIIS
29
    - Match Rating Algorithm
30
    - Metaphone
31
    - Double Metaphone
32
    - Caverphone
33
    - Alpha Search Inquiry System
34
    - Fuzzy Soundex
35
    - Phonex
36
    - Phonem
37
    - Phonix
38
    - SfinxBis
39
    - phonet
40
    - Standardized Phonetic Frequency Code
41
    - Statistics Canada
42
    - Lein
43
    - Roger Root
44
    - Oxford Name Compression Algorithm (ONCA)
45
    - Eudex phonetic hash
46
    - Haase Phonetik
47
    - Reth-Schek Phonetik
48
    - FONEM
49
    - Parmar-Kumbharana
50
    - Davidson's Consonant Code
51
    - SoundD
52
    - PSHP Soundex/Viewex Coding
53
    - an early version of Henry Code
54
    - Norphone
55
    - Dolby Code
56
    - Beider-Morse Phonetic Matching
57
"""
58
59
from __future__ import division, unicode_literals
60
61
import re
62
import unicodedata
63
from collections import Counter
64
from itertools import groupby, product
65
66
from six import text_type
67
from six.moves import range
68
69
from ._bm import _bmpm
70
71
_INFINITY = float('inf')
72
73
74
def _delete_consecutive_repeats(word):
75
    """Delete consecutive repeated characters in a word.
76
77
    :param str word: the word to transform
78
    :returns: word with consecutive repeating characters collapsed to
79
        a single instance
80
    :rtype: str
81
    """
82
    return ''.join(char for char, _ in groupby(word))
83
84
85
def russell_index(word):
    """Compute the Russell Index of a word as an integer.

    The procedure follows Robert C. Russell's Index algorithm, as described
    in US Patent 1,261,167 (1917).

    :param str word: the word to transform
    :returns: the Russell Index value; NaN (a float) when no encodable
        letters remain
    :rtype: int

    >>> russell_index('Christopher')
    3813428
    >>> russell_index('Niall')
    715
    >>> russell_index('Smith')
    3614
    >>> russell_index('Schmidt')
    3614
    """
    # each encodable letter paired with its Russell digit class
    letter_codes = dict(zip('ABCDEFGIKLMNOPQRSTUVXYZ',
                            '12341231356712383412313'))

    cleaned = unicodedata.normalize('NFKD', text_type(word.upper()))
    cleaned = cleaned.replace('ß', 'SS')
    cleaned = cleaned.replace('GH', '')  # discard gh (rule 3)
    cleaned = cleaned.rstrip('SZ')  # discard /[sz]$/ (rule 3)

    # drop letters outside Russell's mapping and encode the rest
    digits = ''.join(letter_codes[c] for c in cleaned if c in letter_codes)

    # keep the first 1 but remove every later 1
    head, first_one, tail = digits.partition('1')
    if first_one:
        digits = head + first_one + tail.replace('1', '')

    # collapse runs of the same digit
    digits = _delete_consecutive_repeats(digits)

    # return as an int
    if digits:
        return int(digits)
    return float('NaN')
129
130
131
def russell_index_num_to_alpha(num):
    """Translate a Russell Index integer into its alphabetic form.

    The procedure follows Robert C. Russell's Index algorithm, as described
    in US Patent 1,261,167 (1917).

    :param int num: a Russell Index integer value
    :returns: the Russell Index as an alphabetic string; characters of the
        input other than the digits 1-8 are ignored
    :rtype: str

    >>> russell_index_num_to_alpha(3813428)
    'CRACDBR'
    >>> russell_index_num_to_alpha(715)
    'NAL'
    >>> russell_index_num_to_alpha(3614)
    'CMAD'
    """
    # digit class -> representative letter
    digit_letters = dict(zip('12345678', 'ABCDLMNR'))
    return ''.join(digit_letters[digit] for digit in text_type(num)
                   if digit in digit_letters)
155
156
157
def russell_index_alpha(word):
    """Compute the Russell Index of a word as an alphabetic string.

    The procedure follows Robert C. Russell's Index algorithm, as described
    in US Patent 1,261,167 (1917).

    :param str word: the word to transform
    :returns: the Russell Index value as an alphabetic string (empty for a
        falsy word)
    :rtype: str

    >>> russell_index_alpha('Christopher')
    'CRACDBR'
    >>> russell_index_alpha('Niall')
    'NAL'
    >>> russell_index_alpha('Smith')
    'CMAD'
    >>> russell_index_alpha('Schmidt')
    'CMAD'
    """
    if not word:
        return ''
    numeric_index = russell_index(word)
    return russell_index_num_to_alpha(numeric_index)
179
180
181
def soundex(word, maxlength=4, var='American', reverse=False, zero_pad=True):
    """Compute the Soundex code of a word.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 4);
        values are clamped to the range 4..64 and None is treated as 64
    :param str var: the variant of the algorithm to employ (defaults to
        'American'):

        - 'American' follows the American Soundex algorithm, as described at
          http://www.archives.gov/publications/general-info-leaflets/55-census.html
          and in Knuth(1998:394); this is also called Miracode
        - 'special' follows the rules from the 1880-1910 US Census
          retrospective re-analysis, in which h & w are not treated as blocking
          consonants but as vowels.
          Cf. http://creativyst.com/Doc/Articles/SoundEx1/SoundEx1.htm
        - 'Census' follows the rules laid out in GIL 55 by the US Census,
          including coding prefixed and unprefixed versions of some names
          (in which case a 2-tuple of codes is returned)

    :param bool reverse: reverse the word before computing the selected Soundex
        (defaults to False); This results in "Reverse Soundex"
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Soundex value
    :rtype: str

    >>> soundex("Christopher")
    'C623'
    >>> soundex("Niall")
    'N400'
    >>> soundex('Smith')
    'S530'
    >>> soundex('Schmidt')
    'S530'


    >>> soundex('Christopher', maxlength=_INFINITY)
    'C623160000000000000000000000000000000000000000000000000000000000'
    >>> soundex('Christopher', maxlength=_INFINITY, zero_pad=False)
    'C62316'

    >>> soundex('Christopher', reverse=True)
    'R132'

    >>> soundex('Ashcroft')
    'A261'
    >>> soundex('Asicroft')
    'A226'
    >>> soundex('Ashcroft', var='special')
    'A226'
    >>> soundex('Asicroft', var='special')
    'A226'
    """
    # letter -> Soundex digit; 0 marks vowels (and Y), 9 marks H and W
    code_for = dict(zip('ABCDEFGHIJKLMNOPQRSTUVWXYZ',
                        '01230129022455012623019202'))

    # Require a maxlength of at least 4 and not more than 64
    maxlength = 64 if maxlength is None else min(max(4, maxlength), 64)

    # uppercase, normalize, decompose
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    if var == 'Census':
        # Should these prefixes be supplemented? (VANDE, DELA, VON)
        # A prefixed name yields both the full and the prefix-less code,
        # provided at least two characters follow the prefix.
        for size, prefixes in ((3, {'VAN', 'CON'}),
                               (2, {'DE', 'DI', 'LA', 'LE'})):
            if word[:size] in prefixes and len(word) > size + 1:
                return (soundex(word, maxlength, 'American', reverse,
                                zero_pad),
                        soundex(word[size:], maxlength, 'American', reverse,
                                zero_pad))
        # Otherwise, proceed as usual (var='American' mode, ostensibly)

    # filter non-A-Z out
    word = ''.join(c for c in word if c in code_for)

    # Nothing to convert, return base case
    if not word:
        return '0'*maxlength if zero_pad else '0'

    # Reverse word if computing Reverse Soundex
    if reverse:
        word = word[::-1]

    # apply the Soundex algorithm
    sdx = ''.join(code_for[c] for c in word)

    # H/W either count as vowels (1880-1910 census special rule) or are
    # dropped outright (rule 1)
    sdx = sdx.replace('9', '0' if var == 'special' else '')
    sdx = _delete_consecutive_repeats(sdx)  # rule 3

    # The first letter is kept literally. For an initial H or W nothing is
    # cut from the digit string; otherwise its leading digit is replaced.
    initial = word[0]
    sdx = initial + (sdx if initial in 'HW' else sdx[1:])
    sdx = sdx.replace('0', '')  # rule 1

    if zero_pad:
        sdx += ('0'*maxlength)  # rule 4

    return sdx[:maxlength]
293
294
295
def refined_soundex(word, maxlength=_INFINITY, reverse=False, zero_pad=False,
                    retain_vowels=False):
    """Return the Refined Soundex code for a word.

    This is Soundex, but with more character classes. It was defined by
    Carolyn B. Boyce:
    https://web.archive.org/web/20010513121003/http://www.bluepoof.com:80/Soundex/info2.html

    A word containing no A-Z letters (after normalization) produces an empty
    code, or only the zero padding when zero_pad is set and maxlength is
    finite.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to
        unlimited)
    :param bool reverse: reverse the word before computing the selected
        Soundex (defaults to False); This results in "Reverse Soundex"
    :param bool zero_pad: pad the end of the return value with 0s to achieve
        a maxlength string (only applied when maxlength is finite)
    :param bool retain_vowels: retain vowels (as 0) in the resulting code
    :returns: the Refined Soundex value
    :rtype: str

    >>> refined_soundex('Christopher')
    'C393619'
    >>> refined_soundex('Niall')
    'N87'
    >>> refined_soundex('Smith')
    'S386'
    >>> refined_soundex('Schmidt')
    'S386'

    >>> refined_soundex('Christopher', retain_vowels=True)
    'C3090360109'
    >>> refined_soundex('')
    ''
    """
    _ref_soundex_translation = dict(zip((ord(_) for _ in
                                         'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                        '01360240043788015936020505'))

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    # Reverse word if computing Reverse Soundex
    if reverse:
        word = word[::-1]

    # apply the Soundex algorithm: the first letter is kept literally and is
    # followed by the digit code of every letter (including the first).
    # word[:1] instead of word[0] so that an empty word does not raise
    # IndexError.
    sdx = word[:1] + word.translate(_ref_soundex_translation)
    sdx = _delete_consecutive_repeats(sdx)
    if not retain_vowels:
        sdx = sdx.replace('0', '')  # Delete vowels, H, W, Y

    if maxlength < _INFINITY:
        if zero_pad:
            sdx += ('0' * maxlength)
        # a maxlength of 0 means "do not truncate"
        if maxlength:
            sdx = sdx[:maxlength]

    return sdx
351
352
353
def dm_soundex(word, maxlength=6, reverse=False, zero_pad=True):
    """Return the Daitch-Mokotoff Soundex code for a word.

    Returns values of a word as a set. A collection is necessary since there
    can be multiple values for a single word.

    :param word: the word to transform
    :param maxlength: the length of the code returned (defaults to 6);
        values are clamped to the range 6..64 and None is treated as 64
    :param reverse: reverse the word before computing the selected Soundex
        (defaults to False); This results in "Reverse Soundex"
    :param zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Daitch-Mokotoff Soundex value(s)
    :rtype: set

    >>> dm_soundex('Christopher')
    {'494379', '594379'}
    >>> dm_soundex('Niall')
    {'680000'}
    >>> dm_soundex('Smith')
    {'463000'}
    >>> dm_soundex('Schmidt')
    {'463000'}

    >>> dm_soundex('The quick brown fox', maxlength=20, zero_pad=False)
    {'35457976754', '3557976754'}
    """
    # Substring -> code triple, indexed by position of the substring:
    # [0] at the start of the word, [1] directly before a vowel,
    # [2] anywhere else. '_' is a placeholder that is removed at the end.
    # An entry whose members are 2-tuples has two alternative codes, which
    # makes the list of candidate codes branch.
    _dms_table = {'STCH': (2, 4, 4), 'DRZ': (4, 4, 4), 'ZH': (4, 4, 4),
                  'ZHDZH': (2, 4, 4), 'DZH': (4, 4, 4), 'DRS': (4, 4, 4),
                  'DZS': (4, 4, 4), 'SCHTCH': (2, 4, 4), 'SHTSH': (2, 4, 4),
                  'SZCZ': (2, 4, 4), 'TZS': (4, 4, 4), 'SZCS': (2, 4, 4),
                  'STSH': (2, 4, 4), 'SHCH': (2, 4, 4), 'D': (3, 3, 3),
                  'H': (5, 5, '_'), 'TTSCH': (4, 4, 4), 'THS': (4, 4, 4),
                  'L': (8, 8, 8), 'P': (7, 7, 7), 'CHS': (5, 54, 54),
                  'T': (3, 3, 3), 'X': (5, 54, 54), 'OJ': (0, 1, '_'),
                  'OI': (0, 1, '_'), 'SCHTSH': (2, 4, 4), 'OY': (0, 1, '_'),
                  'Y': (1, '_', '_'), 'TSH': (4, 4, 4), 'ZDZ': (2, 4, 4),
                  'TSZ': (4, 4, 4), 'SHT': (2, 43, 43), 'SCHTSCH': (2, 4, 4),
                  'TTSZ': (4, 4, 4), 'TTZ': (4, 4, 4), 'SCH': (4, 4, 4),
                  'TTS': (4, 4, 4), 'SZD': (2, 43, 43), 'AI': (0, 1, '_'),
                  'PF': (7, 7, 7), 'TCH': (4, 4, 4), 'PH': (7, 7, 7),
                  'TTCH': (4, 4, 4), 'SZT': (2, 43, 43), 'ZDZH': (2, 4, 4),
                  'EI': (0, 1, '_'), 'G': (5, 5, 5), 'EJ': (0, 1, '_'),
                  'ZD': (2, 43, 43), 'IU': (1, '_', '_'), 'K': (5, 5, 5),
                  'O': (0, '_', '_'), 'SHTCH': (2, 4, 4), 'S': (4, 4, 4),
                  'TRZ': (4, 4, 4), 'SHD': (2, 43, 43), 'DSH': (4, 4, 4),
                  'CSZ': (4, 4, 4), 'EU': (1, 1, '_'), 'TRS': (4, 4, 4),
                  'ZS': (4, 4, 4), 'STRZ': (2, 4, 4), 'UY': (0, 1, '_'),
                  'STRS': (2, 4, 4), 'CZS': (4, 4, 4),
                  'MN': ('6_6', '6_6', '6_6'), 'UI': (0, 1, '_'),
                  'UJ': (0, 1, '_'), 'UE': (0, '_', '_'), 'EY': (0, 1, '_'),
                  'W': (7, 7, 7), 'IA': (1, '_', '_'), 'FB': (7, 7, 7),
                  'STSCH': (2, 4, 4), 'SCHT': (2, 43, 43),
                  'NM': ('6_6', '6_6', '6_6'), 'SCHD': (2, 43, 43),
                  'B': (7, 7, 7), 'DSZ': (4, 4, 4), 'F': (7, 7, 7),
                  'N': (6, 6, 6), 'CZ': (4, 4, 4), 'R': (9, 9, 9),
                  'U': (0, '_', '_'), 'V': (7, 7, 7), 'CS': (4, 4, 4),
                  'Z': (4, 4, 4), 'SZ': (4, 4, 4), 'TSCH': (4, 4, 4),
                  'KH': (5, 5, 5), 'ST': (2, 43, 43), 'KS': (5, 54, 54),
                  'SH': (4, 4, 4), 'SC': (2, 4, 4), 'SD': (2, 43, 43),
                  'DZ': (4, 4, 4), 'ZHD': (2, 43, 43), 'DT': (3, 3, 3),
                  'ZSH': (4, 4, 4), 'DS': (4, 4, 4), 'TZ': (4, 4, 4),
                  'TS': (4, 4, 4), 'TH': (3, 3, 3), 'TC': (4, 4, 4),
                  'A': (0, '_', '_'), 'E': (0, '_', '_'), 'I': (0, '_', '_'),
                  'AJ': (0, 1, '_'), 'M': (6, 6, 6), 'Q': (5, 5, 5),
                  'AU': (0, 7, '_'), 'IO': (1, '_', '_'), 'AY': (0, 1, '_'),
                  'IE': (1, '_', '_'), 'ZSCH': (4, 4, 4),
                  'CH': ((5, 4), (5, 4), (5, 4)),
                  'CK': ((5, 45), (5, 45), (5, 45)),
                  'C': ((5, 4), (5, 4), (5, 4)),
                  'J': ((1, 4), ('_', 4), ('_', 4)),
                  'RZ': ((94, 4), (94, 4), (94, 4)),
                  'RS': ((94, 4), (94, 4), (94, 4))}

    # For each initial letter, the substrings to try, in the order they are
    # tested (the first match wins, and each list ends with the bare letter).
    # NOTE: the single-entry values such as ('B') are plain one-character
    # strings, not tuples; iterating them still yields that one letter.
    _dms_order = {'A': ('AI', 'AJ', 'AU', 'AY', 'A'),
                  'B': ('B'),
                  'C': ('CHS', 'CSZ', 'CZS', 'CH', 'CK', 'CS', 'CZ', 'C'),
                  'D': ('DRS', 'DRZ', 'DSH', 'DSZ', 'DZH', 'DZS', 'DS', 'DT',
                        'DZ', 'D'),
                  'E': ('EI', 'EJ', 'EU', 'EY', 'E'),
                  'F': ('FB', 'F'),
                  'G': ('G'),
                  'H': ('H'),
                  'I': ('IA', 'IE', 'IO', 'IU', 'I'),
                  'J': ('J'),
                  'K': ('KH', 'KS', 'K'),
                  'L': ('L'),
                  'M': ('MN', 'M'),
                  'N': ('NM', 'N'),
                  'O': ('OI', 'OJ', 'OY', 'O'),
                  'P': ('PF', 'PH', 'P'),
                  'Q': ('Q'),
                  'R': ('RS', 'RZ', 'R'),
                  'S': ('SCHTSCH', 'SCHTCH', 'SCHTSH', 'SHTCH', 'SHTSH',
                        'STSCH', 'SCHD', 'SCHT', 'SHCH', 'STCH', 'STRS',
                        'STRZ', 'STSH', 'SZCS', 'SZCZ', 'SCH', 'SHD', 'SHT',
                        'SZD', 'SZT', 'SC', 'SD', 'SH', 'ST', 'SZ', 'S'),
                  'T': ('TTSCH', 'TSCH', 'TTCH', 'TTSZ', 'TCH', 'THS', 'TRS',
                        'TRZ', 'TSH', 'TSZ', 'TTS', 'TTZ', 'TZS', 'TC', 'TH',
                        'TS', 'TZ', 'T'),
                  'U': ('UE', 'UI', 'UJ', 'UY', 'U'),
                  'V': ('V'),
                  'W': ('W'),
                  'X': ('X'),
                  'Y': ('Y'),
                  'Z': ('ZHDZH', 'ZDZH', 'ZSCH', 'ZDZ', 'ZHD', 'ZSH', 'ZD',
                        'ZH', 'ZS', 'Z')}

    # Letters that select the "before a vowel" code variant (J and Y included)
    _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}
    dms = ['']  # initialize empty code list

    # Require a maxlength of at least 6 and not more than 64
    if maxlength is not None:
        maxlength = min(max(6, maxlength), 64)
    else:
        maxlength = 64

    # uppercase, normalize, decompose, and filter non-A-Z
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    # Nothing to convert, return base case
    if not word:
        if zero_pad:
            return {'0'*maxlength}
        return {'0'}

    # Reverse word if computing Reverse Soundex
    if reverse:
        word = word[::-1]

    pos = 0
    while pos < len(word):
        # Iterate through _dms_order, which specifies the possible substrings
        # for which codes exist in the Daitch-Mokotoff coding
        for sstr in _dms_order[word[pos]]:  # pragma: no branch
            if word[pos:].startswith(sstr):
                # Having determined a valid substring start, retrieve the code
                dm_val = _dms_table[sstr]

                # Having retrieved the code (triple), determine the correct
                # positional variant (first, pre-vocalic, elsewhere)
                if pos == 0:
                    dm_val = dm_val[0]
                elif (pos+len(sstr) < len(word) and
                      word[pos+len(sstr)] in _vowels):
                    dm_val = dm_val[1]
                else:
                    dm_val = dm_val[2]

                # Build the code strings; a tuple value means two alternative
                # codes, so every code built so far is extended both ways
                if isinstance(dm_val, tuple):
                    dms = [_ + text_type(dm_val[0]) for _ in dms] \
                            + [_ + text_type(dm_val[1]) for _ in dms]
                else:
                    dms = [_ + text_type(dm_val) for _ in dms]
                # continue scanning after the matched substring
                pos += len(sstr)
                break

    # Filter out double letters and _ placeholders
    # (repeats are collapsed first, while the placeholders are still present)
    dms = (''.join(c for c in _delete_consecutive_repeats(_) if c != '_')
           for _ in dms)

    # Trim codes and return set
    if zero_pad:
        dms = ((_ + ('0'*maxlength))[:maxlength] for _ in dms)
    else:
        dms = (_[:maxlength] for _ in dms)
    return set(dms)
526
527
528
def koelner_phonetik(word):
    """Compute the Kölner Phonetik code of a word as a string of digits.

    Based on the algorithm described at
    https://de.wikipedia.org/wiki/Kölner_Phonetik

    The code consists of digits only, but it is returned as a str because a
    code may begin with 0.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as a numeric string
    :rtype: str

    >>> koelner_phonetik('Christopher')
    '478237'
    >>> koelner_phonetik('Niall')
    '65'
    >>> koelner_phonetik('Smith')
    '862'
    >>> koelner_phonetik('Schmidt')
    '862'
    >>> koelner_phonetik('Müller')
    '657'
    >>> koelner_phonetik('Zimmermann')
    '86766'
    """
    # pylint: disable=too-many-branches
    def _preceded_by(text, idx, letters):
        """Return True if text[idx] follows one of the supplied letters."""
        return idx > 0 and text[idx-1] in letters

    def _followed_by(text, idx, letters):
        """Return True if text[idx] precedes one of the supplied letters."""
        return idx+1 < len(text) and text[idx+1] in letters

    _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}

    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    word = word.replace('Ä', 'AE')
    word = word.replace('Ö', 'OE')
    word = word.replace('Ü', 'UE')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    # Nothing to convert, return base case
    if not word:
        return ''

    pieces = []
    for i, char in enumerate(word):
        if char in _vowels:
            pieces.append('0')
        elif char == 'B':
            pieces.append('1')
        elif char == 'P':
            pieces.append('3' if _followed_by(word, i, {'H'}) else '1')
        elif char in {'D', 'T'}:
            pieces.append('8' if _followed_by(word, i, {'C', 'S', 'Z'})
                          else '2')
        elif char in {'F', 'V', 'W'}:
            pieces.append('3')
        elif char in {'G', 'K', 'Q'}:
            pieces.append('4')
        elif char == 'C':
            # the letters that make C hard differ for a word-initial C
            if i == 0:
                hard_before = {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'}
            else:
                hard_before = {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}
            if (_preceded_by(word, i, {'S', 'Z'}) or
                    not _followed_by(word, i, hard_before)):
                pieces.append('8')
            else:
                pieces.append('4')
        elif char == 'X':
            pieces.append('8' if _preceded_by(word, i, {'C', 'K', 'Q'})
                          else '48')
        elif char == 'L':
            pieces.append('5')
        elif char in {'M', 'N'}:
            pieces.append('6')
        elif char == 'R':
            pieces.append('7')
        elif char in {'S', 'Z'}:
            pieces.append('8')
        # any other letter (H) contributes nothing

    sdx = _delete_consecutive_repeats(''.join(pieces))

    # a 0 is kept only in the leading position
    if sdx:
        sdx = sdx[0] + sdx[1:].replace('0', '')

    return sdx
638
639
640
def koelner_phonetik_num_to_alpha(num):
    """Translate a numeric Kölner Phonetik code into letters.

    :param str num: a numeric Kölner Phonetik representation
    :returns: an alphabetic representation of the same word; characters of
        the input other than the digits 0-8 are ignored
    :rtype: str

    >>> koelner_phonetik_num_to_alpha(862)
    'SNT'
    >>> koelner_phonetik_num_to_alpha(657)
    'NLR'
    >>> koelner_phonetik_num_to_alpha(86766)
    'SNRNN'
    """
    # digit -> representative letter
    digit_letters = dict(zip('012345678', 'APTFKLNRS'))
    return ''.join(digit_letters[digit] for digit in text_type(num)
                   if digit in digit_letters)
659
660
661
def koelner_phonetik_alpha(word):
    """Compute the Kölner Phonetik code of a word as letters.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as an alphabetic string
    :rtype: str

    >>> koelner_phonetik_alpha('Smith')
    'SNT'
    >>> koelner_phonetik_alpha('Schmidt')
    'SNT'
    >>> koelner_phonetik_alpha('Müller')
    'NLR'
    >>> koelner_phonetik_alpha('Zimmermann')
    'SNRNN'
    """
    numeric_code = koelner_phonetik(word)
    return koelner_phonetik_num_to_alpha(numeric_code)
678
679
680
def nysiis(word, maxlength=6, modified=False):
    """Return the NYSIIS code for a word.

    A description of the New York State Identification and Intelligence System
    algorithm can be found at
    https://en.wikipedia.org/wiki/New_York_State_Identification_and_Intelligence_System

    The modified version of this algorithm is described in Appendix B of
    Lynch, Billy T. and William L. Arends. `Selection of a Surname Coding
    Procedure for the SRS Record Linkage System.` Statistical Reporting
    Service, U.S. Department of Agriculture, Washington, D.C. February 1977.
    https://naldc.nal.usda.gov/download/27833/PDF

    Inputs that are reduced to nothing by the rules (for example a lone 'S'
    in modified mode) yield an empty string rather than raising.

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 6) of the code to
        return; truthy values below 6 are raised to 6, and a falsy value or
        infinity disables truncation
    :param bool modified: indicates whether to use USDA modified NYSIIS
    :returns: the NYSIIS value; in modified mode the literal string 'ERROR'
        is returned for words ending in JR or SR
    :rtype: str

    >>> nysiis('Christopher')
    'CRASTA'
    >>> nysiis('Niall')
    'NAL'
    >>> nysiis('Smith')
    'SNAT'
    >>> nysiis('Schmidt')
    'SNAD'

    >>> nysiis('Christopher', maxlength=_INFINITY)
    'CRASTAFAR'

    >>> nysiis('Christopher', maxlength=8, modified=True)
    'CRASTAFA'
    >>> nysiis('Niall', maxlength=8, modified=True)
    'NAL'
    >>> nysiis('Smith', maxlength=8, modified=True)
    'SNAT'
    >>> nysiis('Schmidt', maxlength=8, modified=True)
    'SNAD'
    """
    # Require a maxlength of at least 6
    if maxlength:
        maxlength = max(6, maxlength)

    _vowels = {'A', 'E', 'I', 'O', 'U'}

    word = ''.join(c for c in word.upper() if c.isalpha())
    word = word.replace('ß', 'SS')

    # exit early if there are no alphas
    if not word:
        return ''

    # Only consulted in modified mode, but always bound so that the name is
    # defined on every path.
    original_first_char = word[0]

    # Prefix transformations
    if word[:3] == 'MAC':
        word = 'MCC'+word[3:]
    elif word[:2] == 'KN':
        word = 'NN'+word[2:]
    elif word[:1] == 'K':
        word = 'C'+word[1:]
    elif word[:2] in {'PH', 'PF'}:
        word = 'FF'+word[2:]
    elif word[:3] == 'SCH':
        word = 'SSS'+word[3:]
    elif modified:
        if word[:2] == 'WR':
            word = 'RR'+word[2:]
        elif word[:2] == 'RH':
            word = 'RR'+word[2:]
        elif word[:2] == 'DG':
            word = 'GG'+word[2:]
        elif word[:1] in _vowels:
            word = 'A'+word[1:]

    # Modified mode drops one trailing S or Z. This can leave the word empty,
    # so everything below uses slices instead of single-index access.
    if modified and word[-1:] in {'S', 'Z'}:
        word = word[:-1]

    # Suffix transformations
    if word[-2:] == 'EE' or word[-2:] == 'IE' or (modified and
                                                  word[-2:] == 'YE'):
        word = word[:-2]+'Y'
    elif word[-2:] in {'DT', 'RT', 'RD'}:
        word = word[:-2]+'D'
    elif word[-2:] in {'NT', 'ND'}:
        word = word[:-2]+('N' if modified else 'D')
    elif modified:
        if word[-2:] == 'IX':
            word = word[:-2]+'ICK'
        elif word[-2:] == 'EX':
            word = word[:-2]+'ECK'
        elif word[-2:] in {'JR', 'SR'}:
            return 'ERROR'  # TODO: decide how best to return an error

    # The key starts with the (transformed) first character, if any remains
    key = word[:1]

    skip = 0
    for i in range(1, len(word)):
        # word can shrink while it is rewritten below (e.g. KN -> N), so the
        # precomputed range may run past its current end
        if i >= len(word):
            continue
        elif skip:
            # position already consumed by a multi-character replacement
            skip -= 1
            continue
        elif word[i:i+2] == 'EV':
            word = word[:i] + 'AF' + word[i+2:]
            skip = 1
        elif word[i] in _vowels:
            word = word[:i] + 'A' + word[i+1:]
        elif modified and i != len(word)-1 and word[i] == 'Y':
            word = word[:i] + 'A' + word[i+1:]
        elif word[i] == 'Q':
            word = word[:i] + 'G' + word[i+1:]
        elif word[i] == 'Z':
            word = word[:i] + 'S' + word[i+1:]
        elif word[i] == 'M':
            word = word[:i] + 'N' + word[i+1:]
        elif word[i:i+2] == 'KN':
            word = word[:i] + 'N' + word[i+2:]
        elif word[i] == 'K':
            word = word[:i] + 'C' + word[i+1:]
        elif modified and i == len(word)-3 and word[i:i+3] == 'SCH':
            word = word[:i] + 'SSA'
            skip = 2
        elif word[i:i+3] == 'SCH':
            word = word[:i] + 'SSS' + word[i+3:]
            skip = 2
        elif modified and i == len(word)-2 and word[i:i+2] == 'SH':
            word = word[:i] + 'SA'
            skip = 1
        elif word[i:i+2] == 'SH':
            word = word[:i] + 'SS' + word[i+2:]
            skip = 1
        elif word[i:i+2] == 'PH':
            word = word[:i] + 'FF' + word[i+2:]
            skip = 1
        elif modified and word[i:i+3] == 'GHT':
            word = word[:i] + 'TTT' + word[i+3:]
            skip = 2
        elif modified and word[i:i+2] == 'DG':
            word = word[:i] + 'GG' + word[i+2:]
            skip = 1
        elif modified and word[i:i+2] == 'WR':
            word = word[:i] + 'RR' + word[i+2:]
            skip = 1
        elif word[i] == 'H' and (word[i-1] not in _vowels or
                                 word[i+1:i+2] not in _vowels):
            # H not flanked by vowels on both sides takes the previous letter
            word = word[:i] + word[i-1] + word[i+1:]
        elif word[i] == 'W' and word[i-1] in _vowels:
            word = word[:i] + word[i-1] + word[i+1:]

        # append the (possibly multi-character) replacement unless it just
        # repeats the last key character
        if word[i:i+skip+1] != key[-1:]:
            key += word[i:i+skip+1]

    key = _delete_consecutive_repeats(key)

    # Final clean-up of the key; slices keep these safe on an empty key
    if key[-1:] == 'S':
        key = key[:-1]
    if key[-2:] == 'AY':
        key = key[:-2] + 'Y'
    if key[-1:] == 'A':
        key = key[:-1]
    if modified and key[:1] == 'A':
        key = original_first_char + key[1:]

    if maxlength and maxlength < _INFINITY:
        key = key[:maxlength]

    return key
848
849
850
def mra(word):
    """Encode a word as its MRA personal numeric identifier (PNI).

    The Western Airlines Surname Match Rating Algorithm is described on
    page 18 of
    https://archive.org/details/accessingindivid00moor

    The word is upper-cased (with 'ß' expanded to 'SS'), every
    A/E/I/O/U after the first character is dropped, the result is passed
    through ``_delete_consecutive_repeats``, and a code longer than six
    characters is reduced to its first three plus its last three characters.

    :param str word: the word to transform
    :returns: the MRA PNI (a falsy ``word`` is returned unchanged)
    :rtype: str

    >>> mra('Christopher')
    'CHRPHR'
    >>> mra('Niall')
    'NL'
    >>> mra('Smith')
    'SMTH'
    >>> mra('Schmidt')
    'SCHMDT'
    """
    if not word:
        return word

    normalized = word.upper().replace('ß', 'SS')

    # The leading character is always kept, even when it is a vowel.
    initial, rest = normalized[0], normalized[1:]
    kept = [char for char in rest
            if char not in ('A', 'E', 'I', 'O', 'U')]

    code = _delete_consecutive_repeats(initial + ''.join(kept))

    # Long codes keep only their first three and last three characters.
    return code if len(code) <= 6 else code[:3] + code[-3:]
880
881
882
def metaphone(word, maxlength=_INFINITY):
    """Return the Metaphone code for a word.

    Based on Lawrence Philips' Pick BASIC code from 1990:
    http://aspell.net/metaphone/metaphone.basic
    This incorporates some corrections to the above code, particularly
    some of those suggested by Michael Kuhn in:
    http://aspell.net/metaphone/metaphone-kuhn.txt

    :param str word: the word to transform
    :param int maxlength: the maximum length of the returned Metaphone code
        (defaults to unlimited, but in Philips' original implementation
        this was 4); values below 4 are raised to 4 and ``None`` is treated
        as 64
    :returns: the Metaphone value, never longer than the effective
        ``maxlength``; the empty string if ``word`` contains no alphanumeric
        characters
    :rtype: str


    >>> metaphone('Christopher')
    'KRSTFR'
    >>> metaphone('Niall')
    'NL'
    >>> metaphone('Smith')
    'SM0'
    >>> metaphone('Schmidt')
    'SKMTT'
    """
    # pylint: disable=too-many-branches
    _vowels = {'A', 'E', 'I', 'O', 'U'}
    _frontv = {'E', 'I', 'Y'}
    # As in variable sound--those modified by adding an "h"
    _varson = {'C', 'G', 'P', 'S', 'T'}

    # Require a maxlength of at least 4
    if maxlength is not None:
        maxlength = max(4, maxlength)
    else:
        maxlength = 64

    # Delete nonalphanumeric characters and make all caps
    ename = ''.join(c for c in word.upper() if c.isalnum())
    ename = ename.replace('ß', 'SS')

    if not ename:
        return ''

    # Simplify word-initial letter combinations
    if ename[0:2] in {'PN', 'AE', 'KN', 'GN', 'WR'}:
        ename = ename[1:]
    elif ename[0] == 'X':
        ename = 'S' + ename[1:]
    elif ename[0:2] == 'WH':
        ename = 'W' + ename[2:]

    # Convert to metaph
    elen = len(ename)-1  # index of the last character
    metaph = ''
    for i, char in enumerate(ename):
        if len(metaph) >= maxlength:
            break
        # Skip the second of two identical letters (except G and T, whose
        # branches below handle doubling themselves)
        if char not in {'G', 'T'} and i > 0 and ename[i-1] == char:
            continue

        if char in _vowels and i == 0:
            # Vowels are only encoded at the start of the word
            metaph = char

        elif char == 'B':
            # Silent in a word-final "MB"
            if i != elen or ename[i-1] != 'M':
                metaph += char

        elif char == 'C':
            if not (i > 0 and ename[i-1] == 'S' and ename[i+1:i+2] in _frontv):
                if ename[i+1:i+3] == 'IA':
                    metaph += 'X'
                elif ename[i+1:i+2] in _frontv:
                    metaph += 'S'
                elif i > 0 and ename[i-1:i+2] == 'SCH':
                    metaph += 'K'
                elif ename[i+1:i+2] == 'H':
                    if i == 0 and i+1 < elen and ename[i+2:i+3] not in _vowels:
                        metaph += 'K'
                    else:
                        metaph += 'X'
                else:
                    metaph += 'K'

        elif char == 'D':
            if ename[i+1:i+2] == 'G' and ename[i+2:i+3] in _frontv:
                metaph += 'J'
            else:
                metaph += 'T'

        elif char == 'G':
            if ename[i+1:i+2] == 'H' and not (i+1 == elen or
                                              ename[i+2:i+3] not in _vowels):
                continue
            elif i > 0 and ((i+1 == elen and ename[i+1] == 'N') or
                            (i+3 == elen and ename[i+1:i+4] == 'NED')):
                continue
            elif (i-1 > 0 and i+1 <= elen and ename[i-1] == 'D' and
                  ename[i+1] in _frontv):
                continue
            elif ename[i+1:i+2] == 'G':
                continue
            elif ename[i+1:i+2] in _frontv:
                if i == 0 or ename[i-1] != 'G':
                    metaph += 'J'
                else:
                    metaph += 'K'
            else:
                metaph += 'K'

        elif char == 'H':
            if ((i > 0 and ename[i-1] in _vowels and
                 ename[i+1:i+2] not in _vowels)):
                continue
            elif i > 0 and ename[i-1] in _varson:
                continue
            else:
                metaph += 'H'

        elif char in {'F', 'J', 'L', 'M', 'N', 'R'}:
            metaph += char

        elif char == 'K':
            if i > 0 and ename[i-1] == 'C':
                continue
            else:
                metaph += 'K'

        elif char == 'P':
            if ename[i+1:i+2] == 'H':
                metaph += 'F'
            else:
                metaph += 'P'

        elif char == 'Q':
            metaph += 'K'

        elif char == 'S':
            if ((i > 0 and i+2 <= elen and ename[i+1] == 'I' and
                 ename[i+2] in {'A', 'O'})):
                metaph += 'X'
            elif ename[i+1:i+2] == 'H':
                metaph += 'X'
            else:
                metaph += 'S'

        elif char == 'T':
            if ((i > 0 and i+2 <= elen and ename[i+1] == 'I' and
                 ename[i+2] in {'A', 'O'})):
                metaph += 'X'
            elif ename[i+1:i+2] == 'H':
                # '0' is the code emitted for 'TH'
                metaph += '0'
            elif ename[i+1:i+3] != 'CH':
                if ename[i-1:i] != 'T':
                    metaph += 'T'

        elif char == 'V':
            metaph += 'F'

        elif char in 'WY':
            if ename[i+1:i+2] in _vowels:
                metaph += char

        elif char == 'X':
            metaph += 'KS'

        elif char == 'Z':
            metaph += 'S'

    # 'X' contributes two symbols ('KS'), so the loop above can overshoot the
    # limit by one character; trim so the documented maximum is honoured.
    # (An unlimited maxlength never satisfies this comparison.)
    if len(metaph) > maxlength:
        metaph = metaph[:int(maxlength)]

    return metaph
1052
1053
1054
def double_metaphone(word, maxlength=_INFINITY):
1055
    """Return the Double Metaphone code for a word.
1056
1057
    Based on Lawrence Philips' (Visual) C++ code from 1999:
1058
    http://aspell.net/metaphone/dmetaph.cpp
1059
1060
    :param word: the word to transform
1061
    :param maxlength: the maximum length of the returned Double Metaphone codes
1062
        (defaults to unlimited, but in Philips' original implementation this
1063
        was 4)
1064
    :returns: the Double Metaphone value(s)
1065
    :rtype: tuple
1066
1067
    >>> double_metaphone('Christopher')
1068
    ('KRSTFR', '')
1069
    >>> double_metaphone('Niall')
1070
    ('NL', '')
1071
    >>> double_metaphone('Smith')
1072
    ('SM0', 'XMT')
1073
    >>> double_metaphone('Schmidt')
1074
    ('XMT', 'SMT')
1075
    """
1076
    # pylint: disable=too-many-branches
1077
    # Require a maxlength of at least 4
1078
    if maxlength is not None:
1079
        maxlength = max(4, maxlength)
1080
    else:
1081
        maxlength = 64
1082
1083
    primary = ''
1084
    secondary = ''
1085
1086
    def _slavo_germanic():
1087
        """Return True if the word appears to be Slavic or Germanic."""
1088
        if 'W' in word or 'K' in word or 'CZ' in word:
1089
            return True
1090
        return False
1091
1092
    def _metaph_add(pri, sec=''):
1093
        """Return a new metaphone tuple with the supplied elements."""
1094
        newpri = primary
1095
        newsec = secondary
1096
        if pri:
1097
            newpri += pri
1098
        if sec:
1099
            if sec != ' ':
1100
                newsec += sec
1101
        else:
1102
            newsec += pri
1103
        return (newpri, newsec)
1104
1105
    def _is_vowel(pos):
1106
        """Return True if the character at word[pos] is a vowel."""
1107
        if pos >= 0 and word[pos] in {'A', 'E', 'I', 'O', 'U', 'Y'}:
1108
            return True
1109
        return False
1110
1111
    def _get_at(pos):
1112
        """Return the character at word[pos]."""
1113
        return word[pos]
1114
1115
    def _string_at(pos, slen, substrings):
1116
        """Return True if word[pos:pos+slen] is in substrings."""
1117
        if pos < 0:
1118
            return False
1119
        return word[pos:pos+slen] in substrings
1120
1121
    current = 0
1122
    length = len(word)
1123
    if length < 1:
1124
        return ('', '')
1125
    last = length - 1
1126
1127
    word = word.upper()
1128
    word = word.replace('ß', 'SS')
1129
1130
    # Pad the original string so that we can index beyond the edge of the world
1131
    word += '     '
1132
1133
    # Skip these when at start of word
1134
    if word[0:2] in {'GN', 'KN', 'PN', 'WR', 'PS'}:
1135
        current += 1
1136
1137
    # Initial 'X' is pronounced 'Z' e.g. 'Xavier'
1138
    if _get_at(0) == 'X':
1139
        (primary, secondary) = _metaph_add('S')  # 'Z' maps to 'S'
1140
        current += 1
1141
1142
    # Main loop
1143
    while True:
0 ignored issues
show
unused-code introduced by
Too many nested blocks (6/5)
Loading history...
1144
        if current >= length:
1145
            break
1146
1147
        if _get_at(current) in {'A', 'E', 'I', 'O', 'U', 'Y'}:
1148
            if current == 0:
1149
                # All init vowels now map to 'A'
1150
                (primary, secondary) = _metaph_add('A')
1151
            current += 1
1152
            continue
1153
1154
        elif _get_at(current) == 'B':
1155
            # "-mb", e.g", "dumb", already skipped over...
1156
            (primary, secondary) = _metaph_add('P')
1157
            if _get_at(current + 1) == 'B':
1158
                current += 2
1159
            else:
1160
                current += 1
1161
            continue
1162
1163
        elif _get_at(current) == 'Ç':
1164
            (primary, secondary) = _metaph_add('S')
1165
            current += 1
1166
            continue
1167
1168
        elif _get_at(current) == 'C':
1169
            # Various Germanic
1170
            if (current > 1 and not _is_vowel(current - 2) and
0 ignored issues
show
best-practice introduced by
Too many boolean expressions in if statement (6/5)
Loading history...
1171
                    _string_at((current - 1), 3, {'ACH'}) and
1172
                    ((_get_at(current + 2) != 'I') and
1173
                     ((_get_at(current + 2) != 'E') or
1174
                      _string_at((current - 2), 6,
1175
                                 {'BACHER', 'MACHER'})))):
1176
                (primary, secondary) = _metaph_add('K')
1177
                current += 2
1178
                continue
1179
1180
            # Special case 'caesar'
1181
            elif current == 0 and _string_at(current, 6, {'CAESAR'}):
1182
                (primary, secondary) = _metaph_add('S')
1183
                current += 2
1184
                continue
1185
1186
            # Italian 'chianti'
1187
            elif _string_at(current, 4, {'CHIA'}):
1188
                (primary, secondary) = _metaph_add('K')
1189
                current += 2
1190
                continue
1191
1192
            elif _string_at(current, 2, {'CH'}):
1193
                # Find 'Michael'
1194
                if current > 0 and _string_at(current, 4, {'CHAE'}):
1195
                    (primary, secondary) = _metaph_add('K', 'X')
1196
                    current += 2
1197
                    continue
1198
1199
                # Greek roots e.g. 'chemistry', 'chorus'
1200
                elif (current == 0 and
1201
                      (_string_at((current + 1), 5,
1202
                                  {'HARAC', 'HARIS'}) or
1203
                       _string_at((current + 1), 3,
1204
                                  {'HOR', 'HYM', 'HIA', 'HEM'})) and
1205
                      not _string_at(0, 5, {'CHORE'})):
1206
                    (primary, secondary) = _metaph_add('K')
1207
                    current += 2
1208
                    continue
1209
1210
                # Germanic, Greek, or otherwise 'ch' for 'kh' sound
1211
                elif ((_string_at(0, 4, {'VAN ', 'VON '}) or
0 ignored issues
show
best-practice introduced by
Too many boolean expressions in if statement (7/5)
Loading history...
1212
                       _string_at(0, 3, {'SCH'})) or
1213
                      # 'architect but not 'arch', 'orchestra', 'orchid'
1214
                      _string_at((current - 2), 6,
1215
                                 {'ORCHES', 'ARCHIT', 'ORCHID'}) or
1216
                      _string_at((current + 2), 1, {'T', 'S'}) or
1217
                      ((_string_at((current - 1), 1,
1218
                                   {'A', 'O', 'U', 'E'}) or
1219
                        (current == 0)) and
1220
                       # e.g., 'wachtler', 'wechsler', but not 'tichner'
1221
                       _string_at((current + 2), 1,
1222
                                  {'L', 'R', 'N', 'M', 'B', 'H', 'F', 'V', 'W',
1223
                                   ' '}))):
1224
                    (primary, secondary) = _metaph_add('K')
1225
1226
                else:
1227
                    if current > 0:
1228
                        if _string_at(0, 2, {'MC'}):
1229
                            # e.g., "McHugh"
1230
                            (primary, secondary) = _metaph_add('K')
1231
                        else:
1232
                            (primary, secondary) = _metaph_add('X', 'K')
1233
                    else:
1234
                        (primary, secondary) = _metaph_add('X')
1235
1236
                current += 2
1237
                continue
1238
1239
            # e.g, 'czerny'
1240
            elif (_string_at(current, 2, {'CZ'}) and
1241
                  not _string_at((current - 2), 4, {'WICZ'})):
1242
                (primary, secondary) = _metaph_add('S', 'X')
1243
                current += 2
1244
                continue
1245
1246
            # e.g., 'focaccia'
1247
            elif _string_at((current + 1), 3, {'CIA'}):
1248
                (primary, secondary) = _metaph_add('X')
1249
                current += 3
1250
1251
            # double 'C', but not if e.g. 'McClellan'
1252
            elif (_string_at(current, 2, {'CC'}) and
1253
                  not ((current == 1) and (_get_at(0) == 'M'))):
1254
                # 'bellocchio' but not 'bacchus'
1255
                if ((_string_at((current + 2), 1,
1256
                                {'I', 'E', 'H'}) and
1257
                     not _string_at((current + 2), 2, ['HU']))):
1258
                    # 'accident', 'accede' 'succeed'
1259
                    if ((((current == 1) and _get_at(current - 1) == 'A') or
1260
                         _string_at((current - 1), 5,
1261
                                    {'UCCEE', 'UCCES'}))):
1262
                        (primary, secondary) = _metaph_add('KS')
1263
                    # 'bacci', 'bertucci', other italian
1264
                    else:
1265
                        (primary, secondary) = _metaph_add('X')
1266
                    current += 3
1267
                    continue
1268
                else:  # Pierce's rule
1269
                    (primary, secondary) = _metaph_add('K')
1270
                    current += 2
1271
                    continue
1272
1273
            elif _string_at(current, 2, {'CK', 'CG', 'CQ'}):
1274
                (primary, secondary) = _metaph_add('K')
1275
                current += 2
1276
                continue
1277
1278
            elif _string_at(current, 2, {'CI', 'CE', 'CY'}):
1279
                # Italian vs. English
1280
                if _string_at(current, 3, {'CIO', 'CIE', 'CIA'}):
1281
                    (primary, secondary) = _metaph_add('S', 'X')
1282
                else:
1283
                    (primary, secondary) = _metaph_add('S')
1284
                current += 2
1285
                continue
1286
1287
            # else
1288
            else:
1289
                (primary, secondary) = _metaph_add('K')
1290
1291
                # name sent in 'mac caffrey', 'mac gregor
1292
                if _string_at((current + 1), 2, {' C', ' Q', ' G'}):
1293
                    current += 3
1294
                elif (_string_at((current + 1), 1,
1295
                                 {'C', 'K', 'Q'}) and
1296
                      not _string_at((current + 1), 2, {'CE', 'CI'})):
1297
                    current += 2
1298
                else:
1299
                    current += 1
1300
                continue
1301
1302
        elif _get_at(current) == 'D':
1303
            if _string_at(current, 2, {'DG'}):
1304
                if _string_at((current + 2), 1, {'I', 'E', 'Y'}):
1305
                    # e.g. 'edge'
1306
                    (primary, secondary) = _metaph_add('J')
1307
                    current += 3
1308
                    continue
1309
                else:
1310
                    # e.g. 'edgar'
1311
                    (primary, secondary) = _metaph_add('TK')
1312
                    current += 2
1313
                    continue
1314
1315
            elif _string_at(current, 2, {'DT', 'DD'}):
1316
                (primary, secondary) = _metaph_add('T')
1317
                current += 2
1318
                continue
1319
1320
            # else
1321
            else:
1322
                (primary, secondary) = _metaph_add('T')
1323
                current += 1
1324
                continue
1325
1326
        elif _get_at(current) == 'F':
1327
            if _get_at(current + 1) == 'F':
1328
                current += 2
1329
            else:
1330
                current += 1
1331
            (primary, secondary) = _metaph_add('F')
1332
            continue
1333
1334
        elif _get_at(current) == 'G':
1335
            if _get_at(current + 1) == 'H':
1336
                if (current > 0) and not _is_vowel(current - 1):
1337
                    (primary, secondary) = _metaph_add('K')
1338
                    current += 2
1339
                    continue
1340
1341
                # 'ghislane', ghiradelli
1342
                elif current == 0:
1343
                    if _get_at(current + 2) == 'I':
1344
                        (primary, secondary) = _metaph_add('J')
1345
                    else:
1346
                        (primary, secondary) = _metaph_add('K')
1347
                    current += 2
1348
                    continue
1349
1350
                # Parker's rule (with some further refinements) - e.g., 'hugh'
1351
                elif (((current > 1) and
0 ignored issues
show
best-practice introduced by
Too many boolean expressions in if statement (6/5)
Loading history...
1352
                       _string_at((current - 2), 1, {'B', 'H', 'D'})) or
1353
                      # e.g., 'bough'
1354
                      ((current > 2) and
1355
                       _string_at((current - 3), 1, {'B', 'H', 'D'})) or
1356
                      # e.g., 'broughton'
1357
                      ((current > 3) and
1358
                       _string_at((current - 4), 1, {'B', 'H'}))):
1359
                    current += 2
1360
                    continue
1361
                else:
1362
                    # e.g. 'laugh', 'McLaughlin', 'cough',
1363
                    #      'gough', 'rough', 'tough'
1364
                    if ((current > 2) and
1365
                            (_get_at(current - 1) == 'U') and
1366
                            (_string_at((current - 3), 1,
1367
                                        {'C', 'G', 'L', 'R', 'T'}))):
1368
                        (primary, secondary) = _metaph_add('F')
1369
                    elif (current > 0) and _get_at(current - 1) != 'I':
1370
                        (primary, secondary) = _metaph_add('K')
1371
                    current += 2
1372
                    continue
1373
1374
            elif _get_at(current + 1) == 'N':
1375
                if (current == 1) and _is_vowel(0) and not _slavo_germanic():
1376
                    (primary, secondary) = _metaph_add('KN', 'N')
1377
                # not e.g. 'cagney'
1378
                elif (not _string_at((current + 2), 2, {'EY'}) and
1379
                      (_get_at(current + 1) != 'Y') and
1380
                      not _slavo_germanic()):
1381
                    (primary, secondary) = _metaph_add('N', 'KN')
1382
                else:
1383
                    (primary, secondary) = _metaph_add('KN')
1384
                current += 2
1385
                continue
1386
1387
            # 'tagliaro'
1388
            elif (_string_at((current + 1), 2, {'LI'}) and
1389
                  not _slavo_germanic()):
1390
                (primary, secondary) = _metaph_add('KL', 'L')
1391
                current += 2
1392
                continue
1393
1394
            # -ges-, -gep-, -gel-, -gie- at beginning
1395
            elif ((current == 0) and
1396
                  ((_get_at(current + 1) == 'Y') or
1397
                   _string_at((current + 1), 2, {'ES', 'EP', 'EB', 'EL', 'EY',
1398
                                                 'IB', 'IL', 'IN', 'IE', 'EI',
1399
                                                 'ER'}))):
1400
                (primary, secondary) = _metaph_add('K', 'J')
1401
                current += 2
1402
                continue
1403
1404
            #  -ger-,  -gy-
1405
            elif ((_string_at((current + 1), 2, {'ER'}) or
1406
                   (_get_at(current + 1) == 'Y')) and not
1407
                  _string_at(0, 6, {'DANGER', 'RANGER', 'MANGER'}) and not
1408
                  _string_at((current - 1), 1, {'E', 'I'}) and not
1409
                  _string_at((current - 1), 3, {'RGY', 'OGY'})):
1410
                (primary, secondary) = _metaph_add('K', 'J')
1411
                current += 2
1412
                continue
1413
1414
            #  italian e.g, 'biaggi'
1415
            elif (_string_at((current + 1), 1, {'E', 'I', 'Y'}) or
1416
                  _string_at((current - 1), 4, {'AGGI', 'OGGI'})):
1417
                # obvious germanic
1418
                if (((_string_at(0, 4, {'VAN ', 'VON '}) or
1419
                      _string_at(0, 3, {'SCH'})) or
1420
                     _string_at((current + 1), 2, {'ET'}))):
1421
                    (primary, secondary) = _metaph_add('K')
1422
                elif _string_at((current + 1), 4, {'IER '}):
1423
                    (primary, secondary) = _metaph_add('J')
1424
                else:
1425
                    (primary, secondary) = _metaph_add('J', 'K')
1426
                current += 2
1427
                continue
1428
1429
            else:
1430
                if _get_at(current + 1) == 'G':
1431
                    current += 2
1432
                else:
1433
                    current += 1
1434
                (primary, secondary) = _metaph_add('K')
1435
                continue
1436
1437
        elif _get_at(current) == 'H':
1438
            # only keep if first & before vowel or btw. 2 vowels
1439
            if ((((current == 0) or _is_vowel(current - 1)) and
1440
                 _is_vowel(current + 1))):
1441
                (primary, secondary) = _metaph_add('H')
1442
                current += 2
1443
            else:  # also takes care of 'HH'
1444
                current += 1
1445
            continue
1446
1447
        elif _get_at(current) == 'J':
1448
            # obvious spanish, 'jose', 'san jacinto'
1449
            if _string_at(current, 4, ['JOSE']) or _string_at(0, 4, {'SAN '}):
1450
                if ((((current == 0) and (_get_at(current + 4) == ' ')) or
1451
                     _string_at(0, 4, ['SAN ']))):
1452
                    (primary, secondary) = _metaph_add('H')
1453
                else:
1454
                    (primary, secondary) = _metaph_add('J', 'H')
1455
                current += 1
1456
                continue
1457
1458
            elif (current == 0) and not _string_at(current, 4, {'JOSE'}):
1459
                # Yankelovich/Jankelowicz
1460
                (primary, secondary) = _metaph_add('J', 'A')
1461
            # Spanish pron. of e.g. 'bajador'
1462
            elif (_is_vowel(current - 1) and
1463
                  not _slavo_germanic() and
1464
                  ((_get_at(current + 1) == 'A') or
1465
                   (_get_at(current + 1) == 'O'))):
1466
                (primary, secondary) = _metaph_add('J', 'H')
1467
            elif current == last:
1468
                (primary, secondary) = _metaph_add('J', ' ')
1469
            elif (not _string_at((current + 1), 1,
1470
                                 {'L', 'T', 'K', 'S', 'N', 'M', 'B', 'Z'}) and
1471
                  not _string_at((current - 1), 1, {'S', 'K', 'L'})):
1472
                (primary, secondary) = _metaph_add('J')
1473
1474
            if _get_at(current + 1) == 'J':  # it could happen!
1475
                current += 2
1476
            else:
1477
                current += 1
1478
            continue
1479
1480
        elif _get_at(current) == 'K':
1481
            if _get_at(current + 1) == 'K':
1482
                current += 2
1483
            else:
1484
                current += 1
1485
            (primary, secondary) = _metaph_add('K')
1486
            continue
1487
1488
        elif _get_at(current) == 'L':
1489
            if _get_at(current + 1) == 'L':
1490
                # Spanish e.g. 'cabrillo', 'gallegos'
1491
                if (((current == (length - 3)) and
1492
                     _string_at((current - 1), 4, {'ILLO', 'ILLA', 'ALLE'})) or
1493
                        ((_string_at((last - 1), 2, {'AS', 'OS'}) or
1494
                          _string_at(last, 1, {'A', 'O'})) and
1495
                         _string_at((current - 1), 4, {'ALLE'}))):
1496
                    (primary, secondary) = _metaph_add('L', ' ')
1497
                    current += 2
1498
                    continue
1499
                current += 2
1500
            else:
1501
                current += 1
1502
            (primary, secondary) = _metaph_add('L')
1503
            continue
1504
1505
        elif _get_at(current) == 'M':
1506
            if (((_string_at((current - 1), 3, {'UMB'}) and
1507
                  (((current + 1) == last) or
1508
                   _string_at((current + 2), 2, {'ER'}))) or
1509
                 # 'dumb', 'thumb'
1510
                 (_get_at(current + 1) == 'M'))):
1511
                current += 2
1512
            else:
1513
                current += 1
1514
            (primary, secondary) = _metaph_add('M')
1515
            continue
1516
1517
        elif _get_at(current) == 'N':
1518
            if _get_at(current + 1) == 'N':
1519
                current += 2
1520
            else:
1521
                current += 1
1522
            (primary, secondary) = _metaph_add('N')
1523
            continue
1524
1525
        elif _get_at(current) == 'Ñ':
1526
            current += 1
1527
            (primary, secondary) = _metaph_add('N')
1528
            continue
1529
1530
        elif _get_at(current) == 'P':
1531
            if _get_at(current + 1) == 'H':
1532
                (primary, secondary) = _metaph_add('F')
1533
                current += 2
1534
                continue
1535
1536
            # also account for "campbell", "raspberry"
1537
            elif _string_at((current + 1), 1, {'P', 'B'}):
1538
                current += 2
1539
            else:
1540
                current += 1
1541
            (primary, secondary) = _metaph_add('P')
1542
            continue
1543
1544
        elif _get_at(current) == 'Q':
1545
            if _get_at(current + 1) == 'Q':
1546
                current += 2
1547
            else:
1548
                current += 1
1549
            (primary, secondary) = _metaph_add('K')
1550
            continue
1551
1552
        elif _get_at(current) == 'R':
1553
            # french e.g. 'rogier', but exclude 'hochmeier'
1554
            if (((current == last) and
1555
                 not _slavo_germanic() and
1556
                 _string_at((current - 2), 2, {'IE'}) and
1557
                 not _string_at((current - 4), 2, {'ME', 'MA'}))):
1558
                (primary, secondary) = _metaph_add('', 'R')
1559
            else:
1560
                (primary, secondary) = _metaph_add('R')
1561
1562
            if _get_at(current + 1) == 'R':
1563
                current += 2
1564
            else:
1565
                current += 1
1566
            continue
1567
1568
        elif _get_at(current) == 'S':
1569
            # special cases 'island', 'isle', 'carlisle', 'carlysle'
1570
            if _string_at((current - 1), 3, {'ISL', 'YSL'}):
1571
                current += 1
1572
                continue
1573
1574
            # special case 'sugar-'
1575
            elif (current == 0) and _string_at(current, 5, {'SUGAR'}):
1576
                (primary, secondary) = _metaph_add('X', 'S')
1577
                current += 1
1578
                continue
1579
1580
            elif _string_at(current, 2, {'SH'}):
1581
                # Germanic
1582
                if _string_at((current + 1), 4,
1583
                              {'HEIM', 'HOEK', 'HOLM', 'HOLZ'}):
1584
                    (primary, secondary) = _metaph_add('S')
1585
                else:
1586
                    (primary, secondary) = _metaph_add('X')
1587
                current += 2
1588
                continue
1589
1590
            # Italian & Armenian
1591
            elif (_string_at(current, 3, {'SIO', 'SIA'}) or
1592
                  _string_at(current, 4, {'SIAN'})):
1593
                if not _slavo_germanic():
1594
                    (primary, secondary) = _metaph_add('S', 'X')
1595
                else:
1596
                    (primary, secondary) = _metaph_add('S')
1597
                current += 3
1598
                continue
1599
1600
            # German & anglicisations, e.g. 'smith' match 'schmidt',
1601
            #                               'snider' match 'schneider'
1602
            # also, -sz- in Slavic language although in Hungarian it is
1603
            #       pronounced 's'
1604
            elif (((current == 0) and
1605
                   _string_at((current + 1), 1, {'M', 'N', 'L', 'W'})) or
1606
                  _string_at((current + 1), 1, {'Z'})):
1607
                (primary, secondary) = _metaph_add('S', 'X')
1608
                if _string_at((current + 1), 1, {'Z'}):
1609
                    current += 2
1610
                else:
1611
                    current += 1
1612
                continue
1613
1614
            elif _string_at(current, 2, {'SC'}):
1615
                # Schlesinger's rule
1616
                if _get_at(current + 2) == 'H':
1617
                    # dutch origin, e.g. 'school', 'schooner'
1618
                    if _string_at((current + 3), 2,
1619
                                  {'OO', 'ER', 'EN', 'UY', 'ED', 'EM'}):
1620
                        # 'schermerhorn', 'schenker'
1621
                        if _string_at((current + 3), 2, {'ER', 'EN'}):
1622
                            (primary, secondary) = _metaph_add('X', 'SK')
1623
                        else:
1624
                            (primary, secondary) = _metaph_add('SK')
1625
                        current += 3
1626
                        continue
1627
                    else:
1628
                        if (((current == 0) and not _is_vowel(3) and
1629
                             (_get_at(3) != 'W'))):
1630
                            (primary, secondary) = _metaph_add('X', 'S')
1631
                        else:
1632
                            (primary, secondary) = _metaph_add('X')
1633
                        current += 3
1634
                        continue
1635
1636
                elif _string_at((current + 2), 1, {'I', 'E', 'Y'}):
1637
                    (primary, secondary) = _metaph_add('S')
1638
                    current += 3
1639
                    continue
1640
1641
                # else
1642
                else:
1643
                    (primary, secondary) = _metaph_add('SK')
1644
                    current += 3
1645
                    continue
1646
1647
            else:
1648
                # french e.g. 'resnais', 'artois'
1649
                if (current == last) and _string_at((current - 2), 2,
1650
                                                    {'AI', 'OI'}):
1651
                    (primary, secondary) = _metaph_add('', 'S')
1652
                else:
1653
                    (primary, secondary) = _metaph_add('S')
1654
1655
                if _string_at((current + 1), 1, {'S', 'Z'}):
1656
                    current += 2
1657
                else:
1658
                    current += 1
1659
                continue
1660
1661
        elif _get_at(current) == 'T':
1662
            if _string_at(current, 4, {'TION'}):
1663
                (primary, secondary) = _metaph_add('X')
1664
                current += 3
1665
                continue
1666
1667
            elif _string_at(current, 3, {'TIA', 'TCH'}):
1668
                (primary, secondary) = _metaph_add('X')
1669
                current += 3
1670
                continue
1671
1672
            elif (_string_at(current, 2, {'TH'}) or
1673
                  _string_at(current, 3, {'TTH'})):
1674
                # special case 'thomas', 'thames' or germanic
1675
                if ((_string_at((current + 2), 2, {'OM', 'AM'}) or
1676
                     _string_at(0, 4, {'VAN ', 'VON '}) or
1677
                     _string_at(0, 3, {'SCH'}))):
1678
                    (primary, secondary) = _metaph_add('T')
1679
                else:
1680
                    (primary, secondary) = _metaph_add('0', 'T')
1681
                current += 2
1682
                continue
1683
1684
            elif _string_at((current + 1), 1, {'T', 'D'}):
1685
                current += 2
1686
            else:
1687
                current += 1
1688
            (primary, secondary) = _metaph_add('T')
1689
            continue
1690
1691
        elif _get_at(current) == 'V':
1692
            if _get_at(current + 1) == 'V':
1693
                current += 2
1694
            else:
1695
                current += 1
1696
            (primary, secondary) = _metaph_add('F')
1697
            continue
1698
1699
        elif _get_at(current) == 'W':
1700
            # can also be in middle of word
1701
            if _string_at(current, 2, {'WR'}):
1702
                (primary, secondary) = _metaph_add('R')
1703
                current += 2
1704
                continue
1705
            elif ((current == 0) and
1706
                  (_is_vowel(current + 1) or _string_at(current, 2, {'WH'}))):
1707
                # Wasserman should match Vasserman
1708
                if _is_vowel(current + 1):
1709
                    (primary, secondary) = _metaph_add('A', 'F')
1710
                else:
1711
                    # need Uomo to match Womo
1712
                    (primary, secondary) = _metaph_add('A')
1713
1714
            # Arnow should match Arnoff
1715
            if ((((current == last) and _is_vowel(current - 1)) or
1716
                 _string_at((current - 1), 5,
1717
                            {'EWSKI', 'EWSKY', 'OWSKI', 'OWSKY'}) or
1718
                 _string_at(0, 3, ['SCH']))):
1719
                (primary, secondary) = _metaph_add('', 'F')
1720
                current += 1
1721
                continue
1722
            # Polish e.g. 'filipowicz'
1723
            elif _string_at(current, 4, {'WICZ', 'WITZ'}):
1724
                (primary, secondary) = _metaph_add('TS', 'FX')
1725
                current += 4
1726
                continue
1727
            # else skip it
1728
            else:
1729
                current += 1
1730
                continue
1731
1732
        elif _get_at(current) == 'X':
1733
            # French e.g. breaux
1734
            if (not ((current == last) and
1735
                     (_string_at((current - 3), 3, {'IAU', 'EAU'}) or
1736
                      _string_at((current - 2), 2, {'AU', 'OU'})))):
1737
                (primary, secondary) = _metaph_add('KS')
1738
1739
            if _string_at((current + 1), 1, {'C', 'X'}):
1740
                current += 2
1741
            else:
1742
                current += 1
1743
            continue
1744
1745
        elif _get_at(current) == 'Z':
1746
            # Chinese Pinyin e.g. 'zhao'
1747
            if _get_at(current + 1) == 'H':
1748
                (primary, secondary) = _metaph_add('J')
1749
                current += 2
1750
                continue
1751
            elif (_string_at((current + 1), 2, {'ZO', 'ZI', 'ZA'}) or
1752
                  (_slavo_germanic() and ((current > 0) and
1753
                                          _get_at(current - 1) != 'T'))):
1754
                (primary, secondary) = _metaph_add('S', 'TS')
1755
            else:
1756
                (primary, secondary) = _metaph_add('S')
1757
1758
            if _get_at(current + 1) == 'Z':
1759
                current += 2
1760
            else:
1761
                current += 1
1762
            continue
1763
1764
        else:
1765
            current += 1
1766
1767
    if maxlength and maxlength < _INFINITY:
1768
        primary = primary[:maxlength]
1769
        secondary = secondary[:maxlength]
1770
    if primary == secondary:
1771
        secondary = ''
1772
1773
    return (primary, secondary)
1774
1775
1776
def caverphone(word, version=2):
    """Return the Caverphone code for a word.

    A description of version 1 of the algorithm can be found at:
    http://caversham.otago.ac.nz/files/working/ctp060902.pdf

    A description of version 2 of the algorithm can be found at:
    http://caversham.otago.ac.nz/files/working/ctp150804.pdf

    The word is lower-cased and every character other than a-z is dropped
    before encoding. Any ``version`` value other than 1 selects the
    version 2 rules.

    :param str word: the word to transform
    :param int version: the version of Caverphone to employ for encoding
        (defaults to 2)
    :returns: the Caverphone value, right-padded with '1' to 6 characters
        for version 1 and to 10 characters otherwise
    :rtype: str

    >>> caverphone('Christopher')
    'KRSTFA1111'
    >>> caverphone('Niall')
    'NA11111111'
    >>> caverphone('Smith')
    'SMT1111111'
    >>> caverphone('Schmidt')
    'SKMT111111'

    >>> caverphone('Christopher', 1)
    'KRSTF1'
    >>> caverphone('Niall', 1)
    'N11111'
    >>> caverphone('Smith', 1)
    'SMT111'
    >>> caverphone('Schmidt', 1)
    'SKMT11'
    """
    _vowels = {'a', 'e', 'i', 'o', 'u'}
    _letters = set('abcdefghijklmnopqrstuvwxyz')

    def _squeeze_replace(word, char, new_char):
        """Convert strings of char in word to one instance of new_char."""
        while char * 2 in word:
            word = word.replace(char * 2, char)
        return word.replace(char, new_char)

    def _replace_all(word, pairs):
        """Apply each (src, tar) replacement to word, in the given order."""
        for src, tar in pairs:
            word = word.replace(src, tar)
        return word

    is_v1 = version == 1

    word = ''.join(c for c in word.lower() if c in _letters)

    # the main replacement algorithm
    # In the working alphabet '2' and '3' are placeholders (3 stands in for
    # a vowel); both are deleted near the end. Upper-case letters are final
    # code symbols that later rules can no longer match.
    if not is_v1 and word[-1:] == 'e':
        word = word[:-1]
    if word:
        # -ough prefixes: the 'gh' becomes '2f'. The prefixes are mutually
        # exclusive, so at most one of them applies.
        ough_prefixes = ('cough', 'rough', 'tough', 'enough')
        if not is_v1:
            ough_prefixes += ('trough',)
        for prefix in ough_prefixes:
            if word.startswith(prefix):
                word = prefix[:-2] + '2f' + word[len(prefix):]
                break
        if word[:2] == 'gn':
            word = '2n' + word[2:]
        if word[-2:] == 'mb':
            word = word[:-1] + '2'

        # Order matters: multi-letter rules must run before the
        # single-letter rules that would otherwise consume their letters.
        word = _replace_all(word, (
            ('cq', '2q'), ('ci', 'si'), ('ce', 'se'), ('cy', 'sy'),
            ('tch', '2ch'), ('c', 'k'), ('q', 'k'), ('x', 'k'), ('v', 'f'),
            ('dg', '2g'), ('tio', 'sio'), ('tia', 'sia'), ('d', 't'),
            ('ph', 'fh'), ('b', 'p'), ('sh', 's2'), ('z', 's'),
        ))

        if word[0] in _vowels:
            word = 'A' + word[1:]
        word = _replace_all(word, ((vowel, '3') for vowel in 'aeiou'))

        if not is_v1:
            word = word.replace('j', 'y')
            if word[:2] == 'y3':
                word = 'Y3' + word[2:]
            if word[:1] == 'y':
                word = 'A' + word[1:]
            word = word.replace('y', '3')

        word = _replace_all(word, (('3gh3', '3kh3'), ('gh', '22'),
                                   ('g', 'k')))

        # Collapse runs of each consonant into one upper-case code letter.
        for char in 'stpkfmn':
            word = _squeeze_replace(word, char, char.upper())

        word = word.replace('w3', 'W3')
        if is_v1:
            word = word.replace('wy', 'Wy')
        word = word.replace('wh3', 'Wh3')
        if is_v1:
            word = word.replace('why', 'Why')
        elif word[-1:] == 'w':
            word = word[:-1] + '3'
        word = word.replace('w', '2')

        if word[:1] == 'h':
            word = 'A' + word[1:]
        word = word.replace('h', '2')

        word = word.replace('r3', 'R3')
        if is_v1:
            word = word.replace('ry', 'Ry')
        elif word[-1:] == 'r':
            word = word[:-1] + '3'
        word = word.replace('r', '2')

        word = word.replace('l3', 'L3')
        if is_v1:
            word = word.replace('ly', 'Ly')
        elif word[-1:] == 'l':
            word = word[:-1] + '3'
        word = word.replace('l', '2')

        if is_v1:
            word = _replace_all(word, (('j', 'y'), ('y3', 'Y3'), ('y', '2')))

        word = word.replace('2', '')
        if not is_v1 and word[-1:] == '3':
            word = word[:-1] + 'A'
        word = word.replace('3', '')

    # pad with 1s, then extract the necessary length of code
    word += '1' * 10
    return word[:6] if is_v1 else word[:10]
1925
1926
1927
def alpha_sis(word, maxlength=14):
    """Return the IBM Alpha Search Inquiry System code for a word.

    Based on the algorithm described in "Accessing individual records from
    personal data files using non-unique identifiers" / Gwendolyn B. Moore,
    et al.; prepared for the Institute for Computer Sciences and Technology,
    National Bureau of Standards, Washington, D.C (1977):
    https://archive.org/stream/accessingindivid00moor#page/15/mode/1up

    A collection is necessary since there can be multiple values for a
    single word. But the collection must be ordered since the first value
    is the primary coding.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 14);
        values are clamped to the range [4, 64] and None is treated as 64
    :returns: the Alpha SIS value
    :rtype: tuple

    >>> alpha_sis('Christopher')
    ('06401840000000', '07040184000000', '04018400000000')
    >>> alpha_sis('Niall')
    ('02500000000000',)
    >>> alpha_sis('Smith')
    ('03100000000000',)
    >>> alpha_sis('Schmidt')
    ('06310000000000',)
    """
    _alpha_sis_initials = {'GF': '08', 'GM': '03', 'GN': '02', 'KN': '02',
                           'PF': '08', 'PN': '02', 'PS': '00', 'WR': '04',
                           'A': '1', 'E': '1', 'H': '2', 'I': '1', 'J': '3',
                           'O': '1', 'U': '1', 'W': '4', 'Y': '5'}
    _alpha_sis_initials_order = ('GF', 'GM', 'GN', 'KN', 'PF', 'PN', 'PS',
                                 'WR', 'A', 'E', 'H', 'I', 'J', 'O', 'U', 'W',
                                 'Y')
    # A tuple value means the key is ambiguous: every listed code spawns
    # its own variant of the result.
    _alpha_sis_basic = {'SCH': '6', 'CZ': ('70', '6', '0'),
                        'CH': ('6', '70', '0'), 'CK': ('7', '6'),
                        'DS': ('0', '10'), 'DZ': ('0', '10'),
                        'TS': ('0', '10'), 'TZ': ('0', '10'), 'CI': '0',
                        'CY': '0', 'CE': '0', 'SH': '6', 'DG': '7', 'PH': '8',
                        'C': ('7', '6'), 'K': ('7', '6'), 'Z': '0', 'S': '0',
                        'D': '1', 'T': '1', 'N': '2', 'M': '3', 'R': '4',
                        'L': '5', 'J': '6', 'G': '7', 'Q': '7', 'X': '7',
                        'F': '8', 'V': '8', 'B': '9', 'P': '9'}
    # Longest keys first so multi-letter patterns win over single letters.
    _alpha_sis_basic_order = ('SCH', 'CZ', 'CH', 'CK', 'DS', 'DZ', 'TS', 'TZ',
                              'CI', 'CY', 'CE', 'SH', 'DG', 'PH', 'C', 'K',
                              'Z', 'S', 'D', 'T', 'N', 'M', 'R', 'L', 'J',
                              'G', 'Q', 'X', 'F', 'V', 'B', 'P')
    _letters = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    def _trim_doublets(code):
        """Drop a character that repeats its predecessor.

        NOTE(review): the position advances even after a deletion, so a
        run of three or more identical characters is not collapsed to a
        single one. Kept as-is to preserve existing codes; confirm against
        the reference description before changing.
        """
        pos = 1
        while pos < len(code):
            if code[pos] == code[pos - 1]:
                code = code[:pos] + code[pos + 1:]
            pos += 1
        return code

    alpha = ['']
    pos = 0
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in _letters)

    # Clamp maxlength to [4, 64]
    if maxlength is not None:
        maxlength = min(max(4, maxlength), 64)
    else:
        maxlength = 64

    # Do special processing for initial substrings
    for k in _alpha_sis_initials_order:
        if word.startswith(k):
            alpha[0] += _alpha_sis_initials[k]
            pos += len(k)
            break

    # Add a '0' if alpha is still empty
    if not alpha[0]:
        alpha[0] += '0'

    # Whether or not any special initial codes were encoded, iterate
    # through the length of the word in the main encoding loop
    while pos < len(word):
        for k in _alpha_sis_basic_order:
            if word.startswith(k, pos):
                codes = _alpha_sis_basic[k]
                if not isinstance(codes, tuple):
                    codes = (codes,)
                # All variants for the first code, then all variants for
                # the second, etc.; this keeps the primary coding first.
                alpha = [prefix + code
                         for code in codes for prefix in alpha]
                pos += len(k)
                break
        else:
            # Uncoded letter: leave a placeholder so that equal codes on
            # either side of it are not mistaken for a doublet.
            alpha = [prefix + '_' for prefix in alpha]
            pos += 1

    # Trim doublets and placeholders
    alpha = (_trim_doublets(code).replace('_', '') for code in alpha)

    # Trim codes and return tuple
    return tuple((code + '0' * maxlength)[:maxlength] for code in alpha)
2032
2033
2034
def fuzzy_soundex(word, maxlength=5, zero_pad=True):
    """Return the Fuzzy Soundex code for a word.

    Fuzzy Soundex is an algorithm derived from Soundex, defined in:
    Holmes, David and M. Catherine McCabe. "Improving Precision and Recall for
    Soundex Retrieval."
    http://wayback.archive.org/web/20100629121128/http://www.ir.iit.edu/publications/downloads/IEEESoundexV5.pdf

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 5);
        values are clamped to the range [4, 64] and None is treated as 64
    :param bool zero_pad: pad the end of the return value with 0s to achieve
        a maxlength string
    :returns: the Fuzzy Soundex value
    :rtype: str

    >>> fuzzy_soundex('Christopher')
    'K6931'
    >>> fuzzy_soundex('Niall')
    'N4000'
    >>> fuzzy_soundex('Smith')
    'S5300'
    """
    # Letter -> code table; letters coded '-' (H, W, Y) are deleted from
    # the code outright, letters coded '0' are deleted only at the end.
    _fuzzy_soundex_translation = {
        ord(letter): code for letter, code in
        zip('ABCDEFGHIJKLMNOPQRSTUVWXYZ', '0193017-07745501769301-7-9')}
    # Two-letter word-initial substitutions
    _initial_substitutions = {'CS': 'SS', 'CZ': 'SS', 'TS': 'SS', 'TZ': 'SS',
                              'GN': 'NN', 'HR': 'RR', 'WR': 'RR', 'HW': 'WW',
                              'KN': 'NN', 'NG': 'NN'}
    # Substitutions applied anywhere in the word, in this order
    _substitutions = (('CA', 'KA'), ('CC', 'KK'), ('CK', 'KK'), ('CE', 'SE'),
                      ('CHL', 'KL'), ('CL', 'KL'), ('CHR', 'KR'),
                      ('CR', 'KR'), ('CI', 'SI'), ('CO', 'KO'), ('CU', 'KU'),
                      ('CY', 'SY'), ('DG', 'GG'), ('GH', 'HH'), ('MAC', 'MK'),
                      ('MC', 'MK'), ('NST', 'NSS'), ('PF', 'FF'),
                      ('PH', 'FF'), ('SCH', 'SSS'), ('TIO', 'SIO'),
                      ('TIA', 'SIO'), ('TCH', 'CHH'))

    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    # Clamp maxlength to [4, 64]
    if maxlength is not None:
        maxlength = min(max(4, maxlength), 64)
    else:
        maxlength = 64

    if not word:
        if zero_pad:
            return '0' * maxlength
        return '0'

    if word[:2] in _initial_substitutions:
        word = _initial_substitutions[word[:2]] + word[2:]

    # Word-final substitutions (at most one applies)
    if word[-2:] == 'CH':
        word = word[:-2] + 'KK'
    elif word[-2:] == 'NT':
        word = word[:-2] + 'TT'
    elif word[-2:] == 'RT':
        word = word[:-2] + 'RR'
    elif word[-3:] == 'RDT':
        word = word[:-3] + 'RR'

    for src, tar in _substitutions:
        word = word.replace(src, tar)

    sdx = word.translate(_fuzzy_soundex_translation)
    sdx = sdx.replace('-', '')

    # remove repeating characters
    sdx = _delete_consecutive_repeats(sdx)

    # The code starts with the first letter of the (substituted) word.
    # H, W and Y produced no code character above, so the letter is
    # prepended; otherwise it replaces the first code character.
    if word[0] in {'H', 'W', 'Y'}:
        sdx = word[0] + sdx
    else:
        sdx = word[0] + sdx[1:]

    sdx = sdx.replace('0', '')

    if zero_pad:
        sdx += '0' * maxlength

    return sdx[:maxlength]
2137
2138
2139
def phonex(word, maxlength=4, zero_pad=True):
    """Return the Phonex code for a word.

    Phonex is an algorithm derived from Soundex, defined in:
    Lait, A. J. and B. Randell. "An Assessment of Name Matching Algorithms".
    http://homepages.cs.ncl.ac.uk/brian.randell/Genealogy/NameMatching.pdf

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 4);
        values are clamped to the range [4, 64] and None is treated as 64
    :param bool zero_pad: pad the end of the return value with 0s to achieve
        a maxlength string
    :returns: the Phonex value
    :rtype: str

    >>> phonex('Christopher')
    'C623'
    >>> phonex('Niall')
    'N400'
    >>> phonex('Schmidt')
    'S253'
    >>> phonex('Smith')
    'S530'
    """
    _vowels_and_y = {'A', 'E', 'I', 'O', 'U', 'Y'}
    # Phonetic equivalents of the first character; letters not listed
    # here are kept unchanged.
    _first_letter = {'A': 'A', 'E': 'A', 'I': 'A', 'O': 'A', 'U': 'A',
                     'Y': 'A', 'B': 'B', 'P': 'B', 'V': 'F', 'F': 'F',
                     'C': 'C', 'K': 'C', 'Q': 'C', 'G': 'G', 'J': 'G',
                     'S': 'S', 'Z': 'S'}

    name = unicodedata.normalize('NFKD', text_type(word.upper()))
    name = name.replace('ß', 'SS')

    # Clamp maxlength to [4, 64]
    if maxlength is not None:
        maxlength = min(max(4, maxlength), 64)
    else:
        maxlength = 64

    name_code = last = ''

    # Remove any trailing Ss
    name = name.rstrip('S')

    # Phonetic equivalents of first 2 characters
    if name[:2] == 'KN':
        name = 'N' + name[2:]  # KN.. == N..
    elif name[:2] == 'PH':
        name = 'F' + name[2:]  # PH.. == F.. (H ignored anyway)
    elif name[:2] == 'WR':
        name = 'R' + name[2:]  # WR.. == R..

    # Special case, ignore H first letter (subsequent Hs ignored anyway)
    if name[:1] == 'H':
        name = name[1:]

    if name:
        name = _first_letter.get(name[0], name[0]) + name[1:]
        name_code = last = name[0]

    # MODIFIED SOUNDEX CODE
    for i in range(1, len(name)):
        following = name[i+1:i+2]
        is_final = i + 1 == len(name)
        code = '0'
        if name[i] in {'B', 'F', 'P', 'V'}:
            code = '1'
        elif name[i] in {'C', 'G', 'J', 'K', 'Q', 'S', 'X', 'Z'}:
            code = '2'
        elif name[i] in {'D', 'T'}:
            if following != 'C':
                code = '3'
        elif name[i] == 'L':
            if following in _vowels_and_y or is_final:
                code = '4'
        elif name[i] in {'M', 'N'}:
            if following in {'D', 'G'}:
                # "Delete" the D/G by overwriting it with this letter; the
                # copy is then dropped as a duplicate on the next pass.
                name = name[:i+1] + name[i] + name[i+2:]
            code = '5'
        elif name[i] == 'R':
            if following in _vowels_and_y or is_final:
                code = '6'

        if code != last and code != '0':
            name_code += code

        # Compare against the last character actually emitted, so that
        # uncoded letters do not break up a run of equal codes.
        last = name_code[-1]

    if zero_pad:
        name_code += '0' * maxlength
    if not name_code:
        name_code = '0'
    return name_code[:maxlength]
2246
2247
2248
def phonem(word):
    """Return the Phonem code for a word.

    Phonem is defined in:
    Wilde, Georg and Carsten Meyer. 1988. "Nicht wörtlich genommen,
    'Schreibweisentolerante' Suchroutine in dBASE implementiert." c't Magazin
    für Computer Technik. Oct. 1988. 126--131.

    This version is based on the Perl implementation documented at:
    http://ifl.phil-fak.uni-koeln.de/sites/linguistik/Phonetik/import/Phonetik_Files/Allgemeine_Dateien/Martin_Wilz.pdf
    It includes some enhancements presented in the Java port at:
    https://github.com/dcm4che/dcm4che/blob/master/dcm4che-soundex/src/main/java/org/dcm4che3/soundex/Phonem.java

    Phonem is intended chiefly for German names/words.

    :param str word: the word to transform
    :returns: the Phonem value
    :rtype: str

    >>> phonem('Christopher')
    'CRYSDOVR'
    >>> phonem('Niall')
    'NYAL'
    >>> phonem('Smith')
    'SMYD'
    >>> phonem('Schmidt')
    'CMYD'
    """
    # Multi-character substitutions, applied in this order. '§' is a
    # placeholder that the translation table below maps to 'U'.
    _phonem_substitutions = (('SC', 'C'), ('SZ', 'C'), ('CZ', 'C'),
                             ('TZ', 'C'), ('TS', 'C'), ('KS', 'X'),
                             ('PF', 'V'), ('QU', 'KW'), ('PH', 'V'),
                             ('UE', 'Y'), ('AE', 'E'), ('OE', 'Ö'),
                             ('EI', 'AY'), ('EY', 'AY'), ('EU', 'OY'),
                             ('AU', 'A§'), ('OU', '§'))
    # Single-character translation: code point -> replacement character
    _phonem_translation = {
        ord(src): tar for src, tar in
        zip('ZKGQÇÑßFWPTÁÀÂÃÅÄÆÉÈÊËIJÌÍÎÏÜݧÚÙÛÔÒÓÕØ',
            'CCCCCNSVVBDAAAAAEEEEEEYYYYYYYYUUUUOOOOÖ')}
    # Only these characters may appear in the final code
    _phonem_alphabet = {'A', 'B', 'C', 'D', 'L', 'M', 'N', 'O', 'R', 'S',
                        'U', 'V', 'W', 'X', 'Y', 'Ö'}

    # NFC keeps accented letters as single code points so that the
    # translation table can match them.
    word = unicodedata.normalize('NFC', text_type(word.upper()))
    for src, tar in _phonem_substitutions:
        word = word.replace(src, tar)
    word = word.translate(_phonem_translation)

    return ''.join(c for c in _delete_consecutive_repeats(word)
                   if c in _phonem_alphabet)
2294
2295
2296
def phonix(word, maxlength=4, zero_pad=True):
    """Return the Phonix code for a word.

    Phonix is a Soundex-like algorithm defined in:
    T.N. Gadd: PHONIX --- The Algorithm, Program 24/4, 1990, p.363-366.

    This implementation is based on
    http://cpansearch.perl.org/src/ULPFR/WAIT-1.800/soundex.c
    http://cs.anu.edu.au/people/Peter.Christen/Febrl/febrl-0.4.01/encode.py
    and
    https://metacpan.org/pod/Text::Phonetic::Phonix

    The word is upper-cased, reduced to the letters A-Z, rewritten by an
    ordered list of spelling substitutions, and then every letter after the
    first is mapped to a digit. A leading vowel (or Y) is coded as 'v'.

    A substitution that would erase the whole word is ignored, and
    one- and two-letter words have no "middle" for the context-free
    middle-of-word rule to act on, so a single letter such as 'B' codes as
    'B000'.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 4);
        values are clamped to the range [4, 64] and None means 64
    :param bool zero_pad: pad the end of the return value with 0s to achieve
        a maxlength string
    :returns: the Phonix value
    :rtype: str

    >>> phonix('Christopher')
    'K683'
    >>> phonix('Niall')
    'N400'
    >>> phonix('Smith')
    'S530'
    >>> phonix('Schmidt')
    'S530'
    """
    # pylint: disable=too-many-branches
    def _start_repl(word, src, tar, post=None):
        r"""Replace src with tar at the start of word.

        If post is given, src must be followed by one of its members.
        """
        if post:
            for i in post:
                if word.startswith(src+i):
                    return tar + word[len(src):]
        elif word.startswith(src):
            return tar + word[len(src):]
        return word

    def _end_repl(word, src, tar, pre=None):
        r"""Replace src with tar at the end of word.

        If pre is given, src must be preceded by one of its members.
        """
        if pre:
            for i in pre:
                if word.endswith(i+src):
                    return word[:-len(src)] + tar
        elif word.endswith(src):
            return word[:-len(src)] + tar
        return word

    def _mid_repl(word, src, tar, pre=None, post=None):
        r"""Replace src with tar in the middle of word.

        With no context, the first and last characters are protected from
        replacement. A required pre (post) context already keeps a match
        away from the start (end) of the word, so only the other end is
        protected explicitly.
        """
        if pre or post:
            if not pre:
                return word[0] + _all_repl(word[1:], src, tar, pre, post)
            elif not post:
                return _all_repl(word[:-1], src, tar, pre, post) + word[-1]
            return _all_repl(word, src, tar, pre, post)
        if len(word) < 3:
            # There is no interior to rewrite. Without this guard a
            # one-letter word would come back doubled, because word[0] and
            # word[-1] are the same character.
            return word
        return (word[0] + _all_repl(word[1:-1], src, tar, pre, post) +
                word[-1])

    def _all_repl(word, src, tar, pre=None, post=None):
        r"""Replace src with tar anywhere in word.

        If pre and/or post are given, src is only replaced where it is
        preceded by a member of pre and/or followed by a member of post.
        """
        if pre or post:
            # A missing context is modelled as the empty string so that the
            # pairwise loop below still runs.
            post = post or frozenset(('',))
            pre = pre or frozenset(('',))

            for i, j in ((i, j) for i in pre for j in post):
                word = word.replace(i+src+j, i+tar+j)
            return word
        return word.replace(src, tar)

    _vow = {'A', 'E', 'I', 'O', 'U'}
    _con = {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q',
            'R', 'S', 'T', 'V', 'W', 'X', 'Y', 'Z'}

    # Ordered rule table: (function, src, tar[, context...]). The order is
    # significant; repeated entries are retained as-is.
    _phonix_substitutions = ((_all_repl, 'DG', 'G'),
                             (_all_repl, 'CO', 'KO'),
                             (_all_repl, 'CA', 'KA'),
                             (_all_repl, 'CU', 'KU'),
                             (_all_repl, 'CY', 'SI'),
                             (_all_repl, 'CI', 'SI'),
                             (_all_repl, 'CE', 'SE'),
                             (_start_repl, 'CL', 'KL', _vow),
                             (_all_repl, 'CK', 'K'),
                             (_end_repl, 'GC', 'K'),
                             (_end_repl, 'JC', 'K'),
                             (_start_repl, 'CHR', 'KR', _vow),
                             (_start_repl, 'CR', 'KR', _vow),
                             (_start_repl, 'WR', 'R'),
                             (_all_repl, 'NC', 'NK'),
                             (_all_repl, 'CT', 'KT'),
                             (_all_repl, 'PH', 'F'),
                             (_all_repl, 'AA', 'AR'),
                             (_all_repl, 'SCH', 'SH'),
                             (_all_repl, 'BTL', 'TL'),
                             (_all_repl, 'GHT', 'T'),
                             (_all_repl, 'AUGH', 'ARF'),
                             (_mid_repl, 'LJ', 'LD', _vow, _vow),
                             (_all_repl, 'LOUGH', 'LOW'),
                             (_start_repl, 'Q', 'KW'),
                             (_start_repl, 'KN', 'N'),
                             (_end_repl, 'GN', 'N'),
                             (_all_repl, 'GHN', 'N'),
                             (_end_repl, 'GNE', 'N'),
                             (_all_repl, 'GHNE', 'NE'),
                             (_end_repl, 'GNES', 'NS'),
                             (_start_repl, 'GN', 'N'),
                             (_mid_repl, 'GN', 'N', None, _con),
                             (_end_repl, 'GN', 'N'),
                             (_start_repl, 'PS', 'S'),
                             (_start_repl, 'PT', 'T'),
                             (_start_repl, 'CZ', 'C'),
                             (_mid_repl, 'WZ', 'Z', _vow),
                             (_mid_repl, 'CZ', 'CH'),
                             (_all_repl, 'LZ', 'LSH'),
                             (_all_repl, 'RZ', 'RSH'),
                             (_mid_repl, 'Z', 'S', None, _vow),
                             (_all_repl, 'ZZ', 'TS'),
                             (_mid_repl, 'Z', 'TS', _con),
                             (_all_repl, 'HROUG', 'REW'),
                             (_all_repl, 'OUGH', 'OF'),
                             (_mid_repl, 'Q', 'KW', _vow, _vow),
                             (_mid_repl, 'J', 'Y', _vow, _vow),
                             (_start_repl, 'YJ', 'Y', _vow),
                             (_start_repl, 'GH', 'G'),
                             (_end_repl, 'GH', 'E', _vow),
                             (_start_repl, 'CY', 'S'),
                             (_all_repl, 'NX', 'NKS'),
                             (_start_repl, 'PF', 'F'),
                             (_end_repl, 'DT', 'T'),
                             (_end_repl, 'TL', 'TIL'),
                             (_end_repl, 'DL', 'DIL'),
                             (_all_repl, 'YTH', 'ITH'),
                             (_start_repl, 'TJ', 'CH', _vow),
                             (_start_repl, 'TSJ', 'CH', _vow),
                             (_start_repl, 'TS', 'T', _vow),
                             (_all_repl, 'TCH', 'CH'),
                             (_mid_repl, 'WSK', 'VSKIE', _vow),
                             (_end_repl, 'WSK', 'VSKIE', _vow),
                             (_start_repl, 'MN', 'N', _vow),
                             (_start_repl, 'PN', 'N', _vow),
                             (_mid_repl, 'STL', 'SL', _vow),
                             (_end_repl, 'STL', 'SL', _vow),
                             (_end_repl, 'TNT', 'ENT'),
                             (_end_repl, 'EAUX', 'OH'),
                             (_all_repl, 'EXCI', 'ECS'),
                             (_all_repl, 'X', 'ECS'),
                             (_end_repl, 'NED', 'ND'),
                             (_all_repl, 'JR', 'DR'),
                             (_end_repl, 'EE', 'EA'),
                             (_all_repl, 'ZS', 'S'),
                             (_mid_repl, 'R', 'AH', _vow, _con),
                             (_end_repl, 'R', 'AH', _vow),
                             (_mid_repl, 'HR', 'AH', _vow, _con),
                             (_end_repl, 'HR', 'AH', _vow),
                             (_end_repl, 'HR', 'AH', _vow),
                             (_end_repl, 'RE', 'AR'),
                             (_end_repl, 'R', 'AH', _vow),
                             (_all_repl, 'LLE', 'LE'),
                             (_end_repl, 'LE', 'ILE', _con),
                             (_end_repl, 'LES', 'ILES', _con),
                             (_end_repl, 'E', ''),
                             (_end_repl, 'ES', 'S'),
                             (_end_repl, 'SS', 'AS', _vow),
                             (_end_repl, 'MB', 'M', _vow),
                             (_all_repl, 'MPTS', 'MPS'),
                             (_all_repl, 'MPS', 'MS'),
                             (_all_repl, 'MPT', 'MT'))

    # Letter -> digit map applied to everything after the first character;
    # '0' marks letters that are dropped from the code.
    _phonix_translation = dict(zip((ord(_) for _ in
                                    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                   '01230720022455012683070808'))

    sdx = ''

    # NFKD splits accented letters into base letter + combining mark; the
    # marks are then discarded by the A-Z filter below.
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})
    if word:
        for trans in _phonix_substitutions:
            replaced = trans[0](word, *trans[1:])
            # Never let a rule erase the whole word (the final-E rule would
            # empty the word 'E'); the helpers and the code below index
            # word[0] and rely on a non-empty string.
            if replaced:
                word = replaced
        if word[0] in {'A', 'E', 'I', 'O', 'U', 'Y'}:
            sdx = 'v' + word[1:].translate(_phonix_translation)
        else:
            sdx = word[0] + word[1:].translate(_phonix_translation)
        sdx = _delete_consecutive_repeats(sdx)
        sdx = sdx.replace('0', '')

    # Clamp maxlength to [4, 64]
    if maxlength is not None:
        maxlength = min(max(4, maxlength), 64)
    else:
        maxlength = 64

    if zero_pad:
        sdx += '0' * maxlength
    if not sdx:
        sdx = '0'
    return sdx[:maxlength]
2505
2506
2507
def sfinxbis(word, maxlength=None):
    """Return the SfinxBis code for a word.

    SfinxBis is a Soundex-like algorithm defined in:
    http://www.swami.se/download/18.248ad5af12aa8136533800091/SfinxBis.pdf

    This implementation follows the reference implementation:
    http://www.swami.se/download/18.248ad5af12aa8136533800093/swamiSfinxBis.java.txt

    SfinxBis is intended chiefly for Swedish names.

    The input is split on whitespace and hyphens, and one code is produced
    per remaining token, so the result is a tuple. Vowel and consonant
    contexts are applied in a fixed order so that the result for a given
    input is the same on every run.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to
        unlimited)
    :returns: the SfinxBis value
    :rtype: tuple

    >>> sfinxbis('Christopher')
    ('K68376',)
    >>> sfinxbis('Niall')
    ('N4',)
    >>> sfinxbis('Smith')
    ('S53',)
    >>> sfinxbis('Schmidt')
    ('S53',)

    >>> sfinxbis('Johansson')
    ('J585',)
    >>> sfinxbis('Sjöberg')
    ('#162',)
    """
    # Nobility particles removed in step 2 (longest alternatives first).
    adelstitler = (' DE LA ', ' DE LAS ', ' DE LOS ', ' VAN DE ', ' VAN DEN ',
                   ' VAN DER ', ' VON DEM ', ' VON DER ',
                   ' AF ', ' AV ', ' DA ', ' DE ', ' DEL ', ' DEN ', ' DES ',
                   ' DI ', ' DO ', ' DON ', ' DOS ', ' DU ', ' E ', ' IN ',
                   ' LA ', ' LE ', ' MAC ', ' MC ', ' VAN ', ' VON ', ' Y ',
                   ' S:T ')

    # These are tuples rather than sets on purpose: they are iterated while
    # performing order-sensitive replacements, and set iteration order for
    # strings is not stable between interpreter runs.
    _harde_vokaler = ('A', 'O', 'U', 'Å')  # hard vowels
    _mjuka_vokaler = ('E', 'I', 'Y', 'Ä', 'Ö')  # soft vowels
    _vokaler = frozenset(_harde_vokaler + _mjuka_vokaler)
    _konsonanter = ('B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N',
                    'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Z')
    _alfabet = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                'Y', 'Z', 'Ä', 'Å', 'Ö'}

    # Letter -> digit map for everything after the first sound; vowels map
    # to '9', which is stripped again in step 11.
    _sfinxbis_translation = dict(zip((ord(_) for _ in
                                      'BCDFGHJKLMNPQRSTVZAOUÅEIYÄÖ'),
                                     '123729224551268378999999999'))

    # Single-character normalisations applied during Swedish-ization.
    _sfinxbis_substitutions = dict(zip((ord(_) for _ in
                                        'WZÀÁÂÃÆÇÈÉÊËÌÍÎÏÑÒÓÔÕØÙÚÛÜÝ'),
                                       'VSAAAAÄCEEEEIIIINOOOOÖUUUYY'))

    def _foersvensker(ordet):
        """Return the Swedish-ized form of the word."""
        for src, tar in (('STIERN', 'STJÄRN'), ('HIE', 'HJ'),
                         ('SIÖ', 'SJÖ'), ('SCH', 'SH'), ('QU', 'KV'),
                         ('IO', 'JO'), ('PH', 'F')):
            ordet = ordet.replace(src, tar)

        # Ü/Y/I directly after a vowel becomes J. Hard vowels are handled
        # before soft ones, each group in its declared order; the order
        # matters because I and Y are themselves soft vowels.
        for vokal in _harde_vokaler + _mjuka_vokaler:
            for glid in ('Ü', 'Y', 'I'):
                ordet = ordet.replace(vokal+glid, vokal+'J')

        # Drop an H that directly precedes a consonant.
        if 'H' in ordet:
            for konsonant in _konsonanter:
                ordet = ordet.replace('H'+konsonant, konsonant)

        ordet = ordet.translate(_sfinxbis_substitutions)

        ordet = ordet.replace('Ð', 'ETH')
        ordet = ordet.replace('Þ', 'TH')
        ordet = ordet.replace('ß', 'SS')

        return ordet

    def _koda_foersta_ljudet(ordet):
        """Return the word with the first sound coded.

        Slices (not indexing) are used throughout so that an empty or very
        short word simply falls through unchanged.
        """
        if ordet[0:1] in _vokaler:
            ordet = '$' + ordet[1:]
        elif ordet[0:2] in ('DJ', 'GJ', 'HJ', 'LJ'):
            ordet = 'J' + ordet[2:]
        elif ordet[0:1] == 'G' and ordet[1:2] in _mjuka_vokaler:
            ordet = 'J' + ordet[1:]
        elif ordet[0:1] == 'Q':
            ordet = 'K' + ordet[1:]
        elif ordet[0:2] == 'CH' and ordet[2:3] in _vokaler:
            ordet = '#' + ordet[2:]
        elif ordet[0:1] == 'C' and ordet[1:2] in _harde_vokaler:
            ordet = 'K' + ordet[1:]
        elif ordet[0:1] == 'C' and ordet[1:2] in _konsonanter:
            ordet = 'K' + ordet[1:]
        elif ordet[0:1] == 'X':
            ordet = 'S' + ordet[1:]
        elif ordet[0:1] == 'C' and ordet[1:2] in _mjuka_vokaler:
            ordet = 'S' + ordet[1:]
        elif ordet[0:3] in ('SKJ', 'STJ', 'SCH'):
            ordet = '#' + ordet[3:]
        elif ordet[0:2] in ('SH', 'KJ', 'TJ', 'SJ'):
            ordet = '#' + ordet[2:]
        elif ordet[0:2] == 'SK' and ordet[2:3] in _mjuka_vokaler:
            ordet = '#' + ordet[2:]
        elif ordet[0:1] == 'K' and ordet[1:2] in _mjuka_vokaler:
            ordet = '#' + ordet[1:]
        return ordet

    # Step 1: upper-case
    word = unicodedata.normalize('NFC', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = word.replace('-', ' ')

    # Step 2: remove nobility prefixes
    for adelstitel in adelstitler:
        while adelstitel in word:
            word = word.replace(adelstitel, ' ')
        # The table entries carry a leading space; a title at the very start
        # of the input has none, so compare without it.
        if word.startswith(adelstitel[1:]):
            word = word[len(adelstitel)-1:]

    # Split word into tokens
    ordlista = word.split()

    # Step 3: remove doubled letters in each token
    ordlista = [_delete_consecutive_repeats(ordet) for ordet in ordlista]
    if not ordlista:
        return ('',)

    # Step 4: Swedish-ization
    ordlista = [_foersvensker(ordet) for ordet in ordlista]

    # Step 5: remove all characters that are not A-Ö (65-90,196,197,214)
    ordlista = [''.join(c for c in ordet if c in _alfabet)
                for ordet in ordlista]

    # Step 6: code the first sound
    ordlista = [_koda_foersta_ljudet(ordet) for ordet in ordlista]

    # Step 7: split each name into first character and remainder
    rest = [ordet[1:] for ordet in ordlista]

    # Step 8: perform the phonetic transformation on the remainder
    rest = [ordet.replace('DT', 'T') for ordet in rest]
    rest = [ordet.replace('X', 'KS') for ordet in rest]

    # Step 9: code the remainder as digits (C before a soft vowel is 8)
    for vokal in _mjuka_vokaler:
        rest = [ordet.replace('C'+vokal, '8'+vokal) for ordet in rest]
    rest = [ordet.translate(_sfinxbis_translation) for ordet in rest]

    # Step 10: remove adjacent duplicates
    rest = [_delete_consecutive_repeats(ordet) for ordet in rest]

    # Step 11: remove every "9"
    rest = [ordet.replace('9', '') for ordet in rest]

    # Step 12: put the parts back together
    ordlista = [ordet[0:1] + svans for ordet, svans in zip(ordlista, rest)]

    # truncate, if maxlength is set
    if maxlength and maxlength < _INFINITY:
        ordlista = [ordet[:maxlength] for ordet in ordlista]

    return tuple(ordlista)
2680
2681
2682
def phonet(word, mode=1, lang='de', trace=False):
2683
    """Return the phonet code for a word.
2684
2685
    phonet ("Hannoveraner Phonetik") was developed by Jörg Michael and
2686
    documented in c't magazine vol. 25/1999, p. 252. It is a phonetic
2687
    algorithm designed primarily for German.
2688
    Cf. http://www.heise.de/ct/ftp/99/25/252/
2689
2690
    This is a port of Jesper Zedlitz's code, which is licensed LGPL:
2691
    https://github.com/jze/phonet4java/blob/master/src/main/java/de/zedlitz/phonet4java/Phonet.java
2692
2693
    That is, in turn, based on Michael's C code, which is also licensed LGPL:
2694
    ftp://ftp.heise.de/pub/ct/listings/phonet.zip
2695
2696
    :param str word: the word to transform
2697
    :param int mode: the ponet variant to employ (1 or 2)
2698
    :param str lang: 'de' (default) for German
2699
            'none' for no language
2700
    :param bool trace: prints debugging info if True
2701
    :returns: the phonet value
2702
    :rtype: str
2703
2704
    >>> phonet('Christopher')
2705
    'KRISTOFA'
2706
    >>> phonet('Niall')
2707
    'NIAL'
2708
    >>> phonet('Smith')
2709
    'SMIT'
2710
    >>> phonet('Schmidt')
2711
    'SHMIT'
2712
2713
    >>> phonet('Christopher', mode=2)
2714
    'KRIZTUFA'
2715
    >>> phonet('Niall', mode=2)
2716
    'NIAL'
2717
    >>> phonet('Smith', mode=2)
2718
    'ZNIT'
2719
    >>> phonet('Schmidt', mode=2)
2720
    'ZNIT'
2721
2722
    >>> phonet('Christopher', lang='none')
2723
    'CHRISTOPHER'
2724
    >>> phonet('Niall', lang='none')
2725
    'NIAL'
2726
    >>> phonet('Smith', lang='none')
2727
    'SMITH'
2728
    >>> phonet('Schmidt', lang='none')
2729
    'SCHMIDT'
2730
    """
2731
    # pylint: disable=too-many-branches
2732
2733
    _phonet_rules_no_lang = (  # separator chars
2734
        '´', ' ', ' ',
2735
        '"', ' ', ' ',
2736
        '`$', '', '',
2737
        '\'', ' ', ' ',
2738
        ',', ',', ',',
2739
        ';', ',', ',',
2740
        '-', ' ', ' ',
2741
        ' ', ' ', ' ',
2742
        '.', '.', '.',
2743
        ':', '.', '.',
2744
        # German umlauts
2745
        'Ä', 'AE', 'AE',
2746
        'Ö', 'OE', 'OE',
2747
        'Ü', 'UE', 'UE',
2748
        'ß', 'S', 'S',
2749
        # international umlauts
2750
        'À', 'A', 'A',
2751
        'Á', 'A', 'A',
2752
        'Â', 'A', 'A',
2753
        'Ã', 'A', 'A',
2754
        'Å', 'A', 'A',
2755
        'Æ', 'AE', 'AE',
2756
        'Ç', 'C', 'C',
2757
        'Ð', 'DJ', 'DJ',
2758
        'È', 'E', 'E',
2759
        'É', 'E', 'E',
2760
        'Ê', 'E', 'E',
2761
        'Ë', 'E', 'E',
2762
        'Ì', 'I', 'I',
2763
        'Í', 'I', 'I',
2764
        'Î', 'I', 'I',
2765
        'Ï', 'I', 'I',
2766
        'Ñ', 'NH', 'NH',
2767
        'Ò', 'O', 'O',
2768
        'Ó', 'O', 'O',
2769
        'Ô', 'O', 'O',
2770
        'Õ', 'O', 'O',
2771
        'Œ', 'OE', 'OE',
2772
        'Ø', 'OE', 'OE',
2773
        'Š', 'SH', 'SH',
2774
        'Þ', 'TH', 'TH',
2775
        'Ù', 'U', 'U',
2776
        'Ú', 'U', 'U',
2777
        'Û', 'U', 'U',
2778
        'Ý', 'Y', 'Y',
2779
        'Ÿ', 'Y', 'Y',
2780
        # 'normal' letters (A-Z)
2781
        'MC^', 'MAC', 'MAC',
2782
        'MC^', 'MAC', 'MAC',
2783
        'M´^', 'MAC', 'MAC',
2784
        'M\'^', 'MAC', 'MAC',
2785
        'O´^', 'O', 'O',
2786
        'O\'^', 'O', 'O',
2787
        'VAN DEN ^', 'VANDEN', 'VANDEN',
2788
        None, None, None)
2789
2790
    _phonet_rules_german = (  # separator chars
2791
        '´', ' ', ' ',
2792
        '"', ' ', ' ',
2793
        '`$', '', '',
2794
        '\'', ' ', ' ',
2795
        ',', ' ', ' ',
2796
        ';', ' ', ' ',
2797
        '-', ' ', ' ',
2798
        ' ', ' ', ' ',
2799
        '.', '.', '.',
2800
        ':', '.', '.',
2801
        # German umlauts
2802
        'ÄE', 'E', 'E',
2803
        'ÄU<', 'EU', 'EU',
2804
        'ÄV(AEOU)-<', 'EW', None,
2805
        'Ä$', 'Ä', None,
2806
        'Ä<', None, 'E',
2807
        'Ä', 'E', None,
2808
        'ÖE', 'Ö', 'Ö',
2809
        'ÖU', 'Ö', 'Ö',
2810
        'ÖVER--<', 'ÖW', None,
2811
        'ÖV(AOU)-', 'ÖW', None,
2812
        'ÜBEL(GNRW)-^^', 'ÜBL ', 'IBL ',
2813
        'ÜBER^^', 'ÜBA', 'IBA',
2814
        'ÜE', 'Ü', 'I',
2815
        'ÜVER--<', 'ÜW', None,
2816
        'ÜV(AOU)-', 'ÜW', None,
2817
        'Ü', None, 'I',
2818
        'ßCH<', None, 'Z',
2819
        'ß<', 'S', 'Z',
2820
        # international umlauts
2821
        'À<', 'A', 'A',
2822
        'Á<', 'A', 'A',
2823
        'Â<', 'A', 'A',
2824
        'Ã<', 'A', 'A',
2825
        'Å<', 'A', 'A',
2826
        'ÆER-', 'E', 'E',
2827
        'ÆU<', 'EU', 'EU',
2828
        'ÆV(AEOU)-<', 'EW', None,
2829
        'Æ$', 'Ä', None,
2830
        'Æ<', None, 'E',
2831
        'Æ', 'E', None,
2832
        'Ç', 'Z', 'Z',
2833
        'ÐÐ-', '', '',
2834
        'Ð', 'DI', 'TI',
2835
        'È<', 'E', 'E',
2836
        'É<', 'E', 'E',
2837
        'Ê<', 'E', 'E',
2838
        'Ë', 'E', 'E',
2839
        'Ì<', 'I', 'I',
2840
        'Í<', 'I', 'I',
2841
        'Î<', 'I', 'I',
2842
        'Ï', 'I', 'I',
2843
        'ÑÑ-', '', '',
2844
        'Ñ', 'NI', 'NI',
2845
        'Ò<', 'O', 'U',
2846
        'Ó<', 'O', 'U',
2847
        'Ô<', 'O', 'U',
2848
        'Õ<', 'O', 'U',
2849
        'Œ<', 'Ö', 'Ö',
2850
        'Ø(IJY)-<', 'E', 'E',
2851
        'Ø<', 'Ö', 'Ö',
2852
        'Š', 'SH', 'Z',
2853
        'Þ', 'T', 'T',
2854
        'Ù<', 'U', 'U',
2855
        'Ú<', 'U', 'U',
2856
        'Û<', 'U', 'U',
2857
        'Ý<', 'I', 'I',
2858
        'Ÿ<', 'I', 'I',
2859
        # 'normal' letters (A-Z)
2860
        'ABELLE$', 'ABL', 'ABL',
2861
        'ABELL$', 'ABL', 'ABL',
2862
        'ABIENNE$', 'ABIN', 'ABIN',
2863
        'ACHME---^', 'ACH', 'AK',
2864
        'ACEY$', 'AZI', 'AZI',
2865
        'ADV', 'ATW', None,
2866
        'AEGL-', 'EK', None,
2867
        'AEU<', 'EU', 'EU',
2868
        'AE2', 'E', 'E',
2869
        'AFTRAUBEN------', 'AFT ', 'AFT ',
2870
        'AGL-1', 'AK', None,
2871
        'AGNI-^', 'AKN', 'AKN',
2872
        'AGNIE-', 'ANI', 'ANI',
2873
        'AGN(AEOU)-$', 'ANI', 'ANI',
2874
        'AH(AIOÖUÜY)-', 'AH', None,
2875
        'AIA2', 'AIA', 'AIA',
2876
        'AIE$', 'E', 'E',
2877
        'AILL(EOU)-', 'ALI', 'ALI',
2878
        'AINE$', 'EN', 'EN',
2879
        'AIRE$', 'ER', 'ER',
2880
        'AIR-', 'E', 'E',
2881
        'AISE$', 'ES', 'EZ',
2882
        'AISSANCE$', 'ESANS', 'EZANZ',
2883
        'AISSE$', 'ES', 'EZ',
2884
        'AIX$', 'EX', 'EX',
2885
        'AJ(AÄEÈÉÊIOÖUÜ)--', 'A', 'A',
2886
        'AKTIE', 'AXIE', 'AXIE',
2887
        'AKTUEL', 'AKTUEL', None,
2888
        'ALOI^', 'ALOI', 'ALUI',  # Don't merge these rules
2889
        'ALOY^', 'ALOI', 'ALUI',  # needed by 'check_rules'
2890
        'AMATEU(RS)-', 'AMATÖ', 'ANATÖ',
2891
        'ANCH(OEI)-', 'ANSH', 'ANZ',
2892
        'ANDERGEGANG----', 'ANDA GE', 'ANTA KE',
2893
        'ANDERGEHE----', 'ANDA ', 'ANTA ',
2894
        'ANDERGESETZ----', 'ANDA GE', 'ANTA KE',
2895
        'ANDERGING----', 'ANDA ', 'ANTA ',
2896
        'ANDERSETZ(ET)-----', 'ANDA ', 'ANTA ',
2897
        'ANDERZUGEHE----', 'ANDA ZU ', 'ANTA ZU ',
2898
        'ANDERZUSETZE-----', 'ANDA ZU ', 'ANTA ZU ',
2899
        'ANER(BKO)---^^', 'AN', None,
2900
        'ANHAND---^$', 'AN H', 'AN ',
2901
        'ANH(AÄEIOÖUÜY)--^^', 'AN', None,
2902
        'ANIELLE$', 'ANIEL', 'ANIL',
2903
        'ANIEL', 'ANIEL', None,
2904
        'ANSTELLE----^$', 'AN ST', 'AN ZT',
2905
        'ANTI^^', 'ANTI', 'ANTI',
2906
        'ANVER^^', 'ANFA', 'ANFA',
2907
        'ATIA$', 'ATIA', 'ATIA',
2908
        'ATIA(NS)--', 'ATI', 'ATI',
2909
        'ATI(AÄOÖUÜ)-', 'AZI', 'AZI',
2910
        'AUAU--', '', '',
2911
        'AUERE$', 'AUERE', None,
2912
        'AUERE(NS)-$', 'AUERE', None,
2913
        'AUERE(AIOUY)--', 'AUER', None,
2914
        'AUER(AÄIOÖUÜY)-', 'AUER', None,
2915
        'AUER<', 'AUA', 'AUA',
2916
        'AUF^^', 'AUF', 'AUF',
2917
        'AULT$', 'O', 'U',
2918
        'AUR(BCDFGKLMNQSTVWZ)-', 'AUA', 'AUA',
2919
        'AUR$', 'AUA', 'AUA',
2920
        'AUSSE$', 'OS', 'UZ',
2921
        'AUS(ST)-^', 'AUS', 'AUS',
2922
        'AUS^^', 'AUS', 'AUS',
2923
        'AUTOFAHR----', 'AUTO ', 'AUTU ',
2924
        'AUTO^^', 'AUTO', 'AUTU',
2925
        'AUX(IY)-', 'AUX', 'AUX',
2926
        'AUX', 'O', 'U',
2927
        'AU', 'AU', 'AU',
2928
        'AVER--<', 'AW', None,
2929
        'AVIER$', 'AWIE', 'AFIE',
2930
        'AV(EÈÉÊI)-^', 'AW', None,
2931
        'AV(AOU)-', 'AW', None,
2932
        'AYRE$', 'EIRE', 'EIRE',
2933
        'AYRE(NS)-$', 'EIRE', 'EIRE',
2934
        'AYRE(AIOUY)--', 'EIR', 'EIR',
2935
        'AYR(AÄIOÖUÜY)-', 'EIR', 'EIR',
2936
        'AYR<', 'EIA', 'EIA',
2937
        'AYER--<', 'EI', 'EI',
2938
        'AY(AÄEIOÖUÜY)--', 'A', 'A',
2939
        'AË', 'E', 'E',
2940
        'A(IJY)<', 'EI', 'EI',
2941
        'BABY^$', 'BEBI', 'BEBI',
2942
        'BAB(IY)^', 'BEBI', 'BEBI',
2943
        'BEAU^$', 'BO', None,
2944
        'BEA(BCMNRU)-^', 'BEA', 'BEA',
2945
        'BEAT(AEIMORU)-^', 'BEAT', 'BEAT',
2946
        'BEE$', 'BI', 'BI',
2947
        'BEIGE^$', 'BESH', 'BEZ',
2948
        'BENOIT--', 'BENO', 'BENU',
2949
        'BER(DT)-', 'BER', None,
2950
        'BERN(DT)-', 'BERN', None,
2951
        'BE(LMNRST)-^', 'BE', 'BE',
2952
        'BETTE$', 'BET', 'BET',
2953
        'BEVOR^$', 'BEFOR', None,
2954
        'BIC$', 'BIZ', 'BIZ',
2955
        'BOWL(EI)-', 'BOL', 'BUL',
2956
        'BP(AÄEÈÉÊIÌÍÎOÖRUÜY)-', 'B', 'B',
2957
        'BRINGEND-----^', 'BRI', 'BRI',
2958
        'BRINGEND-----', ' BRI', ' BRI',
2959
        'BROW(NS)-', 'BRAU', 'BRAU',
2960
        'BUDGET7', 'BÜGE', 'BIKE',
2961
        'BUFFET7', 'BÜFE', 'BIFE',
2962
        'BYLLE$', 'BILE', 'BILE',
2963
        'BYLL$', 'BIL', 'BIL',
2964
        'BYPA--^', 'BEI', 'BEI',
2965
        'BYTE<', 'BEIT', 'BEIT',
2966
        'BY9^', 'BÜ', None,
2967
        'B(SßZ)$', 'BS', None,
2968
        'CACH(EI)-^', 'KESH', 'KEZ',
2969
        'CAE--', 'Z', 'Z',
2970
        'CA(IY)$', 'ZEI', 'ZEI',
2971
        'CE(EIJUY)--', 'Z', 'Z',
2972
        'CENT<', 'ZENT', 'ZENT',
2973
        'CERST(EI)----^', 'KE', 'KE',
2974
        'CER$', 'ZA', 'ZA',
2975
        'CE3', 'ZE', 'ZE',
2976
        'CH\'S$', 'X', 'X',
2977
        'CH´S$', 'X', 'X',
2978
        'CHAO(ST)-', 'KAO', 'KAU',
2979
        'CHAMPIO-^', 'SHEMPI', 'ZENBI',
2980
        'CHAR(AI)-^', 'KAR', 'KAR',
2981
        'CHAU(CDFSVWXZ)-', 'SHO', 'ZU',
2982
        'CHÄ(CF)-', 'SHE', 'ZE',
2983
        'CHE(CF)-', 'SHE', 'ZE',
2984
        'CHEM-^', 'KE', 'KE',  # or: 'CHE', 'KE'
2985
        'CHEQUE<', 'SHEK', 'ZEK',
2986
        'CHI(CFGPVW)-', 'SHI', 'ZI',
2987
        'CH(AEUY)-<^', 'SH', 'Z',
2988
        'CHK-', '', '',
2989
        'CHO(CKPS)-^', 'SHO', 'ZU',
2990
        'CHRIS-', 'KRI', None,
2991
        'CHRO-', 'KR', None,
2992
        'CH(LOR)-<^', 'K', 'K',
2993
        'CHST-', 'X', 'X',
2994
        'CH(SßXZ)3', 'X', 'X',
2995
        'CHTNI-3', 'CHN', 'KN',
2996
        'CH^', 'K', 'K',  # or: 'CH', 'K'
2997
        'CH', 'CH', 'K',
2998
        'CIC$', 'ZIZ', 'ZIZ',
2999
        'CIENCEFICT----', 'EIENS ', 'EIENZ ',
3000
        'CIENCE$', 'EIENS', 'EIENZ',
3001
        'CIER$', 'ZIE', 'ZIE',
3002
        'CYB-^', 'ZEI', 'ZEI',
3003
        'CY9^', 'ZÜ', 'ZI',
3004
        'C(IJY)-<3', 'Z', 'Z',
3005
        'CLOWN-', 'KLAU', 'KLAU',
3006
        'CCH', 'Z', 'Z',
3007
        'CCE-', 'X', 'X',
3008
        'C(CK)-', '', '',
3009
        'CLAUDET---', 'KLO', 'KLU',
3010
        'CLAUDINE^$', 'KLODIN', 'KLUTIN',
3011
        'COACH', 'KOSH', 'KUZ',
3012
        'COLE$', 'KOL', 'KUL',
3013
        'COUCH', 'KAUSH', 'KAUZ',
3014
        'COW', 'KAU', 'KAU',
3015
        'CQUES$', 'K', 'K',
3016
        'CQUE', 'K', 'K',
3017
        'CRASH--9', 'KRE', 'KRE',
3018
        'CREAT-^', 'KREA', 'KREA',
3019
        'CST', 'XT', 'XT',
3020
        'CS<^', 'Z', 'Z',
3021
        'C(SßX)', 'X', 'X',
3022
        'CT\'S$', 'X', 'X',
3023
        'CT(SßXZ)', 'X', 'X',
3024
        'CZ<', 'Z', 'Z',
3025
        'C(ÈÉÊÌÍÎÝ)3', 'Z', 'Z',
3026
        'C.^', 'C.', 'C.',
3027
        'CÄ-', 'Z', 'Z',
3028
        'CÜ$', 'ZÜ', 'ZI',
3029
        'C\'S$', 'X', 'X',
3030
        'C<', 'K', 'K',
3031
        'DAHER^$', 'DAHER', None,
3032
        'DARAUFFOLGE-----', 'DARAUF ', 'TARAUF ',
3033
        'DAVO(NR)-^$', 'DAFO', 'TAFU',
3034
        'DD(SZ)--<', '', '',
3035
        'DD9', 'D', None,
3036
        'DEPOT7', 'DEPO', 'TEBU',
3037
        'DESIGN', 'DISEIN', 'TIZEIN',
3038
        'DE(LMNRST)-3^', 'DE', 'TE',
3039
        'DETTE$', 'DET', 'TET',
3040
        'DH$', 'T', None,
3041
        'DIC$', 'DIZ', 'TIZ',
3042
        'DIDR-^', 'DIT', None,
3043
        'DIEDR-^', 'DIT', None,
3044
        'DJ(AEIOU)-^', 'I', 'I',
3045
        'DMITR-^', 'DIMIT', 'TINIT',
3046
        'DRY9^', 'DRÜ', None,
3047
        'DT-', '', '',
3048
        'DUIS-^', 'DÜ', 'TI',
3049
        'DURCH^^', 'DURCH', 'TURK',
3050
        'DVA$', 'TWA', None,
3051
        'DY9^', 'DÜ', None,
3052
        'DYS$', 'DIS', None,
3053
        'DS(CH)--<', 'T', 'T',
3054
        'DST', 'ZT', 'ZT',
3055
        'DZS(CH)--', 'T', 'T',
3056
        'D(SßZ)', 'Z', 'Z',
3057
        'D(AÄEIOÖRUÜY)-', 'D', None,
3058
        'D(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'D', None,
3059
        'D\'H^', 'D', 'T',
3060
        'D´H^', 'D', 'T',
3061
        'D`H^', 'D', 'T',
3062
        'D\'S3$', 'Z', 'Z',
3063
        'D´S3$', 'Z', 'Z',
3064
        'D^', 'D', None,
3065
        'D', 'T', 'T',
3066
        'EAULT$', 'O', 'U',
3067
        'EAUX$', 'O', 'U',
3068
        'EAU', 'O', 'U',
3069
        'EAV', 'IW', 'IF',
3070
        'EAS3$', 'EAS', None,
3071
        'EA(AÄEIOÖÜY)-3', 'EA', 'EA',
3072
        'EA3$', 'EA', 'EA',
3073
        'EA3', 'I', 'I',
3074
        'EBENSO^$', 'EBNSO', 'EBNZU',
3075
        'EBENSO^^', 'EBNSO ', 'EBNZU ',
3076
        'EBEN^^', 'EBN', 'EBN',
3077
        'EE9', 'E', 'E',
3078
        'EGL-1', 'EK', None,
3079
        'EHE(IUY)--1', 'EH', None,
3080
        'EHUNG---1', 'E', None,
3081
        'EH(AÄIOÖUÜY)-1', 'EH', None,
3082
        'EIEI--', '', '',
3083
        'EIERE^$', 'EIERE', None,
3084
        'EIERE$', 'EIERE', None,
3085
        'EIERE(NS)-$', 'EIERE', None,
3086
        'EIERE(AIOUY)--', 'EIER', None,
3087
        'EIER(AÄIOÖUÜY)-', 'EIER', None,
3088
        'EIER<', 'EIA', None,
3089
        'EIGL-1', 'EIK', None,
3090
        'EIGH$', 'EI', 'EI',
3091
        'EIH--', 'E', 'E',
3092
        'EILLE$', 'EI', 'EI',
3093
        'EIR(BCDFGKLMNQSTVWZ)-', 'EIA', 'EIA',
3094
        'EIR$', 'EIA', 'EIA',
3095
        'EITRAUBEN------', 'EIT ', 'EIT ',
3096
        'EI', 'EI', 'EI',
3097
        'EJ$', 'EI', 'EI',
3098
        'ELIZ^', 'ELIS', None,
3099
        'ELZ^', 'ELS', None,
3100
        'EL-^', 'E', 'E',
3101
        'ELANG----1', 'E', 'E',
3102
        'EL(DKL)--1', 'E', 'E',
3103
        'EL(MNT)--1$', 'E', 'E',
3104
        'ELYNE$', 'ELINE', 'ELINE',
3105
        'ELYN$', 'ELIN', 'ELIN',
3106
        'EL(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'EL', 'EL',
3107
        'EL-1', 'L', 'L',
3108
        'EM-^', None, 'E',
3109
        'EM(DFKMPQT)--1', None, 'E',
3110
        'EM(AÄEÈÉÊIÌÍÎOÖUÜY)--1', None, 'E',
3111
        'EM-1', None, 'N',
3112
        'ENGAG-^', 'ANGA', 'ANKA',
3113
        'EN-^', 'E', 'E',
3114
        'ENTUEL', 'ENTUEL', None,
3115
        'EN(CDGKQSTZ)--1', 'E', 'E',
3116
        'EN(AÄEÈÉÊIÌÍÎNOÖUÜY)-1', 'EN', 'EN',
3117
        'EN-1', '', '',
3118
        'ERH(AÄEIOÖUÜ)-^', 'ERH', 'ER',
3119
        'ER-^', 'E', 'E',
3120
        'ERREGEND-----', ' ER', ' ER',
3121
        'ERT1$', 'AT', None,
3122
        'ER(DGLKMNRQTZß)-1', 'ER', None,
3123
        'ER(AÄEÈÉÊIÌÍÎOÖUÜY)-1', 'ER', 'A',
3124
        'ER1$', 'A', 'A',
3125
        'ER<1', 'A', 'A',
3126
        'ETAT7', 'ETA', 'ETA',
3127
        'ETI(AÄOÖÜU)-', 'EZI', 'EZI',
3128
        'EUERE$', 'EUERE', None,
3129
        'EUERE(NS)-$', 'EUERE', None,
3130
        'EUERE(AIOUY)--', 'EUER', None,
3131
        'EUER(AÄIOÖUÜY)-', 'EUER', None,
3132
        'EUER<', 'EUA', None,
3133
        'EUEU--', '', '',
3134
        'EUILLE$', 'Ö', 'Ö',
3135
        'EUR$', 'ÖR', 'ÖR',
3136
        'EUX', 'Ö', 'Ö',
3137
        'EUSZ$', 'EUS', None,
3138
        'EUTZ$', 'EUS', None,
3139
        'EUYS$', 'EUS', 'EUZ',
3140
        'EUZ$', 'EUS', None,
3141
        'EU', 'EU', 'EU',
3142
        'EVER--<1', 'EW', None,
3143
        'EV(ÄOÖUÜ)-1', 'EW', None,
3144
        'EYER<', 'EIA', 'EIA',
3145
        'EY<', 'EI', 'EI',
3146
        'FACETTE', 'FASET', 'FAZET',
3147
        'FANS--^$', 'FE', 'FE',
3148
        'FAN-^$', 'FE', 'FE',
3149
        'FAULT-', 'FOL', 'FUL',
3150
        'FEE(DL)-', 'FI', 'FI',
3151
        'FEHLER', 'FELA', 'FELA',
3152
        'FE(LMNRST)-3^', 'FE', 'FE',
3153
        'FOERDERN---^', 'FÖRD', 'FÖRT',
3154
        'FOERDERN---', ' FÖRD', ' FÖRT',
3155
        'FOND7', 'FON', 'FUN',
3156
        'FRAIN$', 'FRA', 'FRA',
3157
        'FRISEU(RS)-', 'FRISÖ', 'FRIZÖ',
3158
        'FY9^', 'FÜ', None,
3159
        'FÖRDERN---^', 'FÖRD', 'FÖRT',
3160
        'FÖRDERN---', ' FÖRD', ' FÖRT',
3161
        'GAGS^$', 'GEX', 'KEX',
3162
        'GAG^$', 'GEK', 'KEK',
3163
        'GD', 'KT', 'KT',
3164
        'GEGEN^^', 'GEGN', 'KEKN',
3165
        'GEGENGEKOM-----', 'GEGN ', 'KEKN ',
3166
        'GEGENGESET-----', 'GEGN ', 'KEKN ',
3167
        'GEGENKOMME-----', 'GEGN ', 'KEKN ',
3168
        'GEGENZUKOM---', 'GEGN ZU ', 'KEKN ZU ',
3169
        'GENDETWAS-----$', 'GENT ', 'KENT ',
3170
        'GENRE', 'IORE', 'IURE',
3171
        'GE(LMNRST)-3^', 'GE', 'KE',
3172
        'GER(DKT)-', 'GER', None,
3173
        'GETTE$', 'GET', 'KET',
3174
        'GGF.', 'GF.', None,
3175
        'GG-', '', '',
3176
        'GH', 'G', None,
3177
        'GI(AOU)-^', 'I', 'I',
3178
        'GION-3', 'KIO', 'KIU',
3179
        'G(CK)-', '', '',
3180
        'GJ(AEIOU)-^', 'I', 'I',
3181
        'GMBH^$', 'GMBH', 'GMBH',
3182
        'GNAC$', 'NIAK', 'NIAK',
3183
        'GNON$', 'NION', 'NIUN',
3184
        'GN$', 'N', 'N',
3185
        'GONCAL-^', 'GONZA', 'KUNZA',
3186
        'GRY9^', 'GRÜ', None,
3187
        'G(SßXZ)-<', 'K', 'K',
3188
        'GUCK-', 'KU', 'KU',
3189
        'GUISEP-^', 'IUSE', 'IUZE',
3190
        'GUI-^', 'G', 'K',
3191
        'GUTAUSSEH------^', 'GUT ', 'KUT ',
3192
        'GUTGEHEND------^', 'GUT ', 'KUT ',
3193
        'GY9^', 'GÜ', None,
3194
        'G(AÄEILOÖRUÜY)-', 'G', None,
3195
        'G(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'G', None,
3196
        'G\'S$', 'X', 'X',
3197
        'G´S$', 'X', 'X',
3198
        'G^', 'G', None,
3199
        'G', 'K', 'K',
3200
        'HA(HIUY)--1', 'H', None,
3201
        'HANDVOL---^', 'HANT ', 'ANT ',
3202
        'HANNOVE-^', 'HANOF', None,
3203
        'HAVEN7$', 'HAFN', None,
3204
        'HEAD-', 'HE', 'E',
3205
        'HELIEGEN------', 'E ', 'E ',
3206
        'HESTEHEN------', 'E ', 'E ',
3207
        'HE(LMNRST)-3^', 'HE', 'E',
3208
        'HE(LMN)-1', 'E', 'E',
3209
        'HEUR1$', 'ÖR', 'ÖR',
3210
        'HE(HIUY)--1', 'H', None,
3211
        'HIH(AÄEIOÖUÜY)-1', 'IH', None,
3212
        'HLH(AÄEIOÖUÜY)-1', 'LH', None,
3213
        'HMH(AÄEIOÖUÜY)-1', 'MH', None,
3214
        'HNH(AÄEIOÖUÜY)-1', 'NH', None,
3215
        'HOBBY9^', 'HOBI', None,
3216
        'HOCHBEGAB-----^', 'HOCH ', 'UK ',
3217
        'HOCHTALEN-----^', 'HOCH ', 'UK ',
3218
        'HOCHZUFRI-----^', 'HOCH ', 'UK ',
3219
        'HO(HIY)--1', 'H', None,
3220
        'HRH(AÄEIOÖUÜY)-1', 'RH', None,
3221
        'HUH(AÄEIOÖUÜY)-1', 'UH', None,
3222
        'HUIS^^', 'HÜS', 'IZ',
3223
        'HUIS$', 'ÜS', 'IZ',
3224
        'HUI--1', 'H', None,
3225
        'HYGIEN^', 'HÜKIEN', None,
3226
        'HY9^', 'HÜ', None,
3227
        'HY(BDGMNPST)-', 'Ü', None,
3228
        'H.^', None, 'H.',
3229
        'HÄU--1', 'H', None,
3230
        'H^', 'H', '',
3231
        'H', '', '',
3232
        'ICHELL---', 'ISH', 'IZ',
3233
        'ICHI$', 'ISHI', 'IZI',
3234
        'IEC$', 'IZ', 'IZ',
3235
        'IEDENSTELLE------', 'IDN ', 'ITN ',
3236
        'IEI-3', '', '',
3237
        'IELL3', 'IEL', 'IEL',
3238
        'IENNE$', 'IN', 'IN',
3239
        'IERRE$', 'IER', 'IER',
3240
        'IERZULAN---', 'IR ZU ', 'IR ZU ',
3241
        'IETTE$', 'IT', 'IT',
3242
        'IEU', 'IÖ', 'IÖ',
3243
        'IE<4', 'I', 'I',
3244
        'IGL-1', 'IK', None,
3245
        'IGHT3$', 'EIT', 'EIT',
3246
        'IGNI(EO)-', 'INI', 'INI',
3247
        'IGN(AEOU)-$', 'INI', 'INI',
3248
        'IHER(DGLKRT)--1', 'IHE', None,
3249
        'IHE(IUY)--', 'IH', None,
3250
        'IH(AIOÖUÜY)-', 'IH', None,
3251
        'IJ(AOU)-', 'I', 'I',
3252
        'IJ$', 'I', 'I',
3253
        'IJ<', 'EI', 'EI',
3254
        'IKOLE$', 'IKOL', 'IKUL',
3255
        'ILLAN(STZ)--4', 'ILIA', 'ILIA',
3256
        'ILLAR(DT)--4', 'ILIA', 'ILIA',
3257
        'IMSTAN----^', 'IM ', 'IN ',
3258
        'INDELERREGE------', 'INDL ', 'INTL ',
3259
        'INFRAGE-----^$', 'IN ', 'IN ',
3260
        'INTERN(AOU)-^', 'INTAN', 'INTAN',
3261
        'INVER-', 'INWE', 'INFE',
3262
        'ITI(AÄIOÖUÜ)-', 'IZI', 'IZI',
3263
        'IUSZ$', 'IUS', None,
3264
        'IUTZ$', 'IUS', None,
3265
        'IUZ$', 'IUS', None,
3266
        'IVER--<', 'IW', None,
3267
        'IVIER$', 'IWIE', 'IFIE',
3268
        'IV(ÄOÖUÜ)-', 'IW', None,
3269
        'IV<3', 'IW', None,
3270
        'IY2', 'I', None,
3271
        'I(ÈÉÊ)<4', 'I', 'I',
3272
        'JAVIE---<^', 'ZA', 'ZA',
3273
        'JEANS^$', 'JINS', 'INZ',
3274
        'JEANNE^$', 'IAN', 'IAN',
3275
        'JEAN-^', 'IA', 'IA',
3276
        'JER-^', 'IE', 'IE',
3277
        'JE(LMNST)-', 'IE', 'IE',
3278
        'JI^', 'JI', None,
3279
        'JOR(GK)^$', 'IÖRK', 'IÖRK',
3280
        'J', 'I', 'I',
3281
        'KC(ÄEIJ)-', 'X', 'X',
3282
        'KD', 'KT', None,
3283
        'KE(LMNRST)-3^', 'KE', 'KE',
3284
        'KG(AÄEILOÖRUÜY)-', 'K', None,
3285
        'KH<^', 'K', 'K',
3286
        'KIC$', 'KIZ', 'KIZ',
3287
        'KLE(LMNRST)-3^', 'KLE', 'KLE',
3288
        'KOTELE-^', 'KOTL', 'KUTL',
3289
        'KREAT-^', 'KREA', 'KREA',
3290
        'KRÜS(TZ)--^', 'KRI', None,
3291
        'KRYS(TZ)--^', 'KRI', None,
3292
        'KRY9^', 'KRÜ', None,
3293
        'KSCH---', 'K', 'K',
3294
        'KSH--', 'K', 'K',
3295
        'K(SßXZ)7', 'X', 'X',  # implies 'KST' -> 'XT'
3296
        'KT\'S$', 'X', 'X',
3297
        'KTI(AIOU)-3', 'XI', 'XI',
3298
        'KT(SßXZ)', 'X', 'X',
3299
        'KY9^', 'KÜ', None,
3300
        'K\'S$', 'X', 'X',
3301
        'K´S$', 'X', 'X',
3302
        'LANGES$', ' LANGES', ' LANKEZ',
3303
        'LANGE$', ' LANGE', ' LANKE',
3304
        'LANG$', ' LANK', ' LANK',
3305
        'LARVE-', 'LARF', 'LARF',
3306
        'LD(SßZ)$', 'LS', 'LZ',
3307
        'LD\'S$', 'LS', 'LZ',
3308
        'LD´S$', 'LS', 'LZ',
3309
        'LEAND-^', 'LEAN', 'LEAN',
3310
        'LEERSTEHE-----^', 'LER ', 'LER ',
3311
        'LEICHBLEIB-----', 'LEICH ', 'LEIK ',
3312
        'LEICHLAUTE-----', 'LEICH ', 'LEIK ',
3313
        'LEIDERREGE------', 'LEIT ', 'LEIT ',
3314
        'LEIDGEPR----^', 'LEIT ', 'LEIT ',
3315
        'LEINSTEHE-----', 'LEIN ', 'LEIN ',
3316
        'LEL-', 'LE', 'LE',
3317
        'LE(MNRST)-3^', 'LE', 'LE',
3318
        'LETTE$', 'LET', 'LET',
3319
        'LFGNAG-', 'LFGAN', 'LFKAN',
3320
        'LICHERWEIS----', 'LICHA ', 'LIKA ',
3321
        'LIC$', 'LIZ', 'LIZ',
3322
        'LIVE^$', 'LEIF', 'LEIF',
3323
        'LT(SßZ)$', 'LS', 'LZ',
3324
        'LT\'S$', 'LS', 'LZ',
3325
        'LT´S$', 'LS', 'LZ',
3326
        'LUI(GS)--', 'LU', 'LU',
3327
        'LV(AIO)-', 'LW', None,
3328
        'LY9^', 'LÜ', None,
3329
        'LSTS$', 'LS', 'LZ',
3330
        'LZ(BDFGKLMNPQRSTVWX)-', 'LS', None,
3331
        'L(SßZ)$', 'LS', None,
3332
        'MAIR-<', 'MEI', 'NEI',
3333
        'MANAG-', 'MENE', 'NENE',
3334
        'MANUEL', 'MANUEL', None,
3335
        'MASSEU(RS)-', 'MASÖ', 'NAZÖ',
3336
        'MATCH', 'MESH', 'NEZ',
3337
        'MAURICE', 'MORIS', 'NURIZ',
3338
        'MBH^$', 'MBH', 'MBH',
3339
        'MB(ßZ)$', 'MS', None,
3340
        'MB(SßTZ)-', 'M', 'N',
3341
        'MCG9^', 'MAK', 'NAK',
3342
        'MC9^', 'MAK', 'NAK',
3343
        'MEMOIR-^', 'MEMOA', 'NENUA',
3344
        'MERHAVEN$', 'MAHAFN', None,
3345
        'ME(LMNRST)-3^', 'ME', 'NE',
3346
        'MEN(STZ)--3', 'ME', None,
3347
        'MEN$', 'MEN', None,
3348
        'MIGUEL-', 'MIGE', 'NIKE',
3349
        'MIKE^$', 'MEIK', 'NEIK',
3350
        'MITHILFE----^$', 'MIT H', 'NIT ',
3351
        'MN$', 'M', None,
3352
        'MN', 'N', 'N',
3353
        'MPJUTE-', 'MPUT', 'NBUT',
3354
        'MP(ßZ)$', 'MS', None,
3355
        'MP(SßTZ)-', 'M', 'N',
3356
        'MP(BDJLMNPQVW)-', 'MB', 'NB',
3357
        'MY9^', 'MÜ', None,
3358
        'M(ßZ)$', 'MS', None,
3359
        'M´G7^', 'MAK', 'NAK',
3360
        'M\'G7^', 'MAK', 'NAK',
3361
        'M´^', 'MAK', 'NAK',
3362
        'M\'^', 'MAK', 'NAK',
3363
        'M', None, 'N',
3364
        'NACH^^', 'NACH', 'NAK',
3365
        'NADINE', 'NADIN', 'NATIN',
3366
        'NAIV--', 'NA', 'NA',
3367
        'NAISE$', 'NESE', 'NEZE',
3368
        'NAUGENOMM------', 'NAU ', 'NAU ',
3369
        'NAUSOGUT$', 'NAUSO GUT', 'NAUZU KUT',
3370
        'NCH$', 'NSH', 'NZ',
3371
        'NCOISE$', 'SOA', 'ZUA',
3372
        'NCOIS$', 'SOA', 'ZUA',
3373
        'NDAR$', 'NDA', 'NTA',
3374
        'NDERINGEN------', 'NDE ', 'NTE ',
3375
        'NDRO(CDKTZ)-', 'NTRO', None,
3376
        'ND(BFGJLMNPQVW)-', 'NT', None,
3377
        'ND(SßZ)$', 'NS', 'NZ',
3378
        'ND\'S$', 'NS', 'NZ',
3379
        'ND´S$', 'NS', 'NZ',
3380
        'NEBEN^^', 'NEBN', 'NEBN',
3381
        'NENGELERN------', 'NEN ', 'NEN ',
3382
        'NENLERN(ET)---', 'NEN LE', 'NEN LE',
3383
        'NENZULERNE---', 'NEN ZU LE', 'NEN ZU LE',
3384
        'NE(LMNRST)-3^', 'NE', 'NE',
3385
        'NEN-3', 'NE', 'NE',
3386
        'NETTE$', 'NET', 'NET',
3387
        'NGU^^', 'NU', 'NU',
3388
        'NG(BDFJLMNPQRTVW)-', 'NK', 'NK',
3389
        'NH(AUO)-$', 'NI', 'NI',
3390
        'NICHTSAHNEN-----', 'NIX ', 'NIX ',
3391
        'NICHTSSAGE----', 'NIX ', 'NIX ',
3392
        'NICHTS^^', 'NIX', 'NIX',
3393
        'NICHT^^', 'NICHT', 'NIKT',
3394
        'NINE$', 'NIN', 'NIN',
3395
        'NON^^', 'NON', 'NUN',
3396
        'NOTLEIDE-----^', 'NOT ', 'NUT ',
3397
        'NOT^^', 'NOT', 'NUT',
3398
        'NTI(AIOU)-3', 'NZI', 'NZI',
3399
        'NTIEL--3', 'NZI', 'NZI',
3400
        'NT(SßZ)$', 'NS', 'NZ',
3401
        'NT\'S$', 'NS', 'NZ',
3402
        'NT´S$', 'NS', 'NZ',
3403
        'NYLON', 'NEILON', 'NEILUN',
3404
        'NY9^', 'NÜ', None,
3405
        'NSTZUNEH---', 'NST ZU ', 'NZT ZU ',
3406
        'NSZ-', 'NS', None,
3407
        'NSTS$', 'NS', 'NZ',
3408
        'NZ(BDFGKLMNPQRSTVWX)-', 'NS', None,
3409
        'N(SßZ)$', 'NS', None,
3410
        'OBERE-', 'OBER', None,
3411
        'OBER^^', 'OBA', 'UBA',
3412
        'OEU2', 'Ö', 'Ö',
3413
        'OE<2', 'Ö', 'Ö',
3414
        'OGL-', 'OK', None,
3415
        'OGNIE-', 'ONI', 'UNI',
3416
        'OGN(AEOU)-$', 'ONI', 'UNI',
3417
        'OH(AIOÖUÜY)-', 'OH', None,
3418
        'OIE$', 'Ö', 'Ö',
3419
        'OIRE$', 'OA', 'UA',
3420
        'OIR$', 'OA', 'UA',
3421
        'OIX', 'OA', 'UA',
3422
        'OI<3', 'EU', 'EU',
3423
        'OKAY^$', 'OKE', 'UKE',
3424
        'OLYN$', 'OLIN', 'ULIN',
3425
        'OO(DLMZ)-', 'U', None,
3426
        'OO$', 'U', None,
3427
        'OO-', '', '',
3428
        'ORGINAL-----', 'ORI', 'URI',
3429
        'OTI(AÄOÖUÜ)-', 'OZI', 'UZI',
3430
        'OUI^', 'WI', 'FI',
3431
        'OUILLE$', 'ULIE', 'ULIE',
3432
        'OU(DT)-^', 'AU', 'AU',
3433
        'OUSE$', 'AUS', 'AUZ',
3434
        'OUT-', 'AU', 'AU',
3435
        'OU', 'U', 'U',
3436
        'O(FV)$', 'AU', 'AU',  # due to 'OW$' -> 'AU'
3437
        'OVER--<', 'OW', None,
3438
        'OV(AOU)-', 'OW', None,
3439
        'OW$', 'AU', 'AU',
3440
        'OWS$', 'OS', 'UZ',
3441
        'OJ(AÄEIOÖUÜ)--', 'O', 'U',
3442
        'OYER', 'OIA', None,
3443
        'OY(AÄEIOÖUÜ)--', 'O', 'U',
3444
        'O(JY)<', 'EU', 'EU',
3445
        'OZ$', 'OS', None,
3446
        'O´^', 'O', 'U',
3447
        'O\'^', 'O', 'U',
3448
        'O', None, 'U',
3449
        'PATIEN--^', 'PAZI', 'PAZI',
3450
        'PENSIO-^', 'PANSI', 'PANZI',
3451
        'PE(LMNRST)-3^', 'PE', 'PE',
3452
        'PFER-^', 'FE', 'FE',
3453
        'P(FH)<', 'F', 'F',
3454
        'PIC^$', 'PIK', 'PIK',
3455
        'PIC$', 'PIZ', 'PIZ',
3456
        'PIPELINE', 'PEIBLEIN', 'PEIBLEIN',
3457
        'POLYP-', 'POLÜ', None,
3458
        'POLY^^', 'POLI', 'PULI',
3459
        'PORTRAIT7', 'PORTRE', 'PURTRE',
3460
        'POWER7', 'PAUA', 'PAUA',
3461
        'PP(FH)--<', 'B', 'B',
3462
        'PP-', '', '',
3463
        'PRODUZ-^', 'PRODU', 'BRUTU',
3464
        'PRODUZI--', ' PRODU', ' BRUTU',
3465
        'PRIX^$', 'PRI', 'PRI',
3466
        'PS-^^', 'P', None,
3467
        'P(SßZ)^', None, 'Z',
3468
        'P(SßZ)$', 'BS', None,
3469
        'PT-^', '', '',
3470
        'PTI(AÄOÖUÜ)-3', 'BZI', 'BZI',
3471
        'PY9^', 'PÜ', None,
3472
        'P(AÄEIOÖRUÜY)-', 'P', 'P',
3473
        'P(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'P', None,
3474
        'P.^', None, 'P.',
3475
        'P^', 'P', None,
3476
        'P', 'B', 'B',
3477
        'QI-', 'Z', 'Z',
3478
        'QUARANT--', 'KARA', 'KARA',
3479
        'QUE(LMNRST)-3', 'KWE', 'KFE',
3480
        'QUE$', 'K', 'K',
3481
        'QUI(NS)$', 'KI', 'KI',
3482
        'QUIZ7', 'KWIS', None,
3483
        'Q(UV)7', 'KW', 'KF',
3484
        'Q<', 'K', 'K',
3485
        'RADFAHR----', 'RAT ', 'RAT ',
3486
        'RAEFTEZEHRE-----', 'REFTE ', 'REFTE ',
3487
        'RCH', 'RCH', 'RK',
3488
        'REA(DU)---3^', 'R', None,
3489
        'REBSERZEUG------', 'REBS ', 'REBZ ',
3490
        'RECHERCH^', 'RESHASH', 'REZAZ',
3491
        'RECYCL--', 'RIZEI', 'RIZEI',
3492
        'RE(ALST)-3^', 'RE', None,
3493
        'REE$', 'RI', 'RI',
3494
        'RER$', 'RA', 'RA',
3495
        'RE(MNR)-4', 'RE', 'RE',
3496
        'RETTE$', 'RET', 'RET',
3497
        'REUZ$', 'REUZ', None,
3498
        'REW$', 'RU', 'RU',
3499
        'RH<^', 'R', 'R',
3500
        'RJA(MN)--', 'RI', 'RI',
3501
        'ROWD-^', 'RAU', 'RAU',
3502
        'RTEMONNAIE-', 'RTMON', 'RTNUN',
3503
        'RTI(AÄOÖUÜ)-3', 'RZI', 'RZI',
3504
        'RTIEL--3', 'RZI', 'RZI',
3505
        'RV(AEOU)-3', 'RW', None,
3506
        'RY(KN)-$', 'RI', 'RI',
3507
        'RY9^', 'RÜ', None,
3508
        'RÄFTEZEHRE-----', 'REFTE ', 'REFTE ',
3509
        'SAISO-^', 'SES', 'ZEZ',
3510
        'SAFE^$', 'SEIF', 'ZEIF',
3511
        'SAUCE-^', 'SOS', 'ZUZ',
3512
        'SCHLAGGEBEN-----<', 'SHLAK ', 'ZLAK ',
3513
        'SCHSCH---7', '', '',
3514
        'SCHTSCH', 'SH', 'Z',
3515
        'SC(HZ)<', 'SH', 'Z',
3516
        'SC', 'SK', 'ZK',
3517
        'SELBSTST--7^^', 'SELB', 'ZELB',
3518
        'SELBST7^^', 'SELBST', 'ZELBZT',
3519
        'SERVICE7^', 'SÖRWIS', 'ZÖRFIZ',
3520
        'SERVI-^', 'SERW', None,
3521
        'SE(LMNRST)-3^', 'SE', 'ZE',
3522
        'SETTE$', 'SET', 'ZET',
3523
        'SHP-^', 'S', 'Z',
3524
        'SHST', 'SHT', 'ZT',
3525
        'SHTSH', 'SH', 'Z',
3526
        'SHT', 'ST', 'Z',
3527
        'SHY9^', 'SHÜ', None,
3528
        'SH^^', 'SH', None,
3529
        'SH3', 'SH', 'Z',
3530
        'SICHERGEGAN-----^', 'SICHA ', 'ZIKA ',
3531
        'SICHERGEHE----^', 'SICHA ', 'ZIKA ',
3532
        'SICHERGESTEL------^', 'SICHA ', 'ZIKA ',
3533
        'SICHERSTELL-----^', 'SICHA ', 'ZIKA ',
3534
        'SICHERZU(GS)--^', 'SICHA ZU ', 'ZIKA ZU ',
3535
        'SIEGLI-^', 'SIKL', 'ZIKL',
3536
        'SIGLI-^', 'SIKL', 'ZIKL',
3537
        'SIGHT', 'SEIT', 'ZEIT',
3538
        'SIGN', 'SEIN', 'ZEIN',
3539
        'SKI(NPZ)-', 'SKI', 'ZKI',
3540
        'SKI<^', 'SHI', 'ZI',
3541
        'SODASS^$', 'SO DAS', 'ZU TAZ',
3542
        'SODAß^$', 'SO DAS', 'ZU TAZ',
3543
        'SOGENAN--^', 'SO GEN', 'ZU KEN',
3544
        'SOUND-', 'SAUN', 'ZAUN',
3545
        'STAATS^^', 'STAZ', 'ZTAZ',
3546
        'STADT^^', 'STAT', 'ZTAT',
3547
        'STANDE$', ' STANDE', ' ZTANTE',
3548
        'START^^', 'START', 'ZTART',
3549
        'STAURANT7', 'STORAN', 'ZTURAN',
3550
        'STEAK-', 'STE', 'ZTE',
3551
        'STEPHEN-^$', 'STEW', None,
3552
        'STERN', 'STERN', None,
3553
        'STRAF^^', 'STRAF', 'ZTRAF',
3554
        'ST\'S$', 'Z', 'Z',
3555
        'ST´S$', 'Z', 'Z',
3556
        'STST--', '', '',
3557
        'STS(ACEÈÉÊHIÌÍÎOUÄÜÖ)--', 'ST', 'ZT',
3558
        'ST(SZ)', 'Z', 'Z',
3559
        'SPAREN---^', 'SPA', 'ZPA',
3560
        'SPAREND----', ' SPA', ' ZPA',
3561
        'S(PTW)-^^', 'S', None,
3562
        'SP', 'SP', None,
3563
        'STYN(AE)-$', 'STIN', 'ZTIN',
3564
        'ST', 'ST', 'ZT',
3565
        'SUITE<', 'SIUT', 'ZIUT',
3566
        'SUKE--$', 'S', 'Z',
3567
        'SURF(EI)-', 'SÖRF', 'ZÖRF',
3568
        'SV(AEÈÉÊIÌÍÎOU)-<^', 'SW', None,
3569
        'SYB(IY)--^', 'SIB', None,
3570
        'SYL(KVW)--^', 'SI', None,
3571
        'SY9^', 'SÜ', None,
3572
        'SZE(NPT)-^', 'ZE', 'ZE',
3573
        'SZI(ELN)-^', 'ZI', 'ZI',
3574
        'SZCZ<', 'SH', 'Z',
3575
        'SZT<', 'ST', 'ZT',
3576
        'SZ<3', 'SH', 'Z',
3577
        'SÜL(KVW)--^', 'SI', None,
3578
        'S', None, 'Z',
3579
        'TCH', 'SH', 'Z',
3580
        'TD(AÄEIOÖRUÜY)-', 'T', None,
3581
        'TD(ÀÁÂÃÅÈÉÊËÌÍÎÏÒÓÔÕØÙÚÛÝŸ)-', 'T', None,
3582
        'TEAT-^', 'TEA', 'TEA',
3583
        'TERRAI7^', 'TERA', 'TERA',
3584
        'TE(LMNRST)-3^', 'TE', 'TE',
3585
        'TH<', 'T', 'T',
3586
        'TICHT-', 'TIK', 'TIK',
3587
        'TICH$', 'TIK', 'TIK',
3588
        'TIC$', 'TIZ', 'TIZ',
3589
        'TIGGESTELL-------', 'TIK ', 'TIK ',
3590
        'TIGSTELL-----', 'TIK ', 'TIK ',
3591
        'TOAS-^', 'TO', 'TU',
3592
        'TOILET-', 'TOLE', 'TULE',
3593
        'TOIN-', 'TOA', 'TUA',
3594
        'TRAECHTI-^', 'TRECHT', 'TREKT',
3595
        'TRAECHTIG--', ' TRECHT', ' TREKT',
3596
        'TRAINI-', 'TREN', 'TREN',
3597
        'TRÄCHTI-^', 'TRECHT', 'TREKT',
3598
        'TRÄCHTIG--', ' TRECHT', ' TREKT',
3599
        'TSCH', 'SH', 'Z',
3600
        'TSH', 'SH', 'Z',
3601
        'TST', 'ZT', 'ZT',
3602
        'T(Sß)', 'Z', 'Z',
3603
        'TT(SZ)--<', '', '',
3604
        'TT9', 'T', 'T',
3605
        'TV^$', 'TV', 'TV',
3606
        'TX(AEIOU)-3', 'SH', 'Z',
3607
        'TY9^', 'TÜ', None,
3608
        'TZ-', '', '',
3609
        'T\'S3$', 'Z', 'Z',
3610
        'T´S3$', 'Z', 'Z',
3611
        'UEBEL(GNRW)-^^', 'ÜBL ', 'IBL ',
3612
        'UEBER^^', 'ÜBA', 'IBA',
3613
        'UE2', 'Ü', 'I',
3614
        'UGL-', 'UK', None,
3615
        'UH(AOÖUÜY)-', 'UH', None,
3616
        'UIE$', 'Ü', 'I',
3617
        'UM^^', 'UM', 'UN',
3618
        'UNTERE--3', 'UNTE', 'UNTE',
3619
        'UNTER^^', 'UNTA', 'UNTA',
3620
        'UNVER^^', 'UNFA', 'UNFA',
3621
        'UN^^', 'UN', 'UN',
3622
        'UTI(AÄOÖUÜ)-', 'UZI', 'UZI',
3623
        'UVE-4', 'UW', None,
3624
        'UY2', 'UI', None,
3625
        'UZZ', 'AS', 'AZ',
3626
        'VACL-^', 'WAZ', 'FAZ',
3627
        'VAC$', 'WAZ', 'FAZ',
3628
        'VAN DEN ^', 'FANDN', 'FANTN',
3629
        'VANES-^', 'WANE', None,
3630
        'VATRO-', 'WATR', None,
3631
        'VA(DHJNT)--^', 'F', None,
3632
        'VEDD-^', 'FE', 'FE',
3633
        'VE(BEHIU)--^', 'F', None,
3634
        'VEL(BDLMNT)-^', 'FEL', None,
3635
        'VENTZ-^', 'FEN', None,
3636
        'VEN(NRSZ)-^', 'FEN', None,
3637
        'VER(AB)-^$', 'WER', None,
3638
        'VERBAL^$', 'WERBAL', None,
3639
        'VERBAL(EINS)-^', 'WERBAL', None,
3640
        'VERTEBR--', 'WERTE', None,
3641
        'VEREIN-----', 'F', None,
3642
        'VEREN(AEIOU)-^', 'WEREN', None,
3643
        'VERIFI', 'WERIFI', None,
3644
        'VERON(AEIOU)-^', 'WERON', None,
3645
        'VERSEN^', 'FERSN', 'FAZN',
3646
        'VERSIERT--^', 'WERSI', None,
3647
        'VERSIO--^', 'WERS', None,
3648
        'VERSUS', 'WERSUS', None,
3649
        'VERTI(GK)-', 'WERTI', None,
3650
        'VER^^', 'FER', 'FA',
3651
        'VERSPRECHE-------', ' FER', ' FA',
3652
        'VER$', 'WA', None,
3653
        'VER', 'FA', 'FA',
3654
        'VET(HT)-^', 'FET', 'FET',
3655
        'VETTE$', 'WET', 'FET',
3656
        'VE^', 'WE', None,
3657
        'VIC$', 'WIZ', 'FIZ',
3658
        'VIELSAGE----', 'FIL ', 'FIL ',
3659
        'VIEL', 'FIL', 'FIL',
3660
        'VIEW', 'WIU', 'FIU',
3661
        'VILL(AE)-', 'WIL', None,
3662
        'VIS(ACEIKUVWZ)-<^', 'WIS', None,
3663
        'VI(ELS)--^', 'F', None,
3664
        'VILLON--', 'WILI', 'FILI',
3665
        'VIZE^^', 'FIZE', 'FIZE',
3666
        'VLIE--^', 'FL', None,
3667
        'VL(AEIOU)--', 'W', None,
3668
        'VOKA-^', 'WOK', None,
3669
        'VOL(ATUVW)--^', 'WO', None,
3670
        'VOR^^', 'FOR', 'FUR',
3671
        'VR(AEIOU)--', 'W', None,
3672
        'VV9', 'W', None,
3673
        'VY9^', 'WÜ', 'FI',
3674
        'V(ÜY)-', 'W', None,
3675
        'V(ÀÁÂÃÅÈÉÊÌÍÎÙÚÛ)-', 'W', None,
3676
        'V(AEIJLRU)-<', 'W', None,
3677
        'V.^', 'V.', None,
3678
        'V<', 'F', 'F',
3679
        'WEITERENTWI-----^', 'WEITA ', 'FEITA ',
3680
        'WEITREICH-----^', 'WEIT ', 'FEIT ',
3681
        'WEITVER^', 'WEIT FER', 'FEIT FA',
3682
        'WE(LMNRST)-3^', 'WE', 'FE',
3683
        'WER(DST)-', 'WER', None,
3684
        'WIC$', 'WIZ', 'FIZ',
3685
        'WIEDERU--', 'WIDE', 'FITE',
3686
        'WIEDER^$', 'WIDA', 'FITA',
3687
        'WIEDER^^', 'WIDA ', 'FITA ',
3688
        'WIEVIEL', 'WI FIL', 'FI FIL',
3689
        'WISUEL', 'WISUEL', None,
3690
        'WR-^', 'W', None,
3691
        'WY9^', 'WÜ', 'FI',
3692
        'W(BDFGJKLMNPQRSTZ)-', 'F', None,
3693
        'W$', 'F', None,
3694
        'W', None, 'F',
3695
        'X<^', 'Z', 'Z',
3696
        'XHAVEN$', 'XAFN', None,
3697
        'X(CSZ)', 'X', 'X',
3698
        'XTS(CH)--', 'XT', 'XT',
3699
        'XT(SZ)', 'Z', 'Z',
3700
        'YE(LMNRST)-3^', 'IE', 'IE',
3701
        'YE-3', 'I', 'I',
3702
        'YOR(GK)^$', 'IÖRK', 'IÖRK',
3703
        'Y(AOU)-<7', 'I', 'I',
3704
        'Y(BKLMNPRSTX)-1', 'Ü', None,
3705
        'YVES^$', 'IF', 'IF',
3706
        'YVONNE^$', 'IWON', 'IFUN',
3707
        'Y.^', 'Y.', None,
3708
        'Y', 'I', 'I',
3709
        'ZC(AOU)-', 'SK', 'ZK',
3710
        'ZE(LMNRST)-3^', 'ZE', 'ZE',
3711
        'ZIEJ$', 'ZI', 'ZI',
3712
        'ZIGERJA(HR)-3', 'ZIGA IA', 'ZIKA IA',
3713
        'ZL(AEIOU)-', 'SL', None,
3714
        'ZS(CHT)--', '', '',
3715
        'ZS', 'SH', 'Z',
3716
        'ZUERST', 'ZUERST', 'ZUERST',
3717
        'ZUGRUNDE^$', 'ZU GRUNDE', 'ZU KRUNTE',
3718
        'ZUGRUNDE', 'ZU GRUNDE ', 'ZU KRUNTE ',
3719
        'ZUGUNSTEN', 'ZU GUNSTN', 'ZU KUNZTN',
3720
        'ZUHAUSE-', 'ZU HAUS', 'ZU AUZ',
3721
        'ZULASTEN^$', 'ZU LASTN', 'ZU LAZTN',
3722
        'ZURUECK^^', 'ZURÜK', 'ZURIK',
3723
        'ZURZEIT', 'ZUR ZEIT', 'ZUR ZEIT',
3724
        'ZURÜCK^^', 'ZURÜK', 'ZURIK',
3725
        'ZUSTANDE', 'ZU STANDE', 'ZU ZTANTE',
3726
        'ZUTAGE', 'ZU TAGE', 'ZU TAKE',
3727
        'ZUVER^^', 'ZUFA', 'ZUFA',
3728
        'ZUVIEL', 'ZU FIL', 'ZU FIL',
3729
        'ZUWENIG', 'ZU WENIK', 'ZU FENIK',
3730
        'ZY9^', 'ZÜ', None,
3731
        'ZYK3$', 'ZIK', None,
3732
        'Z(VW)7^', 'SW', None,
3733
        None, None, None)
3734
3735
    # Lookup tables populated by _initialize_phonet() and read by _phonet().
    # Counter is used (rather than dict) so that looking up a key that was
    # never stored yields 0 instead of raising KeyError.
    phonet_hash = Counter()
    alpha_pos = Counter()

    phonet_hash_1 = Counter()
    phonet_hash_2 = Counter()
3740
3741
    # Translation table for str.translate(): maps the code point of each
    # lower-case letter (ASCII plus the listed accented letters) to its
    # upper-case counterpart. 'ß' maps to itself.
    _phonet_upper_translation = {
        ord(lower): upper for lower, upper in zip(
            'abcdefghijklmnopqrstuvwxyzàáâãåäæ' +
            'çðèéêëìíîïñòóôõöøœšßþùúûüýÿ',
            'ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÅÄÆ' +
            'ÇÐÈÉÊËÌÍÎÏÑÒÓÔÕÖØŒŠßÞÙÚÛÜÝŸ')}
3746
3747
    def _trinfo(text, rule, err_text, lang):
        """Output debug information.

        Prints a single trace line for the rule triple that starts at index
        ``rule`` of the active rule table: the label, the 1-based rule number,
        the search pattern, its two replacement strings and a trailing
        message. A table entry that is None is shown as ``(NULL)``.

        :param str text: label printed at the start of the line
        :param int rule: index of the rule's search pattern in the rule table;
            the two replacements are read from ``rule + 1`` and ``rule + 2``
        :param str err_text: message printed at the end of the line
        :param str lang: ``'none'`` selects the language-independent rule
            table, any other value selects the German rule table
        """
        if lang == 'none':
            _phonet_rules = _phonet_rules_no_lang
        else:
            _phonet_rules = _phonet_rules_german

        from_rule = ('(NULL)' if _phonet_rules[rule] is None else
                     _phonet_rules[rule])
        to_rule1 = ('(NULL)' if (_phonet_rules[rule + 1] is None) else
                    _phonet_rules[rule + 1])
        to_rule2 = ('(NULL)' if (_phonet_rules[rule + 2] is None) else
                    _phonet_rules[rule + 2])
        # Rules occupy three consecutive table slots, so the rule number is
        # the slot index floor-divided by 3. Floor division keeps it an
        # integer ("12", not "12.0") when true division is in effect.
        print('"{} {}:  "{}"{}"{}" {}'.format(text, ((rule // 3) + 1),
                                              from_rule, to_rule1, to_rule2,
                                              err_text))
3763
3764
    def _initialize_phonet(lang):
        """Initialize phonet variables.

        Fills the lookup tables ``alpha_pos``, ``phonet_hash``,
        ``phonet_hash_1`` and ``phonet_hash_2`` from the active rule table.

        * ``alpha_pos[c]`` is 1 for the listed accented letters and 2..27 for
          'A'..'Z'; any other character is left unset (reads as 0).
        * ``phonet_hash[c]`` is the table index of the first rule whose
          pattern starts with ``c`` and that has at least one replacement,
          or -1 if no such rule was found.
        * ``phonet_hash_1[j, k]`` / ``phonet_hash_2[j, k]`` hold the first and
          last table index recorded for patterns whose first letter is the
          j-th letter of 'A'..'Z' and whose second character has
          ``alpha_pos`` value ``k``. Slot ``k == 0`` collects the rules that
          are not tied to one specific second letter.

        :param str lang: ``'none'`` selects the language-independent rule
            table, any other value selects the German rule table
        """
        if lang == 'none':
            _phonet_rules = _phonet_rules_no_lang
        else:
            _phonet_rules = _phonet_rules_german

        phonet_hash[''] = -1

        # German and international umlauts
        for j in {'À', 'Á', 'Â', 'Ã', 'Ä', 'Å', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ë',
                  'Ì', 'Í', 'Î', 'Ï', 'Ð', 'Ñ', 'Ò', 'Ó', 'Ô', 'Õ', 'Ö', 'Ø',
                  'Ù', 'Ú', 'Û', 'Ü', 'Ý', 'Þ', 'ß', 'Œ', 'Š', 'Ÿ'}:
            alpha_pos[j] = 1
            phonet_hash[j] = -1

        # "normal" letters ('A'-'Z')
        for i, j in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ'):
            alpha_pos[j] = i + 2
            phonet_hash[j] = -1

        # mark every (first letter, second-character class) slot as empty
        for i in range(26):
            for j in range(28):
                phonet_hash_1[i, j] = -1
                phonet_hash_2[i, j] = -1

        # for each phonetic rule
        for i, rule in enumerate(_phonet_rules):
            # only every third entry is a search pattern; the two entries
            # after it are its replacement strings
            if rule and i % 3 == 0:
                # calculate first hash value
                k = rule[0]

                if phonet_hash[k] < 0 and (_phonet_rules[i+1] or
                                           _phonet_rules[i+2]):
                    phonet_hash[k] = i

                # calculate second hash values
                if k and alpha_pos[k] >= 2:
                    k = alpha_pos[k]

                    j = k-2
                    rule = rule[1:]

                    # reduce the pattern to the candidate second characters:
                    # ' ' if there is none, the contents of a leading
                    # '(...)' group, or otherwise just the next character
                    if not rule:
                        rule = ' '
                    elif rule[0] == '(':
                        rule = rule[1:]
                    else:
                        rule = rule[0]

                    while rule and (rule[0] != ')'):
                        k = alpha_pos[rule[0]]

                        if k > 0:
                            # add hash value for this letter
                            if phonet_hash_1[j, k] < 0:
                                phonet_hash_1[j, k] = i
                                phonet_hash_2[j, k] = i

                            # extend the recorded range only if the previous
                            # entry is at most 30 table slots back; otherwise
                            # fall through to the "all letters" slot below
                            if phonet_hash_2[j, k] >= (i-30):
                                phonet_hash_2[j, k] = i
                            else:
                                k = -1

                        if k <= 0:
                            # add hash value for all letters
                            if phonet_hash_1[j, 0] < 0:
                                phonet_hash_1[j, 0] = i

                            phonet_hash_2[j, 0] = i

                        rule = rule[1:]
3838
3839
    def _phonet(term, mode, lang, trace):
3840
        """Return the phonet coded form of a term."""
3841
        if lang == 'none':
3842
            _phonet_rules = _phonet_rules_no_lang
3843
        else:
3844
            _phonet_rules = _phonet_rules_german
3845
3846
        char0 = ''
3847
        dest = term
3848
3849
        if not term:
3850
            return ''
3851
3852
        term_length = len(term)
3853
3854
        # convert input string to upper-case
3855
        src = term.translate(_phonet_upper_translation)
3856
3857
        # check "src"
3858
        i = 0
3859
        j = 0
3860
        zeta = 0
3861
3862
        while i < len(src):
3863
            char = src[i]
3864
3865
            if trace:
3866
                print('\ncheck position {}:  src = "{}",  dest = "{}"'.format
3867
                      (j, src[i:], dest[:j]))
3868
3869
            pos = alpha_pos[char]
3870
3871
            if pos >= 2:
3872
                xpos = pos-2
3873
3874
                if i+1 == len(src):
3875
                    pos = alpha_pos['']
3876
                else:
3877
                    pos = alpha_pos[src[i+1]]
3878
3879
                start1 = phonet_hash_1[xpos, pos]
3880
                start2 = phonet_hash_1[xpos, 0]
3881
                end1 = phonet_hash_2[xpos, pos]
3882
                end2 = phonet_hash_2[xpos, 0]
3883
3884
                # preserve rule priorities
3885
                if (start2 >= 0) and ((start1 < 0) or (start2 < start1)):
3886
                    pos = start1
3887
                    start1 = start2
3888
                    start2 = pos
3889
                    pos = end1
3890
                    end1 = end2
3891
                    end2 = pos
3892
3893
                if (end1 >= start2) and (start2 >= 0):
3894
                    if end2 > end1:
3895
                        end1 = end2
3896
3897
                    start2 = -1
3898
                    end2 = -1
3899
            else:
3900
                pos = phonet_hash[char]
3901
                start1 = pos
3902
                end1 = 10000
3903
                start2 = -1
3904
                end2 = -1
3905
3906
            pos = start1
3907
            zeta0 = 0
3908
3909
            if pos >= 0:
3910
                # check rules for this char
3911
                while ((_phonet_rules[pos] is None) or
3912
                       (_phonet_rules[pos][0] == char)):
3913
                    if pos > end1:
3914
                        if start2 > 0:
3915
                            pos = start2
3916
                            start1 = start2
3917
                            start2 = -1
3918
                            end1 = end2
3919
                            end2 = -1
3920
                            continue
3921
3922
                        break
3923
3924
                    if (((_phonet_rules[pos] is None) or
3925
                         (_phonet_rules[pos + mode] is None))):
3926
                        # no conversion rule available
3927
                        pos += 3
3928
                        continue
3929
3930
                    if trace:
3931
                        _trinfo('> rule no.', pos, 'is being checked', lang)
3932
3933
                    # check whole string
3934
                    matches = 1  # number of matching letters
3935
                    priority = 5  # default priority
3936
                    rule = _phonet_rules[pos]
3937
                    rule = rule[1:]
3938
3939
                    while (rule and
3940
                           (len(src) > (i + matches)) and
3941
                           (src[i + matches] == rule[0]) and
3942
                           not rule[0].isdigit() and
3943
                           (rule not in '(-<^$')):
3944
                        matches += 1
3945
                        rule = rule[1:]
3946
3947
                    if rule and (rule[0] == '('):
3948
                        # check an array of letters
3949
                        if (((len(src) > (i + matches)) and
3950
                             src[i + matches].isalpha() and
3951
                             (src[i + matches] in rule[1:]))):
3952
                            matches += 1
3953
3954
                            while rule and rule[0] != ')':
3955
                                rule = rule[1:]
3956
3957
                            # if rule[0] == ')':
3958
                            rule = rule[1:]
3959
3960
                    if rule:
3961
                        priority0 = ord(rule[0])
3962
                    else:
3963
                        priority0 = 0
3964
3965
                    matches0 = matches
3966
3967
                    while rule and rule[0] == '-' and matches > 1:
3968
                        matches -= 1
3969
                        rule = rule[1:]
3970
3971
                    if rule and rule[0] == '<':
3972
                        rule = rule[1:]
3973
3974
                    if rule and rule[0].isdigit():
3975
                        # read priority
3976
                        priority = int(rule[0])
3977
                        rule = rule[1:]
3978
3979
                    if rule and rule[0:2] == '^^':
3980
                        rule = rule[1:]
3981
3982
                    if (not rule or
3983
                            ((rule[0] == '^') and
3984
                             ((i == 0) or not src[i-1].isalpha()) and
3985
                             ((rule[1:2] != '$') or
3986
                              (not (src[i+matches0:i+matches0+1].isalpha()) and
3987
                               (src[i+matches0:i+matches0+1] != '.')))) or
3988
                            ((rule[0] == '$') and (i > 0) and
3989
                             src[i-1].isalpha() and
3990
                             ((not src[i+matches0:i+matches0+1].isalpha()) and
3991
                              (src[i+matches0:i+matches0+1] != '.')))):
3992
                        # look for continuation, if:
3993
                        # matches > 1 und NO '-' in first string */
3994
                        pos0 = -1
3995
3996
                        start3 = 0
3997
                        start4 = 0
3998
                        end3 = 0
3999
                        end4 = 0
4000
4001
                        if (((matches > 1) and
4002
                             src[i+matches:i+matches+1] and
4003
                             (priority0 != ord('-')))):
4004
                            char0 = src[i+matches-1]
4005
                            pos0 = alpha_pos[char0]
4006
4007
                            if pos0 >= 2 and src[i+matches]:
4008
                                xpos = pos0 - 2
4009
                                pos0 = alpha_pos[src[i+matches]]
4010
                                start3 = phonet_hash_1[xpos, pos0]
4011
                                start4 = phonet_hash_1[xpos, 0]
4012
                                end3 = phonet_hash_2[xpos, pos0]
4013
                                end4 = phonet_hash_2[xpos, 0]
4014
4015
                                # preserve rule priorities
4016
                                if (((start4 >= 0) and
4017
                                     ((start3 < 0) or (start4 < start3)))):
4018
                                    pos0 = start3
4019
                                    start3 = start4
4020
                                    start4 = pos0
4021
                                    pos0 = end3
4022
                                    end3 = end4
4023
                                    end4 = pos0
4024
4025
                                if (end3 >= start4) and (start4 >= 0):
4026
                                    if end4 > end3:
4027
                                        end3 = end4
4028
4029
                                    start4 = -1
4030
                                    end4 = -1
4031
                            else:
4032
                                pos0 = phonet_hash[char0]
4033
                                start3 = pos0
4034
                                end3 = 10000
4035
                                start4 = -1
4036
                                end4 = -1
4037
4038
                            pos0 = start3
4039
4040
                        # check continuation rules for src[i+matches]
4041
                        if pos0 >= 0:
4042
                            while ((_phonet_rules[pos0] is None) or
4043
                                   (_phonet_rules[pos0][0] == char0)):
4044
                                if pos0 > end3:
4045
                                    if start4 > 0:
4046
                                        pos0 = start4
4047
                                        start3 = start4
4048
                                        start4 = -1
4049
                                        end3 = end4
4050
                                        end4 = -1
4051
                                        continue
4052
4053
                                    priority0 = -1
4054
4055
                                    # important
4056
                                    break
4057
4058
                                if (((_phonet_rules[pos0] is None) or
4059
                                     (_phonet_rules[pos0 + mode] is None))):
4060
                                    # no conversion rule available
4061
                                    pos0 += 3
4062
                                    continue
4063
4064
                                if trace:
4065
                                    _trinfo('> > continuation rule no.', pos0,
4066
                                            'is being checked', lang)
4067
4068
                                # check whole string
4069
                                matches0 = matches
4070
                                priority0 = 5
4071
                                rule = _phonet_rules[pos0]
4072
                                rule = rule[1:]
4073
4074
                                while (rule and
4075
                                       (src[i+matches0:i+matches0+1] ==
4076
                                        rule[0]) and
4077
                                       (not rule[0].isdigit() or
4078
                                        (rule in '(-<^$'))):
4079
                                    matches0 += 1
4080
                                    rule = rule[1:]
4081
4082
                                if rule and rule[0] == '(':
4083
                                    # check an array of letters
4084
                                    if ((src[i+matches0:i+matches0+1]
4085
                                         .isalpha() and
4086
                                         (src[i+matches0] in rule[1:]))):
4087
                                        matches0 += 1
4088
4089
                                        while rule and rule[0] != ')':
4090
                                            rule = rule[1:]
4091
4092
                                        # if rule[0] == ')':
4093
                                        rule = rule[1:]
4094
4095
                                while rule and rule[0] == '-':
4096
                                    # "matches0" is NOT decremented
4097
                                    # because of  "if (matches0 == matches)"
4098
                                    rule = rule[1:]
4099
4100
                                if rule and rule[0] == '<':
4101
                                    rule = rule[1:]
4102
4103
                                if rule and rule[0].isdigit():
4104
                                    priority0 = int(rule[0])
4105
                                    rule = rule[1:]
4106
4107
                                if (not rule or
4108
                                        # rule == '^' is not possible here
4109
                                        ((rule[0] == '$') and not
4110
                                         src[i+matches0:i+matches0+1]
4111
                                         .isalpha() and
4112
                                         (src[i+matches0:i+matches0+1]
4113
                                          != '.'))):
4114
                                    if matches0 == matches:
4115
                                        # this is only a partial string
4116
                                        if trace:
4117
                                            _trinfo('> > continuation ' +
4118
                                                    'rule no.',
4119
                                                    pos0,
4120
                                                    'not used (too short)',
4121
                                                    lang)
4122
4123
                                        pos0 += 3
4124
                                        continue
4125
4126
                                    if priority0 < priority:
4127
                                        # priority is too low
4128
                                        if trace:
4129
                                            _trinfo('> > continuation ' +
4130
                                                    'rule no.',
4131
                                                    pos0,
4132
                                                    'not used (priority)',
4133
                                                    lang)
4134
4135
                                        pos0 += 3
4136
                                        continue
4137
4138
                                    # continuation rule found
4139
                                    break
4140
4141
                                if trace:
4142
                                    _trinfo('> > continuation rule no.', pos0,
4143
                                            'not used', lang)
4144
4145
                                pos0 += 3
4146
4147
                            # end of "while"
4148
                            if ((priority0 >= priority) and
4149
                                    ((_phonet_rules[pos0] is not None) and
4150
                                     (_phonet_rules[pos0][0] == char0))):
4151
4152
                                if trace:
4153
                                    _trinfo('> rule no.', pos, '', lang)
4154
                                    _trinfo('> not used because of ' +
4155
                                            'continuation', pos0, '', lang)
4156
4157
                                pos += 3
4158
                                continue
4159
4160
                        # replace string
4161
                        if trace:
4162
                            _trinfo('Rule no.', pos, 'is applied', lang)
4163
4164
                        if ((_phonet_rules[pos] and
4165
                             ('<' in _phonet_rules[pos][1:]))):
4166
                            priority0 = 1
4167
                        else:
4168
                            priority0 = 0
4169
4170
                        rule = _phonet_rules[pos + mode]
4171
4172
                        if (priority0 == 1) and (zeta == 0):
4173
                            # rule with '<' is applied
4174
                            if ((j > 0) and rule and
4175
                                    ((dest[j-1] == char) or
4176
                                     (dest[j-1] == rule[0]))):
4177
                                j -= 1
4178
4179
                            zeta0 = 1
4180
                            zeta += 1
4181
                            matches0 = 0
4182
4183
                            while rule and src[i+matches0]:
4184
                                src = (src[0:i+matches0] + rule[0] +
4185
                                       src[i+matches0+1:])
4186
                                matches0 += 1
4187
                                rule = rule[1:]
4188
4189
                            if matches0 < matches:
4190
                                src = (src[0:i+matches0] +
4191
                                       src[i+matches:])
4192
4193
                            char = src[i]
4194
                        else:
4195
                            i = i + matches - 1
4196
                            zeta = 0
4197
4198
                            while len(rule) > 1:
4199
                                if (j == 0) or (dest[j - 1] != rule[0]):
4200
                                    dest = (dest[0:j] + rule[0] +
4201
                                            dest[min(len(dest), j+1):])
4202
                                    j += 1
4203
4204
                                rule = rule[1:]
4205
4206
                            # new "current char"
4207
                            if not rule:
4208
                                rule = ''
4209
                                char = ''
4210
                            else:
4211
                                char = rule[0]
4212
4213
                            if ((_phonet_rules[pos] and
4214
                                 '^^' in _phonet_rules[pos][1:])):
4215
                                if char:  # pragma: no branch
4216
                                    dest = (dest[0:j] + char +
4217
                                            dest[min(len(dest), j + 1):])
4218
                                    j += 1
4219
4220
                                src = src[i + 1:]
4221
                                i = 0
4222
                                zeta0 = 1
4223
4224
                        break
4225
4226
                    pos += 3
4227
4228
                    if pos > end1 and start2 > 0:
4229
                        pos = start2
4230
                        start1 = start2
4231
                        end1 = end2
4232
                        start2 = -1
4233
                        end2 = -1
4234
4235
            if zeta0 == 0:
4236
                if char and ((j == 0) or (dest[j-1] != char)):
4237
                    # delete multiple letters only
4238
                    dest = dest[0:j] + char + dest[min(j+1, term_length):]
4239
                    j += 1
4240
4241
                i += 1
4242
                zeta = 0
4243
4244
        dest = dest[0:j]
4245
4246
        return dest
4247
4248
    _initialize_phonet(lang)
4249
4250
    word = unicodedata.normalize('NFKC', text_type(word))
4251
    return _phonet(word, mode, lang, trace)
4252
4253
4254
def spfc(word):
    """Return the Standardized Phonetic Frequency Code (SPFC) of a word.

    Standardized Phonetic Frequency Code is roughly Soundex-like.
    This implementation is based on page 19-21 of
    https://archive.org/stream/accessingindivid00moor#page/19/mode/1up

    Two names are required. A string is split once on its first period or,
    if it has no period, on its first space. Any other iterable must yield
    exactly two items: the first name followed by the last name.

    :param word: the name to transform (a str, or a two-item tuple/list of
        the first and last names)
    :returns: the SPFC value, or an empty string if word is empty
    :rtype: str
    :raises AttributeError: if word cannot be divided into exactly two names

    >>> spfc('Christopher Smith')
    '01160'
    >>> spfc('Christopher Schmidt')
    '01160'
    >>> spfc('Niall Smith')
    '01660'
    >>> spfc('Niall Schmidt')
    '01660'

    >>> spfc('L.Smith')
    '01960'
    >>> spfc('R.Miller')
    '65490'

    >>> spfc(('L', 'Smith'))
    '01960'
    >>> spfc(('R', 'Miller'))
    '65490'
    """
    # Letter-to-digit tables PF1, PF2 and PF3, keyed by code point so that
    # they can be handed directly to str.translate().
    _pf1 = dict(zip((ord(_) for _ in 'SZCKQVFPUWABLORDHIEMNXGJT'),
                    '0011112222334445556666777'))
    _pf2 = dict(zip((ord(_) for _ in 'SZCKQFPXABORDHIMNGJTUVWEL'),
                    '0011122233445556677788899'))
    _pf3 = dict(zip((ord(_) for _ in 'BCKQVDTFLPGJXMNRSZAEHIOUWY'),
                    '00000112223334456677777777'))

    # Multi-letter endings checked in step 5, in priority order. Every
    # ending within one entry has the same length.
    _pf3_suffixes = ((('STN', 'PRS'), '8'),
                     (('SN',), '8'),
                     (('STR',), '9'),
                     (('SR', 'TN', 'TD'), '9'),
                     (('DRS',), '7'),
                     (('TR', 'MN'), '7'))

    _substitutions = (('DK', 'K'), ('DT', 'T'), ('SC', 'S'), ('KN', 'N'),
                      ('MN', 'N'))

    _letters = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    _dropped = frozenset('AEHIOUWY')

    def _raise_word_ex():
        """Raise an AttributeError."""
        raise AttributeError('word attribute must be a string with a space ' +
                             'or period dividing the first and last names ' +
                             'or a tuple/list consisting of the first and ' +
                             'last names')

    def steps_one_to_three(name):
        """Perform the first three steps of SPFC."""
        # filter out non A-Z
        name = ''.join(_ for _ in name if _ in _letters)

        # 1. In the field, convert DK to K, DT to T, SC to S, KN to N,
        # and MN to N
        for subst in _substitutions:
            name = name.replace(subst[0], subst[1])

        # 2. In the name field, replace multiple letters with a single letter
        name = _delete_consecutive_repeats(name)

        # 3. Remove vowels, W, H, and Y, but keep the first letter in the name
        # field.
        if name:
            name = name[0] + ''.join(_ for _ in name[1:]
                                     if _ not in _dropped)
        return name

    def _prepare(name):
        """Uppercase and decompose a raw name, then apply steps 1-3."""
        name = name.strip().replace('ß', 'SS').upper()
        return steps_one_to_three(unicodedata.normalize('NFKD',
                                                        text_type(name)))

    if not word:
        return ''

    if isinstance(word, (str, text_type)):
        names = word.split('.', 1)
        if len(names) != 2:
            names = word.split(' ', 1)
            if len(names) != 2:
                _raise_word_ex()
    elif hasattr(word, '__iter__'):
        # Materialise first so that iterables without __len__ (generators,
        # for example) are validated instead of failing on len().
        names = list(word)
        if len(names) != 2:
            _raise_word_ex()
    else:
        _raise_word_ex()

    first, last = [_prepare(_) for _ in names]
    code = ''

    # 4. The first digit of the code is obtained using PF1 and the first letter
    # of the name field. Remove this letter after coding.
    if last:
        code += last[0].translate(_pf1)
        last = last[1:]

    # 5. Using the last letters of the name, use Table PF3 to obtain the
    # second digit of the code. Use as many letters as possible and remove
    # after coding.
    if last:
        for suffixes, digit in _pf3_suffixes:
            if last.endswith(suffixes):
                code += digit
                last = last[:-len(suffixes[0])]
                break
        else:
            # no multi-letter ending applies: code the final letter alone
            code += last[-1].translate(_pf3)
            last = last[:-1]

    # 6. The third digit is found using Table PF2 and the first character of
    # the first name. Remove after coding.
    if first:
        code += first[0].translate(_pf2)

    # 7. The fourth digit is found using Table PF2 and the first character of
    # the name field. If no letters remain use zero. After coding remove the
    # letter.
    # 8. The fifth digit is found in the same manner as the fourth using the
    # remaining characters of the name field if any.
    for _ in range(2):
        if last:
            code += last[0].translate(_pf2)
            last = last[1:]
        else:
            code += '0'

    return code
4400
4401
4402
def statistics_canada(word, maxlength=4):
    """Return the Statistics Canada code for a word.

    The original description of this algorithm could not be located, and
    may only have been specified in an unpublished TR. The coding does not
    appear to be in use by Statistics Canada any longer. In its place, this is
    an implementation of the "Census modified Statistics Canada name coding
    procedure".

    The modified version of this algorithm is described in Appendix B of
    Lynch, Billy T. and William L. Arends. `Selection of a Surname Coding
    Procedure for the SRS Record Linkage System.` Statistical Reporting
    Service, U.S. Department of Agriculture, Washington, D.C. February 1977.
    https://naldc.nal.usda.gov/download/27833/PDF

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 4) of the code to return
    :returns: the Statistics Canada name code value, or an empty string if
        word contains no letters A-Z
    :rtype: str

    >>> statistics_canada('Christopher')
    'CHRS'
    >>> statistics_canada('Niall')
    'NL'
    >>> statistics_canada('Smith')
    'SMTH'
    >>> statistics_canada('Schmidt')
    'SCHM'
    """
    _letters = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    _vowels = frozenset('AEIOUY')

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in _letters)
    if not word:
        return ''

    # The first letter is always kept; vowels (including Y) are dropped from
    # the remainder before runs of repeated letters are collapsed.
    code = word[0] + ''.join(c for c in word[1:] if c not in _vowels)
    code = _delete_consecutive_repeats(code)
    # Spaces were already filtered out above; kept from the original as a
    # defensive no-op.
    code = code.replace(' ', '')

    return code[:maxlength]
4450
4451
4452
def lein(word, maxlength=4, zero_pad=True):
    """Return the Lein code for a word.

    This is Lein name coding, based on
    https://naldc.nal.usda.gov/download/27833/PDF

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 4) of the code to return
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Lein code
    :rtype: str

    >>> lein('Christopher')
    'C351'
    >>> lein('Niall')
    'N300'
    >>> lein('Smith')
    'S210'
    >>> lein('Schmidt')
    'S521'
    """
    # consonant -> digit table used by Rule 4
    digit_for = dict(zip(map(ord, 'BCDFGJKLMNPQRSTVXZ'),
                         '451455532245351455'))
    # characters deleted by Rule 2 (each code point maps to None)
    dropped = dict.fromkeys(ord(ch) for ch in ' AEHIOUWY')
    uppercase = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    # uppercase, normalize, decompose, and keep only A-Z
    normalized = unicodedata.normalize('NFKD', text_type(word.upper()))
    normalized = normalized.replace('ß', 'SS')
    letters = ''.join(ch for ch in normalized if ch in uppercase)

    if not letters:
        return ''

    head, tail = letters[0], letters[1:]  # Rule 1: retain the first letter
    tail = tail.translate(dropped)  # Rule 2
    tail = _delete_consecutive_repeats(tail)  # Rule 3
    code = head + tail.translate(digit_for)  # Rule 4

    if zero_pad:
        # over-pad, then let the final slice trim to size (Rule 4)
        code += '0' * maxlength

    return code[:maxlength]
4500
4501
4502
def roger_root(word, maxlength=5, zero_pad=True):
    """Return the Roger Root code for a word.

    This is Roger Root name coding, based on
    https://naldc.nal.usda.gov/download/27833/PDF

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 5) of the code to return
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the Roger Root code
    :rtype: str

    >>> roger_root('Christopher')
    '06401'
    >>> roger_root('Niall')
    '02500'
    >>> roger_root('Smith')
    '00310'
    >>> roger_root('Schmidt')
    '06310'
    """
    uppercase = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    # uppercase, normalize, decompose, and keep only A-Z
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(ch for ch in word if ch in uppercase)

    if not word:
        return ''

    # Codes for the start of the word, grouped by pattern length.
    # '*' is used to prevent combining by _delete_consecutive_repeats()
    initial_codes = {
        4: {'TSCH': '06'},
        3: {'TSH': '06', 'SCH': '06'},
        2: {'CE': '0*0', 'CH': '06', 'CI': '0*0', 'CY': '0*0', 'DG': '07',
            'GF': '08', 'GM': '03', 'GN': '02', 'KN': '02', 'PF': '08',
            'PH': '08', 'PN': '02', 'SH': '06', 'TS': '0*0', 'WR': '04'},
        1: {'A': '1', 'B': '09', 'C': '07', 'D': '01', 'E': '1', 'F': '08',
            'G': '07', 'H': '2', 'I': '1', 'J': '3', 'K': '07', 'L': '05',
            'M': '03', 'N': '02', 'O': '1', 'P': '09', 'Q': '07', 'R': '04',
            'S': '0*0', 'T': '01', 'U': '1', 'V': '08', 'W': '4', 'X': '07',
            'Y': '5', 'Z': '0*0'},
    }

    # Codes for everything after the initial pattern, grouped by length.
    medial_codes = {
        4: {'TSCH': '6'},
        3: {'TSH': '6', 'SCH': '6'},
        2: {'CE': '0', 'CH': '6', 'CI': '0', 'CY': '0', 'DG': '7',
            'PH': '8', 'SH': '6', 'TS': '0'},
        1: {'B': '9', 'C': '7', 'D': '1', 'F': '8', 'G': '7', 'J': '6',
            'K': '7', 'L': '5', 'M': '3', 'N': '2', 'P': '9', 'Q': '7',
            'R': '4', 'S': '0', 'T': '1', 'V': '8', 'X': '7', 'Z': '0',
            'A': '*', 'E': '*', 'H': '*', 'I': '*', 'O': '*', 'U': '*',
            'W': '*', 'Y': '*'},
    }

    def _longest_match(tables, start):
        """Return (code, width) for the longest pattern found at start.

        Falls back to ('', 1) so the caller advances past a character that
        no table recognizes.
        """
        for width in (4, 3, 2, 1):
            chunk = word[start:start + width]
            if chunk in tables[width]:
                return tables[width][chunk], width
        return '', 1

    # Do first digit(s) first
    code, pos = _longest_match(initial_codes, 0)

    # Then code subsequent digits
    word_length = len(word)
    while pos < word_length:
        piece, width = _longest_match(medial_codes, pos)
        code += piece
        pos += width

    # Collapse runs first, then drop the '*' separators that kept
    # deliberately repeated digits apart.
    code = _delete_consecutive_repeats(code).replace('*', '')

    if zero_pad:
        code += '0' * maxlength

    return code[:maxlength]
4589
4590
4591
def onca(word, maxlength=4, zero_pad=True):
    """Return the Oxford Name Compression Algorithm (ONCA) code for a word.

    This is the Oxford Name Compression Algorithm, based on:
    Gill, Leicester E. 1997. "OX-LINK: The Oxford Medical Record Linkage
    System." In ``Record Linkage Techniques -- 1997``. Arlington, VA. March
    20--21, 1997.
    https://nces.ed.gov/FCSM/pdf/RLT97.pdf

    I can find no complete description of the "anglicised version of the NYSIIS
    method" identified as the first step in this algorithm, so this is likely
    not a correct implementation, in that it employs the standard NYSIIS
    algorithm.

    :param str word: the word to transform
    :param int maxlength: the maximum length (default 4) of the code to return
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        maxlength string
    :returns: the ONCA code
    :rtype: str

    >>> onca('Christopher')
    'C623'
    >>> onca('Niall')
    'N400'
    >>> onca('Smith')
    'S530'
    >>> onca('Schmidt')
    'S530'
    """
    # In the most extreme case, 3 characters of NYSIIS input can be compressed
    # to one character of output, so give it triple the maxlength.
    nysiis_code = nysiis(word, maxlength=maxlength*3)
    return soundex(nysiis_code, maxlength, zero_pad=zero_pad)
4625
4626
4627
def eudex(word, maxlength=8):
    """Return the eudex phonetic hash of a word.

    This implementation of eudex phonetic hashing is based on the specification
    (not the reference implementation) at:
    Ticki. 2017. "Eudex: A blazingly fast phonetic reduction/hashing
    algorithm." https://docs.rs/crate/eudex

    Further details can be found at
    http://ticki.github.io/blog/the-eudex-algorithm/

    The first codable character is looked up in the table of initial phones,
    every following character in the table of trailing phones. Adjacent
    characters whose codes differ at most in their lowest bit are condensed
    to the first of them. The hash is the big-endian concatenation of
    ``maxlength`` byte values: the initial value, zero padding, then the
    condensed trailing values.

    Characters that appear in neither table are ignored. If no codable
    character remains (e.g. for an empty string), the word is treated as the
    placeholder character '÷', whose code is 0xFF.

    :param str word: the word to transform
    :param int maxlength: the length of the code returned, in bytes
        (defaults to 8)
    :returns: the eudex hash
    :rtype: int
    """
    _trailing_phones = {
        'a': 0,
        'b': 0b01001000,
        'c': 0b00001100,
        'd': 0b00011000,
        'e': 0,
        'f': 0b01000100,
        'g': 0b00001000,
        'h': 0b00000100,
        'i': 1,
        'j': 0b00000101,
        'k': 0b00001001,
        'l': 0b10100000,
        'm': 0b00000010,
        'n': 0b00010010,
        'o': 0,
        'p': 0b01001001,
        'q': 0b10101000,
        'r': 0b10100001,
        's': 0b00010100,
        't': 0b00011101,
        'u': 1,
        'v': 0b01000101,
        'w': 0b00000000,
        'x': 0b10000100,
        'y': 1,
        'z': 0b10010100,

        'ß': 0b00010101,
        'à': 0,
        'á': 0,
        'â': 0,
        'ã': 0,
        'ä': 0,  # [æ]
        'å': 1,  # [oː]
        'æ': 0,  # [æ]
        'ç': 0b10010101,  # [t͡ʃ]
        'è': 1,
        'é': 1,
        'ê': 1,
        'ë': 1,
        'ì': 1,
        'í': 1,
        'î': 1,
        'ï': 1,
        'ð': 0b00010101,  # [ð̠] (represented as a non-plosive T)
        'ñ': 0b00010111,  # [nj] (represented as a combination of n and j)
        'ò': 0,
        'ó': 0,
        'ô': 0,
        'õ': 0,
        'ö': 1,  # [ø]
        '÷': 0b11111111,
        'ø': 1,  # [ø]
        'ù': 1,
        'ú': 1,
        'û': 1,
        'ü': 1,
        'ý': 1,
        'þ': 0b00010101,  # [ð̠] (represented as a non-plosive T)
        'ÿ': 1,
    }

    _initial_phones = {
        'a': 0b10000100,  # a*
        'b': 0b00100100,
        'c': 0b00000110,
        'd': 0b00001100,
        'e': 0b11011000,  # e*
        'f': 0b00100010,
        'g': 0b00000100,
        'h': 0b00000010,
        'i': 0b11111000,  # i*
        'j': 0b00000011,
        'k': 0b00000101,
        'l': 0b01010000,
        'm': 0b00000001,
        'n': 0b00001001,
        'o': 0b10010100,  # o*
        'p': 0b00100101,
        'q': 0b01010100,
        'r': 0b01010001,
        's': 0b00001010,
        't': 0b00001110,
        'u': 0b11100000,  # u*
        'v': 0b00100011,
        'w': 0b00000000,
        'x': 0b01000010,
        'y': 0b11100100,  # y*
        'z': 0b01001010,

        'ß': 0b00001011,
        'à': 0b10000101,
        'á': 0b10000101,
        'â': 0b10000000,
        'ã': 0b10000110,
        'ä': 0b10100110,  # [æ]
        'å': 0b11000010,  # [oː]
        'æ': 0b10100111,  # [æ]
        'ç': 0b01010100,  # [t͡ʃ]
        'è': 0b11011001,
        'é': 0b11011001,
        'ê': 0b11011001,
        'ë': 0b11000110,  # [ə] or [œ]
        'ì': 0b11111001,
        'í': 0b11111001,
        'î': 0b11111001,
        'ï': 0b11111001,
        'ð': 0b00001011,  # [ð̠] (represented as a non-plosive T)
        'ñ': 0b00001011,  # [nj] (represented as a combination of n and j)
        'ò': 0b10010101,
        'ó': 0b10010101,
        'ô': 0b10010101,
        'õ': 0b10010101,
        'ö': 0b11011100,  # [œ] or [ø]
        '÷': 0b11111111,
        'ø': 0b11011101,  # [œ] or [ø]
        'ù': 0b11100001,
        'ú': 0b11100001,
        'û': 0b11100001,
        'ü': 0b11100101,
        'ý': 0b11100101,
        'þ': 0b00001011,  # [ð̠] (represented as a non-plosive T)
        'ÿ': 0b11100101,
    }

    # Lowercase input & filter unknown characters
    word = ''.join(char for char in word.lower() if char in _initial_phones)

    # Without any codable character there is no initial phone to look up
    # (word[0] would raise IndexError); fall back to the tables' non-letter
    # placeholder so that the hash stays well defined.
    if not word:
        word = '÷'

    # Perform initial eudex coding of each character
    values = [_initial_phones[word[0]]]
    values += [_trailing_phones[char] for char in word[1:]]

    # Two neighbouring values that are equal after a right-shift by one
    # (i.e. differ at most in the lowest bit) count as a repeat: skip the
    # second instance.
    condensed_values = [values[0]]
    for previous, current in zip(values, values[1:]):
        if current >> 1 != previous >> 1:
            condensed_values.append(current)

    # Add padding after first character & trim beyond maxlength
    values = ([condensed_values[0]] +
              [0]*max(0, maxlength - len(condensed_values)) +
              condensed_values[1:maxlength])

    # Combine individual character values into eudex hash
    hash_value = 0
    for val in values:
        hash_value = (hash_value << 8) | val

    return hash_value
4793
4794
4795
def haase_phonetik(word, primary_only=False):
    """Return the Haase Phonetik (numeric output) code for a word.

    Based on the algorithm described at
    https://github.com/elastic/elasticsearch/blob/master/plugins/analysis-phonetic/src/main/java/org/elasticsearch/index/analysis/phonetic/HaasePhonetik.java

    Based on the original
    Haase, Martin and Kai Heitmann. 2000. Die Erweiterte Kölner Phonetik.

    While the output code is numeric, it is still a str.

    :param str word: the word to transform
    :param bool primary_only: if True, only the code of the word as spelled
        is returned; otherwise codes are returned for every combination of
        the recognized spelling variants (e.g. 'CH'/'SCH', 'RB'/'RW')
    :returns: the Haase Phonetik values as numeric strings, one per variant;
        an empty str (not a tuple) if word contains no letters A-Z after
        normalization
    :rtype: tuple
    """
    def _after(word, i, letters):
        """Return True if word[i] follows one of the supplied letters."""
        return i > 0 and word[i-1] in letters

    def _before(word, i, letters):
        """Return True if word[i] precedes one of the supplied letters."""
        return i+1 < len(word) and word[i+1] in letters

    _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}

    # Expand umlauts and eszett *before* the NFKD decomposition: NFKD splits
    # e.g. 'Ä' into 'A' + a combining diaeresis, after which a replacement of
    # the precomposed letter can no longer match. NFC first, so that input
    # that is already decomposed is handled as well.
    word = unicodedata.normalize('NFC', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = word.replace('Ä', 'AE')
    word = word.replace('Ö', 'OE')
    word = word.replace('Ü', 'UE')

    # Strip the remaining diacritics and everything that is not A-Z
    word = unicodedata.normalize('NFKD', word)
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    # Nothing to convert, return base case
    if not word:
        return ''

    if primary_only:
        variants = [word]
    else:
        # Split the word into segments; each segment is a tuple holding the
        # original spelling and, where a rule applies, its alternative.
        segments = []
        pos = 0
        if word[:2] == 'CH':
            segments.append(('CH', 'SCH'))
            pos += 2
        len_3_vars = {'OWN': 'AUN', 'WSK': 'RSK', 'SCH': 'CH', 'GLI': 'LI',
                      'AUX': 'O', 'EUX': 'O'}
        while pos < len(word):
            if word[pos:pos+4] == 'ILLE':
                segments.append(('ILLE', 'I'))
                pos += 4
            elif word[pos:pos+3] in len_3_vars:
                segments.append((word[pos:pos+3],
                                 len_3_vars[word[pos:pos+3]]))
                pos += 3
            elif word[pos:pos+2] == 'RB':
                segments.append(('RB', 'RW'))
                pos += 2
            elif word[pos:] == 'EAU':  # word-final only
                segments.append(('EAU', 'O'))
                pos += 3
            elif word[pos:] == 'O':  # word-final only
                segments.append(('O', 'OW'))
                pos += 1
            elif word[pos:] == 'A':  # word-final only
                segments.append(('A', 'AR'))
                pos += 1
            else:
                segments.append((word[pos],))
                pos += 1

        # Every combination of segment spellings is one variant
        variants = [''.join(letters) for letters in product(*segments)]

    def _haase_code(word):
        """Return the numeric code of a single (variant) spelling."""
        sdx = ''
        for i, char in enumerate(word):
            if char in _vowels:
                sdx += '9'
            elif char == 'B':
                sdx += '1'
            elif char == 'P':
                if _before(word, i, {'H'}):
                    sdx += '3'
                else:
                    sdx += '1'
            elif char in {'D', 'T'}:
                if _before(word, i, {'C', 'S', 'Z'}):
                    sdx += '8'
                else:
                    sdx += '2'
            elif char in {'F', 'V', 'W'}:
                sdx += '3'
            elif char in {'G', 'K', 'Q'}:
                sdx += '4'
            elif char == 'C':
                if _after(word, i, {'S', 'Z'}):
                    sdx += '8'
                elif i == 0:
                    # Word-initial C has a larger set of "hard" contexts
                    if _before(word, i, {'A', 'H', 'K', 'L', 'O', 'Q', 'R',
                                         'U', 'X'}):
                        sdx += '4'
                    else:
                        sdx += '8'
                elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}):
                    sdx += '4'
                else:
                    sdx += '8'
            elif char == 'X':
                if _after(word, i, {'C', 'K', 'Q'}):
                    sdx += '8'
                else:
                    sdx += '48'
            elif char == 'L':
                sdx += '5'
            elif char in {'M', 'N'}:
                sdx += '6'
            elif char == 'R':
                sdx += '7'
            elif char in {'S', 'Z'}:
                sdx += '8'
            # any other letter (i.e. H) contributes nothing

        # NOTE: '9' (vowel) codes are kept in the output, also in
        # non-initial position.
        return _delete_consecutive_repeats(sdx)

    return tuple(_haase_code(variant) for variant in variants)
4930
4931
4932
def reth_schek_phonetik(word):
    """Return Reth-Schek Phonetik code for a word.

    This algorithm is proposed in:
    von Reth, Hans-Peter and Schek, Hans-Jörg. 1977. "Eine Zugriffsmethode für
    die phonetische Ähnlichkeitssuche." Heidelberg Scientific Center technical
    reports 77.03.002. IBM Deutschland GmbH.

    Since I couldn't secure a copy of that document (maybe I'll look for it
    next time I'm in Germany), this implementation is based on what I could
    glean from the implementations published by German Record Linkage
    Center (www.record-linkage.de):
    - Privacy-preserving Record Linkage (PPRL) (in R)
    - Merge ToolBox (in Java)

    Rules that are unclear:
    - Should 'C' become 'G' or 'Z'? (PPRL has both, 'Z' rule blocked)
    - Should 'CC' become 'G'? (PPRL has blocked 'CK' that may be typo)
    - Should 'TUI' -> 'ZUI' rule exist? (PPRL has rule, but I can't
        think of a German word with '-tui-' in it.)
    - Should we really change 'SCH' -> 'CH' and then 'CH' -> 'SCH'?

    :param str word: the word to transform
    :returns: the Reth-Schek Phonetik code, in upper case; an empty string
        for empty input
    :rtype: str
    """
    # Substitutions keyed by the length of the matched sequence
    replacements = {3: {'AEH': 'E', 'IEH': 'I', 'OEH': 'OE', 'UEH': 'UE',
                        'SCH': 'CH', 'ZIO': 'TIO', 'TIU': 'TIO', 'ZIU': 'TIO',
                        'CHS': 'X', 'CKS': 'X', 'AEU': 'OI'},
                    2: {'LL': 'L', 'AA': 'A', 'AH': 'A', 'BB': 'B', 'PP': 'B',
                        'BP': 'B', 'PB': 'B', 'DD': 'D', 'DT': 'D', 'TT': 'D',
                        'TH': 'D', 'EE': 'E', 'EH': 'E', 'AE': 'E', 'FF': 'F',
                        'PH': 'F', 'KK': 'K', 'GG': 'G', 'GK': 'G', 'KG': 'G',
                        'CK': 'G', 'CC': 'C', 'IE': 'I', 'IH': 'I', 'MM': 'M',
                        'NN': 'N', 'OO': 'O', 'OH': 'O', 'SZ': 'S', 'UH': 'U',
                        'GS': 'X', 'KS': 'X', 'TZ': 'Z', 'AY': 'AI',
                        'EI': 'AI', 'EY': 'AI', 'EU': 'OI', 'RR': 'R',
                        'SS': 'S', 'KW': 'QU'},
                    1: {'P': 'B', 'T': 'D', 'V': 'F', 'W': 'F', 'C': 'G',
                        'K': 'G', 'Y': 'I'}}

    # Uppercase
    word = word.upper()

    # Replace umlauts/eszett
    word = word.replace('Ä', 'AE')
    word = word.replace('Ö', 'OE')
    word = word.replace('Ü', 'UE')
    word = word.replace('ß', 'SS')

    # Main loop, using above replacements table. At each position the
    # longest matching sequence wins. The position then advances by one
    # character only (not by the length of the replacement), so the tail of
    # a multi-character replacement is examined again.
    pos = 0
    while pos < len(word):
        for num in range(3, 0, -1):
            chunk = word[pos:pos+num]
            if chunk in replacements[num]:
                word = word[:pos] + replacements[num][chunk] + word[pos+num:]
                break
        pos += 1

    # Change 'CH' back(?) to 'SCH'
    word = word.replace('CH', 'SCH')

    # Replace final sequences. Slices (rather than word[-1]) keep this safe
    # for an empty word.
    if word[-2:] == 'ER':
        word = word[:-2]+'R'
    elif word[-2:] == 'EL':
        word = word[:-2]+'L'
    elif word[-1:] == 'H':
        word = word[:-1]

    return word
5005
5006
5007
def fonem(word):
    """Return the FONEM code of a word.

    FONEM is a phonetic algorithm designed for French (particularly surnames in
    Saguenay, Canada), defined in:
    Bouchard, Gérard, Patrick Brard, and Yolande Lavoie. 1981. "FONEM: Un code
    de transcription phonétique pour la reconstitution automatique des
    familles saguenayennes." Population. 36(6). 1085--1103.
    https://doi.org/10.2307/1532326
    http://www.persee.fr/doc/pop_0032-4663_1981_num_36_6_17248

    Guillaume Plique's Javascript implementation at
    https://github.com/Yomguithereal/talisman/blob/master/src/phonetics/french/fonem.js
    was also consulted for this implementation.

    The word is normalized to upper-case letters A-Z (plus '-'), after which
    the rewrite rules below are applied one after another, in the sequence
    given by ``rule_order``. Each rule is a ``(pattern, replacement)`` pair
    whose pattern is either a plain string or a compiled regular expression.

    :param str word: the word to transform
    :returns: the FONEM code
    :rtype: str
    """
    # I don't see a sane way of doing this without regexps :(
    rule_table = {
        # Vowels & groups of vowels
        'V-1': (re.compile('E?AU'), 'O'),
        'V-2,5': (re.compile('(E?AU|O)L[TX]$'), 'O'),
        'V-3,4': (re.compile('E?AU[TX]$'), 'O'),
        'V-6': (re.compile('E?AUL?D$'), 'O'),
        'V-7': (re.compile(r'(?<!G)AY$'), 'E'),
        'V-8': (re.compile('EUX$'), 'EU'),
        'V-9': (re.compile('EY(?=$|[BCDFGHJKLMNPQRSTVWXZ])'), 'E'),
        'V-10': ('Y', 'I'),
        'V-11': (re.compile('(?<=[AEIOUY])I(?=[AEIOUY])'), 'Y'),
        'V-12': (re.compile('(?<=[AEIOUY])ILL'), 'Y'),
        'V-13': (re.compile('OU(?=[AEOU]|I(?!LL))'), 'W'),
        'V-14': (re.compile(r'([AEIOUY])(?=\1)'), ''),
        # Nasal vowels
        'V-15': (re.compile('[AE]M(?=[BCDFGHJKLMPQRSTVWXZ])(?!$)'), 'EN'),
        'V-16': (re.compile('OM(?=[BCDFGHJKLMPQRSTVWXZ])'), 'ON'),
        'V-17': (re.compile('AN(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'EN'),
        'V-18': (re.compile('(AI[MN]|EIN)(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'),
                 'IN'),
        'V-19': (re.compile('B(O|U|OU)RNE?$'), 'BURN'),
        'V-20': (re.compile('(^IM|(?<=[BCDFGHJKLMNPQRSTVWXZ])'
                            'IM(?=[BCDFGHJKLMPQRSTVWXZ]))'),
                 'IN'),
        # Consonants and groups of consonants
        'C-1': ('BV', 'V'),
        'C-2': (re.compile('(?<=[AEIOUY])C(?=[EIY])'), 'SS'),
        'C-3': (re.compile('(?<=[BDFGHJKLMNPQRSTVWZ])C(?=[EIY])'), 'S'),
        'C-4': (re.compile('^C(?=[EIY])'), 'S'),
        'C-5': (re.compile('^C(?=[OUA])'), 'K'),
        'C-6': (re.compile('(?<=[AEIOUY])C$'), 'K'),
        'C-7': (re.compile('C(?=[BDFGJKLMNPQRSTVWXZ])'), 'K'),
        'C-8': (re.compile('CC(?=[AOU])'), 'K'),
        'C-9': (re.compile('CC(?=[EIY])'), 'X'),
        'C-10': (re.compile('G(?=[EIY])'), 'J'),
        'C-11': (re.compile('GA(?=I?[MN])'), 'G#'),
        'C-12': (re.compile('GE(O|AU)'), 'JO'),
        'C-13': (re.compile('GNI(?=[AEIOUY])'), 'GN'),
        'C-14': (re.compile('(?<![PCS])H'), ''),
        'C-15': ('JEA', 'JA'),
        'C-16': (re.compile('^MAC(?=[BCDFGHJKLMNPQRSTVWXZ])'), 'MA#'),
        'C-17': (re.compile('^MC'), 'MA#'),
        'C-18': ('PH', 'F'),
        'C-19': ('QU', 'K'),
        'C-20': (re.compile('^SC(?=[EIY])'), 'S'),
        'C-21': (re.compile('(?<=.)SC(?=[EIY])'), 'SS'),
        'C-22': (re.compile('(?<=.)SC(?=[AOU])'), 'SK'),
        'C-23': ('SH', 'CH'),
        'C-24': (re.compile('TIA$'), 'SSIA'),
        'C-25': (re.compile('(?<=[AIOUY])W'), ''),
        'C-26': (re.compile('X[CSZ]'), 'X'),
        'C-27': (re.compile('(?<=[AEIOUY])Z|'
                            '(?<=[BCDFGHJKLMNPQRSTVWXZ])Z'
                            '(?=[BCDFGHJKLMNPQRSTVWXZ])'),
                 'S'),
        'C-28': (re.compile(r'([BDFGHJKMNPQRTVWXZ])\1'), r'\1'),
        'C-28a': (re.compile('CC(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'C'),
        'C-28b': (re.compile('((?<=[BCDFGHJKLMNPQRSTVWXZ])|^)SS'), 'S'),
        'C-28bb': (re.compile('SS(?=[BCDFGHJKLMNPQRSTVWXZ]|$)'), 'S'),
        'C-28c': (re.compile('((?<=[^I])|^)LL'), 'L'),
        'C-28d': (re.compile('ILE$'), 'ILLE'),
        'C-29': (re.compile('(ILS|[CS]H|[MN]P|R[CFKLNSX])$|'
                            '([BCDFGHJKLMNPQRSTVWXZ])'
                            '[BCDFGHJKLMNPQRSTVWXZ]$'),
                 r'\1\2'),
        'C-30,32': (re.compile('^(SA?INT?|SEI[NM]|CINQ?|ST)(?!E)-?'), 'ST-'),
        'C-31,33': (re.compile('^(SAINTE|STE)-?'), 'STE-'),
        # Rules to undo rule bleeding prevention in C-11, C-16, C-17
        'C-34': ('G#', 'GA'),
        'C-35': ('MA#', 'MAC'),
    }
    # Sequence in which the rules are applied; the de-duplication rules
    # (V-14 and the C-28 family) run both first and again near the end.
    rule_order = [
        'V-14', 'C-28', 'C-28a', 'C-28b', 'C-28bb', 'C-28c', 'C-28d',
        'C-12',
        'C-8', 'C-9', 'C-10',
        'C-16', 'C-17', 'C-2', 'C-3', 'C-7',
        'V-2,5', 'V-3,4', 'V-6',
        'V-1', 'C-14',
        'C-31,33', 'C-30,32',
        'C-11', 'V-15', 'V-17', 'V-18',
        'V-7', 'V-8', 'V-9', 'V-10', 'V-11', 'V-12', 'V-13', 'V-16',
        'V-19', 'V-20',
        'C-1', 'C-4', 'C-5', 'C-6', 'C-13', 'C-15',
        'C-18', 'C-19', 'C-20', 'C-21', 'C-22', 'C-23', 'C-24',
        'C-25', 'C-26', 'C-27',
        'C-29',
        'V-14', 'C-28', 'C-28a', 'C-28b', 'C-28bb', 'C-28c', 'C-28d',
        'C-34', 'C-35',
    ]

    # normalize, upper-case, and filter non-French letters
    permitted = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ-')
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    # expand the ligatures Æ (U+00C6) and Œ (U+0152)
    word = word.translate({198: 'AE', 338: 'OE'})
    word = ''.join(char for char in word if char in permitted)

    for rule_id in rule_order:
        pattern, replacement = rule_table[rule_id]
        if not isinstance(pattern, text_type):
            # compiled regular expression
            word = pattern.sub(replacement, word)
        else:
            # plain substring replacement
            word = word.replace(pattern, replacement)

    return word
5127
5128
5129
def parmar_kumbharana(word):
    """Return the Parmar-Kumbharana encoding of a word.

    This is based on the phonetic algorithm proposed in
    Parmar, Vimal P. and CK Kumbharana. 2014. "Study Existing Various Phonetic
    Algorithms and Designing and Development of a working model for the New
    Developed Algorithm and Comparison by implementing it with Existing
    Algorithm(s)." International Journal of Computer Applications. 98(19).
    https://doi.org/10.5120/17295-7795

    :param str word: the word to transform
    :returns: the Parmar-Kumbharana encoding, in upper case; an empty string
        for empty input
    :rtype: str
    """
    # Letter-sequence substitutions, keyed by the length of the sequence
    rule_table = {4: {'OUGH': 'F'},
                  3: {'DGE': 'J',
                      'OUL': 'U',
                      'GHT': 'T'},
                  2: {'CE': 'S', 'CI': 'S', 'CY': 'S',
                      'GE': 'J', 'GI': 'J', 'GY': 'J',
                      'WR': 'R',
                      'GN': 'N', 'KN': 'N', 'PN': 'N',
                      'CK': 'K',
                      'SH': 'S'}}
    # Deletes A, E, I, O, U, and Y (keys are code points)
    vowel_trans = {65: '', 69: '', 73: '', 79: '', 85: '', 89: ''}

    word = word.upper()  # Rule 3
    word = _delete_consecutive_repeats(word)  # Rule 4

    # Rule 5: scan left to right, at each position substituting the longest
    # matching sequence.
    i = 0
    while i < len(word):
        for match_len in range(4, 1, -1):
            chunk = word[i:i+match_len]
            if chunk in rule_table[match_len]:
                repl = rule_table[match_len][chunk]
                word = word[:i] + repl + word[i+match_len:]
                i += len(repl)
                # Stop at the first (longest) match; without the break the
                # else-branch below would also run and skip a character.
                break
        else:
            i += 1  # nothing matched at this position

    # Rule 6: drop vowels everywhere but in the first position. Slicing
    # (rather than word[0]) keeps this safe for an empty word.
    return word[:1] + word[1:].translate(vowel_trans)
5170
5171
5172
def davidson(lname, fname='.', omit_fname=False):
    """Return Davidson's Consonant Code.

    The name compression system implemented here is described in:
    Davidson, Leon. 1962. "Retrieval of Misspelled Names in an Airline
    Passenger Record System." Communications of the ACM. 5(3). 169--171.
    https://dl.acm.org/citation.cfm?id=366913

    Dolby (1970) identifies this as having been the name compression algorithm
    used by SABRE.

    The code consists of four characters derived from the last name (its
    first letter followed by its remaining letters other than A, E, I, O, U,
    H, W, and Y, with consecutive repeats collapsed, truncated or padded with
    spaces to four characters), optionally followed by the upper-cased
    initial of the first name.

    :param str lname: Last name (or word) to be encoded
    :param str fname: First name (optional), of which the first character is
        included in the code.
    :param bool omit_fname: Set to True to completely omit the first character
        of the first name
    :return: Davidson's Consonant Code
    :rtype: str
    """
    # Code points of A, E, I, O, U, H, W, Y; all are deleted after the
    # initial letter
    dropped = dict.fromkeys((65, 69, 73, 79, 85, 72, 87, 89), '')

    surname = lname.upper()
    skeleton = surname[:1] + surname[1:].translate(dropped)
    code = _delete_consecutive_repeats(skeleton)[:4].ljust(4)

    if omit_fname:
        return code
    return code + fname[:1].upper()
5200
5201
5202
def sound_d(word, maxlength=4):
    """Return the SoundD code.

    SoundD is defined in
    Varol, Cihan and Coskun Bayrak. 2012. "Hybrid Matching Algorithm for
    Personal Names." Journal of Data and Information Quality, 3(4).
    doi:10.1145/2348828.2348830

    :param str word: the word to transform
    :param int maxlength: the length of the code returned (defaults to 4);
        if None, the code is neither truncated nor zero-padded
    :return: the SoundD code, a string of digits
    :rtype: str
    """
    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    # Digit assigned to each letter of the alphabet, keyed by code point
    digit_of = {ord(letter): digit for letter, digit in
                zip(alphabet, '01230120022455012623010202')}

    # Normalize to upper-case A-Z
    permitted = frozenset(alphabet)
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(char for char in word if char in permitted)

    # Initial letter(s) adjustments
    if word.startswith(('KN', 'GN', 'PN', 'AC', 'WR')):
        word = word[1:]
    elif word.startswith('X'):
        word = 'S'+word[1:]
    elif word.startswith('WH'):
        word = 'W'+word[2:]

    # Multi-letter sequences are coded directly, before the
    # letter-by-letter translation
    for sequence, digits in (('DGE', '20'), ('DGI', '20'), ('GH', '0')):
        word = word.replace(sequence, digits)

    word = word.translate(digit_of)
    word = _delete_consecutive_repeats(word)
    word = word.replace('0', '')

    if maxlength is None:
        return word
    if len(word) < maxlength:
        return word + '0' * (maxlength-len(word))
    return word[:maxlength]
5245
5246
5247
def pshp_soundex_last(lname, maxlength=4, german=False):
    """Calculate the PSHP Soundex/Viewex Coding of a last name.

    This coding is based on Hershberg, Theodore, Alan Burstein, and Robert
    Dockhorn. 1976. "Record Linkage." Historical Methods Newsletter.
    9(2-3). 137--163. doi:10.1080/00182494.1976.10112639

    Reference was also made to the German version of the same:
    Hershberg, Theodore, Alan Burstein, and Robert Dockhorn. 1976. "Verkettung
    von Daten: Record Linkage am Beispiel des Philadelphia Social History
    Project." Moderne Stadtgeschichte. Stuttgart: Klett-Cotta, 1979.
    http://nbn-resolving.de/urn:nbn:de:0168-ssoar-327824

    A separate function, pshp_soundex_first() is used for first names.

    :param str lname: the last name to encode
    :param int maxlength: the length of the code returned (defaults to 4);
        if None, the code is neither truncated nor zero-padded
    :param bool german: set to True if the name is German (different rules
        apply)
    :return: the PSHP Soundex/Viewex Coding
    :rtype: str
    """
    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    permitted = frozenset(alphabet)

    # Normalize to upper-case A-Z
    lname = unicodedata.normalize('NFKD', text_type(lname.upper()))
    lname = lname.replace('ß', 'SS')
    lname = ''.join(char for char in lname if char in permitted)

    # A. Prefix treatment
    if lname.startswith(('VON', 'VAN')):
        lname = lname[3:].strip()

    # The rule implemented below says "MC, MAC become 1". I believe it meant to
    # say they become M except in German data (where superscripted 1 indicates
    # "except in German data"). It doesn't make sense for them to become 1
    # (BPFV -> 1) or to apply outside German. Unfortunately, both articles have
    # this error(?).
    if not german:
        if lname.startswith('MAC'):
            lname = 'M'+lname[3:]
        elif lname.startswith('MC'):
            lname = 'M'+lname[2:]

    # The non-German-only rule to strip ' is unnecessary due to filtering

    if lname.startswith(('E', 'I', 'O', 'U')):
        lname = 'A' + lname[1:]
    elif lname.startswith(('GE', 'GI', 'GY')):
        lname = 'J' + lname[1:]
    elif lname.startswith(('CE', 'CI', 'CY')):
        lname = 'S' + lname[1:]
    elif lname.startswith('CHR'):
        lname = 'K' + lname[1:]
    elif lname.startswith('C') and not lname.startswith('CH'):
        lname = 'K' + lname[1:]

    if lname.startswith('KN'):
        lname = 'N' + lname[1:]
    elif lname.startswith('PH'):
        lname = 'F' + lname[1:]
    elif lname.startswith(('WIE', 'WEI')):
        lname = 'V' + lname[1:]

    if german:
        german_initials = {'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}
        initial = lname[:1]
        if initial in german_initials:
            lname = german_initials[initial] + lname[1:]

    # The first character of the code is the (adjusted) initial letter
    code = lname[:1]

    # B. Postfix treatment
    if lname.endswith('R'):
        lname = lname[:-1] + 'N'
    elif lname.endswith(('SE', 'CE')):
        lname = lname[:-2]
    if lname.endswith('SS'):
        lname = lname[:-2]
    elif lname.endswith('S'):
        lname = lname[:-1]

    if not german:
        # Longer endings take precedence; at most one replacement is made
        ending_tables = (
            (5, {'STOWN': 'SAWON', 'MPSON': 'MASON'}),
            (4, {'NSEN': 'ASEN', 'MSON': 'ASON', 'STEN': 'SAEN',
                 'STON': 'SAON'}),
        )
        for size, table in ending_tables:
            ending = lname[-size:]
            if ending in table:
                lname = lname[:-size] + table[ending]
                break

    if lname.endswith(('NG', 'ND')):
        lname = lname[:-1]
    if not german and lname.endswith(('GAN', 'GEN')):
        lname = lname[:-3]+'A'+lname[-2:]

    if german:
        if lname.endswith('TES'):
            lname = lname[:-3]
        elif lname.endswith('TS'):
            lname = lname[:-2]
        if lname.endswith('TZE'):
            lname = lname[:-3]
        elif lname.endswith('ZE'):
            lname = lname[:-2]
        if lname.endswith('Z'):
            lname = lname[:-1]
        elif lname.endswith('TE'):
            lname = lname[:-2]

    # C. Infix Treatment
    infix_rules = (('CK', 'C'), ('SCH', 'S'), ('DT', 'T'), ('ND', 'N'),
                   ('NG', 'N'), ('LM', 'M'), ('MN', 'M'), ('WIE', 'VIE'),
                   ('WEI', 'VEI'))
    for sequence, substitute in infix_rules:
        lname = lname.replace(sequence, substitute)

    # D. Soundexing
    # code for X & Y are unspecified, but presumably are 2 & 0
    digit_of = {ord(letter): digit for letter, digit in
                zip(alphabet, '01230120022455012523010202')}

    lname = lname.translate(digit_of)
    lname = _delete_consecutive_repeats(lname)

    code += lname[1:]
    code = code.replace('0', '')  # rule 1

    if maxlength is None:
        return code
    if len(code) < maxlength:
        return code + '0' * (maxlength-len(code))
    return code[:maxlength]
5381
5382
5383
def pshp_soundex_first(fname, maxlength=4, german=False):
    """Calculate the PSHP Soundex/Viewex Coding of a first name.

    This coding is based on Hershberg, Theodore, Alan Burstein, and Robert
    Dockhorn. 1976. "Record Linkage." Historical Methods Newsletter.
    9(2-3). 137--163. doi:10.1080/00182494.1976.10112639

    Reference was also made to the German version of the same:
    Hershberg, Theodore, Alan Burstein, and Robert Dockhorn. 1976. "Verkettung
    von Daten: Record Linkage am Beispiel des Philadelphia Social History
    Project." Moderne Stadtgeschichte. Stuttgart: Klett-Cotta, 1979.
    http://nbn-resolving.de/urn:nbn:de:0168-ssoar-327824

    A separate function, pshp_soundex_last() is used for last names.

    The name is uppercased, NFKD-normalized and reduced to the letters A-Z
    before any rule is applied. The names JAMES and PAT are special-cased
    to the codes J7 and P7; every other name goes through prefix treatment
    followed by Soundex-style digit coding.

    :param str fname: the first name to encode
    :param int maxlength: the length of the code returned (defaults to 4);
        shorter codes are right-padded with '0' and longer codes are
        truncated. Pass None to get the code without padding or truncation.
    :param bool german: set to True if the name is German (different rules
        apply)
    :returns: the PSHP Soundex/Viewex Coding of the first name
    :rtype: str
    """
    fname = unicodedata.normalize('NFKD', text_type(fname.upper()))
    fname = fname.replace('ß', 'SS')
    fname = ''.join(c for c in fname if c in
                    {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
                     'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
                     'W', 'X', 'Y', 'Z'})

    # special rules
    if fname == 'JAMES':
        code = 'J7'
    elif fname == 'PAT':
        code = 'P7'

    else:
        # A. Prefix treatment
        if fname[:2] in {'GE', 'GI', 'GY'}:
            fname = 'J' + fname[1:]
        elif fname[:2] in {'CE', 'CI', 'CY'}:
            fname = 'S' + fname[1:]
        elif fname[:3] == 'CHR':
            fname = 'K' + fname[1:]
        elif fname[:1] == 'C' and fname[:2] != 'CH':
            fname = 'K' + fname[1:]

        if fname[:2] == 'KN':
            fname = 'N' + fname[1:]
        elif fname[:2] == 'PH':
            fname = 'F' + fname[1:]
        elif fname[:3] in {'WIE', 'WEI'}:
            fname = 'V' + fname[1:]

        if german and fname[:1] in {'W', 'M', 'Y', 'Z'}:
            fname = ({'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}[fname[0]] +
                     fname[1:])

        # The (possibly rewritten) initial letter is kept as a letter.
        code = fname[:1]

        # B. Soundex coding
        # code for Y unspecified, but presumably is 0
        _pshp_translation = dict(zip((ord(char) for char in
                                      'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                     '01230120022455012523010202'))

        fname = fname.translate(_pshp_translation)
        fname = _delete_consecutive_repeats(fname)

        # Everything after the first position contributes its digit.
        code += fname[1:]

        # Locate the first '0' and the offset of the next '0' after it; when
        # both exist and the offset is at least the first index, the code is
        # cut one character past the first '0'.
        # NOTE(review): this looks like a syllable-based truncation of the
        # source rules -- confirm the intended condition against the paper.
        syl_ptr = code.find('0')
        syl2_ptr = code[syl_ptr + 1:].find('0')
        if syl_ptr != -1 and syl2_ptr != -1 and syl2_ptr - syl_ptr > -1:
            code = code[:syl_ptr + 2]

        code = code.replace('0', '')  # rule 1

    if maxlength is not None:
        if len(code) < maxlength:
            code += '0' * (maxlength-len(code))
        else:
            code = code[:maxlength]

    return code
5463
5464
5465
def henry_early(word, maxlength=3):
    """Calculate the early version of the Henry code for a word.

    The early version of Henry coding is given in:
    Légaré, Jacques, Yolande Lavoie, and Hubert Charbonneau. 1972. "The Early
    Canadian Population: Problems in Automatic Record Linkage." Canadian
    Historical Review, 53(4). 427--442.
    doi:10.3138/CHR-053-04-03

    The word is uppercased, NFKD-normalized and reduced to the letters A-Z.
    The initial vowel (if any) is adjusted (Rule Ib), the letters are then
    recoded one by one with regard to their neighbours (Rule II), selected
    endings are trimmed (Rule IIe), and finally all non-initial vowels are
    removed.

    :param str word: the word to encode
    :param int maxlength: the length of the code returned (defaults to 3);
        longer codes are truncated, shorter codes are not padded. Pass None
        to disable truncation.
    :returns: the early Henry code, or an empty string if the word contains
        no letters A-Z
    :rtype: str
    """
    _cons = {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q',
             'R', 'S', 'T', 'V', 'W', 'X', 'Z'}
    _vows = {'A', 'E', 'I', 'O', 'U', 'Y'}
    _diph = {'AI': 'E', 'AY': 'E', 'EI': 'E', 'AU': 'O', 'OI': 'O', 'OU': 'O',
             'EU': 'U'}
    _unaltered = {'B', 'D', 'F', 'J', 'K', 'L', 'M', 'N', 'R', 'T', 'V'}
    # 'V' is already handled by _unaltered, so a 'V' entry here could never
    # fire, while 'Z' had no rule at all and was silently dropped.
    _simple = {'W': 'V', 'X': 'S', 'Z': 'S'}
    # Hoisted: consonants that take part in the cluster rule of IId.
    _cons_not_lr = _cons - {'L', 'R'}

    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = ''.join(c for c in word if c in
                   {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                    'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                    'Y', 'Z'})

    # Nothing to encode (and word[0] below would raise IndexError).
    if not word:
        return ''

    # Rule Ia seems to be covered entirely in II

    # Rule Ib
    if word[0] in _vows:
        # Ib1
        if (((word[1:2] in _cons-{'M', 'N'} and word[2:3] in _cons) or
             (word[1:2] in _cons and word[2:3] not in _cons))):
            if word[0] == 'Y':
                word = 'I'+word[1:]
        # Ib2
        elif word[1:2] in {'M', 'N'} and word[2:3] in _cons:
            if word[0] == 'E':
                word = 'A'+word[1:]
            elif word[0] in {'I', 'U', 'Y'}:
                word = 'E'+word[1:]
        # Ib3
        elif word[:2] in _diph:
            word = _diph[word[:2]]+word[2:]
        # Ib4
        elif word[1:2] in _vows and word[0] == 'Y':
            word = 'I' + word[1:]

    code = ''
    skip = 0  # number of upcoming letters already consumed by a rule

    # Rule II
    for pos, char in enumerate(word):
        # Neighbouring letters must come from the word, not from the single
        # character being examined; both are '' at the word boundaries.
        nxch = word[pos+1:pos+2]
        prev = word[pos-1:pos]

        if skip:
            skip -= 1
        elif char in _vows:
            code += char
        # IIc
        elif char == nxch:
            skip = 1
            code += char
        elif word[pos:pos+2] in {'CQ', 'DT', 'SC'}:
            skip = 1
            code += word[pos+1]
        # IId
        # NOTE(review): these cluster rules run before the per-letter rules
        # below, so the 'CH', 'GN', 'PH', 'STE' and 'ST' branches further
        # down are shadowed by them -- confirm the intended rule order
        # against the paper.
        elif char == 'H' and prev in _cons:
            continue
        elif char == 'S' and nxch in _cons:
            continue
        elif char in _cons_not_lr and nxch in _cons_not_lr:
            continue
        elif char == 'L' and nxch in {'M', 'N'}:
            continue
        elif char in {'M', 'N'} and prev in _vows and nxch in _cons:
            continue
        # IIa
        elif char in _unaltered:
            code += char
        # IIb
        elif char in _simple:
            code += _simple[char]
        elif char in {'C', 'G', 'P', 'Q', 'S'}:
            if char == 'C':
                if nxch in {'A', 'O', 'U', 'L', 'R'}:
                    code += 'K'
                elif nxch in {'E', 'I', 'Y'}:
                    code += 'J'
                elif nxch == 'H':
                    if word[pos+2:pos+3] in _vows:
                        code += 'C'
                    elif word[pos+2:pos+3] in {'R', 'L'}:
                        code += 'K'
            elif char == 'G':
                if nxch in {'A', 'O', 'U', 'L', 'R'}:
                    code += 'G'
                elif nxch in {'E', 'I', 'Y'}:
                    code += 'J'
                elif nxch == 'N':
                    code += 'N'
            elif char == 'P':
                if nxch != 'H':
                    code += 'P'
                else:
                    code += 'F'
            elif char == 'Q':
                # Look at the TWO letters following Q and emit into the code
                # (not into the loop variable).
                if word[pos+1:pos+3] in {'UE', 'UI', 'UY'}:
                    code += 'G'
                elif word[pos+1:pos+3] in {'UA', 'UO'}:
                    code += 'K'
            elif char == 'S':
                if word[pos:pos+6] == 'SAINTE':
                    code += 'X'
                    skip = 5
                elif word[pos:pos+5] == 'SAINT':
                    code += 'X'
                    skip = 4
                elif word[pos:pos+3] == 'STE':
                    code += 'X'
                    skip = 2
                elif word[pos:pos+2] == 'ST':
                    code += 'X'
                    skip = 1
                else:
                    code += 'S'
        else:  # only H not preceded by a consonant reaches this point
            continue

    # IIe1
    if code[-4:] in {'AULT', 'EULT', 'OULT'}:
        code = code[:-2]
    elif code[-4:-3] in _vows and code[-3:] == 'MPS':
        code = code[:-3]
    elif code[-3:-2] in _vows and code[-2:] in {'MB', 'MP', 'ND', 'NS', 'NT'}:
        code = code[:-2]
    elif code[-2:-1] == 'R' and code[-1:] in _cons:
        code = code[:-1]
    # IIe2
    elif code[-2:-1] in _vows and code[-1:] in {'D', 'M', 'N', 'S', 'T'}:
        code = code[:-1]
    elif code[-2:] == 'ER':
        code = code[:-1]

    # Drop non-initial vowels (the keys are the code points of A E I O U Y)
    code = code[:1]+code[1:].translate({65: '', 69: '', 73: '', 79: '', 85: '',
                                        89: ''})

    if maxlength is not None:
        code = code[:maxlength]

    return code
5619
5620
5621
def norphone(word):
    """Return the Norphone code.

    The reference implementation by Lars Marius Garshol is available at
    https://github.com/larsga/Duke/blob/master/duke-core/src/main/java/no/priv/garshol/duke/comparators/NorphoneComparator.java

    Norphone was designed for Norwegian, but this implementation has been
    extended to support Swedish vowels as well. This function incorporates
    the "not implemented" rules from the above file's rule set.

    The word is uppercased, a word-initial pattern (if any) is recoded, the
    ending is simplified, and the rest is scanned left to right applying the
    longest matching replacement at each position. Letters without a rule
    are copied, except that vowels are only kept in the very first position.
    Runs of identical characters in the result are finally collapsed.

    :param str word: the word to encode
    :returns: the Norphone code
    :rtype: str
    """
    vowel_set = {'A', 'E', 'I', 'O', 'U', 'Y', 'Å', 'Æ', 'Ø', 'Ä', 'Ö'}

    # Word-initial patterns, tested in this order; the first hit wins.
    initial_rules = (('AA', 'Å'), ('GI', 'J'), ('SKY', 'X'), ('EI', 'Æ'),
                     ('KY', 'X'), ('C', 'K'), ('Ä', 'Æ'), ('Ö', 'Ø'))

    # General patterns grouped by length, longest group first.
    general_rules = (
        (4, {'SKEI': 'X'}),
        (3, {'SKJ': 'X', 'KEI': 'X'}),
        (2, {'CH': 'K', 'CK': 'K', 'GJ': 'J', 'GH': 'K', 'HG': 'K',
             'HJ': 'J', 'HL': 'L', 'HR': 'R', 'KJ': 'X', 'KI': 'X',
             'LD': 'L', 'ND': 'N', 'PH': 'F', 'TH': 'T', 'SJ': 'X'}),
        (1, {'W': 'V', 'X': 'KS', 'Z': 'S', 'D': 'T', 'G': 'K'}),
    )

    word = word.upper()

    code = ''
    idx = 0  # position of the next letter still to be encoded

    for prefix, replacement in initial_rules:
        if word.startswith(prefix):
            code = replacement
            idx = len(prefix)
            break

    if word.endswith('DT'):
        word = word[:-2] + 'T'
    # Though the rules indicate this rule applies in all positions, the
    # reference implementation indicates it applies only in final position.
    elif word[-2:-1] in vowel_set and word[-1:] == 'D':
        word = word[:-2]

    while idx < len(word):
        for size, table in general_rules:
            chunk = word[idx:idx + size]
            if chunk in table:
                code += table[chunk]
                idx += size
                break
        else:
            letter = word[idx]
            # Vowels survive only as the very first letter of the word.
            if idx == 0 or letter not in vowel_set:
                code += letter
            idx += 1

    return _delete_consecutive_repeats(code)
5696
5697
5698
def dolby(word, maxlength=None, keep_vowels=False, vowel_char='*'):
    """Return the Dolby Code of a name.

    This follows "A Spelling Equivalent Abbreviation Algorithm For Personal
    Names" from:
    Dolby, James L. 1970. "An Algorithm for Variable-Length Proper-Name
    Compression." Journal of Library Automation, 3(4).
    doi:10.6017/ital.v3i4.5259

    The name is uppercased, NFKD-normalized and reduced to the letters A-Z
    before the rules are applied. When maxlength is set, the additional
    fixed-length (FL) rules are used and the result is right-padded with
    spaces up to maxlength.

    :param str word: the word to encode
    :param int maxlength: maximum length of the returned Dolby code -- this
        also activates the fixed-length code mode
    :param bool keep_vowels: if True, retains all vowel markers
    :param str vowel_char: the vowel marker character (default to *)
    :returns: the Dolby Code
    :rtype: str
    """
    vowel_set = {'A', 'E', 'I', 'O', 'U', 'Y'}
    alphabet = {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
                'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
                'Y', 'Z'}

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = unicodedata.normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(letter for letter in word if letter in alphabet)

    # Rule 1 (FL2)
    if word[:3] in {'MCG', 'MAG', 'MAC'}:
        word = 'MK' + word[3:]
    elif word[:2] == 'MC':
        word = 'MK' + word[2:]

    # Rule 2 (FL3): scanning right to left, drop the second letter of each
    # listed pair and then re-test the same position.
    reduced_pairs = {'DT', 'LD', 'ND', 'NT', 'RC', 'RD', 'RT', 'SC', 'SK',
                     'ST'}
    idx = len(word) - 2
    while idx >= 0:
        if word[idx:idx + 2] in reduced_pairs:
            word = word[:idx + 1] + word[idx + 2:]
        else:
            idx -= 1

    # Rule 3 (FL4)
    # Although the rule indicates "after the first letter", the test cases make
    # it clear that these apply to the first letter also.
    # The last entry (TCH) is not in the rule set, but they seem to have
    # intended it.
    for old, new in (('X', 'KS'), ('CE', 'SE'), ('CI', 'SI'), ('CY', 'SI'),
                     ('TCH', 'CH')):
        word = word.replace(old, new)

    # A non-initial CH that does not follow a vowel becomes SH.
    idx = word.find('CH', 1)
    while idx != -1:
        if word[idx - 1:idx] not in vowel_set:
            word = word[:idx] + 'S' + word[idx + 1:]
        idx = word.find('CH', idx + 1)

    for old, new in (('C', 'K'), ('Z', 'S'), ('WR', 'R'), ('DG', 'G'),
                     ('QU', 'K'), ('T', 'D'), ('PH', 'F')):
        word = word.replace(old, new)

    # Rule 4 (FL5)
    # Although the rule indicates "after the first letter", the test cases make
    # it clear that these apply to the first letter also.
    kept_before_k = vowel_set | {'L', 'N', 'R'}
    idx = word.find('K', 0)
    while idx != -1:
        if idx > 1 and word[idx - 1:idx] not in kept_before_k:
            # remove the letter in front of this K
            word = word[:idx - 1] + word[idx:]
            idx -= 1
        idx = word.find('K', idx + 1)

    # Rule FL6
    if maxlength and word[-1:] == 'E':
        word = word[:-1]

    # Rule 5 (FL7)
    word = _delete_consecutive_repeats(word)

    # Rule 6 (FL8)
    if word[:2] == 'PF':
        word = word[1:]
    if word[-2:] == 'PF':
        word = word[:-1]
    elif word[-2:] == 'GH':
        word = word[:-2] + ('F' if word[-3:-2] in vowel_set else 'G')
    word = word.replace('GH', '')

    # Rule FL9
    if maxlength:
        word = word.replace('V', 'F')

    # Rules 7-9 (FL10-FL12)
    # Number of vowel markers kept when keep_vowels is off: two in
    # fixed-length mode, otherwise one.
    marker_budget = 2 if maxlength else 1
    pieces = []
    for idx, letter in enumerate(word):
        if letter in vowel_set:
            if marker_budget or keep_vowels:
                pieces.append(vowel_char)
                marker_budget -= 1
        elif idx > 0 and letter in {'W', 'H'}:
            continue
        else:
            pieces.append(letter)
    code = ''.join(pieces)

    if maxlength:
        # Rule FL13
        if len(code) > maxlength and code[-1:] == 'S':
            code = code[:-1]
        if keep_vowels:
            code = code[:maxlength]
        else:
            # Rule FL14
            code = code[:maxlength + 2]
            # Rule FL15
            while len(code) > maxlength:
                overflow = len(code) - maxlength
                markers_left = overflow
                kept = []
                for symbol in code:
                    if symbol != vowel_char:
                        kept.append(symbol)
                    elif markers_left:
                        kept.append(symbol)
                        markers_left -= 1
                code = ''.join(kept)[:maxlength + overflow - 1]

        # Rule FL16
        code += ' ' * (maxlength - len(code))

    return code
5841
5842
5843
def bmpm(word, language_arg=0, name_mode='gen', match_mode='approx',
         concat=False, filter_langs=False):
    """Return the Beider-Morse Phonetic Matching algorithm code for a word.

    This is a thin public wrapper: all arguments are handed unchanged, in
    order, to _bmpm() and its result is returned as-is.

    The Beider-Morse Phonetic Matching algorithm is described at:
    http://stevemorse.org/phonetics/bmpm.htm
    The reference implementation is licensed under GPLv3 and available at:
    http://stevemorse.org/phoneticinfo.htm

    :param str word: the word to transform
    :param str language_arg: the language of the term (the default is 0);
        supported values include:

            - 'any'
            - 'arabic'
            - 'cyrillic'
            - 'czech'
            - 'dutch'
            - 'english'
            - 'french'
            - 'german'
            - 'greek'
            - 'greeklatin'
            - 'hebrew'
            - 'hungarian'
            - 'italian'
            - 'polish'
            - 'portuguese'
            - 'romanian'
            - 'russian'
            - 'spanish'
            - 'turkish'
            - 'germandjsg'
            - 'polishdjskp'
            - 'russiandjsre'

    :param str name_mode: the name mode of the algorithm:

            - 'gen' -- general (default)
            - 'ash' -- Ashkenazi
            - 'sep' -- Sephardic

    :param str match_mode: matching mode: 'approx' or 'exact'
    :param bool concat: concatenation mode
    :param bool filter_langs: filter out incompatible languages
    :returns: the BMPM value(s)
    :rtype: tuple

    >>> bmpm('Christopher')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir xristopi xritopir xritopi xristofi xritofir xritofi tzristopir
    tzristofir zristopir zristopi zritopir zritopi zristofir zristofi zritofir
    zritofi'
    >>> bmpm('Niall')
    'nial niol'
    >>> bmpm('Smith')
    'zmit'
    >>> bmpm('Schmidt')
    'zmit stzmit'

    >>> bmpm('Christopher', language_arg='German')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir'
    >>> bmpm('Christopher', language_arg='English')
    'tzristofir tzrQstofir tzristafir tzrQstafir xristofir xrQstofir xristafir
    xrQstafir'
    >>> bmpm('Christopher', language_arg='German', name_mode='ash')
    'xrQstopir xrQstYpir xristopir xristYpir xrQstofir xrQstYfir xristofir
    xristYfir'

    >>> bmpm('Christopher', language_arg='German', match_mode='exact')
    'xriStopher xriStofer xristopher xristofer'
    """
    # Delegate to the private implementation; arguments stay positional.
    encoded = _bmpm(word, language_arg, name_mode, match_mode,
                    concat, filter_langs)
    return encoded
5918
5919
5920
if __name__ == '__main__':
    # Executed as a script: run the doctests embedded in this module.
    import doctest
    doctest.testmod()
5923