Test Failed
Push — master ( 64abe2...a464fa )
by Chris
04:02 queued 11s
created

abydos.phonetic.soundex   F

Complexity

Total Complexity 145

Size/Duplication

Total Lines 929
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 510
dl 0
loc 929
rs 2
c 0
b 0
f 0
wmc 145

8 Functions

Rating   Name   Duplication   Size   Complexity  
F phonix() 0 208 24
A refined_soundex() 0 48 4
F phonex() 0 105 33
F pshp_soundex_first() 0 99 18
A lein() 0 44 2
F pshp_soundex_last() 0 143 36
F fuzzy_soundex() 0 101 15
D soundex() 0 112 13

How to fix   Complexity   

Complexity

Complex classes like abydos.phonetic.soundex often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19
"""abydos.phonetic.soundex.
20
21
The phonetic.soundex module implements phonetic algorithms that are generally
22
Soundex-like, including:
23
24
    - American Soundex
25
    - Refined Soundex
26
    - Fuzzy Soundex
27
    - Phonex
28
    - Phonix
29
    - Lein
30
    - PSHP Soundex/Viewex Coding
31
32
Being Soundex-like, for the purposes of this module means: targeted at English,
33
returning a code that starts with a letter and continues with (usually 3)
34
numerals, and mostly based on a simple translation table.
35
"""
36
37
from __future__ import unicode_literals
38
39
from unicodedata import normalize as unicode_normalize
40
41
from six import text_type
42
from six.moves import range
43
44
from . import _delete_consecutive_repeats
45
46
# Public API of this module (kept in alphabetical order).
__all__ = ['fuzzy_soundex', 'lein', 'phonex', 'phonix', 'pshp_soundex_first',
           'pshp_soundex_last', 'refined_soundex', 'soundex']
48
49
50
def soundex(word, max_length=4, var='American', reverse=False, zero_pad=True):
    """Return the Soundex code for a word.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 4);
        the value is clamped to the range 4-64, and -1 selects the maximum
        length of 64
    :param str var: the variant of the algorithm to employ (defaults to
        'American'):

        - 'American' follows the American Soundex algorithm, as described at
          :cite:`US:2007` and in :cite:`Knuth:1998`; this is also called
          Miracode
        - 'special' follows the rules from the 1880-1910 US Census
          retrospective re-analysis, in which h & w are not treated as blocking
          consonants but as vowels. Cf. :cite:`Repici:2013`.
        - 'Census' follows the rules laid out in GIL 55 :cite:`US:1997` by the
          US Census, including coding prefixed and unprefixed versions of some
          names

    :param bool reverse: reverse the word before computing the selected Soundex
        (defaults to False); This results in "Reverse Soundex", which is useful
        for blocking in cases where the initial elements may be in error.
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        max_length string
    :returns: the Soundex value; when var is 'Census' and the word begins
        with one of the recognized prefixes (VAN, CON, DE, DI, LA, LE) and is
        long enough, a 2-tuple holding the American Soundex codes of the
        prefixed and the unprefixed word
    :rtype: str or tuple(str, str)

    >>> soundex("Christopher")
    'C623'
    >>> soundex("Niall")
    'N400'
    >>> soundex('Smith')
    'S530'
    >>> soundex('Schmidt')
    'S530'

    >>> soundex('Christopher', max_length=-1)
    'C623160000000000000000000000000000000000000000000000000000000000'
    >>> soundex('Christopher', max_length=-1, zero_pad=False)
    'C62316'

    >>> soundex('Christopher', reverse=True)
    'R132'

    >>> soundex('Ashcroft')
    'A261'
    >>> soundex('Asicroft')
    'A226'
    >>> soundex('Ashcroft', var='special')
    'A226'
    >>> soundex('Asicroft', var='special')
    'A226'
    """
    # Letter -> digit table for str.translate(); '9' marks H & W and '0'
    # marks the vowels and Y, both of which are dropped further below.
    _soundex_translation = {
        ord(letter): digit
        for letter, digit in zip('ABCDEFGHIJKLMNOPQRSTUVWXYZ',
                                 '01230129022455012623019202')}
    _uc_letters = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    # Require a max_length of at least 4 and not more than 64
    if max_length != -1:
        max_length = min(max(4, max_length), 64)
    else:
        max_length = 64

    # uppercase, normalize, decompose
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    if var == 'Census':
        # TODO: Should these prefixes be supplemented? (VANDE, DELA, VON)
        # Prefixed names are coded twice: with and without the prefix.
        if word[:3] in {'VAN', 'CON'} and len(word) > 4:
            return (soundex(word, max_length, 'American', reverse, zero_pad),
                    soundex(word[3:], max_length, 'American', reverse,
                            zero_pad))
        if word[:2] in {'DE', 'DI', 'LA', 'LE'} and len(word) > 3:
            return (soundex(word, max_length, 'American', reverse, zero_pad),
                    soundex(word[2:], max_length, 'American', reverse,
                            zero_pad))
        # Otherwise, proceed as usual (var='American' mode, ostensibly)

    # filter non-A-Z out
    word = ''.join(c for c in word if c in _uc_letters)

    # Nothing to convert, return base case
    if not word:
        if zero_pad:
            return '0'*max_length
        return '0'

    # Reverse word if computing Reverse Soundex
    if reverse:
        word = word[::-1]

    # apply the Soundex algorithm
    sdx = word.translate(_soundex_translation)

    if var == 'special':
        sdx = sdx.replace('9', '0')  # special rule for 1880-1910 census
    else:
        sdx = sdx.replace('9', '')  # rule 1
    sdx = _delete_consecutive_repeats(sdx)  # rule 3

    # The code keeps the first letter itself rather than its digit. An
    # initial H or W is prepended to the digits instead of replacing the
    # first one.
    if word[0] in 'HW':
        sdx = word[0] + sdx
    else:
        sdx = word[0] + sdx[1:]
    sdx = sdx.replace('0', '')  # rule 1

    if zero_pad:
        sdx += ('0'*max_length)  # rule 4

    return sdx[:max_length]
162
163
164
def refined_soundex(word, max_length=-1, zero_pad=False,
                    retain_vowels=False):
    """Return the Refined Soundex code for a word.

    This is Soundex, but with more character classes. It was defined at
    :cite:`Boyce:1998`.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to
        unlimited); any value that is not positive leaves the code untruncated
        and unpadded
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        max_length string (only applies when max_length is positive)
    :param bool retain_vowels: retain vowels (as 0) in the resulting code
    :returns: the Refined Soundex value
    :rtype: str

    >>> refined_soundex('Christopher')
    'C393619'
    >>> refined_soundex('Niall')
    'N87'
    >>> refined_soundex('Smith')
    'S386'
    >>> refined_soundex('Schmidt')
    'S386'
    """
    # Letter -> digit table for str.translate(); '0' marks vowels, H, W & Y
    _ref_soundex_translation = {
        ord(letter): digit
        for letter, digit in zip('ABCDEFGHIJKLMNOPQRSTUVWXYZ',
                                 '01360240043788015936020505')}
    _uc_letters = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in _uc_letters)

    # apply the Soundex algorithm: first letter, then the digits of every
    # letter (the first letter included)
    sdx = word[:1] + word.translate(_ref_soundex_translation)
    sdx = _delete_consecutive_repeats(sdx)
    if not retain_vowels:
        sdx = sdx.replace('0', '')  # Delete vowels, H, W, Y

    if max_length > 0:
        if zero_pad:
            sdx += ('0' * max_length)
        sdx = sdx[:max_length]

    return sdx
212
213
214
def fuzzy_soundex(word, max_length=5, zero_pad=True):
    """Return the Fuzzy Soundex code for a word.

    Fuzzy Soundex is an algorithm derived from Soundex, defined in
    :cite:`Holmes:2002`.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 5);
        the value is clamped to the range 4-64, and -1 selects the maximum
        length of 64
    :param bool zero_pad: pad the end of the return value with 0s to achieve
        a max_length string
    :returns: the Fuzzy Soundex value
    :rtype: str

    >>> fuzzy_soundex('Christopher')
    'K6931'
    >>> fuzzy_soundex('Niall')
    'N4000'
    >>> fuzzy_soundex('Smith')
    'S5300'
    >>> fuzzy_soundex('Schmidt')
    'S5300'
    """
    # Letter -> code table for str.translate(); '-' marks H, W & Y (removed
    # before repeats are collapsed) and '0' marks vowels (removed afterwards)
    _fuzzy_soundex_translation = {
        ord(letter): digit
        for letter, digit in zip('ABCDEFGHIJKLMNOPQRSTUVWXYZ',
                                 '0193017-07745501769301-7-9')}

    # Substring rewrites, applied everywhere in the word in this exact order
    _infix_rewrites = (('CA', 'KA'), ('CC', 'KK'), ('CK', 'KK'),
                       ('CE', 'SE'), ('CHL', 'KL'), ('CL', 'KL'),
                       ('CHR', 'KR'), ('CR', 'KR'), ('CI', 'SI'),
                       ('CO', 'KO'), ('CU', 'KU'), ('CY', 'SY'),
                       ('DG', 'GG'), ('GH', 'HH'), ('MAC', 'MK'),
                       ('MC', 'MK'), ('NST', 'NSS'), ('PF', 'FF'),
                       ('PH', 'FF'), ('SCH', 'SSS'), ('TIO', 'SIO'),
                       ('TIA', 'SIO'), ('TCH', 'CHH'))

    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    # Clamp max_length to [4, 64]
    if max_length != -1:
        max_length = min(max(4, max_length), 64)
    else:
        max_length = 64

    if not word:
        if zero_pad:
            return '0' * max_length
        return '0'

    # Word-initial rewrites
    if word[:2] in {'CS', 'CZ', 'TS', 'TZ'}:
        word = 'SS' + word[2:]
    elif word[:2] == 'GN':
        word = 'NN' + word[2:]
    elif word[:2] in {'HR', 'WR'}:
        word = 'RR' + word[2:]
    elif word[:2] == 'HW':
        word = 'WW' + word[2:]
    elif word[:2] in {'KN', 'NG'}:
        word = 'NN' + word[2:]

    # Word-final rewrites
    if word[-2:] == 'CH':
        word = word[:-2] + 'KK'
    elif word[-2:] == 'NT':
        word = word[:-2] + 'TT'
    elif word[-2:] == 'RT':
        word = word[:-2] + 'RR'
    elif word[-3:] == 'RDT':
        word = word[:-3] + 'RR'

    for src, tar in _infix_rewrites:
        word = word.replace(src, tar)

    sdx = word.translate(_fuzzy_soundex_translation)
    sdx = sdx.replace('-', '')

    # remove repeating characters
    sdx = _delete_consecutive_repeats(sdx)

    # The code keeps the (rewritten) first letter itself. H, W & Y were
    # dropped from the digits above, so they are prepended rather than
    # substituted for the first digit.
    if word[0] in {'H', 'W', 'Y'}:
        sdx = word[0] + sdx
    else:
        sdx = word[0] + sdx[1:]

    sdx = sdx.replace('0', '')

    if zero_pad:
        sdx += ('0'*max_length)

    return sdx[:max_length]
315
316
317
def phonex(word, max_length=4, zero_pad=True):
    """Return the Phonex code for a word.

    Phonex is an algorithm derived from Soundex, defined in :cite:`Lait:1996`.

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 4);
        the value is clamped to the range 4-64, and -1 selects the maximum
        length of 64
    :param bool zero_pad: pad the end of the return value with 0s to achieve
        a max_length string
    :returns: the Phonex value
    :rtype: str

    >>> phonex('Christopher')
    'C623'
    >>> phonex('Niall')
    'N400'
    >>> phonex('Schmidt')
    'S253'
    >>> phonex('Smith')
    'S530'
    """
    _vowels = {'A', 'E', 'I', 'O', 'U', 'Y'}
    # Phonetic equivalents of the first character; letters that are not
    # listed stand for themselves
    _first_letter = {'E': 'A', 'I': 'A', 'O': 'A', 'U': 'A', 'Y': 'A',
                     'P': 'B', 'V': 'F', 'K': 'C', 'Q': 'C', 'J': 'G',
                     'Z': 'S'}

    name = unicode_normalize('NFKD', text_type(word.upper()))
    name = name.replace('ß', 'SS')

    # Clamp max_length to [4, 64]
    if max_length != -1:
        max_length = min(max(4, max_length), 64)
    else:
        max_length = 64

    name_code = last = ''

    # Deletions effected by replacing with next letter which
    # will be ignored due to duplicate handling of Soundex code.
    # This is faster than 'moving' all subsequent letters.

    # Remove any trailing Ss
    while name[-1:] == 'S':
        name = name[:-1]

    # Phonetic equivalents of first 2 characters
    # Works since duplicate letters are ignored
    if name[:2] == 'KN':
        name = 'N' + name[2:]  # KN.. == N..
    elif name[:2] == 'PH':
        name = 'F' + name[2:]  # PH.. == F.. (H ignored anyway)
    elif name[:2] == 'WR':
        name = 'R' + name[2:]  # WR.. == R..

    # Special case, ignore H first letter (subsequent Hs ignored anyway)
    # Works since duplicate letters are ignored
    if name and name[0] == 'H':
        name = name[1:]

    if name:
        name = _first_letter.get(name[0], name[0]) + name[1:]
        name_code = last = name[0]

    # Modified Soundex code
    for i in range(1, len(name)):
        following = name[i+1:i+2]  # '' when name[i] is the last letter
        code = '0'
        if name[i] in {'B', 'F', 'P', 'V'}:
            code = '1'
        elif name[i] in {'C', 'G', 'J', 'K', 'Q', 'S', 'X', 'Z'}:
            code = '2'
        elif name[i] in {'D', 'T'}:
            if following != 'C':
                code = '3'
        elif name[i] == 'L':
            if following in _vowels or i+1 == len(name):
                code = '4'
        elif name[i] in {'M', 'N'}:
            if following in {'D', 'G'}:
                # overwrite the D/G with this letter so that it is skipped
                # as a duplicate (the length of name is unchanged)
                name = name[:i+1] + name[i] + name[i+2:]
            code = '5'
        elif name[i] == 'R':
            if following in _vowels or i+1 == len(name):
                code = '6'

        if code != last and code != '0':
            name_code += code

        last = name_code[-1]

    if zero_pad:
        name_code += '0' * max_length
    if not name_code:
        name_code = '0'
    return name_code[:max_length]
422
423
424
def phonix(word, max_length=4, zero_pad=True):
    """Return the Phonix code for a word.

    Phonix is a Soundex-like algorithm defined in :cite:`Gadd:1990`.

    This implementation is based on:
    - :cite:`Pfeifer:2000`
    - :cite:`Christen:2011`
    - :cite:`Kollar:2007`

    :param str word: the word to transform
    :param int max_length: the length of the code returned (defaults to 4);
        the value is clamped to the range 4-64, and -1 selects the maximum
        length of 64
    :param bool zero_pad: pad the end of the return value with 0s to achieve
        a max_length string
    :returns: the Phonix value
    :rtype: str

    >>> phonix('Christopher')
    'K683'
    >>> phonix('Niall')
    'N400'
    >>> phonix('Smith')
    'S530'
    >>> phonix('Schmidt')
    'S530'
    """
    def _start_repl(word, src, tar, post=None):
        r"""Replace src with tar at the start of word.

        If post is given, src must be followed by one of its members.
        """
        if post:
            for i in post:
                if word.startswith(src+i):
                    return tar + word[len(src):]
        elif word.startswith(src):
            return tar + word[len(src):]
        return word

    def _end_repl(word, src, tar, pre=None):
        r"""Replace src with tar at the end of word.

        If pre is given, src must be preceded by one of its members.
        """
        if pre:
            for i in pre:
                if word.endswith(i+src):
                    return word[:-len(src)] + tar
        elif word.endswith(src):
            return word[:-len(src)] + tar
        return word

    def _mid_repl(word, src, tar, pre=None, post=None):
        r"""Replace src with tar in the middle of word."""
        if not word:
            return word
        if pre or post:
            if not pre:
                return word[0] + _all_repl(word[1:], src, tar, pre, post)
            elif not post:
                return (_all_repl(word[:-1], src, tar, pre, post) +
                        word[-1])
            return _all_repl(word, src, tar, pre, post)
        if len(word) < 2:
            # A one-letter word has no middle. Without this guard the
            # expression below would return word[0] + word[-1], i.e. the
            # letter doubled.
            return word
        return (word[0] +
                _all_repl(word[1:-1], src, tar, pre, post) +
                word[-1])

    def _all_repl(word, src, tar, pre=None, post=None):
        r"""Replace src with tar anywhere in word.

        If pre and/or post are given, src must additionally be preceded by
        a member of pre and/or followed by a member of post.
        """
        # An empty context string places no constraint on that side.
        pre = pre or ('',)
        post = post or ('',)
        for before in pre:
            for after in post:
                word = word.replace(before+src+after, before+tar+after)
        return word

    _vow = {'A', 'E', 'I', 'O', 'U'}
    _con = {'B', 'C', 'D', 'F', 'G', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'Q',
            'R', 'S', 'T', 'V', 'W', 'X', 'Y', 'Z'}

    # Each entry is (function, src, tar[, context...]); the entries are
    # applied to the word one after another, in this order.
    _phonix_substitutions = ((_all_repl, 'DG', 'G'),
                             (_all_repl, 'CO', 'KO'),
                             (_all_repl, 'CA', 'KA'),
                             (_all_repl, 'CU', 'KU'),
                             (_all_repl, 'CY', 'SI'),
                             (_all_repl, 'CI', 'SI'),
                             (_all_repl, 'CE', 'SE'),
                             (_start_repl, 'CL', 'KL', _vow),
                             (_all_repl, 'CK', 'K'),
                             (_end_repl, 'GC', 'K'),
                             (_end_repl, 'JC', 'K'),
                             (_start_repl, 'CHR', 'KR', _vow),
                             (_start_repl, 'CR', 'KR', _vow),
                             (_start_repl, 'WR', 'R'),
                             (_all_repl, 'NC', 'NK'),
                             (_all_repl, 'CT', 'KT'),
                             (_all_repl, 'PH', 'F'),
                             (_all_repl, 'AA', 'AR'),
                             (_all_repl, 'SCH', 'SH'),
                             (_all_repl, 'BTL', 'TL'),
                             (_all_repl, 'GHT', 'T'),
                             (_all_repl, 'AUGH', 'ARF'),
                             (_mid_repl, 'LJ', 'LD', _vow, _vow),
                             (_all_repl, 'LOUGH', 'LOW'),
                             (_start_repl, 'Q', 'KW'),
                             (_start_repl, 'KN', 'N'),
                             (_end_repl, 'GN', 'N'),
                             (_all_repl, 'GHN', 'N'),
                             (_end_repl, 'GNE', 'N'),
                             (_all_repl, 'GHNE', 'NE'),
                             (_end_repl, 'GNES', 'NS'),
                             (_start_repl, 'GN', 'N'),
                             (_mid_repl, 'GN', 'N', None, _con),
                             (_end_repl, 'GN', 'N'),
                             (_start_repl, 'PS', 'S'),
                             (_start_repl, 'PT', 'T'),
                             (_start_repl, 'CZ', 'C'),
                             (_mid_repl, 'WZ', 'Z', _vow),
                             (_mid_repl, 'CZ', 'CH'),
                             (_all_repl, 'LZ', 'LSH'),
                             (_all_repl, 'RZ', 'RSH'),
                             (_mid_repl, 'Z', 'S', None, _vow),
                             (_all_repl, 'ZZ', 'TS'),
                             (_mid_repl, 'Z', 'TS', _con),
                             (_all_repl, 'HROUG', 'REW'),
                             (_all_repl, 'OUGH', 'OF'),
                             (_mid_repl, 'Q', 'KW', _vow, _vow),
                             (_mid_repl, 'J', 'Y', _vow, _vow),
                             (_start_repl, 'YJ', 'Y', _vow),
                             (_start_repl, 'GH', 'G'),
                             (_end_repl, 'GH', 'E', _vow),
                             (_start_repl, 'CY', 'S'),
                             (_all_repl, 'NX', 'NKS'),
                             (_start_repl, 'PF', 'F'),
                             (_end_repl, 'DT', 'T'),
                             (_end_repl, 'TL', 'TIL'),
                             (_end_repl, 'DL', 'DIL'),
                             (_all_repl, 'YTH', 'ITH'),
                             (_start_repl, 'TJ', 'CH', _vow),
                             (_start_repl, 'TSJ', 'CH', _vow),
                             (_start_repl, 'TS', 'T', _vow),
                             (_all_repl, 'TCH', 'CH'),
                             (_mid_repl, 'WSK', 'VSKIE', _vow),
                             (_end_repl, 'WSK', 'VSKIE', _vow),
                             (_start_repl, 'MN', 'N', _vow),
                             (_start_repl, 'PN', 'N', _vow),
                             (_mid_repl, 'STL', 'SL', _vow),
                             (_end_repl, 'STL', 'SL', _vow),
                             (_end_repl, 'TNT', 'ENT'),
                             (_end_repl, 'EAUX', 'OH'),
                             (_all_repl, 'EXCI', 'ECS'),
                             (_all_repl, 'X', 'ECS'),
                             (_end_repl, 'NED', 'ND'),
                             (_all_repl, 'JR', 'DR'),
                             (_end_repl, 'EE', 'EA'),
                             (_all_repl, 'ZS', 'S'),
                             (_mid_repl, 'R', 'AH', _vow, _con),
                             (_end_repl, 'R', 'AH', _vow),
                             (_mid_repl, 'HR', 'AH', _vow, _con),
                             (_end_repl, 'HR', 'AH', _vow),
                             (_end_repl, 'HR', 'AH', _vow),
                             (_end_repl, 'RE', 'AR'),
                             (_end_repl, 'R', 'AH', _vow),
                             (_all_repl, 'LLE', 'LE'),
                             (_end_repl, 'LE', 'ILE', _con),
                             (_end_repl, 'LES', 'ILES', _con),
                             (_end_repl, 'E', ''),
                             (_end_repl, 'ES', 'S'),
                             (_end_repl, 'SS', 'AS', _vow),
                             (_end_repl, 'MB', 'M', _vow),
                             (_all_repl, 'MPTS', 'MPS'),
                             (_all_repl, 'MPS', 'MS'),
                             (_all_repl, 'MPT', 'MT'))

    # Letter -> digit table for str.translate(); '0' marks the letters that
    # are dropped from the code
    _phonix_translation = {
        ord(letter): digit
        for letter, digit in zip('ABCDEFGHIJKLMNOPQRSTUVWXYZ',
                                 '01230720022455012683070808')}
    _uc_letters = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    sdx = ''

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in _uc_letters)
    if word:
        for trans in _phonix_substitutions:
            replaced = trans[0](word, *trans[1:])
            # Never let a substitution (e.g. the final-E deletion applied to
            # the word 'E') erase the whole word; word[0] is needed below.
            if replaced:
                word = replaced
        # An initial vowel or Y is represented by the marker 'v'
        if word[0] in {'A', 'E', 'I', 'O', 'U', 'Y'}:
            sdx = 'v' + word[1:].translate(_phonix_translation)
        else:
            sdx = word[0] + word[1:].translate(_phonix_translation)
        sdx = _delete_consecutive_repeats(sdx)
        sdx = sdx.replace('0', '')

    # Clamp max_length to [4, 64]
    if max_length != -1:
        max_length = min(max(4, max_length), 64)
    else:
        max_length = 64

    if zero_pad:
        sdx += '0' * max_length
    if not sdx:
        sdx = '0'
    return sdx[:max_length]
632
633
634
def lein(word, max_length=4, zero_pad=True):
    """Return the Lein code for a word.

    This is Lein name coding, described in :cite:`Moore:1977`.

    :param str word: the word to transform
    :param int max_length: the maximum length (default 4) of the code to
        return; -1 selects a maximum length of 64
    :param bool zero_pad: pad the end of the return value with 0s to achieve a
        max_length string
    :returns: the Lein code
    :rtype: str

    >>> lein('Christopher')
    'C351'
    >>> lein('Niall')
    'N300'
    >>> lein('Smith')
    'S210'
    >>> lein('Schmidt')
    'S521'
    """
    # Consonant -> digit table for str.translate()
    _lein_translation = {
        ord(letter): digit
        for letter, digit in zip('BCDFGJKLMNPQRSTVXZ',
                                 '451455532245351455')}
    # Characters deleted by Rule 2: space, A, E, H, I, O, U, W, Y
    _lein_deletions = {32: None, 65: None, 69: None, 72: None,
                       73: None, 79: None, 85: None, 87: None,
                       89: None}
    _uc_letters = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    # -1 means "maximum length", as in the other functions of this module.
    # Used directly as a slice bound it would cut off the last character.
    if max_length == -1:
        max_length = 64

    # uppercase, normalize, decompose, and filter non-A-Z out
    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')
    word = ''.join(c for c in word if c in _uc_letters)

    code = word[:1]  # Rule 1
    word = word[1:].translate(_lein_deletions)  # Rule 2
    word = _delete_consecutive_repeats(word)  # Rule 3
    code += word.translate(_lein_translation)  # Rule 4

    if zero_pad:
        code += ('0'*max_length)  # Rule 4

    return code[:max_length]
678
679
680
def pshp_soundex_last(lname, max_length=4, german=False):
    """Calculate the PSHP Soundex/Viewex Coding of a last name.

    This coding is based on :cite:`Hershberg:1976`.

    Reference was also made to the German version of the same:
    :cite:`Hershberg:1979`.

    A separate function, pshp_soundex_first() is used for first names.

    :param str lname: the last name to encode
    :param int max_length: the length of the code returned (defaults to 4);
        with -1 the code is returned at its natural length, neither padded
        nor truncated
    :param bool german: set to True if the name is German (different rules
        apply)
    :returns: the PSHP Soundex/Viewex Coding
    :rtype: str

    >>> pshp_soundex_last('Smith')
    'S530'
    >>> pshp_soundex_last('Waters')
    'W350'
    >>> pshp_soundex_last('James')
    'J500'
    >>> pshp_soundex_last('Schmidt')
    'S530'
    >>> pshp_soundex_last('Ashcroft')
    'A225'
    """
    _uc_letters = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    # uppercase, normalize, decompose, and filter non-A-Z out
    lname = unicode_normalize('NFKD', text_type(lname.upper()))
    lname = lname.replace('ß', 'SS')
    lname = ''.join(c for c in lname if c in _uc_letters)

    # A. Prefix treatment
    if lname[:3] == 'VON' or lname[:3] == 'VAN':
        # no whitespace can remain after the filtering above, so the
        # remainder needs no stripping
        lname = lname[3:]

    # The rule implemented below says "MC, MAC become 1". I believe it meant to
    # say they become M except in German data (where superscripted 1 indicates
    # "except in German data"). It doesn't make sense for them to become 1
    # (BPFV -> 1) or to apply outside German. Unfortunately, both articles have
    # this error(?).
    if not german:
        if lname[:3] == 'MAC':
            lname = 'M'+lname[3:]
        elif lname[:2] == 'MC':
            lname = 'M'+lname[2:]

    # The non-German-only rule to strip ' is unnecessary due to filtering

    if lname[:1] in {'E', 'I', 'O', 'U'}:
        lname = 'A' + lname[1:]
    elif lname[:2] in {'GE', 'GI', 'GY'}:
        lname = 'J' + lname[1:]
    elif lname[:2] in {'CE', 'CI', 'CY'}:
        lname = 'S' + lname[1:]
    elif lname[:3] == 'CHR':
        lname = 'K' + lname[1:]
    elif lname[:1] == 'C' and lname[:2] != 'CH':
        lname = 'K' + lname[1:]

    if lname[:2] == 'KN':
        lname = 'N' + lname[1:]
    elif lname[:2] == 'PH':
        lname = 'F' + lname[1:]
    elif lname[:3] in {'WIE', 'WEI'}:
        lname = 'V' + lname[1:]

    if german and lname[:1] in {'W', 'M', 'Y', 'Z'}:
        lname = {'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}[lname[0]]+lname[1:]

    # The code starts with the (possibly rewritten) first letter
    code = lname[:1]

    # B. Postfix treatment
    if german:  # moved from end of postfix treatment due to blocking
        if lname[-3:] == 'TES':
            lname = lname[:-3]
        elif lname[-2:] == 'TS':
            lname = lname[:-2]
        if lname[-3:] == 'TZE':
            lname = lname[:-3]
        elif lname[-2:] == 'ZE':
            lname = lname[:-2]
        if lname[-1:] == 'Z':
            lname = lname[:-1]
        elif lname[-2:] == 'TE':
            lname = lname[:-2]

    if lname[-1:] == 'R':
        lname = lname[:-1] + 'N'
    elif lname[-2:] in {'SE', 'CE'}:
        lname = lname[:-2]
    if lname[-2:] == 'SS':
        lname = lname[:-2]
    elif lname[-1:] == 'S':
        lname = lname[:-1]

    if not german:
        l5_repl = {'STOWN': 'SAWON', 'MPSON': 'MASON'}
        l4_repl = {'NSEN': 'ASEN', 'MSON': 'ASON', 'STEN': 'SAEN',
                   'STON': 'SAON'}
        if lname[-5:] in l5_repl:
            lname = lname[:-5] + l5_repl[lname[-5:]]
        elif lname[-4:] in l4_repl:
            lname = lname[:-4] + l4_repl[lname[-4:]]

    if lname[-2:] in {'NG', 'ND'}:
        lname = lname[:-1]
    if not german and lname[-3:] in {'GAN', 'GEN'}:
        lname = lname[:-3]+'A'+lname[-2:]

    # C. Infix Treatment (applied in this order)
    for src, tar in (('CK', 'C'), ('SCH', 'S'), ('DT', 'T'), ('ND', 'N'),
                     ('NG', 'N'), ('LM', 'M'), ('MN', 'M'), ('WIE', 'VIE'),
                     ('WEI', 'VEI')):
        lname = lname.replace(src, tar)

    # D. Soundexing
    # code for X & Y are unspecified, but presumably are 2 & 0
    _pshp_translation = {
        ord(letter): digit
        for letter, digit in zip('ABCDEFGHIJKLMNOPQRSTUVWXYZ',
                                 '01230120022455012523010202')}

    lname = lname.translate(_pshp_translation)
    lname = _delete_consecutive_repeats(lname)

    # The first position is already represented by the letter in code
    code += lname[1:]
    code = code.replace('0', '')  # rule 1

    if max_length != -1:
        if len(code) < max_length:
            code += '0' * (max_length-len(code))
        else:
            code = code[:max_length]

    return code
823
824
825
# Letter-to-digit table for PSHP first-name coding, built once at import time
# instead of on every call.
# The code for Y is unspecified in the sources, but presumably is 0.
_PSHP_FIRST_TRANSLATION = dict(zip((ord(char) for char in
                                    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
                                   '01230120022455012523010202'))


def _pshp_first_prefix(fname, german=False):
    """Apply the PSHP first-name prefix treatment (step A).

    :param str fname: the uppercased first name, restricted to A-Z
    :param bool german: set to True to also apply the German initial-letter
        substitutions (W->V, M->N, Y->J, Z->S)
    :returns: the name with its initial letter rewritten where a rule applies
    :rtype: str
    """
    if fname[:2] in {'GE', 'GI', 'GY'}:
        fname = 'J' + fname[1:]
    elif fname[:2] in {'CE', 'CI', 'CY'}:
        fname = 'S' + fname[1:]
    elif fname[:3] == 'CHR' or (fname[:1] == 'C' and fname[:2] != 'CH'):
        # An initial C becomes K, except in CH that is not followed by R.
        fname = 'K' + fname[1:]

    # This group runs after the one above, so a K produced from an initial C
    # is also subject to the KN rule.
    if fname[:2] == 'KN':
        fname = 'N' + fname[1:]
    elif fname[:2] == 'PH':
        fname = 'F' + fname[1:]
    elif fname[:3] in {'WIE', 'WEI'}:
        fname = 'V' + fname[1:]

    if german and fname[:1] in {'W', 'M', 'Y', 'Z'}:
        fname = ({'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}[fname[0]] +
                 fname[1:])

    return fname


def pshp_soundex_first(fname, max_length=4, german=False):
    """Calculate the PSHP Soundex/Viewex Coding of a first name.

    This coding is based on :cite:`Hershberg:1976`.

    Reference was also made to the German version of the same:
    :cite:`Hershberg:1979`.

    A separate function, pshp_soundex_last() is used for last names.

    :param str fname: the first name to encode
    :param int max_length: the length of the code returned (defaults to 4);
        if -1, the code is neither zero-padded nor truncated
    :param bool german: set to True if the name is German (different rules
        apply)
    :returns: the PSHP Soundex/Viewex Coding
    :rtype: str

    >>> pshp_soundex_first('Smith')
    'S530'
    >>> pshp_soundex_first('Waters')
    'W352'
    >>> pshp_soundex_first('James')
    'J700'
    >>> pshp_soundex_first('Schmidt')
    'S500'
    >>> pshp_soundex_first('Ashcroft')
    'A220'
    >>> pshp_soundex_first('John')
    'J500'
    >>> pshp_soundex_first('Colin')
    'K400'
    >>> pshp_soundex_first('Niall')
    'N400'
    >>> pshp_soundex_first('Sally')
    'S400'
    >>> pshp_soundex_first('Jane')
    'J500'
    """
    # Uppercase, decompose accented characters, and keep only A-Z.
    fname = unicode_normalize('NFKD', text_type(fname.upper()))
    fname = fname.replace('ß', 'SS')
    fname = ''.join(c for c in fname if c in
                    {'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
                     'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
                     'W', 'X', 'Y', 'Z'})

    # special rules: these two names receive fixed codes
    if fname == 'JAMES':
        code = 'J7'
    elif fname == 'PAT':
        code = 'P7'
    else:
        # A. Prefix treatment
        fname = _pshp_first_prefix(fname, german)

        # The (possibly rewritten) initial letter is kept verbatim.
        code = fname[:1]

        # B. Soundex coding
        fname = fname.translate(_PSHP_FIRST_TRANSLATION)
        fname = _delete_consecutive_repeats(fname)

        code += fname[1:]

        # Conditionally cut the code one character past its first '0'.
        # NOTE(review): syl2_ptr is an offset into the slice that follows the
        # first '0', while syl_ptr is an absolute index, so the comparison
        # below mixes the two. It is kept unchanged because the documented
        # results depend on it (e.g. 'Smith' -> 'S530' relies on this test
        # being false) -- confirm the intended rule against the cited papers.
        syl_ptr = code.find('0')
        syl2_ptr = code[syl_ptr + 1:].find('0')
        if syl_ptr != -1 and syl2_ptr != -1 and syl2_ptr - syl_ptr > -1:
            code = code[:syl_ptr + 2]

        code = code.replace('0', '')  # rule 1

    # Zero-pad or truncate to exactly max_length, unless the limit is disabled.
    if max_length != -1:
        if len(code) < max_length:
            code += '0' * (max_length-len(code))
        else:
            code = code[:max_length]

    return code
924
925
926
if __name__ == '__main__':
    # Run this module's embedded doctests when it is executed as a script.
    import doctest
    doctest.testmod()
929