Completed
Push — master ( f43547...71985b )
by Chris
12:00 queued 10s
created

abydos.phonetic._fuzzy_soundex.fuzzy_soundex()   A

Complexity

Conditions 1

Size

Total Lines 32
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 2
nop 3
dl 0
loc 32
ccs 2
cts 2
cp 1
crap 1
rs 10
c 0
b 0
f 0
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.phonetic._fuzzy_soundex.
20
21
Fuzzy Soundex
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from unicodedata import normalize as unicode_normalize
32
33 1
from six import text_type
34
35 1
from ._phonetic import _Phonetic
36
37 1
__all__ = ['FuzzySoundex', 'fuzzy_soundex']
38
39
40 1
class FuzzySoundex(_Phonetic):
    """Fuzzy Soundex phonetic encoder.

    Implements Fuzzy Soundex, a Soundex-derived algorithm defined in
    :cite:`Holmes:2002`.
    """

    # Code point of each uppercase letter A-Z -> its Fuzzy Soundex symbol.
    # Letters mapped to '-' (H, W, Y) are removed right after translation.
    _trans = {
        ord(letter): symbol
        for letter, symbol in zip(
            'ABCDEFGHIJKLMNOPQRSTUVWXYZ', '0193017-07745501769301-7-9'
        )
    }

    # Two-letter word beginnings and the pair each one is rewritten to.
    _prefix_rewrites = {
        'CS': 'SS',
        'CZ': 'SS',
        'TS': 'SS',
        'TZ': 'SS',
        'GN': 'NN',
        'HR': 'RR',
        'WR': 'RR',
        'HW': 'WW',
        'KN': 'NN',
        'NG': 'NN',
    }

    # Word endings and their rewrites; only the first matching rule is used.
    _suffix_rewrites = (
        ('CH', 'KK'),
        ('NT', 'TT'),
        ('RT', 'RR'),
        ('RDT', 'RR'),
    )

    # Letter-cluster substitutions. They are applied one after another, so
    # the order of this table is significant and must not be changed.
    _cluster_rewrites = (
        ('CA', 'KA'),
        ('CC', 'KK'),
        ('CK', 'KK'),
        ('CE', 'SE'),
        ('CHL', 'KL'),
        ('CL', 'KL'),
        ('CHR', 'KR'),
        ('CR', 'KR'),
        ('CI', 'SI'),
        ('CO', 'KO'),
        ('CU', 'KU'),
        ('CY', 'SY'),
        ('DG', 'GG'),
        ('GH', 'HH'),
        ('MAC', 'MK'),
        ('MC', 'MK'),
        ('NST', 'NSS'),
        ('PF', 'FF'),
        ('PH', 'FF'),
        ('SCH', 'SSS'),
        ('TIO', 'SIO'),
        ('TIA', 'SIO'),
        ('TCH', 'CHH'),
    )

    def encode(self, word, max_length=5, zero_pad=True):
        """Return the Fuzzy Soundex code for a word.

        Parameters
        ----------
        word : str
            The word to transform
        max_length : int
            The length of the code returned (defaults to 5). Values are
            clamped to the range [4, 64]; -1 selects the maximum of 64.
        zero_pad : bool
            Pad the end of the return value with 0s to achieve a max_length
            string

        Returns
        -------
        str
            The Fuzzy Soundex value. For an empty word this is a string of
            max_length 0s when zero_pad is set, otherwise a single '0'.

        Examples
        --------
        >>> pe = FuzzySoundex()
        >>> pe.encode('Christopher')
        'K6931'
        >>> pe.encode('Niall')
        'N4000'
        >>> pe.encode('Smith')
        'S5300'

        """
        # Uppercase, coerce to text, then apply NFKD normalization.
        uppercased = text_type(word.upper())
        word = unicode_normalize('NFKD', uppercased).replace('ß', 'SS')

        # -1 requests the longest code; anything else is clamped to [4, 64].
        if max_length == -1:
            max_length = 64
        else:
            max_length = min(max(4, max_length), 64)

        if not word:
            return '0' * max_length if zero_pad else '0'

        # Rewrite a recognized two-letter beginning.
        prefix_rewrite = self._prefix_rewrites.get(word[:2])
        if prefix_rewrite is not None:
            word = prefix_rewrite + word[2:]

        # Rewrite a recognized ending (first matching rule only).
        for ending, ending_rewrite in self._suffix_rewrites:
            if word.endswith(ending):
                word = word[: -len(ending)] + ending_rewrite
                break

        # Apply the cluster substitutions sequentially, in table order.
        for cluster, cluster_rewrite in self._cluster_rewrites:
            word = word.replace(cluster, cluster_rewrite)

        # Translate letters to symbols, drop the '-' placeholders, and then
        # collapse runs of identical symbols.
        symbols = word.translate(self._trans).replace('-', '')
        symbols = self._delete_consecutive_repeats(symbols)

        # The code begins with the (rewritten) word's first letter. H, W and
        # Y produced no symbol of their own, so nothing is displaced for
        # them; for any other letter its own leading symbol is replaced.
        initial = word[0]
        if initial not in {'H', 'W', 'Y'}:
            symbols = symbols[1:]
        sdx = (initial + symbols).replace('0', '')

        if zero_pad:
            sdx = sdx.ljust(max_length, '0')

        return sdx[:max_length]
160
161
162 1
def fuzzy_soundex(word, max_length=5, zero_pad=True):
    """Return the Fuzzy Soundex code for a word.

    This is a wrapper for :py:meth:`FuzzySoundex.encode`.

    Parameters
    ----------
    word : str
        The word to transform
    max_length : int
        The length of the code returned (defaults to 5)
    zero_pad : bool
        Pad the end of the return value with 0s to achieve a max_length string

    Returns
    -------
    str
        The Fuzzy Soundex value

    Examples
    --------
    >>> fuzzy_soundex('Christopher')
    'K6931'
    >>> fuzzy_soundex('Niall')
    'N4000'
    >>> fuzzy_soundex('Smith')
    'S5300'

    """
    # Build a fresh encoder for each call and delegate to its encode method.
    encoder = FuzzySoundex()
    return encoder.encode(word, max_length=max_length, zero_pad=zero_pad)
194
195
196
if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in this module.
    from doctest import testmod

    testmod()
200