abydos.phonetic._soundex   A
last analyzed

Complexity

Total Complexity 17

Size/Duplication

Total Lines 244
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
wmc 17
eloc 60
dl 0
loc 244
ccs 34
cts 34
cp 1
rs 10
c 0
b 0
f 0

3 Methods

Rating   Name   Duplication   Size   Complexity  
A Soundex.__init__() 0 47 2
A Soundex.encode_alpha() 0 31 1
F Soundex.encode() 0 101 14
# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._soundex.

American Soundex
"""
21
22
from typing import Any
23
from unicodedata import normalize as unicode_normalize
24 1
25
from ._phonetic import _Phonetic
26
27
__all__ = ['Soundex']
28
29
30
class Soundex(_Phonetic):
    """Soundex.

    Three variants of Soundex are implemented:

    - 'American' follows the American Soundex algorithm, as described at
      :cite:`US:2007` and in :cite:`Knuth:1998`; this is also called
      Miracode
    - 'special' follows the rules from the 1880-1910 US Census
      retrospective re-analysis, in which h & w are not treated as blocking
      consonants but as vowels. Cf. :cite:`Repici:2013`.
    - 'Census' follows the rules laid out in GIL 55 :cite:`US:1997` by the
      US Census, including coding prefixed and unprefixed versions of some
      names

    .. versionadded:: 0.3.6
    """

    # Code point of each uppercase letter -> Soundex digit. '0' marks the
    # letters dropped as vowels (A, E, I, O, U, Y), '9' marks H and W, and
    # '1'-'6' are the consonant classes.
    _trans = dict(
        zip(
            map(ord, 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
            '01230129022455012623019202',
        )
    )

    # Code point of each Soundex digit -> representative letter, used by
    # encode_alpha to spell a numeric code alphabetically.
    _alphabetic = dict(zip(map(ord, '01234569'), 'APKTLNRH'))

    # Census variant: (prefix length, prefixes) pairs, checked in order.
    # A name with one of these prefixes is coded both with and without it,
    # provided more than one character follows the prefix.
    _census_prefixes = (
        (3, frozenset({'VAN', 'CON'})),
        (2, frozenset({'DE', 'DI', 'LA', 'LE'})),
    )

    def __init__(
        self,
        max_length: int = 4,
        var: str = 'American',
        reverse: bool = False,
        zero_pad: bool = True,
    ) -> None:
        """Initialize Soundex instance.

        Parameters
        ----------
        max_length : int
            The length of the code returned (defaults to 4). Values are
            clamped to the range 4..64; -1 selects the maximum of 64.
        var : str
            The variant of the algorithm to employ (defaults to ``American``):

                - ``American`` follows the American Soundex algorithm, as
                  described at :cite:`US:2007` and in :cite:`Knuth:1998`; this
                  is also called Miracode
                - ``special`` follows the rules from the 1880-1910 US Census
                  retrospective re-analysis, in which h & w are not treated as
                  blocking consonants but as vowels. Cf. :cite:`Repici:2013`.
                - ``Census`` follows the rules laid out in GIL 55
                  :cite:`US:1997` by the US Census, including coding prefixed
                  and unprefixed versions of some names

        reverse : bool
            Reverse the word before computing the selected Soundex (defaults to
            False); This results in "Reverse Soundex", which is useful for
            blocking in cases where the initial elements may be in error.
        zero_pad : bool
            Pad the end of the return value with 0s to achieve a max_length
            string


        .. versionadded:: 0.4.0

        """
        # Require a max_length of at least 4 and not more than 64
        if max_length != -1:
            self._max_length = min(max(4, max_length), 64)
        else:
            self._max_length = 64

        self._var = var
        self._reverse = reverse
        self._zero_pad = zero_pad

    def _code_to_alpha(self, code: str) -> str:
        """Return the alphabetic spelling of a single numeric Soundex code.

        Trailing zero padding is dropped, the leading letter is kept as-is,
        and every remaining digit is mapped through ``_alphabetic``.
        """
        code = code.rstrip('0')
        return code[:1] + code[1:].translate(self._alphabetic)

    def encode_alpha(self, word: str) -> str:
        """Return the alphabetic Soundex code for a word.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The alphabetic Soundex value. When the ``Census`` variant yields
            two comma-separated codes (prefixed and unprefixed), each code is
            converted on its own and the results are re-joined with a comma.

        Examples
        --------
        >>> pe = Soundex()
        >>> pe.encode_alpha("Christopher")
        'CRKT'
        >>> pe.encode_alpha("Niall")
        'NL'
        >>> pe.encode_alpha('Smith')
        'SNT'
        >>> pe.encode_alpha('Schmidt')
        'SNT'

        >>> Soundex(var='Census').encode_alpha('Conway')
        'CN,W'


        .. versionadded:: 0.4.0

        """
        # encode() may return 'CODE1,CODE2' in Census mode. Converting the
        # joined string as a whole would strip padding only from the last
        # code and turn the first code's padding zeros into 'A's, so each
        # code is handled separately.
        return ','.join(
            self._code_to_alpha(code) for code in self.encode(word).split(',')
        )

    def encode(self, word: str, **kwargs: Any) -> str:
        """Return the Soundex code for a word.

        Parameters
        ----------
        word : str
            The word to transform
        **kwargs : Any
            Only ``recurse`` is consulted: passing ``recurse=False``
            suppresses the ``Census`` prefix handling. It is used internally
            when coding the prefixed and unprefixed forms of a name.

        Returns
        -------
        str
            The Soundex value. In ``Census`` mode, a name carrying one of the
            recognized prefixes yields two codes joined by a comma.

        Examples
        --------
        >>> pe = Soundex()
        >>> pe.encode("Christopher")
        'C623'
        >>> pe.encode("Niall")
        'N400'
        >>> pe.encode('Smith')
        'S530'
        >>> pe.encode('Schmidt')
        'S530'

        >>> Soundex(max_length=-1).encode('Christopher')
        'C623160000000000000000000000000000000000000000000000000000000000'
        >>> Soundex(max_length=-1, zero_pad=False).encode('Christopher')
        'C62316'

        >>> Soundex(reverse=True).encode('Christopher')
        'R132'

        >>> pe.encode('Ashcroft')
        'A261'
        >>> pe.encode('Asicroft')
        'A226'

        >>> pe_special = Soundex(var='special')
        >>> pe_special.encode('Ashcroft')
        'A226'
        >>> pe_special.encode('Asicroft')
        'A226'


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class
        .. versionchanged:: 0.6.0
            Made return a str only (comma-separated)

        """
        # uppercase, normalize, and decompose
        word = unicode_normalize('NFKD', word.upper())

        if self._var == 'Census' and kwargs.get('recurse', True) is not False:
            for prefix_len, prefixes in self._census_prefixes:
                if (
                    word[:prefix_len] in prefixes
                    and len(word) > prefix_len + 1
                ):
                    # Code the name both with and without its prefix;
                    # recurse=False keeps the inner calls from splitting again.
                    return ','.join(
                        (
                            self.encode(word, recurse=False),
                            self.encode(word[prefix_len:], recurse=False),
                        )
                    )
            # Otherwise, proceed as usual (var='American' mode, ostensibly)

        # filter out everything that is not in the inherited uppercase set
        word = ''.join(c for c in word if c in self._uc_set)

        # Nothing to convert, return base case
        if not word:
            if self._zero_pad:
                return '0' * self._max_length
            return '0'

        # Reverse word if computing Reverse Soundex
        if self._reverse:
            word = word[::-1]

        # apply the Soundex algorithm
        sdx = word.translate(self._trans)

        if self._var == 'special':
            sdx = sdx.replace('9', '0')  # special rule for 1880-1910 census
        else:
            sdx = sdx.replace('9', '')  # rule 1
        sdx = self._delete_consecutive_repeats(sdx)  # rule 3

        # The code always starts with the word's first letter. For a leading
        # H or W the letter is prepended to the digits (its own '9' was
        # already dropped, or turned into a '0' that is removed just below);
        # for any other letter it replaces that letter's leading digit.
        if word[0] in 'HW':
            sdx = word[0] + sdx
        else:
            sdx = word[0] + sdx[1:]
        sdx = sdx.replace('0', '')  # rule 1

        if self._zero_pad:
            sdx += '0' * self._max_length  # rule 4

        return sdx[: self._max_length]
238
239
240 1
if __name__ == '__main__':
    # Executed as a script: run the doctest examples embedded in this module.
    from doctest import testmod

    testmod()
244