Passed
Push — master ( c2a3b6...15a61d )
by Chris
01:00 queued 14s
created

abydos.phonetic._soundex.Soundex.encode_alpha()   A

Complexity

Conditions 1

Size

Total Lines 31
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 7
CRAP Score 1

Importance

Changes 0
Metric Value
eloc 3
dl 0
loc 31
ccs 7
cts 7
cp 1
rs 10
c 0
b 0
f 0
cc 1
nop 2
crap 1
1
# Copyright 2014-2020 by Christopher C. Little.
2
# This file is part of Abydos.
3
#
4
# Abydos is free software: you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation, either version 3 of the License, or
7
# (at your option) any later version.
8
#
9
# Abydos is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
# GNU General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17
"""abydos.phonetic._soundex.
18
19 1
American Soundex
20
"""
21
22
from unicodedata import normalize as unicode_normalize
23
24 1
from ._phonetic import _Phonetic
25
26
# Names exported by ``from <this module> import *``: only the Soundex class.
__all__ = ['Soundex']
27
28
29
class Soundex(_Phonetic):
    """Soundex.

    Three variants of Soundex are implemented:

    - 'American' follows the American Soundex algorithm, as described at
      :cite:`US:2007` and in :cite:`Knuth:1998`; this is also called
      Miracode
    - 'special' follows the rules from the 1880-1910 US Census
      retrospective re-analysis, in which h & w are not treated as blocking
      consonants but as vowels. Cf. :cite:`Repici:2013`.
    - 'Census' follows the rules laid out in GIL 55 :cite:`US:1997` by the
      US Census, including coding prefixed and unprefixed versions of some
      names

    .. versionadded:: 0.3.6
    """

    # Letter -> Soundex digit, keyed by code point so the table can be passed
    # straight to str.translate. '0' is assigned to A, E, I, O, U and Y; '9'
    # is assigned to H and W, which encode() handles per variant.
    _trans = {
        ord(letter): digit
        for letter, digit in zip(
            'ABCDEFGHIJKLMNOPQRSTUVWXYZ', '01230129022455012623019202'
        )
    }

    # Soundex digit -> representative letter, used by encode_alpha().
    _alphabetic = {
        ord(digit): letter for digit, letter in zip('01234569', 'APKTLNRH')
    }

    def __init__(
        self, max_length=4, var='American', reverse=False, zero_pad=True
    ):
        """Initialize Soundex instance.

        Parameters
        ----------
        max_length : int
            The length of the code returned (defaults to 4). Values are
            clamped to the range 4..64; the special value -1 selects the
            maximum of 64.
        var : str
            The variant of the algorithm to employ (defaults to ``American``):

                - ``American`` follows the American Soundex algorithm, as
                  described at :cite:`US:2007` and in :cite:`Knuth:1998`; this
                  is also called Miracode
                - ``special`` follows the rules from the 1880-1910 US Census
                  retrospective re-analysis, in which h & w are not treated as
                  blocking consonants but as vowels. Cf. :cite:`Repici:2013`.
                - ``Census`` follows the rules laid out in GIL 55
                  :cite:`US:1997` by the US Census, including coding prefixed
                  and unprefixed versions of some names

        reverse : bool
            Reverse the word before computing the selected Soundex (defaults to
            False); This results in "Reverse Soundex", which is useful for
            blocking in cases where the initial elements may be in error.
        zero_pad : bool
            Pad the end of the return value with 0s to achieve a max_length
            string (defaults to True)


        .. versionadded:: 0.4.0

        """
        # Require a max_length of at least 4 and not more than 64;
        # -1 is the "as long as allowed" sentinel.
        if max_length != -1:
            self._max_length = min(max(4, max_length), 64)
        else:
            self._max_length = 64

        self._var = var
        self._reverse = reverse
        self._zero_pad = zero_pad

    def _digits_to_alpha(self, code):
        """Convert a single Soundex code string to its alphabetic form.

        Parameters
        ----------
        code : str
            A Soundex code as produced by :meth:`encode`

        Returns
        -------
        str
            The code with trailing ``0`` padding removed and every digit
            after the first character replaced by its representative letter

        """
        code = code.rstrip('0')
        return code[:1] + code[1:].translate(self._alphabetic)

    def encode_alpha(self, word):
        """Return the alphabetic Soundex code for a word.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str or tuple of str
            The alphabetic Soundex value. When :meth:`encode` returns a pair
            of codes (``Census`` variant with a recognised name prefix), a
            tuple with the alphabetic form of each code is returned.

        Examples
        --------
        >>> pe = Soundex()
        >>> pe.encode_alpha("Christopher")
        'CRKT'
        >>> pe.encode_alpha("Niall")
        'NL'
        >>> pe.encode_alpha('Smith')
        'SNT'
        >>> pe.encode_alpha('Schmidt')
        'SNT'

        >>> Soundex(var='Census').encode_alpha('Vandeusen')
        ('VNTK', 'DKN')


        .. versionadded:: 0.4.0

        """
        code = self.encode(word)
        # The Census variant may yield (prefixed, unprefixed) codes; calling
        # str methods on that tuple directly would raise AttributeError.
        if isinstance(code, tuple):
            return tuple(self._digits_to_alpha(part) for part in code)
        return self._digits_to_alpha(code)

    def encode(self, word, **kwargs):
        """Return the Soundex code for a word.

        Parameters
        ----------
        word : str
            The word to transform
        **kwargs
            ``recurse=False`` is passed internally by the ``Census`` variant
            to suppress prefix splitting on its nested calls.

        Returns
        -------
        str or tuple of str
            The Soundex value. For the ``Census`` variant, words starting
            with VAN or CON (and longer than 4 characters) or with DE, DI,
            LA or LE (and longer than 3 characters) yield a 2-tuple: the
            code of the whole word and the code of the word without the
            prefix.

        Examples
        --------
        >>> pe = Soundex()
        >>> pe.encode("Christopher")
        'C623'
        >>> pe.encode("Niall")
        'N400'
        >>> pe.encode('Smith')
        'S530'
        >>> pe.encode('Schmidt')
        'S530'

        >>> Soundex(max_length=-1).encode('Christopher')
        'C623160000000000000000000000000000000000000000000000000000000000'
        >>> Soundex(max_length=-1, zero_pad=False).encode('Christopher')
        'C62316'

        >>> Soundex(reverse=True).encode('Christopher')
        'R132'

        >>> pe.encode('Ashcroft')
        'A261'
        >>> pe.encode('Asicroft')
        'A226'

        >>> pe_special = Soundex(var='special')
        >>> pe_special.encode('Ashcroft')
        'A226'
        >>> pe_special.encode('Asicroft')
        'A226'


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # uppercase and decompose (NFKD); filtering happens further below
        word = unicode_normalize('NFKD', word.upper())

        # Prefix splitting applies only to the outermost Census call; the
        # nested calls below pass recurse=False to stop after one level.
        split_prefixes = kwargs.get('recurse', True) is not False
        if self._var == 'Census' and split_prefixes:
            if word[:3] in {'VAN', 'CON'} and len(word) > 4:
                return (
                    self.encode(word, recurse=False),
                    self.encode(word[3:], recurse=False),
                )
            if word[:2] in {'DE', 'DI', 'LA', 'LE'} and len(word) > 3:
                return (
                    self.encode(word, recurse=False),
                    self.encode(word[2:], recurse=False),
                )
            # Otherwise, proceed as usual (var='American' mode, ostensibly)

        # Keep only characters found in self._uc_set (defined on the base
        # class; presumably the uppercase letters A-Z -- confirm there).
        word = ''.join(c for c in word if c in self._uc_set)

        # Nothing to convert, return base case
        if not word:
            if self._zero_pad:
                return '0' * self._max_length
            return '0'

        # Reverse word if computing Reverse Soundex
        if self._reverse:
            word = word[::-1]

        # apply the Soundex algorithm
        sdx = word.translate(self._trans)

        if self._var == 'special':
            sdx = sdx.replace('9', '0')  # special rule for 1880-1910 census
        else:
            sdx = sdx.replace('9', '')  # rule 1
        sdx = self._delete_consecutive_repeats(sdx)  # rule 3

        # The first character of the code is the word's first letter itself.
        # A leading H/W is prepended in front of the digit string, whereas
        # any other first letter replaces the first digit in place.
        if word[0] in 'HW':
            sdx = word[0] + sdx
        else:
            sdx = word[0] + sdx[1:]
        sdx = sdx.replace('0', '')  # rule 1

        if self._zero_pad:
            sdx += '0' * self._max_length  # rule 4

        return sdx[: self._max_length]
231
232
233
if __name__ == '__main__':
    # When executed as a script, run the doctest examples embedded in the
    # docstrings of this module.
    from doctest import testmod

    testmod()
237