Completed
Pull Request — master (#138)
by Chris
14:20
created

abydos.phonetic._spfc.SPFC.encode()   F

Complexity

Conditions 21

Size

Total Lines 138
Code Lines 70

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 61
CRAP Score 21

Importance

Changes 0
Metric Value
eloc 70
dl 0
loc 138
ccs 61
cts 61
cp 1
rs 0
c 0
b 0
f 0
cc 21
nop 2
crap 21

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

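Commonly applied refactorings include Extract Method and, when many parameters or temporary variables are involved, Replace Temp with Query, Introduce Parameter Object, and Preserve Whole Object.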

Complexity

Complex classes like abydos.phonetic._spfc.SPFC.encode() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.phonetic._spfc.
20
21
The phonetic._spfc module implements the Standardized Phonetic Frequency Code
22
(SPFC) algorithm.
23
"""
24
25 1
from __future__ import unicode_literals
26
27 1
from unicodedata import normalize as unicode_normalize
28
29 1
from six import text_type
30 1
from six.moves import range
31
32 1
from ._phonetic import Phonetic
33
34 1
__all__ = ['SPFC', 'spfc']
35
36
37 1
class SPFC(Phonetic):
    """Standardized Phonetic Frequency Code (SPFC).

    Standardized Phonetic Frequency Code is roughly Soundex-like.
    This implementation is based on pages 19-21 of :cite:`Moore:1977`.
    """

    # Tables PF1, PF2 and PF3. Each maps the code point of a letter to its
    # digit, which is the mapping shape the string ``translate`` method takes.
    # Letters absent from a table are left unchanged by ``translate``.
    _pf1 = dict(
        zip(
            (ord(_) for _ in 'SZCKQVFPUWABLORDHIEMNXGJT'),
            '0011112222334445556666777',
        )
    )
    _pf2 = dict(
        zip(
            (ord(_) for _ in 'SZCKQFPXABORDHIMNGJTUVWEL'),
            '0011122233445556677788899',
        )
    )
    _pf3 = dict(
        zip(
            (ord(_) for _ in 'BCKQVDTFLPGJXMNRSZAEHIOUWY'),
            '00000112223334456677777777',
        )
    )

    # Step 1: digraph replacements, applied in this order.
    _substitutions = (
        ('DK', 'K'),
        ('DT', 'T'),
        ('SC', 'S'),
        ('KN', 'N'),
        ('MN', 'N'),
    )

    # Step 3: letters removed from every position except the first.
    _dropped_letters = frozenset('AEHIOUWY')

    # Step 5: multi-letter name endings and the digit each one yields. The
    # first matching entry wins, so the three-letter endings must stay ahead
    # of the two-letter endings they contain (STN before TN, STR before TR).
    _endings = (
        ('STN', '8'),
        ('PRS', '8'),
        ('SN', '8'),
        ('STR', '9'),
        ('SR', '9'),
        ('TN', '9'),
        ('TD', '9'),
        ('DRS', '7'),
        ('TR', '7'),
        ('MN', '7'),
    )

    @staticmethod
    def _split_word(word):
        """Split the input into its first-name and last-name parts.

        :param word: a string holding both names separated by a period or a
            space, or a two-item iterable (e.g. tuple or list) of the names
        :returns: the two names, first name first
        :raises AttributeError: if a string contains neither separator, if an
            iterable does not have exactly two items, or if the input is
            neither a string nor an iterable
        """
        if isinstance(word, (str, text_type)):
            # A period takes precedence over a space as the separator.
            for separator in ('.', ' '):
                names = word.split(separator, 1)
                if len(names) == 2:
                    return names
        elif hasattr(word, '__iter__') and len(word) == 2:
            return word

        raise AttributeError(
            'word attribute must be a string with a space or period '
            'dividing the first and last names or a tuple/list '
            'consisting of the first and last names'
        )

    @staticmethod
    def _normalize(name):
        """Strip, expand eszett to SS, uppercase and NFKD-normalize a name."""
        cleaned = name.strip().replace('ß', 'SS').upper()
        return unicode_normalize('NFKD', text_type(cleaned))

    def _steps_one_to_three(self, name):
        """Perform the first three steps of SPFC on a normalized name.

        :param str name: an uppercased, normalized name
        :returns: the name after substitution, de-duplication of repeated
            letters and removal of vowels, W, H and Y past the first letter
        :rtype: str
        """
        # Keep only the characters found in the inherited uppercase set.
        name = ''.join(char for char in name if char in self._uc_set)

        # 1. In the field, convert DK to K, DT to T, SC to S, KN to N,
        # and MN to N
        for old, new in self._substitutions:
            name = name.replace(old, new)

        # 2. In the name field, replace multiple letters with a single
        # letter
        name = self._delete_consecutive_repeats(name)

        # 3. Remove vowels, W, H, and Y, but keep the first letter in the
        # name field.
        if name:
            name = name[0] + ''.join(
                char for char in name[1:] if char not in self._dropped_letters
            )
        return name

    def _code_ending(self, name):
        """Code the end of a non-empty name field (step 5).

        :param str name: the non-empty remainder of the last name
        :returns: a ``(digit, remainder)`` pair, where ``remainder`` is the
            name with the coded ending removed
        :rtype: tuple
        """
        for ending, digit in self._endings:
            if name.endswith(ending):
                return digit, name[: -len(ending)]
        # No special ending matched: code the final letter with Table PF3.
        return name[-1].translate(self._pf3), name[:-1]

    def encode(self, word):
        """Return the Standardized Phonetic Frequency Code (SPFC) of a word.

        :param str word: the word to transform: the first and last names
            separated by a period or a space, or a two-item tuple/list of
            the first and last names
        :returns: the SPFC value, or an empty string if ``word`` is empty
        :rtype: str
        :raises AttributeError: if ``word`` cannot be split into exactly a
            first name and a last name

        >>> pe = SPFC()
        >>> pe.encode('Christopher Smith')
        '01160'
        >>> pe.encode('Christopher Schmidt')
        '01160'
        >>> pe.encode('Niall Smith')
        '01660'
        >>> pe.encode('Niall Schmidt')
        '01660'

        >>> pe.encode('L.Smith')
        '01960'
        >>> pe.encode('R.Miller')
        '65490'

        >>> pe.encode(('L', 'Smith'))
        '01960'
        >>> pe.encode(('R', 'Miller'))
        '65490'
        """
        if not word:
            return ''

        first, last = [
            self._steps_one_to_three(self._normalize(name))
            for name in self._split_word(word)
        ]
        code = ''

        # 4. The first digit of the code is obtained using PF1 and the first
        # letter of the name field. Remove this letter after coding.
        if last:
            code += last[0].translate(self._pf1)
            last = last[1:]

        # 5. Using the last letters of the name, use Table PF3 to obtain the
        # second digit of the code. Use as many letters as possible and remove
        # after coding.
        if last:
            digit, last = self._code_ending(last)
            code += digit

        # 6. The third digit is found using Table PF2 and the first character
        # of the first name. Remove after coding.
        if first:
            code += first[0].translate(self._pf2)

        # 7. The fourth digit is found using Table PF2 and the first character
        # of the name field. If no letters remain use zero. After coding remove
        # the letter.
        # 8. The fifth digit is found in the same manner as the fourth using
        # the remaining characters of the name field if any.
        for _ in range(2):
            if last:
                code += last[0].translate(self._pf2)
                last = last[1:]
            else:
                code += '0'

        return code
209
210
211 1
def spfc(word):
    """Return the Standardized Phonetic Frequency Code (SPFC) of a word.

    This is a wrapper for :py:meth:`SPFC.encode`.

    :param str word: the word to transform
    :returns: the SPFC value
    :rtype: str

    >>> spfc('Christopher Smith')
    '01160'
    >>> spfc('Christopher Schmidt')
    '01160'
    >>> spfc('Niall Smith')
    '01660'
    >>> spfc('Niall Schmidt')
    '01660'

    >>> spfc('L.Smith')
    '01960'
    >>> spfc('R.Miller')
    '65490'

    >>> spfc(('L', 'Smith'))
    '01960'
    >>> spfc(('R', 'Miller'))
    '65490'
    """
    # A fresh encoder is created on every call and the work is delegated to it.
    encoder = SPFC()
    return encoder.encode(word)
240
241
242
if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in the docstrings
    # of this module.
    from doctest import testmod

    testmod()
246