Completed
Branch master (78a222)
by Chris
14:36
created

abydos.phonetic._spfc.spfc()   F

Complexity

Conditions 21

Size

Total Lines 198
Code Lines 119

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 65
CRAP Score 21

Importance

Changes 0
Metric Value
eloc 119
dl 0
loc 198
ccs 65
cts 65
cp 1
rs 0
c 0
b 0
f 0
cc 21
nop 1
crap 21

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

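Commonly applied refactorings include Extract Method and, when many parameters or temporary variables are present, Replace Temp with Query, Introduce Parameter Object, and Preserve Whole Object.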

Complexity

Complex units like abydos.phonetic._spfc.spfc() (here a single function rather than a class) often do a lot of different things. To break such a unit down, we need to identify a cohesive component within it. A common approach to find such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.phonetic._spfc.
20
21
The phonetic._spfc module implements the Standardized Phonetic Frequency Code
22
(SPFC) algorithm.
23
"""
24
25 1
from __future__ import unicode_literals
26
27 1
from unicodedata import normalize as unicode_normalize
28
29 1
from six import text_type
30 1
from six.moves import range
31
32 1
from ._util import _delete_consecutive_repeats
33
34 1
__all__ = ['spfc']
35
36
37 1
# Translation tables (code point -> digit) for the three SPFC tables.
# They are built once at import time instead of on every call.
# NOTE: PF1 and PF2 have no entry for 'Y'; str.translate leaves unmapped
# characters unchanged, so a name starting with 'Y' keeps a literal 'Y' in
# the corresponding position of the code.
_SPFC_PF1 = dict(
    zip(
        (ord(char) for char in 'SZCKQVFPUWABLORDHIEMNXGJT'),
        '0011112222334445556666777',
    )
)
_SPFC_PF2 = dict(
    zip(
        (ord(char) for char in 'SZCKQFPXABORDHIMNGJTUVWEL'),
        '0011122233445556677788899',
    )
)
_SPFC_PF3 = dict(
    zip(
        (ord(char) for char in 'BCKQVDTFLPGJXMNRSZAEHIOUWY'),
        '00000112223334456677777777',
    )
)

# Step 1 digraph replacements, applied in this order.
_SPFC_SUBSTITUTIONS = (
    ('DK', 'K'),
    ('DT', 'T'),
    ('SC', 'S'),
    ('KN', 'N'),
    ('MN', 'N'),
)

# Multi-letter endings and their step 5 digit. Order matters: an ending
# that contains a shorter one (STN/TN, STR/TR) must be tried first.
_SPFC_SUFFIX_CODES = (
    ('STN', '8'),
    ('PRS', '8'),
    ('SN', '8'),
    ('STR', '9'),
    ('SR', '9'),
    ('TN', '9'),
    ('TD', '9'),
    ('DRS', '7'),
    ('TR', '7'),
    ('MN', '7'),
)

# Characters kept by the initial filter, and those dropped in step 3.
_SPFC_LETTERS = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
_SPFC_DROPPED = frozenset('AEHIOUWY')

_SPFC_WORD_ERROR = (
    'word attribute must be a string with a space '
    'or period dividing the first and last names '
    'or a tuple/list consisting of the first and '
    'last names'
)


def _spfc_split_names(word):
    """Return the two raw names in word or raise AttributeError."""
    if isinstance(word, (str, text_type)):
        # A period takes precedence over a space as the divider.
        for separator in ('.', ' '):
            names = word.split(separator, 1)
            if len(names) == 2:
                return names
        raise AttributeError(_SPFC_WORD_ERROR)
    if hasattr(word, '__iter__') and len(word) == 2:
        return word
    raise AttributeError(_SPFC_WORD_ERROR)


def _spfc_normalize(name):
    """Strip, expand eszett, uppercase and NFKD-decompose a raw name."""
    return unicode_normalize(
        'NFKD', text_type(name.strip().replace('ß', 'SS').upper())
    )


def _spfc_steps_one_to_three(name):
    """Perform the first three steps of SPFC on a normalized name."""
    # Filter out everything that is not A-Z (this also discards the
    # combining marks produced by the NFKD decomposition).
    name = ''.join(char for char in name if char in _SPFC_LETTERS)

    # 1. In the field, convert DK to K, DT to T, SC to S, KN to N,
    # and MN to N
    for old, new in _SPFC_SUBSTITUTIONS:
        name = name.replace(old, new)

    # 2. In the name field, replace multiple letters with a single letter
    name = _delete_consecutive_repeats(name)

    # 3. Remove vowels, W, H, and Y, but keep the first letter in the name
    # field. (name[:1] is '' for an empty name, so no guard is needed.)
    return name[:1] + ''.join(
        char for char in name[1:] if char not in _SPFC_DROPPED
    )


def _spfc_code_suffix(name):
    """Return (digit, remainder) for the ending of a non-empty name."""
    for suffix, digit in _SPFC_SUFFIX_CODES:
        if name.endswith(suffix):
            return digit, name[: -len(suffix)]
    # No special ending: code the single last letter with table PF3.
    return name[-1].translate(_SPFC_PF3), name[:-1]


def spfc(word):
    """Return the Standardized Phonetic Frequency Code (SPFC) of a word.

    Standardized Phonetic Frequency Code is roughly Soundex-like.
    This implementation is based on page 19-21 of :cite:`Moore:1977`.

    :param word: the first and last names to transform, either as one
        string divided by a period or a space (the period is tried first)
        or as a two-element tuple/list of (first name, last name)
    :returns: the SPFC value, or an empty string if word is empty
    :rtype: str
    :raises AttributeError: if word is neither a string containing a period
        or space nor a two-element iterable

    >>> spfc('Christopher Smith')
    '01160'
    >>> spfc('Christopher Schmidt')
    '01160'
    >>> spfc('Niall Smith')
    '01660'
    >>> spfc('Niall Schmidt')
    '01660'

    >>> spfc('L.Smith')
    '01960'
    >>> spfc('R.Miller')
    '65490'

    >>> spfc(('L', 'Smith'))
    '01960'
    >>> spfc(('R', 'Miller'))
    '65490'
    """
    if not word:
        return ''

    names = [
        _spfc_steps_one_to_three(_spfc_normalize(name))
        for name in _spfc_split_names(word)
    ]
    first, last = names[0], names[1]
    code = ''

    # 4. The first digit of the code is obtained using PF1 and the first
    # letter of the name field. Remove this letter after coding.
    if last:
        code += last[0].translate(_SPFC_PF1)
        last = last[1:]

    # 5. Using the last letters of the name, use Table PF3 to obtain the
    # second digit of the code. Use as many letters as possible and remove
    # after coding.
    if last:
        digit, last = _spfc_code_suffix(last)
        code += digit

    # 6. The third digit is found using Table PF2 and the first character of
    # the first name.
    if first:
        code += first[0].translate(_SPFC_PF2)

    # 7. The fourth digit is found using Table PF2 and the first character of
    # the name field. If no letters remain use zero.
    # 8. The fifth digit is found in the same manner as the fourth using the
    # remaining characters of the name field if any.
    # Translating the next (up to) two letters at once and right-padding
    # with '0' yields exactly those two digits.
    code += last[:2].translate(_SPFC_PF2).ljust(2, '0')

    return code
235
236
237
if __name__ == '__main__':
    # Run the doctests embedded in this module when executed as a script.
    from doctest import testmod

    testmod()
241