Completed
Branch master (78a222)
by Chris
14:36
created

abydos.phonetic._spfc   A

Complexity

Total Complexity 21

Size/Duplication

Total Lines 241
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
wmc 21
eloc 129
dl 0
loc 241
ccs 72
cts 72
cp 1
rs 10
c 0
b 0
f 0

1 Function

Rating   Name   Duplication   Size   Complexity  
F spfc() 0 198 21
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.phonetic._spfc.
20
21
The phonetic._spfc module implements the Standardized Phonetic Frequency Code
22
(SPFC) algorithm.
23
"""
24
25 1
from __future__ import unicode_literals
26
27 1
from unicodedata import normalize as unicode_normalize
28
29 1
from six import text_type
30 1
from six.moves import range
31
32 1
from ._util import _delete_consecutive_repeats
33
34 1
__all__ = ['spfc']
35
36
37 1
def spfc(word):
    """Return the Standardized Phonetic Frequency Code (SPFC) of a word.

    Standardized Phonetic Frequency Code is roughly Soundex-like.
    This implementation is based on page 19-21 of :cite:`Moore:1977`.

    The input must supply exactly two names (first name, last name). A string
    is split once on the first period or, if it contains no period, once on
    the first space. Any other iterable must yield exactly two items.

    The code is normally five characters long. It is shorter when a name
    contains no A-Z letters after normalization, because the digits derived
    from that name are skipped rather than padded. Letters that are missing
    from a lookup table (``Y`` is absent from both PF1 and PF2) are copied
    into the code unchanged instead of being mapped to a digit.

    :param word: the name to transform, either a single string or a
        two-item iterable ``(first_name, last_name)``
    :type word: str or iterable
    :returns: the SPFC value, or an empty string if ``word`` is falsy
    :rtype: str
    :raises AttributeError: if ``word`` is neither a string nor an iterable,
        or if it cannot be resolved into exactly two names

    >>> spfc('Christopher Smith')
    '01160'
    >>> spfc('Christopher Schmidt')
    '01160'
    >>> spfc('Niall Smith')
    '01660'
    >>> spfc('Niall Schmidt')
    '01660'

    >>> spfc('L.Smith')
    '01960'
    >>> spfc('R.Miller')
    '65490'

    >>> spfc(('L', 'Smith'))
    '01960'
    >>> spfc(('R', 'Miller'))
    '65490'
    """
    # Translation tables keyed by code point, as str.translate() requires.
    # PF1 codes the first letter of the last name, PF2 codes the first letter
    # of the first name and the leading letters left in the last name, and
    # PF3 codes the final letter of the last name.
    _pf1 = dict(
        zip(
            (ord(letter) for letter in 'SZCKQVFPUWABLORDHIEMNXGJT'),
            '0011112222334445556666777',
        )
    )
    _pf2 = dict(
        zip(
            (ord(letter) for letter in 'SZCKQFPXABORDHIMNGJTUVWEL'),
            '0011122233445556677788899',
        )
    )
    _pf3 = dict(
        zip(
            (ord(letter) for letter in 'BCKQVDTFLPGJXMNRSZAEHIOUWY'),
            '00000112223334456677777777',
        )
    )

    _substitutions = (
        ('DK', 'K'),
        ('DT', 'T'),
        ('SC', 'S'),
        ('KN', 'N'),
        ('MN', 'N'),
    )

    # Multi-letter endings recognised in step 5, paired with their digit.
    # The order is significant: the first matching ending wins, so the
    # three-letter 'STN'/'STR' must be tried before the two-letter 'TN'/'TR'.
    _endings = (
        ('STN', '8'),
        ('PRS', '8'),
        ('SN', '8'),
        ('STR', '9'),
        ('SR', '9'),
        ('TN', '9'),
        ('TD', '9'),
        ('DRS', '7'),
        ('TR', '7'),
        ('MN', '7'),
    )

    _letters = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    _dropped = frozenset('AEHIOUWY')

    def _raise_word_ex():
        """Raise an AttributeError."""
        raise AttributeError(
            'word attribute must be a string with a space '
            'or period dividing the first and last names '
            'or a tuple/list consisting of the first and '
            'last names'
        )

    if not word:
        return ''

    names = []
    if isinstance(word, (str, text_type)):
        # A period takes precedence over a space as the divider.
        names = word.split('.', 1)
        if len(names) != 2:
            names = word.split(' ', 1)
            if len(names) != 2:
                _raise_word_ex()
    elif hasattr(word, '__iter__'):
        # Materialize first so that iterables without __len__ (generators,
        # for instance) are validated instead of failing inside len().
        names = list(word)
        if len(names) != 2:
            _raise_word_ex()
    else:
        _raise_word_ex()

    names = [
        unicode_normalize(
            'NFKD', text_type(part.strip().replace('ß', 'SS').upper())
        )
        for part in names
    ]

    def steps_one_to_three(name):
        """Perform the first three steps of SPFC."""
        # filter out non A-Z
        name = ''.join(char for char in name if char in _letters)

        # 1. In the field, convert DK to K, DT to T, SC to S, KN to N,
        # and MN to N
        for source, target in _substitutions:
            name = name.replace(source, target)

        # 2. In the name field, replace multiple letters with a single letter
        name = _delete_consecutive_repeats(name)

        # 3. Remove vowels, W, H, and Y, but keep the first letter in the name
        # field.
        if name:
            name = name[0] + ''.join(
                char for char in name[1:] if char not in _dropped
            )
        return name

    first, last = [steps_one_to_three(part) for part in names]
    code = ''

    # 4. The first digit of the code is obtained using PF1 and the first letter
    # of the name field. Remove this letter after coding.
    if last:
        code += last[0].translate(_pf1)
        last = last[1:]

    # 5. Using the last letters of the name, use Table PF3 to obtain the
    # second digit of the code. Use as many letters as possible and remove
    # after coding.
    if last:
        for ending, digit in _endings:
            if last.endswith(ending):
                code += digit
                last = last[: -len(ending)]
                break
        else:
            # No multi-letter ending applies; code the final letter alone.
            code += last[-1].translate(_pf3)
            last = last[:-1]

    # 6. The third digit is found using Table PF2 and the first character of
    # the first name. Remove after coding.
    if first:
        code += first[0].translate(_pf2)

    # 7. The fourth digit is found using Table PF2 and the first character of
    # the name field. If no letters remain use zero. After coding remove the
    # letter.
    # 8. The fifth digit is found in the same manner as the fourth using the
    # remaining characters of the name field if any.
    for _ in range(2):
        if last:
            code += last[0].translate(_pf2)
            last = last[1:]
        else:
            code += '0'

    return code
235
236
237
if __name__ == '__main__':
    # Run the examples embedded in this module's docstrings when the file is
    # executed directly as a script.
    from doctest import testmod

    testmod()
241