abydos.phonetic.spfc - Code Metrics - Inspection of "Merge pull request #120 from chrislit/modularize" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Test Failed

Push — master ( 64abe2...a464fa )

by Chris

created 2018-10-19 22:32 UTC

abydos.phonetic.spfc A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	189
Duplicated Lines	0 %

Importance

Changes

Metric	Value
eloc	91
dl	0
loc	189
rs	10
c	0
b	0
f	0
wmc	21

1 Function

Rating	Name	Duplication	Size	Complexity
F	spfc()	0	147	21

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic.spfc.

The phonetic.spfc module implements the Standardized Phonetic Frequency Code
(SPFC) algorithm.
"""

from __future__ import unicode_literals

from unicodedata import normalize as unicode_normalize

from six import text_type
from six.moves import range

from . import _delete_consecutive_repeats

__all__ = ['spfc']


# Tables PF1, PF2 and PF3 of the SPFC algorithm, keyed by code point so they
# can be handed straight to ``translate``. They never change, so they are
# built once at import time instead of on every call to spfc().
# NOTE: PF1 and PF2 have no entry for 'Y'; ``translate`` copies unmapped
# characters through unchanged.
_SPFC_PF1 = dict(zip((ord(char) for char in 'SZCKQVFPUWABLORDHIEMNXGJT'),
                     '0011112222334445556666777'))
_SPFC_PF2 = dict(zip((ord(char) for char in 'SZCKQFPXABORDHIMNGJTUVWEL'),
                     '0011122233445556677788899'))
_SPFC_PF3 = dict(zip((ord(char) for char in 'BCKQVDTFLPGJXMNRSZAEHIOUWY'),
                     '00000112223334456677777777'))

# Step 1 digraph rewrites, applied in this order.
_SPFC_SUBSTITUTIONS = (('DK', 'K'), ('DT', 'T'), ('SC', 'S'), ('KN', 'N'),
                       ('MN', 'N'))

# Multi-letter name endings and the digit each one yields in step 5. The
# order matters: the first matching entry wins, so 'STN' must precede 'TN'
# and 'STR' must precede 'TR'.
_SPFC_ENDINGS = (('STN', '8'), ('PRS', '8'), ('SN', '8'),
                 ('STR', '9'), ('SR', '9'), ('TN', '9'), ('TD', '9'),
                 ('DRS', '7'), ('TR', '7'), ('MN', '7'))

# Characters kept by the A-Z filter, and characters dropped by step 3.
_SPFC_LETTERS = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
_SPFC_DROPPED = frozenset('AEHIOUWY')


def _spfc_split_names(word):
    """Split the argument of spfc() into a [first name, last name] pair.

    :param word: a string holding both names separated by a period or a
        space, or an iterable yielding exactly the two names
    :returns: the two names, unprocessed
    :rtype: list
    :raises AttributeError: if two names cannot be obtained from word
    """
    if isinstance(word, (str, text_type)):
        # A period takes precedence over a space as the separator.
        for separator in ('.', ' '):
            names = word.split(separator, 1)
            if len(names) == 2:
                return names
    elif hasattr(word, '__iter__'):
        # Materialise first: generators and other iterators have no len(),
        # so counting the argument itself would raise a stray TypeError.
        names = list(word)
        if len(names) == 2:
            return names

    raise AttributeError('word attribute must be a string with a space '
                         'or period dividing the first and last names '
                         'or a tuple/list consisting of the first and '
                         'last names')


def _spfc_reduce_name(name):
    """Perform steps 1-3 of SPFC on a single upper-cased name field.

    :param str name: the name field to reduce
    :returns: the reduced name, possibly empty
    :rtype: str
    """
    # Filter out everything that is not A-Z.
    name = ''.join(char for char in name if char in _SPFC_LETTERS)

    # 1. In the field, convert DK to K, DT to T, SC to S, KN to N,
    # and MN to N
    for old, new in _SPFC_SUBSTITUTIONS:
        name = name.replace(old, new)

    # 2. In the name field, replace multiple letters with a single letter
    name = _delete_consecutive_repeats(name)

    # 3. Remove vowels, W, H, and Y, but keep the first letter in the name
    # field.
    if name:
        name = name[0] + ''.join(char for char in name[1:]
                                 if char not in _SPFC_DROPPED)
    return name


def _spfc_code_ending(name):
    """Code the ending of a non-empty name field (step 5 of SPFC).

    :param str name: the non-empty remainder of the last name
    :returns: the digit for the ending and the name with that ending removed
    :rtype: tuple
    """
    for ending, digit in _SPFC_ENDINGS:
        if name.endswith(ending):
            return digit, name[:-len(ending)]
    # No special ending applies: code the final letter alone using PF3.
    return name[-1].translate(_SPFC_PF3), name[:-1]


def spfc(word):
    """Return the Standardized Phonetic Frequency Code (SPFC) of a word.

    Standardized Phonetic Frequency Code is roughly Soundex-like.
    This implementation is based on page 19-21 of :cite:`Moore:1977`.

    :param word: the first and last names to transform, either as one string
        in which a period or a space divides them, or as an iterable (tuple,
        list, generator, ...) yielding exactly the first and the last name
    :returns: the SPFC value; the empty string if word is empty
    :rtype: str
    :raises AttributeError: if word is neither a string containing a period
        or space nor an iterable of exactly two names

    >>> spfc('Christopher Smith')
    '01160'
    >>> spfc('Christopher Schmidt')
    '01160'
    >>> spfc('Niall Smith')
    '01660'
    >>> spfc('Niall Schmidt')
    '01660'

    >>> spfc('L.Smith')
    '01960'
    >>> spfc('R.Miller')
    '65490'

    >>> spfc(('L', 'Smith'))
    '01960'
    >>> spfc(('R', 'Miller'))
    '65490'
    """
    if not word:
        return ''

    # Upper-case and decompose (NFKD) both names before any letter filtering.
    names = [unicode_normalize('NFKD', text_type(name.strip()
                                                 .replace('ß', 'SS')
                                                 .upper()))
             for name in _spfc_split_names(word)]
    first, last = [_spfc_reduce_name(name) for name in names]

    code = ''

    # 4. The first digit of the code is obtained using PF1 and the first letter
    # of the name field. Remove this letter after coding.
    if last:
        code += last[0].translate(_SPFC_PF1)
        last = last[1:]

    # 5. Using the last letters of the name, use Table PF3 to obtain the
    # second digit of the code. Use as many letters as possible and remove
    # after coding.
    if last:
        digit, last = _spfc_code_ending(last)
        code += digit

    # 6. The third digit is found using Table PF2 and the first character of
    # the first name. Remove after coding.
    if first:
        code += first[0].translate(_SPFC_PF2)

    # 7. The fourth digit is found using Table PF2 and the first character of
    # the name field. If no letters remain use zero. After coding remove the
    # letter.
    # 8. The fifth digit is found in the same manner as the fourth using the
    # remaining characters of the name field if any.
    code += last[:2].translate(_SPFC_PF2).ljust(2, '0')

    return code


if __name__ == '__main__':
    # Executed as a script: run the doctest examples embedded in this module.
    from doctest import testmod
    testmod()


1			# -*- coding: utf-8 -*-
2
3			# Copyright 2014-2018 by Christopher C. Little.
4			# This file is part of Abydos.
5			#
6			# Abydos is free software: you can redistribute it and/or modify
7			# it under the terms of the GNU General Public License as published by
8			# the Free Software Foundation, either version 3 of the License, or
9			# (at your option) any later version.
10			#
11			# Abydos is distributed in the hope that it will be useful,
12			# but WITHOUT ANY WARRANTY; without even the implied warranty of
13			# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14			# GNU General Public License for more details.
15			#
16			# You should have received a copy of the GNU General Public License
17			# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19			"""abydos.phonetic.spfc.
20
21			The phonetic.spfc module implements the Standardized Phonetic Frequency Code
22			(SPFC) algorithm.
23			"""
24
25			from __future__ import unicode_literals
26
27			from unicodedata import normalize as unicode_normalize
28
29			from six import text_type
30			from six.moves import range
31
32			from . import _delete_consecutive_repeats
33
34			__all__ = ['spfc']
35
36
37			def spfc(word):
38			"""Return the Standardized Phonetic Frequency Code (SPFC) of a word.
39
40			Standardized Phonetic Frequency Code is roughly Soundex-like.
41			This implementation is based on page 19-21 of :cite:`Moore:1977`.
42
43			:param str word: the word to transform
44			:returns: the SPFC value
45			:rtype: str
46
47			>>> spfc('Christopher Smith')
48			'01160'
49			>>> spfc('Christopher Schmidt')
50			'01160'
51			>>> spfc('Niall Smith')
52			'01660'
53			>>> spfc('Niall Schmidt')
54			'01660'
55
56			>>> spfc('L.Smith')
57			'01960'
58			>>> spfc('R.Miller')
59			'65490'
60
61			>>> spfc(('L', 'Smith'))
62			'01960'
63			>>> spfc(('R', 'Miller'))
64			'65490'
65			"""
66			_pf1 = dict(zip((ord(_) for _ in 'SZCKQVFPUWABLORDHIEMNXGJT'),
			0 ignored issues – show Comprehensibility Best Practice introduced 2018-08-02 19:04 UTC by Report Bug Copy Issue Report The variable `_` does not seem to be defined. Loading history...
67			'0011112222334445556666777'))
68			_pf2 = dict(zip((ord(_) for _ in
69			'SZCKQFPXABORDHIMNGJTUVWEL'),
70			'0011122233445556677788899'))
71			_pf3 = dict(zip((ord(_) for _ in
72			'BCKQVDTFLPGJXMNRSZAEHIOUWY'),
73			'00000112223334456677777777'))
74
75			_substitutions = (('DK', 'K'), ('DT', 'T'), ('SC', 'S'), ('KN', 'N'),
76			('MN', 'N'))
77
78			def _raise_word_ex():
79			"""Raise an AttributeError."""
80			raise AttributeError('word attribute must be a string with a space ' +
81			'or period dividing the first and last names ' +
82			'or a tuple/list consisting of the first and ' +
83			'last names')
84
85			if not word:
86			return ''
87
88			names = []
89			if isinstance(word, (str, text_type)):
90			names = word.split('.', 1)
91			if len(names) != 2:
92			names = word.split(' ', 1)
93			if len(names) != 2:
94			_raise_word_ex()
95			elif hasattr(word, '__iter__'):
96			if len(word) != 2:
97			_raise_word_ex()
98			names = word
99			else:
100			_raise_word_ex()
101
102			names = [unicode_normalize('NFKD', text_type(_.strip()
103			.replace('ß', 'SS')
104			.upper()))
105			for _ in names]
106			code = ''
107
108			def steps_one_to_three(name):
109			"""Perform the first three steps of SPFC."""
110			# filter out non A-Z
111			name = ''.join(_ for _ in name if _ in
112			{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K',
113			'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V',
114			'W', 'X', 'Y', 'Z'})
115
116			# 1. In the field, convert DK to K, DT to T, SC to S, KN to N,
117			# and MN to N
118			for subst in _substitutions:
119			name = name.replace(subst[0], subst[1])
120
121			# 2. In the name field, replace multiple letters with a single letter
122			name = _delete_consecutive_repeats(name)
123
124			# 3. Remove vowels, W, H, and Y, but keep the first letter in the name
125			# field.
126			if name:
127			name = name[0] + ''.join(_ for _ in name[1:] if _ not in
128			{'A', 'E', 'H', 'I', 'O', 'U', 'W', 'Y'})
129			return name
130
131			names = [steps_one_to_three(_) for _ in names]
132
133			# 4. The first digit of the code is obtained using PF1 and the first letter
134			# of the name field. Remove this letter after coding.
135			if names[1]:
136			code += names[1][0].translate(_pf1)
137			names[1] = names[1][1:]
138
139			# 5. Using the last letters of the name, use Table PF3 to obtain the
140			# second digit of the code. Use as many letters as possible and remove
141			# after coding.
142			if names[1]:
143			if names[1][-3:] == 'STN' or names[1][-3:] == 'PRS':
144			code += '8'
145			names[1] = names[1][:-3]
146			elif names[1][-2:] == 'SN':
147			code += '8'
148			names[1] = names[1][:-2]
149			elif names[1][-3:] == 'STR':
150			code += '9'
151			names[1] = names[1][:-3]
152			elif names[1][-2:] in {'SR', 'TN', 'TD'}:
153			code += '9'
154			names[1] = names[1][:-2]
155			elif names[1][-3:] == 'DRS':
156			code += '7'
157			names[1] = names[1][:-3]
158			elif names[1][-2:] in {'TR', 'MN'}:
159			code += '7'
160			names[1] = names[1][:-2]
161			else:
162			code += names[1][-1].translate(_pf3)
163			names[1] = names[1][:-1]
164
165			# 6. The third digit is found using Table PF2 and the first character of
166			# the first name. Remove after coding.
167			if names[0]:
168			code += names[0][0].translate(_pf2)
169			names[0] = names[0][1:]
170
171			# 7. The fourth digit is found using Table PF2 and the first character of
172			# the name field. If no letters remain use zero. After coding remove the
173			# letter.
174			# 8. The fifth digit is found in the same manner as the fourth using the
175			# remaining characters of the name field if any.
176			for _ in range(2):
177			if names[1]:
178			code += names[1][0].translate(_pf2)
179			names[1] = names[1][1:]
180			else:
181			code += '0'
182
183			return code
184
185
186			if __name__ == '__main__':
187			import doctest
188			doctest.testmod()
189

chrislit / abydos

Push — master ( 64abe2...a464fa )

abydos.phonetic.spfc A

Complexity

Size/Duplication

Importance

1 Function

Duplication Side-by-Side

Filter issues like