abydos.phonetic._spfc.SPFC.encode() - Code Metrics - Inspection of "0.3.6" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#141)

by Chris

created 2018-11-08 03:44 UTC

abydos.phonetic._spfc.SPFC.encode() F

↳ Parent: abydos.phonetic._spfc

Complexity

Conditions

Size

Total Lines	162
Code Lines	70

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	61
CRAP Score	21

Importance

Changes

Metric	Value
eloc	70
dl	0
loc	162
ccs	61
cts	61
cp	1
rs	0
c	0
b	0
f	0
cc	21
nop	2
crap	21

How to fix Long Method Complexity

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._spfc.

The phonetic._spfc module implements the Standardized Phonetic Frequency Code
(SPFC) algorithm.
"""

from __future__ import unicode_literals

from unicodedata import normalize as unicode_normalize

from six import text_type
from six.moves import range

from ._phonetic import Phonetic

__all__ = ['SPFC', 'spfc']


class SPFC(Phonetic):
    """Standardized Phonetic Frequency Code (SPFC).

    Standardized Phonetic Frequency Code is roughly Soundex-like.
    This implementation is based on page 19-21 of :cite:`Moore:1977`.
    """

    # Translation tables mapping a letter's code point to a single digit.
    # PF1 codes the first letter of the last name (step 4).
    _pf1 = dict(
        zip(
            (ord(_) for _ in 'SZCKQVFPUWABLORDHIEMNXGJT'),
            '0011112222334445556666777',
        )
    )
    # PF2 codes the first letter of the first name (step 6) and the leading
    # letters that remain in the last name (steps 7 and 8).
    _pf2 = dict(
        zip(
            (ord(_) for _ in 'SZCKQFPXABORDHIMNGJTUVWEL'),
            '0011122233445556677788899',
        )
    )
    # PF3 codes the final letter of the last name when none of the
    # multi-letter endings in _suffix_codes applies (step 5).
    _pf3 = dict(
        zip(
            (ord(_) for _ in 'BCKQVDTFLPGJXMNRSZAEHIOUWY'),
            '00000112223334456677777777',
        )
    )

    # Step 1 digraph simplifications, applied in this order.
    _substitutions = (
        ('DK', 'K'),
        ('DT', 'T'),
        ('SC', 'S'),
        ('KN', 'N'),
        ('MN', 'N'),
    )

    # Letters removed in step 3 from every position except the first.
    _dropped_letters = frozenset('AEHIOUWY')

    # Step 5 multi-letter endings and their digits, in the order they are
    # tested. Every three-letter ending precedes the two-letter ending it
    # contains (STN before TN, STR before TR), so the longest match wins.
    _suffix_codes = (
        ('STN', '8'),
        ('PRS', '8'),
        ('SN', '8'),
        ('STR', '9'),
        ('SR', '9'),
        ('TN', '9'),
        ('TD', '9'),
        ('DRS', '7'),
        ('TR', '7'),
        ('MN', '7'),
    )

    @staticmethod
    def _raise_word_ex():
        """Raise the AttributeError used for every malformed input.

        Raises:
            AttributeError: Word attribute must be a string with a space or
                period dividing the first and last names or a tuple/list
                consisting of the first and last names

        """
        raise AttributeError(
            'Word attribute must be a string with a space or period '
            'dividing the first and last names or a tuple/list '
            'consisting of the first and last names'
        )

    def _split_names(self, word):
        """Split the input into a [first name, last name] pair.

        Args:
            word (str or iterable): Either a string in which a period or,
                failing that, a space separates the two names, or an
                iterable holding exactly two names

        Returns:
            list: The two names, unnormalized

        Raises:
            AttributeError: The input cannot be split into exactly two names

        """
        if isinstance(word, (str, text_type)):
            # A period takes precedence over a space as the divider.
            names = word.split('.', 1)
            if len(names) != 2:
                names = word.split(' ', 1)
            if len(names) != 2:
                self._raise_word_ex()
            return names

        if hasattr(word, '__iter__'):
            # Materialize first so that iterables without __len__ (e.g.
            # generators) get the documented AttributeError rather than a
            # TypeError from len().
            names = list(word)
            if len(names) != 2:
                self._raise_word_ex()
            return names

        self._raise_word_ex()

    def _steps_one_to_three(self, name):
        """Perform the first three steps of SPFC.

        Args:
            name (str): Uppercased, NFKD-normalized name to transform

        Returns:
            str: Transformed name

        """
        # filter out non A-Z
        name = ''.join(_ for _ in name if _ in self._uc_set)

        # 1. In the field, convert DK to K, DT to T, SC to S, KN to N,
        # and MN to N
        for old, new in self._substitutions:
            name = name.replace(old, new)

        # 2. In the name field, replace multiple letters with a single
        # letter
        name = self._delete_consecutive_repeats(name)

        # 3. Remove vowels, W, H, and Y, but keep the first letter in the
        # name field.
        if name:
            name = name[0] + ''.join(
                _ for _ in name[1:] if _ not in self._dropped_letters
            )
        return name

    def _code_ending(self, name):
        """Code the ending of the last name (step 5).

        Args:
            name (str): Non-empty remainder of the last name

        Returns:
            tuple: The digit for the ending and the name with that ending
            removed

        """
        for suffix, digit in self._suffix_codes:
            if name.endswith(suffix):
                return digit, name[: -len(suffix)]
        # No special ending: code just the final letter with PF3.
        return name[-1].translate(self._pf3), name[:-1]

    def encode(self, word):
        """Return the Standardized Phonetic Frequency Code (SPFC) of a word.

        Letters that have no entry in the relevant table (for example Y in
        PF1 and PF2) are passed through unchanged by ``translate``.

        Args:
            word (str or iterable): The name to transform: a string with a
                space or period dividing the first and last names, or an
                iterable consisting of the first and last names

        Returns:
            str: The SPFC value; the empty string if ``word`` is falsy

        Raises:
            AttributeError: Word attribute must be a string with a space or
                period dividing the first and last names or a tuple/list
                consisting of the first and last names

        Examples:
            >>> pe = SPFC()
            >>> pe.encode('Christopher Smith')
            '01160'
            >>> pe.encode('Christopher Schmidt')
            '01160'
            >>> pe.encode('Niall Smith')
            '01660'
            >>> pe.encode('Niall Schmidt')
            '01660'

            >>> pe.encode('L.Smith')
            '01960'
            >>> pe.encode('R.Miller')
            '65490'

            >>> pe.encode(('L', 'Smith'))
            '01960'
            >>> pe.encode(('R', 'Miller'))
            '65490'

        """
        if not word:
            return ''

        names = [
            unicode_normalize(
                'NFKD', text_type(_.strip().replace('ß', 'SS').upper())
            )
            for _ in self._split_names(word)
        ]
        first, last = [self._steps_one_to_three(_) for _ in names]
        code = ''

        # 4. The first digit of the code is obtained using PF1 and the first
        # letter of the name field. Remove this letter after coding.
        if last:
            code += last[0].translate(self._pf1)
            last = last[1:]

        # 5. Using the last letters of the name, use Table PF3 to obtain the
        # second digit of the code. Use as many letters as possible and remove
        # after coding.
        if last:
            digit, last = self._code_ending(last)
            code += digit

        # 6. The third digit is found using Table PF2 and the first character
        # of the first name. Remove after coding.
        if first:
            code += first[0].translate(self._pf2)

        # 7. The fourth digit is found using Table PF2 and the first character
        # of the name field. If no letters remain use zero. After coding remove
        # the letter.
        # 8. The fifth digit is found in the same manner as the fourth using
        # the remaining characters of the name field if any.
        for _ in range(2):
            if last:
                code += last[0].translate(self._pf2)
                last = last[1:]
            else:
                code += '0'

        return code


def spfc(word):
    """Compute the Standardized Phonetic Frequency Code (SPFC) of a name.

    Convenience function that creates an :py:class:`SPFC` encoder and
    delegates to :py:meth:`SPFC.encode`.

    Args:
        word (str): The name to encode; the examples below also show a
            two-item tuple of first and last name being accepted

    Returns:
        str: The SPFC value

    Examples:
        >>> spfc('Christopher Smith')
        '01160'
        >>> spfc('Christopher Schmidt')
        '01160'
        >>> spfc('Niall Smith')
        '01660'
        >>> spfc('Niall Schmidt')
        '01660'

        >>> spfc('L.Smith')
        '01960'
        >>> spfc('R.Miller')
        '65490'

        >>> spfc(('L', 'Smith'))
        '01960'
        >>> spfc(('R', 'Miller'))
        '65490'

    """
    encoder = SPFC()
    return encoder.encode(word)


if __name__ == '__main__':
    # Executed as a script: run the doctest examples embedded in this
    # module's docstrings.
    import doctest

    doctest.testmod()


1		# -*- coding: utf-8 -*-
2
3		# Copyright 2014-2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.phonetic._spfc.
20
21		The phonetic._spfc module implements the Standardized Phonetic Frequency Code
22		(SPFC) algorithm.
23		"""
24
25	1	from __future__ import unicode_literals
26
27	1	from unicodedata import normalize as unicode_normalize
28
29	1	from six import text_type
30	1	from six.moves import range
31
32	1	from ._phonetic import Phonetic
33
34	1	__all__ = ['SPFC', 'spfc']
35
36
37	1	class SPFC(Phonetic):
		0 ignored issues – show Unused Code introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report The variable `__class__` seems to be unused. Loading history...
38		"""Standardized Phonetic Frequency Code (SPFC).
39
40		Standardized Phonetic Frequency Code is roughly Soundex-like.
41		This implementation is based on page 19-21 of :cite:`Moore:1977`.
42		"""
43
44	1	_pf1 = dict(
45		zip(
46		(ord(_) for _ in 'SZCKQVFPUWABLORDHIEMNXGJT'),
		0 ignored issues – show Comprehensibility Best Practice introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report The variable `_` does not seem to be defined. Loading history...
47		'0011112222334445556666777',
48		)
49		)
50	1	_pf2 = dict(
51		zip(
52		(ord(_) for _ in 'SZCKQFPXABORDHIMNGJTUVWEL'),
53		'0011122233445556677788899',
54		)
55		)
56	1	_pf3 = dict(
57		zip(
58		(ord(_) for _ in 'BCKQVDTFLPGJXMNRSZAEHIOUWY'),
59		'00000112223334456677777777',
60		)
61		)
62
63	1	_substitutions = (
64		('DK', 'K'),
65		('DT', 'T'),
66		('SC', 'S'),
67		('KN', 'N'),
68		('MN', 'N'),
69		)
70
71	1	def encode(self, word):
72		"""Return the Standardized Phonetic Frequency Code (SPFC) of a word.
73
74		Args:
75		word (str): The word to transform
76
77		Returns:
78		str: The SPFC value
79
80		Raises:
81		AttributeError: Word attribute must be a string with a space or
82		period dividing the first and last names or a tuple/list
83		consisting of the first and last names
84
85		Examples:
86		>>> pe = SPFC()
87		>>> pe.encode('Christopher Smith')
88		'01160'
89		>>> pe.encode('Christopher Schmidt')
90		'01160'
91		>>> pe.encode('Niall Smith')
92		'01660'
93		>>> pe.encode('Niall Schmidt')
94		'01660'
95
96		>>> pe.encode('L.Smith')
97		'01960'
98		>>> pe.encode('R.Miller')
99		'65490'
100
101		>>> pe.encode(('L', 'Smith'))
102		'01960'
103		>>> pe.encode(('R', 'Miller'))
104		'65490'
105
106		"""
107
108	1	def _raise_word_ex():
109		"""Raise an AttributeError.
110
111		Raises:
112		AttributeError: Word attribute must be a string with a space or
113		period dividing the first and last names or a tuple/list
114		consisting of the first and last names
115
116		"""
117	1	raise AttributeError(
118		'Word attribute must be a string with a space or period '
119		+ 'dividing the first and last names or a tuple/list '
120		+ 'consisting of the first and last names'
121		)
122
123	1	if not word:
124	1	return ''
125
126	1	names = []
127	1	if isinstance(word, (str, text_type)):
128	1	names = word.split('.', 1)
129	1	if len(names) != 2:
130	1	names = word.split(' ', 1)
131	1	if len(names) != 2:
132	1	_raise_word_ex()
133	1	elif hasattr(word, '__iter__'):
134	1	if len(word) != 2:
135	1	_raise_word_ex()
136	1	names = word
137		else:
138	1	_raise_word_ex()
139
140	1	names = [
141		unicode_normalize(
142		'NFKD', text_type(_.strip().replace('ß', 'SS').upper())
143		)
144		for _ in names
145		]
146	1	code = ''
147
148	1	def _steps_one_to_three(name):
149		"""Perform the first three steps of SPFC.
150
151		Args:
152		name (str): Name to transform
153
154		Returns:
155		str: Transformed name
156
157		"""
158		# filter out non A-Z
159	1	name = ''.join(_ for _ in name if _ in self._uc_set)
160
161		# 1. In the field, convert DK to K, DT to T, SC to S, KN to N,
162		# and MN to N
163	1	for subst in self._substitutions:
164	1	name = name.replace(subst[0], subst[1])
165
166		# 2. In the name field, replace multiple letters with a single
167		# letter
168	1	name = self._delete_consecutive_repeats(name)
169
170		# 3. Remove vowels, W, H, and Y, but keep the first letter in the
171		# name field.
172	1	if name:
173	1	name = name[0] + ''.join(
174		_
175		for _ in name[1:]
176		if _ not in {'A', 'E', 'H', 'I', 'O', 'U', 'W', 'Y'}
177		)
178	1	return name
179
180	1	names = [_steps_one_to_three(_) for _ in names]
181
182		# 4. The first digit of the code is obtained using PF1 and the first
183		# letter of the name field. Remove this letter after coding.
184	1	if names[1]:
185	1	code += names[1][0].translate(self._pf1)
186	1	names[1] = names[1][1:]
187
188		# 5. Using the last letters of the name, use Table PF3 to obtain the
189		# second digit of the code. Use as many letters as possible and remove
190		# after coding.
191	1	if names[1]:
192	1	if names[1][-3:] == 'STN' or names[1][-3:] == 'PRS':
193	1	code += '8'
194	1	names[1] = names[1][:-3]
195	1	elif names[1][-2:] == 'SN':
196	1	code += '8'
197	1	names[1] = names[1][:-2]
198	1	elif names[1][-3:] == 'STR':
199	1	code += '9'
200	1	names[1] = names[1][:-3]
201	1	elif names[1][-2:] in {'SR', 'TN', 'TD'}:
202	1	code += '9'
203	1	names[1] = names[1][:-2]
204	1	elif names[1][-3:] == 'DRS':
205	1	code += '7'
206	1	names[1] = names[1][:-3]
207	1	elif names[1][-2:] in {'TR', 'MN'}:
208	1	code += '7'
209	1	names[1] = names[1][:-2]
210		else:
211	1	code += names[1][-1].translate(self._pf3)
212	1	names[1] = names[1][:-1]
213
214		# 6. The third digit is found using Table PF2 and the first character
215		# of the first name. Remove after coding.
216	1	if names[0]:
217	1	code += names[0][0].translate(self._pf2)
218	1	names[0] = names[0][1:]
219
220		# 7. The fourth digit is found using Table PF2 and the first character
221		# of the name field. If no letters remain use zero. After coding remove
222		# the letter.
223		# 8. The fifth digit is found in the same manner as the fourth using
224		# the remaining characters of the name field if any.
225	1	for _ in range(2):
226	1	if names[1]:
227	1	code += names[1][0].translate(self._pf2)
228	1	names[1] = names[1][1:]
229		else:
230	1	code += '0'
231
232	1	return code
233
234
235	1	def spfc(word):
236		"""Return the Standardized Phonetic Frequency Code (SPFC) of a word.
237
238		This is a wrapper for :py:meth:`SPFC.encode`.
239
240		Args:
241		word (str): The word to transform
242
243		Returns:
244		str: The SPFC value
245
246		Examples:
247		>>> spfc('Christopher Smith')
248		'01160'
249		>>> spfc('Christopher Schmidt')
250		'01160'
251		>>> spfc('Niall Smith')
252		'01660'
253		>>> spfc('Niall Schmidt')
254		'01660'
255
256		>>> spfc('L.Smith')
257		'01960'
258		>>> spfc('R.Miller')
259		'65490'
260
261		>>> spfc(('L', 'Smith'))
262		'01960'
263		>>> spfc(('R', 'Miller'))
264		'65490'
265
266		"""
267	1	return SPFC().encode(word)
268
269
270		if __name__ == '__main__':
271		import doctest
272
273		doctest.testmod()
274

chrislit / abydos

Pull Request — master (#141)

abydos.phonetic._spfc.SPFC.encode() F

Complexity

Size

Duplication

Code Coverage

Importance

How to fix Long Method Complexity

Long Method

Complexity

Duplication Side-by-Side

Filter issues like