abydos.phonetic._spfc - Code Metrics - Inspection of "Merge pull request #149 from chrislit/0.3.6" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( f43547...71985b )

by Chris

created 2018-11-17 08:52 UTC

abydos.phonetic._spfc A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	296
Duplicated Lines	0 %

Test Coverage

Coverage

100%

Importance

Changes

Metric	Value
eloc	105
dl	0
loc	296
ccs	75
cts	75
cp	1
rs	10
c	0
b	0
f	0
wmc	22

1 Method

Rating	Name	Duplication	Size	Complexity
F	SPFC.encode()	0	175	21

1 Function

Rating	Name	Duplication	Size	Complexity
A	spfc()	0	38	1

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._spfc.

Standardized Phonetic Frequency Code (SPFC) algorithm
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from unicodedata import normalize as unicode_normalize

from six import text_type
from six.moves import range

from ._phonetic import _Phonetic

__all__ = ['SPFC', 'spfc']


class SPFC(_Phonetic):
    """Standardized Phonetic Frequency Code (SPFC).

    Standardized Phonetic Frequency Code is roughly Soundex-like.
    This implementation is based on pages 19-21 of :cite:`Moore:1977`.
    """

    # Tables PF1, PF2 and PF3, stored as ``{code point: digit}`` mappings so
    # they can be handed directly to ``str.translate``.
    # NOTE(review): 'Y' is absent from PF1 and PF2, and ``str.translate``
    # leaves unmapped characters unchanged, so a 'Y' coded through those
    # tables appears verbatim in the result -- confirm against the paper.
    _pf1 = dict(
        zip(
            (ord(_) for _ in 'SZCKQVFPUWABLORDHIEMNXGJT'),
            '0011112222334445556666777',
        )
    )
    _pf2 = dict(
        zip(
            (ord(_) for _ in 'SZCKQFPXABORDHIMNGJTUVWEL'),
            '0011122233445556677788899',
        )
    )
    _pf3 = dict(
        zip(
            (ord(_) for _ in 'BCKQVDTFLPGJXMNRSZAEHIOUWY'),
            '00000112223334456677777777',
        )
    )

    # Step 1 digraph simplifications, applied in this order.
    _substitutions = (
        ('DK', 'K'),
        ('DT', 'T'),
        ('SC', 'S'),
        ('KN', 'N'),
        ('MN', 'N'),
    )

    # Step 5 multi-letter endings and the digit each one yields. The tuple is
    # scanned front to back and the first match wins, so the order must stay
    # as is (e.g. 'STN' has to be tried before 'TN', 'STR' before 'TR').
    _pf3_suffixes = (
        ('STN', '8'),
        ('PRS', '8'),
        ('SN', '8'),
        ('STR', '9'),
        ('SR', '9'),
        ('TN', '9'),
        ('TD', '9'),
        ('DRS', '7'),
        ('TR', '7'),
        ('MN', '7'),
    )

    # Letters removed in step 3 (everywhere except the first position).
    _dropped_letters = frozenset('AEHIOUWY')

    _word_error_msg = (
        'Word attribute must be a string with a space or period '
        + 'dividing the first and last names or a tuple/list '
        + 'consisting of the first and last names'
    )

    def _split_names(self, word):
        """Split the input into a ``[first name, last name]`` pair.

        Parameters
        ----------
        word : str or iterable
            Either a string holding both names separated by a period or a
            space (the first period takes precedence over the first space),
            or an iterable yielding exactly two names

        Returns
        -------
        list
            The two names, first name first

        Raises
        ------
        AttributeError
            Word attribute must be a string with a space or period dividing
            the first and last names or a tuple/list consisting of the
            first and last names

        """
        if isinstance(word, (str, text_type)):
            for separator in ('.', ' '):
                names = word.split(separator, 1)
                if len(names) == 2:
                    return names
        elif hasattr(word, '__iter__'):
            # Materialise first so that iterables without __len__ (such as
            # generators) hit the documented AttributeError below rather
            # than a TypeError from len().
            names = list(word)
            if len(names) == 2:
                return names
        raise AttributeError(self._word_error_msg)

    def _steps_one_to_three(self, name):
        """Perform the first three steps of SPFC.

        Parameters
        ----------
        name : str
            Name to transform (already uppercased and NFKD-normalized)

        Returns
        -------
        str
            Transformed name

        """
        # Keep only the characters in the inherited ``_uc_set`` (per the
        # original implementation's comment: filter out non A-Z).
        name = ''.join(_ for _ in name if _ in self._uc_set)

        # 1. In the field, convert DK to K, DT to T, SC to S, KN to N,
        # and MN to N
        for old, new in self._substitutions:
            name = name.replace(old, new)

        # 2. In the name field, replace multiple letters with a single
        # letter
        name = self._delete_consecutive_repeats(name)

        # 3. Remove vowels, W, H, and Y, but keep the first letter in the
        # name field.
        if name:
            name = name[0] + ''.join(
                _ for _ in name[1:] if _ not in self._dropped_letters
            )
        return name

    def _prepare_name(self, name):
        """Normalize one raw name and run SPFC steps 1-3 on it.

        Parameters
        ----------
        name : str
            A raw first or last name

        Returns
        -------
        str
            The uppercased, NFKD-normalized and reduced name

        """
        name = unicode_normalize(
            'NFKD', text_type(name.strip().replace('ß', 'SS').upper())
        )
        return self._steps_one_to_three(name)

    def _code_last_letters(self, name):
        """Perform step 5 of SPFC: code the ending of the last name.

        Parameters
        ----------
        name : str
            The non-empty remainder of the last name after step 4

        Returns
        -------
        tuple
            ``(digit, remainder)``: the code digit for the longest listed
            ending (or, failing that, the PF3 value of the final letter)
            and the name with the coded letters removed

        """
        for suffix, digit in self._pf3_suffixes:
            if name.endswith(suffix):
                return digit, name[: -len(suffix)]
        return name[-1].translate(self._pf3), name[:-1]

    def encode(self, word):
        """Return the Standardized Phonetic Frequency Code (SPFC) of a word.

        Parameters
        ----------
        word : str
            The word to transform: a string with a space or period dividing
            the first and last names, or a tuple/list consisting of the
            first and last names

        Returns
        -------
        str
            The SPFC value (an empty string if `word` is empty)

        Raises
        ------
        AttributeError
            Word attribute must be a string with a space or period dividing the
            first and last names or a tuple/list consisting of the first and
            last names

        Examples
        --------
        >>> pe = SPFC()
        >>> pe.encode('Christopher Smith')
        '01160'
        >>> pe.encode('Christopher Schmidt')
        '01160'
        >>> pe.encode('Niall Smith')
        '01660'
        >>> pe.encode('Niall Schmidt')
        '01660'

        >>> pe.encode('L.Smith')
        '01960'
        >>> pe.encode('R.Miller')
        '65490'

        >>> pe.encode(('L', 'Smith'))
        '01960'
        >>> pe.encode(('R', 'Miller'))
        '65490'

        """
        if not word:
            return ''

        first, last = [self._prepare_name(_) for _ in self._split_names(word)]
        code = ''

        # 4. The first digit of the code is obtained using PF1 and the first
        # letter of the name field. Remove this letter after coding.
        if last:
            code += last[0].translate(self._pf1)
            last = last[1:]

        # 5. Using the last letters of the name, use Table PF3 to obtain the
        # second digit of the code. Use as many letters as possible and remove
        # after coding.
        if last:
            digit, last = self._code_last_letters(last)
            code += digit

        # 6. The third digit is found using Table PF2 and the first character
        # of the first name. Remove after coding.
        if first:
            code += first[0].translate(self._pf2)

        # 7. The fourth digit is found using Table PF2 and the first character
        # of the name field. If no letters remain use zero. After coding remove
        # the letter.
        # 8. The fifth digit is found in the same manner as the fourth using
        # the remaining characters of the name field if any.
        for _ in range(2):
            if last:
                code += last[0].translate(self._pf2)
                last = last[1:]
            else:
                code += '0'

        return code


def spfc(word):
    """Compute the Standardized Phonetic Frequency Code (SPFC) of a name.

    Convenience function that builds a fresh :py:class:`SPFC` object and
    delegates to :py:meth:`SPFC.encode`.

    Parameters
    ----------
    word : str
        The name to encode, in any form accepted by :py:meth:`SPFC.encode`

    Returns
    -------
    str
        The SPFC value produced by :py:meth:`SPFC.encode`

    Examples
    --------
    >>> spfc('Christopher Smith')
    '01160'
    >>> spfc('Christopher Schmidt')
    '01160'
    >>> spfc('Niall Smith')
    '01660'
    >>> spfc('Niall Schmidt')
    '01660'

    >>> spfc('L.Smith')
    '01960'
    >>> spfc('R.Miller')
    '65490'

    >>> spfc(('L', 'Smith'))
    '01960'
    >>> spfc(('R', 'Miller'))
    '65490'

    """
    encoder = SPFC()
    return encoder.encode(word)


if __name__ == '__main__':
    # When executed as a script, run the docstring examples of this module.
    from doctest import testmod

    testmod()


1		# -*- coding: utf-8 -*-
2
3		# Copyright 2014-2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.phonetic._spfc.
20
21		Standardized Phonetic Frequency Code (SPFC) algorithm
22		"""
23
24	1	from __future__ import (
25		absolute_import,
26		division,
27		print_function,
28		unicode_literals,
29		)
30
31	1	from unicodedata import normalize as unicode_normalize
32
33	1	from six import text_type
34	1	from six.moves import range
35
36	1	from ._phonetic import _Phonetic
37
38	1	__all__ = ['SPFC', 'spfc']
39
40
41	1	class SPFC(_Phonetic):
		0 ignored issues – show Unused Code introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report The variable `__class__` seems to be unused. Loading history...
42		"""Standardized Phonetic Frequency Code (SPFC).
43
44		Standardized Phonetic Frequency Code is roughly Soundex-like.
45		This implementation is based on page 19-21 of :cite:`Moore:1977`.
46		"""
47
48	1	_pf1 = dict(
49		zip(
50		(ord(_) for _ in 'SZCKQVFPUWABLORDHIEMNXGJT'),
		0 ignored issues – show Comprehensibility Best Practice introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report The variable `_` does not seem to be defined. Loading history...
51		'0011112222334445556666777',
52		)
53		)
54	1	_pf2 = dict(
55		zip(
56		(ord(_) for _ in 'SZCKQFPXABORDHIMNGJTUVWEL'),
57		'0011122233445556677788899',
58		)
59		)
60	1	_pf3 = dict(
61		zip(
62		(ord(_) for _ in 'BCKQVDTFLPGJXMNRSZAEHIOUWY'),
63		'00000112223334456677777777',
64		)
65		)
66
67	1	_substitutions = (
68		('DK', 'K'),
69		('DT', 'T'),
70		('SC', 'S'),
71		('KN', 'N'),
72		('MN', 'N'),
73		)
74
75	1	def encode(self, word):
76		"""Return the Standardized Phonetic Frequency Code (SPFC) of a word.
77
78		Parameters
79		----------
80		word : str
81		The word to transform
82
83		Returns
84		-------
85		str
86		The SPFC value
87
88		Raises
89		------
90		AttributeError
91		Word attribute must be a string with a space or period dividing the
92		first and last names or a tuple/list consisting of the first and
93		last names
94
95		Examples
96		--------
97		>>> pe = SPFC()
98		>>> pe.encode('Christopher Smith')
99		'01160'
100		>>> pe.encode('Christopher Schmidt')
101		'01160'
102		>>> pe.encode('Niall Smith')
103		'01660'
104		>>> pe.encode('Niall Schmidt')
105		'01660'
106
107		>>> pe.encode('L.Smith')
108		'01960'
109		>>> pe.encode('R.Miller')
110		'65490'
111
112		>>> pe.encode(('L', 'Smith'))
113		'01960'
114		>>> pe.encode(('R', 'Miller'))
115		'65490'
116
117		"""
118
119	1	def _raise_word_ex():
120		"""Raise an AttributeError.
121
122		Raises
123		------
124		AttributeError
125		Word attribute must be a string with a space or period dividing
126		the first and last names or a tuple/list consisting of the
127		first and last names
128
129		"""
130	1	raise AttributeError(
131		'Word attribute must be a string with a space or period '
132		+ 'dividing the first and last names or a tuple/list '
133		+ 'consisting of the first and last names'
134		)
135
136	1	if not word:
137	1	return ''
138
139	1	names = []
140	1	if isinstance(word, (str, text_type)):
141	1	names = word.split('.', 1)
142	1	if len(names) != 2:
143	1	names = word.split(' ', 1)
144	1	if len(names) != 2:
145	1	_raise_word_ex()
146	1	elif hasattr(word, '__iter__'):
147	1	if len(word) != 2:
148	1	_raise_word_ex()
149	1	names = word
150		else:
151	1	_raise_word_ex()
152
153	1	names = [
154		unicode_normalize(
155		'NFKD', text_type(_.strip().replace('ß', 'SS').upper())
156		)
157		for _ in names
158		]
159	1	code = ''
160
161	1	def _steps_one_to_three(name):
162		"""Perform the first three steps of SPFC.
163
164		Parameters
165		----------
166		name : str
167		Name to transform
168
169		Returns
170		-------
171		str
172		Transformed name
173
174		"""
175		# filter out non A-Z
176	1	name = ''.join(_ for _ in name if _ in self._uc_set)
177
178		# 1. In the field, convert DK to K, DT to T, SC to S, KN to N,
179		# and MN to N
180	1	for subst in self._substitutions:
181	1	name = name.replace(subst[0], subst[1])
182
183		# 2. In the name field, replace multiple letters with a single
184		# letter
185	1	name = self._delete_consecutive_repeats(name)
186
187		# 3. Remove vowels, W, H, and Y, but keep the first letter in the
188		# name field.
189	1	if name:
190	1	name = name[0] + ''.join(
191		_
192		for _ in name[1:]
193		if _ not in {'A', 'E', 'H', 'I', 'O', 'U', 'W', 'Y'}
194		)
195	1	return name
196
197	1	names = [_steps_one_to_three(_) for _ in names]
198
199		# 4. The first digit of the code is obtained using PF1 and the first
200		# letter of the name field. Remove this letter after coding.
201	1	if names[1]:
202	1	code += names[1][0].translate(self._pf1)
203	1	names[1] = names[1][1:]
204
205		# 5. Using the last letters of the name, use Table PF3 to obtain the
206		# second digit of the code. Use as many letters as possible and remove
207		# after coding.
208	1	if names[1]:
209	1	if names[1][-3:] == 'STN' or names[1][-3:] == 'PRS':
210	1	code += '8'
211	1	names[1] = names[1][:-3]
212	1	elif names[1][-2:] == 'SN':
213	1	code += '8'
214	1	names[1] = names[1][:-2]
215	1	elif names[1][-3:] == 'STR':
216	1	code += '9'
217	1	names[1] = names[1][:-3]
218	1	elif names[1][-2:] in {'SR', 'TN', 'TD'}:
219	1	code += '9'
220	1	names[1] = names[1][:-2]
221	1	elif names[1][-3:] == 'DRS':
222	1	code += '7'
223	1	names[1] = names[1][:-3]
224	1	elif names[1][-2:] in {'TR', 'MN'}:
225	1	code += '7'
226	1	names[1] = names[1][:-2]
227		else:
228	1	code += names[1][-1].translate(self._pf3)
229	1	names[1] = names[1][:-1]
230
231		# 6. The third digit is found using Table PF2 and the first character
232		# of the first name. Remove after coding.
233	1	if names[0]:
234	1	code += names[0][0].translate(self._pf2)
235	1	names[0] = names[0][1:]
236
237		# 7. The fourth digit is found using Table PF2 and the first character
238		# of the name field. If no letters remain use zero. After coding remove
239		# the letter.
240		# 8. The fifth digit is found in the same manner as the fourth using
241		# the remaining characters of the name field if any.
242	1	for _ in range(2):
243	1	if names[1]:
244	1	code += names[1][0].translate(self._pf2)
245	1	names[1] = names[1][1:]
246		else:
247	1	code += '0'
248
249	1	return code
250
251
252	1	def spfc(word):
253		"""Return the Standardized Phonetic Frequency Code (SPFC) of a word.
254
255		This is a wrapper for :py:meth:`SPFC.encode`.
256
257		Parameters
258		----------
259		word : str
260		The word to transform
261
262		Returns
263		-------
264		str
265		The SPFC value
266
267		Examples
268		--------
269		>>> spfc('Christopher Smith')
270		'01160'
271		>>> spfc('Christopher Schmidt')
272		'01160'
273		>>> spfc('Niall Smith')
274		'01660'
275		>>> spfc('Niall Schmidt')
276		'01660'
277
278		>>> spfc('L.Smith')
279		'01960'
280		>>> spfc('R.Miller')
281		'65490'
282
283		>>> spfc(('L', 'Smith'))
284		'01960'
285		>>> spfc(('R', 'Miller'))
286		'65490'
287
288		"""
289	1	return SPFC().encode(word)
290
291
292		if __name__ == '__main__':
293		import doctest
294
295		doctest.testmod()
296

chrislit / abydos

Push — master ( f43547...71985b )

abydos.phonetic._spfc A

Complexity

Size/Duplication

Test Coverage

Importance

1 Method

1 Function

Duplication Side-by-Side

Filter issues like