abydos.phonetic._pshp_soundex_last.PSHPSoundexLast.encode() - Code Metrics - Inspection of "Merge pull request #149 from chrislit/0.3.6" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( f43547...71985b )

by Chris

created 2018-11-17 08:52 UTC

PSHPSoundexLast.encode() F

↳ Parent: abydos.phonetic._pshp_soundex_last

Complexity

Conditions

Size

Total Lines	147
Code Lines	85

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	80
CRAP Score	36

Importance

Changes

Metric	Value
cc	36
eloc	85
nop	4
dl	0
loc	147
ccs	80
cts	80
cp	1
crap	36
rs	0
c	0
b	0
f	0

How to fix Long Method Complexity

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._pshp_soundex_last.

PSHP Soundex/Viewex Coding for last names
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from unicodedata import normalize as unicode_normalize

from six import text_type

from ._phonetic import _Phonetic

__all__ = ['PSHPSoundexLast', 'pshp_soundex_last']


class PSHPSoundexLast(_Phonetic):
    """PSHP Soundex/Viewex Coding of a last name.

    This coding is based on :cite:`Hershberg:1976`.

    Reference was also made to the German version of the same:
    :cite:`Hershberg:1979`.

    A separate class, :py:class:`PSHPSoundexFirst` is used for first names.
    """

    # Soundex digit for each uppercase letter, keyed by code point so the
    # table can be handed directly to ``translate``.
    _trans = dict(
        zip(
            (ord(letter) for letter in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
            '01230120022455012523010202',
        )
    )

    # German-only substitutions for the initial letter of the name.
    _german_initial_map = {'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}

    # Non-German suffix rewrites; the 5-letter table is consulted first.
    _suffix5_map = {'STOWN': 'SAWON', 'MPSON': 'MASON'}
    _suffix4_map = {
        'NSEN': 'ASEN',
        'MSON': 'ASON',
        'STEN': 'SAEN',
        'STON': 'SAON',
    }

    # Infix rewrites, applied to the whole name in exactly this order.
    _infix_map = (
        ('CK', 'C'),
        ('SCH', 'S'),
        ('DT', 'T'),
        ('ND', 'N'),
        ('NG', 'N'),
        ('LM', 'M'),
        ('MN', 'M'),
        ('WIE', 'VIE'),
        ('WEI', 'VEI'),
    )

    def _pshp_prefix(self, lname, german):
        """Apply the prefix rules (part A) to a name.

        Parameters
        ----------
        lname : str
            The uppercased, filtered last name
        german : bool
            True to apply the German variant of the rules

        Returns
        -------
        str
            The name with its prefix rewritten

        """
        if lname[:3] in {'VON', 'VAN'}:
            lname = lname[3:].strip()

        # The rule implemented below says "MC, MAC become 1". I believe it
        # meant to say they become M except in German data (where superscripted
        # 1 indicates "except in German data"). It doesn't make sense for them
        # to become 1 (BPFV -> 1) or to apply outside German. Unfortunately,
        # both articles have this error(?).
        if not german:
            if lname[:3] == 'MAC':
                lname = 'M' + lname[3:]
            elif lname[:2] == 'MC':
                lname = 'M' + lname[2:]

        # The non-German-only rule to strip ' is unnecessary due to filtering

        if lname[:1] in {'E', 'I', 'O', 'U'}:
            lname = 'A' + lname[1:]
        elif lname[:2] in {'GE', 'GI', 'GY'}:
            lname = 'J' + lname[1:]
        elif lname[:2] in {'CE', 'CI', 'CY'}:
            lname = 'S' + lname[1:]
        elif lname[:3] == 'CHR':
            lname = 'K' + lname[1:]
        elif lname[:1] == 'C' and lname[:2] != 'CH':
            lname = 'K' + lname[1:]

        if lname[:2] == 'KN':
            lname = 'N' + lname[1:]
        elif lname[:2] == 'PH':
            lname = 'F' + lname[1:]
        elif lname[:3] in {'WIE', 'WEI'}:
            lname = 'V' + lname[1:]

        if german and lname[:1] in self._german_initial_map:
            lname = self._german_initial_map[lname[0]] + lname[1:]

        return lname

    def _pshp_postfix(self, lname, german):
        """Apply the postfix rules (part B) to a name.

        Parameters
        ----------
        lname : str
            The name as returned by the prefix treatment
        german : bool
            True to apply the German variant of the rules

        Returns
        -------
        str
            The name with its ending rewritten

        """
        if german:  # moved from end of postfix treatment due to blocking
            if lname[-3:] == 'TES':
                lname = lname[:-3]
            elif lname[-2:] == 'TS':
                lname = lname[:-2]
            if lname[-3:] == 'TZE':
                lname = lname[:-3]
            elif lname[-2:] == 'ZE':
                lname = lname[:-2]
            if lname[-1:] == 'Z':
                lname = lname[:-1]
            elif lname[-2:] == 'TE':
                lname = lname[:-2]

        if lname[-1:] == 'R':
            lname = lname[:-1] + 'N'
        elif lname[-2:] in {'SE', 'CE'}:
            lname = lname[:-2]
        if lname[-2:] == 'SS':
            lname = lname[:-2]
        elif lname[-1:] == 'S':
            lname = lname[:-1]

        if not german:
            if lname[-5:] in self._suffix5_map:
                lname = lname[:-5] + self._suffix5_map[lname[-5:]]
            elif lname[-4:] in self._suffix4_map:
                lname = lname[:-4] + self._suffix4_map[lname[-4:]]

        if lname[-2:] in {'NG', 'ND'}:
            lname = lname[:-1]
        if not german and lname[-3:] in {'GAN', 'GEN'}:
            lname = lname[:-3] + 'A' + lname[-2:]

        return lname

    def _pshp_infix(self, lname):
        """Apply the infix rules (part C) to a name.

        Parameters
        ----------
        lname : str
            The name as returned by the postfix treatment

        Returns
        -------
        str
            The name after every infix replacement has been applied in order

        """
        for src, tar in self._infix_map:
            lname = lname.replace(src, tar)
        return lname

    def encode(self, lname, max_length=4, german=False):
        """Calculate the PSHP Soundex/Viewex Coding of a last name.

        The name is uppercased, NFKD-normalized, has 'ß' expanded to 'SS'
        and is filtered down to the characters in ``self._uc_set`` before
        the prefix, postfix, infix and soundexing steps are applied.

        Parameters
        ----------
        lname : str
            The last name to encode
        max_length : int
            The length of the code returned (defaults to 4); the code is
            zero-padded or truncated to this length, and -1 disables both
        german : bool
            Set to True if the name is German (different rules apply)

        Returns
        -------
        str
            The PSHP Soundex/Viewex Coding

        Examples
        --------
        >>> pe = PSHPSoundexLast()
        >>> pe.encode('Smith')
        'S530'
        >>> pe.encode('Waters')
        'W350'
        >>> pe.encode('James')
        'J500'
        >>> pe.encode('Schmidt')
        'S530'
        >>> pe.encode('Ashcroft')
        'A225'

        """
        lname = unicode_normalize('NFKD', text_type(lname.upper()))
        lname = lname.replace('ß', 'SS')
        lname = ''.join(c for c in lname if c in self._uc_set)

        # A. Prefix treatment
        lname = self._pshp_prefix(lname, german)

        # The leading letter of the code is fixed after the prefix rules and
        # before any postfix/infix rewriting.
        code = lname[:1]

        # B. Postfix treatment
        lname = self._pshp_postfix(lname, german)

        # C. Infix Treatment
        lname = self._pshp_infix(lname)

        # D. Soundexing
        # code for X & Y are unspecified, but presumably are 2 & 0
        lname = lname.translate(self._trans)
        lname = self._delete_consecutive_repeats(lname)

        # The first translated position is superseded by the retained letter.
        code += lname[1:]
        code = code.replace('0', '')  # rule 1

        if max_length != -1:
            if len(code) < max_length:
                code += '0' * (max_length - len(code))
            else:
                code = code[:max_length]

        return code


def pshp_soundex_last(lname, max_length=4, german=False):
    """Return the PSHP Soundex/Viewex code for a last name.

    Convenience function that builds a :py:class:`PSHPSoundexLast` and
    delegates to :py:meth:`PSHPSoundexLast.encode`.

    Parameters
    ----------
    lname : str
        The last name to encode
    max_length : int
        The length of the code returned (defaults to 4)
    german : bool
        Set to True if the name is German (different rules apply)

    Returns
    -------
    str
        The PSHP Soundex/Viewex Coding

    Examples
    --------
    >>> pshp_soundex_last('Smith')
    'S530'
    >>> pshp_soundex_last('Waters')
    'W350'
    >>> pshp_soundex_last('James')
    'J500'
    >>> pshp_soundex_last('Schmidt')
    'S530'
    >>> pshp_soundex_last('Ashcroft')
    'A225'

    """
    encoder = PSHPSoundexLast()
    return encoder.encode(lname, max_length=max_length, german=german)


if __name__ == '__main__':
    # Executed as a script: run the doctests embedded in this module.
    from doctest import testmod

    testmod()


1		# -*- coding: utf-8 -*-
2
3		# Copyright 2014-2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.phonetic._pshp_soundex_last.
20
21		PSHP Soundex/Viewex Coding for last names
22		"""
23
24	1	from __future__ import (
25		absolute_import,
26		division,
27		print_function,
28		unicode_literals,
29		)
30
31	1	from unicodedata import normalize as unicode_normalize
32
33	1	from six import text_type
34
35	1	from ._phonetic import _Phonetic
36
37	1	__all__ = ['PSHPSoundexLast', 'pshp_soundex_last']
38
39
40	1	class PSHPSoundexLast(_Phonetic):
		0 ignored issues – show Unused Code introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report The variable `__class__` seems to be unused. Loading history...
41		"""PSHP Soundex/Viewex Coding of a last name.
42
43		This coding is based on :cite:`Hershberg:1976`.
44
45		Reference was also made to the German version of the same:
46		:cite:`Hershberg:1979`.
47
48		A separate function, :py:class:`PSHPSoundexFirst` is used for first names.
49		"""
50
51	1	_trans = dict(
52		zip(
53		(ord(_) for _ in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
		0 ignored issues – show Comprehensibility Best Practice introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report The variable `_` does not seem to be defined. Loading history...
54		'01230120022455012523010202',
55		)
56		)
57
58	1	def encode(self, lname, max_length=4, german=False):
		0 ignored issues – show Bug introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Parameters differ from overridden 'encode' method Loading history...
59		"""Calculate the PSHP Soundex/Viewex Coding of a last name.
60
61		Parameters
62		----------
63		lname : str
64		The last name to encode
65		max_length : int
66		The length of the code returned (defaults to 4)
67		german : bool
68		Set to True if the name is German (different rules apply)
69
70		Returns
71		-------
72		str
73		The PSHP Soundex/Viewex Coding
74
75		Examples
76		--------
77		>>> pe = PSHPSoundexLast()
78		>>> pe.encode('Smith')
79		'S530'
80		>>> pe.encode('Waters')
81		'W350'
82		>>> pe.encode('James')
83		'J500'
84		>>> pe.encode('Schmidt')
85		'S530'
86		>>> pe.encode('Ashcroft')
87		'A225'
88
89		"""
90	1	lname = unicode_normalize('NFKD', text_type(lname.upper()))
91	1	lname = lname.replace('ß', 'SS')
92	1	lname = ''.join(c for c in lname if c in self._uc_set)
93
94		# A. Prefix treatment
95	1	if lname[:3] == 'VON' or lname[:3] == 'VAN':
96	1	lname = lname[3:].strip()
97
98		# The rule implemented below says "MC, MAC become 1". I believe it
99		# meant to say they become M except in German data (where superscripted
100		# 1 indicates "except in German data"). It doesn't make sense for them
101		# to become 1 (BPFV -> 1) or to apply outside German. Unfortunately,
102		# both articles have this error(?).
103	1	if not german:
104	1	if lname[:3] == 'MAC':
105	1	lname = 'M' + lname[3:]
106	1	elif lname[:2] == 'MC':
107	1	lname = 'M' + lname[2:]
108
109		# The non-German-only rule to strip ' is unnecessary due to filtering
110
111	1	if lname[:1] in {'E', 'I', 'O', 'U'}:
112	1	lname = 'A' + lname[1:]
113	1	elif lname[:2] in {'GE', 'GI', 'GY'}:
114	1	lname = 'J' + lname[1:]
115	1	elif lname[:2] in {'CE', 'CI', 'CY'}:
116	1	lname = 'S' + lname[1:]
117	1	elif lname[:3] == 'CHR':
118	1	lname = 'K' + lname[1:]
119	1	elif lname[:1] == 'C' and lname[:2] != 'CH':
120	1	lname = 'K' + lname[1:]
121
122	1	if lname[:2] == 'KN':
123	1	lname = 'N' + lname[1:]
124	1	elif lname[:2] == 'PH':
125	1	lname = 'F' + lname[1:]
126	1	elif lname[:3] in {'WIE', 'WEI'}:
127	1	lname = 'V' + lname[1:]
128
129	1	if german and lname[:1] in {'W', 'M', 'Y', 'Z'}:
130	1	lname = {'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}[lname[0]] + lname[
131		1:
132		]
133
134	1	code = lname[:1]
135
136		# B. Postfix treatment
137	1	if german: # moved from end of postfix treatment due to blocking
138	1	if lname[-3:] == 'TES':
139	1	lname = lname[:-3]
140	1	elif lname[-2:] == 'TS':
141	1	lname = lname[:-2]
142	1	if lname[-3:] == 'TZE':
143	1	lname = lname[:-3]
144	1	elif lname[-2:] == 'ZE':
145	1	lname = lname[:-2]
146	1	if lname[-1:] == 'Z':
147	1	lname = lname[:-1]
148	1	elif lname[-2:] == 'TE':
149	1	lname = lname[:-2]
150
151	1	if lname[-1:] == 'R':
152	1	lname = lname[:-1] + 'N'
153	1	elif lname[-2:] in {'SE', 'CE'}:
154	1	lname = lname[:-2]
155	1	if lname[-2:] == 'SS':
156	1	lname = lname[:-2]
157	1	elif lname[-1:] == 'S':
158	1	lname = lname[:-1]
159
160	1	if not german:
161	1	l5_repl = {'STOWN': 'SAWON', 'MPSON': 'MASON'}
162	1	l4_repl = {
163		'NSEN': 'ASEN',
164		'MSON': 'ASON',
165		'STEN': 'SAEN',
166		'STON': 'SAON',
167		}
168	1	if lname[-5:] in l5_repl:
169	1	lname = lname[:-5] + l5_repl[lname[-5:]]
170	1	elif lname[-4:] in l4_repl:
171	1	lname = lname[:-4] + l4_repl[lname[-4:]]
172
173	1	if lname[-2:] in {'NG', 'ND'}:
174	1	lname = lname[:-1]
175	1	if not german and lname[-3:] in {'GAN', 'GEN'}:
176	1	lname = lname[:-3] + 'A' + lname[-2:]
177
178		# C. Infix Treatment
179	1	lname = lname.replace('CK', 'C')
180	1	lname = lname.replace('SCH', 'S')
181	1	lname = lname.replace('DT', 'T')
182	1	lname = lname.replace('ND', 'N')
183	1	lname = lname.replace('NG', 'N')
184	1	lname = lname.replace('LM', 'M')
185	1	lname = lname.replace('MN', 'M')
186	1	lname = lname.replace('WIE', 'VIE')
187	1	lname = lname.replace('WEI', 'VEI')
188
189		# D. Soundexing
190		# code for X & Y are unspecified, but presumably are 2 & 0
191
192	1	lname = lname.translate(self._trans)
193	1	lname = self._delete_consecutive_repeats(lname)
194
195	1	code += lname[1:]
196	1	code = code.replace('0', '') # rule 1
197
198	1	if max_length != -1:
199	1	if len(code) < max_length:
200	1	code += '0' * (max_length - len(code))
201		else:
202	1	code = code[:max_length]
203
204	1	return code
205
206
207	1	def pshp_soundex_last(lname, max_length=4, german=False):
208		"""Calculate the PSHP Soundex/Viewex Coding of a last name.
209
210		This is a wrapper for :py:meth:`PSHPSoundexLast.encode`.
211
212		Parameters
213		----------
214		lname : str
215		The last name to encode
216		max_length : int
217		The length of the code returned (defaults to 4)
218		german : bool
219		Set to True if the name is German (different rules apply)
220
221		Returns
222		-------
223		str
224		The PSHP Soundex/Viewex Coding
225
226		Examples
227		--------
228		>>> pshp_soundex_last('Smith')
229		'S530'
230		>>> pshp_soundex_last('Waters')
231		'W350'
232		>>> pshp_soundex_last('James')
233		'J500'
234		>>> pshp_soundex_last('Schmidt')
235		'S530'
236		>>> pshp_soundex_last('Ashcroft')
237		'A225'
238
239		"""
240	1	return PSHPSoundexLast().encode(lname, max_length, german)
241
242
243		if __name__ == '__main__':
244		import doctest
245
246		doctest.testmod()
247

chrislit / abydos

Push — master ( f43547...71985b )

PSHPSoundexLast.encode() F

Complexity

Size

Duplication

Code Coverage

Importance

How to fix Long Method Complexity

Long Method

Complexity

Duplication Side-by-Side

Filter issues like