abydos.phonetic._pshp_soundex_last.PSHPSoundexLast.__init__() - Code Metrics - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

PSHPSoundexLast.__init__() A
last analyzed 2020-12-31 20:10 UTC

↳ Parent: abydos.phonetic._pshp_soundex_last

Complexity

Conditions

Size

Total Lines	16
Code Lines	3

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	3
CRAP Score	1

Importance

Changes

Metric	Value
eloc	3
dl	0
loc	16
ccs	3
cts	3
cp	1
rs	10
c	0
b	0
f	0
cc	1
nop	3
crap	1

# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._pshp_soundex_last.

PSHP Soundex/Viewex Coding for last names
"""

from unicodedata import normalize as unicode_normalize

from ._phonetic import _Phonetic

# Names exported by ``from <module> import *``: only the encoder class.
__all__ = ['PSHPSoundexLast']


class PSHPSoundexLast(_Phonetic):
    """PSHP Soundex/Viewex Coding of a last name.

    The coding follows :cite:`Hershberg:1976`; the German edition of the
    same work, :cite:`Hershberg:1979`, was consulted as well.

    First names are handled by a separate class,
    :py:class:`PSHPSoundexFirst`.

    .. versionadded:: 0.3.6
    """

    # Letter -> Soundex digit, keyed by code point for use with
    # str.translate.
    _trans = {
        ord(letter): digit
        for letter, digit in zip(
            'ABCDEFGHIJKLMNOPQRSTUVWXYZ', '01230120022455012523010202'
        )
    }

    # Digit -> letter, used by encode_alpha to spell a numeric code out
    # alphabetically.
    _alphabetic = {
        ord(digit): letter for digit, letter in zip('12345', 'PKTLN')
    }

    def __init__(self, max_length: int = 4, german: bool = False) -> None:
        """Initialize PSHPSoundexLast instance.

        Parameters
        ----------
        max_length : int
            Length of the code produced by :py:meth:`encode` (4 by
            default); -1 turns off both zero-padding and truncation
        german : bool
            True when the names being encoded are German, which selects
            the German variants of the rules


        .. versionadded:: 0.4.0

        """
        self._german = german
        self._max_length = max_length

    def encode_alpha(self, lname: str) -> str:
        """Calculate the alphabetic PSHP Soundex/Viewex Coding of a last name.

        The numeric code from :py:meth:`encode` is taken, trailing zeros
        are dropped, and every digit after the first character is replaced
        by a letter.

        Parameters
        ----------
        lname : str
            The last name to encode

        Returns
        -------
        str
            The PSHP alphabetic Soundex/Viewex Coding

        Examples
        --------
        >>> pe = PSHPSoundexLast()
        >>> pe.encode_alpha('Smith')
        'SNT'
        >>> pe.encode_alpha('Waters')
        'WTN'
        >>> pe.encode_alpha('James')
        'JN'
        >>> pe.encode_alpha('Schmidt')
        'SNT'
        >>> pe.encode_alpha('Ashcroft')
        'AKKN'


        .. versionadded:: 0.4.0

        """
        numeric = self.encode(lname).rstrip('0')
        head, tail = numeric[:1], numeric[1:]
        return head + tail.translate(self._alphabetic)

    def encode(self, lname: str) -> str:
        """Calculate the PSHP Soundex/Viewex Coding of a last name.

        The name is uppercased, NFKD-normalized and filtered, then passed
        through prefix, postfix and infix rewriting before the remaining
        letters are mapped to digits.

        Parameters
        ----------
        lname : str
            The last name to encode

        Returns
        -------
        str
            The PSHP Soundex/Viewex Coding

        Examples
        --------
        >>> pe = PSHPSoundexLast()
        >>> pe.encode('Smith')
        'S530'
        >>> pe.encode('Waters')
        'W350'
        >>> pe.encode('James')
        'J500'
        >>> pe.encode('Schmidt')
        'S530'
        >>> pe.encode('Ashcroft')
        'A225'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        german = self._german

        # Keep only the characters found in self._uc_set (defined outside
        # this class; presumably the uppercase letters A-Z).
        decomposed = unicode_normalize('NFKD', lname.upper())
        name = ''.join([ch for ch in decomposed if ch in self._uc_set])

        # A. Prefix treatment
        if name.startswith(('VON', 'VAN')):
            name = name[3:].strip()

        # Both source articles state the rule as "MC, MAC become 1". That
        # is read here as "become M", with the 1 taken to be a superscript
        # footnote marker meaning "except in German data": mapping to the
        # digit 1 (the BPFV class) would not make sense.
        if not german:
            for prefix in ('MAC', 'MC'):
                if name.startswith(prefix):
                    name = 'M' + name[len(prefix) :]
                    break

        # The non-German rule that strips apostrophes needs no code here:
        # the filtering above has already removed them.

        # Leading-letter substitutions; only the first matching rule fires.
        if name.startswith(('E', 'I', 'O', 'U')):
            name = 'A' + name[1:]
        elif name.startswith(('GE', 'GI', 'GY')):
            name = 'J' + name[1:]
        elif name.startswith(('CE', 'CI', 'CY')):
            name = 'S' + name[1:]
        elif name.startswith('CHR') or (
            name.startswith('C') and not name.startswith('CH')
        ):
            # CHR..., and any C not followed by H, start with K.
            name = 'K' + name[1:]

        if name.startswith('KN'):
            name = 'N' + name[1:]
        elif name.startswith('PH'):
            name = 'F' + name[1:]
        elif name.startswith(('WIE', 'WEI')):
            name = 'V' + name[1:]

        if german:
            swapped = {'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}.get(name[:1])
            if swapped is not None:
                name = swapped + name[1:]

        # The first character of the code is fixed at this point; later
        # stages only influence the digits that follow it.
        first_letter = name[:1]

        # B. Postfix treatment
        if german:
            # These German-only rules were moved here from the end of the
            # postfix treatment due to blocking. Each pair is tried in
            # turn, and within a pair at most one suffix is removed.
            for first, second in (('TES', 'TS'), ('TZE', 'ZE'), ('Z', 'TE')):
                if name.endswith(first):
                    name = name[: -len(first)]
                elif name.endswith(second):
                    name = name[: -len(second)]

        if name.endswith('R'):
            name = name[:-1] + 'N'
        elif name.endswith(('SE', 'CE')):
            name = name[:-2]

        if name.endswith('SS'):
            name = name[:-2]
        elif name.endswith('S'):
            name = name[:-1]

        if not german:
            # Five-letter endings take precedence over four-letter ones;
            # only the first match is applied.
            for ending, substitute in (
                ('STOWN', 'SAWON'),
                ('MPSON', 'MASON'),
                ('NSEN', 'ASEN'),
                ('MSON', 'ASON'),
                ('STEN', 'SAEN'),
                ('STON', 'SAON'),
            ):
                if name.endswith(ending):
                    name = name[: -len(ending)] + substitute
                    break

        if name.endswith(('NG', 'ND')):
            name = name[:-1]
        if not german and name.endswith(('GAN', 'GEN')):
            name = name[:-3] + 'A' + name[-2:]

        # C. Infix Treatment
        # Applied strictly in this order, each on the result of the last.
        for cluster, simplified in (
            ('CK', 'C'),
            ('SCH', 'S'),
            ('DT', 'T'),
            ('ND', 'N'),
            ('NG', 'N'),
            ('LM', 'M'),
            ('MN', 'M'),
            ('WIE', 'VIE'),
            ('WEI', 'VEI'),
        ):
            name = name.replace(cluster, simplified)

        # D. Soundexing
        # code for X & Y are unspecified, but presumably are 2 & 0
        digits = self._delete_consecutive_repeats(name.translate(self._trans))

        # The first digit stands for the initial letter and is skipped;
        # zeros are then dropped (rule 1).
        code = (first_letter + digits[1:]).replace('0', '')

        limit = self._max_length
        if limit == -1:
            return code
        if len(code) < limit:
            return code + '0' * (limit - len(code))
        return code[:limit]


if __name__ == '__main__':
    # Run as a script: execute the doctest examples in this module.
    from doctest import testmod

    testmod()


1		# Copyright 2014-2020 by Christopher C. Little.
2		# This file is part of Abydos.
3		#
4		# Abydos is free software: you can redistribute it and/or modify
5		# it under the terms of the GNU General Public License as published by
6		# the Free Software Foundation, either version 3 of the License, or
7		# (at your option) any later version.
8		#
9		# Abydos is distributed in the hope that it will be useful,
10		# but WITHOUT ANY WARRANTY; without even the implied warranty of
11		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12		# GNU General Public License for more details.
13		#
14		# You should have received a copy of the GNU General Public License
15		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17		"""abydos.phonetic._pshp_soundex_last.
18
19	1	PSHP Soundex/Viewex Coding for last names
20		"""
21
22		from unicodedata import normalize as unicode_normalize
23
24	1	from ._phonetic import _Phonetic
25
26		__all__ = ['PSHPSoundexLast']
27
28
29		class PSHPSoundexLast(_Phonetic):
30		"""PSHP Soundex/Viewex Coding of a last name.
31	1
32		This coding is based on :cite:`Hershberg:1976`.
33	1
34		Reference was also made to the German version of the same:
35	1	:cite:`Hershberg:1979`.
36
37	1	A separate function, :py:class:`PSHPSoundexFirst` is used for first names.
38	1
39		.. versionadded:: 0.3.6
40	1	"""
41
42		_trans = dict(
43	1	zip(
44		(ord(_) for _ in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
		0 ignored issues – show Comprehensibility Best Practice introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report The variable `_` does not seem to be defined. Loading history...
45		'01230120022455012523010202',
46		)
47		)
48
49		_alphabetic = dict(zip((ord(_) for _ in '12345'), 'PKTLN'))
50
51		def __init__(self, max_length: int = 4, german: bool = False) -> None:
52		"""Initialize PSHPSoundexLast instance.
53
54		Parameters
55		----------
56	1	max_length : int
57		The length of the code returned (defaults to 4)
58		german : bool
59		Set to True if the name is German (different rules apply)
60
61
62		.. versionadded:: 0.4.0
63	1
64		"""
65	1	self._max_length = max_length
66		self._german = german
67
68		def encode_alpha(self, lname: str) -> str:
69		"""Calculate the alphabetic PSHP Soundex/Viewex Coding of a last name.
70
71		Parameters
72		----------
73		lname : str
74		The last name to encode
75
76		Returns
77		-------
78		str
79	1	The PSHP alphabetic Soundex/Viewex Coding
80	1
81		Examples
82	1	--------
83		>>> pe = PSHPSoundexLast()
84		>>> pe.encode_alpha('Smith')
85		'SNT'
86		>>> pe.encode_alpha('Waters')
87		'WTN'
88		>>> pe.encode_alpha('James')
89		'JN'
90		>>> pe.encode_alpha('Schmidt')
91		'SNT'
92		>>> pe.encode_alpha('Ashcroft')
93		'AKKN'
94
95
96		.. versionadded:: 0.4.0
97
98		"""
99		code = self.encode(lname).rstrip('0')
100		return code[:1] + code[1:].translate(self._alphabetic)
101
102		def encode(self, lname: str) -> str:
103		"""Calculate the PSHP Soundex/Viewex Coding of a last name.
104
105		Parameters
106		----------
107		lname : str
108		The last name to encode
109
110		Returns
111		-------
112		str
113	1	The PSHP Soundex/Viewex Coding
114	1
115		Examples
116	1	--------
117		>>> pe = PSHPSoundexLast()
118		>>> pe.encode('Smith')
119		'S530'
120		>>> pe.encode('Waters')
121		'W350'
122		>>> pe.encode('James')
123		'J500'
124		>>> pe.encode('Schmidt')
125		'S530'
126		>>> pe.encode('Ashcroft')
127		'A225'
128
129
130		.. versionadded:: 0.3.0
131		.. versionchanged:: 0.3.6
132		Encapsulated in class
133
134		"""
135		lname = unicode_normalize('NFKD', lname.upper())
136		lname = ''.join(c for c in lname if c in self._uc_set)
137
138		# A. Prefix treatment
139		if lname[:3] == 'VON' or lname[:3] == 'VAN':
140		lname = lname[3:].strip()
141
142		# The rule implemented below says "MC, MAC become 1". I believe it
143		# meant to say they become M except in German data (where superscripted
144		# 1 indicates "except in German data"). It doesn't make sense for them
145		# to become 1 (BPFV -> 1) or to apply outside German. Unfortunately,
146		# both articles have this error(?).
147		if not self._german:
148		if lname[:3] == 'MAC':
149	1	lname = 'M' + lname[3:]
150	1	elif lname[:2] == 'MC':
151	1	lname = 'M' + lname[2:]
152
153		# The non-German-only rule to strip ' is unnecessary due to filtering
154	1
155	1	if lname[:1] in {'E', 'I', 'O', 'U'}:
156		lname = 'A' + lname[1:]
157		elif lname[:2] in {'GE', 'GI', 'GY'}:
158		lname = 'J' + lname[1:]
159		elif lname[:2] in {'CE', 'CI', 'CY'}:
160		lname = 'S' + lname[1:]
161		elif lname[:3] == 'CHR':
162	1	lname = 'K' + lname[1:]
163	1	elif lname[:1] == 'C' and lname[:2] != 'CH':
164	1	lname = 'K' + lname[1:]
165	1
166	1	if lname[:2] == 'KN':
167		lname = 'N' + lname[1:]
168		elif lname[:2] == 'PH':
169		lname = 'F' + lname[1:]
170	1	elif lname[:3] in {'WIE', 'WEI'}:
171	1	lname = 'V' + lname[1:]
172	1
173	1	if self._german and lname[:1] in {'W', 'M', 'Y', 'Z'}:
174	1	lname = {'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}[lname[0]] + lname[
175	1	1:
176	1	]
177	1
178	1	code = lname[:1]
179	1
180		# B. Postfix treatment
181	1	if self._german: # moved from end of postfix treatment due to blocking
182	1	if lname[-3:] == 'TES':
183	1	lname = lname[:-3]
184	1	elif lname[-2:] == 'TS':
185	1	lname = lname[:-2]
186	1	if lname[-3:] == 'TZE':
187		lname = lname[:-3]
188	1	elif lname[-2:] == 'ZE':
189	1	lname = lname[:-2]
190		if lname[-1:] == 'Z':
191		lname = lname[:-1]
192		elif lname[-2:] == 'TE':
193	1	lname = lname[:-2]
194
195		if lname[-1:] == 'R':
196	1	lname = lname[:-1] + 'N'
197	1	elif lname[-2:] in {'SE', 'CE'}:
198	1	lname = lname[:-2]
199	1	if lname[-2:] == 'SS':
200	1	lname = lname[:-2]
201	1	elif lname[-1:] == 'S':
202	1	lname = lname[:-1]
203	1
204	1	if not self._german:
205	1	l5_repl = {'STOWN': 'SAWON', 'MPSON': 'MASON'}
206	1	l4_repl = {
207	1	'NSEN': 'ASEN',
208	1	'MSON': 'ASON',
209		'STEN': 'SAEN',
210	1	'STON': 'SAON',
211	1	}
212	1	if lname[-5:] in l5_repl:
213	1	lname = lname[:-5] + l5_repl[lname[-5:]]
214	1	elif lname[-4:] in l4_repl:
215	1	lname = lname[:-4] + l4_repl[lname[-4:]]
216	1
217	1	if lname[-2:] in {'NG', 'ND'}:
218		lname = lname[:-1]
219	1	if not self._german and lname[-3:] in {'GAN', 'GEN'}:
220	1	lname = lname[:-3] + 'A' + lname[-2:]
221	1
222		# C. Infix Treatment
223		lname = lname.replace('CK', 'C')
224		lname = lname.replace('SCH', 'S')
225		lname = lname.replace('DT', 'T')
226		lname = lname.replace('ND', 'N')
227	1	lname = lname.replace('NG', 'N')
228	1	lname = lname.replace('LM', 'M')
229	1	lname = lname.replace('MN', 'M')
230	1	lname = lname.replace('WIE', 'VIE')
231		lname = lname.replace('WEI', 'VEI')
232	1
233	1	# D. Soundexing
234	1	# code for X & Y are unspecified, but presumably are 2 & 0
235	1
236		lname = lname.translate(self._trans)
237		lname = self._delete_consecutive_repeats(lname)
238	1
239	1	code += lname[1:]
240	1	code = code.replace('0', '') # rule 1
241	1
242	1	if self._max_length != -1:
243	1	if len(code) < self._max_length:
244	1	code += '0' * (self._max_length - len(code))
245	1	else:
246	1	code = code[: self._max_length]
247
248		return code
249
250
251	1	if __name__ == '__main__':
252	1	import doctest
253
254		doctest.testmod()
255

chrislit / abydos

PSHPSoundexLast.__init__() A last analyzed 2020-12-31 20:10 UTC

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like

PSHPSoundexLast.__init__() A
last analyzed 2020-12-31 20:10 UTC