PSHPSoundexLast.__init__()   A
last analyzed

Complexity

Conditions 1

Size

Total Lines 16
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 3
CRAP Score 1

Importance

Changes 0
Metric Value
eloc 3
dl 0
loc 16
ccs 3
cts 3
cp 1
rs 10
c 0
b 0
f 0
cc 1
nop 3
crap 1
1
# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._pshp_soundex_last.

PSHP Soundex/Viewex Coding for last names
"""
21
22
from unicodedata import normalize as unicode_normalize
23
24 1
from ._phonetic import _Phonetic
25
26
# Names exported by ``from <this module> import *``.
__all__ = ['PSHPSoundexLast']
27
28
29
class PSHPSoundexLast(_Phonetic):
    """PSHP Soundex/Viewex Coding of a last name.

    This coding is based on :cite:`Hershberg:1976`.

    Reference was also made to the German version of the same:
    :cite:`Hershberg:1979`.

    A separate class, :py:class:`PSHPSoundexFirst`, is used for first names.

    .. versionadded:: 0.3.6
    """

    # Letter -> Soundex digit table, keyed by code point for str.translate.
    _trans = str.maketrans(
        'ABCDEFGHIJKLMNOPQRSTUVWXYZ', '01230120022455012523010202'
    )

    # Digit -> representative letter table used by encode_alpha.
    _alphabetic = str.maketrans('12345', 'PKTLN')

    def __init__(self, max_length: int = 4, german: bool = False) -> None:
        """Initialize PSHPSoundexLast instance.

        Parameters
        ----------
        max_length : int
            The length of the code returned (defaults to 4). Codes are
            zero-padded or truncated to this length; a value of -1 disables
            both padding and truncation.
        german : bool
            Set to True if the name is German (different rules apply)


        .. versionadded:: 0.4.0

        """
        self._max_length = max_length
        self._german = german

    def encode_alpha(self, lname: str) -> str:
        """Calculate the alphabetic PSHP Soundex/Viewex Coding of a last name.

        The numeric code from :py:meth:`encode` is stripped of trailing
        zeros, then every digit after the leading character is replaced by
        its letter in ``_alphabetic``.

        Parameters
        ----------
        lname : str
            The last name to encode

        Returns
        -------
        str
            The PSHP alphabetic Soundex/Viewex Coding

        Examples
        --------
        >>> pe = PSHPSoundexLast()
        >>> pe.encode_alpha('Smith')
        'SNT'
        >>> pe.encode_alpha('Waters')
        'WTN'
        >>> pe.encode_alpha('James')
        'JN'
        >>> pe.encode_alpha('Schmidt')
        'SNT'
        >>> pe.encode_alpha('Ashcroft')
        'AKKN'


        .. versionadded:: 0.4.0

        """
        code = self.encode(lname).rstrip('0')
        # The first character is already a letter; only the digits after it
        # are mapped to letters.
        return code[:1] + code[1:].translate(self._alphabetic)

    def encode(self, lname: str) -> str:
        """Calculate the PSHP Soundex/Viewex Coding of a last name.

        The name is upper-cased, NFKD-normalized and filtered to the
        characters in ``self._uc_set``. It then passes through prefix,
        postfix and infix rewriting before the Soundex digit translation.

        Parameters
        ----------
        lname : str
            The last name to encode

        Returns
        -------
        str
            The PSHP Soundex/Viewex Coding

        Examples
        --------
        >>> pe = PSHPSoundexLast()
        >>> pe.encode('Smith')
        'S530'
        >>> pe.encode('Waters')
        'W350'
        >>> pe.encode('James')
        'J500'
        >>> pe.encode('Schmidt')
        'S530'
        >>> pe.encode('Ashcroft')
        'A225'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        lname = unicode_normalize('NFKD', lname.upper())
        lname = ''.join(c for c in lname if c in self._uc_set)

        # A. Prefix treatment
        if lname.startswith(('VON', 'VAN')):
            lname = lname[3:].strip()

        # The rule implemented below says "MC, MAC become 1". I believe it
        # meant to say they become M except in German data (where superscripted
        # 1 indicates "except in German data"). It doesn't make sense for them
        # to become 1 (BPFV -> 1) or to apply outside German. Unfortunately,
        # both articles have this error(?).
        if not self._german:
            if lname[:3] == 'MAC':
                lname = 'M' + lname[3:]
            elif lname[:2] == 'MC':
                lname = 'M' + lname[2:]

        # The non-German-only rule to strip ' is unnecessary due to filtering

        if lname[:1] in {'E', 'I', 'O', 'U'}:
            lname = 'A' + lname[1:]
        elif lname[:2] in {'GE', 'GI', 'GY'}:
            lname = 'J' + lname[1:]
        elif lname[:2] in {'CE', 'CI', 'CY'}:
            lname = 'S' + lname[1:]
        elif lname[:3] == 'CHR':
            lname = 'K' + lname[1:]
        elif lname[:1] == 'C' and lname[:2] != 'CH':
            lname = 'K' + lname[1:]

        if lname[:2] == 'KN':
            lname = 'N' + lname[1:]
        elif lname[:2] == 'PH':
            lname = 'F' + lname[1:]
        elif lname[:3] in {'WIE', 'WEI'}:
            lname = 'V' + lname[1:]

        # German-only substitutions for the initial letter.
        german_initials = {'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}
        if self._german and lname[:1] in german_initials:
            lname = german_initials[lname[0]] + lname[1:]

        # The leading letter of the code is fixed here, before the postfix
        # and infix rules can alter the rest of the name.
        code = lname[:1]

        # B. Postfix treatment
        if self._german:  # moved from end of postfix treatment due to blocking
            if lname[-3:] == 'TES':
                lname = lname[:-3]
            elif lname[-2:] == 'TS':
                lname = lname[:-2]
            if lname[-3:] == 'TZE':
                lname = lname[:-3]
            elif lname[-2:] == 'ZE':
                lname = lname[:-2]
            if lname[-1:] == 'Z':
                lname = lname[:-1]
            elif lname[-2:] == 'TE':
                lname = lname[:-2]

        if lname[-1:] == 'R':
            lname = lname[:-1] + 'N'
        elif lname[-2:] in {'SE', 'CE'}:
            lname = lname[:-2]
        if lname[-2:] == 'SS':
            lname = lname[:-2]
        elif lname[-1:] == 'S':
            lname = lname[:-1]

        if not self._german:
            # Suffix rewrites; the 5-letter table takes precedence over the
            # 4-letter one.
            l5_repl = {'STOWN': 'SAWON', 'MPSON': 'MASON'}
            l4_repl = {
                'NSEN': 'ASEN',
                'MSON': 'ASON',
                'STEN': 'SAEN',
                'STON': 'SAON',
            }
            if lname[-5:] in l5_repl:
                lname = lname[:-5] + l5_repl[lname[-5:]]
            elif lname[-4:] in l4_repl:
                lname = lname[:-4] + l4_repl[lname[-4:]]

        if lname[-2:] in {'NG', 'ND'}:
            lname = lname[:-1]
        if not self._german and lname[-3:] in {'GAN', 'GEN'}:
            lname = lname[:-3] + 'A' + lname[-2:]

        # C. Infix Treatment
        lname = lname.replace('CK', 'C')
        lname = lname.replace('SCH', 'S')
        lname = lname.replace('DT', 'T')
        lname = lname.replace('ND', 'N')
        lname = lname.replace('NG', 'N')
        lname = lname.replace('LM', 'M')
        lname = lname.replace('MN', 'M')
        lname = lname.replace('WIE', 'VIE')
        lname = lname.replace('WEI', 'VEI')

        # D. Soundexing
        # code for X & Y are unspecified, but presumably are 2 & 0

        lname = lname.translate(self._trans)
        lname = self._delete_consecutive_repeats(lname)

        # Skip the first digit: the leading letter was already stored in code.
        code += lname[1:]
        code = code.replace('0', '')  # rule 1

        if self._max_length != -1:
            if len(code) < self._max_length:
                code += '0' * (self._max_length - len(code))
            else:
                code = code[: self._max_length]

        return code
249
250
251 1
# When executed as a script, run the doctest examples embedded in this
# module's docstrings.
if __name__ == '__main__':
    import doctest

    doctest.testmod()
255