Completed
Pull Request — master (#141)
by Chris
11:42
created

abydos.phonetic._PSHPSoundexLast   A

Complexity

Total Complexity 37

Size/Duplication

Total Lines 235
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
eloc 105
dl 0
loc 235
ccs 90
cts 90
cp 1
rs 9.44
c 0
b 0
f 0
wmc 37

1 Method

Rating   Name   Duplication   Size   Complexity  
F PSHPSoundexLast.encode() 0 141 36

1 Function

Rating   Name   Duplication   Size   Complexity  
A pshp_soundex_last() 0 28 1
1
# -*- coding: utf-8 -*-
0 ignored issues
show
Coding Style Naming introduced by
The name _PSHPSoundexLast does not conform to the module naming conventions ((([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$).

This check looks for invalid names for a range of different identifiers.

You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements.

If your project includes a Pylint configuration file, the settings contained in that file take precedence.

To find out more about Pylint, please refer to their site.

Loading history...
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.phonetic._PSHPSoundexLast.
20
21
PSHP Soundex/Viewex Coding for last names
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from unicodedata import normalize as unicode_normalize
32
33 1
from six import text_type
34
35 1
from ._Phonetic import Phonetic
36
37 1
# Public API of this module: the encoder class and its functional wrapper.
__all__ = ['PSHPSoundexLast', 'pshp_soundex_last']
38
39
40 1
class PSHPSoundexLast(Phonetic):
    """PSHP Soundex/Viewex Coding of a last name.

    This coding is based on :cite:`Hershberg:1976`.

    Reference was also made to the German version of the same:
    :cite:`Hershberg:1979`.

    A separate class, :py:class:`PSHPSoundexFirst`, is used for first names.
    """

    # Maps the code point of each uppercase ASCII letter to its digit code,
    # in the form expected by ``str.translate``.
    _trans = dict(
        zip(
            (ord(_) for _ in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
            '01230120022455012523010202',
        )
    )

    # Initial-letter substitutions applied only to German names.
    _german_initials = {'W': 'V', 'M': 'N', 'Y': 'J', 'Z': 'S'}

    # Name-ending substitutions applied only to non-German names; the
    # five-letter endings take precedence over the four-letter ones.
    _l5_repl = {'STOWN': 'SAWON', 'MPSON': 'MASON'}
    _l4_repl = {
        'NSEN': 'ASEN',
        'MSON': 'ASON',
        'STEN': 'SAEN',
        'STON': 'SAON',
    }

    # Infix substitutions, applied everywhere in the name in this exact
    # order (later rules see the output of earlier ones).
    _infix_repl = (
        ('CK', 'C'),
        ('SCH', 'S'),
        ('DT', 'T'),
        ('ND', 'N'),
        ('NG', 'N'),
        ('LM', 'M'),
        ('MN', 'M'),
        ('WIE', 'VIE'),
        ('WEI', 'VEI'),
    )

    def _treat_prefix(self, lname, german):
        """Apply the prefix rules (section A) to an already-filtered name."""
        if lname[:3] in {'VON', 'VAN'}:
            lname = lname[3:].strip()

        # The rule implemented below says "MC, MAC become 1". I believe it
        # meant to say they become M except in German data (where superscripted
        # 1 indicates "except in German data"). It doesn't make sense for them
        # to become 1 (BPFV -> 1) or to apply outside German. Unfortunately,
        # both articles have this error(?).
        if not german:
            if lname[:3] == 'MAC':
                lname = 'M' + lname[3:]
            elif lname[:2] == 'MC':
                lname = 'M' + lname[2:]

        # The non-German-only rule to strip ' is unnecessary due to filtering

        # Each rule below replaces only the initial letter; the rest of the
        # name (including the letters that triggered the rule) is kept.
        if lname[:1] in {'E', 'I', 'O', 'U'}:
            lname = 'A' + lname[1:]
        elif lname[:2] in {'GE', 'GI', 'GY'}:
            lname = 'J' + lname[1:]
        elif lname[:2] in {'CE', 'CI', 'CY'}:
            lname = 'S' + lname[1:]
        elif lname[:3] == 'CHR':
            lname = 'K' + lname[1:]
        elif lname[:1] == 'C' and lname[:2] != 'CH':
            lname = 'K' + lname[1:]

        if lname[:2] == 'KN':
            lname = 'N' + lname[1:]
        elif lname[:2] == 'PH':
            lname = 'F' + lname[1:]
        elif lname[:3] in {'WIE', 'WEI'}:
            lname = 'V' + lname[1:]

        if german and lname[:1] in self._german_initials:
            lname = self._german_initials[lname[0]] + lname[1:]

        return lname

    def _treat_postfix(self, lname, german):
        """Apply the postfix rules (section B) to the name."""
        if german:  # moved from end of postfix treatment due to blocking
            if lname[-3:] == 'TES':
                lname = lname[:-3]
            elif lname[-2:] == 'TS':
                lname = lname[:-2]
            if lname[-3:] == 'TZE':
                lname = lname[:-3]
            elif lname[-2:] == 'ZE':
                lname = lname[:-2]
            if lname[-1:] == 'Z':
                lname = lname[:-1]
            elif lname[-2:] == 'TE':
                lname = lname[:-2]

        if lname[-1:] == 'R':
            lname = lname[:-1] + 'N'
        elif lname[-2:] in {'SE', 'CE'}:
            lname = lname[:-2]
        if lname[-2:] == 'SS':
            lname = lname[:-2]
        elif lname[-1:] == 'S':
            lname = lname[:-1]

        if not german:
            if lname[-5:] in self._l5_repl:
                lname = lname[:-5] + self._l5_repl[lname[-5:]]
            elif lname[-4:] in self._l4_repl:
                lname = lname[:-4] + self._l4_repl[lname[-4:]]

        # Final NG/ND lose their last letter.
        if lname[-2:] in {'NG', 'ND'}:
            lname = lname[:-1]
        # Final GAN/GEN: the G is replaced by A, the last two letters kept.
        if not german and lname[-3:] in {'GAN', 'GEN'}:
            lname = lname[:-3] + 'A' + lname[-2:]

        return lname

    def _treat_infix(self, lname):
        """Apply the infix rules (section C) to the name."""
        for old, new in self._infix_repl:
            lname = lname.replace(old, new)
        return lname

    def encode(self, lname, max_length=4, german=False):
        """Calculate the PSHP Soundex/Viewex Coding of a last name.

        The name is uppercased, NFKD-normalized and stripped of every
        character not in ``self._uc_set`` (defined outside this class,
        presumably by :py:class:`Phonetic`) before the prefix, postfix,
        infix and soundexing rules are applied.

        Args:
            lname (str): The last name to encode
            max_length (int): The length of the code returned (defaults to 4).
                Shorter codes are right-padded with ``'0'`` and longer codes
                are truncated. Pass -1 to return the code as computed, with
                neither padding nor truncation.
            german (bool): Set to True if the name is German (different rules
                apply)

        Returns:
            str: The PSHP Soundex/Viewex Coding

        Examples:
            >>> pe = PSHPSoundexLast()
            >>> pe.encode('Smith')
            'S530'
            >>> pe.encode('Waters')
            'W350'
            >>> pe.encode('James')
            'J500'
            >>> pe.encode('Schmidt')
            'S530'
            >>> pe.encode('Ashcroft')
            'A225'

        """
        lname = unicode_normalize('NFKD', text_type(lname.upper()))
        lname = lname.replace('ß', 'SS')
        lname = ''.join(c for c in lname if c in self._uc_set)

        # A. Prefix treatment
        lname = self._treat_prefix(lname, german)

        # The code's leading letter is fixed here, before the postfix and
        # infix rules get a chance to alter the name further.
        code = lname[:1]

        # B. Postfix treatment
        lname = self._treat_postfix(lname, german)

        # C. Infix Treatment
        lname = self._treat_infix(lname)

        # D. Soundexing
        # code for X & Y are unspecified, but presumably are 2 & 0
        lname = lname.translate(self._trans)
        lname = self._delete_consecutive_repeats(lname)

        # Skip the first digit: the leading letter is already in the code.
        code += lname[1:]
        code = code.replace('0', '')  # rule 1

        if max_length != -1:
            if len(code) < max_length:
                code += '0' * (max_length - len(code))
            else:
                code = code[:max_length]

        return code
199
200
201 1
def pshp_soundex_last(lname, max_length=4, german=False):
    """Calculate the PSHP Soundex/Viewex Coding of a last name.

    This is a wrapper for :py:meth:`PSHPSoundexLast.encode`: it creates a
    new :py:class:`PSHPSoundexLast` instance and forwards all arguments to it.

    Args:
        lname (str): The last name to encode
        max_length (int): The length of the code returned (defaults to 4)
        german (bool): Set to True if the name is German (different rules
            apply)

    Returns:
        str: The PSHP Soundex/Viewex Coding

    Examples:
        >>> pshp_soundex_last('Smith')
        'S530'
        >>> pshp_soundex_last('Waters')
        'W350'
        >>> pshp_soundex_last('James')
        'J500'
        >>> pshp_soundex_last('Schmidt')
        'S530'
        >>> pshp_soundex_last('Ashcroft')
        'A225'

    """
    return PSHPSoundexLast().encode(
        lname, max_length=max_length, german=german
    )
229
230
231
if __name__ == '__main__':
    # When executed as a script, run the doctest examples in this module.
    import doctest

    doctest.testmod()
235