Test Failed
Push — master ( 23810f...afe14d )
by Chris
09:47
created

abydos.phonetic._alpha_sis.AlphaSIS.encode()   F

Complexity

Conditions 14

Size

Total Lines 88
Code Lines 41

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 40
CRAP Score 14

Importance

Changes 0
Metric Value
cc 14
eloc 41
nop 3
dl 0
loc 88
ccs 40
cts 40
cp 1
crap 14
rs 3.6
c 0
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

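Commonly applied refactorings include Extract Method and, when many parameters or temporary variables are involved, Replace Temp with Query, Introduce Parameter Object, and Preserve Whole Object.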

Complexity

Complex units like abydos.phonetic._alpha_sis.AlphaSIS.encode() often do a lot of different things. To break such a unit down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for fields or methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.phonetic._alpha_sis.
20
21
IBM's Alpha Search Inquiry System coding
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from unicodedata import normalize as unicode_normalize
32
33 1
from six import text_type
34 1
from six.moves import range
35
36 1
from ._phonetic import _Phonetic
37
38 1
__all__ = ['AlphaSIS', 'alpha_sis']
39
40
41 1
class AlphaSIS(_Phonetic):
    """Alpha-SIS.

    The Alpha Search Inquiry System code is defined in :cite:`IBM:1973`.
    This implementation is based on the description in :cite:`Moore:1977`.
    """

    # Codes emitted for special substrings at the very start of a word.
    _alpha_sis_initials = {
        'GF': '08',
        'GM': '03',
        'GN': '02',
        'KN': '02',
        'PF': '08',
        'PN': '02',
        'PS': '00',
        'WR': '04',
        'A': '1',
        'E': '1',
        'H': '2',
        'I': '1',
        'J': '3',
        'O': '1',
        'U': '1',
        'W': '4',
        'Y': '5',
    }
    # Probe order for the initials table: two-letter prefixes are tried
    # before single letters, and the first match wins.
    _alpha_sis_initials_order = (
        'GF',
        'GM',
        'GN',
        'KN',
        'PF',
        'PN',
        'PS',
        'WR',
        'A',
        'E',
        'H',
        'I',
        'J',
        'O',
        'U',
        'W',
        'Y',
    )
    # Codes emitted for substrings anywhere after the initial substring.
    # A tuple value lists alternatives: every code built so far is extended
    # once per alternative, so the number of returned codes multiplies.
    _alpha_sis_basic = {
        'SCH': '6',
        'CZ': ('70', '6', '0'),
        'CH': ('6', '70', '0'),
        'CK': ('7', '6'),
        'DS': ('0', '10'),
        'DZ': ('0', '10'),
        'TS': ('0', '10'),
        'TZ': ('0', '10'),
        'CI': '0',
        'CY': '0',
        'CE': '0',
        'SH': '6',
        'DG': '7',
        'PH': '8',
        'C': ('7', '6'),
        'K': ('7', '6'),
        'Z': '0',
        'S': '0',
        'D': '1',
        'T': '1',
        'N': '2',
        'M': '3',
        'R': '4',
        'L': '5',
        'J': '6',
        'G': '7',
        'Q': '7',
        'X': '7',
        'F': '8',
        'V': '8',
        'B': '9',
        'P': '9',
    }
    # Probe order for the basic table: longer substrings come first and the
    # first match wins. Each key is listed exactly once; a repeated key
    # could never be reached because its first occurrence always matches.
    _alpha_sis_basic_order = (
        'SCH',
        'CZ',
        'CH',
        'CK',
        'DS',
        'DZ',
        'TS',
        'TZ',
        'CI',
        'CY',
        'CE',
        'SH',
        'DG',
        'PH',
        'C',
        'K',
        'Z',
        'S',
        'D',
        'T',
        'N',
        'M',
        'R',
        'L',
        'J',
        'G',
        'Q',
        'X',
        'F',
        'V',
        'B',
        'P',
    )

    def _alpha_sis_initial(self, word):
        """Return the initial code and the number of characters consumed.

        Parameters
        ----------
        word : str
            The normalized (uppercase, letters only) word

        Returns
        -------
        tuple
            ``(code, consumed)``: the code of the first matching initial
            substring and its length, or ``('0', 0)`` if none matches

        """
        for prefix in self._alpha_sis_initials_order:
            if word.startswith(prefix):
                return self._alpha_sis_initials[prefix], len(prefix)
        return '0', 0

    def _alpha_sis_expand(self, word, pos, codes):
        """Encode ``word[pos:]``, extending every code in ``codes``.

        Parameters
        ----------
        word : str
            The normalized (uppercase, letters only) word
        pos : int
            The index in ``word`` at which encoding resumes
        codes : list
            The partial codes produced so far

        Returns
        -------
        list
            The extended codes; characters without a table entry are
            recorded as the placeholder ``'_'``

        """
        while pos < len(word):
            for key in self._alpha_sis_basic_order:
                # startswith with an offset avoids slicing the word for
                # every probed key
                if word.startswith(key, pos):
                    values = self._alpha_sis_basic[key]
                    if not isinstance(values, tuple):
                        values = (values,)
                    # Alternatives vary slowest so that the primary coding
                    # (first alternative everywhere) stays in first place
                    codes = [
                        code + value for value in values for code in codes
                    ]
                    pos += len(key)
                    break
            else:
                # No table entry starts here: keep a placeholder so that
                # equal codes on either side are not treated as a doublet
                codes = [code + '_' for code in codes]
                pos += 1
        return codes

    @staticmethod
    def _alpha_sis_trim(code):
        """Return ``code`` with adjacent repeated symbols trimmed.

        Parameters
        ----------
        word : str
            Not used; see ``code``
        code : str
            A raw code, possibly containing ``'_'`` placeholders

        Returns
        -------
        str
            The code with the second symbol of each adjacent pair removed

        """
        pos = 1
        while pos < len(code):
            if code[pos] == code[pos - 1]:
                code = code[:pos] + code[pos + 1 :]
            # NOTE(review): the position advances even after a deletion, so
            # a run of three or more equal symbols is not fully collapsed
            # (e.g. '0000' becomes '00'). This is kept as-is to preserve
            # existing output; confirm against the specification.
            pos += 1
        return code

    def encode(self, word, max_length=14):
        """Return the IBM Alpha Search Inquiry System code for a word.

        A collection is necessary as the return type since there can be
        multiple values for a single word. But the collection must be ordered
        since the first value is the primary coding.

        Parameters
        ----------
        word : str
            The word to transform
        max_length : int
            The length of the code returned (defaults to 14); values are
            clamped to the range [4, 64] and -1 selects 64

        Returns
        -------
        tuple
            The Alpha-SIS value

        Examples
        --------
        >>> pe = AlphaSIS()
        >>> pe.encode('Christopher')
        ('06401840000000', '07040184000000', '04018400000000')
        >>> pe.encode('Niall')
        ('02500000000000',)
        >>> pe.encode('Smith')
        ('03100000000000',)
        >>> pe.encode('Schmidt')
        ('06310000000000',)

        """
        # Normalize: uppercase, decompose, expand eszett, keep letters only
        word = unicode_normalize('NFKD', text_type(word.upper()))
        word = word.replace('ß', 'SS')
        word = ''.join(c for c in word if c in self._uc_set)

        # Clamp max_length to [4, 64]
        if max_length != -1:
            max_length = min(max(4, max_length), 64)
        else:
            max_length = 64

        # Special processing for initial substrings, then the main encoding
        # loop over whatever remains of the word
        initial, pos = self._alpha_sis_initial(word)
        codes = self._alpha_sis_expand(word, pos, [initial])

        # Trim doublets first, then drop placeholders: the placeholders must
        # still be present while trimming so they can separate equal codes
        codes = [
            self._alpha_sis_trim(code).replace('_', '') for code in codes
        ]

        # Zero-pad or truncate every code to max_length and return a tuple
        padding = '0' * max_length
        return tuple((code + padding)[:max_length] for code in codes)
245
246
247 1
def alpha_sis(word, max_length=14):
    """Return the IBM Alpha Search Inquiry System code for a word.

    This is a convenience wrapper that builds an :py:class:`AlphaSIS`
    instance and delegates to :py:meth:`AlphaSIS.encode`.

    Parameters
    ----------
    word : str
        The word to transform
    max_length : int
        The length of the code returned (defaults to 14)

    Returns
    -------
    tuple
        The Alpha-SIS value

    Examples
    --------
    >>> alpha_sis('Christopher')
    ('06401840000000', '07040184000000', '04018400000000')
    >>> alpha_sis('Niall')
    ('02500000000000',)
    >>> alpha_sis('Smith')
    ('03100000000000',)
    >>> alpha_sis('Schmidt')
    ('06310000000000',)

    """
    encoder = AlphaSIS()
    return encoder.encode(word, max_length=max_length)
277
278
279
if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in this module.
    from doctest import testmod

    testmod()
283