Completed
Push — master ( f43547...71985b )
by Chris
12:00 queued 10s
created

abydos.phonetic._phonix.Phonix.encode()   F

Complexity

Conditions 24

Size

Total Lines 186
Code Lines 59

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 58
CRAP Score 24

Importance

Changes 0
Metric Value
cc 24
eloc 59
nop 4
dl 0
loc 186
ccs 58
cts 58
cp 1
crap 24
rs 0
c 0
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

Complexity

Complex code like abydos.phonetic._phonix.Phonix.encode() often does a lot of different things. To break it down, we need to identify a cohesive component within the enclosing class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.phonetic._phonix.
20
21
Phonix
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from unicodedata import normalize as unicode_normalize
32
33 1
from six import text_type
34
35 1
from ._phonetic import _Phonetic
36
37 1
__all__ = ['Phonix', 'phonix']
38
39
40 1
class Phonix(_Phonetic):
    """Phonix code.

    Phonix is a Soundex-like algorithm defined in :cite:`Gadd:1990`.

    This implementation is based on:
    - :cite:`Pfeifer:2000`
    - :cite:`Christen:2011`
    - :cite:`Kollar:2007`
    """

    # Consonant set; computed in __init__ from the base class's letter sets.
    _uc_c_set = None

    # Ordered rewrite rules; built in __init__ (see there for the layout).
    _substitutions = None

    # Maps the code point of each uppercase ASCII letter to its digit
    # character, for use with str.translate.
    _trans = dict(
        zip(
            (ord(_) for _ in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
            '01230720022455012683070808',
        )
    )

    def __init__(self):
        """Initialize Phonix.

        Builds the consonant set and the ordered substitution table.

        Each rule is a tuple ``(position, src, tar[, ctx1[, ctx2]])`` where
        ``position`` selects the replacement helper:

        - 0: start of word, optional context is the set of following chars
        - 1: end of word, optional context is the set of preceding chars
        - 2: middle of word, optional contexts are (preceding, following)
        - 3: anywhere in the word

        The rules are applied strictly in the order listed, so the order
        (including repeated rules) must not be changed.
        """
        # Look the base-class sets up once instead of once per rule.
        vowels = super(Phonix, self)._uc_v_set
        self._uc_c_set = super(Phonix, self)._uc_set - vowels
        consonants = self._uc_c_set

        self._substitutions = (
            (3, 'DG', 'G'),
            (3, 'CO', 'KO'),
            (3, 'CA', 'KA'),
            (3, 'CU', 'KU'),
            (3, 'CY', 'SI'),
            (3, 'CI', 'SI'),
            (3, 'CE', 'SE'),
            (0, 'CL', 'KL', vowels),
            (3, 'CK', 'K'),
            (1, 'GC', 'K'),
            (1, 'JC', 'K'),
            (0, 'CHR', 'KR', vowels),
            (0, 'CR', 'KR', vowels),
            (0, 'WR', 'R'),
            (3, 'NC', 'NK'),
            (3, 'CT', 'KT'),
            (3, 'PH', 'F'),
            (3, 'AA', 'AR'),
            (3, 'SCH', 'SH'),
            (3, 'BTL', 'TL'),
            (3, 'GHT', 'T'),
            (3, 'AUGH', 'ARF'),
            (2, 'LJ', 'LD', vowels, vowels),
            (3, 'LOUGH', 'LOW'),
            (0, 'Q', 'KW'),
            (0, 'KN', 'N'),
            (1, 'GN', 'N'),
            (3, 'GHN', 'N'),
            (1, 'GNE', 'N'),
            (3, 'GHNE', 'NE'),
            (1, 'GNES', 'NS'),
            (0, 'GN', 'N'),
            (2, 'GN', 'N', None, consonants),
            (1, 'GN', 'N'),
            (0, 'PS', 'S'),
            (0, 'PT', 'T'),
            (0, 'CZ', 'C'),
            (2, 'WZ', 'Z', vowels),
            (2, 'CZ', 'CH'),
            (3, 'LZ', 'LSH'),
            (3, 'RZ', 'RSH'),
            (2, 'Z', 'S', None, vowels),
            (3, 'ZZ', 'TS'),
            (2, 'Z', 'TS', consonants),
            (3, 'HROUG', 'REW'),
            (3, 'OUGH', 'OF'),
            (2, 'Q', 'KW', vowels, vowels),
            (2, 'J', 'Y', vowels, vowels),
            (0, 'YJ', 'Y', vowels),
            (0, 'GH', 'G'),
            (1, 'GH', 'E', vowels),
            (0, 'CY', 'S'),
            (3, 'NX', 'NKS'),
            (0, 'PF', 'F'),
            (1, 'DT', 'T'),
            (1, 'TL', 'TIL'),
            (1, 'DL', 'DIL'),
            (3, 'YTH', 'ITH'),
            (0, 'TJ', 'CH', vowels),
            (0, 'TSJ', 'CH', vowels),
            (0, 'TS', 'T', vowels),
            (3, 'TCH', 'CH'),
            (2, 'WSK', 'VSKIE', vowels),
            (1, 'WSK', 'VSKIE', vowels),
            (0, 'MN', 'N', vowels),
            (0, 'PN', 'N', vowels),
            (2, 'STL', 'SL', vowels),
            (1, 'STL', 'SL', vowels),
            (1, 'TNT', 'ENT'),
            (1, 'EAUX', 'OH'),
            (3, 'EXCI', 'ECS'),
            (3, 'X', 'ECS'),
            (1, 'NED', 'ND'),
            (3, 'JR', 'DR'),
            (1, 'EE', 'EA'),
            (3, 'ZS', 'S'),
            (2, 'R', 'AH', vowels, consonants),
            (1, 'R', 'AH', vowels),
            (2, 'HR', 'AH', vowels, consonants),
            (1, 'HR', 'AH', vowels),
            (1, 'HR', 'AH', vowels),
            (1, 'RE', 'AR'),
            (1, 'R', 'AH', vowels),
            (3, 'LLE', 'LE'),
            (1, 'LE', 'ILE', consonants),
            (1, 'LES', 'ILES', consonants),
            (1, 'E', ''),
            (1, 'ES', 'S'),
            (1, 'SS', 'AS', vowels),
            (1, 'MB', 'M', vowels),
            (3, 'MPTS', 'MPS'),
            (3, 'MPS', 'MS'),
            (3, 'MPT', 'MT'),
        )

    @staticmethod
    def _start_repl(word, src, tar, post=None):
        """Replace src with tar at the start of word.

        Parameters
        ----------
        word : str
            The word to modify
        src : str
            Substring to match
        tar : str
            Substring to substitute
        post : set
            Following characters; when given, src must be followed by one
            of them for the replacement to happen

        Returns
        -------
        str
            Modified string

        """
        if post:
            matched = word.startswith(tuple(src + char for char in post))
        else:
            matched = word.startswith(src)
        if matched:
            return tar + word[len(src) :]
        return word

    @staticmethod
    def _end_repl(word, src, tar, pre=None):
        """Replace src with tar at the end of word.

        Parameters
        ----------
        word : str
            The word to modify
        src : str
            Substring to match
        tar : str
            Substring to substitute
        pre : set
            Preceding characters; when given, src must be preceded by one
            of them for the replacement to happen

        Returns
        -------
        str
            Modified string

        """
        if pre:
            matched = word.endswith(tuple(char + src for char in pre))
        else:
            matched = word.endswith(src)
        if matched:
            return word[: -len(src)] + tar
        return word

    @staticmethod
    def _all_repl(word, src, tar, pre=None, post=None):
        """Replace src with tar anywhere in word.

        Parameters
        ----------
        word : str
            The word to modify
        src : str
            Substring to match
        tar : str
            Substring to substitute
        pre : set
            Preceding characters
        post : set
            Following characters

        Returns
        -------
        str
            Modified string

        """
        if not (pre or post):
            return word.replace(src, tar)

        # A missing context on one side matches anything, which is
        # expressed here as the empty string.
        pre = pre or frozenset(('',))
        post = post or frozenset(('',))
        for before in pre:
            for after in post:
                word = word.replace(before + src + after, before + tar + after)
        return word

    @staticmethod
    def _mid_repl(word, src, tar, pre=None, post=None):
        """Replace src with tar in the middle of word.

        With no context, the first and last characters are excluded from
        the replacement. With only a following context, the first character
        is excluded; with only a preceding context, the last character is
        excluded. With both contexts, the whole word is processed.

        Parameters
        ----------
        word : str
            The word to modify
        src : str
            Substring to match
        tar : str
            Substring to substitute
        pre : set
            Preceding characters
        post : set
            Following characters

        Returns
        -------
        str
            Modified string

        """
        if not word:
            # Nothing to protect or replace; also avoids indexing word[0].
            return word
        if pre or post:
            if not pre:
                return word[0] + Phonix._all_repl(
                    word[1:], src, tar, pre, post
                )
            if not post:
                return (
                    Phonix._all_repl(word[:-1], src, tar, pre, post)
                    + word[-1]
                )
            return Phonix._all_repl(word, src, tar, pre, post)
        if len(word) < 3:
            # One- and two-letter words have no interior. Without this
            # guard a one-letter word would come back doubled, because
            # word[0] and word[-1] are the same character.
            return word
        return (
            word[0]
            + Phonix._all_repl(word[1:-1], src, tar, pre, post)
            + word[-1]
        )

    # NOTE(review): the static-analysis report for this file flags the
    # signature below as differing from the overridden base-class 'encode'.
    # The base class is not visible here -- confirm before changing it.
    def encode(self, word, max_length=4, zero_pad=True):
        """Return the Phonix code for a word.

        Parameters
        ----------
        word : str
            The word to transform
        max_length : int
            The length of the code returned (defaults to 4); values are
            clamped to the range [4, 64] and -1 is treated as 64
        zero_pad : bool
            Pad the end of the return value with 0s to achieve a max_length
            string

        Returns
        -------
        str
            The Phonix value; when nothing can be encoded this is all
            zeros if zero_pad is set and '0' otherwise

        Examples
        --------
        >>> pe = Phonix()
        >>> pe.encode('Christopher')
        'K683'
        >>> pe.encode('Niall')
        'N400'
        >>> pe.encode('Smith')
        'S530'
        >>> pe.encode('Schmidt')
        'S530'

        """
        # Indexed by the first element of each substitution rule.
        repl_at = (
            self._start_repl,
            self._end_repl,
            self._mid_repl,
            self._all_repl,
        )

        sdx = ''

        # Uppercase, decompose accented characters, then keep only the
        # letters the base class recognizes.
        word = unicode_normalize('NFKD', text_type(word.upper()))
        word = word.replace('ß', 'SS')
        word = ''.join(c for c in word if c in self._uc_set)

        if word:
            for trans in self._substitutions:
                word = repl_at[trans[0]](word, *trans[1:])

        # The substitutions can delete the whole word (the final-'E' rule
        # on a lone 'E'), so re-check before indexing word[0].
        if word:
            # A leading vowel (or Y) is represented by 'v'; any other
            # leading letter is kept as-is and only the rest is coded.
            lead = 'v' if word[0] in self._uc_vy_set else word[0]
            sdx = lead + word[1:].translate(self._trans)
            sdx = self._delete_consecutive_repeats(sdx)
            sdx = sdx.replace('0', '')

        # Clamp max_length to [4, 64]
        if max_length != -1:
            max_length = min(max(4, max_length), 64)
        else:
            max_length = 64

        if zero_pad:
            sdx += '0' * max_length
        if not sdx:
            sdx = '0'
        return sdx[:max_length]
368
369
370 1
def phonix(word, max_length=4, zero_pad=True):
    """Return the Phonix code for a word.

    Convenience wrapper that creates a :py:class:`Phonix` instance and
    delegates to :py:meth:`Phonix.encode`.

    Parameters
    ----------
    word : str
        The word to transform
    max_length : int
        The length of the code returned (defaults to 4)
    zero_pad : bool
        Pad the end of the return value with 0s to achieve a max_length string

    Returns
    -------
    str
        The Phonix value

    Examples
    --------
    >>> phonix('Christopher')
    'K683'
    >>> phonix('Niall')
    'N400'
    >>> phonix('Smith')
    'S530'
    >>> phonix('Schmidt')
    'S530'

    """
    encoder = Phonix()
    return encoder.encode(word, max_length=max_length, zero_pad=zero_pad)
402
403
404
if __name__ == '__main__':
    # Run this module's docstring examples when it is executed directly.
    from doctest import testmod

    testmod()
408