Test Failed
Push — master ( 23810f...afe14d )
by Chris
09:47
created

DaitchMokotoff.encode()   D

Complexity

Conditions 12

Size

Total Lines 99
Code Lines 37

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 32
CRAP Score 12

Importance

Changes 0
Metric Value
cc 12
eloc 37
nop 4
dl 0
loc 99
ccs 32
cts 32
cp 1
crap 12
rs 4.8
c 0
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

Complexity

Complex classes like abydos.phonetic._daitch_mokotoff.DaitchMokotoff.encode() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.phonetic._daitch_mokotoff.
20
21
Daitch-Mokotoff Soundex
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from unicodedata import normalize as unicode_normalize
32
33 1
from six import text_type
34
35 1
from ._phonetic import _Phonetic
36
37 1
# Names exported by ``from <this module> import *``: the encoder class and
# its function-style wrapper, both defined below.
__all__ = ['DaitchMokotoff', 'dm_soundex']
38
39
40 1
class DaitchMokotoff(_Phonetic):
    """Daitch-Mokotoff Soundex.

    Based on Daitch-Mokotoff Soundex :cite:`Mokotoff:1997`, this returns values
    of a word as a set. A collection is necessary since there can be multiple
    values for a single word.
    """

    # Maps each recognised letter sequence to a triple of codes, indexed by
    # the position of the sequence in the word:
    #   [0] at the start of the word,
    #   [1] immediately before a vowel (see _uc_v_set),
    #   [2] anywhere else.
    # An entry that is itself a 2-tuple, e.g. (5, 4), means the sequence has
    # two alternative codes and every code built so far is forked into both.
    # '_' is a placeholder that is stripped from the final code (after
    # consecutive repeats have been collapsed).
    _dms_table = {
        'STCH': (2, 4, 4),
        'DRZ': (4, 4, 4),
        'ZH': (4, 4, 4),
        'ZHDZH': (2, 4, 4),
        'DZH': (4, 4, 4),
        'DRS': (4, 4, 4),
        'DZS': (4, 4, 4),
        'SCHTCH': (2, 4, 4),
        'SHTSH': (2, 4, 4),
        'SZCZ': (2, 4, 4),
        'TZS': (4, 4, 4),
        'SZCS': (2, 4, 4),
        'STSH': (2, 4, 4),
        'SHCH': (2, 4, 4),
        'D': (3, 3, 3),
        'H': (5, 5, '_'),
        'TTSCH': (4, 4, 4),
        'THS': (4, 4, 4),
        'L': (8, 8, 8),
        'P': (7, 7, 7),
        'CHS': (5, 54, 54),
        'T': (3, 3, 3),
        'X': (5, 54, 54),
        'OJ': (0, 1, '_'),
        'OI': (0, 1, '_'),
        'SCHTSH': (2, 4, 4),
        'OY': (0, 1, '_'),
        'Y': (1, '_', '_'),
        'TSH': (4, 4, 4),
        'ZDZ': (2, 4, 4),
        'TSZ': (4, 4, 4),
        'SHT': (2, 43, 43),
        'SCHTSCH': (2, 4, 4),
        'TTSZ': (4, 4, 4),
        'TTZ': (4, 4, 4),
        'SCH': (4, 4, 4),
        'TTS': (4, 4, 4),
        'SZD': (2, 43, 43),
        'AI': (0, 1, '_'),
        'PF': (7, 7, 7),
        'TCH': (4, 4, 4),
        'PH': (7, 7, 7),
        'TTCH': (4, 4, 4),
        'SZT': (2, 43, 43),
        'ZDZH': (2, 4, 4),
        'EI': (0, 1, '_'),
        'G': (5, 5, 5),
        'EJ': (0, 1, '_'),
        'ZD': (2, 43, 43),
        'IU': (1, '_', '_'),
        'K': (5, 5, 5),
        'O': (0, '_', '_'),
        'SHTCH': (2, 4, 4),
        'S': (4, 4, 4),
        'TRZ': (4, 4, 4),
        'SHD': (2, 43, 43),
        'DSH': (4, 4, 4),
        'CSZ': (4, 4, 4),
        'EU': (1, 1, '_'),
        'TRS': (4, 4, 4),
        'ZS': (4, 4, 4),
        'STRZ': (2, 4, 4),
        'UY': (0, 1, '_'),
        'STRS': (2, 4, 4),
        'CZS': (4, 4, 4),
        'MN': ('6_6', '6_6', '6_6'),
        'UI': (0, 1, '_'),
        'UJ': (0, 1, '_'),
        'UE': (0, '_', '_'),
        'EY': (0, 1, '_'),
        'W': (7, 7, 7),
        'IA': (1, '_', '_'),
        'FB': (7, 7, 7),
        'STSCH': (2, 4, 4),
        'SCHT': (2, 43, 43),
        'NM': ('6_6', '6_6', '6_6'),
        'SCHD': (2, 43, 43),
        'B': (7, 7, 7),
        'DSZ': (4, 4, 4),
        'F': (7, 7, 7),
        'N': (6, 6, 6),
        'CZ': (4, 4, 4),
        'R': (9, 9, 9),
        'U': (0, '_', '_'),
        'V': (7, 7, 7),
        'CS': (4, 4, 4),
        'Z': (4, 4, 4),
        'SZ': (4, 4, 4),
        'TSCH': (4, 4, 4),
        'KH': (5, 5, 5),
        'ST': (2, 43, 43),
        'KS': (5, 54, 54),
        'SH': (4, 4, 4),
        'SC': (2, 4, 4),
        'SD': (2, 43, 43),
        'DZ': (4, 4, 4),
        'ZHD': (2, 43, 43),
        'DT': (3, 3, 3),
        'ZSH': (4, 4, 4),
        'DS': (4, 4, 4),
        'TZ': (4, 4, 4),
        'TS': (4, 4, 4),
        'TH': (3, 3, 3),
        'TC': (4, 4, 4),
        'A': (0, '_', '_'),
        'E': (0, '_', '_'),
        'I': (0, '_', '_'),
        'AJ': (0, 1, '_'),
        'M': (6, 6, 6),
        'Q': (5, 5, 5),
        'AU': (0, 7, '_'),
        'IO': (1, '_', '_'),
        'AY': (0, 1, '_'),
        'IE': (1, '_', '_'),
        'ZSCH': (4, 4, 4),
        'CH': ((5, 4), (5, 4), (5, 4)),
        'CK': ((5, 45), (5, 45), (5, 45)),
        'C': ((5, 4), (5, 4), (5, 4)),
        'J': ((1, 4), ('_', 4), ('_', 4)),
        'RZ': ((94, 4), (94, 4), (94, 4)),
        'RS': ((94, 4), (94, 4), (94, 4)),
    }

    # For each initial letter, the candidate sequences from _dms_table in the
    # order they are tried. Longer sequences come first so that the longest
    # match wins; every tuple ends with the bare letter, so a match is always
    # found.
    _dms_order = {
        'A': ('AI', 'AJ', 'AU', 'AY', 'A'),
        'B': ('B',),
        'C': ('CHS', 'CSZ', 'CZS', 'CH', 'CK', 'CS', 'CZ', 'C'),
        'D': ('DRS', 'DRZ', 'DSH', 'DSZ', 'DZH', 'DZS', 'DS', 'DT', 'DZ', 'D'),
        'E': ('EI', 'EJ', 'EU', 'EY', 'E'),
        'F': ('FB', 'F'),
        'G': ('G',),
        'H': ('H',),
        'I': ('IA', 'IE', 'IO', 'IU', 'I'),
        'J': ('J',),
        'K': ('KH', 'KS', 'K'),
        'L': ('L',),
        'M': ('MN', 'M'),
        'N': ('NM', 'N'),
        'O': ('OI', 'OJ', 'OY', 'O'),
        'P': ('PF', 'PH', 'P'),
        'Q': ('Q',),
        'R': ('RS', 'RZ', 'R'),
        'S': (
            'SCHTSCH',
            'SCHTCH',
            'SCHTSH',
            'SHTCH',
            'SHTSH',
            'STSCH',
            'SCHD',
            'SCHT',
            'SHCH',
            'STCH',
            'STRS',
            'STRZ',
            'STSH',
            'SZCS',
            'SZCZ',
            'SCH',
            'SHD',
            'SHT',
            'SZD',
            'SZT',
            'SC',
            'SD',
            'SH',
            'ST',
            'SZ',
            'S',
        ),
        'T': (
            'TTSCH',
            'TSCH',
            'TTCH',
            'TTSZ',
            'TCH',
            'THS',
            'TRS',
            'TRZ',
            'TSH',
            'TSZ',
            'TTS',
            'TTZ',
            'TZS',
            'TC',
            'TH',
            'TS',
            'TZ',
            'T',
        ),
        'U': ('UE', 'UI', 'UJ', 'UY', 'U'),
        'V': ('V',),
        'W': ('W',),
        'X': ('X',),
        'Y': ('Y',),
        'Z': (
            'ZHDZH',
            'ZDZH',
            'ZSCH',
            'ZDZ',
            'ZHD',
            'ZSH',
            'ZD',
            'ZH',
            'ZS',
            'Z',
        ),
    }

    # Letters treated as vowels when choosing the "before a vowel" variant.
    _uc_v_set = set('AEIJOUY')

    def _dms_positional_value(self, word, pos, sstr):
        """Return the table entry of ``sstr`` for its position in ``word``.

        Parameters
        ----------
        word : str
            The normalized word being encoded
        pos : int
            Index in ``word`` at which ``sstr`` starts
        sstr : str
            The matched letter sequence (a key of ``_dms_table``)

        Returns
        -------
        int, str, or tuple
            The start-of-word, pre-vocalic, or elsewhere variant of the entry

        """
        variants = self._dms_table[sstr]
        if pos == 0:
            return variants[0]
        following = pos + len(sstr)
        if following < len(word) and word[following] in self._uc_v_set:
            return variants[1]
        return variants[2]

    @staticmethod
    def _dms_extend(codes, dm_val):
        """Append ``dm_val`` to every partial code in ``codes``.

        Parameters
        ----------
        codes : list
            The partial codes built so far
        dm_val : int, str, or tuple
            The value to append; a tuple holds two alternatives, in which case
            each partial code is forked (all codes extended with the first
            alternative, followed by all codes extended with the second)

        Returns
        -------
        list
            The extended partial codes

        """
        if isinstance(dm_val, tuple):
            first = text_type(dm_val[0])
            second = text_type(dm_val[1])
            return [code + first for code in codes] + [
                code + second for code in codes
            ]
        suffix = text_type(dm_val)
        return [code + suffix for code in codes]

    def encode(self, word, max_length=6, zero_pad=True):
        """Return the Daitch-Mokotoff Soundex code for a word.

        Parameters
        ----------
        word : str
            The word to transform
        max_length : int
            The length of the code returned (defaults to 6). Values are
            clamped to the range 6 to 64; -1 selects the maximum, 64.
        zero_pad : bool
            Pad the end of the return value with 0s to achieve a max_length
            string

        Returns
        -------
        set
            The Daitch-Mokotoff Soundex values of the word, as strings. More
            than one value is returned when a letter sequence has alternative
            codes. A word with no encodable letters yields a single all-zero
            code (``'0'`` when ``zero_pad`` is false).

        Examples
        --------
        >>> pe = DaitchMokotoff()
        >>> sorted(pe.encode('Christopher'))
        ['494379', '594379']
        >>> pe.encode('Niall')
        {'680000'}
        >>> pe.encode('Smith')
        {'463000'}
        >>> pe.encode('Schmidt')
        {'463000'}

        >>> sorted(pe.encode('The quick brown fox', max_length=20,
        ... zero_pad=False))
        ['35457976754', '3557976754']

        """
        # Require a max_length of at least 6 and not more than 64
        if max_length != -1:
            max_length = min(max(6, max_length), 64)
        else:
            max_length = 64

        # uppercase, normalize, decompose, and filter out every character
        # that is not in the inherited _uc_set
        word = unicode_normalize('NFKD', text_type(word.upper()))
        word = word.replace('ß', 'SS')
        word = ''.join(c for c in word if c in self._uc_set)

        # Nothing to convert, return base case
        if not word:
            if zero_pad:
                return {'0' * max_length}
            return {'0'}

        # NOTE(review): every sequence with alternative codes doubles the
        # number of partial codes, so the list grows exponentially for words
        # containing many such sequences.
        partial_codes = ['']
        pos = 0
        while pos < len(word):
            # Iterate through _dms_order, which specifies the possible
            # substrings for which codes exist in the Daitch-Mokotoff coding
            for sstr in self._dms_order[word[pos]]:  # pragma: no branch
                # startswith with an offset avoids copying the tail of the
                # word for every candidate
                if word.startswith(sstr, pos):
                    # Having retrieved the matching substring, pick the
                    # correct positional variant (first, pre-vocalic,
                    # elsewhere) and append it to the code strings
                    dm_val = self._dms_positional_value(word, pos, sstr)
                    partial_codes = self._dms_extend(partial_codes, dm_val)
                    pos += len(sstr)
                    break

        # Filter out double letters and _ placeholders
        cleaned = (
            ''.join(
                c for c in self._delete_consecutive_repeats(code) if c != '_'
            )
            for code in partial_codes
        )

        # Trim (and optionally pad) the codes and return them as a set
        if zero_pad:
            return {(code + ('0' * max_length))[:max_length] for code in cleaned}
        return {code[:max_length] for code in cleaned}
359
360
361 1
def dm_soundex(word, max_length=6, zero_pad=True):
    """Return the Daitch-Mokotoff Soundex code for a word.

    This is a wrapper for :py:meth:`DaitchMokotoff.encode`: it creates a
    fresh :py:class:`DaitchMokotoff` instance and forwards all arguments
    unchanged.

    Parameters
    ----------
    word : str
        The word to transform
    max_length : int
        The length of the code returned (defaults to 6; must be between 6 and
        64)
    zero_pad : bool
        Pad the end of the return value with 0s to achieve a max_length string

    Returns
    -------
    set
        The Daitch-Mokotoff Soundex value(s), exactly as returned by
        :py:meth:`DaitchMokotoff.encode`

    Examples
    --------
    >>> sorted(dm_soundex('Christopher'))
    ['494379', '594379']
    >>> dm_soundex('Niall')
    {'680000'}
    >>> dm_soundex('Smith')
    {'463000'}
    >>> dm_soundex('Schmidt')
    {'463000'}

    >>> sorted(dm_soundex('The quick brown fox', max_length=20,
    ... zero_pad=False))
    ['35457976754', '3557976754']

    """
    encoder = DaitchMokotoff()
    return encoder.encode(word, max_length, zero_pad)
398
399
400
# When executed as a script, run the doctests embedded in this module's
# docstrings.
if __name__ == '__main__':
    from doctest import testmod

    testmod()
404