Completed
Push — master ( 3ac297...afe14d )
by Chris
16:40 queued 07:25
created

abydos.phonetic._daitch_mokotoff.dm_soundex()   A

Complexity

Conditions 1

Size

Total Lines 37
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 2
nop 3
dl 0
loc 37
ccs 2
cts 2
cp 1
crap 1
rs 10
c 0
b 0
f 0
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.phonetic._daitch_mokotoff.
20
21
Daitch-Mokotoff Soundex
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from unicodedata import normalize as unicode_normalize
32
33 1
from six import text_type
34
35 1
from ._phonetic import _Phonetic
36
37 1
__all__ = ['DaitchMokotoff', 'dm_soundex']
38
39
40 1
class DaitchMokotoff(_Phonetic):
    """Daitch-Mokotoff Soundex.

    Based on Daitch-Mokotoff Soundex :cite:`Mokotoff:1997`, this returns values
    of a word as a set. A collection is necessary since there can be multiple
    values for a single word.
    """

    # Maps each recognised letter sequence to a triple of codes, selected by
    # position in encode(): (at the start of the word, before a vowel,
    # elsewhere). A code of '_' is a placeholder that is stripped from the
    # final result. Where an element is itself a 2-tuple, the sequence has two
    # alternative codes and encode() branches, producing one result per
    # alternative.
    _dms_table = {
        'STCH': (2, 4, 4),
        'DRZ': (4, 4, 4),
        'ZH': (4, 4, 4),
        'ZHDZH': (2, 4, 4),
        'DZH': (4, 4, 4),
        'DRS': (4, 4, 4),
        'DZS': (4, 4, 4),
        'SCHTCH': (2, 4, 4),
        'SHTSH': (2, 4, 4),
        'SZCZ': (2, 4, 4),
        'TZS': (4, 4, 4),
        'SZCS': (2, 4, 4),
        'STSH': (2, 4, 4),
        'SHCH': (2, 4, 4),
        'D': (3, 3, 3),
        'H': (5, 5, '_'),
        'TTSCH': (4, 4, 4),
        'THS': (4, 4, 4),
        'L': (8, 8, 8),
        'P': (7, 7, 7),
        'CHS': (5, 54, 54),
        'T': (3, 3, 3),
        'X': (5, 54, 54),
        'OJ': (0, 1, '_'),
        'OI': (0, 1, '_'),
        'SCHTSH': (2, 4, 4),
        'OY': (0, 1, '_'),
        'Y': (1, '_', '_'),
        'TSH': (4, 4, 4),
        'ZDZ': (2, 4, 4),
        'TSZ': (4, 4, 4),
        'SHT': (2, 43, 43),
        'SCHTSCH': (2, 4, 4),
        'TTSZ': (4, 4, 4),
        'TTZ': (4, 4, 4),
        'SCH': (4, 4, 4),
        'TTS': (4, 4, 4),
        'SZD': (2, 43, 43),
        'AI': (0, 1, '_'),
        'PF': (7, 7, 7),
        'TCH': (4, 4, 4),
        'PH': (7, 7, 7),
        'TTCH': (4, 4, 4),
        'SZT': (2, 43, 43),
        'ZDZH': (2, 4, 4),
        'EI': (0, 1, '_'),
        'G': (5, 5, 5),
        'EJ': (0, 1, '_'),
        'ZD': (2, 43, 43),
        'IU': (1, '_', '_'),
        'K': (5, 5, 5),
        'O': (0, '_', '_'),
        'SHTCH': (2, 4, 4),
        'S': (4, 4, 4),
        'TRZ': (4, 4, 4),
        'SHD': (2, 43, 43),
        'DSH': (4, 4, 4),
        'CSZ': (4, 4, 4),
        'EU': (1, 1, '_'),
        'TRS': (4, 4, 4),
        'ZS': (4, 4, 4),
        'STRZ': (2, 4, 4),
        'UY': (0, 1, '_'),
        'STRS': (2, 4, 4),
        'CZS': (4, 4, 4),
        'MN': ('6_6', '6_6', '6_6'),
        'UI': (0, 1, '_'),
        'UJ': (0, 1, '_'),
        'UE': (0, '_', '_'),
        'EY': (0, 1, '_'),
        'W': (7, 7, 7),
        'IA': (1, '_', '_'),
        'FB': (7, 7, 7),
        'STSCH': (2, 4, 4),
        'SCHT': (2, 43, 43),
        'NM': ('6_6', '6_6', '6_6'),
        'SCHD': (2, 43, 43),
        'B': (7, 7, 7),
        'DSZ': (4, 4, 4),
        'F': (7, 7, 7),
        'N': (6, 6, 6),
        'CZ': (4, 4, 4),
        'R': (9, 9, 9),
        'U': (0, '_', '_'),
        'V': (7, 7, 7),
        'CS': (4, 4, 4),
        'Z': (4, 4, 4),
        'SZ': (4, 4, 4),
        'TSCH': (4, 4, 4),
        'KH': (5, 5, 5),
        'ST': (2, 43, 43),
        'KS': (5, 54, 54),
        'SH': (4, 4, 4),
        'SC': (2, 4, 4),
        'SD': (2, 43, 43),
        'DZ': (4, 4, 4),
        'ZHD': (2, 43, 43),
        'DT': (3, 3, 3),
        'ZSH': (4, 4, 4),
        'DS': (4, 4, 4),
        'TZ': (4, 4, 4),
        'TS': (4, 4, 4),
        'TH': (3, 3, 3),
        'TC': (4, 4, 4),
        'A': (0, '_', '_'),
        'E': (0, '_', '_'),
        'I': (0, '_', '_'),
        'AJ': (0, 1, '_'),
        'M': (6, 6, 6),
        'Q': (5, 5, 5),
        'AU': (0, 7, '_'),
        'IO': (1, '_', '_'),
        'AY': (0, 1, '_'),
        'IE': (1, '_', '_'),
        'ZSCH': (4, 4, 4),
        'CH': ((5, 4), (5, 4), (5, 4)),
        'CK': ((5, 45), (5, 45), (5, 45)),
        'C': ((5, 4), (5, 4), (5, 4)),
        'J': ((1, 4), ('_', 4), ('_', 4)),
        'RZ': ((94, 4), (94, 4), (94, 4)),
        'RS': ((94, 4), (94, 4), (94, 4)),
    }

    # For each initial letter, the candidate sequences to try, listed longest
    # first so that encode() (which takes the first match) prefers the longest
    # sequence. Every tuple ends with the bare letter, so a match always
    # exists.
    _dms_order = {
        'A': ('AI', 'AJ', 'AU', 'AY', 'A'),
        'B': ('B',),
        'C': ('CHS', 'CSZ', 'CZS', 'CH', 'CK', 'CS', 'CZ', 'C'),
        'D': ('DRS', 'DRZ', 'DSH', 'DSZ', 'DZH', 'DZS', 'DS', 'DT', 'DZ', 'D'),
        'E': ('EI', 'EJ', 'EU', 'EY', 'E'),
        'F': ('FB', 'F'),
        'G': ('G',),
        'H': ('H',),
        'I': ('IA', 'IE', 'IO', 'IU', 'I'),
        'J': ('J',),
        'K': ('KH', 'KS', 'K'),
        'L': ('L',),
        'M': ('MN', 'M'),
        'N': ('NM', 'N'),
        'O': ('OI', 'OJ', 'OY', 'O'),
        'P': ('PF', 'PH', 'P'),
        'Q': ('Q',),
        'R': ('RS', 'RZ', 'R'),
        'S': (
            'SCHTSCH',
            'SCHTCH',
            'SCHTSH',
            'SHTCH',
            'SHTSH',
            'STSCH',
            'SCHD',
            'SCHT',
            'SHCH',
            'STCH',
            'STRS',
            'STRZ',
            'STSH',
            'SZCS',
            'SZCZ',
            'SCH',
            'SHD',
            'SHT',
            'SZD',
            'SZT',
            'SC',
            'SD',
            'SH',
            'ST',
            'SZ',
            'S',
        ),
        'T': (
            'TTSCH',
            'TSCH',
            'TTCH',
            'TTSZ',
            'TCH',
            'THS',
            'TRS',
            'TRZ',
            'TSH',
            'TSZ',
            'TTS',
            'TTZ',
            'TZS',
            'TC',
            'TH',
            'TS',
            'TZ',
            'T',
        ),
        'U': ('UE', 'UI', 'UJ', 'UY', 'U'),
        'V': ('V',),
        'W': ('W',),
        'X': ('X',),
        'Y': ('Y',),
        'Z': (
            'ZHDZH',
            'ZDZH',
            'ZSCH',
            'ZDZ',
            'ZHD',
            'ZSH',
            'ZD',
            'ZH',
            'ZS',
            'Z',
        ),
    }

    # Letters that count as a following vowel when choosing the "before a
    # vowel" variant of a code.
    _uc_v_set = set('AEIJOUY')

    def encode(self, word, max_length=6, zero_pad=True):
        """Return the Daitch-Mokotoff Soundex codes for a word.

        Parameters
        ----------
        word : str
            The word to transform
        max_length : int
            The length of the codes returned (defaults to 6). Values are
            clamped to the range 6 to 64; the special value -1 selects the
            maximum of 64.
        zero_pad : bool
            Pad the end of each code with 0s to achieve a max_length string

        Returns
        -------
        set
            The Daitch-Mokotoff Soundex values of the word, as a set of
            strings (a word may have more than one code). If the word
            contains no encodable letters, the set holds a single all-zero
            code: ``max_length`` zeros when ``zero_pad`` is True, otherwise
            ``'0'``.

        Examples
        --------
        >>> pe = DaitchMokotoff()
        >>> sorted(pe.encode('Christopher'))
        ['494379', '594379']
        >>> pe.encode('Niall')
        {'680000'}
        >>> pe.encode('Smith')
        {'463000'}
        >>> pe.encode('Schmidt')
        {'463000'}

        >>> sorted(pe.encode('The quick brown fox', max_length=20,
        ... zero_pad=False))
        ['35457976754', '3557976754']

        """
        dms = ['']  # initialize empty code list

        # Require a max_length of at least 6 and not more than 64
        if max_length != -1:
            max_length = min(max(6, max_length), 64)
        else:
            max_length = 64

        # uppercase, normalize, decompose, and filter down to the letters in
        # self._uc_set (not defined in this class; presumably inherited from
        # _Phonetic)
        word = unicode_normalize('NFKD', text_type(word.upper()))
        word = word.replace('ß', 'SS')
        word = ''.join(c for c in word if c in self._uc_set)

        # Nothing to convert, return base case
        if not word:
            if zero_pad:
                return {'0' * max_length}
            return {'0'}

        word_len = len(word)
        pos = 0
        while pos < word_len:
            # Iterate through _dms_order, which specifies the possible
            # substrings for which codes exist in the Daitch-Mokotoff coding
            for sstr in self._dms_order[word[pos]]:  # pragma: no branch
                # Match in place at offset pos rather than slicing the word,
                # which would copy the remaining tail for every candidate.
                if word.startswith(sstr, pos):
                    # Having determined a valid substring start, retrieve the
                    # code
                    dm_val = self._dms_table[sstr]
                    next_pos = pos + len(sstr)

                    # Having retrieved the code (triple), determine the
                    # correct positional variant (first, pre-vocalic,
                    # elsewhere)
                    if pos == 0:
                        dm_val = dm_val[0]
                    elif (
                        next_pos < word_len
                        and word[next_pos] in self._uc_v_set
                    ):
                        dm_val = dm_val[1]
                    else:
                        dm_val = dm_val[2]

                    # Build the code strings; a tuple holds two alternative
                    # codes, so every partial code is extended both ways
                    # (all first-alternative results precede the second)
                    if isinstance(dm_val, tuple):
                        dms = [code + text_type(dm_val[0]) for code in dms] + [
                            code + text_type(dm_val[1]) for code in dms
                        ]
                    else:
                        dms = [code + text_type(dm_val) for code in dms]
                    pos = next_pos
                    break

        # Filter out double letters and _ placeholders
        dms = (
            ''.join(
                c for c in self._delete_consecutive_repeats(code) if c != '_'
            )
            for code in dms
        )

        # Trim codes and return set
        if zero_pad:
            dms = ((code + ('0' * max_length))[:max_length] for code in dms)
        else:
            dms = (code[:max_length] for code in dms)
        return set(dms)
359
360
361 1
def dm_soundex(word, max_length=6, zero_pad=True):
    """Return the Daitch-Mokotoff Soundex codes for a word.

    This is a wrapper for :py:meth:`DaitchMokotoff.encode`.

    Parameters
    ----------
    word : str
        The word to transform
    max_length : int
        The length of the codes returned (defaults to 6). Values are clamped
        to the range 6 to 64; the special value -1 selects the maximum of 64.
    zero_pad : bool
        Pad the end of each code with 0s to achieve a max_length string

    Returns
    -------
    set
        The Daitch-Mokotoff Soundex values of the word, as a set of strings
        (a word may have more than one code)

    Examples
    --------
    >>> sorted(dm_soundex('Christopher'))
    ['494379', '594379']
    >>> dm_soundex('Niall')
    {'680000'}
    >>> dm_soundex('Smith')
    {'463000'}
    >>> dm_soundex('Schmidt')
    {'463000'}

    >>> sorted(dm_soundex('The quick brown fox', max_length=20,
    ... zero_pad=False))
    ['35457976754', '3557976754']

    """
    # A fresh encoder is created per call; arguments are forwarded by keyword
    # so the mapping onto encode()'s parameters is explicit.
    return DaitchMokotoff().encode(
        word, max_length=max_length, zero_pad=zero_pad
    )
398
399
400
if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in this module's
    # docstrings.
    from doctest import testmod

    testmod()
404