1
|
|
|
# -*- coding: utf-8 -*- |
2
|
|
|
|
3
|
|
|
# Copyright 2014-2018 by Christopher C. Little. |
4
|
|
|
# This file is part of Abydos. |
5
|
|
|
# |
6
|
|
|
# Abydos is free software: you can redistribute it and/or modify |
7
|
|
|
# it under the terms of the GNU General Public License as published by |
8
|
|
|
# the Free Software Foundation, either version 3 of the License, or |
9
|
|
|
# (at your option) any later version. |
10
|
|
|
# |
11
|
|
|
# Abydos is distributed in the hope that it will be useful, |
12
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
13
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14
|
|
|
# GNU General Public License for more details. |
15
|
|
|
# |
16
|
|
|
# You should have received a copy of the GNU General Public License |
17
|
|
|
# along with Abydos. If not, see <http://www.gnu.org/licenses/>. |
18
|
|
|
|
19
|
1 |
|
"""abydos.phonetic._daitch_mokotoff. |
20
|
|
|
|
21
|
|
|
Daitch-Mokotoff Soundex |
22
|
|
|
""" |
23
|
|
|
|
24
|
1 |
|
from __future__ import ( |
25
|
|
|
absolute_import, |
26
|
|
|
division, |
27
|
|
|
print_function, |
28
|
|
|
unicode_literals, |
29
|
|
|
) |
30
|
|
|
|
31
|
1 |
|
from unicodedata import normalize as unicode_normalize |
32
|
|
|
|
33
|
1 |
|
from six import text_type |
34
|
|
|
|
35
|
1 |
|
from ._phonetic import _Phonetic |
36
|
|
|
|
37
|
1 |
|
__all__ = ['DaitchMokotoff', 'dm_soundex'] |
38
|
|
|
|
39
|
|
|
|
40
|
1 |
|
class DaitchMokotoff(_Phonetic):
    """Daitch-Mokotoff Soundex.

    Based on Daitch-Mokotoff Soundex :cite:`Mokotoff:1997`, this returns values
    of a word as a set. A collection is necessary since there can be multiple
    values for a single word.
    """

    # Maps each codable letter sequence to a triple of codes, selected by
    # position in encode():
    #   [0] the sequence starts the word,
    #   [1] the sequence is followed by a character in _uc_v_set,
    #   [2] anywhere else.
    # A code of '_' is a placeholder that is stripped from the final result
    # (i.e. the sequence contributes no digit in that position). An entry
    # whose elements are themselves 2-tuples has two alternative codes and
    # makes encode() fork every partial code into both variants.
    # NOTE(review): '6_6' presumably keeps the two 6s apart so they survive
    # the consecutive-repeat removal -- confirm against
    # _Phonetic._delete_consecutive_repeats.
    _dms_table = {
        'STCH': (2, 4, 4),
        'DRZ': (4, 4, 4),
        'ZH': (4, 4, 4),
        'ZHDZH': (2, 4, 4),
        'DZH': (4, 4, 4),
        'DRS': (4, 4, 4),
        'DZS': (4, 4, 4),
        'SCHTCH': (2, 4, 4),
        'SHTSH': (2, 4, 4),
        'SZCZ': (2, 4, 4),
        'TZS': (4, 4, 4),
        'SZCS': (2, 4, 4),
        'STSH': (2, 4, 4),
        'SHCH': (2, 4, 4),
        'D': (3, 3, 3),
        'H': (5, 5, '_'),
        'TTSCH': (4, 4, 4),
        'THS': (4, 4, 4),
        'L': (8, 8, 8),
        'P': (7, 7, 7),
        'CHS': (5, 54, 54),
        'T': (3, 3, 3),
        'X': (5, 54, 54),
        'OJ': (0, 1, '_'),
        'OI': (0, 1, '_'),
        'SCHTSH': (2, 4, 4),
        'OY': (0, 1, '_'),
        'Y': (1, '_', '_'),
        'TSH': (4, 4, 4),
        'ZDZ': (2, 4, 4),
        'TSZ': (4, 4, 4),
        'SHT': (2, 43, 43),
        'SCHTSCH': (2, 4, 4),
        'TTSZ': (4, 4, 4),
        'TTZ': (4, 4, 4),
        'SCH': (4, 4, 4),
        'TTS': (4, 4, 4),
        'SZD': (2, 43, 43),
        'AI': (0, 1, '_'),
        'PF': (7, 7, 7),
        'TCH': (4, 4, 4),
        'PH': (7, 7, 7),
        'TTCH': (4, 4, 4),
        'SZT': (2, 43, 43),
        'ZDZH': (2, 4, 4),
        'EI': (0, 1, '_'),
        'G': (5, 5, 5),
        'EJ': (0, 1, '_'),
        'ZD': (2, 43, 43),
        'IU': (1, '_', '_'),
        'K': (5, 5, 5),
        'O': (0, '_', '_'),
        'SHTCH': (2, 4, 4),
        'S': (4, 4, 4),
        'TRZ': (4, 4, 4),
        'SHD': (2, 43, 43),
        'DSH': (4, 4, 4),
        'CSZ': (4, 4, 4),
        'EU': (1, 1, '_'),
        'TRS': (4, 4, 4),
        'ZS': (4, 4, 4),
        'STRZ': (2, 4, 4),
        'UY': (0, 1, '_'),
        'STRS': (2, 4, 4),
        'CZS': (4, 4, 4),
        'MN': ('6_6', '6_6', '6_6'),
        'UI': (0, 1, '_'),
        'UJ': (0, 1, '_'),
        'UE': (0, '_', '_'),
        'EY': (0, 1, '_'),
        'W': (7, 7, 7),
        'IA': (1, '_', '_'),
        'FB': (7, 7, 7),
        'STSCH': (2, 4, 4),
        'SCHT': (2, 43, 43),
        'NM': ('6_6', '6_6', '6_6'),
        'SCHD': (2, 43, 43),
        'B': (7, 7, 7),
        'DSZ': (4, 4, 4),
        'F': (7, 7, 7),
        'N': (6, 6, 6),
        'CZ': (4, 4, 4),
        'R': (9, 9, 9),
        'U': (0, '_', '_'),
        'V': (7, 7, 7),
        'CS': (4, 4, 4),
        'Z': (4, 4, 4),
        'SZ': (4, 4, 4),
        'TSCH': (4, 4, 4),
        'KH': (5, 5, 5),
        'ST': (2, 43, 43),
        'KS': (5, 54, 54),
        'SH': (4, 4, 4),
        'SC': (2, 4, 4),
        'SD': (2, 43, 43),
        'DZ': (4, 4, 4),
        'ZHD': (2, 43, 43),
        'DT': (3, 3, 3),
        'ZSH': (4, 4, 4),
        'DS': (4, 4, 4),
        'TZ': (4, 4, 4),
        'TS': (4, 4, 4),
        'TH': (3, 3, 3),
        'TC': (4, 4, 4),
        'A': (0, '_', '_'),
        'E': (0, '_', '_'),
        'I': (0, '_', '_'),
        'AJ': (0, 1, '_'),
        'M': (6, 6, 6),
        'Q': (5, 5, 5),
        'AU': (0, 7, '_'),
        'IO': (1, '_', '_'),
        'AY': (0, 1, '_'),
        'IE': (1, '_', '_'),
        'ZSCH': (4, 4, 4),
        'CH': ((5, 4), (5, 4), (5, 4)),
        'CK': ((5, 45), (5, 45), (5, 45)),
        'C': ((5, 4), (5, 4), (5, 4)),
        'J': ((1, 4), ('_', 4), ('_', 4)),
        'RZ': ((94, 4), (94, 4), (94, 4)),
        'RS': ((94, 4), (94, 4), (94, 4)),
    }

    # For each initial letter, the candidate sequences to try, in matching
    # priority order. encode() takes the first candidate that matches, so
    # longer sequences are listed before their shorter prefixes and the bare
    # letter always comes last as the guaranteed fallback.
    _dms_order = {
        'A': ('AI', 'AJ', 'AU', 'AY', 'A'),
        'B': ('B',),
        'C': ('CHS', 'CSZ', 'CZS', 'CH', 'CK', 'CS', 'CZ', 'C'),
        'D': ('DRS', 'DRZ', 'DSH', 'DSZ', 'DZH', 'DZS', 'DS', 'DT', 'DZ', 'D'),
        'E': ('EI', 'EJ', 'EU', 'EY', 'E'),
        'F': ('FB', 'F'),
        'G': ('G',),
        'H': ('H',),
        'I': ('IA', 'IE', 'IO', 'IU', 'I'),
        'J': ('J',),
        'K': ('KH', 'KS', 'K'),
        'L': ('L',),
        'M': ('MN', 'M'),
        'N': ('NM', 'N'),
        'O': ('OI', 'OJ', 'OY', 'O'),
        'P': ('PF', 'PH', 'P'),
        'Q': ('Q',),
        'R': ('RS', 'RZ', 'R'),
        'S': (
            'SCHTSCH',
            'SCHTCH',
            'SCHTSH',
            'SHTCH',
            'SHTSH',
            'STSCH',
            'SCHD',
            'SCHT',
            'SHCH',
            'STCH',
            'STRS',
            'STRZ',
            'STSH',
            'SZCS',
            'SZCZ',
            'SCH',
            'SHD',
            'SHT',
            'SZD',
            'SZT',
            'SC',
            'SD',
            'SH',
            'ST',
            'SZ',
            'S',
        ),
        'T': (
            'TTSCH',
            'TSCH',
            'TTCH',
            'TTSZ',
            'TCH',
            'THS',
            'TRS',
            'TRZ',
            'TSH',
            'TSZ',
            'TTS',
            'TTZ',
            'TZS',
            'TC',
            'TH',
            'TS',
            'TZ',
            'T',
        ),
        'U': ('UE', 'UI', 'UJ', 'UY', 'U'),
        'V': ('V',),
        'W': ('W',),
        'X': ('X',),
        'Y': ('Y',),
        'Z': (
            'ZHDZH',
            'ZDZH',
            'ZSCH',
            'ZDZ',
            'ZHD',
            'ZSH',
            'ZD',
            'ZH',
            'ZS',
            'Z',
        ),
    }

    # Characters that select the "before a vowel" code variant when they
    # immediately follow a matched sequence (J and Y are treated as vowels).
    _uc_v_set = set('AEIJOUY')

    def encode(self, word, max_length=6, zero_pad=True):
        """Return the Daitch-Mokotoff Soundex code(s) for a word.

        Parameters
        ----------
        word : str
            The word to transform
        max_length : int
            The length of the code returned (defaults to 6). Values are
            clamped to the range 6 to 64; -1 selects the maximum of 64.
        zero_pad : bool
            Pad the end of the return value with 0s to achieve a max_length
            string

        Returns
        -------
        set
            The set of Daitch-Mokotoff Soundex values (strings) for the
            word; more than one value is present when the word contains a
            sequence with alternative codes

        Examples
        --------
        >>> pe = DaitchMokotoff()
        >>> sorted(pe.encode('Christopher'))
        ['494379', '594379']
        >>> pe.encode('Niall')
        {'680000'}
        >>> pe.encode('Smith')
        {'463000'}
        >>> pe.encode('Schmidt')
        {'463000'}

        >>> sorted(pe.encode('The quick brown fox', max_length=20,
        ... zero_pad=False))
        ['35457976754', '3557976754']

        """
        # Require a max_length of at least 6 and not more than 64;
        # -1 is the "as long as allowed" sentinel
        if max_length != -1:
            max_length = min(max(6, max_length), 64)
        else:
            max_length = 64

        # uppercase, normalize, decompose, and keep only the characters in
        # self._uc_set (inherited from _Phonetic; presumably A-Z)
        word = unicode_normalize('NFKD', text_type(word.upper()))
        word = word.replace('ß', 'SS')
        word = ''.join(c for c in word if c in self._uc_set)

        # Nothing to convert, return base case
        if not word:
            if zero_pad:
                return {'0' * max_length}
            return {'0'}

        codes = ['']  # partial codes built so far, one per branch
        word_len = len(word)
        pos = 0
        while pos < word_len:
            # Iterate through _dms_order, which specifies the possible
            # substrings for which codes exist in the Daitch-Mokotoff coding
            for sstr in self._dms_order[word[pos]]:  # pragma: no branch
                # Match in place at offset pos rather than slicing the word
                # anew for every candidate
                if word.startswith(sstr, pos):
                    # Having determined a valid substring start, retrieve
                    # the code
                    dm_val = self._dms_table[sstr]
                    next_pos = pos + len(sstr)

                    # Having retrieved the code (triple), determine the
                    # correct positional variant (first, pre-vocalic,
                    # elsewhere)
                    if pos == 0:
                        dm_val = dm_val[0]
                    elif (
                        next_pos < word_len
                        and word[next_pos] in self._uc_v_set
                    ):
                        dm_val = dm_val[1]
                    else:
                        dm_val = dm_val[2]

                    # Build the code strings
                    if isinstance(dm_val, tuple):
                        # Two alternative codes: fork every partial code
                        first = text_type(dm_val[0])
                        second = text_type(dm_val[1])
                        codes = [code + first for code in codes] + [
                            code + second for code in codes
                        ]
                    else:
                        suffix = text_type(dm_val)
                        codes = [code + suffix for code in codes]
                    pos = next_pos
                    break

        # Filter out double letters and _ placeholders (repeats are removed
        # first, so a placeholder between equal digits keeps both of them)
        codes = (
            ''.join(
                c for c in self._delete_consecutive_repeats(code) if c != '_'
            )
            for code in codes
        )

        # Trim (and optionally pad) codes and return set
        if zero_pad:
            codes = ((code + '0' * max_length)[:max_length] for code in codes)
        else:
            codes = (code[:max_length] for code in codes)
        return set(codes)
359
|
|
|
|
360
|
|
|
|
361
|
1 |
|
def dm_soundex(word, max_length=6, zero_pad=True):
    """Return the Daitch-Mokotoff Soundex code(s) for a word.

    This is a wrapper for :py:meth:`DaitchMokotoff.encode`: it creates a
    fresh :py:class:`DaitchMokotoff` instance and forwards all arguments
    to its ``encode`` method unchanged.

    Parameters
    ----------
    word : str
        The word to transform
    max_length : int
        The length of the code returned (defaults to 6; must be between 6 and
        64)
    zero_pad : bool
        Pad the end of the return value with 0s to achieve a max_length string

    Returns
    -------
    set
        The Daitch-Mokotoff Soundex value(s), exactly as returned by
        :py:meth:`DaitchMokotoff.encode`

    Examples
    --------
    >>> sorted(dm_soundex('Christopher'))
    ['494379', '594379']
    >>> dm_soundex('Niall')
    {'680000'}
    >>> dm_soundex('Smith')
    {'463000'}
    >>> dm_soundex('Schmidt')
    {'463000'}

    >>> sorted(dm_soundex('The quick brown fox', max_length=20,
    ... zero_pad=False))
    ['35457976754', '3557976754']

    """
    encoder = DaitchMokotoff()
    return encoder.encode(word, max_length=max_length, zero_pad=zero_pad)
398
|
|
|
|
399
|
|
|
|
400
|
|
|
# When executed directly as a script, run the doctests embedded in this
# module's docstrings.
if __name__ == '__main__':
    from doctest import testmod

    testmod()
404
|
|
|
|