Total Complexity | 80 |
Total Lines | 809 |
Duplicated Lines | 11.37 % |
Coverage | 100% |
Changes | 0 |
Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
Complex classes like abydos.phonetic._de often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
1 | # -*- coding: utf-8 -*- |
||
2 | |||
3 | # Copyright 2014-2018 by Christopher C. Little. |
||
4 | # This file is part of Abydos. |
||
5 | # |
||
6 | # Abydos is free software: you can redistribute it and/or modify |
||
7 | # it under the terms of the GNU General Public License as published by |
||
8 | # the Free Software Foundation, either version 3 of the License, or |
||
9 | # (at your option) any later version. |
||
10 | # |
||
11 | # Abydos is distributed in the hope that it will be useful, |
||
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
14 | # GNU General Public License for more details. |
||
15 | # |
||
16 | # You should have received a copy of the GNU General Public License |
||
17 | # along with Abydos. If not, see <http://www.gnu.org/licenses/>. |
||
18 | |||
19 | 1 | """abydos.phonetic._de. |
|
20 | |||
21 | The phonetic._de module implements the Kölner Phonetik and related |
||
22 | algorithms for German: |
||
23 | |||
24 | - Kölner Phonetik |
||
25 | - Phonem |
||
26 | - Haase Phonetik |
||
27 | - Reth-Schek Phonetik |
||
28 | """ |
||
29 | |||
30 | 1 | from __future__ import unicode_literals |
|
31 | |||
32 | 1 | from itertools import product |
|
33 | 1 | from unicodedata import normalize as unicode_normalize |
|
34 | |||
35 | 1 | from six import text_type |
|
36 | 1 | from six.moves import range |
|
37 | |||
38 | 1 | from ._phonetic import Phonetic |
|
39 | |||
# Names exported by ``from ... import *``: the four encoder classes defined
# in this module and the function wrappers around them.
__all__ = [
    'Haase',
    'Koelner',
    'Phonem',
    'RethSchek',
    'haase_phonetik',
    'koelner_phonetik',
    'koelner_phonetik_alpha',
    'koelner_phonetik_num_to_alpha',
    'phonem',
    'reth_schek_phonetik',
]
||
52 | |||
53 | |||
class Koelner(Phonetic):
    """Kölner Phonetik.

    Based on the algorithm defined by :cite:`Postel:1969`.
    """

    # Letters that are encoded as '0' (the vowels plus J and Y)
    _uc_v_set = set('AEIOUJY')

    # Translation table (digit code point -> letter) and the set of digits
    # accepted when converting a numeric code to its alphabetic form
    _num_trans = dict(zip((ord(_) for _ in '012345678'), 'APTFKLNRS'))
    _num_set = set('012345678')

    def encode(self, word):
        """Return the Kölner Phonetik (numeric output) code for a word.

        While the output code is numeric, it is still a str because 0s can lead
        the code.

        Args:
            word (str): The word to transform

        Returns:
            str: The Kölner Phonetik value as a numeric string (the empty
                string if no encodable letters remain after filtering)

        Example:
            >>> pe = Koelner()
            >>> pe.encode('Christopher')
            '478237'
            >>> pe.encode('Niall')
            '65'
            >>> pe.encode('Smith')
            '862'
            >>> pe.encode('Schmidt')
            '862'
            >>> pe.encode('Müller')
            '657'
            >>> pe.encode('Zimmermann')
            '86766'

        """

        def _after(word, pos, letters):
            """Return True if word[pos] follows one of the supplied letters.

            Args:
                word (str): The word to check
                pos (int): Position within word to check
                letters (str): Letters to confirm precede word[pos]

            Returns:
                bool: True if word[pos] follows a value in letters

            """
            return pos > 0 and word[pos - 1] in letters

        def _before(word, pos, letters):
            """Return True if word[pos] precedes one of the supplied letters.

            Args:
                word (str): The word to check
                pos (int): Position within word to check
                letters (str): Letters to confirm follow word[pos]

            Returns:
                bool: True if word[pos] precedes a value in letters

            """
            return pos + 1 < len(word) and word[pos + 1] in letters

        word = unicode_normalize('NFKD', text_type(word.upper()))
        word = word.replace('ß', 'SS')

        # NOTE(review): NFKD decomposes precomposed umlauts into a base letter
        # plus a combining mark, so these replacements likely never match
        # here -- confirm. All vowels share one code, so the result is
        # presumably unaffected.
        word = word.replace('Ä', 'AE')
        word = word.replace('Ö', 'OE')
        word = word.replace('Ü', 'UE')

        # _uc_set is not defined in this class (presumably inherited from
        # Phonetic); only characters it contains are kept
        word = ''.join(c for c in word if c in self._uc_set)

        # Nothing to convert, return base case
        if not word:
            return ''

        codes = []
        for i, char in enumerate(word):
            if char in self._uc_v_set:
                codes.append('0')
            elif char == 'B':
                codes.append('1')
            elif char == 'P':
                # 'PH' is encoded like 'F'
                if _before(word, i, {'H'}):
                    codes.append('3')
                else:
                    codes.append('1')
            elif char in {'D', 'T'}:
                if _before(word, i, {'C', 'S', 'Z'}):
                    codes.append('8')
                else:
                    codes.append('2')
            elif char in {'F', 'V', 'W'}:
                codes.append('3')
            elif char in {'G', 'K', 'Q'}:
                codes.append('4')
            elif char == 'C':
                if _after(word, i, {'S', 'Z'}):
                    codes.append('8')
                elif i == 0:
                    # Word-initial C uses a wider set of following letters
                    if _before(
                        word, i, {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'}
                    ):
                        codes.append('4')
                    else:
                        codes.append('8')
                elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}):
                    codes.append('4')
                else:
                    codes.append('8')
            elif char == 'X':
                if _after(word, i, {'C', 'K', 'Q'}):
                    codes.append('8')
                else:
                    codes.append('48')
            elif char == 'L':
                codes.append('5')
            elif char in {'M', 'N'}:
                codes.append('6')
            elif char == 'R':
                codes.append('7')
            elif char in {'S', 'Z'}:
                codes.append('8')
            # Any other letter (e.g. 'H') contributes nothing to the code

        sdx = self._delete_consecutive_repeats(''.join(codes))

        # Zeros are kept only in the leading position
        if sdx:
            sdx = sdx[:1] + sdx[1:].replace('0', '')

        return sdx

    def _to_alpha(self, num):
        """Convert a Kölner Phonetik code from numeric to alphabetic.

        Characters of ``num`` that are not one of the digits 0-8 are dropped
        before translation.

        Args:
            num (str or int): A numeric Kölner Phonetik representation

        Returns:
            str: An alphabetic representation of the same word

        Examples:
            >>> pe = Koelner()
            >>> pe._to_alpha('862')
            'SNT'
            >>> pe._to_alpha('657')
            'NLR'
            >>> pe._to_alpha('86766')
            'SNRNN'

        """
        num = ''.join(c for c in text_type(num) if c in self._num_set)
        return num.translate(self._num_trans)

    def encode_alpha(self, word):
        """Return the Kölner Phonetik (alphabetic output) code for a word.

        Args:
            word (str): The word to transform

        Returns:
            str: The Kölner Phonetik value as an alphabetic string

        Examples:
            >>> pe = Koelner()
            >>> pe.encode_alpha('Smith')
            'SNT'
            >>> pe.encode_alpha('Schmidt')
            'SNT'
            >>> pe.encode_alpha('Müller')
            'NLR'
            >>> pe.encode_alpha('Zimmermann')
            'SNRNN'

        """
        # Use this instance's own methods instead of the module-level
        # wrappers, so subclass overrides of encode/_to_alpha are honored and
        # no throwaway Koelner instances are created.
        return self._to_alpha(self.encode(word))
234 | |||
235 | |||
def koelner_phonetik(word):
    """Compute the numeric Kölner Phonetik code of a word.

    Convenience wrapper that builds a :py:class:`Koelner` instance and
    delegates to :py:meth:`Koelner.encode`.

    Args:
        word (str): The word to transform

    Returns:
        str: The Kölner Phonetik value as a numeric string

    Example:
        >>> koelner_phonetik('Christopher')
        '478237'
        >>> koelner_phonetik('Niall')
        '65'
        >>> koelner_phonetik('Smith')
        '862'
        >>> koelner_phonetik('Schmidt')
        '862'
        >>> koelner_phonetik('Müller')
        '657'
        >>> koelner_phonetik('Zimmermann')
        '86766'

    """
    encoder = Koelner()
    return encoder.encode(word)
263 | |||
264 | |||
def koelner_phonetik_num_to_alpha(num):
    """Translate a numeric Kölner Phonetik code into its alphabetic form.

    Convenience wrapper that builds a :py:class:`Koelner` instance and
    delegates to :py:meth:`Koelner._to_alpha`.

    Args:
        num (str or int): A numeric Kölner Phonetik representation

    Returns:
        str: An alphabetic representation of the same word

    Examples:
        >>> koelner_phonetik_num_to_alpha('862')
        'SNT'
        >>> koelner_phonetik_num_to_alpha('657')
        'NLR'
        >>> koelner_phonetik_num_to_alpha('86766')
        'SNRNN'

    """
    encoder = Koelner()
    return encoder._to_alpha(num)
286 | |||
287 | |||
def koelner_phonetik_alpha(word):
    """Compute the alphabetic Kölner Phonetik code of a word.

    Convenience wrapper that builds a :py:class:`Koelner` instance and
    delegates to :py:meth:`Koelner.encode_alpha`.

    Args:
        word (str): The word to transform

    Returns:
        str: The Kölner Phonetik value as an alphabetic string

    Examples:
        >>> koelner_phonetik_alpha('Smith')
        'SNT'
        >>> koelner_phonetik_alpha('Schmidt')
        'SNT'
        >>> koelner_phonetik_alpha('Müller')
        'NLR'
        >>> koelner_phonetik_alpha('Zimmermann')
        'SNRNN'

    """
    encoder = Koelner()
    return encoder.encode_alpha(word)
311 | |||
312 | |||
class Phonem(Phonetic):
    """Phonem.

    Phonem is defined in :cite:`Wilde:1988`.

    This version is based on the Perl implementation documented at
    :cite:`Wilz:2005`.
    It includes some enhancements presented in the Java port at
    :cite:`dcm4che:2011`.

    Phonem is intended chiefly for German names/words.
    """

    # Multi-character rewrites, applied in this order before the
    # single-character translation below
    _substitutions = (
        ('SC', 'C'),
        ('SZ', 'C'),
        ('CZ', 'C'),
        ('TZ', 'C'),
        ('TS', 'C'),
        ('KS', 'X'),
        ('PF', 'V'),
        ('QU', 'KW'),
        ('PH', 'V'),
        ('UE', 'Y'),
        ('AE', 'E'),
        ('OE', 'Ö'),
        ('EI', 'AY'),
        ('EY', 'AY'),
        ('EU', 'OY'),
        ('AU', 'A§'),
        ('OU', '§'),
    )

    # Single-character translation table keyed by code point; the two
    # strings are aligned position by position
    _trans = {
        ord(source): target
        for source, target in zip(
            'ZKGQÇÑßFWPTÁÀÂÃÅÄÆÉÈÊËIJÌÍÎÏÜݧÚÙÛÔÒÓÕØ',
            'CCCCCNSVVBDAAAAAEEEEEEYYYYYYYYUUUUOOOOÖ',
        )
    }

    # Only these characters may appear in the final code
    _uc_set = set('ABCDLMNORSUVWXYÖ')

    def encode(self, word):
        """Return the Phonem code for a word.

        The word is uppercased and NFC-normalized, rewritten using the
        multi-character substitutions and then the single-character
        translation table, stripped of consecutive repeats, and finally
        filtered down to the characters permitted in a code.

        Args:
            word (str): The word to transform

        Returns:
            str: The Phonem value

        Examples:
            >>> pe = Phonem()
            >>> pe.encode('Christopher')
            'CRYSDOVR'
            >>> pe.encode('Niall')
            'NYAL'
            >>> pe.encode('Smith')
            'SMYD'
            >>> pe.encode('Schmidt')
            'CMYD'

        """
        text = unicode_normalize('NFC', text_type(word.upper()))

        for pattern, replacement in self._substitutions:
            text = text.replace(pattern, replacement)
        text = text.translate(self._trans)

        # _delete_consecutive_repeats is not defined in this class
        # (presumably inherited from Phonetic)
        collapsed = self._delete_consecutive_repeats(text)
        return ''.join([char for char in collapsed if char in self._uc_set])
386 | |||
387 | |||
def phonem(word):
    """Compute the Phonem code of a word.

    Convenience wrapper that builds a :py:class:`Phonem` instance and
    delegates to :py:meth:`Phonem.encode`.

    Args:
        word (str): The word to transform

    Returns:
        str: The Phonem value

    Examples:
        >>> phonem('Christopher')
        'CRYSDOVR'
        >>> phonem('Niall')
        'NYAL'
        >>> phonem('Smith')
        'SMYD'
        >>> phonem('Schmidt')
        'CMYD'

    """
    encoder = Phonem()
    return encoder.encode(word)
411 | |||
412 | |||
class Haase(Phonetic):
    """Haase Phonetik.

    Based on the algorithm described at :cite:`Prante:2015`.

    Based on the original :cite:`Haase:2000`.
    """

    # Letters that are encoded as '9' (the vowels plus J and Y)
    _uc_v_set = set('AEIJOUY')

    def encode(self, word, primary_only=False):
        """Return the Haase Phonetik (numeric output) code for a word.

        While the output code is numeric, it is nevertheless a str.

        Unless ``primary_only`` is set, spelling variants of the word are
        generated first and each variant is encoded; duplicate codes are
        dropped while keeping first-seen order.

        Args:
            word (str): The word to transform
            primary_only (bool): If True, only the primary code is returned

        Returns:
            tuple: The Haase Phonetik value(s) as numeric strings

        Examples:
            >>> pe = Haase()
            >>> pe.encode('Joachim')
            ('9496',)
            >>> pe.encode('Christoph')
            ('4798293', '8798293')
            >>> pe.encode('Jörg')
            ('974',)
            >>> pe.encode('Smith')
            ('8692',)
            >>> pe.encode('Schmidt')
            ('8692', '4692')

        """

        def _after(text, idx, letters):
            """Return True if text[idx] follows one of the supplied letters.

            Args:
                text (str): Word to examine
                idx (int): Position to examine
                letters (set): Letters to check for

            Returns:
                bool: True if text[idx] follows one of letters

            """
            return idx > 0 and text[idx - 1] in letters

        def _before(text, idx, letters):
            """Return True if text[idx] precedes one of the supplied letters.

            Args:
                text (str): Word to examine
                idx (int): Position to examine
                letters (set): Letters to check for

            Returns:
                bool: True if text[idx] precedes one of letters

            """
            return idx + 1 < len(text) and text[idx + 1] in letters

        def _haase_code(variant):
            """Return the Haase code of a single spelling variant."""
            digits = []
            for i, char in enumerate(variant):
                if char in self._uc_v_set:
                    digits.append('9')
                elif char == 'B':
                    digits.append('1')
                elif char == 'P':
                    digits.append('3' if _before(variant, i, {'H'}) else '1')
                elif char in {'D', 'T'}:
                    if _before(variant, i, {'C', 'S', 'Z'}):
                        digits.append('8')
                    else:
                        digits.append('2')
                elif char in {'F', 'V', 'W'}:
                    digits.append('3')
                elif char in {'G', 'K', 'Q'}:
                    digits.append('4')
                elif char == 'C':
                    if _after(variant, i, {'S', 'Z'}):
                        digits.append('8')
                    elif i == 0:
                        # Word-initial C uses a wider set of following letters
                        if _before(
                            variant,
                            i,
                            {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'},
                        ):
                            digits.append('4')
                        else:
                            digits.append('8')
                    elif _before(
                        variant, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}
                    ):
                        digits.append('4')
                    else:
                        digits.append('8')
                elif char == 'X':
                    if _after(variant, i, {'C', 'K', 'Q'}):
                        digits.append('8')
                    else:
                        digits.append('48')
                elif char == 'L':
                    digits.append('5')
                elif char in {'M', 'N'}:
                    digits.append('6')
                elif char == 'R':
                    digits.append('7')
                elif char in {'S', 'Z'}:
                    digits.append('8')
                # Any other letter (e.g. 'H') contributes nothing

            return self._delete_consecutive_repeats(''.join(digits))

        word = unicode_normalize('NFKD', text_type(word.upper()))
        for old, new in (('ß', 'SS'), ('Ä', 'AE'), ('Ö', 'OE'), ('Ü', 'UE')):
            word = word.replace(old, new)
        # _uc_set is not defined in this class (presumably inherited from
        # Phonetic); only characters it contains are kept
        word = ''.join(c for c in word if c in self._uc_set)

        if primary_only:
            variants = [word]
        else:
            # Three-letter sequences and their alternative spellings
            trigraph_alts = {
                'OWN': 'AUN',
                'WSK': 'RSK',
                'SCH': 'CH',
                'GLI': 'LI',
                'AUX': 'O',
                'EUX': 'O',
            }

            # One tuple of alternatives per consumed segment of the word
            options = []
            pos = 0
            if word[:2] == 'CH':
                options.append(('CH', 'SCH'))
                pos = 2

            while pos < len(word):
                tail = word[pos:]
                trigraph = tail[:3]
                if tail[:4] == 'ILLE':
                    options.append(('ILLE', 'I'))
                    pos += 4
                elif trigraph in trigraph_alts:
                    options.append((trigraph, trigraph_alts[trigraph]))
                    pos += 3
                elif tail[:2] == 'RB':
                    options.append(('RB', 'RW'))
                    pos += 2
                elif tail == 'EAU':
                    # Only a word-final 'EAU' gets an alternative
                    options.append(('EAU', 'O'))
                    pos += 3
                elif tail == 'O':
                    # Word-final 'O'
                    options.append(('O', 'OW'))
                    pos += 1
                elif tail == 'A':
                    # Word-final 'A'
                    options.append(('A', 'AR'))
                    pos += 1
                else:
                    options.append((word[pos],))
                    pos += 1

            # Every combination of alternatives yields one spelling variant
            variants = [''.join(parts) for parts in product(*options)]

        # Encode each variant, dropping duplicate codes but keeping order
        seen = set()
        unique_codes = []
        for variant in variants:
            code = _haase_code(variant)
            if code not in seen:
                seen.add(code)
                unique_codes.append(code)

        return tuple(unique_codes)
599 | |||
600 | |||
def haase_phonetik(word, primary_only=False):
    """Compute the Haase Phonetik (numeric output) code(s) of a word.

    Convenience wrapper that builds a :py:class:`Haase` instance and
    delegates to :py:meth:`Haase.encode`.

    Args:
        word (str): The word to transform
        primary_only (bool): If True, only the primary code is returned

    Returns:
        tuple: The Haase Phonetik value(s) as numeric strings

    Examples:
        >>> haase_phonetik('Joachim')
        ('9496',)
        >>> haase_phonetik('Christoph')
        ('4798293', '8798293')
        >>> haase_phonetik('Jörg')
        ('974',)
        >>> haase_phonetik('Smith')
        ('8692',)
        >>> haase_phonetik('Schmidt')
        ('8692', '4692')

    """
    encoder = Haase()
    return encoder.encode(word, primary_only)
627 | |||
628 | |||
class RethSchek(Phonetic):
    """Reth-Schek Phonetik.

    This algorithm is proposed in :cite:`Reth:1977`.

    Since I couldn't secure a copy of that document (maybe I'll look for it
    next time I'm in Germany), this implementation is based on what I could
    glean from the implementations published by German Record Linkage
    Center (www.record-linkage.de):

    - Privacy-preserving Record Linkage (PPRL) (in R) :cite:`Rukasz:2018`
    - Merge ToolBox (in Java) :cite:`Schnell:2004`

    Rules that are unclear:

    - Should 'C' become 'G' or 'Z'? (PPRL has both, 'Z' rule blocked)
    - Should 'CC' become 'G'? (PPRL has blocked 'CK' that may be typo)
    - Should 'TUI' -> 'ZUI' rule exist? (PPRL has rule, but I can't
      think of a German word with '-tui-' in it.)
    - Should we really change 'SCH' -> 'CH' and then 'CH' -> 'SCH'?
    """

    # Replacement tables, keyed by the length of the sequence they match
    _replacements = {
        3: {
            'AEH': 'E',
            'IEH': 'I',
            'OEH': 'OE',
            'UEH': 'UE',
            'SCH': 'CH',
            'ZIO': 'TIO',
            'TIU': 'TIO',
            'ZIU': 'TIO',
            'CHS': 'X',
            'CKS': 'X',
            'AEU': 'OI',
        },
        2: {
            'LL': 'L',
            'AA': 'A',
            'AH': 'A',
            'BB': 'B',
            'PP': 'B',
            'BP': 'B',
            'PB': 'B',
            'DD': 'D',
            'DT': 'D',
            'TT': 'D',
            'TH': 'D',
            'EE': 'E',
            'EH': 'E',
            'AE': 'E',
            'FF': 'F',
            'PH': 'F',
            'KK': 'K',
            'GG': 'G',
            'GK': 'G',
            'KG': 'G',
            'CK': 'G',
            'CC': 'C',
            'IE': 'I',
            'IH': 'I',
            'MM': 'M',
            'NN': 'N',
            'OO': 'O',
            'OH': 'O',
            'SZ': 'S',
            'UH': 'U',
            'GS': 'X',
            'KS': 'X',
            'TZ': 'Z',
            'AY': 'AI',
            'EI': 'AI',
            'EY': 'AI',
            'EU': 'OI',
            'RR': 'R',
            'SS': 'S',
            'KW': 'QU',
        },
        1: {
            'P': 'B',
            'T': 'D',
            'V': 'F',
            'W': 'F',
            'C': 'G',
            'K': 'G',
            'Y': 'I',
        },
    }

    def encode(self, word):
        """Return Reth-Schek Phonetik code for a word.

        Args:
            word (str): The word to transform

        Returns:
            str: The Reth-Schek Phonetik code

        Examples:
            >>> pe = RethSchek()
            >>> pe.encode('Joachim')
            'JOAGHIM'
            >>> pe.encode('Christoph')
            'GHRISDOF'
            >>> pe.encode('Jörg')
            'JOERG'
            >>> pe.encode('Smith')
            'SMID'
            >>> pe.encode('Schmidt')
            'SCHMID'

        """
        # Uppercase, then spell out umlauts/eszett
        word = word.upper()
        for old, new in (('Ä', 'AE'), ('Ö', 'OE'), ('Ü', 'UE'), ('ß', 'SS')):
            word = word.replace(old, new)

        # Main loop: at each position try the longest sequences first and
        # apply the first replacement that matches. The position advances by
        # exactly one whether or not a replacement was made.
        pos = 0
        while pos < len(word):
            for num in (3, 2, 1):
                table = self._replacements[num]
                chunk = word[pos : pos + num]
                if chunk in table:
                    word = word[:pos] + table[chunk] + word[pos + num :]
                    break
            pos += 1

        # Change 'CH' back(?) to 'SCH'
        word = word.replace('CH', 'SCH')

        # Replace final sequences (only the first matching rule is applied)
        if word.endswith('ER'):
            return word[:-2] + 'R'
        if word.endswith('EL'):
            return word[:-2] + 'L'
        if word.endswith('H'):
            return word[:-1]
        return word
776 | |||
777 | |||
def reth_schek_phonetik(word):
    """Compute the Reth-Schek Phonetik code of a word.

    Convenience wrapper that builds a :py:class:`RethSchek` instance and
    delegates to :py:meth:`RethSchek.encode`.

    Args:
        word (str): The word to transform

    Returns:
        str: The Reth-Schek Phonetik code

    Examples:
        >>> reth_schek_phonetik('Joachim')
        'JOAGHIM'
        >>> reth_schek_phonetik('Christoph')
        'GHRISDOF'
        >>> reth_schek_phonetik('Jörg')
        'JOERG'
        >>> reth_schek_phonetik('Smith')
        'SMID'
        >>> reth_schek_phonetik('Schmidt')
        'SCHMID'

    """
    encoder = RethSchek()
    return encoder.encode(word)
803 | |||
804 | |||
if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in this module's
    # docstrings.
    import doctest

    doctest.testmod()
809 |