Total Complexity | 80 |
Total Lines | 724 |
Duplicated Lines | 12.71 % |
Coverage | 100% |
Changes | 0 |
Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
Complex classes like abydos.phonetic._de often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
1 | # -*- coding: utf-8 -*- |
||
2 | |||
3 | # Copyright 2014-2018 by Christopher C. Little. |
||
4 | # This file is part of Abydos. |
||
5 | # |
||
6 | # Abydos is free software: you can redistribute it and/or modify |
||
7 | # it under the terms of the GNU General Public License as published by |
||
8 | # the Free Software Foundation, either version 3 of the License, or |
||
9 | # (at your option) any later version. |
||
10 | # |
||
11 | # Abydos is distributed in the hope that it will be useful, |
||
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
14 | # GNU General Public License for more details. |
||
15 | # |
||
16 | # You should have received a copy of the GNU General Public License |
||
17 | # along with Abydos. If not, see <http://www.gnu.org/licenses/>. |
||
18 | |||
19 | 1 | """abydos.phonetic._de. |
|
20 | |||
21 | The phonetic._de module implements the Kölner Phonetik and related |
||
22 | algorithms for German: |
||
23 | |||
24 | - Kölner Phonetik |
||
25 | - Phonem |
||
26 | - Haase Phonetik |
||
27 | - Reth-Schek Phonetik |
||
28 | """ |
||
29 | |||
30 | 1 | from __future__ import unicode_literals |
|
31 | |||
32 | 1 | from itertools import product |
|
33 | 1 | from unicodedata import normalize as unicode_normalize |
|
34 | |||
35 | 1 | from six import text_type |
|
36 | 1 | from six.moves import range |
|
37 | |||
38 | 1 | from ._phonetic import Phonetic |
|
39 | |||
40 | 1 | __all__ = [ |
|
41 | 'Haase', |
||
42 | 'Koelner', |
||
43 | 'Phonem', |
||
44 | 'RethSchek', |
||
45 | 'haase_phonetik', |
||
46 | 'koelner_phonetik', |
||
47 | 'koelner_phonetik_alpha', |
||
48 | 'koelner_phonetik_num_to_alpha', |
||
49 | 'phonem', |
||
50 | 'reth_schek_phonetik', |
||
51 | ] |
||
52 | |||
53 | |||
class Koelner(Phonetic):
    """Kölner Phonetik.

    Based on the algorithm defined by :cite:`Postel:1969`.
    """

    # Letters that are coded as '0' (vowels plus J and Y).
    _uc_v_set = set('AEIOUJY')

    # Code point of each digit -> letter used in the alphabetic form, and
    # the set of digits kept when converting a numeric code.
    _num_trans = dict(zip((ord(_) for _ in '012345678'), 'APTFKLNRS'))
    _num_set = set('012345678')

    def encode(self, word):
        """Return the Kölner Phonetik (numeric output) code for a word.

        While the output code is numeric, it is still a str because 0s can lead
        the code.

        :param str word: the word to transform
        :returns: the Kölner Phonetik value as a numeric string
        :rtype: str

        >>> pe = Koelner()
        >>> pe.encode('Christopher')
        '478237'
        >>> pe.encode('Niall')
        '65'
        >>> pe.encode('Smith')
        '862'
        >>> pe.encode('Schmidt')
        '862'
        >>> pe.encode('Müller')
        '657'
        >>> pe.encode('Zimmermann')
        '86766'
        """
        word = unicode_normalize('NFKD', text_type(word.upper()))
        word = word.replace('ß', 'SS')

        # NOTE(review): NFKD has already decomposed precomposed umlauts at
        # this point, so these replacements presumably only match if the
        # literals below are themselves stored decomposed -- confirm.
        word = word.replace('Ä', 'AE')
        word = word.replace('Ö', 'OE')
        word = word.replace('Ü', 'UE')

        # _uc_set is not defined in this class (presumably inherited from
        # Phonetic); only characters it contains are kept.
        word = ''.join(c for c in word if c in self._uc_set)

        # Nothing to convert, return base case
        if not word:
            return ''

        last = len(word) - 1
        digits = []
        for pos, char in enumerate(word):
            # Neighbouring letters; '' at the word boundaries never matches
            # any of the letter sets tested below.
            prev_char = word[pos - 1] if pos > 0 else ''
            next_char = word[pos + 1] if pos < last else ''

            if char in self._uc_v_set:
                digits.append('0')
            elif char == 'B':
                digits.append('1')
            elif char == 'P':
                digits.append('3' if next_char == 'H' else '1')
            elif char in {'D', 'T'}:
                digits.append('8' if next_char in {'C', 'S', 'Z'} else '2')
            elif char in {'F', 'V', 'W'}:
                digits.append('3')
            elif char in {'G', 'K', 'Q'}:
                digits.append('4')
            elif char == 'C':
                if prev_char in {'S', 'Z'}:
                    digits.append('8')
                elif pos == 0:
                    # Word-initial C additionally accepts L and R as
                    # followers that yield '4'.
                    if next_char in {
                        'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'
                    }:
                        digits.append('4')
                    else:
                        digits.append('8')
                elif next_char in {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}:
                    digits.append('4')
                else:
                    digits.append('8')
            elif char == 'X':
                digits.append('8' if prev_char in {'C', 'K', 'Q'} else '48')
            elif char == 'L':
                digits.append('5')
            elif char in {'M', 'N'}:
                digits.append('6')
            elif char == 'R':
                digits.append('7')
            elif char in {'S', 'Z'}:
                digits.append('8')
            # Any other character (e.g. H) contributes no digit.

        # _delete_consecutive_repeats is defined outside this class;
        # presumably it collapses runs of identical characters.
        sdx = self._delete_consecutive_repeats(''.join(digits))

        # Drop every '0' except one in the leading position.
        if sdx:
            sdx = sdx[:1] + sdx[1:].replace('0', '')

        return sdx

    def _to_alpha(self, num):
        """Convert a Kölner Phonetik code from numeric to alphabetic.

        Characters of ``num`` that are not in ``_num_set`` are discarded
        before translation.

        :param str num: a numeric Kölner Phonetik representation (can be a str
            or an int)
        :returns: an alphabetic representation of the same word
        :rtype: str

        >>> pe = Koelner()
        >>> pe._to_alpha('862')
        'SNT'
        >>> pe._to_alpha('657')
        'NLR'
        >>> pe._to_alpha('86766')
        'SNRNN'
        """
        num = ''.join(c for c in text_type(num) if c in self._num_set)
        return num.translate(self._num_trans)

    def encode_alpha(self, word):
        """Return the Kölner Phonetik (alphabetic output) code for a word.

        :param str word: the word to transform
        :returns: the Kölner Phonetik value as an alphabetic string
        :rtype: str

        >>> pe = Koelner()
        >>> pe.encode_alpha('Smith')
        'SNT'
        >>> pe.encode_alpha('Schmidt')
        'SNT'
        >>> pe.encode_alpha('Müller')
        'NLR'
        >>> pe.encode_alpha('Zimmermann')
        'SNRNN'
        """
        # Use this instance's own methods rather than the module-level
        # wrappers, so no throwaway Koelner objects are created and subclass
        # overrides of encode/_to_alpha are honoured.
        return self._to_alpha(self.encode(word))
203 | |||
204 | |||
def koelner_phonetik(word):
    """Encode a word with Kölner Phonetik and return the numeric code.

    Convenience wrapper that delegates to :py:meth:`Koelner.encode` on a
    newly created :py:class:`Koelner` instance.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as a numeric string
    :rtype: str

    >>> koelner_phonetik('Christopher')
    '478237'
    >>> koelner_phonetik('Niall')
    '65'
    >>> koelner_phonetik('Smith')
    '862'
    >>> koelner_phonetik('Schmidt')
    '862'
    >>> koelner_phonetik('Müller')
    '657'
    >>> koelner_phonetik('Zimmermann')
    '86766'
    """
    encoder = Koelner()
    return encoder.encode(word)
228 | |||
229 | |||
def koelner_phonetik_num_to_alpha(num):
    """Turn a numeric Kölner Phonetik code into its alphabetic form.

    Convenience wrapper that delegates to :py:meth:`Koelner._to_alpha` on a
    newly created :py:class:`Koelner` instance.

    :param str num: a numeric Kölner Phonetik representation (can be a str or
        an int)
    :returns: an alphabetic representation of the same word
    :rtype: str

    >>> koelner_phonetik_num_to_alpha('862')
    'SNT'
    >>> koelner_phonetik_num_to_alpha('657')
    'NLR'
    >>> koelner_phonetik_num_to_alpha('86766')
    'SNRNN'
    """
    encoder = Koelner()
    return encoder._to_alpha(num)
248 | |||
249 | |||
def koelner_phonetik_alpha(word):
    """Encode a word with Kölner Phonetik and return the alphabetic code.

    Convenience wrapper that delegates to :py:meth:`Koelner.encode_alpha` on
    a newly created :py:class:`Koelner` instance.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as an alphabetic string
    :rtype: str

    >>> koelner_phonetik_alpha('Smith')
    'SNT'
    >>> koelner_phonetik_alpha('Schmidt')
    'SNT'
    >>> koelner_phonetik_alpha('Müller')
    'NLR'
    >>> koelner_phonetik_alpha('Zimmermann')
    'SNRNN'
    """
    encoder = Koelner()
    return encoder.encode_alpha(word)
269 | |||
270 | |||
class Phonem(Phonetic):
    """Phonem.

    Phonem is defined in :cite:`Wilde:1988`.

    This version is based on the Perl implementation documented at
    :cite:`Wilz:2005`.
    It includes some enhancements presented in the Java port at
    :cite:`dcm4che:2011`.

    Phonem is intended chiefly for German names/words.
    """

    # Ordered (pattern, replacement) pairs applied one after another to the
    # upper-cased word before single-character translation.
    _substitutions = (
        ('SC', 'C'),
        ('SZ', 'C'),
        ('CZ', 'C'),
        ('TZ', 'C'),
        ('TS', 'C'),
        ('KS', 'X'),
        ('PF', 'V'),
        ('QU', 'KW'),
        ('PH', 'V'),
        ('UE', 'Y'),
        ('AE', 'E'),
        ('OE', 'Ö'),
        ('EI', 'AY'),
        ('EY', 'AY'),
        ('EU', 'OY'),
        ('AU', 'A§'),
        ('OU', '§'),
    )

    # Single-character translation table keyed by code point, in the form
    # expected by str.translate.
    _trans = {
        ord(source): target
        for source, target in zip(
            'ZKGQÇÑßFWPTÁÀÂÃÅÄÆÉÈÊËIJÌÍÎÏÜݧÚÙÛÔÒÓÕØ',
            'CCCCCNSVVBDAAAAAEEEEEEYYYYYYYYUUUUOOOOÖ',
        )
    }

    # Characters allowed to appear in the final code.
    _uc_set = set('ABCDLMNORSUVWXYÖ')

    def encode(self, word):
        """Return the Phonem code for a word.

        The word is upper-cased and NFC-normalized, the multi-character
        substitutions are applied in order, single characters are translated,
        and finally only characters in ``_uc_set`` are kept.

        :param str word: the word to transform
        :returns: the Phonem value
        :rtype: str

        >>> pe = Phonem()
        >>> pe.encode('Christopher')
        'CRYSDOVR'
        >>> pe.encode('Niall')
        'NYAL'
        >>> pe.encode('Smith')
        'SMYD'
        >>> pe.encode('Schmidt')
        'CMYD'
        """
        text = unicode_normalize('NFC', text_type(word.upper()))

        for pattern, replacement in self._substitutions:
            text = text.replace(pattern, replacement)

        text = text.translate(self._trans)

        # _delete_consecutive_repeats is defined outside this class;
        # presumably it collapses runs of identical characters.
        collapsed = self._delete_consecutive_repeats(text)
        kept = [char for char in collapsed if char in self._uc_set]
        return ''.join(kept)
340 | |||
341 | |||
def phonem(word):
    """Encode a word with Phonem.

    Convenience wrapper that delegates to :py:meth:`Phonem.encode` on a newly
    created :py:class:`Phonem` instance.

    :param str word: the word to transform
    :returns: the Phonem value
    :rtype: str

    >>> phonem('Christopher')
    'CRYSDOVR'
    >>> phonem('Niall')
    'NYAL'
    >>> phonem('Smith')
    'SMYD'
    >>> phonem('Schmidt')
    'CMYD'
    """
    encoder = Phonem()
    return encoder.encode(word)
361 | |||
362 | |||
class Haase(Phonetic):
    """Haase Phonetik.

    Based on the algorithm described at :cite:`Prante:2015`.

    Based on the original :cite:`Haase:2000`.
    """

    # Letters that are coded as '9' (vowels plus J and Y).
    _uc_v_set = set('AEIJOUY')

    def encode(self, word, primary_only=False):
        """Return the Haase Phonetik (numeric output) code for a word.

        While the output code is numeric, it is nevertheless a str.

        :param str word: the word to transform
        :param bool primary_only: if True, only the primary code is returned
        :returns: the Haase Phonetik codes as numeric strings, without
            duplicates and in the order they were generated
        :rtype: tuple


        >>> pe = Haase()
        >>> pe.encode('Joachim')
        ('9496',)
        >>> pe.encode('Christoph')
        ('4798293', '8798293')
        >>> pe.encode('Jörg')
        ('974',)
        >>> pe.encode('Smith')
        ('8692',)
        >>> pe.encode('Schmidt')
        ('8692', '4692')
        """

        def _alternatives(text):
            """Split text into tuples of interchangeable spellings."""
            three_letter = {
                'OWN': 'AUN',
                'WSK': 'RSK',
                'SCH': 'CH',
                'GLI': 'LI',
                'AUX': 'O',
                'EUX': 'O',
            }
            parts = []
            start = 0
            # A leading CH may alternatively be read as SCH.
            if text[:2] == 'CH':
                parts.append(('CH', 'SCH'))
                start = 2
            while start < len(text):
                rest = text[start:]
                if rest[:4] == 'ILLE':
                    parts.append(('ILLE', 'I'))
                    start += 4
                elif rest[:3] in three_letter:
                    parts.append((rest[:3], three_letter[rest[:3]]))
                    start += 3
                elif rest[:2] == 'RB':
                    parts.append(('RB', 'RW'))
                    start += 2
                elif rest == 'EAU':
                    # Only a word-final EAU has an alternative.
                    parts.append(('EAU', 'O'))
                    start += 3
                elif rest == 'O':
                    # Word-final O.
                    parts.append(('O', 'OW'))
                    start += 1
                elif rest == 'A':
                    # Word-final A.
                    parts.append(('A', 'AR'))
                    start += 1
                else:
                    parts.append((rest[0],))
                    start += 1
            return parts

        def _code(text):
            """Return the numeric code of a single spelling variant."""
            last = len(text) - 1
            digits = []
            for idx, char in enumerate(text):
                # Neighbouring letters; '' at the boundaries never matches
                # any of the letter sets tested below.
                prev_char = text[idx - 1] if idx > 0 else ''
                next_char = text[idx + 1] if idx < last else ''

                if char in self._uc_v_set:
                    digits.append('9')
                elif char == 'B':
                    digits.append('1')
                elif char == 'P':
                    digits.append('3' if next_char == 'H' else '1')
                elif char in {'D', 'T'}:
                    digits.append(
                        '8' if next_char in {'C', 'S', 'Z'} else '2'
                    )
                elif char in {'F', 'V', 'W'}:
                    digits.append('3')
                elif char in {'G', 'K', 'Q'}:
                    digits.append('4')
                elif char == 'C':
                    if prev_char in {'S', 'Z'}:
                        digits.append('8')
                    elif idx == 0:
                        if next_char in {
                            'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'
                        }:
                            digits.append('4')
                        else:
                            digits.append('8')
                    elif next_char in {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}:
                        digits.append('4')
                    else:
                        digits.append('8')
                elif char == 'X':
                    digits.append(
                        '8' if prev_char in {'C', 'K', 'Q'} else '48'
                    )
                elif char == 'L':
                    digits.append('5')
                elif char in {'M', 'N'}:
                    digits.append('6')
                elif char == 'R':
                    digits.append('7')
                elif char in {'S', 'Z'}:
                    digits.append('8')
                # Any other character contributes no digit.

            # _delete_consecutive_repeats is defined outside this class;
            # presumably it collapses runs of identical characters.
            return self._delete_consecutive_repeats(''.join(digits))

        word = unicode_normalize('NFKD', text_type(word.upper()))
        word = word.replace('ß', 'SS')
        for umlaut, digraph in (('Ä', 'AE'), ('Ö', 'OE'), ('Ü', 'UE')):
            word = word.replace(umlaut, digraph)
        # _uc_set is not defined in this class (presumably inherited from
        # Phonetic); only characters it contains are kept.
        word = ''.join(c for c in word if c in self._uc_set)

        if primary_only:
            variants = [word]
        else:
            # Every combination of the alternative spellings is a variant.
            variants = [
                ''.join(choice) for choice in product(*_alternatives(word))
            ]

        # Encode each variant, keeping the first occurrence of every code.
        seen = set()
        unique_codes = []
        for variant in variants:
            code = _code(variant)
            if code not in seen:
                seen.add(code)
                unique_codes.append(code)

        return tuple(unique_codes)
526 | |||
527 | |||
def haase_phonetik(word, primary_only=False):
    """Encode a word with Haase Phonetik and return the numeric code(s).

    Convenience wrapper that delegates to :py:meth:`Haase.encode` on a newly
    created :py:class:`Haase` instance.

    :param str word: the word to transform
    :param bool primary_only: if True, only the primary code is returned
    :returns: the Haase Phonetik codes as numeric strings
    :rtype: tuple

    >>> haase_phonetik('Joachim')
    ('9496',)
    >>> haase_phonetik('Christoph')
    ('4798293', '8798293')
    >>> haase_phonetik('Jörg')
    ('974',)
    >>> haase_phonetik('Smith')
    ('8692',)
    >>> haase_phonetik('Schmidt')
    ('8692', '4692')
    """
    encoder = Haase()
    return encoder.encode(word, primary_only)
550 | |||
551 | |||
class RethSchek(Phonetic):
    """Reth-Schek Phonetik.

    This algorithm is proposed in :cite:`Reth:1977`.

    Since I couldn't secure a copy of that document (maybe I'll look for it
    next time I'm in Germany), this implementation is based on what I could
    glean from the implementations published by German Record Linkage
    Center (www.record-linkage.de):

    - Privacy-preserving Record Linkage (PPRL) (in R) :cite:`Rukasz:2018`
    - Merge ToolBox (in Java) :cite:`Schnell:2004`

    Rules that are unclear:

    - Should 'C' become 'G' or 'Z'? (PPRL has both, 'Z' rule blocked)
    - Should 'CC' become 'G'? (PPRL has blocked 'CK' that may be typo)
    - Should 'TUI' -> 'ZUI' rule exist? (PPRL has rule, but I can't
      think of a German word with '-tui-' in it.)
    - Should we really change 'SCH' -> 'CH' and then 'CH' -> 'SCH'?
    """

    # Replacement tables keyed by the length of the sequence they match;
    # encode() tries the longest sequences first at each position.
    _replacements = {
        3: {
            'AEH': 'E',
            'IEH': 'I',
            'OEH': 'OE',
            'UEH': 'UE',
            'SCH': 'CH',
            'ZIO': 'TIO',
            'TIU': 'TIO',
            'ZIU': 'TIO',
            'CHS': 'X',
            'CKS': 'X',
            'AEU': 'OI',
        },
        2: {
            'LL': 'L',
            'AA': 'A',
            'AH': 'A',
            'BB': 'B',
            'PP': 'B',
            'BP': 'B',
            'PB': 'B',
            'DD': 'D',
            'DT': 'D',
            'TT': 'D',
            'TH': 'D',
            'EE': 'E',
            'EH': 'E',
            'AE': 'E',
            'FF': 'F',
            'PH': 'F',
            'KK': 'K',
            'GG': 'G',
            'GK': 'G',
            'KG': 'G',
            'CK': 'G',
            'CC': 'C',
            'IE': 'I',
            'IH': 'I',
            'MM': 'M',
            'NN': 'N',
            'OO': 'O',
            'OH': 'O',
            'SZ': 'S',
            'UH': 'U',
            'GS': 'X',
            'KS': 'X',
            'TZ': 'Z',
            'AY': 'AI',
            'EI': 'AI',
            'EY': 'AI',
            'EU': 'OI',
            'RR': 'R',
            'SS': 'S',
            'KW': 'QU',
        },
        1: {
            'P': 'B',
            'T': 'D',
            'V': 'F',
            'W': 'F',
            'C': 'G',
            'K': 'G',
            'Y': 'I',
        },
    }

    def encode(self, word):
        """Return Reth-Schek Phonetik code for a word.

        :param str word: the word to transform
        :returns: the Reth-Schek Phonetik code
        :rtype: str

        >>> pe = RethSchek()
        >>> pe.encode('Joachim')
        'JOAGHIM'
        >>> pe.encode('Christoph')
        'GHRISDOF'
        >>> pe.encode('Jörg')
        'JOERG'
        >>> pe.encode('Smith')
        'SMID'
        >>> pe.encode('Schmidt')
        'SCHMID'
        """
        word = word.upper()

        # Spell out umlauts and eszett with plain letters.
        for special, plain in (
            ('Ä', 'AE'),
            ('Ö', 'OE'),
            ('Ü', 'UE'),
            ('ß', 'SS'),
        ):
            word = word.replace(special, plain)

        # Walk the word one position at a time; at each position apply at
        # most one replacement, preferring the longest matching sequence.
        idx = 0
        while idx < len(word):
            for width in (3, 2, 1):
                table = self._replacements[width]
                chunk = word[idx : idx + width]
                if chunk in table:
                    word = ''.join(
                        (word[:idx], table[chunk], word[idx + width :])
                    )
                    break
            # Advance by one whether or not a replacement was made.
            idx += 1

        # Change 'CH' back(?) to 'SCH'
        word = word.replace('CH', 'SCH')

        # Shorten final -ER / -EL to -R / -L, otherwise drop a final H.
        tail = word[-2:]
        if tail in ('ER', 'EL'):
            return word[:-2] + tail[1]
        if word[-1:] == 'H':
            return word[:-1]
        return word
695 | |||
696 | |||
def reth_schek_phonetik(word):
    """Encode a word with Reth-Schek Phonetik.

    Convenience wrapper that delegates to :py:meth:`RethSchek.encode` on a
    newly created :py:class:`RethSchek` instance.

    :param str word: the word to transform
    :returns: the Reth-Schek Phonetik code
    :rtype: str

    >>> reth_schek_phonetik('Joachim')
    'JOAGHIM'
    >>> reth_schek_phonetik('Christoph')
    'GHRISDOF'
    >>> reth_schek_phonetik('Jörg')
    'JOERG'
    >>> reth_schek_phonetik('Smith')
    'SMID'
    >>> reth_schek_phonetik('Schmidt')
    'SCHMID'
    """
    encoder = RethSchek()
    return encoder.encode(word)
718 | |||
719 | |||
if __name__ == '__main__':
    # Run the doctest examples embedded in this module's docstrings when the
    # file is executed directly.
    from doctest import testmod

    testmod()
724 |