Completed
Push — master ( 6ed6e1...91db7a )
by Chris
13:26
created

abydos.phonetic.de.phonem()   B

Complexity

Conditions 2

Size

Total Lines 77
Code Lines 48

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 8
CRAP Score 2

Importance

Changes 0
Metric Value
cc 2
eloc 48
nop 1
dl 0
loc 77
ccs 8
cts 8
cp 1
crap 2
rs 8.7018
c 0
b 0
f 0

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include:

1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.phonetic.de.
20
21
The phonetic.de module implements the Kölner Phonetik and related
22
algorithms for German:
23
24
    - Kölner Phonetik
25
    - Phonem
26
    - Haase Phonetik
27
    - Reth-Schek Phonetik
28
"""
29
30 1
from __future__ import unicode_literals
31
32 1
from itertools import product
33 1
from unicodedata import normalize as unicode_normalize
34
35 1
from six import text_type
36 1
from six.moves import range
37
38 1
from . import _delete_consecutive_repeats
39
40 1
__all__ = [
41
    'haase_phonetik',
42
    'koelner_phonetik',
43
    'koelner_phonetik_alpha',
44
    'koelner_phonetik_num_to_alpha',
45
    'phonem',
46
    'reth_schek_phonetik',
47
]
48
49
50 1
def koelner_phonetik(word):
    """Return the Kölner Phonetik (numeric output) code for a word.

    Based on the algorithm defined by :cite:`Postel:1969`.

    The code consists solely of digits, but it is returned as a str because
    a code may start with 0.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as a numeric string
    :rtype: str

    >>> koelner_phonetik('Christopher')
    '478237'
    >>> koelner_phonetik('Niall')
    '65'
    >>> koelner_phonetik('Smith')
    '862'
    >>> koelner_phonetik('Schmidt')
    '862'
    >>> koelner_phonetik('Müller')
    '657'
    >>> koelner_phonetik('Zimmermann')
    '86766'
    """
    uppercase = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    vowel_like = frozenset('AEIJOUY')
    # Letters whose digit does not depend on the neighbouring letters
    context_free = {
        'B': '1',
        'F': '3',
        'V': '3',
        'W': '3',
        'G': '4',
        'K': '4',
        'Q': '4',
        'L': '5',
        'M': '6',
        'N': '6',
        'R': '7',
        'S': '8',
        'Z': '8',
    }

    # Uppercase and decompose the input, expand eszett/umlauts, then keep
    # only the plain letters A-Z
    letters = unicode_normalize('NFKD', text_type(word.upper()))
    for special, expansion in (
        ('ß', 'SS'),
        ('Ä', 'AE'),
        ('Ö', 'OE'),
        ('Ü', 'UE'),
    ):
        letters = letters.replace(special, expansion)
    letters = ''.join(char for char in letters if char in uppercase)

    # Nothing to convert, return base case
    if not letters:
        return ''

    def _preceded_by(idx, candidates):
        """Return True if the letter before letters[idx] is a candidate."""
        return idx > 0 and letters[idx - 1] in candidates

    def _followed_by(idx, candidates):
        """Return True if the letter after letters[idx] is a candidate."""
        return idx + 1 < len(letters) and letters[idx + 1] in candidates

    def _digits_for(idx, letter):
        """Return the digit(s) for letters[idx] ('' if it yields none)."""
        if letter in vowel_like:
            return '0'
        if letter in context_free:
            return context_free[letter]
        if letter == 'P':
            return '3' if _followed_by(idx, 'H') else '1'
        if letter in 'DT':
            return '8' if _followed_by(idx, 'CSZ') else '2'
        if letter == 'C':
            if _preceded_by(idx, 'SZ'):
                return '8'
            # A word-initial C accepts two more following letters (L, R)
            hard_followers = 'AHKLOQRUX' if idx == 0 else 'AHKOQUX'
            return '4' if _followed_by(idx, hard_followers) else '8'
        if letter == 'X':
            return '8' if _preceded_by(idx, 'CKQ') else '48'
        # Any remaining letter (only H) contributes nothing
        return ''

    code = ''.join(
        _digits_for(idx, letter) for idx, letter in enumerate(letters)
    )
    code = _delete_consecutive_repeats(code)

    # A 0 is only kept in the leading position
    return code[:1] + code[1:].replace('0', '')
185
186
187 1
def koelner_phonetik_num_to_alpha(num):
    """Convert a Kölner Phonetik code from numeric to alphabetic.

    Every digit from 0 to 8 is replaced by its letter; any other character
    in the input is dropped.

    :param str num: a numeric Kölner Phonetik representation (can be a str or
        an int)
    :returns: an alphabetic representation of the same word
    :rtype: str

    >>> koelner_phonetik_num_to_alpha('862')
    'SNT'
    >>> koelner_phonetik_num_to_alpha('657')
    'NLR'
    >>> koelner_phonetik_num_to_alpha('86766')
    'SNRNN'
    """
    letter_for_digit = dict(zip('012345678', 'APTFKLNRS'))
    return ''.join(
        letter_for_digit[digit]
        for digit in text_type(num)
        if digit in letter_for_digit
    )
211
212
213 1
def koelner_phonetik_alpha(word):
    """Return the Kölner Phonetik (alphabetic output) code for a word.

    The word is first encoded numerically with koelner_phonetik and the
    digits are then mapped to letters with koelner_phonetik_num_to_alpha.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as an alphabetic string
    :rtype: str

    >>> koelner_phonetik_alpha('Smith')
    'SNT'
    >>> koelner_phonetik_alpha('Schmidt')
    'SNT'
    >>> koelner_phonetik_alpha('Müller')
    'NLR'
    >>> koelner_phonetik_alpha('Zimmermann')
    'SNRNN'
    """
    numeric_code = koelner_phonetik(word)
    return koelner_phonetik_num_to_alpha(numeric_code)
230
231
232 1
def phonem(word):
    """Return the Phonem code for a word.

    Phonem is defined in :cite:`Wilde:1988`.

    This version is based on the Perl implementation documented at
    :cite:`Wilz:2005`.
    It includes some enhancements presented in the Java port at
    :cite:`dcm4che:2011`.

    Phonem is intended chiefly for German names/words.

    :param str word: the word to transform
    :returns: the Phonem value
    :rtype: str

    >>> phonem('Christopher')
    'CRYSDOVR'
    >>> phonem('Niall')
    'NYAL'
    >>> phonem('Smith')
    'SMYD'
    >>> phonem('Schmidt')
    'CMYD'
    """
    # Two-letter sequences, rewritten one pair after another in this order
    digraph_rules = (
        ('SC', 'C'),
        ('SZ', 'C'),
        ('CZ', 'C'),
        ('TZ', 'C'),
        ('TS', 'C'),
        ('KS', 'X'),
        ('PF', 'V'),
        ('QU', 'KW'),
        ('PH', 'V'),
        ('UE', 'Y'),
        ('AE', 'E'),
        ('OE', 'Ö'),
        ('EI', 'AY'),
        ('EY', 'AY'),
        ('EU', 'OY'),
        ('AU', 'A§'),
        ('OU', '§'),
    )
    # Single-character mapping, keyed by code point for use with translate
    char_map = {
        ord(source): target
        for source, target in zip(
            'ZKGQÇÑßFWPTÁÀÂÃÅÄÆÉÈÊËIJÌÍÎÏÜݧÚÙÛÔÒÓÕØ',
            'CCCCCNSVVBDAAAAAEEEEEEYYYYYYYYUUUUOOOOÖ',
        )
    }
    # Only these characters may appear in the final code
    output_alphabet = frozenset('ABCDLMNORSUVWXYÖ')

    code = unicode_normalize('NFC', text_type(word.upper()))
    for sequence, replacement in digraph_rules:
        code = code.replace(sequence, replacement)
    code = code.translate(char_map)
    code = _delete_consecutive_repeats(code)

    return ''.join(char for char in code if char in output_alphabet)
311
312
313 1
def haase_phonetik(word, primary_only=False):
    """Return the Haase Phonetik (numeric output) code for a word.

    Based on the algorithm described at :cite:`Prante:2015`.

    Based on the original :cite:`Haase:2000`.

    While the output codes are numeric, each one is nevertheless a str.

    :param str word: the word to transform
    :param bool primary_only: if True, only the primary code is returned
    :returns: the Haase Phonetik codes (numeric strings) of the word and of
        its spelling variants, without duplicates
    :rtype: tuple

    >>> haase_phonetik('Joachim')
    ('9496',)
    >>> haase_phonetik('Christoph')
    ('4798293', '8798293')
    >>> haase_phonetik('Jörg')
    ('974',)
    >>> haase_phonetik('Smith')
    ('8692',)
    >>> haase_phonetik('Schmidt')
    ('8692', '4692')
    """
    uppercase = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    vowel_like = frozenset('AEIJOUY')
    # Letters whose digit does not depend on the neighbouring letters
    context_free = {
        'B': '1',
        'F': '3',
        'V': '3',
        'W': '3',
        'G': '4',
        'K': '4',
        'Q': '4',
        'L': '5',
        'M': '6',
        'N': '6',
        'R': '7',
        'S': '8',
        'Z': '8',
    }
    # Three-letter sequences and their alternative spellings
    trigram_variants = {
        'OWN': 'AUN',
        'WSK': 'RSK',
        'SCH': 'CH',
        'GLI': 'LI',
        'AUX': 'O',
        'EUX': 'O',
    }

    def _haase_code(spelling):
        """Return the Haase code of a single spelling."""

        def _preceded_by(idx, candidates):
            """Return True if the letter before spelling[idx] matches."""
            return idx > 0 and spelling[idx - 1] in candidates

        def _followed_by(idx, candidates):
            """Return True if the letter after spelling[idx] matches."""
            return idx + 1 < len(spelling) and spelling[idx + 1] in candidates

        def _digits_for(idx, letter):
            """Return the digit(s) for spelling[idx] ('' if none)."""
            if letter in vowel_like:
                return '9'
            if letter in context_free:
                return context_free[letter]
            if letter == 'P':
                return '3' if _followed_by(idx, 'H') else '1'
            if letter in 'DT':
                return '8' if _followed_by(idx, 'CSZ') else '2'
            if letter == 'C':
                if _preceded_by(idx, 'SZ'):
                    return '8'
                # A word-initial C accepts two more following letters (L, R)
                hard_followers = 'AHKLOQRUX' if idx == 0 else 'AHKOQUX'
                return '4' if _followed_by(idx, hard_followers) else '8'
            if letter == 'X':
                return '8' if _preceded_by(idx, 'CKQ') else '48'
            # Any remaining letter (only H) contributes nothing
            return ''

        digits = ''.join(
            _digits_for(idx, letter) for idx, letter in enumerate(spelling)
        )
        return _delete_consecutive_repeats(digits)

    # Uppercase and decompose the input, expand eszett/umlauts, then keep
    # only the plain letters A-Z
    letters = unicode_normalize('NFKD', text_type(word.upper()))
    for special, expansion in (
        ('ß', 'SS'),
        ('Ä', 'AE'),
        ('Ö', 'OE'),
        ('Ü', 'UE'),
    ):
        letters = letters.replace(special, expansion)
    letters = ''.join(char for char in letters if char in uppercase)

    if primary_only:
        spellings = [letters]
    else:
        # Split the word into segments, each a tuple of the alternatives
        # for that segment; the spellings are all combinations of them
        segments = []
        idx = 0
        if letters.startswith('CH'):
            segments.append(('CH', 'SCH'))
            idx = 2
        while idx < len(letters):
            rest = letters[idx:]
            trigram = rest[:3]
            if rest.startswith('ILLE'):
                segments.append(('ILLE', 'I'))
                idx += 4
            elif trigram in trigram_variants:
                segments.append((trigram, trigram_variants[trigram]))
                idx += 3
            elif rest.startswith('RB'):
                segments.append(('RB', 'RW'))
                idx += 2
            elif rest == 'EAU':
                # Only a word-final EAU has a variant
                segments.append(('EAU', 'O'))
                idx += 3
            elif rest == 'O':
                # Only a word-final O has a variant
                segments.append(('O', 'OW'))
                idx += 1
            elif rest == 'A':
                # Only a word-final A has a variant
                segments.append(('A', 'AR'))
                idx += 1
            else:
                segments.append((rest[0],))
                idx += 1

        spellings = [''.join(parts) for parts in product(*segments)]

    # Encode every spelling, keeping the first occurrence of each code
    seen = set()
    unique_codes = []
    for spelling in spellings:
        code = _haase_code(spelling)
        if code not in seen:
            seen.add(code)
            unique_codes.append(code)

    return tuple(unique_codes)
500
501
502 1
def reth_schek_phonetik(word):
    """Return Reth-Schek Phonetik code for a word.

    This algorithm is proposed in :cite:`Reth:1977`.

    The author could not obtain a copy of that document, so this
    implementation is based on what could be gleaned from the
    implementations published by German Record Linkage Center
    (www.record-linkage.de):

    - Privacy-preserving Record Linkage (PPRL) (in R) :cite:`Rukasz:2018`
    - Merge ToolBox (in Java) :cite:`Schnell:2004`

    Rules that are unclear:

    - Should 'C' become 'G' or 'Z'? (PPRL has both, 'Z' rule blocked)
    - Should 'CC' become 'G'? (PPRL has blocked 'CK' that may be typo)
    - Should 'TUI' -> 'ZUI' rule exist? (PPRL has rule, but the author
      could not think of a German word with '-tui-' in it.)
    - Should we really change 'SCH' -> 'CH' and then 'CH' -> 'SCH'?

    :param str word: the word to transform
    :returns: the Reth-Schek Phonetik code
    :rtype: str

    >>> reth_schek_phonetik('Joachim')
    'JOAGHIM'
    >>> reth_schek_phonetik('Christoph')
    'GHRISDOF'
    >>> reth_schek_phonetik('Jörg')
    'JOERG'
    >>> reth_schek_phonetik('Smith')
    'SMID'
    >>> reth_schek_phonetik('Schmidt')
    'SCHMID'
    """
    # Replacement rules, grouped by the length of the sequence they match
    rules_by_length = {
        3: {
            'AEH': 'E',
            'IEH': 'I',
            'OEH': 'OE',
            'UEH': 'UE',
            'SCH': 'CH',
            'ZIO': 'TIO',
            'TIU': 'TIO',
            'ZIU': 'TIO',
            'CHS': 'X',
            'CKS': 'X',
            'AEU': 'OI',
        },
        2: {
            'LL': 'L',
            'AA': 'A',
            'AH': 'A',
            'BB': 'B',
            'PP': 'B',
            'BP': 'B',
            'PB': 'B',
            'DD': 'D',
            'DT': 'D',
            'TT': 'D',
            'TH': 'D',
            'EE': 'E',
            'EH': 'E',
            'AE': 'E',
            'FF': 'F',
            'PH': 'F',
            'KK': 'K',
            'GG': 'G',
            'GK': 'G',
            'KG': 'G',
            'CK': 'G',
            'CC': 'C',
            'IE': 'I',
            'IH': 'I',
            'MM': 'M',
            'NN': 'N',
            'OO': 'O',
            'OH': 'O',
            'SZ': 'S',
            'UH': 'U',
            'GS': 'X',
            'KS': 'X',
            'TZ': 'Z',
            'AY': 'AI',
            'EI': 'AI',
            'EY': 'AI',
            'EU': 'OI',
            'RR': 'R',
            'SS': 'S',
            'KW': 'QU',
        },
        1: {
            'P': 'B',
            'T': 'D',
            'V': 'F',
            'W': 'F',
            'C': 'G',
            'K': 'G',
            'Y': 'I',
        },
    }

    # Uppercase, then spell out umlauts and eszett
    text = word.upper()
    for special, expansion in (
        ('Ä', 'AE'),
        ('Ö', 'OE'),
        ('Ü', 'UE'),
        ('ß', 'SS'),
    ):
        text = text.replace(special, expansion)

    # Scan left to right; at each position apply the longest matching rule
    idx = 0
    while idx < len(text):
        for width in (3, 2, 1):
            substitute = rules_by_length[width].get(text[idx : idx + width])
            if substitute is not None:
                text = text[:idx] + substitute + text[idx + width :]
                break
        # Advance by one position whether or not a rule was applied
        idx += 1

    # Change 'CH' back(?) to 'SCH'
    text = text.replace('CH', 'SCH')

    # Shorten the final sequence; at most one of these rules applies
    if text.endswith('ER'):
        return text[:-2] + 'R'
    if text.endswith('EL'):
        return text[:-2] + 'L'
    if text.endswith('H'):
        return text[:-1]
    return text
641
642
643
if __name__ == '__main__':
    # Executed as a script: run the doctests embedded in the docstrings
    from doctest import testmod

    testmod()
647