Test Failed
Push — master ( 64abe2...a464fa )
by Chris
04:02 queued 11s
created

abydos.phonetic.de.reth_schek_phonetik()   B

Complexity

Conditions 8

Size

Total Lines 84
Code Lines 37

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 8
eloc 37
nop 1
dl 0
loc 84
rs 7.1253
c 0
b 0
f 0

How to fix: Long Method

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19
"""abydos.phonetic.de.
20
21
The phonetic.de module implements the Kölner Phonetik and related
22
algorithms for German:
23
24
    - Kölner Phonetik
25
    - Phonem
26
    - Haase Phonetik
27
    - Reth-Schek Phonetik
28
"""
29
30
from __future__ import unicode_literals
31
32
from itertools import product
33
from unicodedata import normalize as unicode_normalize
34
35
from six import text_type
36
from six.moves import range
37
38
from . import _delete_consecutive_repeats
39
40
__all__ = ['haase_phonetik', 'koelner_phonetik',
41
           'koelner_phonetik_alpha', 'koelner_phonetik_num_to_alpha',
42
           'phonem', 'reth_schek_phonetik']
43
44
45
def koelner_phonetik(word):
    """Return the Kölner Phonetik (numeric output) code for a word.

    Based on the algorithm defined by :cite:`Postel:1969`.

    While the output code is numeric, it is still a str because 0s can lead
    the code.

    The word is uppercased, NFKD-normalized and reduced to the letters A-Z.
    Each remaining letter is mapped to one or two digits, in some cases
    depending on the letter directly before or after it. The digit string
    is then passed through ``_delete_consecutive_repeats`` and every '0'
    other than a leading one is removed.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as a numeric string
    :rtype: str

    >>> koelner_phonetik('Christopher')
    '478237'
    >>> koelner_phonetik('Niall')
    '65'
    >>> koelner_phonetik('Smith')
    '862'
    >>> koelner_phonetik('Schmidt')
    '862'
    >>> koelner_phonetik('Müller')
    '657'
    >>> koelner_phonetik('Zimmermann')
    '86766'
    """
    def _after(word, pos, letters):
        """Return True if word[pos] follows one of the supplied letters."""
        return pos > 0 and word[pos-1] in letters

    def _before(word, pos, letters):
        """Return True if word[pos] precedes one of the supplied letters."""
        return pos+1 < len(word) and word[pos+1] in letters

    # 'J' and 'Y' are deliberately coded like vowels here.
    _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}
    _uc_letters = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    word = word.replace('Ä', 'AE')
    word = word.replace('Ö', 'OE')
    word = word.replace('Ü', 'UE')
    # Drop everything that is not a plain uppercase letter (this also
    # discards combining marks left behind by the NFKD decomposition).
    word = ''.join(c for c in word if c in _uc_letters)

    # Nothing to convert, return base case
    if not word:
        return ''

    # NOTE(review): apart from the vowel digit, this letter-to-digit chain is
    # duplicated in haase_phonetik's inner _haase_code helper.
    codes = []
    for i, char in enumerate(word):
        if char in _vowels:
            codes.append('0')
        elif char == 'B':
            codes.append('1')
        elif char == 'P':
            codes.append('3' if _before(word, i, {'H'}) else '1')
        elif char in {'D', 'T'}:
            codes.append('8' if _before(word, i, {'C', 'S', 'Z'}) else '2')
        elif char in {'F', 'V', 'W'}:
            codes.append('3')
        elif char in {'G', 'K', 'Q'}:
            codes.append('4')
        elif char == 'C':
            if _after(word, i, {'S', 'Z'}):
                codes.append('8')
            elif i == 0:
                # Word-initial 'C' also counts 'L' and 'R' as hardening
                # followers.
                if _before(word, i, {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U',
                                     'X'}):
                    codes.append('4')
                else:
                    codes.append('8')
            elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}):
                codes.append('4')
            else:
                codes.append('8')
        elif char == 'X':
            codes.append('8' if _after(word, i, {'C', 'K', 'Q'}) else '48')
        elif char == 'L':
            codes.append('5')
        elif char in {'M', 'N'}:
            codes.append('6')
        elif char == 'R':
            codes.append('7')
        elif char in {'S', 'Z'}:
            codes.append('8')
        # Any other letter (only 'H' remains) contributes no digit.

    sdx = _delete_consecutive_repeats(''.join(codes))

    # Keep a leading '0' but strip every later one.
    if sdx:
        sdx = sdx[:1] + sdx[1:].replace('0', '')

    return sdx
149
150
151
def koelner_phonetik_num_to_alpha(num):
    """Convert a Kölner Phonetik code from numeric to alphabetic.

    Characters of the input other than the digits 0-8 are discarded before
    translation; each remaining digit is replaced by one letter.

    :param str num: a numeric Kölner Phonetik representation (can be a str or
        an int)
    :returns: an alphabetic representation of the same word
    :rtype: str

    >>> koelner_phonetik_num_to_alpha('862')
    'SNT'
    >>> koelner_phonetik_num_to_alpha('657')
    'NLR'
    >>> koelner_phonetik_num_to_alpha('86766')
    'SNRNN'
    """
    _digits = '012345678'
    # str.translate on text needs a mapping keyed by code point.
    _koelner_num_translation = dict(zip((ord(digit) for digit in _digits),
                                        'APTFKLNRS'))
    num = ''.join(c for c in text_type(num) if c in set(_digits))
    return num.translate(_koelner_num_translation)
171
172
173
def koelner_phonetik_alpha(word):
    """Return the Kölner Phonetik (alphabetic output) code for a word.

    This is the numeric code from ``koelner_phonetik`` converted to letters
    by ``koelner_phonetik_num_to_alpha``.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as an alphabetic string
    :rtype: str

    >>> koelner_phonetik_alpha('Smith')
    'SNT'
    >>> koelner_phonetik_alpha('Schmidt')
    'SNT'
    >>> koelner_phonetik_alpha('Müller')
    'NLR'
    >>> koelner_phonetik_alpha('Zimmermann')
    'SNRNN'
    """
    return koelner_phonetik_num_to_alpha(koelner_phonetik(word))
190
191
192
def phonem(word):
    """Return the Phonem code for a word.

    Phonem is defined in :cite:`Wilde:1988`.

    This version is based on the Perl implementation documented at
    :cite:`Wilz:2005`.
    It includes some enhancements presented in the Java port at
    :cite:`dcm4che:2011`.

    Phonem is intended chiefly for German names/words.

    The word is uppercased and NFC-normalized, the two-letter substitutions
    are applied in the order listed, single characters are translated, the
    result is passed through ``_delete_consecutive_repeats``, and finally
    only the characters of the output alphabet are kept.

    :param str word: the word to transform
    :returns: the Phonem value
    :rtype: str

    >>> phonem('Christopher')
    'CRYSDOVR'
    >>> phonem('Niall')
    'NYAL'
    >>> phonem('Smith')
    'SMYD'
    >>> phonem('Schmidt')
    'CMYD'
    """
    # Applied sequentially, so earlier pairs can feed later ones. '§' is an
    # intermediate placeholder that the translation table below turns into
    # 'U'.
    _phonem_substitutions = (('SC', 'C'), ('SZ', 'C'), ('CZ', 'C'),
                             ('TZ', 'C'), ('TS', 'C'), ('KS', 'X'),
                             ('PF', 'V'), ('QU', 'KW'), ('PH', 'V'),
                             ('UE', 'Y'), ('AE', 'E'), ('OE', 'Ö'),
                             ('EI', 'AY'), ('EY', 'AY'), ('EU', 'OY'),
                             ('AU', 'A§'), ('OU', '§'))
    # str.translate on text needs a mapping keyed by code point.
    _phonem_translation = dict(zip((ord(char) for char in
                                    'ZKGQÇÑßFWPTÁÀÂÃÅÄÆÉÈÊËIJÌÍÎÏÜݧÚÙÛÔÒÓÕØ'),
                                   'CCCCCNSVVBDAAAAAEEEEEEYYYYYYYYUUUUOOOOÖ'))
    _output_alphabet = {'A', 'B', 'C', 'D', 'L', 'M', 'N', 'O', 'R', 'S',
                        'U', 'V', 'W', 'X', 'Y', 'Ö'}

    word = unicode_normalize('NFC', text_type(word.upper()))
    for src, tar in _phonem_substitutions:
        word = word.replace(src, tar)
    word = word.translate(_phonem_translation)

    return ''.join(c for c in _delete_consecutive_repeats(word)
                   if c in _output_alphabet)
235
236
237
def haase_phonetik(word, primary_only=False):
    """Return the Haase Phonetik (numeric output) code for a word.

    Based on the algorithm described at :cite:`Prante:2015`.

    Based on the original :cite:`Haase:2000`.

    While the output code is numeric, it is nevertheless a str.

    Unless ``primary_only`` is set, certain letter groups are expanded into
    alternative spellings; every combination is encoded and the distinct
    codes are returned in the order they were first produced.

    :param str word: the word to transform
    :param bool primary_only: if True, only the primary code is returned
    :returns: the Haase Phonetik values, each a numeric string
    :rtype: tuple

    >>> haase_phonetik('Joachim')
    ('9496',)
    >>> haase_phonetik('Christoph')
    ('4798293', '8798293')
    >>> haase_phonetik('Jörg')
    ('974',)
    >>> haase_phonetik('Smith')
    ('8692',)
    >>> haase_phonetik('Schmidt')
    ('8692', '4692')
    """
    def _after(word, i, letters):
        """Return True if word[i] follows one of the supplied letters."""
        return i > 0 and word[i-1] in letters

    def _before(word, i, letters):
        """Return True if word[i] precedes one of the supplied letters."""
        return i+1 < len(word) and word[i+1] in letters

    # 'J' and 'Y' are deliberately coded like vowels here.
    _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}
    _uc_letters = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    word = word.replace('Ä', 'AE')
    word = word.replace('Ö', 'OE')
    word = word.replace('Ü', 'UE')
    # Drop everything that is not a plain uppercase letter (this also
    # discards combining marks left behind by the NFKD decomposition).
    word = ''.join(c for c in word if c in _uc_letters)

    if primary_only:
        variants = [word]
    else:
        # Each entry is a tuple of alternative spellings for one stretch of
        # the word; their cartesian product gives all variant words.
        variants = []
        pos = 0
        if word[:2] == 'CH':
            variants.append(('CH', 'SCH'))
            pos += 2
        len_3_vars = {'OWN': 'AUN', 'WSK': 'RSK', 'SCH': 'CH', 'GLI': 'LI',
                      'AUX': 'O', 'EUX': 'O'}
        while pos < len(word):
            rest = word[pos:]
            if rest[:4] == 'ILLE':
                variants.append(('ILLE', 'I'))
                pos += 4
            elif rest[:3] in len_3_vars:
                variants.append((rest[:3], len_3_vars[rest[:3]]))
                pos += 3
            elif rest[:2] == 'RB':
                variants.append(('RB', 'RW'))
                pos += 2
            elif rest == 'EAU':
                # Only a word-final 'EAU' gets an alternative.
                variants.append(('EAU', 'O'))
                pos += 3
            elif rest == 'O':
                # Only a word-final 'O' gets an alternative.
                variants.append(('O', 'OW'))
                pos += 1
            elif rest == 'A':
                # Only a word-final 'A' gets an alternative.
                variants.append(('A', 'AR'))
                pos += 1
            else:
                variants.append((word[pos],))
                pos += 1

        variants = [''.join(letters) for letters in product(*variants)]

    def _haase_code(word):
        """Return the digit code of a single, already cleaned, variant."""
        # NOTE(review): apart from the vowel digit, this letter-to-digit
        # chain is duplicated in koelner_phonetik.
        codes = []
        for i, char in enumerate(word):
            if char in _vowels:
                codes.append('9')
            elif char == 'B':
                codes.append('1')
            elif char == 'P':
                codes.append('3' if _before(word, i, {'H'}) else '1')
            elif char in {'D', 'T'}:
                codes.append('8' if _before(word, i, {'C', 'S', 'Z'})
                             else '2')
            elif char in {'F', 'V', 'W'}:
                codes.append('3')
            elif char in {'G', 'K', 'Q'}:
                codes.append('4')
            elif char == 'C':
                if _after(word, i, {'S', 'Z'}):
                    codes.append('8')
                elif i == 0:
                    # Word-initial 'C' also counts 'L' and 'R' as hardening
                    # followers.
                    if _before(word, i, {'A', 'H', 'K', 'L', 'O', 'Q', 'R',
                                         'U', 'X'}):
                        codes.append('4')
                    else:
                        codes.append('8')
                elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}):
                    codes.append('4')
                else:
                    codes.append('8')
            elif char == 'X':
                codes.append('8' if _after(word, i, {'C', 'K', 'Q'})
                             else '48')
            elif char == 'L':
                codes.append('5')
            elif char in {'M', 'N'}:
                codes.append('6')
            elif char == 'R':
                codes.append('7')
            elif char in {'S', 'Z'}:
                codes.append('8')
            # Any other letter (only 'H' remains) contributes no digit.

        return _delete_consecutive_repeats(''.join(codes))

    encoded = tuple(_haase_code(variant) for variant in variants)
    if len(encoded) > 1:
        # Different variants can encode identically; keep first occurrences
        # only, preserving order.
        seen = set()
        unique = []
        for code in encoded:
            if code not in seen:
                seen.add(code)
                unique.append(code)
        return tuple(unique)

    return encoded
385
386
387
# Replacement rules for reth_schek_phonetik, keyed by the length of the
# matched substring. Longer matches are tried first.
_RETH_SCHEK_REPLACEMENTS = {
    3: {'AEH': 'E', 'IEH': 'I', 'OEH': 'OE', 'UEH': 'UE',
        'SCH': 'CH', 'ZIO': 'TIO', 'TIU': 'TIO', 'ZIU': 'TIO',
        'CHS': 'X', 'CKS': 'X', 'AEU': 'OI'},
    2: {'LL': 'L', 'AA': 'A', 'AH': 'A', 'BB': 'B', 'PP': 'B',
        'BP': 'B', 'PB': 'B', 'DD': 'D', 'DT': 'D', 'TT': 'D',
        'TH': 'D', 'EE': 'E', 'EH': 'E', 'AE': 'E', 'FF': 'F',
        'PH': 'F', 'KK': 'K', 'GG': 'G', 'GK': 'G', 'KG': 'G',
        'CK': 'G', 'CC': 'C', 'IE': 'I', 'IH': 'I', 'MM': 'M',
        'NN': 'N', 'OO': 'O', 'OH': 'O', 'SZ': 'S', 'UH': 'U',
        'GS': 'X', 'KS': 'X', 'TZ': 'Z', 'AY': 'AI',
        'EI': 'AI', 'EY': 'AI', 'EU': 'OI', 'RR': 'R',
        'SS': 'S', 'KW': 'QU'},
    1: {'P': 'B', 'T': 'D', 'V': 'F', 'W': 'F', 'C': 'G',
        'K': 'G', 'Y': 'I'},
}


def reth_schek_phonetik(word):
    """Return Reth-Schek Phonetik code for a word.

    This algorithm is proposed in :cite:`Reth:1977`.

    Since I couldn't secure a copy of that document (maybe I'll look for it
    next time I'm in Germany), this implementation is based on what I could
    glean from the implementations published by German Record Linkage
    Center (www.record-linkage.de):

    - Privacy-preserving Record Linkage (PPRL) (in R) :cite:`Rukasz:2018`
    - Merge ToolBox (in Java) :cite:`Schnell:2004`

    Rules that are unclear:

    - Should 'C' become 'G' or 'Z'? (PPRL has both, 'Z' rule blocked)
    - Should 'CC' become 'G'? (PPRL has blocked 'CK' that may be typo)
    - Should 'TUI' -> 'ZUI' rule exist? (PPRL has rule, but I can't
      think of a German word with '-tui-' in it.)
    - Should we really change 'SCH' -> 'CH' and then 'CH' -> 'SCH'?

    The word is uppercased and NFC-normalized, so that umlauts given as a
    base letter plus combining diaeresis are transliterated like their
    precomposed forms.

    :param str word: the word to transform
    :returns: the Reth-Schek Phonetik code
    :rtype: str

    >>> reth_schek_phonetik('Joachim')
    'JOAGHIM'
    >>> reth_schek_phonetik('Christoph')
    'GHRISDOF'
    >>> reth_schek_phonetik('Jörg')
    'JOERG'
    >>> reth_schek_phonetik('Smith')
    'SMID'
    >>> reth_schek_phonetik('Schmidt')
    'SCHMID'
    """
    # Uppercase; compose so decomposed umlauts match the literals below
    word = unicode_normalize('NFC', text_type(word.upper()))

    # Replace umlauts/eszett
    word = word.replace('Ä', 'AE')
    word = word.replace('Ö', 'OE')
    word = word.replace('Ü', 'UE')
    word = word.replace('ß', 'SS')

    # Main loop: at each position apply the longest matching rule, if any.
    # The position always advances by exactly one, so the first character of
    # a replacement is never re-examined but the following ones are.
    pos = 0
    while pos < len(word):
        for num in range(3, 0, -1):
            chunk = word[pos:pos+num]
            if chunk in _RETH_SCHEK_REPLACEMENTS[num]:
                word = (word[:pos] + _RETH_SCHEK_REPLACEMENTS[num][chunk]
                        + word[pos+num:])
                break
        pos += 1

    # Change 'CH' back(?) to 'SCH'
    word = word.replace('CH', 'SCH')

    # Replace final sequences
    if word[-2:] == 'ER':
        word = word[:-2]+'R'
    elif word[-2:] == 'EL':
        word = word[:-2]+'L'
    elif word[-1:] == 'H':
        word = word[:-1]

    return word
471
472
473
# Run the doctests embedded in this module's docstrings when executed
# directly as a script.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
476