Completed
Pull Request — master (#141)
by Chris
13:03
created

abydos.phonetic._de.haase_phonetik()   A

Complexity

Conditions 1

Size

Total Lines 26
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
eloc 2
dl 0
loc 26
ccs 2
cts 2
cp 1
rs 10
c 0
b 0
f 0
cc 1
nop 2
crap 1
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.phonetic._de.

The phonetic._de module implements the Kölner Phonetik and related
algorithms for German:

    - Kölner Phonetik
    - Phonem
    - Haase Phonetik
    - Reth-Schek Phonetik
"""
29
30 1
from __future__ import unicode_literals
31
32 1
from itertools import product
33 1
from unicodedata import normalize as unicode_normalize
34
35 1
from six import text_type
36 1
from six.moves import range
37
38 1
from ._phonetic import Phonetic
39
40 1
# Public API of this module: the encoder classes followed by their
# function-style wrappers.
__all__ = [
    'Haase',
    'Koelner',
    'Phonem',
    'RethSchek',
    'haase_phonetik',
    'koelner_phonetik',
    'koelner_phonetik_alpha',
    'koelner_phonetik_num_to_alpha',
    'phonem',
    'reth_schek_phonetik',
]
52
53
54 1
class Koelner(Phonetic):
    """Kölner Phonetik.

    Based on the algorithm defined by :cite:`Postel:1969`.
    """

    # Letters that are coded as vowels ('0') by this algorithm.
    _uc_v_set = set('AEIOUJY')

    # Translation table (code point of digit -> letter) used by _to_alpha.
    _num_trans = {
        ord(digit): letter
        for digit, letter in zip('012345678', 'APTFKLNRS')
    }
    # Digits that may appear in a numeric code; anything else is dropped by
    # _to_alpha before translating.
    _num_set = set('012345678')

    def encode(self, word):
        """Return the Kölner Phonetik (numeric output) code for a word.

        While the output code is numeric, it is still a str because 0s can lead
        the code.

        Args:
            word (str): The word to transform

        Returns:
            str: The Kölner Phonetik value as a numeric string

        Example:
            >>> pe = Koelner()
            >>> pe.encode('Christopher')
            '478237'
            >>> pe.encode('Niall')
            '65'
            >>> pe.encode('Smith')
            '862'
            >>> pe.encode('Schmidt')
            '862'
            >>> pe.encode('Müller')
            '657'
            >>> pe.encode('Zimmermann')
            '86766'

        """

        def _after(word, pos, letters):
            """Return True if word[pos] follows one of the supplied letters.

            Args:
                word (str): The word to check
                pos (int): Position within word to check
                letters (set): Letters to confirm precede word[pos]

            Returns:
                bool: True if word[pos] follows a value in letters

            """
            return pos > 0 and word[pos - 1] in letters

        def _before(word, pos, letters):
            """Return True if word[pos] precedes one of the supplied letters.

            Args:
                word (str): The word to check
                pos (int): Position within word to check
                letters (set): Letters to confirm follow word[pos]

            Returns:
                bool: True if word[pos] precedes a value in letters

            """
            return pos + 1 < len(word) and word[pos + 1] in letters

        word = unicode_normalize('NFKD', text_type(word.upper()))
        word = word.replace('ß', 'SS')
        # NOTE(review): NFKD above splits precomposed umlauts into a base
        # letter plus a combining mark, so these three replacements can only
        # match if the literals here are themselves stored decomposed --
        # confirm the intended order.
        word = word.replace('Ä', 'AE')
        word = word.replace('Ö', 'OE')
        word = word.replace('Ü', 'UE')
        word = ''.join(c for c in word if c in self._uc_set)

        # Nothing to convert, return base case
        if not word:
            return ''

        # Collect the code pieces in a list and join once at the end.
        digits = []
        for i, char in enumerate(word):
            if char in self._uc_v_set:
                digits.append('0')
            elif char == 'B':
                digits.append('1')
            elif char == 'P':
                digits.append('3' if _before(word, i, {'H'}) else '1')
            elif char in {'D', 'T'}:
                if _before(word, i, {'C', 'S', 'Z'}):
                    digits.append('8')
                else:
                    digits.append('2')
            elif char in {'F', 'V', 'W'}:
                digits.append('3')
            elif char in {'G', 'K', 'Q'}:
                digits.append('4')
            elif char == 'C':
                if _after(word, i, {'S', 'Z'}):
                    digits.append('8')
                elif i == 0:
                    # Word-initial C: 'L' and 'R' also trigger the '4' code.
                    if _before(
                        word, i, {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'}
                    ):
                        digits.append('4')
                    else:
                        digits.append('8')
                elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}):
                    digits.append('4')
                else:
                    digits.append('8')
            elif char == 'X':
                if _after(word, i, {'C', 'K', 'Q'}):
                    digits.append('8')
                else:
                    digits.append('48')
            elif char == 'L':
                digits.append('5')
            elif char in {'M', 'N'}:
                digits.append('6')
            elif char == 'R':
                digits.append('7')
            elif char in {'S', 'Z'}:
                digits.append('8')
            # Any other letter (e.g. 'H') contributes nothing to the code.

        sdx = self._delete_consecutive_repeats(''.join(digits))

        # Zeros are kept only in the leading position.
        if sdx:
            sdx = sdx[:1] + sdx[1:].replace('0', '')

        return sdx

    def _to_alpha(self, num):
        """Convert a Kölner Phonetik code from numeric to alphabetic.

        Characters of the input that are not one of the digits 0-8 are
        discarded before the conversion.

        Args:
            num (str or int): A numeric Kölner Phonetik representation

        Returns:
            str: An alphabetic representation of the same word

        Examples:
            >>> pe = Koelner()
            >>> pe._to_alpha('862')
            'SNT'
            >>> pe._to_alpha('657')
            'NLR'
            >>> pe._to_alpha('86766')
            'SNRNN'

        """
        num = ''.join(c for c in text_type(num) if c in self._num_set)
        return num.translate(self._num_trans)

    def encode_alpha(self, word):
        """Return the Kölner Phonetik (alphabetic output) code for a word.

        Args:
            word (str): The word to transform

        Returns:
            str: The Kölner Phonetik value as an alphabetic string

        Examples:
            >>> pe = Koelner()
            >>> pe.encode_alpha('Smith')
            'SNT'
            >>> pe.encode_alpha('Schmidt')
            'SNT'
            >>> pe.encode_alpha('Müller')
            'NLR'
            >>> pe.encode_alpha('Zimmermann')
            'SNRNN'

        """
        # Use this instance's own methods rather than the module-level
        # wrappers, so that subclasses overriding encode/_to_alpha are
        # honoured and no throwaway instances are created.
        return self._to_alpha(self.encode(word))
234
235
236 1
def koelner_phonetik(word):
    """Return the Kölner Phonetik (numeric output) code for a word.

    This is a wrapper for :py:meth:`Koelner.encode`.

    Args:
        word (str): The word to transform

    Returns:
        str: The Kölner Phonetik value as a numeric string

    Example:
        >>> koelner_phonetik('Christopher')
        '478237'
        >>> koelner_phonetik('Niall')
        '65'
        >>> koelner_phonetik('Smith')
        '862'
        >>> koelner_phonetik('Schmidt')
        '862'
        >>> koelner_phonetik('Müller')
        '657'
        >>> koelner_phonetik('Zimmermann')
        '86766'

    """
    return Koelner().encode(word)
263
264
265 1
def koelner_phonetik_num_to_alpha(num):
    """Convert a Kölner Phonetik code from numeric to alphabetic.

    This is a wrapper for :py:meth:`Koelner._to_alpha`.

    Args:
        num (str or int): A numeric Kölner Phonetik representation

    Returns:
        str: An alphabetic representation of the same word

    Examples:
        >>> koelner_phonetik_num_to_alpha('862')
        'SNT'
        >>> koelner_phonetik_num_to_alpha('657')
        'NLR'
        >>> koelner_phonetik_num_to_alpha('86766')
        'SNRNN'

    """
    # Deliberate access to the non-public helper: this function is the
    # module's public entry point for that conversion.
    return Koelner()._to_alpha(num)
0 ignored issues
show
Coding Style Best Practice introduced by
It seems like _to_alpha was declared protected and should not be accessed from this context.

Prefixing a member variable _ is usually regarded as the equivalent of declaring it with protected visibility that exists in other languages. Consequentially, such a member should only be accessed from the same class or a child class:

class MyParent:
    def __init__(self):
        self._x = 1;
        self.y = 2;

class MyChild(MyParent):
    def some_method(self):
        return self._x    # Ok, since accessed from a child class

class AnotherClass:
    def some_method(self, instance_of_my_child):
        return instance_of_my_child._x   # Would be flagged as AnotherClass is not
                                         # a child class of MyParent
Loading history...
286
287
288 1
def koelner_phonetik_alpha(word):
    """Return the Kölner Phonetik (alphabetic output) code for a word.

    This is a wrapper for :py:meth:`Koelner.encode_alpha`.

    Args:
        word (str): The word to transform

    Returns:
        str: The Kölner Phonetik value as an alphabetic string

    Examples:
        >>> koelner_phonetik_alpha('Smith')
        'SNT'
        >>> koelner_phonetik_alpha('Schmidt')
        'SNT'
        >>> koelner_phonetik_alpha('Müller')
        'NLR'
        >>> koelner_phonetik_alpha('Zimmermann')
        'SNRNN'

    """
    return Koelner().encode_alpha(word)
311
312
313 1
class Phonem(Phonetic):
    """Phonem.

    Phonem is defined in :cite:`Wilde:1988`.

    This version is based on the Perl implementation documented at
    :cite:`Wilz:2005`.
    It includes some enhancements presented in the Java port at
    :cite:`dcm4che:2011`.

    Phonem is intended chiefly for German names/words.
    """

    # Multi-letter substitutions, applied one after another in this order
    # over the whole word before the single-character translation.
    _substitutions = (
        ('SC', 'C'),
        ('SZ', 'C'),
        ('CZ', 'C'),
        ('TZ', 'C'),
        ('TS', 'C'),
        ('KS', 'X'),
        ('PF', 'V'),
        ('QU', 'KW'),
        ('PH', 'V'),
        ('UE', 'Y'),
        ('AE', 'E'),
        ('OE', 'Ö'),
        ('EI', 'AY'),
        ('EY', 'AY'),
        ('EU', 'OY'),
        ('AU', 'A§'),
        ('OU', '§'),
    )

    # Single-character translation table (code point -> replacement); the
    # two strings are aligned position by position.
    _trans = dict(
        zip(
            (ord(char) for char in 'ZKGQÇÑßFWPTÁÀÂÃÅÄÆÉÈÊËIJÌÍÎÏÜݧÚÙÛÔÒÓÕØ'),
            'CCCCCNSVVBDAAAAAEEEEEEYYYYYYYYUUUUOOOOÖ',
        )
    )

    # Characters allowed to remain in the final code.
    _uc_set = set('ABCDLMNORSUVWXYÖ')

    def encode(self, word):
        """Return the Phonem code for a word.

        The word is uppercased and NFC-normalized, the multi-letter
        substitutions and the single-character translation are applied,
        consecutive repeats are collapsed, and finally every character
        outside the allowed output set is dropped.

        Args:
            word (str): The word to transform

        Returns:
            str: The Phonem value

        Examples:
            >>> pe = Phonem()
            >>> pe.encode('Christopher')
            'CRYSDOVR'
            >>> pe.encode('Niall')
            'NYAL'
            >>> pe.encode('Smith')
            'SMYD'
            >>> pe.encode('Schmidt')
            'CMYD'

        """
        word = unicode_normalize('NFC', text_type(word.upper()))
        for src, tar in self._substitutions:
            word = word.replace(src, tar)
        word = word.translate(self._trans)

        # Repeats are collapsed before filtering, so characters that are
        # about to be dropped still separate otherwise-identical neighbours.
        return ''.join(
            c
            for c in self._delete_consecutive_repeats(word)
            if c in self._uc_set
        )
386
387
388 1
def phonem(word):
    """Return the Phonem code for a word.

    This is a wrapper for :py:meth:`Phonem.encode`.

    Args:
        word (str): The word to transform

    Returns:
        str: The Phonem value

    Examples:
        >>> phonem('Christopher')
        'CRYSDOVR'
        >>> phonem('Niall')
        'NYAL'
        >>> phonem('Smith')
        'SMYD'
        >>> phonem('Schmidt')
        'CMYD'

    """
    return Phonem().encode(word)
411
412
413 1
class Haase(Phonetic):
    """Haase Phonetik.

    Based on the algorithm described at :cite:`Prante:2015`.

    Based on the original :cite:`Haase:2000`.
    """

    # Letters that are coded as vowels ('9') by this algorithm.
    _uc_v_set = set('AEIJOUY')

    def encode(self, word, primary_only=False):
        """Return the Haase Phonetik (numeric output) code for a word.

        While the output code is numeric, it is nevertheless a str.

        Args:
            word (str): The word to transform
            primary_only (bool): If True, only the primary code is returned

        Returns:
            tuple: The distinct Haase Phonetik codes (each a numeric str), in
            the order in which their spelling variants were generated

        Examples:
            >>> pe = Haase()
            >>> pe.encode('Joachim')
            ('9496',)
            >>> pe.encode('Christoph')
            ('4798293', '8798293')
            >>> pe.encode('Jörg')
            ('974',)
            >>> pe.encode('Smith')
            ('8692',)
            >>> pe.encode('Schmidt')
            ('8692', '4692')

        """

        def _after(word, pos, letters):
            """Return True if word[pos] follows one of the supplied letters.

            Args:
                word (str): Word to examine
                pos (int): Position to examine
                letters (set): Letters to check for

            Returns:
                bool: True if word[pos] follows one of letters

            """
            return pos > 0 and word[pos - 1] in letters

        def _before(word, pos, letters):
            """Return True if word[pos] precedes one of the supplied letters.

            Args:
                word (str): Word to examine
                pos (int): Position to examine
                letters (set): Letters to check for

            Returns:
                bool: True if word[pos] precedes one of letters

            """
            return pos + 1 < len(word) and word[pos + 1] in letters

        def _haase_code(variant):
            """Return the numeric code of a single spelling variant.

            Args:
                variant (str): One spelling variant of the input word

            Returns:
                str: The code, with consecutive repeated digits collapsed

            """
            digits = []
            for i, char in enumerate(variant):
                if char in self._uc_v_set:
                    digits.append('9')
                elif char == 'B':
                    digits.append('1')
                elif char == 'P':
                    digits.append('3' if _before(variant, i, {'H'}) else '1')
                elif char in {'D', 'T'}:
                    if _before(variant, i, {'C', 'S', 'Z'}):
                        digits.append('8')
                    else:
                        digits.append('2')
                elif char in {'F', 'V', 'W'}:
                    digits.append('3')
                elif char in {'G', 'K', 'Q'}:
                    digits.append('4')
                elif char == 'C':
                    if _after(variant, i, {'S', 'Z'}):
                        digits.append('8')
                    elif i == 0:
                        # Word-initial C: 'L' and 'R' also trigger '4'.
                        if _before(
                            variant,
                            i,
                            {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'},
                        ):
                            digits.append('4')
                        else:
                            digits.append('8')
                    elif _before(
                        variant, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}
                    ):
                        digits.append('4')
                    else:
                        digits.append('8')
                elif char == 'X':
                    if _after(variant, i, {'C', 'K', 'Q'}):
                        digits.append('8')
                    else:
                        digits.append('48')
                elif char == 'L':
                    digits.append('5')
                elif char in {'M', 'N'}:
                    digits.append('6')
                elif char == 'R':
                    digits.append('7')
                elif char in {'S', 'Z'}:
                    digits.append('8')
                # Any other letter (e.g. 'H') contributes nothing.

            return self._delete_consecutive_repeats(''.join(digits))

        word = unicode_normalize('NFKD', text_type(word.upper()))
        word = word.replace('ß', 'SS')
        # NOTE(review): NFKD above splits precomposed umlauts into a base
        # letter plus a combining mark, so these three replacements can only
        # match if the literals here are themselves stored decomposed --
        # confirm the intended order.
        word = word.replace('Ä', 'AE')
        word = word.replace('Ö', 'OE')
        word = word.replace('Ü', 'UE')
        word = ''.join(c for c in word if c in self._uc_set)

        if primary_only:
            variants = [word]
        else:
            # Split the word into segments; each segment is a tuple holding
            # the original spelling first, followed by an alternative
            # spelling when one applies.
            segments = []
            pos = 0
            if word[:2] == 'CH':
                segments.append(('CH', 'SCH'))
                pos = 2
            len_3_vars = {
                'OWN': 'AUN',
                'WSK': 'RSK',
                'SCH': 'CH',
                'GLI': 'LI',
                'AUX': 'O',
                'EUX': 'O',
            }
            # Alternatives that apply only to a word-final single letter.
            final_vars = {'A': ('A', 'AR'), 'O': ('O', 'OW')}
            while pos < len(word):
                if word[pos : pos + 4] == 'ILLE':
                    segments.append(('ILLE', 'I'))
                    pos += 4
                elif word[pos : pos + 3] in len_3_vars:
                    trigram = word[pos : pos + 3]
                    segments.append((trigram, len_3_vars[trigram]))
                    pos += 3
                elif word[pos : pos + 2] == 'RB':
                    segments.append(('RB', 'RW'))
                    pos += 2
                elif word[pos:] == 'EAU':
                    # Only a word-final 'EAU' has an alternative.
                    segments.append(('EAU', 'O'))
                    pos += 3
                elif word[pos:] in final_vars:
                    segments.append(final_vars[word[pos:]])
                    pos += 1
                else:
                    segments.append((word[pos],))
                    pos += 1

            # Every combination of per-segment spellings is a variant.
            variants = [''.join(letters) for letters in product(*segments)]

        # Encode each variant and drop duplicate codes, keeping first-seen
        # order.
        seen = set()
        codes = []
        for variant in variants:
            code = _haase_code(variant)
            if code not in seen:
                seen.add(code)
                codes.append(code)
        return tuple(codes)
599
600
601 1
def haase_phonetik(word, primary_only=False):
    """Return the Haase Phonetik (numeric output) code for a word.

    This is a wrapper for :py:meth:`Haase.encode`.

    Args:
        word (str): The word to transform
        primary_only (bool): If True, only the primary code is returned

    Returns:
        tuple: The Haase Phonetik codes, each a numeric str

    Examples:
        >>> haase_phonetik('Joachim')
        ('9496',)
        >>> haase_phonetik('Christoph')
        ('4798293', '8798293')
        >>> haase_phonetik('Jörg')
        ('974',)
        >>> haase_phonetik('Smith')
        ('8692',)
        >>> haase_phonetik('Schmidt')
        ('8692', '4692')

    """
    return Haase().encode(word, primary_only)
627
628
629 1
class RethSchek(Phonetic):
    """Reth-Schek Phonetik.

    This algorithm is proposed in :cite:`Reth:1977`.

    Since I couldn't secure a copy of that document (maybe I'll look for it
    next time I'm in Germany), this implementation is based on what I could
    glean from the implementations published by German Record Linkage
    Center (www.record-linkage.de):

    - Privacy-preserving Record Linkage (PPRL) (in R) :cite:`Rukasz:2018`
    - Merge ToolBox (in Java) :cite:`Schnell:2004`

    Rules that are unclear:

    - Should 'C' become 'G' or 'Z'? (PPRL has both, 'Z' rule blocked)
    - Should 'CC' become 'G'? (PPRL has blocked 'CK' that may be typo)
    - Should 'TUI' -> 'ZUI' rule exist? (PPRL has rule, but I can't
      think of a German word with '-tui-' in it.)
    - Should we really change 'SCH' -> 'CH' and then 'CH' -> 'SCH'?
    """

    # Replacement rules keyed by the length of the matched sequence; at each
    # position encode() tries the longest sequences first.
    _replacements = {
        3: {
            'AEH': 'E',
            'IEH': 'I',
            'OEH': 'OE',
            'UEH': 'UE',
            'SCH': 'CH',
            'ZIO': 'TIO',
            'TIU': 'TIO',
            'ZIU': 'TIO',
            'CHS': 'X',
            'CKS': 'X',
            'AEU': 'OI',
        },
        2: {
            'LL': 'L',
            'AA': 'A',
            'AH': 'A',
            'BB': 'B',
            'PP': 'B',
            'BP': 'B',
            'PB': 'B',
            'DD': 'D',
            'DT': 'D',
            'TT': 'D',
            'TH': 'D',
            'EE': 'E',
            'EH': 'E',
            'AE': 'E',
            'FF': 'F',
            'PH': 'F',
            'KK': 'K',
            'GG': 'G',
            'GK': 'G',
            'KG': 'G',
            'CK': 'G',
            'CC': 'C',
            'IE': 'I',
            'IH': 'I',
            'MM': 'M',
            'NN': 'N',
            'OO': 'O',
            'OH': 'O',
            'SZ': 'S',
            'UH': 'U',
            'GS': 'X',
            'KS': 'X',
            'TZ': 'Z',
            'AY': 'AI',
            'EI': 'AI',
            'EY': 'AI',
            'EU': 'OI',
            'RR': 'R',
            'SS': 'S',
            'KW': 'QU',
        },
        1: {
            'P': 'B',
            'T': 'D',
            'V': 'F',
            'W': 'F',
            'C': 'G',
            'K': 'G',
            'Y': 'I',
        },
    }

    def encode(self, word):
        """Return Reth-Schek Phonetik code for a word.

        Args:
            word (str): The word to transform

        Returns:
            str: The Reth-Schek Phonetik code

        Examples:
            >>> pe = RethSchek()
            >>> pe.encode('Joachim')
            'JOAGHIM'
            >>> pe.encode('Christoph')
            'GHRISDOF'
            >>> pe.encode('Jörg')
            'JOERG'
            >>> pe.encode('Smith')
            'SMID'
            >>> pe.encode('Schmidt')
            'SCHMID'

        """
        # Uppercase
        word = word.upper()

        # Replace umlauts/eszett
        word = word.replace('Ä', 'AE')
        word = word.replace('Ö', 'OE')
        word = word.replace('Ü', 'UE')
        word = word.replace('ß', 'SS')

        # Main loop, using above replacements table. At each position the
        # longest matching rule wins; the scan then advances by exactly one
        # character whether or not a rule fired, so the tail of a
        # multi-character replacement is examined again.
        pos = 0
        while pos < len(word):
            for num in (3, 2, 1):
                chunk = word[pos : pos + num]
                if chunk in self._replacements[num]:
                    word = (
                        word[:pos]
                        + self._replacements[num][chunk]
                        + word[pos + num :]
                    )
                    break
            pos += 1

        # Change 'CH' back(?) to 'SCH'
        word = word.replace('CH', 'SCH')

        # Replace final sequences
        if word[-2:] == 'ER':
            word = word[:-2] + 'R'
        elif word[-2:] == 'EL':
            word = word[:-2] + 'L'
        elif word[-1:] == 'H':
            word = word[:-1]

        return word
776
777
778 1
def reth_schek_phonetik(word):
    """Return Reth-Schek Phonetik code for a word.

    This is a wrapper for :py:meth:`RethSchek.encode`.

    Args:
        word (str): The word to transform

    Returns:
        str: The Reth-Schek Phonetik code

    Examples:
        >>> reth_schek_phonetik('Joachim')
        'JOAGHIM'
        >>> reth_schek_phonetik('Christoph')
        'GHRISDOF'
        >>> reth_schek_phonetik('Jörg')
        'JOERG'
        >>> reth_schek_phonetik('Smith')
        'SMID'
        >>> reth_schek_phonetik('Schmidt')
        'SCHMID'

    """
    return RethSchek().encode(word)
803
804
805
# When executed as a script, run the doctest examples embedded in this
# module's docstrings.
if __name__ == '__main__':
    import doctest

    doctest.testmod()
809