abydos.phonetic.de.koelner_phonetik_alpha() - Code Metrics - Inspection of "Modularize" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#120)

by Chris

created 2018-10-19 22:19 UTC

abydos.phonetic.de.koelner_phonetik_alpha() A

↳ Parent: abydos.phonetic.de

Complexity

Conditions

Size

Total Lines	17
Code Lines	2

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	1
eloc	2
nop	1
dl	0
loc	17
rs	10
c	0
b	0
f	0

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic.de.

The phonetic.de module implements the Kölner Phonetik and related
algorithms for German:

    - Kölner Phonetik
    - Phonem
    - Haase Phonetik
    - Reth-Schek Phonetik
"""

from __future__ import unicode_literals

from itertools import product
from unicodedata import normalize as unicode_normalize

from six import text_type
from six.moves import range

from . import _delete_consecutive_repeats

__all__ = ['haase_phonetik', 'koelner_phonetik',
           'koelner_phonetik_alpha', 'koelner_phonetik_num_to_alpha',
           'phonem', 'reth_schek_phonetik']


def koelner_phonetik(word):
    """Return the Kölner Phonetik (numeric output) code for a word.

    Based on the algorithm defined by :cite:`Postel:1969`.

    The code is made of digits only, but it is returned as a str because a
    code may start with 0.

    The word is uppercased, reduced to the letters A-Z, and each letter is
    mapped to a digit (some letters depend on their neighbours, 'H' yields
    nothing). Runs of equal digits are then collapsed and every '0' except
    a leading one is dropped.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as a numeric string
    :rtype: str

    >>> koelner_phonetik('Christopher')
    '478237'
    >>> koelner_phonetik('Niall')
    '65'
    >>> koelner_phonetik('Smith')
    '862'
    >>> koelner_phonetik('Schmidt')
    '862'
    >>> koelner_phonetik('Müller')
    '657'
    >>> koelner_phonetik('Zimmermann')
    '86766'
    """
    def _preceded_by(text, idx, letters):
        """Return True if text[idx] directly follows one of letters."""
        return idx > 0 and text[idx-1] in letters

    def _followed_by(text, idx, letters):
        """Return True if text[idx] directly precedes one of letters."""
        return idx+1 < len(text) and text[idx+1] in letters

    # Letters whose digit does not depend on context. 'H' is left out on
    # purpose: it contributes no digit. 'P', 'D', 'T', 'C' and 'X' are
    # context-sensitive and handled in the loop below.
    context_free = {}
    for letters, digit in (('AEIJOUY', '0'), ('B', '1'), ('FVW', '3'),
                           ('GKQ', '4'), ('L', '5'), ('MN', '6'),
                           ('R', '7'), ('SZ', '8')):
        context_free.update(dict.fromkeys(letters, digit))

    alphabet = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    word = unicode_normalize('NFKD', text_type(word.upper()))
    # NOTE(review): NFKD presumably splits a precomposed umlaut into its base
    # letter plus a combining mark, so the three umlaut replacements below
    # may never match -- confirm. The resulting code is the same either way
    # because every vowel maps to '0'.
    for special, expansion in (('ß', 'SS'), ('Ä', 'AE'), ('Ö', 'OE'),
                               ('Ü', 'UE')):
        word = word.replace(special, expansion)
    word = ''.join([char for char in word if char in alphabet])

    # Nothing left to encode
    if not word:
        return ''

    digits = []
    for idx, char in enumerate(word):
        if char in context_free:
            digits.append(context_free[char])
        elif char == 'P':
            # 'PH' sounds like 'F'
            digits.append('3' if _followed_by(word, idx, {'H'}) else '1')
        elif char in {'D', 'T'}:
            digits.append('8' if _followed_by(word, idx, {'C', 'S', 'Z'})
                          else '2')
        elif char == 'C':
            if _preceded_by(word, idx, {'S', 'Z'}):
                digits.append('8')
            else:
                # A word-initial 'C' is also hardened by a following L or R
                if idx == 0:
                    hardening = {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'}
                else:
                    hardening = {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}
                digits.append('4' if _followed_by(word, idx, hardening)
                              else '8')
        elif char == 'X':
            digits.append('8' if _preceded_by(word, idx, {'C', 'K', 'Q'})
                          else '48')

    sdx = _delete_consecutive_repeats(''.join(digits))

    # Keep a leading '0' but drop all later ones
    if sdx:
        sdx = sdx[:1] + sdx[1:].replace('0', '')

    return sdx


def koelner_phonetik_num_to_alpha(num):
    """Convert a Kölner Phonetik code from numeric to alphabetic.

    Each digit 0-8 is replaced by one representative letter
    (0->A, 1->P, 2->T, 3->F, 4->K, 5->L, 6->N, 7->R, 8->S); any other
    character in the input is discarded.

    :param str num: a numeric Kölner Phonetik representation (can be a str or
        an int)
    :returns: an alphabetic representation of the same word
    :rtype: str

    >>> koelner_phonetik_num_to_alpha('862')
    'SNT'
    >>> koelner_phonetik_num_to_alpha('657')
    'NLR'
    >>> koelner_phonetik_num_to_alpha('86766')
    'SNRNN'
    """
    letter_for_digit = dict(zip('012345678', 'APTFKLNRS'))
    return ''.join(letter_for_digit[digit] for digit in text_type(num)
                   if digit in letter_for_digit)


def koelner_phonetik_alpha(word):
    """Return the Kölner Phonetik (alphabetic output) code for a word.

    The numeric code from koelner_phonetik is computed first and then
    converted with koelner_phonetik_num_to_alpha.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as an alphabetic string
    :rtype: str

    >>> koelner_phonetik_alpha('Smith')
    'SNT'
    >>> koelner_phonetik_alpha('Schmidt')
    'SNT'
    >>> koelner_phonetik_alpha('Müller')
    'NLR'
    >>> koelner_phonetik_alpha('Zimmermann')
    'SNRNN'
    """
    numeric_code = koelner_phonetik(word)
    return koelner_phonetik_num_to_alpha(numeric_code)


def phonem(word):
    """Return the Phonem code for a word.

    Phonem is defined in :cite:`Wilde:1988`.

    This version is based on the Perl implementation documented at
    :cite:`Wilz:2005`.
    It includes some enhancements presented in the Java port at
    :cite:`dcm4che:2011`.

    Phonem is intended chiefly for German names/words.

    The word is uppercased and NFC-normalized, two-letter groups are
    rewritten in a fixed order, single characters are translated, runs of
    equal characters are collapsed, and finally only the characters of the
    Phonem output alphabet are kept.

    :param str word: the word to transform
    :returns: the Phonem value
    :rtype: str

    >>> phonem('Christopher')
    'CRYSDOVR'
    >>> phonem('Niall')
    'NYAL'
    >>> phonem('Smith')
    'SMYD'
    >>> phonem('Schmidt')
    'CMYD'
    """
    # Two-letter rewrites; applied one after another, so the order matters.
    digraph_rules = (
        ('SC', 'C'), ('SZ', 'C'), ('CZ', 'C'), ('TZ', 'C'), ('TS', 'C'),
        ('KS', 'X'), ('PF', 'V'), ('QU', 'KW'), ('PH', 'V'), ('UE', 'Y'),
        ('AE', 'E'), ('OE', 'Ö'), ('EI', 'AY'), ('EY', 'AY'), ('EU', 'OY'),
        ('AU', 'A§'), ('OU', '§'),
    )
    # Single-character translation; '§' is a placeholder introduced by the
    # 'AU'/'OU' rules above and is turned into 'U' here.
    source_chars = 'ZKGQÇÑßFWPTÁÀÂÃÅÄÆÉÈÊËIJÌÍÎÏÜÝ§ÚÙÛÔÒÓÕØ'
    target_chars = 'CCCCCNSVVBDAAAAAEEEEEEYYYYYYYYUUUUOOOOÖ'
    char_map = {ord(src): dst for src, dst in zip(source_chars, target_chars)}
    output_alphabet = {'A', 'B', 'C', 'D', 'L', 'M', 'N', 'O', 'R', 'S', 'U',
                       'V', 'W', 'X', 'Y', 'Ö'}

    word = unicode_normalize('NFC', text_type(word.upper()))
    for pattern, replacement in digraph_rules:
        word = word.replace(pattern, replacement)

    squeezed = _delete_consecutive_repeats(word.translate(char_map))
    return ''.join([char for char in squeezed if char in output_alphabet])


def haase_phonetik(word, primary_only=False):
    """Return the Haase Phonetik (numeric output) code for a word.

    Based on the algorithm described at :cite:`Prante:2015`.

    Based on the original :cite:`Haase:2000`.

    While the output code is numeric, it is nevertheless a str.

    Unless primary_only is set, alternative spellings of the word are
    generated from a fixed list of letter-group substitutions; every
    spelling is encoded and duplicate codes are removed, keeping the order
    of first appearance.

    :param str word: the word to transform
    :param bool primary_only: if True, only the primary code is returned
    :returns: the Haase Phonetik value(s), each as a numeric string
    :rtype: tuple

    >>> haase_phonetik('Joachim')
    ('9496',)
    >>> haase_phonetik('Christoph')
    ('4798293', '8798293')
    >>> haase_phonetik('Jörg')
    ('974',)
    >>> haase_phonetik('Smith')
    ('8692',)
    >>> haase_phonetik('Schmidt')
    ('8692', '4692')
    """
    def _preceded_by(text, idx, letters):
        """Return True if text[idx] directly follows one of letters."""
        return idx > 0 and text[idx-1] in letters

    def _followed_by(text, idx, letters):
        """Return True if text[idx] directly precedes one of letters."""
        return idx+1 < len(text) and text[idx+1] in letters

    # Letters whose digit does not depend on context. 'H' is left out on
    # purpose: it contributes no digit. 'P', 'D', 'T', 'C' and 'X' are
    # context-sensitive and handled inside _encode.
    context_free = {}
    for letters, digit in (('AEIJOUY', '9'), ('B', '1'), ('FVW', '3'),
                           ('GKQ', '4'), ('L', '5'), ('MN', '6'),
                           ('R', '7'), ('SZ', '8')):
        context_free.update(dict.fromkeys(letters, digit))

    alphabet = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    word = unicode_normalize('NFKD', text_type(word.upper()))
    # NOTE(review): NFKD presumably splits a precomposed umlaut into its base
    # letter plus a combining mark, so the three umlaut replacements below
    # may never match -- confirm. The resulting code is the same either way
    # because every vowel maps to '9' and repeated digits are collapsed.
    for special, expansion in (('ß', 'SS'), ('Ä', 'AE'), ('Ö', 'OE'),
                               ('Ü', 'UE')):
        word = word.replace(special, expansion)
    word = ''.join([char for char in word if char in alphabet])

    if primary_only:
        spellings = [word]
    else:
        trigraph_alternates = {'OWN': 'AUN', 'WSK': 'RSK', 'SCH': 'CH',
                               'GLI': 'LI', 'AUX': 'O', 'EUX': 'O'}
        # Each segment is a tuple of interchangeable spellings; its first
        # member is always the text actually consumed from the word.
        segments = []
        cursor = 0
        if word.startswith('CH'):
            segments.append(('CH', 'SCH'))
            cursor = 2
        while cursor < len(word):
            rest = word[cursor:]
            trigraph = rest[:3]
            if rest.startswith('ILLE'):
                options = ('ILLE', 'I')
            elif trigraph in trigraph_alternates:
                options = (trigraph, trigraph_alternates[trigraph])
            elif rest.startswith('RB'):
                options = ('RB', 'RW')
            elif rest == 'EAU':
                # only at the very end of the word
                options = ('EAU', 'O')
            elif rest == 'O':
                # word-final 'O'
                options = ('O', 'OW')
            elif rest == 'A':
                # word-final 'A'
                options = ('A', 'AR')
            else:
                options = (rest[0],)
            segments.append(options)
            cursor += len(options[0])

        spellings = [''.join(parts) for parts in product(*segments)]

    def _encode(spelling):
        """Return the digit code of one spelling, repeats collapsed."""
        digits = []
        for idx, char in enumerate(spelling):
            if char in context_free:
                digits.append(context_free[char])
            elif char == 'P':
                # 'PH' sounds like 'F'
                digits.append('3' if _followed_by(spelling, idx, {'H'})
                              else '1')
            elif char in {'D', 'T'}:
                digits.append('8' if _followed_by(spelling, idx,
                                                  {'C', 'S', 'Z'})
                              else '2')
            elif char == 'C':
                if _preceded_by(spelling, idx, {'S', 'Z'}):
                    digits.append('8')
                else:
                    # A word-initial 'C' is also hardened by L or R
                    if idx == 0:
                        hardening = {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U',
                                     'X'}
                    else:
                        hardening = {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}
                    digits.append('4' if _followed_by(spelling, idx, hardening)
                                  else '8')
            elif char == 'X':
                digits.append('8' if _preceded_by(spelling, idx,
                                                  {'C', 'K', 'Q'})
                              else '48')

        return _delete_consecutive_repeats(''.join(digits))

    # Encode every spelling, dropping codes that were already produced
    seen = set()
    unique_codes = []
    for spelling in spellings:
        code = _encode(spelling)
        if code not in seen:
            seen.add(code)
            unique_codes.append(code)

    return tuple(unique_codes)


def reth_schek_phonetik(word):
    """Return Reth-Schek Phonetik code for a word.

    This algorithm is proposed in :cite:`Reth:1977`.

    Since I couldn't secure a copy of that document (maybe I'll look for it
    next time I'm in Germany), this implementation is based on what I could
    glean from the implementations published by German Record Linkage
    Center (www.record-linkage.de):

    - Privacy-preserving Record Linkage (PPRL) (in R) :cite:`Rukasz:2018`
    - Merge ToolBox (in Java) :cite:`Schnell:2004`

    Rules that are unclear:

    - Should 'C' become 'G' or 'Z'? (PPRL has both, 'Z' rule blocked)
    - Should 'CC' become 'G'? (PPRL has blocked 'CK' that may be typo)
    - Should 'TUI' -> 'ZUI' rule exist? (PPRL has rule, but I can't
      think of a German word with '-tui-' in it.)
    - Should we really change 'SCH' -> 'CH' and then 'CH' -> 'SCH'?

    :param str word: the word to transform
    :returns: the Reth-Schek Phonetik code
    :rtype: str

    >>> reth_schek_phonetik('Joachim')
    'JOAGHIM'
    >>> reth_schek_phonetik('Christoph')
    'GHRISDOF'
    >>> reth_schek_phonetik('Jörg')
    'JOERG'
    >>> reth_schek_phonetik('Smith')
    'SMID'
    >>> reth_schek_phonetik('Schmidt')
    'SCHMID'
    """
    three_letter_rules = {
        'AEH': 'E', 'IEH': 'I', 'OEH': 'OE', 'UEH': 'UE', 'SCH': 'CH',
        'ZIO': 'TIO', 'TIU': 'TIO', 'ZIU': 'TIO', 'CHS': 'X', 'CKS': 'X',
        'AEU': 'OI',
    }
    two_letter_rules = {
        'LL': 'L', 'AA': 'A', 'AH': 'A', 'BB': 'B', 'PP': 'B', 'BP': 'B',
        'PB': 'B', 'DD': 'D', 'DT': 'D', 'TT': 'D', 'TH': 'D', 'EE': 'E',
        'EH': 'E', 'AE': 'E', 'FF': 'F', 'PH': 'F', 'KK': 'K', 'GG': 'G',
        'GK': 'G', 'KG': 'G', 'CK': 'G', 'CC': 'C', 'IE': 'I', 'IH': 'I',
        'MM': 'M', 'NN': 'N', 'OO': 'O', 'OH': 'O', 'SZ': 'S', 'UH': 'U',
        'GS': 'X', 'KS': 'X', 'TZ': 'Z', 'AY': 'AI', 'EI': 'AI', 'EY': 'AI',
        'EU': 'OI', 'RR': 'R', 'SS': 'S', 'KW': 'QU',
    }
    one_letter_rules = {'P': 'B', 'T': 'D', 'V': 'F', 'W': 'F', 'C': 'G',
                        'K': 'G', 'Y': 'I'}
    # Longest patterns are tried first at every position
    rule_tables = ((3, three_letter_rules), (2, two_letter_rules),
                   (1, one_letter_rules))

    word = word.upper()

    # Spell out umlauts and eszett
    for special, expansion in (('Ä', 'AE'), ('Ö', 'OE'), ('Ü', 'UE'),
                               ('ß', 'SS')):
        word = word.replace(special, expansion)

    # Scan left to right, rewriting in place. The cursor moves forward by
    # exactly one character whether or not a rule fired, so the tail of a
    # multi-letter replacement is examined again on the next step.
    cursor = 0
    while cursor < len(word):
        for size, table in rule_tables:
            chunk = word[cursor:cursor+size]
            if chunk in table:
                word = word[:cursor] + table[chunk] + word[cursor+size:]
                break
        cursor += 1

    # Change 'CH' back(?) to 'SCH'
    word = word.replace('CH', 'SCH')

    # Shorten the word ending; only the first matching rule is applied
    for ending, replacement in (('ER', 'R'), ('EL', 'L'), ('H', '')):
        if word[-len(ending):] == ending:
            word = word[:-len(ending)] + replacement
            break

    return word


if __name__ == '__main__':
    # Run as a script: execute the doctests embedded in this module
    from doctest import testmod
    testmod()


1		# -- coding: utf-8 --
2
3		# Copyright 2014-2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19		"""abydos.phonetic.de.
20
21		The phonetic.de module implements the Kölner Phonetik and related
22		algorithms for German:
23
24		- Kölner Phonetik
25		- Phonem
26		- Haase Phonetik
27		- Reth-Schek Phonetik
28		"""
29
30		from __future__ import unicode_literals
31
32		from itertools import product
33		from unicodedata import normalize as unicode_normalize
34
35		from six import text_type
36		from six.moves import range
37
38		from . import _delete_consecutive_repeats
39
40		__all__ = ['haase_phonetik', 'koelner_phonetik',
41		'koelner_phonetik_alpha', 'koelner_phonetik_num_to_alpha',
42		'phonem', 'reth_schek_phonetik']
43
44
45		def koelner_phonetik(word):
46		"""Return the Kölner Phonetik (numeric output) code for a word.
47
48		Based on the algorithm defined by :cite:`Postel:1969`.
49
50		While the output code is numeric, it is still a str because 0s can lead
51		the code.
52
53		:param str word: the word to transform
54		:returns: the Kölner Phonetik value as a numeric string
55		:rtype: str
56
57		>>> koelner_phonetik('Christopher')
58		'478237'
59		>>> koelner_phonetik('Niall')
60		'65'
61		>>> koelner_phonetik('Smith')
62		'862'
63		>>> koelner_phonetik('Schmidt')
64		'862'
65		>>> koelner_phonetik('Müller')
66		'657'
67		>>> koelner_phonetik('Zimmermann')
68		'86766'
69		"""
70		def _after(word, pos, letters):
71		"""Return True if word[i] follows one of the supplied letters."""
72		return pos > 0 and word[pos-1] in letters
73
74		def _before(word, pos, letters):
75		"""Return True if word[i] precedes one of the supplied letters."""
76		return pos+1 < len(word) and word[pos+1] in letters
77
78		_vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}
79
80		sdx = ''
81
82		word = unicode_normalize('NFKD', text_type(word.upper()))
83		word = word.replace('ß', 'SS')
84
85		word = word.replace('Ä', 'AE')
86		word = word.replace('Ö', 'OE')
87		word = word.replace('Ü', 'UE')
88		word = ''.join(c for c in word if c in
89		{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
90		'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
91		'Y', 'Z'})
92
93		# Nothing to convert, return base case
94		if not word:
95		return sdx
96
97		for i in range(len(word)):
98	View Code Duplication	if word[i] in _vowels:
		0 ignored issues – show Duplication introduced 2018-08-26 04:07 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
99		sdx += '0'
100		elif word[i] == 'B':
101		sdx += '1'
102		elif word[i] == 'P':
103		if _before(word, i, {'H'}):
104		sdx += '3'
105		else:
106		sdx += '1'
107		elif word[i] in {'D', 'T'}:
108		if _before(word, i, {'C', 'S', 'Z'}):
109		sdx += '8'
110		else:
111		sdx += '2'
112		elif word[i] in {'F', 'V', 'W'}:
113		sdx += '3'
114		elif word[i] in {'G', 'K', 'Q'}:
115		sdx += '4'
116		elif word[i] == 'C':
117		if _after(word, i, {'S', 'Z'}):
118		sdx += '8'
119		elif i == 0:
120		if _before(word, i, {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U',
121		'X'}):
122		sdx += '4'
123		else:
124		sdx += '8'
125		elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}):
126		sdx += '4'
127		else:
128		sdx += '8'
129		elif word[i] == 'X':
130		if _after(word, i, {'C', 'K', 'Q'}):
131		sdx += '8'
132		else:
133		sdx += '48'
134		elif word[i] == 'L':
135		sdx += '5'
136		elif word[i] in {'M', 'N'}:
137		sdx += '6'
138		elif word[i] == 'R':
139		sdx += '7'
140		elif word[i] in {'S', 'Z'}:
141		sdx += '8'
142
143		sdx = _delete_consecutive_repeats(sdx)
144
145		if sdx:
146		sdx = sdx[:1] + sdx[1:].replace('0', '')
147
148		return sdx
149
150
151		def koelner_phonetik_num_to_alpha(num):
152		"""Convert a Kölner Phonetik code from numeric to alphabetic.
153
154		:param str num: a numeric Kölner Phonetik representation (can be a str or
155		an int)
156		:returns: an alphabetic representation of the same word
157		:rtype: str
158
159		>>> koelner_phonetik_num_to_alpha('862')
160		'SNT'
161		>>> koelner_phonetik_num_to_alpha('657')
162		'NLR'
163		>>> koelner_phonetik_num_to_alpha('86766')
164		'SNRNN'
165		"""
166		_koelner_num_translation = dict(zip((ord(_) for _ in '012345678'),
		0 ignored issues – show Comprehensibility Best Practice introduced 2018-08-02 19:04 UTC by Report Bug Copy Issue Report The variable `_` does not seem to be defined. Loading history...
167		'APTFKLNRS'))
168		num = ''.join(c for c in text_type(num) if c in {'0', '1', '2', '3', '4',
169		'5', '6', '7', '8'})
170		return num.translate(_koelner_num_translation)
171
172
173		def koelner_phonetik_alpha(word):
174		"""Return the Kölner Phonetik (alphabetic output) code for a word.
175
176		:param str word: the word to transform
177		:returns: the Kölner Phonetik value as an alphabetic string
178		:rtype: str
179
180		>>> koelner_phonetik_alpha('Smith')
181		'SNT'
182		>>> koelner_phonetik_alpha('Schmidt')
183		'SNT'
184		>>> koelner_phonetik_alpha('Müller')
185		'NLR'
186		>>> koelner_phonetik_alpha('Zimmermann')
187		'SNRNN'
188		"""
189		return koelner_phonetik_num_to_alpha(koelner_phonetik(word))
190
191
192		def phonem(word):
193		"""Return the Phonem code for a word.
194
195		Phonem is defined in :cite:`Wilde:1988`.
196
197		This version is based on the Perl implementation documented at
198		:cite:`Wilz:2005`.
199		It includes some enhancements presented in the Java port at
200		:cite:`dcm4che:2011`.
201
202		Phonem is intended chiefly for German names/words.
203
204		:param str word: the word to transform
205		:returns: the Phonem value
206		:rtype: str
207
208		>>> phonem('Christopher')
209		'CRYSDOVR'
210		>>> phonem('Niall')
211		'NYAL'
212		>>> phonem('Smith')
213		'SMYD'
214		>>> phonem('Schmidt')
215		'CMYD'
216		"""
217		_phonem_substitutions = (('SC', 'C'), ('SZ', 'C'), ('CZ', 'C'),
218		('TZ', 'C'), ('TS', 'C'), ('KS', 'X'),
219		('PF', 'V'), ('QU', 'KW'), ('PH', 'V'),
220		('UE', 'Y'), ('AE', 'E'), ('OE', 'Ö'),
221		('EI', 'AY'), ('EY', 'AY'), ('EU', 'OY'),
222		('AU', 'A§'), ('OU', '§'))
223		_phonem_translation = dict(zip((ord(_) for _ in
		0 ignored issues – show Comprehensibility Best Practice introduced 2018-08-02 19:04 UTC by Report Bug Copy Issue Report The variable `_` does not seem to be defined. Loading history...
224		'ZKGQÇÑßFWPTÁÀÂÃÅÄÆÉÈÊËIJÌÍÎÏÜÝ§ÚÙÛÔÒÓÕØ'),
225		'CCCCCNSVVBDAAAAAEEEEEEYYYYYYYYUUUUOOOOÖ'))
226
227		word = unicode_normalize('NFC', text_type(word.upper()))
228		for i, j in _phonem_substitutions:
229		word = word.replace(i, j)
230		word = word.translate(_phonem_translation)
231
232		return ''.join(c for c in _delete_consecutive_repeats(word)
233		if c in {'A', 'B', 'C', 'D', 'L', 'M', 'N', 'O', 'R', 'S',
234		'U', 'V', 'W', 'X', 'Y', 'Ö'})
235
236
237		def haase_phonetik(word, primary_only=False):
238		"""Return the Haase Phonetik (numeric output) code for a word.
239
240		Based on the algorithm described at :cite:`Prante:2015`.
241
242		Based on the original :cite:`Haase:2000`.
243
244		While the output code is numeric, it is nevertheless a str.
245
246		:param str word: the word to transform
247		:param bool primary_only: if True, only the primary code is returned
248		:returns: the Haase Phonetik value as a numeric string
249		:rtype: tuple
250
251		>>> haase_phonetik('Joachim')
252		('9496',)
253		>>> haase_phonetik('Christoph')
254		('4798293', '8798293')
255		>>> haase_phonetik('Jörg')
256		('974',)
257		>>> haase_phonetik('Smith')
258		('8692',)
259		>>> haase_phonetik('Schmidt')
260		('8692', '4692')
261		"""
262		def _after(word, i, letters):
263		"""Return True if word[i] follows one of the supplied letters."""
264		if i > 0 and word[i-1] in letters:
265		return True
266		return False
267
268		def _before(word, i, letters):
269		"""Return True if word[i] precedes one of the supplied letters."""
270		if i+1 < len(word) and word[i+1] in letters:
271		return True
272		return False
273
274		_vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}
275
276		word = unicode_normalize('NFKD', text_type(word.upper()))
277		word = word.replace('ß', 'SS')
278
279		word = word.replace('Ä', 'AE')
280		word = word.replace('Ö', 'OE')
281		word = word.replace('Ü', 'UE')
282		word = ''.join(c for c in word if c in
283		{'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
284		'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
285		'Y', 'Z'})
286
287		variants = []
288		if primary_only:
289		variants = [word]
290		else:
291		pos = 0
292		if word[:2] == 'CH':
293		variants.append(('CH', 'SCH'))
294		pos += 2
295		len_3_vars = {'OWN': 'AUN', 'WSK': 'RSK', 'SCH': 'CH', 'GLI': 'LI',
296		'AUX': 'O', 'EUX': 'O'}
297		while pos < len(word):
298		if word[pos:pos+4] == 'ILLE':
299		variants.append(('ILLE', 'I'))
300		pos += 4
301		elif word[pos:pos+3] in len_3_vars:
302		variants.append((word[pos:pos+3], len_3_vars[word[pos:pos+3]]))
303		pos += 3
304		elif word[pos:pos+2] == 'RB':
305		variants.append(('RB', 'RW'))
306		pos += 2
307		elif len(word[pos:]) == 3 and word[pos:] == 'EAU':
308		variants.append(('EAU', 'O'))
309		pos += 3
310		elif len(word[pos:]) == 1 and word[pos:] in {'A', 'O'}:
311		if word[pos:] == 'O':
312		variants.append(('O', 'OW'))
313		else:
314		variants.append(('A', 'AR'))
315		pos += 1
316		else:
317		variants.append((word[pos],))
318		pos += 1
319
320		variants = [''.join(letters) for letters in product(*variants)]
321
322		def _haase_code(word):
323		sdx = ''
324		for i in range(len(word)):
325	View Code Duplication	if word[i] in _vowels:
		0 ignored issues – show Duplication introduced 2018-08-26 04:07 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
326		sdx += '9'
327		elif word[i] == 'B':
328		sdx += '1'
329		elif word[i] == 'P':
330		if _before(word, i, {'H'}):
331		sdx += '3'
332		else:
333		sdx += '1'
334		elif word[i] in {'D', 'T'}:
335		if _before(word, i, {'C', 'S', 'Z'}):
336		sdx += '8'
337		else:
338		sdx += '2'
339		elif word[i] in {'F', 'V', 'W'}:
340		sdx += '3'
341		elif word[i] in {'G', 'K', 'Q'}:
342		sdx += '4'
343		elif word[i] == 'C':
344		if _after(word, i, {'S', 'Z'}):
345		sdx += '8'
346		elif i == 0:
347		if _before(word, i, {'A', 'H', 'K', 'L', 'O', 'Q', 'R',
348		'U', 'X'}):
349		sdx += '4'
350		else:
351		sdx += '8'
352		elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}):
353		sdx += '4'
354		else:
355		sdx += '8'
356		elif word[i] == 'X':
357		if _after(word, i, {'C', 'K', 'Q'}):
358		sdx += '8'
359		else:
360		sdx += '48'
361		elif word[i] == 'L':
362		sdx += '5'
363		elif word[i] in {'M', 'N'}:
364		sdx += '6'
365		elif word[i] == 'R':
366		sdx += '7'
367		elif word[i] in {'S', 'Z'}:
368		sdx += '8'
369
370		sdx = _delete_consecutive_repeats(sdx)
371
372		return sdx
373
374		encoded = tuple(_haase_code(word) for word in variants)
375		if len(encoded) > 1:
376		encoded_set = set()
377		encoded_single = []
378		for code in encoded:
379		if code not in encoded_set:
380		encoded_set.add(code)
381		encoded_single.append(code)
382		return tuple(encoded_single)
383
384		return encoded
385
386
387		def reth_schek_phonetik(word):
388		"""Return Reth-Schek Phonetik code for a word.
389
390		This algorithm is proposed in :cite:`Reth:1977`.
391
392		Since I couldn't secure a copy of that document (maybe I'll look for it
393		next time I'm in Germany), this implementation is based on what I could
394		glean from the implementations published by German Record Linkage
395		Center (www.record-linkage.de):
396
397		- Privacy-preserving Record Linkage (PPRL) (in R) :cite:`Rukasz:2018`
398		- Merge ToolBox (in Java) :cite:`Schnell:2004`
399
400		Rules that are unclear:
401
402		- Should 'C' become 'G' or 'Z'? (PPRL has both, 'Z' rule blocked)
403		- Should 'CC' become 'G'? (PPRL has blocked 'CK' that may be typo)
404		- Should 'TUI' -> 'ZUI' rule exist? (PPRL has rule, but I can't
405		think of a German word with '-tui-' in it.)
406		- Should we really change 'SCH' -> 'CH' and then 'CH' -> 'SCH'?
407
408		:param str word: the word to transform
409		:returns: the Reth-Schek Phonetik code
410		:rtype: str
411
412		>>> reth_schek_phonetik('Joachim')
413		'JOAGHIM'
414		>>> reth_schek_phonetik('Christoph')
415		'GHRISDOF'
416		>>> reth_schek_phonetik('Jörg')
417		'JOERG'
418		>>> reth_schek_phonetik('Smith')
419		'SMID'
420		>>> reth_schek_phonetik('Schmidt')
421		'SCHMID'
422		"""
423		replacements = {3: {'AEH': 'E', 'IEH': 'I', 'OEH': 'OE', 'UEH': 'UE',
424		'SCH': 'CH', 'ZIO': 'TIO', 'TIU': 'TIO', 'ZIU': 'TIO',
425		'CHS': 'X', 'CKS': 'X', 'AEU': 'OI'},
426		2: {'LL': 'L', 'AA': 'A', 'AH': 'A', 'BB': 'B', 'PP': 'B',
427		'BP': 'B', 'PB': 'B', 'DD': 'D', 'DT': 'D', 'TT': 'D',
428		'TH': 'D', 'EE': 'E', 'EH': 'E', 'AE': 'E', 'FF': 'F',
429		'PH': 'F', 'KK': 'K', 'GG': 'G', 'GK': 'G', 'KG': 'G',
430		'CK': 'G', 'CC': 'C', 'IE': 'I', 'IH': 'I', 'MM': 'M',
431		'NN': 'N', 'OO': 'O', 'OH': 'O', 'SZ': 'S', 'UH': 'U',
432		'GS': 'X', 'KS': 'X', 'TZ': 'Z', 'AY': 'AI',
433		'EI': 'AI', 'EY': 'AI', 'EU': 'OI', 'RR': 'R',
434		'SS': 'S', 'KW': 'QU'},
435		1: {'P': 'B', 'T': 'D', 'V': 'F', 'W': 'F', 'C': 'G',
436		'K': 'G', 'Y': 'I'}}
437
438		# Uppercase
439		word = word.upper()
440
441		# Replace umlauts/eszett
442		word = word.replace('Ä', 'AE')
443		word = word.replace('Ö', 'OE')
444		word = word.replace('Ü', 'UE')
445		word = word.replace('ß', 'SS')
446
447		# Main loop, using above replacements table
448		pos = 0
449		while pos < len(word):
450		for num in range(3, 0, -1):
451		if word[pos:pos+num] in replacements[num]:
452		word = (word[:pos] + replacements[num][word[pos:pos+num]]
453		+ word[pos+num:])
454		pos += 1
455		break
456		else:
457		pos += 1 # Advance if nothing is recognized
458
459		# Change 'CH' back(?) to 'SCH'
460		word = word.replace('CH', 'SCH')
461
462		# Replace final sequences
463		if word[-2:] == 'ER':
464		word = word[:-2]+'R'
465		elif word[-2:] == 'EL':
466		word = word[:-2]+'L'
467		elif word[-1:] == 'H':
468		word = word[:-1]
469
470		return word
471
472
473		if __name__ == '__main__':
474		import doctest
475		doctest.testmod()
476

chrislit / abydos

Pull Request — master (#120)

abydos.phonetic.de.koelner_phonetik_alpha() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like