abydos.phonetic.de.phonem() - Code Metrics - Inspection of "applied Black codestyle" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 6ed6e1...91db7a )

by Chris

created 2018-10-24 05:47 UTC

abydos.phonetic.de.phonem() B

↳ Parent: abydos.phonetic.de

Complexity

Conditions

Size

Total Lines	77
Code Lines	48

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	8
CRAP Score	2

Importance

Changes

Metric	Value
cc	2
eloc	48
nop	1
dl	0
loc	77
ccs	8
cts	8
cp	1
crap	2
rs	8.7018
c	0
b	0
f	0

How to fix Long Method

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic.de.

The phonetic.de module implements the Kölner Phonetik and related
algorithms for German:

    - Kölner Phonetik
    - Phonem
    - Haase Phonetik
    - Reth-Schek Phonetik
"""

from __future__ import unicode_literals

from itertools import product
from unicodedata import normalize as unicode_normalize

from six import text_type
from six.moves import range

from . import _delete_consecutive_repeats

# Public API of this module: the names exported by a star-import.
__all__ = [
    'haase_phonetik',
    'koelner_phonetik',
    'koelner_phonetik_alpha',
    'koelner_phonetik_num_to_alpha',
    'phonem',
    'reth_schek_phonetik',
]


def koelner_phonetik(word):
    """Return the Kölner Phonetik (numeric output) code for a word.

    Based on the algorithm defined by :cite:`Postel:1969`.

    While the output code is numeric, it is still a str because 0s can lead
    the code.

    The word is uppercased, reduced to the letters A-Z, and every letter is
    mapped to a digit (some depending on the neighbouring letter). Runs of
    identical digits are then collapsed and every '0' except a leading one
    is removed.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as a numeric string (empty if the
        word contains no codable letters)
    :rtype: str

    >>> koelner_phonetik('Christopher')
    '478237'
    >>> koelner_phonetik('Niall')
    '65'
    >>> koelner_phonetik('Smith')
    '862'
    >>> koelner_phonetik('Schmidt')
    '862'
    >>> koelner_phonetik('Müller')
    '657'
    >>> koelner_phonetik('Zimmermann')
    '86766'
    """

    def _after(word, pos, letters):
        """Return True if word[pos] follows one of the supplied letters."""
        return pos > 0 and word[pos - 1] in letters

    def _before(word, pos, letters):
        """Return True if word[pos] precedes one of the supplied letters."""
        return pos + 1 < len(word) and word[pos + 1] in letters

    _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}
    _letters = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    # Following letters for which 'C' is coded 4 instead of 8. The
    # word-initial set additionally contains 'L' and 'R'.
    _c_initial_followers = {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'}
    _c_medial_followers = {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}

    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    # NOTE(review): after NFKD the umlauts have presumably already been
    # decomposed into base letter + combining mark, so these three
    # replacements look like no-ops (the combining mark is dropped by the
    # A-Z filter below). Confirm before relying on the AE/OE/UE expansion.
    word = word.replace('Ä', 'AE')
    word = word.replace('Ö', 'OE')
    word = word.replace('Ü', 'UE')
    word = ''.join(c for c in word if c in _letters)

    # Nothing to convert, return base case
    if not word:
        return ''

    # Collect one code fragment per letter and join once at the end instead
    # of growing a string inside the loop.
    codes = []
    for i, char in enumerate(word):
        if char in _vowels:
            codes.append('0')
        elif char == 'B':
            codes.append('1')
        elif char == 'P':
            codes.append('3' if _before(word, i, {'H'}) else '1')
        elif char in {'D', 'T'}:
            codes.append('8' if _before(word, i, {'C', 'S', 'Z'}) else '2')
        elif char in {'F', 'V', 'W'}:
            codes.append('3')
        elif char in {'G', 'K', 'Q'}:
            codes.append('4')
        elif char == 'C':
            if _after(word, i, {'S', 'Z'}):
                codes.append('8')
            else:
                followers = (
                    _c_initial_followers if i == 0 else _c_medial_followers
                )
                codes.append('4' if _before(word, i, followers) else '8')
        elif char == 'X':
            codes.append('8' if _after(word, i, {'C', 'K', 'Q'}) else '48')
        elif char == 'L':
            codes.append('5')
        elif char in {'M', 'N'}:
            codes.append('6')
        elif char == 'R':
            codes.append('7')
        elif char in {'S', 'Z'}:
            codes.append('8')
        # 'H' matches no branch and therefore contributes no digit.

    sdx = _delete_consecutive_repeats(''.join(codes))

    # Keep a leading '0' but drop every later one (safe on an empty code).
    return sdx[:1] + sdx[1:].replace('0', '')


def koelner_phonetik_num_to_alpha(num):
    """Convert a Kölner Phonetik code from numeric to alphabetic.

    Each digit 0-8 is replaced by its representative letter; any other
    character in the input is silently dropped.

    :param str num: a numeric Kölner Phonetik representation (can be a str or
        an int)
    :returns: an alphabetic representation of the same word
    :rtype: str

    >>> koelner_phonetik_num_to_alpha('862')
    'SNT'
    >>> koelner_phonetik_num_to_alpha('657')
    'NLR'
    >>> koelner_phonetik_num_to_alpha('86766')
    'SNRNN'
    """
    # Digit -> representative letter, paired position by position.
    letter_for_digit = dict(zip('012345678', 'APTFKLNRS'))

    letters = []
    for digit in text_type(num):
        if digit in letter_for_digit:
            letters.append(letter_for_digit[digit])
    return ''.join(letters)


def koelner_phonetik_alpha(word):
    """Return the Kölner Phonetik (alphabetic output) code for a word.

    This is the numeric code from koelner_phonetik, converted to letters by
    koelner_phonetik_num_to_alpha.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as an alphabetic string
    :rtype: str

    >>> koelner_phonetik_alpha('Smith')
    'SNT'
    >>> koelner_phonetik_alpha('Schmidt')
    'SNT'
    >>> koelner_phonetik_alpha('Müller')
    'NLR'
    >>> koelner_phonetik_alpha('Zimmermann')
    'SNRNN'
    """
    numeric_code = koelner_phonetik(word)
    return koelner_phonetik_num_to_alpha(numeric_code)


def phonem(word):
    """Return the Phonem code for a word.

    Phonem is defined in :cite:`Wilde:1988`.

    This version is based on the Perl implementation documented at
    :cite:`Wilz:2005`.
    It includes some enhancements presented in the Java port at
    :cite:`dcm4che:2011`.

    Phonem is intended chiefly for German names/words.

    The word is uppercased and NFC-normalized, a fixed sequence of
    two-letter rewrites is applied, single characters are translated,
    consecutive repeats are collapsed and finally only the characters of
    the output alphabet are kept.

    :param str word: the word to transform
    :returns: the Phonem value
    :rtype: str

    >>> phonem('Christopher')
    'CRYSDOVR'
    >>> phonem('Niall')
    'NYAL'
    >>> phonem('Smith')
    'SMYD'
    >>> phonem('Schmidt')
    'CMYD'
    """
    # Two-letter rewrites, applied one after another in exactly this order.
    # '§' is a temporary marker that the character table below turns into
    # 'U'.
    digraph_rules = (
        ('SC', 'C'),
        ('SZ', 'C'),
        ('CZ', 'C'),
        ('TZ', 'C'),
        ('TS', 'C'),
        ('KS', 'X'),
        ('PF', 'V'),
        ('QU', 'KW'),
        ('PH', 'V'),
        ('UE', 'Y'),
        ('AE', 'E'),
        ('OE', 'Ö'),
        ('EI', 'AY'),
        ('EY', 'AY'),
        ('EU', 'OY'),
        ('AU', 'A§'),
        ('OU', '§'),
    )

    # Single-character translation: the n-th source character maps to the
    # n-th target character.
    source_chars = 'ZKGQÇÑßFWPTÁÀÂÃÅÄÆÉÈÊËIJÌÍÎÏÜÝ§ÚÙÛÔÒÓÕØ'
    target_chars = 'CCCCCNSVVBDAAAAAEEEEEEYYYYYYYYUUUUOOOOÖ'
    char_table = {
        ord(src): dst for src, dst in zip(source_chars, target_chars)
    }

    # Only these characters may appear in the resulting code.
    output_alphabet = set('ABCDLMNORSUVWXYÖ')

    code = unicode_normalize('NFC', text_type(word.upper()))
    for old, new in digraph_rules:
        code = code.replace(old, new)
    code = code.translate(char_table)

    # Collapse repeats first, then discard everything outside the alphabet.
    collapsed = _delete_consecutive_repeats(code)
    return ''.join([char for char in collapsed if char in output_alphabet])


def haase_phonetik(word, primary_only=False):
    """Return the Haase Phonetik (numeric output) code for a word.

    Based on the algorithm described at :cite:`Prante:2015`.

    Based on the original :cite:`Haase:2000`.

    While the output code is numeric, it is nevertheless a str.

    Unless ``primary_only`` is set, several spelling variants of the word
    are generated (e.g. initial 'CH' is also tried as 'SCH'), each variant
    is encoded, and the distinct codes are returned in the order they were
    first produced.

    :param str word: the word to transform
    :param bool primary_only: if True, only the primary code is returned
    :returns: the Haase Phonetik value(s), each as a numeric string
    :rtype: tuple

    >>> haase_phonetik('Joachim')
    ('9496',)
    >>> haase_phonetik('Christoph')
    ('4798293', '8798293')
    >>> haase_phonetik('Jörg')
    ('974',)
    >>> haase_phonetik('Smith')
    ('8692',)
    >>> haase_phonetik('Schmidt')
    ('8692', '4692')
    """

    def _after(word, i, letters):
        """Return True if word[i] follows one of the supplied letters."""
        return i > 0 and word[i - 1] in letters

    def _before(word, i, letters):
        """Return True if word[i] precedes one of the supplied letters."""
        return i + 1 < len(word) and word[i + 1] in letters

    _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}
    _letters = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    # Following letters for which 'C' is coded 4 instead of 8. The
    # word-initial set additionally contains 'L' and 'R'.
    _c_initial_followers = {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'}
    _c_medial_followers = {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}
    # Three-letter sequences and the alternative spelling tried for each.
    _len_3_vars = {
        'OWN': 'AUN',
        'WSK': 'RSK',
        'SCH': 'CH',
        'GLI': 'LI',
        'AUX': 'O',
        'EUX': 'O',
    }

    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    # NOTE(review): after NFKD the umlauts have presumably already been
    # decomposed into base letter + combining mark, so these three
    # replacements look like no-ops (the combining mark is dropped by the
    # A-Z filter below). Confirm before relying on the AE/OE/UE expansion.
    word = word.replace('Ä', 'AE')
    word = word.replace('Ö', 'OE')
    word = word.replace('Ü', 'UE')
    word = ''.join(c for c in word if c in _letters)

    if primary_only:
        variants = [word]
    else:
        # Split the word into segments; each segment is a tuple holding the
        # original spelling and, where a rule applies, one alternative.
        segments = []
        pos = 0
        if word[:2] == 'CH':
            segments.append(('CH', 'SCH'))
            pos = 2
        while pos < len(word):
            trigram = word[pos : pos + 3]
            if word[pos : pos + 4] == 'ILLE':
                segments.append(('ILLE', 'I'))
                pos += 4
            elif trigram in _len_3_vars:
                segments.append((trigram, _len_3_vars[trigram]))
                pos += 3
            elif word[pos : pos + 2] == 'RB':
                segments.append(('RB', 'RW'))
                pos += 2
            elif word[pos:] == 'EAU':  # only at the very end of the word
                segments.append(('EAU', 'O'))
                pos += 3
            elif word[pos:] == 'O':  # word-final 'O'
                segments.append(('O', 'OW'))
                pos += 1
            elif word[pos:] == 'A':  # word-final 'A'
                segments.append(('A', 'AR'))
                pos += 1
            else:
                segments.append((word[pos],))
                pos += 1

        # Every combination of segment spellings is one variant.
        variants = [''.join(parts) for parts in product(*segments)]

    def _haase_code(word):
        """Return the digit code of one variant, consecutive repeats removed."""
        codes = []
        for i, char in enumerate(word):
            if char in _vowels:
                codes.append('9')
            elif char == 'B':
                codes.append('1')
            elif char == 'P':
                codes.append('3' if _before(word, i, {'H'}) else '1')
            elif char in {'D', 'T'}:
                codes.append(
                    '8' if _before(word, i, {'C', 'S', 'Z'}) else '2'
                )
            elif char in {'F', 'V', 'W'}:
                codes.append('3')
            elif char in {'G', 'K', 'Q'}:
                codes.append('4')
            elif char == 'C':
                if _after(word, i, {'S', 'Z'}):
                    codes.append('8')
                else:
                    followers = (
                        _c_initial_followers
                        if i == 0
                        else _c_medial_followers
                    )
                    codes.append('4' if _before(word, i, followers) else '8')
            elif char == 'X':
                codes.append(
                    '8' if _after(word, i, {'C', 'K', 'Q'}) else '48'
                )
            elif char == 'L':
                codes.append('5')
            elif char in {'M', 'N'}:
                codes.append('6')
            elif char == 'R':
                codes.append('7')
            elif char in {'S', 'Z'}:
                codes.append('8')
            # 'H' matches no branch and therefore contributes no digit.

        return _delete_consecutive_repeats(''.join(codes))

    # Encode every variant, keeping only the first occurrence of each code.
    seen = set()
    unique_codes = []
    for variant in variants:
        code = _haase_code(variant)
        if code not in seen:
            seen.add(code)
            unique_codes.append(code)
    return tuple(unique_codes)


def reth_schek_phonetik(word):
    """Return Reth-Schek Phonetik code for a word.

    This algorithm is proposed in :cite:`Reth:1977`.

    Since I couldn't secure a copy of that document (maybe I'll look for it
    next time I'm in Germany), this implementation is based on what I could
    glean from the implementations published by German Record Linkage
    Center (www.record-linkage.de):

    - Privacy-preserving Record Linkage (PPRL) (in R) :cite:`Rukasz:2018`
    - Merge ToolBox (in Java) :cite:`Schnell:2004`

    Rules that are unclear:

    - Should 'C' become 'G' or 'Z'? (PPRL has both, 'Z' rule blocked)
    - Should 'CC' become 'G'? (PPRL has blocked 'CK' that may be typo)
    - Should 'TUI' -> 'ZUI' rule exist? (PPRL has rule, but I can't
      think of a German word with '-tui-' in it.)
    - Should we really change 'SCH' -> 'CH' and then 'CH' -> 'SCH'?

    :param str word: the word to transform
    :returns: the Reth-Schek Phonetik code
    :rtype: str

    >>> reth_schek_phonetik('Joachim')
    'JOAGHIM'
    >>> reth_schek_phonetik('Christoph')
    'GHRISDOF'
    >>> reth_schek_phonetik('Jörg')
    'JOERG'
    >>> reth_schek_phonetik('Smith')
    'SMID'
    >>> reth_schek_phonetik('Schmidt')
    'SCHMID'
    """
    trigraph_rules = {
        'AEH': 'E',
        'IEH': 'I',
        'OEH': 'OE',
        'UEH': 'UE',
        'SCH': 'CH',
        'ZIO': 'TIO',
        'TIU': 'TIO',
        'ZIU': 'TIO',
        'CHS': 'X',
        'CKS': 'X',
        'AEU': 'OI',
    }
    digraph_rules = {
        'LL': 'L',
        'AA': 'A',
        'AH': 'A',
        'BB': 'B',
        'PP': 'B',
        'BP': 'B',
        'PB': 'B',
        'DD': 'D',
        'DT': 'D',
        'TT': 'D',
        'TH': 'D',
        'EE': 'E',
        'EH': 'E',
        'AE': 'E',
        'FF': 'F',
        'PH': 'F',
        'KK': 'K',
        'GG': 'G',
        'GK': 'G',
        'KG': 'G',
        'CK': 'G',
        'CC': 'C',
        'IE': 'I',
        'IH': 'I',
        'MM': 'M',
        'NN': 'N',
        'OO': 'O',
        'OH': 'O',
        'SZ': 'S',
        'UH': 'U',
        'GS': 'X',
        'KS': 'X',
        'TZ': 'Z',
        'AY': 'AI',
        'EI': 'AI',
        'EY': 'AI',
        'EU': 'OI',
        'RR': 'R',
        'SS': 'S',
        'KW': 'QU',
    }
    single_rules = {
        'P': 'B',
        'T': 'D',
        'V': 'F',
        'W': 'F',
        'C': 'G',
        'K': 'G',
        'Y': 'I',
    }
    # Longest patterns are tried first at every position.
    rules_by_length = (
        (3, trigraph_rules),
        (2, digraph_rules),
        (1, single_rules),
    )

    code = word.upper()

    # Spell out umlauts and eszett
    for special, spelled_out in (
        ('Ä', 'AE'),
        ('Ö', 'OE'),
        ('Ü', 'UE'),
        ('ß', 'SS'),
    ):
        code = code.replace(special, spelled_out)

    # Walk the word left to right. At each position apply at most one rule
    # (the longest that matches), then step forward by a single character
    # whether or not anything was replaced.
    pos = 0
    while pos < len(code):
        for length, rules in rules_by_length:
            chunk = code[pos : pos + length]
            if chunk in rules:
                code = code[:pos] + rules[chunk] + code[pos + length :]
                break
        pos += 1

    # Change 'CH' back(?) to 'SCH'
    code = code.replace('CH', 'SCH')

    # Shorten the word ending: '-ER' -> '-R', '-EL' -> '-L', drop final 'H'
    if code.endswith('ER'):
        return code[:-2] + 'R'
    if code.endswith('EL'):
        return code[:-2] + 'L'
    if code.endswith('H'):
        return code[:-1]
    return code


if __name__ == '__main__':
    # Executed as a script: run the doctests embedded in this module.
    import doctest

    doctest.testmod()


1			# -*- coding: utf-8 -*-
2
3			# Copyright 2014-2018 by Christopher C. Little.
4			# This file is part of Abydos.
5			#
6			# Abydos is free software: you can redistribute it and/or modify
7			# it under the terms of the GNU General Public License as published by
8			# the Free Software Foundation, either version 3 of the License, or
9			# (at your option) any later version.
10			#
11			# Abydos is distributed in the hope that it will be useful,
12			# but WITHOUT ANY WARRANTY; without even the implied warranty of
13			# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14			# GNU General Public License for more details.
15			#
16			# You should have received a copy of the GNU General Public License
17			# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1		"""abydos.phonetic.de.
20
21			The phonetic.de module implements the Kölner Phonetik and related
22			algorithms for German:
23
24			- Kölner Phonetik
25			- Phonem
26			- Haase Phonetik
27			- Reth-Schek Phonetik
28			"""
29
30	1		from __future__ import unicode_literals
31
32	1		from itertools import product
33	1		from unicodedata import normalize as unicode_normalize
34
35	1		from six import text_type
36	1		from six.moves import range
37
38	1		from . import _delete_consecutive_repeats
39
40	1		__all__ = [
41			'haase_phonetik',
42			'koelner_phonetik',
43			'koelner_phonetik_alpha',
44			'koelner_phonetik_num_to_alpha',
45			'phonem',
46			'reth_schek_phonetik',
47			]
48
49
50	1		def koelner_phonetik(word):
51			"""Return the Kölner Phonetik (numeric output) code for a word.
52
53			Based on the algorithm defined by :cite:`Postel:1969`.
54
55			While the output code is numeric, it is still a str because 0s can lead
56			the code.
57
58			:param str word: the word to transform
59			:returns: the Kölner Phonetik value as a numeric string
60			:rtype: str
61
62			>>> koelner_phonetik('Christopher')
63			'478237'
64			>>> koelner_phonetik('Niall')
65			'65'
66			>>> koelner_phonetik('Smith')
67			'862'
68			>>> koelner_phonetik('Schmidt')
69			'862'
70			>>> koelner_phonetik('Müller')
71			'657'
72			>>> koelner_phonetik('Zimmermann')
73			'86766'
74			"""
75
76	1		def _after(word, pos, letters):
77			"""Return True if word[i] follows one of the supplied letters."""
78	1		return pos > 0 and word[pos - 1] in letters
79
80	1		def _before(word, pos, letters):
81			"""Return True if word[i] precedes one of the supplied letters."""
82	1		return pos + 1 < len(word) and word[pos + 1] in letters
83
84	1		_vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}
85
86	1		sdx = ''
87
88	1		word = unicode_normalize('NFKD', text_type(word.upper()))
89	1		word = word.replace('ß', 'SS')
90
91	1		word = word.replace('Ä', 'AE')
92	1		word = word.replace('Ö', 'OE')
93	1		word = word.replace('Ü', 'UE')
94	1		word = ''.join(
95			c
96			for c in word
97			if c
98			in {
99			'A',
100			'B',
101			'C',
102			'D',
103			'E',
104			'F',
105			'G',
106			'H',
107			'I',
108			'J',
109			'K',
110			'L',
111			'M',
112			'N',
113			'O',
114			'P',
115			'Q',
116			'R',
117			'S',
118			'T',
119			'U',
120			'V',
121			'W',
122			'X',
123			'Y',
124			'Z',
125			}
126			)
127
128			# Nothing to convert, return base case
129	1		if not word:
130	1		return sdx
131
132	1		for i in range(len(word)):
			0 ignored issues – show unused-code introduced 2018-08-02 19:04 UTC by Report Bug Copy Issue Report Consider using enumerate instead of iterating with range and len Loading history...
133	1	View Code Duplication	if word[i] in _vowels:
			0 ignored issues – show Duplication introduced 2018-08-26 04:07 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
134	1		sdx += '0'
135	1		elif word[i] == 'B':
136	1		sdx += '1'
137	1		elif word[i] == 'P':
138	1		if _before(word, i, {'H'}):
139	1		sdx += '3'
140			else:
141	1		sdx += '1'
142	1		elif word[i] in {'D', 'T'}:
143	1		if _before(word, i, {'C', 'S', 'Z'}):
144	1		sdx += '8'
145			else:
146	1		sdx += '2'
147	1		elif word[i] in {'F', 'V', 'W'}:
148	1		sdx += '3'
149	1		elif word[i] in {'G', 'K', 'Q'}:
150	1		sdx += '4'
151	1		elif word[i] == 'C':
152	1		if _after(word, i, {'S', 'Z'}):
153	1		sdx += '8'
154	1		elif i == 0:
155	1		if _before(
156			word, i, {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'}
			0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
157			):
158	1		sdx += '4'
159			else:
160	1		sdx += '8'
161	1		elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}):
162	1		sdx += '4'
163			else:
164	1		sdx += '8'
165	1		elif word[i] == 'X':
166	1		if _after(word, i, {'C', 'K', 'Q'}):
167	1		sdx += '8'
168			else:
169	1		sdx += '48'
170	1		elif word[i] == 'L':
171	1		sdx += '5'
172	1		elif word[i] in {'M', 'N'}:
173	1		sdx += '6'
174	1		elif word[i] == 'R':
175	1		sdx += '7'
176	1		elif word[i] in {'S', 'Z'}:
177	1		sdx += '8'
178
179	1		sdx = _delete_consecutive_repeats(sdx)
180
181	1		if sdx:
182	1		sdx = sdx[:1] + sdx[1:].replace('0', '')
183
184	1		return sdx
185
186
187	1		def koelner_phonetik_num_to_alpha(num):
188			"""Convert a Kölner Phonetik code from numeric to alphabetic.
189
190			:param str num: a numeric Kölner Phonetik representation (can be a str or
191			an int)
192			:returns: an alphabetic representation of the same word
193			:rtype: str
194
195			>>> koelner_phonetik_num_to_alpha('862')
196			'SNT'
197			>>> koelner_phonetik_num_to_alpha('657')
198			'NLR'
199			>>> koelner_phonetik_num_to_alpha('86766')
200			'SNRNN'
201			"""
202	1		_koelner_num_translation = dict(
203			zip((ord(_) for _ in '012345678'), 'APTFKLNRS')
			0 ignored issues – show Comprehensibility Best Practice introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report The variable `_` does not seem to be defined. Loading history...
204			)
205	1		num = ''.join(
206			c
207			for c in text_type(num)
208			if c in {'0', '1', '2', '3', '4', '5', '6', '7', '8'}
209			)
210	1		return num.translate(_koelner_num_translation)
211
212
213	1		def koelner_phonetik_alpha(word):
214			"""Return the Kölner Phonetik (alphabetic output) code for a word.
215
216			:param str word: the word to transform
217			:returns: the Kölner Phonetik value as an alphabetic string
218			:rtype: str
219
220			>>> koelner_phonetik_alpha('Smith')
221			'SNT'
222			>>> koelner_phonetik_alpha('Schmidt')
223			'SNT'
224			>>> koelner_phonetik_alpha('Müller')
225			'NLR'
226			>>> koelner_phonetik_alpha('Zimmermann')
227			'SNRNN'
228			"""
229	1		return koelner_phonetik_num_to_alpha(koelner_phonetik(word))
230
231
232	1		def phonem(word):
233			"""Return the Phonem code for a word.
234
235			Phonem is defined in :cite:`Wilde:1988`.
236
237			This version is based on the Perl implementation documented at
238			:cite:`Wilz:2005`.
239			It includes some enhancements presented in the Java port at
240			:cite:`dcm4che:2011`.
241
242			Phonem is intended chiefly for German names/words.
243
244			:param str word: the word to transform
245			:returns: the Phonem value
246			:rtype: str
247
248			>>> phonem('Christopher')
249			'CRYSDOVR'
250			>>> phonem('Niall')
251			'NYAL'
252			>>> phonem('Smith')
253			'SMYD'
254			>>> phonem('Schmidt')
255			'CMYD'
256			"""
257	1		_phonem_substitutions = (
258			('SC', 'C'),
259			('SZ', 'C'),
260			('CZ', 'C'),
261			('TZ', 'C'),
262			('TS', 'C'),
263			('KS', 'X'),
264			('PF', 'V'),
265			('QU', 'KW'),
266			('PH', 'V'),
267			('UE', 'Y'),
268			('AE', 'E'),
269			('OE', 'Ö'),
270			('EI', 'AY'),
271			('EY', 'AY'),
272			('EU', 'OY'),
273			('AU', 'A§'),
274			('OU', '§'),
275			)
276	1		_phonem_translation = dict(
277			zip(
278			(ord(_) for _ in 'ZKGQÇÑßFWPTÁÀÂÃÅÄÆÉÈÊËIJÌÍÎÏÜÝ§ÚÙÛÔÒÓÕØ'),
			0 ignored issues – show Comprehensibility Best Practice introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report The variable `_` does not seem to be defined. Loading history...
279			'CCCCCNSVVBDAAAAAEEEEEEYYYYYYYYUUUUOOOOÖ',
280			)
281			)
282
283	1		word = unicode_normalize('NFC', text_type(word.upper()))
284	1		for i, j in _phonem_substitutions:
285	1		word = word.replace(i, j)
286	1		word = word.translate(_phonem_translation)
287
288	1		return ''.join(
289			c
290			for c in _delete_consecutive_repeats(word)
291			if c
292			in {
293			'A',
294			'B',
295			'C',
296			'D',
297			'L',
298			'M',
299			'N',
300			'O',
301			'R',
302			'S',
303			'U',
304			'V',
305			'W',
306			'X',
307			'Y',
308			'Ö',
309			}
310			)
311
312
313	1		def haase_phonetik(word, primary_only=False):
314			"""Return the Haase Phonetik (numeric output) code for a word.
315
316			Based on the algorithm described at :cite:`Prante:2015`.
317
318			Based on the original :cite:`Haase:2000`.
319
320			While the output code is numeric, it is nevertheless a str.
321
322			:param str word: the word to transform
323			:param bool primary_only: if True, only the primary code is returned
324			:returns: the Haase Phonetik value as a numeric string
325			:rtype: tuple
326
327			>>> haase_phonetik('Joachim')
328			('9496',)
329			>>> haase_phonetik('Christoph')
330			('4798293', '8798293')
331			>>> haase_phonetik('Jörg')
332			('974',)
333			>>> haase_phonetik('Smith')
334			('8692',)
335			>>> haase_phonetik('Schmidt')
336			('8692', '4692')
337			"""
338
339	1		def _after(word, i, letters):
340			"""Return True if word[i] follows one of the supplied letters."""
341	1		if i > 0 and word[i - 1] in letters:
342	1		return True
343	1		return False
344
345	1		def _before(word, i, letters):
346			"""Return True if word[i] precedes one of the supplied letters."""
347	1		if i + 1 < len(word) and word[i + 1] in letters:
348	1		return True
349	1		return False
350
351	1		_vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}
352
353	1		word = unicode_normalize('NFKD', text_type(word.upper()))
354	1		word = word.replace('ß', 'SS')
355
356	1		word = word.replace('Ä', 'AE')
357	1		word = word.replace('Ö', 'OE')
358	1		word = word.replace('Ü', 'UE')
359	1		word = ''.join(
360			c
361			for c in word
362			if c
363			in {
364			'A',
365			'B',
366			'C',
367			'D',
368			'E',
369			'F',
370			'G',
371			'H',
372			'I',
373			'J',
374			'K',
375			'L',
376			'M',
377			'N',
378			'O',
379			'P',
380			'Q',
381			'R',
382			'S',
383			'T',
384			'U',
385			'V',
386			'W',
387			'X',
388			'Y',
389			'Z',
390			}
391			)
392
393	1		variants = []
394	1		if primary_only:
395	1		variants = [word]
396			else:
397	1		pos = 0
398	1		if word[:2] == 'CH':
399	1		variants.append(('CH', 'SCH'))
400	1		pos += 2
401	1		len_3_vars = {
402			'OWN': 'AUN',
403			'WSK': 'RSK',
404			'SCH': 'CH',
405			'GLI': 'LI',
406			'AUX': 'O',
407			'EUX': 'O',
408			}
409	1		while pos < len(word):
410	1		if word[pos : pos + 4] == 'ILLE':
411	1		variants.append(('ILLE', 'I'))
412	1		pos += 4
413	1		elif word[pos : pos + 3] in len_3_vars:
414	1		variants.append(
415			(word[pos : pos + 3], len_3_vars[word[pos : pos + 3]])
416			)
417	1		pos += 3
418	1		elif word[pos : pos + 2] == 'RB':
419	1		variants.append(('RB', 'RW'))
420	1		pos += 2
421	1		elif len(word[pos:]) == 3 and word[pos:] == 'EAU':
422	1		variants.append(('EAU', 'O'))
423	1		pos += 3
424	1		elif len(word[pos:]) == 1 and word[pos:] in {'A', 'O'}:
425	1		if word[pos:] == 'O':
426	1		variants.append(('O', 'OW'))
427			else:
428	1		variants.append(('A', 'AR'))
429	1		pos += 1
430			else:
431	1		variants.append((word[pos],))
432	1		pos += 1
433
434	1		variants = [''.join(letters) for letters in product(*variants)]
435
436	1		def _haase_code(word):
437	1		sdx = ''
438	1		for i in range(len(word)):
			0 ignored issues – show unused-code introduced 2018-10-20 00:45 UTC by Report Bug Copy Issue Report Consider using enumerate instead of iterating with range and len Loading history...
439	1	View Code Duplication	if word[i] in _vowels:
			0 ignored issues – show Duplication introduced 2018-08-26 04:07 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
440	1		sdx += '9'
441	1		elif word[i] == 'B':
442	1		sdx += '1'
443	1		elif word[i] == 'P':
444	1		if _before(word, i, {'H'}):
445	1		sdx += '3'
446			else:
447	1		sdx += '1'
448	1		elif word[i] in {'D', 'T'}:
449	1		if _before(word, i, {'C', 'S', 'Z'}):
450	1		sdx += '8'
451			else:
452	1		sdx += '2'
453	1		elif word[i] in {'F', 'V', 'W'}:
454	1		sdx += '3'
455	1		elif word[i] in {'G', 'K', 'Q'}:
456	1		sdx += '4'
457	1		elif word[i] == 'C':
458	1		if _after(word, i, {'S', 'Z'}):
459	1		sdx += '8'
460	1		elif i == 0:
461	1		if _before(
462			word, i, {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'}
			0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
463			):
464	1		sdx += '4'
465			else:
466	1		sdx += '8'
467	1		elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}):
468	1		sdx += '4'
469			else:
470	1		sdx += '8'
471	1		elif word[i] == 'X':
472	1		if _after(word, i, {'C', 'K', 'Q'}):
473	1		sdx += '8'
474			else:
475	1		sdx += '48'
476	1		elif word[i] == 'L':
477	1		sdx += '5'
478	1		elif word[i] in {'M', 'N'}:
479	1		sdx += '6'
480	1		elif word[i] == 'R':
481	1		sdx += '7'
482	1		elif word[i] in {'S', 'Z'}:
483	1		sdx += '8'
484
485	1		sdx = _delete_consecutive_repeats(sdx)
486
487	1		return sdx
488
489	1		encoded = tuple(_haase_code(word) for word in variants)
490	1		if len(encoded) > 1:
491	1		encoded_set = set()
492	1		encoded_single = []
493	1		for code in encoded:
494	1		if code not in encoded_set:
495	1		encoded_set.add(code)
496	1		encoded_single.append(code)
497	1		return tuple(encoded_single)
498
499	1		return encoded
500
501
502	1		def reth_schek_phonetik(word):
503			"""Return Reth-Schek Phonetik code for a word.
504
505			This algorithm is proposed in :cite:`Reth:1977`.
506
507			Since I couldn't secure a copy of that document (maybe I'll look for it
508			next time I'm in Germany), this implementation is based on what I could
509			glean from the implementations published by German Record Linkage
510			Center (www.record-linkage.de):
511
512			- Privacy-preserving Record Linkage (PPRL) (in R) :cite:`Rukasz:2018`
513			- Merge ToolBox (in Java) :cite:`Schnell:2004`
514
515			Rules that are unclear:
516
517			- Should 'C' become 'G' or 'Z'? (PPRL has both, 'Z' rule blocked)
518			- Should 'CC' become 'G'? (PPRL has blocked 'CK' that may be typo)
519			- Should 'TUI' -> 'ZUI' rule exist? (PPRL has rule, but I can't
520			think of a German word with '-tui-' in it.)
521			- Should we really change 'SCH' -> 'CH' and then 'CH' -> 'SCH'?
522
523			:param str word: the word to transform
524			:returns: the Reth-Schek Phonetik code
525			:rtype: str
526
527			>>> reth_schek_phonetik('Joachim')
528			'JOAGHIM'
529			>>> reth_schek_phonetik('Christoph')
530			'GHRISDOF'
531			>>> reth_schek_phonetik('Jörg')
532			'JOERG'
533			>>> reth_schek_phonetik('Smith')
534			'SMID'
535			>>> reth_schek_phonetik('Schmidt')
536			'SCHMID'
537			"""
538	1		replacements = {
539			3: {
540			'AEH': 'E',
541			'IEH': 'I',
542			'OEH': 'OE',
543			'UEH': 'UE',
544			'SCH': 'CH',
545			'ZIO': 'TIO',
546			'TIU': 'TIO',
547			'ZIU': 'TIO',
548			'CHS': 'X',
549			'CKS': 'X',
550			'AEU': 'OI',
551			},
552			2: {
553			'LL': 'L',
554			'AA': 'A',
555			'AH': 'A',
556			'BB': 'B',
557			'PP': 'B',
558			'BP': 'B',
559			'PB': 'B',
560			'DD': 'D',
561			'DT': 'D',
562			'TT': 'D',
563			'TH': 'D',
564			'EE': 'E',
565			'EH': 'E',
566			'AE': 'E',
567			'FF': 'F',
568			'PH': 'F',
569			'KK': 'K',
570			'GG': 'G',
571			'GK': 'G',
572			'KG': 'G',
573			'CK': 'G',
574			'CC': 'C',
575			'IE': 'I',
576			'IH': 'I',
577			'MM': 'M',
578			'NN': 'N',
579			'OO': 'O',
580			'OH': 'O',
581			'SZ': 'S',
582			'UH': 'U',
583			'GS': 'X',
584			'KS': 'X',
585			'TZ': 'Z',
586			'AY': 'AI',
587			'EI': 'AI',
588			'EY': 'AI',
589			'EU': 'OI',
590			'RR': 'R',
591			'SS': 'S',
592			'KW': 'QU',
593			},
594			1: {
595			'P': 'B',
596			'T': 'D',
597			'V': 'F',
598			'W': 'F',
599			'C': 'G',
600			'K': 'G',
601			'Y': 'I',
602			},
603			}
604
605			# Uppercase
606	1		word = word.upper()
607
608			# Replace umlauts/eszett
609	1		word = word.replace('Ä', 'AE')
610	1		word = word.replace('Ö', 'OE')
611	1		word = word.replace('Ü', 'UE')
612	1		word = word.replace('ß', 'SS')
613
614			# Main loop, using above replacements table
615	1		pos = 0
616	1		while pos < len(word):
617	1		for num in range(3, 0, -1):
618	1		if word[pos : pos + num] in replacements[num]:
619	1		word = (
620			word[:pos]
621			+ replacements[num][word[pos : pos + num]]
622			+ word[pos + num :]
623			)
624	1		pos += 1
625	1		break
626			else:
627	1		pos += 1 # Advance if nothing is recognized
628
629			# Change 'CH' back(?) to 'SCH'
630	1		word = word.replace('CH', 'SCH')
631
632			# Replace final sequences
633	1		if word[-2:] == 'ER':
634	1		word = word[:-2] + 'R'
635	1		elif word[-2:] == 'EL':
636	1		word = word[:-2] + 'L'
637	1		elif word[-1:] == 'H':
638	1		word = word[:-1]
639
640	1		return word
641
642
643			if __name__ == '__main__':
644			import doctest
645
646			doctest.testmod()
647

chrislit / abydos

Push — master ( 6ed6e1...91db7a )

abydos.phonetic.de.phonem() B

Complexity

Size

Duplication

Code Coverage

Importance

How to fix Long Method

Long Method

Duplication Side-by-Side

Filter issues like