abydos.phonetic._de.Haase.encode() - Code Metrics - Inspection of "0.3.6" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#135)

by Chris

created 2018-11-04 07:51 UTC

abydos.phonetic._de.Haase.encode() F

↳ Parent: abydos.phonetic._de

Complexity

Conditions

Size

Total Lines	153
Code Lines	107

Duplication

Lines	47
Ratio	30.72 %

Code Coverage

Tests	97
CRAP Score	39

Importance

Changes

Metric	Value
eloc	107
dl	47
loc	153
ccs	97
cts	97
cp	1
rs	0
c	0
b	0
f	0
cc	39
nop	3
crap	39

How to fix: Long Method, Complexity

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._de.

The phonetic._de module implements the Kölner Phonetik and related
algorithms for German:

    - Kölner Phonetik
    - Phonem
    - Haase Phonetik
    - Reth-Schek Phonetik
"""

from __future__ import unicode_literals

from itertools import product
from unicodedata import normalize as unicode_normalize

from six import text_type
from six.moves import range

from ._phonetic import Phonetic

# Names exported by ``from ... import *``: the four encoder classes defined
# in this module followed by their function-style wrappers.
__all__ = [
    'Haase',
    'Koelner',
    'Phonem',
    'RethSchek',
    'haase_phonetik',
    'koelner_phonetik',
    'koelner_phonetik_alpha',
    'koelner_phonetik_num_to_alpha',
    'phonem',
    'reth_schek_phonetik',
]


class Koelner(Phonetic):

    """Kölner Phonetik.

    Based on the algorithm defined by :cite:`Postel:1969`.
    """

    # Letters that are always coded as '0'
    _uc_v_set = set('AEIOUJY')

    # Translation table (digit code point -> letter) used by _to_alpha
    _num_trans = dict(
        zip((ord(digit) for digit in '012345678'), 'APTFKLNRS')
    )

    # Digits that may occur in a numeric code; anything else is dropped by
    # _to_alpha
    _num_set = set('012345678')

    def encode(self, word):
        """Return the Kölner Phonetik (numeric output) code for a word.

        While the output code is numeric, it is still a str because 0s can lead
        the code.

        :param str word: the word to transform
        :returns: the Kölner Phonetik value as a numeric string
        :rtype: str

        >>> pe = Koelner()
        >>> pe.encode('Christopher')
        '478237'
        >>> pe.encode('Niall')
        '65'
        >>> pe.encode('Smith')
        '862'
        >>> pe.encode('Schmidt')
        '862'
        >>> pe.encode('Müller')
        '657'
        >>> pe.encode('Zimmermann')
        '86766'
        """

        def _after(word, pos, letters):
            """Return True if word[pos] follows one of the given letters."""
            return pos > 0 and word[pos - 1] in letters

        def _before(word, pos, letters):
            """Return True if word[pos] precedes one of the given letters."""
            return pos + 1 < len(word) and word[pos + 1] in letters

        word = unicode_normalize('NFKD', text_type(word.upper()))
        word = word.replace('ß', 'SS')

        # NOTE(review): NFKD has already split precomposed umlauts into a
        # base letter plus a combining mark, so these three replacements
        # presumably never match -- confirm before relying on them.
        word = word.replace('Ä', 'AE')
        word = word.replace('Ö', 'OE')
        word = word.replace('Ü', 'UE')
        # Keep only the letters listed in _uc_set (inherited attribute)
        word = ''.join(c for c in word if c in self._uc_set)

        # Nothing to convert, return base case
        if not word:
            return ''

        codes = []
        for i, char in enumerate(word):
            if char in self._uc_v_set:
                codes.append('0')
            elif char == 'B':
                codes.append('1')
            elif char == 'P':
                # 'PH' is coded like 'F'
                codes.append('3' if _before(word, i, {'H'}) else '1')
            elif char in {'D', 'T'}:
                if _before(word, i, {'C', 'S', 'Z'}):
                    codes.append('8')
                else:
                    codes.append('2')
            elif char in {'F', 'V', 'W'}:
                codes.append('3')
            elif char in {'G', 'K', 'Q'}:
                codes.append('4')
            elif char == 'C':
                if _after(word, i, {'S', 'Z'}):
                    codes.append('8')
                elif i == 0:
                    # Word-initial C has a larger set of following letters
                    # (adds L and R) that select code '4'
                    if _before(
                        word, i, {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'}
                    ):
                        codes.append('4')
                    else:
                        codes.append('8')
                elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}):
                    codes.append('4')
                else:
                    codes.append('8')
            elif char == 'X':
                if _after(word, i, {'C', 'K', 'Q'}):
                    codes.append('8')
                else:
                    codes.append('48')
            elif char == 'L':
                codes.append('5')
            elif char in {'M', 'N'}:
                codes.append('6')
            elif char == 'R':
                codes.append('7')
            elif char in {'S', 'Z'}:
                codes.append('8')
            # Any other letter (e.g. 'H') contributes nothing

        sdx = self._delete_consecutive_repeats(''.join(codes))

        # A '0' is only kept in the leading position
        if sdx:
            sdx = sdx[:1] + sdx[1:].replace('0', '')

        return sdx

    def _to_alpha(self, num):
        """Convert a Kölner Phonetik code from numeric to alphabetic.

        Characters other than the digits 0-8 are discarded before the
        conversion.

        :param str num: a numeric Kölner Phonetik representation (can be a str
            or an int)
        :returns: an alphabetic representation of the same word
        :rtype: str

        >>> pe = Koelner()
        >>> pe._to_alpha('862')
        'SNT'
        >>> pe._to_alpha('657')
        'NLR'
        >>> pe._to_alpha('86766')
        'SNRNN'
        """
        num = ''.join(c for c in text_type(num) if c in self._num_set)
        return num.translate(self._num_trans)

    def encode_alpha(self, word):
        """Return the Kölner Phonetik (alphabetic output) code for a word.

        :param str word: the word to transform
        :returns: the Kölner Phonetik value as an alphabetic string
        :rtype: str

        >>> pe = Koelner()
        >>> pe.encode_alpha('Smith')
        'SNT'
        >>> pe.encode_alpha('Schmidt')
        'SNT'
        >>> pe.encode_alpha('Müller')
        'NLR'
        >>> pe.encode_alpha('Zimmermann')
        'SNRNN'
        """
        # Use this instance's own methods so that subclass overrides of
        # encode/_to_alpha are honoured and no throwaway instances are built.
        return self._to_alpha(self.encode(word))


def koelner_phonetik(word):
    """Encode a word with Kölner Phonetik, returning the numeric code.

    Convenience wrapper that builds a :py:class:`Koelner` instance and
    delegates to :py:meth:`Koelner.encode`.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as a numeric string
    :rtype: str

    >>> koelner_phonetik('Christopher')
    '478237'
    >>> koelner_phonetik('Niall')
    '65'
    >>> koelner_phonetik('Smith')
    '862'
    >>> koelner_phonetik('Schmidt')
    '862'
    >>> koelner_phonetik('Müller')
    '657'
    >>> koelner_phonetik('Zimmermann')
    '86766'
    """
    encoder = Koelner()
    return encoder.encode(word)


def koelner_phonetik_num_to_alpha(num):
    """Translate a numeric Kölner Phonetik code into its alphabetic form.

    Convenience wrapper that builds a :py:class:`Koelner` instance and
    delegates to :py:meth:`Koelner._to_alpha`.

    :param str num: a numeric Kölner Phonetik representation (can be a str or
        an int)
    :returns: an alphabetic representation of the same word
    :rtype: str

    >>> koelner_phonetik_num_to_alpha('862')
    'SNT'
    >>> koelner_phonetik_num_to_alpha('657')
    'NLR'
    >>> koelner_phonetik_num_to_alpha('86766')
    'SNRNN'
    """
    converter = Koelner()
    return converter._to_alpha(num)
class MyParent:
    """Example class with one protected and one public attribute."""

    def __init__(self):
        """Set the protected attribute ``_x`` to 1 and the public ``y`` to 2."""
        self._x = 1
        self.y = 2

class MyChild(MyParent):
    """Example subclass that reads its parent's protected attribute."""

    def some_method(self):
        """Return ``self._x``."""
        # Ok, since accessed from a child class
        protected_value = self._x
        return protected_value

class AnotherClass:
    """Example of an unrelated class reaching into a protected attribute."""

    def some_method(self, instance_of_my_child):
        """Return the ``_x`` attribute of the object passed in."""
        # Would be flagged as AnotherClass is not a child class of MyParent
        protected_value = instance_of_my_child._x
        return protected_value


def koelner_phonetik_alpha(word):
    """Encode a word with Kölner Phonetik, returning the alphabetic code.

    Convenience wrapper that builds a :py:class:`Koelner` instance and
    delegates to :py:meth:`Koelner.encode_alpha`.

    :param str word: the word to transform
    :returns: the Kölner Phonetik value as an alphabetic string
    :rtype: str

    >>> koelner_phonetik_alpha('Smith')
    'SNT'
    >>> koelner_phonetik_alpha('Schmidt')
    'SNT'
    >>> koelner_phonetik_alpha('Müller')
    'NLR'
    >>> koelner_phonetik_alpha('Zimmermann')
    'SNRNN'
    """
    encoder = Koelner()
    return encoder.encode_alpha(word)


class Phonem(Phonetic):

    """Phonem.

    Phonem is defined in :cite:`Wilde:1988`.

    This version is based on the Perl implementation documented at
    :cite:`Wilz:2005`.
    It includes some enhancements presented in the Java port at
    :cite:`dcm4che:2011`.

    Phonem is intended chiefly for German names/words.
    """

    # (sequence, replacement) pairs, applied one after another in this order
    # before the single-character translation below
    _substitutions = (
        ('SC', 'C'),
        ('SZ', 'C'),
        ('CZ', 'C'),
        ('TZ', 'C'),
        ('TS', 'C'),
        ('KS', 'X'),
        ('PF', 'V'),
        ('QU', 'KW'),
        ('PH', 'V'),
        ('UE', 'Y'),
        ('AE', 'E'),
        ('OE', 'Ö'),
        ('EI', 'AY'),
        ('EY', 'AY'),
        ('EU', 'OY'),
        ('AU', 'A§'),
        ('OU', '§'),
    )

    # Single-character translation table (code point -> replacement); the
    # two strings are paired position by position
    _trans = {
        ord(source): target
        for source, target in zip(
            'ZKGQÇÑßFWPTÁÀÂÃÅÄÆÉÈÊËIJÌÍÎÏÜÝ§ÚÙÛÔÒÓÕØ',
            'CCCCCNSVVBDAAAAAEEEEEEYYYYYYYYUUUUOOOOÖ',
        )
    }

    # Only these characters may appear in the final code
    _uc_set = set('ABCDLMNORSUVWXYÖ')

    def encode(self, word):
        """Return the Phonem code for a word.

        The word is upper-cased and NFC-normalized, the multi-letter
        substitutions and the single-character translation are applied,
        consecutive repeats are removed, and finally every character outside
        ``_uc_set`` is dropped.

        :param str word: the word to transform
        :returns: the Phonem value
        :rtype: str

        >>> pe = Phonem()
        >>> pe.encode('Christopher')
        'CRYSDOVR'
        >>> pe.encode('Niall')
        'NYAL'
        >>> pe.encode('Smith')
        'SMYD'
        >>> pe.encode('Schmidt')
        'CMYD'
        """
        text = unicode_normalize('NFC', text_type(word.upper()))
        for sequence, replacement in self._substitutions:
            text = text.replace(sequence, replacement)

        translated = text.translate(self._trans)
        collapsed = self._delete_consecutive_repeats(translated)

        allowed = self._uc_set
        return ''.join([char for char in collapsed if char in allowed])


def phonem(word):
    """Encode a word with the Phonem algorithm.

    Convenience wrapper that builds a :py:class:`Phonem` instance and
    delegates to :py:meth:`Phonem.encode`.

    :param str word: the word to transform
    :returns: the Phonem value
    :rtype: str

    >>> phonem('Christopher')
    'CRYSDOVR'
    >>> phonem('Niall')
    'NYAL'
    >>> phonem('Smith')
    'SMYD'
    >>> phonem('Schmidt')
    'CMYD'
    """
    encoder = Phonem()
    return encoder.encode(word)


class Haase(Phonetic):

    """Haase Phonetik.

    Based on the algorithm described at :cite:`Prante:2015`.

    Based on the original :cite:`Haase:2000`.
    """

    # Letters that are always coded as '9'
    _uc_v_set = set('AEIJOUY')

    # Three-letter sequences and the alternative spelling generated for them
    _len_3_vars = {
        'OWN': 'AUN',
        'WSK': 'RSK',
        'SCH': 'CH',
        'GLI': 'LI',
        'AUX': 'O',
        'EUX': 'O',
    }

    # Letters whose code does not depend on the neighbouring letters
    _simple_codes = {
        'B': '1',
        'F': '3',
        'V': '3',
        'W': '3',
        'G': '4',
        'K': '4',
        'Q': '4',
        'L': '5',
        'M': '6',
        'N': '6',
        'R': '7',
        'S': '8',
        'Z': '8',
    }

    def _spelling_variants(self, word):
        """Return every spelling variant of word, original spelling first.

        The word is cut into segments; each segment is a tuple holding the
        original text and, where a rule applies, one alternative. The
        variants are the Cartesian product of those segments.

        :param str word: the cleaned, upper-case word
        :returns: the spelling variants
        :rtype: list
        """
        segments = []
        pos = 0

        # A leading 'CH' may also be read as 'SCH'
        if word[:2] == 'CH':
            segments.append(('CH', 'SCH'))
            pos = 2

        while pos < len(word):
            rest = word[pos:]
            if rest[:4] == 'ILLE':
                segments.append(('ILLE', 'I'))
                pos += 4
            elif rest[:3] in self._len_3_vars:
                segments.append((rest[:3], self._len_3_vars[rest[:3]]))
                pos += 3
            elif rest[:2] == 'RB':
                segments.append(('RB', 'RW'))
                pos += 2
            elif rest == 'EAU':
                # Only a word-final 'EAU' gets an alternative
                segments.append(('EAU', 'O'))
                pos += 3
            elif rest == 'O':
                # Word-final 'O'
                segments.append(('O', 'OW'))
                pos += 1
            elif rest == 'A':
                # Word-final 'A'
                segments.append(('A', 'AR'))
                pos += 1
            else:
                segments.append((rest[0],))
                pos += 1

        return [''.join(letters) for letters in product(*segments)]

    def _haase_code(self, word):
        """Return the numeric code for one spelling variant.

        :param str word: a single spelling variant
        :returns: the code with consecutive repeated digits collapsed
        :rtype: str
        """
        codes = []
        last = len(word) - 1
        for i, char in enumerate(word):
            # '' stands for "no neighbour" and is in none of the sets below
            prev_char = word[i - 1] if i > 0 else ''
            next_char = word[i + 1] if i < last else ''

            if char in self._uc_v_set:
                codes.append('9')
            elif char == 'P':
                # 'PH' is coded like 'F'
                codes.append('3' if next_char == 'H' else '1')
            elif char in {'D', 'T'}:
                codes.append('8' if next_char in {'C', 'S', 'Z'} else '2')
            elif char == 'C':
                if prev_char in {'S', 'Z'}:
                    codes.append('8')
                elif i == 0:
                    # Word-initial C has a larger set of following letters
                    # (adds L and R) that select code '4'
                    if next_char in {
                        'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'
                    }:
                        codes.append('4')
                    else:
                        codes.append('8')
                elif next_char in {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}:
                    codes.append('4')
                else:
                    codes.append('8')
            elif char == 'X':
                codes.append('8' if prev_char in {'C', 'K', 'Q'} else '48')
            elif char in self._simple_codes:
                codes.append(self._simple_codes[char])
            # Any other letter (e.g. 'H') contributes nothing

        return self._delete_consecutive_repeats(''.join(codes))

    def encode(self, word, primary_only=False):
        """Return the Haase Phonetik (numeric output) code for a word.

        While the output code is numeric, it is nevertheless a str.

        :param str word: the word to transform
        :param bool primary_only: if True, only the primary code is returned
        :returns: the Haase Phonetik codes (numeric strings) without
            duplicates, the code of the original spelling first
        :rtype: tuple

        >>> pe = Haase()
        >>> pe.encode('Joachim')
        ('9496',)
        >>> pe.encode('Christoph')
        ('4798293', '8798293')
        >>> pe.encode('Jörg')
        ('974',)
        >>> pe.encode('Smith')
        ('8692',)
        >>> pe.encode('Schmidt')
        ('8692', '4692')
        """
        word = unicode_normalize('NFKD', text_type(word.upper()))
        word = word.replace('ß', 'SS')

        # NOTE(review): NFKD has already split precomposed umlauts into a
        # base letter plus a combining mark, so these three replacements
        # presumably never match -- confirm before relying on them.
        word = word.replace('Ä', 'AE')
        word = word.replace('Ö', 'OE')
        word = word.replace('Ü', 'UE')
        # Keep only the letters listed in _uc_set (inherited attribute)
        word = ''.join(c for c in word if c in self._uc_set)

        if primary_only:
            variants = [word]
        else:
            variants = self._spelling_variants(word)

        # Different spellings can share a code: keep the first occurrence of
        # each code and preserve the order in which codes were produced.
        seen = set()
        encoded = []
        for variant in variants:
            code = self._haase_code(variant)
            if code not in seen:
                seen.add(code)
                encoded.append(code)
        return tuple(encoded)


def haase_phonetik(word, primary_only=False):
    """Encode a word with Haase Phonetik.

    Convenience wrapper that builds a :py:class:`Haase` instance and
    delegates to :py:meth:`Haase.encode`.

    :param str word: the word to transform
    :param bool primary_only: if True, only the primary code is returned
    :returns: the Haase Phonetik value(s) as numeric strings
    :rtype: tuple

    >>> haase_phonetik('Joachim')
    ('9496',)
    >>> haase_phonetik('Christoph')
    ('4798293', '8798293')
    >>> haase_phonetik('Jörg')
    ('974',)
    >>> haase_phonetik('Smith')
    ('8692',)
    >>> haase_phonetik('Schmidt')
    ('8692', '4692')
    """
    encoder = Haase()
    return encoder.encode(word, primary_only)


class RethSchek(Phonetic):

    """Reth-Schek Phonetik.

    This algorithm is proposed in :cite:`Reth:1977`.

    Since I couldn't secure a copy of that document (maybe I'll look for it
    next time I'm in Germany), this implementation is based on what I could
    glean from the implementations published by German Record Linkage
    Center (www.record-linkage.de):

    - Privacy-preserving Record Linkage (PPRL) (in R) :cite:`Rukasz:2018`
    - Merge ToolBox (in Java) :cite:`Schnell:2004`

    Rules that are unclear:

    - Should 'C' become 'G' or 'Z'? (PPRL has both, 'Z' rule blocked)
    - Should 'CC' become 'G'? (PPRL has blocked 'CK' that may be typo)
    - Should 'TUI' -> 'ZUI' rule exist? (PPRL has rule, but I can't
      think of a German word with '-tui-' in it.)
    - Should we really change 'SCH' -> 'CH' and then 'CH' -> 'SCH'?
    """

    # Replacement tables keyed by the length of the sequence they match;
    # encode() tries the longest sequences first at every position
    _replacements = {
        3: {
            'AEH': 'E',
            'IEH': 'I',
            'OEH': 'OE',
            'UEH': 'UE',
            'SCH': 'CH',
            'ZIO': 'TIO',
            'TIU': 'TIO',
            'ZIU': 'TIO',
            'CHS': 'X',
            'CKS': 'X',
            'AEU': 'OI',
        },
        2: {
            'LL': 'L',
            'AA': 'A',
            'AH': 'A',
            'BB': 'B',
            'PP': 'B',
            'BP': 'B',
            'PB': 'B',
            'DD': 'D',
            'DT': 'D',
            'TT': 'D',
            'TH': 'D',
            'EE': 'E',
            'EH': 'E',
            'AE': 'E',
            'FF': 'F',
            'PH': 'F',
            'KK': 'K',
            'GG': 'G',
            'GK': 'G',
            'KG': 'G',
            'CK': 'G',
            'CC': 'C',
            'IE': 'I',
            'IH': 'I',
            'MM': 'M',
            'NN': 'N',
            'OO': 'O',
            'OH': 'O',
            'SZ': 'S',
            'UH': 'U',
            'GS': 'X',
            'KS': 'X',
            'TZ': 'Z',
            'AY': 'AI',
            'EI': 'AI',
            'EY': 'AI',
            'EU': 'OI',
            'RR': 'R',
            'SS': 'S',
            'KW': 'QU',
        },
        1: {
            'P': 'B',
            'T': 'D',
            'V': 'F',
            'W': 'F',
            'C': 'G',
            'K': 'G',
            'Y': 'I',
        },
    }

    def encode(self, word):
        """Return Reth-Schek Phonetik code for a word.

        :param str word: the word to transform
        :returns: the Reth-Schek Phonetik code
        :rtype: str

        >>> pe = RethSchek()
        >>> pe.encode('Joachim')
        'JOAGHIM'
        >>> pe.encode('Christoph')
        'GHRISDOF'
        >>> pe.encode('Jörg')
        'JOERG'
        >>> pe.encode('Smith')
        'SMID'
        >>> pe.encode('Schmidt')
        'SCHMID'
        """
        tables = self._replacements

        # Upper-case, then spell out umlauts and eszett
        text = word.upper()
        for special, spelled_out in (
            ('Ä', 'AE'),
            ('Ö', 'OE'),
            ('Ü', 'UE'),
            ('ß', 'SS'),
        ):
            text = text.replace(special, spelled_out)

        # Walk through the text one position at a time; at each position
        # apply the first (longest) matching replacement, if there is one
        pos = 0
        while pos < len(text):
            for width in (3, 2, 1):
                chunk = text[pos : pos + width]
                if chunk in tables[width]:
                    head = text[:pos]
                    tail = text[pos + width :]
                    text = head + tables[width][chunk] + tail
                    break
            # The position advances by one whether or not anything matched
            pos += 1

        # Change 'CH' back(?) to 'SCH'
        text = text.replace('CH', 'SCH')

        # Simplify the word ending: 'ER' -> 'R', 'EL' -> 'L', drop a final 'H'
        ending = text[-2:]
        if ending == 'ER':
            text = text[:-2] + 'R'
        elif ending == 'EL':
            text = text[:-2] + 'L'
        elif text[-1:] == 'H':
            text = text[:-1]

        return text


def reth_schek_phonetik(word):
    """Encode a word with Reth-Schek Phonetik.

    Convenience wrapper that builds a :py:class:`RethSchek` instance and
    delegates to :py:meth:`RethSchek.encode`.

    :param str word: the word to transform
    :returns: the Reth-Schek Phonetik code
    :rtype: str

    >>> reth_schek_phonetik('Joachim')
    'JOAGHIM'
    >>> reth_schek_phonetik('Christoph')
    'GHRISDOF'
    >>> reth_schek_phonetik('Jörg')
    'JOERG'
    >>> reth_schek_phonetik('Smith')
    'SMID'
    >>> reth_schek_phonetik('Schmidt')
    'SCHMID'
    """
    encoder = RethSchek()
    return encoder.encode(word)


# When the module is executed as a script, run the doctest examples embedded
# in its docstrings.
if __name__ == '__main__':
    import doctest

    doctest.testmod()


1			# -*- coding: utf-8 -*-
2
3			# Copyright 2014-2018 by Christopher C. Little.
4			# This file is part of Abydos.
5			#
6			# Abydos is free software: you can redistribute it and/or modify
7			# it under the terms of the GNU General Public License as published by
8			# the Free Software Foundation, either version 3 of the License, or
9			# (at your option) any later version.
10			#
11			# Abydos is distributed in the hope that it will be useful,
12			# but WITHOUT ANY WARRANTY; without even the implied warranty of
13			# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14			# GNU General Public License for more details.
15			#
16			# You should have received a copy of the GNU General Public License
17			# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1		"""abydos.phonetic._de.
20
21			The phonetic._de module implements the Kölner Phonetik and related
22			algorithms for German:
23
24			- Kölner Phonetik
25			- Phonem
26			- Haase Phonetik
27			- Reth-Schek Phonetik
28			"""
29
30	1		from __future__ import unicode_literals
31
32	1		from itertools import product
33	1		from unicodedata import normalize as unicode_normalize
34
35	1		from six import text_type
36	1		from six.moves import range
37
38	1		from ._phonetic import Phonetic
39
40	1		__all__ = [
41			'Haase',
42			'Koelner',
43			'Phonem',
44			'RethSchek',
45			'haase_phonetik',
46			'koelner_phonetik',
47			'koelner_phonetik_alpha',
48			'koelner_phonetik_num_to_alpha',
49			'phonem',
50			'reth_schek_phonetik',
51			]
52
53
54	1		class Koelner(Phonetic):
			0 ignored issues – show Unused Code introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report The variable `__class__` seems to be unused. Loading history...
55			"""Kölner Phonetik.
56
57			Based on the algorithm defined by :cite:`Postel:1969`.
58			"""
59
60	1		_uc_v_set = set('AEIOUJY')
61
62	1		_num_trans = dict(zip((ord(_) for _ in '012345678'), 'APTFKLNRS'))
			0 ignored issues – show Comprehensibility Best Practice introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report The variable `_` does not seem to be defined. Loading history...
63	1		_num_set = set('012345678')
64
65	1		def encode(self, word):
66			"""Return the Kölner Phonetik (numeric output) code for a word.
67
68			While the output code is numeric, it is still a str because 0s can lead
69			the code.
70
71			:param str word: the word to transform
72			:returns: the Kölner Phonetik value as a numeric string
73			:rtype: str
74
75			>>> pe = Koelner()
76			>>> pe.encode('Christopher')
77			'478237'
78			>>> pe.encode('Niall')
79			'65'
80			>>> pe.encode('Smith')
81			'862'
82			>>> pe.encode('Schmidt')
83			'862'
84			>>> pe.encode('Müller')
85			'657'
86			>>> pe.encode('Zimmermann')
87			'86766'
88			"""
89
90	1		def _after(word, pos, letters):
91			"""Return True if word[i] follows one of the supplied letters."""
92	1		return pos > 0 and word[pos - 1] in letters
93
94	1		def _before(word, pos, letters):
95			"""Return True if word[i] precedes one of the supplied letters."""
96	1		return pos + 1 < len(word) and word[pos + 1] in letters
97
98	1		sdx = ''
99
100	1		word = unicode_normalize('NFKD', text_type(word.upper()))
101	1		word = word.replace('ß', 'SS')
102
103	1		word = word.replace('Ä', 'AE')
104	1		word = word.replace('Ö', 'OE')
105	1		word = word.replace('Ü', 'UE')
106	1		word = ''.join(c for c in word if c in self._uc_set)
107
108			# Nothing to convert, return base case
109	1		if not word:
110	1		return sdx
111
112	1		for i in range(len(word)):
			0 ignored issues – show unused-code introduced 2018-08-02 19:04 UTC by Report Bug Copy Issue Report Consider using enumerate instead of iterating with range and len Loading history...
113	1	View Code Duplication	if word[i] in self._uc_v_set:
			0 ignored issues – show Duplication introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
114	1		sdx += '0'
115	1		elif word[i] == 'B':
116	1		sdx += '1'
117	1		elif word[i] == 'P':
118	1		if _before(word, i, {'H'}):
119	1		sdx += '3'
120			else:
121	1		sdx += '1'
122	1		elif word[i] in {'D', 'T'}:
123	1		if _before(word, i, {'C', 'S', 'Z'}):
124	1		sdx += '8'
125			else:
126	1		sdx += '2'
127	1		elif word[i] in {'F', 'V', 'W'}:
128	1		sdx += '3'
129	1		elif word[i] in {'G', 'K', 'Q'}:
130	1		sdx += '4'
131	1		elif word[i] == 'C':
132	1		if _after(word, i, {'S', 'Z'}):
133	1		sdx += '8'
134	1		elif i == 0:
135	1		if _before(
136			word, i, {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'}
			0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
137			):
138	1		sdx += '4'
139			else:
140	1		sdx += '8'
141	1		elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}):
142	1		sdx += '4'
143			else:
144	1		sdx += '8'
145	1		elif word[i] == 'X':
146	1		if _after(word, i, {'C', 'K', 'Q'}):
147	1		sdx += '8'
148			else:
149	1		sdx += '48'
150	1		elif word[i] == 'L':
151	1		sdx += '5'
152	1		elif word[i] in {'M', 'N'}:
153	1		sdx += '6'
154	1		elif word[i] == 'R':
155	1		sdx += '7'
156	1		elif word[i] in {'S', 'Z'}:
157	1		sdx += '8'
158
159	1		sdx = self._delete_consecutive_repeats(sdx)
160
161	1		if sdx:
162	1		sdx = sdx[:1] + sdx[1:].replace('0', '')
163
164	1		return sdx
165
166	1		def _to_alpha(self, num):
167			"""Convert a Kölner Phonetik code from numeric to alphabetic.
168
169			:param str num: a numeric Kölner Phonetik representation (can be a str
170			or an int)
171			:returns: an alphabetic representation of the same word
172			:rtype: str
173
174			>>> pe = Koelner()
175			>>> pe._to_alpha('862')
176			'SNT'
177			>>> pe._to_alpha('657')
178			'NLR'
179			>>> pe._to_alpha('86766')
180			'SNRNN'
181			"""
182	1		num = ''.join(c for c in text_type(num) if c in self._num_set)
183	1		return num.translate(self._num_trans)
184
185	1		def encode_alpha(self, word):
186			"""Return the Kölner Phonetik (alphabetic output) code for a word.
187
188			:param str word: the word to transform
189			:returns: the Kölner Phonetik value as an alphabetic string
190			:rtype: str
191
192			>>> pe = Koelner()
193			>>> pe.encode_alpha('Smith')
194			'SNT'
195			>>> pe.encode_alpha('Schmidt')
196			'SNT'
197			>>> pe.encode_alpha('Müller')
198			'NLR'
199			>>> pe.encode_alpha('Zimmermann')
200			'SNRNN'
201			"""
202	1		return koelner_phonetik_num_to_alpha(koelner_phonetik(word))
203
204
205	1		def koelner_phonetik(word):
206			"""Return the Kölner Phonetik (numeric output) code for a word.
207
208			This is a wrapper for :py:meth:`Koelner.encode`.
209
210			:param str word: the word to transform
211			:returns: the Kölner Phonetik value as a numeric string
212			:rtype: str
213
214			>>> koelner_phonetik('Christopher')
215			'478237'
216			>>> koelner_phonetik('Niall')
217			'65'
218			>>> koelner_phonetik('Smith')
219			'862'
220			>>> koelner_phonetik('Schmidt')
221			'862'
222			>>> koelner_phonetik('Müller')
223			'657'
224			>>> koelner_phonetik('Zimmermann')
225			'86766'
226			"""
227	1		return Koelner().encode(word)
228
229
230	1		def koelner_phonetik_num_to_alpha(num):
231			"""Convert a Kölner Phonetik code from numeric to alphabetic.
232
233			This is a wrapper for :py:meth:`Koelner._to_alpha`.
234
235			:param str num: a numeric Kölner Phonetik representation (can be a str or
236			an int)
237			:returns: an alphabetic representation of the same word
238			:rtype: str
239
240			>>> koelner_phonetik_num_to_alpha('862')
241			'SNT'
242			>>> koelner_phonetik_num_to_alpha('657')
243			'NLR'
244			>>> koelner_phonetik_num_to_alpha('86766')
245			'SNRNN'
246			"""
247	1		return Koelner()._to_alpha(num)
			0 ignored issues – show Coding Style Best Practice introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report It seems like `_to_alpha` was declared protected and should not be accessed from this context. Prefixing a member variable `_` is usually regarded as the equivalent of declaring it with protected visibility that exists in other languages. Consequentially, such a member should only be accessed from the same class or a child class: class MyParent: def __init__(self): self._x = 1; self.y = 2; class MyChild(MyParent): def some_method(self): return self._x # Ok, since accessed from a child class class AnotherClass: def some_method(self, instance_of_my_child): return instance_of_my_child._x # Would be flagged as AnotherClass is not # a child class of MyParent Loading history...
248
249
250	1		def koelner_phonetik_alpha(word):
251			"""Return the Kölner Phonetik (alphabetic output) code for a word.
252
253			This is a wrapper for :py:meth:`Koelner.encode_alpha`.
254
255			:param str word: the word to transform
256			:returns: the Kölner Phonetik value as an alphabetic string
257			:rtype: str
258
259			>>> koelner_phonetik_alpha('Smith')
260			'SNT'
261			>>> koelner_phonetik_alpha('Schmidt')
262			'SNT'
263			>>> koelner_phonetik_alpha('Müller')
264			'NLR'
265			>>> koelner_phonetik_alpha('Zimmermann')
266			'SNRNN'
267			"""
268	1		return Koelner().encode_alpha(word)
269
270
class Phonem(Phonetic):
    """Phonem.

    Phonem is defined in :cite:`Wilde:1988`.

    This version is based on the Perl implementation documented at
    :cite:`Wilz:2005`.
    It includes some enhancements presented in the Java port at
    :cite:`dcm4che:2011`.

    Phonem is intended chiefly for German names/words.
    """

    # Multi-character rewrites, applied in exactly this order before the
    # single-character translation below. '§' is a temporary placeholder
    # that _trans subsequently maps to 'U'.
    _substitutions = (
        ('SC', 'C'),
        ('SZ', 'C'),
        ('CZ', 'C'),
        ('TZ', 'C'),
        ('TS', 'C'),
        ('KS', 'X'),
        ('PF', 'V'),
        ('QU', 'KW'),
        ('PH', 'V'),
        ('UE', 'Y'),
        ('AE', 'E'),
        ('OE', 'Ö'),
        ('EI', 'AY'),
        ('EY', 'AY'),
        ('EU', 'OY'),
        ('AU', 'A§'),
        ('OU', '§'),
    )

    # Single-character translation table (code point -> replacement), built
    # by pairing the two strings position by position.
    _trans = {
        ord(source): target
        for source, target in zip(
            'ZKGQÇÑßFWPTÁÀÂÃÅÄÆÉÈÊËIJÌÍÎÏÜÝ§ÚÙÛÔÒÓÕØ',
            'CCCCCNSVVBDAAAAAEEEEEEYYYYYYYYUUUUOOOOÖ',
        )
    }

    # Only these characters may appear in the final code.
    _uc_set = set('ABCDLMNORSUVWXYÖ')

    def encode(self, word):
        """Return the Phonem code for a word.

        The word is upper-cased and NFC-normalized, rewritten with the
        ordered ``_substitutions``, translated character by character with
        ``_trans``, stripped of consecutive repeats, and finally filtered
        down to the characters in ``_uc_set``.

        :param str word: the word to transform
        :returns: the Phonem value
        :rtype: str

        >>> pe = Phonem()
        >>> pe.encode('Christopher')
        'CRYSDOVR'
        >>> pe.encode('Niall')
        'NYAL'
        >>> pe.encode('Smith')
        'SMYD'
        >>> pe.encode('Schmidt')
        'CMYD'
        """
        code = unicode_normalize('NFC', text_type(word.upper()))
        for source, target in self._substitutions:
            code = code.replace(source, target)
        code = code.translate(self._trans)

        collapsed = self._delete_consecutive_repeats(code)
        kept = [char for char in collapsed if char in self._uc_set]
        return ''.join(kept)
340
341
def phonem(word):
    """Return the Phonem code for a word.

    Convenience wrapper: builds a :py:class:`Phonem` instance and delegates
    to :py:meth:`Phonem.encode`.

    :param str word: the word to transform
    :returns: the Phonem value
    :rtype: str

    >>> phonem('Christopher')
    'CRYSDOVR'
    >>> phonem('Niall')
    'NYAL'
    >>> phonem('Smith')
    'SMYD'
    >>> phonem('Schmidt')
    'CMYD'
    """
    encoder = Phonem()
    return encoder.encode(word)
361
362
class Haase(Phonetic):
    """Haase Phonetik.

    Based on the algorithm described at :cite:`Prante:2015`.

    Based on the original :cite:`Haase:2000`.
    """

    # Letters that are encoded as '9'.
    _uc_v_set = set('AEIJOUY')

    def encode(self, word, primary_only=False):
        """Return the Haase Phonetik (numeric output) codes for a word.

        While each output code is numeric, it is nevertheless a str.

        Unless ``primary_only`` is set, the word is first expanded into
        spelling variants (e.g. initial 'CH' is also tried as 'SCH'); every
        variant is encoded and duplicate codes are dropped, keeping the
        first occurrence.

        :param str word: the word to transform
        :param bool primary_only: if True, only the primary code is returned
        :returns: the Haase Phonetik values, as a tuple of numeric strings
        :rtype: tuple


        >>> pe = Haase()
        >>> pe.encode('Joachim')
        ('9496',)
        >>> pe.encode('Christoph')
        ('4798293', '8798293')
        >>> pe.encode('Jörg')
        ('974',)
        >>> pe.encode('Smith')
        ('8692',)
        >>> pe.encode('Schmidt')
        ('8692', '4692')
        """

        def _after(word, i, letters):
            """Return True if word[i] follows one of the supplied letters."""
            return i > 0 and word[i - 1] in letters

        def _before(word, i, letters):
            """Return True if word[i] precedes one of the supplied letters."""
            return i + 1 < len(word) and word[i + 1] in letters

        word = unicode_normalize('NFKD', text_type(word.upper()))
        word = word.replace('ß', 'SS')

        # NOTE(review): the NFKD normalization above presumably decomposes
        # precomposed umlauts into base letter + combining mark, in which
        # case these three replacements never match and the umlaut is
        # reduced to its base letter by the filter below. Confirm the
        # intended behavior before reordering; doing so would change the
        # codes produced for some inputs.
        word = word.replace('Ä', 'AE')
        word = word.replace('Ö', 'OE')
        word = word.replace('Ü', 'UE')
        # _uc_set is not defined on this class; presumably inherited from
        # Phonetic.
        word = ''.join(c for c in word if c in self._uc_set)

        if primary_only:
            variants = [word]
        else:
            # Each entry is a tuple of alternative spellings for one segment
            # of the word; the variants are the cartesian product of these.
            segments = []
            pos = 0
            if word[:2] == 'CH':
                segments.append(('CH', 'SCH'))
                pos += 2
            len_3_vars = {
                'OWN': 'AUN',
                'WSK': 'RSK',
                'SCH': 'CH',
                'GLI': 'LI',
                'AUX': 'O',
                'EUX': 'O',
            }
            while pos < len(word):
                rest = word[pos:]
                if rest[:4] == 'ILLE':
                    segments.append(('ILLE', 'I'))
                    pos += 4
                elif rest[:3] in len_3_vars:
                    segments.append((rest[:3], len_3_vars[rest[:3]]))
                    pos += 3
                elif rest[:2] == 'RB':
                    segments.append(('RB', 'RW'))
                    pos += 2
                elif rest == 'EAU':  # only at the very end of the word
                    segments.append(('EAU', 'O'))
                    pos += 3
                elif rest == 'O':  # word-final 'O'
                    segments.append(('O', 'OW'))
                    pos += 1
                elif rest == 'A':  # word-final 'A'
                    segments.append(('A', 'AR'))
                    pos += 1
                else:
                    segments.append((word[pos],))
                    pos += 1

            variants = [''.join(letters) for letters in product(*segments)]

        # Letters whose code does not depend on their neighbours.
        fixed_codes = {
            'B': '1',
            'F': '3',
            'V': '3',
            'W': '3',
            'G': '4',
            'K': '4',
            'Q': '4',
            'L': '5',
            'M': '6',
            'N': '6',
            'R': '7',
            'S': '8',
            'Z': '8',
        }
        # Following letters that make 'C' code as '4' rather than '8'; the
        # word-initial set additionally contains 'L' and 'R'.
        c_hard_initial = {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'}
        c_hard_medial = {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}

        def _haase_code(variant):
            """Return the numeric code for a single spelling variant."""
            digits = []
            for i, char in enumerate(variant):
                if char in self._uc_v_set:
                    digits.append('9')
                elif char in fixed_codes:
                    digits.append(fixed_codes[char])
                elif char == 'P':
                    digits.append('3' if _before(variant, i, {'H'}) else '1')
                elif char in {'D', 'T'}:
                    digits.append(
                        '8' if _before(variant, i, {'C', 'S', 'Z'}) else '2'
                    )
                elif char == 'C':
                    if _after(variant, i, {'S', 'Z'}):
                        digits.append('8')
                    else:
                        hard = c_hard_initial if i == 0 else c_hard_medial
                        digits.append(
                            '4' if _before(variant, i, hard) else '8'
                        )
                elif char == 'X':
                    digits.append(
                        '8' if _after(variant, i, {'C', 'K', 'Q'}) else '48'
                    )
                # Characters matching no rule (such as 'H') contribute
                # nothing to the code.

            return self._delete_consecutive_repeats(''.join(digits))

        # Encode every variant, dropping duplicate codes while keeping the
        # order of first occurrence.
        seen = set()
        unique_codes = []
        for variant in variants:
            code = _haase_code(variant)
            if code not in seen:
                seen.add(code)
                unique_codes.append(code)

        return tuple(unique_codes)
526
527
def haase_phonetik(word, primary_only=False):
    """Return the Haase Phonetik (numeric output) codes for a word.

    Convenience wrapper: builds a :py:class:`Haase` instance and delegates
    to :py:meth:`Haase.encode`.

    :param str word: the word to transform
    :param bool primary_only: if True, only the primary code is returned
    :returns: the Haase Phonetik values, as a tuple of numeric strings
    :rtype: tuple

    >>> haase_phonetik('Joachim')
    ('9496',)
    >>> haase_phonetik('Christoph')
    ('4798293', '8798293')
    >>> haase_phonetik('Jörg')
    ('974',)
    >>> haase_phonetik('Smith')
    ('8692',)
    >>> haase_phonetik('Schmidt')
    ('8692', '4692')
    """
    encoder = Haase()
    return encoder.encode(word, primary_only)
550
551
class RethSchek(Phonetic):
    """Reth-Schek Phonetik.

    This algorithm is proposed in :cite:`Reth:1977`.

    Since I couldn't secure a copy of that document (maybe I'll look for it
    next time I'm in Germany), this implementation is based on what I could
    glean from the implementations published by German Record Linkage
    Center (www.record-linkage.de):

    - Privacy-preserving Record Linkage (PPRL) (in R) :cite:`Rukasz:2018`
    - Merge ToolBox (in Java) :cite:`Schnell:2004`

    Rules that are unclear:

    - Should 'C' become 'G' or 'Z'? (PPRL has both, 'Z' rule blocked)
    - Should 'CC' become 'G'? (PPRL has blocked 'CK' that may be typo)
    - Should 'TUI' -> 'ZUI' rule exist? (PPRL has rule, but I can't
      think of a German word with '-tui-' in it.)
    - Should we really change 'SCH' -> 'CH' and then 'CH' -> 'SCH'?
    """

    # Replacement rules keyed by the length of the sequence they match;
    # encode() tries the longest sequences first at every position.
    _replacements = {
        3: {
            'AEH': 'E',
            'IEH': 'I',
            'OEH': 'OE',
            'UEH': 'UE',
            'SCH': 'CH',
            'ZIO': 'TIO',
            'TIU': 'TIO',
            'ZIU': 'TIO',
            'CHS': 'X',
            'CKS': 'X',
            'AEU': 'OI',
        },
        2: {
            'LL': 'L',
            'AA': 'A',
            'AH': 'A',
            'BB': 'B',
            'PP': 'B',
            'BP': 'B',
            'PB': 'B',
            'DD': 'D',
            'DT': 'D',
            'TT': 'D',
            'TH': 'D',
            'EE': 'E',
            'EH': 'E',
            'AE': 'E',
            'FF': 'F',
            'PH': 'F',
            'KK': 'K',
            'GG': 'G',
            'GK': 'G',
            'KG': 'G',
            'CK': 'G',
            'CC': 'C',
            'IE': 'I',
            'IH': 'I',
            'MM': 'M',
            'NN': 'N',
            'OO': 'O',
            'OH': 'O',
            'SZ': 'S',
            'UH': 'U',
            'GS': 'X',
            'KS': 'X',
            'TZ': 'Z',
            'AY': 'AI',
            'EI': 'AI',
            'EY': 'AI',
            'EU': 'OI',
            'RR': 'R',
            'SS': 'S',
            'KW': 'QU',
        },
        1: {
            'P': 'B',
            'T': 'D',
            'V': 'F',
            'W': 'F',
            'C': 'G',
            'K': 'G',
            'Y': 'I',
        },
    }

    def encode(self, word):
        """Return Reth-Schek Phonetik code for a word.

        :param str word: the word to transform
        :returns: the Reth-Schek Phonetik code
        :rtype: str

        >>> pe = RethSchek()
        >>> pe.encode('Joachim')
        'JOAGHIM'
        >>> pe.encode('Christoph')
        'GHRISDOF'
        >>> pe.encode('Jörg')
        'JOERG'
        >>> pe.encode('Smith')
        'SMID'
        >>> pe.encode('Schmidt')
        'SCHMID'
        """
        word = word.upper()

        # Expand umlauts and eszett into their ASCII digraphs.
        for special, expansion in (
            ('Ä', 'AE'),
            ('Ö', 'OE'),
            ('Ü', 'UE'),
            ('ß', 'SS'),
        ):
            word = word.replace(special, expansion)

        # Walk the word left to right; at each position apply the first
        # matching rule, preferring longer sequences over shorter ones.
        pos = 0
        while pos < len(word):
            for num in (3, 2, 1):
                chunk = word[pos : pos + num]
                rules = self._replacements[num]
                if chunk in rules:
                    word = word[:pos] + rules[chunk] + word[pos + num :]
                    break
            # Advance by one position whether or not a rule fired.
            pos += 1

        # Change 'CH' back(?) to 'SCH'
        word = word.replace('CH', 'SCH')

        # Replace final sequences
        ending = word[-2:]
        if ending == 'ER':
            word = word[:-2] + 'R'
        elif ending == 'EL':
            word = word[:-2] + 'L'
        elif word[-1:] == 'H':
            word = word[:-1]

        return word
695
696
def reth_schek_phonetik(word):
    """Return Reth-Schek Phonetik code for a word.

    Convenience wrapper: builds a :py:class:`RethSchek` instance and
    delegates to :py:meth:`RethSchek.encode`.

    :param str word: the word to transform
    :returns: the Reth-Schek Phonetik code
    :rtype: str

    >>> reth_schek_phonetik('Joachim')
    'JOAGHIM'
    >>> reth_schek_phonetik('Christoph')
    'GHRISDOF'
    >>> reth_schek_phonetik('Jörg')
    'JOERG'
    >>> reth_schek_phonetik('Smith')
    'SMID'
    >>> reth_schek_phonetik('Schmidt')
    'SCHMID'
    """
    encoder = RethSchek()
    return encoder.encode(word)
718
719
if __name__ == '__main__':
    # When the module is executed as a script, run the doctests embedded in
    # its docstrings.
    import doctest

    doctest.testmod()
724

chrislit / abydos

Pull Request — master (#135)

abydos.phonetic._de.Haase.encode() F

Complexity

Size

Duplication

Code Coverage

Importance

How to fix Long Method Complexity

Long Method

Complexity

Duplication Side-by-Side

Filter issues like