abydos.phonetic._haase.haase_phonetik() - Code Metrics - Inspection of "0.3.6" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#141)

by Chris

created 2018-11-10 03:25 UTC

abydos.phonetic._haase.haase_phonetik() A

↳ Parent: abydos.phonetic._haase

Complexity

Conditions

Size

Total Lines	26
Code Lines	2

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	2
CRAP Score	1

Importance

Changes

Metric	Value
cc	1
eloc	2
nop	2
dl	0
loc	26
ccs	2
cts	2
cp	1
crap	1
rs	10
c	0
b	0
f	0

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._haase.

Haase Phonetik
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from itertools import product
from unicodedata import normalize as unicode_normalize

from six import text_type
from six.moves import range

from ._phonetic import Phonetic

__all__ = ['Haase', 'haase_phonetik']


class Haase(Phonetic):
    """Haase Phonetik.

    Based on the algorithm described at :cite:`Prante:2015`.

    Based on the original :cite:`Haase:2000`.
    """

    # Letters that are encoded as a vowel ('9').
    _uc_v_set = set('AEIJOUY')

    def encode(self, word, primary_only=False):
        """Return the Haase Phonetik (numeric output) code for a word.

        While the output code is numeric, it is nevertheless a str.

        Unless ``primary_only`` is set, a number of spelling variants of the
        word are generated (e.g. ``SCH``/``CH``, ``RB``/``RW``), each variant
        is encoded, and the distinct codes are returned in the order in
        which they were first produced.

        Args:
            word (str): The word to transform
            primary_only (bool): If True, only the primary code is returned

        Returns:
            tuple: The Haase Phonetik code(s), each as a numeric string

        Examples:
            >>> pe = Haase()
            >>> pe.encode('Joachim')
            ('9496',)
            >>> pe.encode('Christoph')
            ('4798293', '8798293')
            >>> pe.encode('Jörg')
            ('974',)
            >>> pe.encode('Smith')
            ('8692',)
            >>> pe.encode('Schmidt')
            ('8692', '4692')

        """

        def _after(text, pos, letters):
            """Return True if text[pos] follows one of the supplied letters.

            Args:
                text (str): Word to examine
                pos (int): Position to examine
                letters (set): Letters to check for

            Returns:
                bool: True if text[pos] follows one of letters

            """
            return pos > 0 and text[pos - 1] in letters

        def _before(text, pos, letters):
            """Return True if text[pos] precedes one of the supplied letters.

            Args:
                text (str): Word to examine
                pos (int): Position to examine
                letters (set): Letters to check for

            Returns:
                bool: True if text[pos] precedes one of letters

            """
            return pos + 1 < len(text) and text[pos + 1] in letters

        def _haase_code(text):
            """Return the numeric code of a single spelling variant.

            Args:
                text (str): One uppercase spelling variant of the word

            Returns:
                str: The code with consecutive repeated digits collapsed

            """
            digits = []
            for i, char in enumerate(text):
                if char in self._uc_v_set:
                    digits.append('9')
                elif char == 'B':
                    digits.append('1')
                elif char == 'P':
                    # PH is coded like F.
                    digits.append('3' if _before(text, i, {'H'}) else '1')
                elif char in {'D', 'T'}:
                    # A dental directly before C, S or Z is coded as a
                    # sibilant.
                    if _before(text, i, {'C', 'S', 'Z'}):
                        digits.append('8')
                    else:
                        digits.append('2')
                elif char in {'F', 'V', 'W'}:
                    digits.append('3')
                elif char in {'G', 'K', 'Q'}:
                    digits.append('4')
                elif char == 'C':
                    # Word-initially, L and R additionally make C "hard".
                    if i == 0:
                        hard_followers = {
                            'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X',
                        }
                    else:
                        hard_followers = {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}
                    # A preceding S or Z always wins and yields '8'.
                    if not _after(text, i, {'S', 'Z'}) and _before(
                        text, i, hard_followers
                    ):
                        digits.append('4')
                    else:
                        digits.append('8')
                elif char == 'X':
                    if _after(text, i, {'C', 'K', 'Q'}):
                        digits.append('8')
                    else:
                        digits.append('48')
                elif char == 'L':
                    digits.append('5')
                elif char in {'M', 'N'}:
                    digits.append('6')
                elif char == 'R':
                    digits.append('7')
                elif char in {'S', 'Z'}:
                    digits.append('8')
                # Any other letter (e.g. H) contributes nothing.

            return self._delete_consecutive_repeats(''.join(digits))

        # Compose first, so that an umlaut supplied as base letter plus
        # combining diaeresis is seen as one code point below.
        word = unicode_normalize('NFC', text_type(word.upper()))
        word = word.replace('\u00df', 'SS')  # sharp s

        # The umlauts have to be expanded *before* the NFKD decomposition:
        # NFKD splits them into base letter + combining mark, after which
        # these replacements could never match.
        word = word.replace('\u00c4', 'AE')  # A umlaut
        word = word.replace('\u00d6', 'OE')  # O umlaut
        word = word.replace('\u00dc', 'UE')  # U umlaut

        # Strip remaining diacritics and drop everything outside _uc_set
        # (supplied by the Phonetic base class).
        word = unicode_normalize('NFKD', word)
        word = ''.join(c for c in word if c in self._uc_set)

        if primary_only:
            variants = [word]
        else:
            # Each entry is a tuple of alternative spellings for one segment
            # of the word; the cartesian product yields all variants, with
            # the unmodified spelling first.
            segments = []
            pos = 0
            if word[:2] == 'CH':
                segments.append(('CH', 'SCH'))
                pos += 2
            len_3_vars = {
                'OWN': 'AUN',
                'WSK': 'RSK',
                'SCH': 'CH',
                'GLI': 'LI',
                'AUX': 'O',
                'EUX': 'O',
            }
            while pos < len(word):
                rest = word[pos:]
                if rest[:4] == 'ILLE':
                    segments.append(('ILLE', 'I'))
                    pos += 4
                elif rest[:3] in len_3_vars:
                    segments.append((rest[:3], len_3_vars[rest[:3]]))
                    pos += 3
                elif rest[:2] == 'RB':
                    segments.append(('RB', 'RW'))
                    pos += 2
                elif rest == 'EAU':
                    # Only a word-final EAU has a variant.
                    segments.append(('EAU', 'O'))
                    pos += 3
                elif rest == 'O':
                    # Word-final O.
                    segments.append(('O', 'OW'))
                    pos += 1
                elif rest == 'A':
                    # Word-final A.
                    segments.append(('A', 'AR'))
                    pos += 1
                else:
                    segments.append((word[pos],))
                    pos += 1

            variants = [''.join(letters) for letters in product(*segments)]

        # Encode every variant, keeping only the first occurrence of each
        # code so that the primary code stays in front.
        seen = set()
        codes = []
        for variant in variants:
            code = _haase_code(variant)
            if code not in seen:
                seen.add(code)
                codes.append(code)
        return tuple(codes)


def haase_phonetik(word, primary_only=False):
    """Encode a word with Haase Phonetik and return its numeric code(s).

    Convenience function: it creates a :py:class:`Haase` instance and
    delegates to :py:meth:`Haase.encode`.

    Args:
        word (str): The word to transform
        primary_only (bool): If True, only the primary code is returned

    Returns:
        tuple: The Haase Phonetik code(s), each as a numeric string

    Examples:
        >>> haase_phonetik('Joachim')
        ('9496',)
        >>> haase_phonetik('Christoph')
        ('4798293', '8798293')
        >>> haase_phonetik('Jörg')
        ('974',)
        >>> haase_phonetik('Smith')
        ('8692',)
        >>> haase_phonetik('Schmidt')
        ('8692', '4692')

    """
    encoder = Haase()
    return encoder.encode(word, primary_only)


if __name__ == '__main__':
    # Run as a script: execute the doctests embedded in this module's
    # docstrings.
    from doctest import testmod

    testmod()


1			# -- coding: utf-8 --
2
3			# Copyright 2014-2018 by Christopher C. Little.
4			# This file is part of Abydos.
5			#
6			# Abydos is free software: you can redistribute it and/or modify
7			# it under the terms of the GNU General Public License as published by
8			# the Free Software Foundation, either version 3 of the License, or
9			# (at your option) any later version.
10			#
11			# Abydos is distributed in the hope that it will be useful,
12			# but WITHOUT ANY WARRANTY; without even the implied warranty of
13			# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14			# GNU General Public License for more details.
15			#
16			# You should have received a copy of the GNU General Public License
17			# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1		"""abydos.phonetic._haase.
20
21			Haase Phonetik
22			"""
23
24	1		from __future__ import (
25			absolute_import,
26			division,
27			print_function,
28			unicode_literals,
29			)
30
31	1		from itertools import product
32	1		from unicodedata import normalize as unicode_normalize
33
34	1		from six import text_type
35	1		from six.moves import range
36
37	1		from ._phonetic import Phonetic
38
39	1		__all__ = ['Haase', 'haase_phonetik']
40
41
42	1		class Haase(Phonetic):
			0 ignored issues – show Unused Code introduced 2018-11-10 01:42 UTC by Report Bug Copy Issue Report The variable `__class__` seems to be unused. Loading history...
43			"""Haase Phonetik.
44
45			Based on the algorithm described at :cite:`Prante:2015`.
46
47			Based on the original :cite:`Haase:2000`.
48			"""
49
50	1		_uc_v_set = set('AEIJOUY')
51
52	1		def encode(self, word, primary_only=False):
			0 ignored issues – show Bug introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Parameters differ from overridden 'encode' method Loading history...
53			"""Return the Haase Phonetik (numeric output) code for a word.
54
55			While the output code is numeric, it is nevertheless a str.
56
57			Args:
58			word (str): The word to transform
59			primary_only (bool): If True, only the primary code is returned
60
61			Returns:
62			tuple: The Haase Phonetik value as a numeric string
63
64			Examples:
65			>>> pe = Haase()
66			>>> pe.encode('Joachim')
67			('9496',)
68			>>> pe.encode('Christoph')
69			('4798293', '8798293')
70			>>> pe.encode('Jörg')
71			('974',)
72			>>> pe.encode('Smith')
73			('8692',)
74			>>> pe.encode('Schmidt')
75			('8692', '4692')
76
77			"""
78
79	1		def _after(word, pos, letters):
80			"""Return True if word[pos] follows one of the supplied letters.
81
82			Args:
83			word (str): Word to modify
84			pos (int): Position to examine
85			letters (set): Letters to check for
86
87			Returns:
88			bool: True if word[pos] follows one of letters
89
90			"""
91	1		if pos > 0 and word[pos - 1] in letters:
92	1		return True
93	1		return False
94
95	1		def _before(word, pos, letters):
96			"""Return True if word[pos] precedes one of the supplied letters.
97
98			Args:
99			word (str): Word to modify
100			pos (int): Position to examine
101			letters (set): Letters to check for
102
103			Returns:
104			bool: True if word[pos] precedes one of letters
105
106			"""
107	1		if pos + 1 < len(word) and word[pos + 1] in letters:
108	1		return True
109	1		return False
110
111	1		word = unicode_normalize('NFKD', text_type(word.upper()))
112	1		word = word.replace('ß', 'SS')
113
114	1		word = word.replace('Ä', 'AE')
115	1		word = word.replace('Ö', 'OE')
116	1		word = word.replace('Ü', 'UE')
117	1		word = ''.join(c for c in word if c in self._uc_set)
118
119	1		variants = []
120	1		if primary_only:
121	1		variants = [word]
122			else:
123	1		pos = 0
124	1		if word[:2] == 'CH':
125	1		variants.append(('CH', 'SCH'))
126	1		pos += 2
127	1		len_3_vars = {
128			'OWN': 'AUN',
129			'WSK': 'RSK',
130			'SCH': 'CH',
131			'GLI': 'LI',
132			'AUX': 'O',
133			'EUX': 'O',
134			}
135	1		while pos < len(word):
136	1		if word[pos : pos + 4] == 'ILLE':
137	1		variants.append(('ILLE', 'I'))
138	1		pos += 4
139	1		elif word[pos : pos + 3] in len_3_vars:
140	1		variants.append(
141			(word[pos : pos + 3], len_3_vars[word[pos : pos + 3]])
142			)
143	1		pos += 3
144	1		elif word[pos : pos + 2] == 'RB':
145	1		variants.append(('RB', 'RW'))
146	1		pos += 2
147	1		elif len(word[pos:]) == 3 and word[pos:] == 'EAU':
148	1		variants.append(('EAU', 'O'))
149	1		pos += 3
150	1		elif len(word[pos:]) == 1 and word[pos:] in {'A', 'O'}:
151	1		if word[pos:] == 'O':
152	1		variants.append(('O', 'OW'))
153			else:
154	1		variants.append(('A', 'AR'))
155	1		pos += 1
156			else:
157	1		variants.append((word[pos],))
158	1		pos += 1
159
160	1		variants = [''.join(letters) for letters in product(*variants)]
161
162	1		def _haase_code(word):
163	1		sdx = ''
164	1		for i in range(len(word)):
			0 ignored issues – show unused-code introduced 2018-10-20 00:45 UTC by Report Bug Copy Issue Report Consider using enumerate instead of iterating with range and len Loading history...
165	1	View Code Duplication	if word[i] in self._uc_v_set:
			0 ignored issues – show Duplication introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
166	1		sdx += '9'
167	1		elif word[i] == 'B':
168	1		sdx += '1'
169	1		elif word[i] == 'P':
170	1		if _before(word, i, {'H'}):
171	1		sdx += '3'
172			else:
173	1		sdx += '1'
174	1		elif word[i] in {'D', 'T'}:
175	1		if _before(word, i, {'C', 'S', 'Z'}):
176	1		sdx += '8'
177			else:
178	1		sdx += '2'
179	1		elif word[i] in {'F', 'V', 'W'}:
180	1		sdx += '3'
181	1		elif word[i] in {'G', 'K', 'Q'}:
182	1		sdx += '4'
183	1		elif word[i] == 'C':
184	1		if _after(word, i, {'S', 'Z'}):
185	1		sdx += '8'
186	1		elif i == 0:
187	1		if _before(
188			word,
			0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
189			i,
			0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
190			{'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'},
			0 ignored issues – show Coding Style introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
191			):
192	1		sdx += '4'
193			else:
194	1		sdx += '8'
195	1		elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}):
196	1		sdx += '4'
197			else:
198	1		sdx += '8'
199	1		elif word[i] == 'X':
200	1		if _after(word, i, {'C', 'K', 'Q'}):
201	1		sdx += '8'
202			else:
203	1		sdx += '48'
204	1		elif word[i] == 'L':
205	1		sdx += '5'
206	1		elif word[i] in {'M', 'N'}:
207	1		sdx += '6'
208	1		elif word[i] == 'R':
209	1		sdx += '7'
210	1		elif word[i] in {'S', 'Z'}:
211	1		sdx += '8'
212
213	1		sdx = self._delete_consecutive_repeats(sdx)
214
215	1		return sdx
216
217	1		encoded = tuple(_haase_code(word) for word in variants)
218	1		if len(encoded) > 1:
219	1		encoded_set = set()
220	1		encoded_single = []
221	1		for code in encoded:
222	1		if code not in encoded_set:
223	1		encoded_set.add(code)
224	1		encoded_single.append(code)
225	1		return tuple(encoded_single)
226
227	1		return encoded
228
229
230	1		def haase_phonetik(word, primary_only=False):
231			"""Return the Haase Phonetik (numeric output) code for a word.
232
233			This is a wrapper for :py:meth:`Haase.encode`.
234
235			Args:
236			word (str): The word to transform
237			primary_only (bool): If True, only the primary code is returned
238
239			Returns:
240			tuple: The Haase Phonetik value as a numeric string
241
242			Examples:
243			>>> haase_phonetik('Joachim')
244			('9496',)
245			>>> haase_phonetik('Christoph')
246			('4798293', '8798293')
247			>>> haase_phonetik('Jörg')
248			('974',)
249			>>> haase_phonetik('Smith')
250			('8692',)
251			>>> haase_phonetik('Schmidt')
252			('8692', '4692')
253
254			"""
255	1		return Haase().encode(word, primary_only)
256
257
258			if __name__ == '__main__':
259			import doctest
260
261			doctest.testmod()
262

chrislit / abydos

Pull Request — master (#141)

abydos.phonetic._haase.haase_phonetik() A

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like