abydos.phonetic._haase.Haase.encode() - Code Metrics - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

abydos.phonetic._haase.Haase.encode() F
last analyzed 2020-12-31 20:10 UTC

↳ Parent: abydos.phonetic._haase

Complexity

Conditions

Size

Total Lines	202
Code Lines	106

Duplication

Lines	47
Ratio	23.27 %

Code Coverage

Tests	87
CRAP Score	39

Importance

Changes

Metric	Value
eloc	106
dl	47
loc	202
ccs	87
cts	87
cp	1
rs	0
c	0
b	0
f	0
cc	39
nop	2
crap	39

How to fix Long Method Complexity

# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.phonetic._haase.

Haase Phonetik
"""

from itertools import product
from typing import List, Set, Tuple, Union, cast
from unicodedata import normalize as unicode_normalize

from ._phonetic import _Phonetic

__all__ = ['Haase']


class Haase(_Phonetic):
    """Haase Phonetik.

    Based on the algorithm described at :cite:`Prante:2015`.

    Based on the original :cite:`Haase:2000`.

    .. versionadded:: 0.3.6
    """

    # Letters that receive the vowel code '9' (note that J and Y are included).
    _uc_v_set = set('AEIJOUY')

    # Translation table turning each numeric code digit into its letter code.
    _alphabetic = dict(zip(map(ord, '123456789'), 'PTFKLNRSA'))

    # Precomposed upper-case umlauts (by code point) and their expansions.
    _umlaut_expansions = {0xC4: 'AE', 0xD6: 'OE', 0xDC: 'UE'}

    # Three-letter sequences that spawn an alternative spelling.
    _trigraph_variants = {
        'OWN': 'AUN',
        'WSK': 'RSK',
        'SCH': 'CH',
        'GLI': 'LI',
        'AUX': 'O',
        'EUX': 'O',
    }

    # Consonants whose code does not depend on the neighbouring letters.
    _context_free_codes = {
        'B': '1',
        'F': '3',
        'V': '3',
        'W': '3',
        'G': '4',
        'K': '4',
        'Q': '4',
        'L': '5',
        'M': '6',
        'N': '6',
        'R': '7',
        'S': '8',
        'Z': '8',
    }

    # Letters after which D/T is coded '8' instead of '2'.
    _dt_sibilant_followers = frozenset('CSZ')
    # Letters before which C is coded '8' regardless of what follows.
    _c_sibilant_predecessors = frozenset('SZ')
    # Letters after which a word-initial / non-initial C is coded '4'.
    _c_initial_hard_followers = frozenset('AHKLOQRUX')
    _c_medial_hard_followers = frozenset('AHKOQUX')
    # Letters after which X contributes only '8' instead of '48'.
    _x_velar_predecessors = frozenset('CKQ')

    def __init__(self, primary_only: bool = False) -> None:
        """Initialize Haase instance.

        Parameters
        ----------
        primary_only : bool
            If True, only the primary code is returned


        .. versionadded:: 0.4.0

        """
        self._primary_only = primary_only

    def encode_alpha(self, word: str) -> str:
        """Return the alphabetic Haase Phonetik code for a word.

        The numeric code produced by :meth:`encode` is translated digit by
        digit into letters; the comma separating alternative codes is kept.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The alphabetic Haase Phonetik value

        Examples
        --------
        >>> pe = Haase()
        >>> pe.encode_alpha('Joachim')
        'AKAN'
        >>> pe.encode_alpha('Christoph')
        'KRASTAF,SRASTAF'
        >>> pe.encode_alpha('Jörg')
        'ARK'
        >>> pe.encode_alpha('Smith')
        'SNAT'
        >>> pe.encode_alpha('Schmidt')
        'SNAT,KNAT'


        .. versionadded:: 0.4.0
        .. versionchanged:: 0.6.0
            Made return a str only (comma-separated)

        """
        return self.encode(word).translate(self._alphabetic)

    def encode(self, word: str) -> str:
        """Return the Haase Phonetik (numeric output) code for a word.

        While the output code is numeric, it is nevertheless a str. Unless
        the instance was created with ``primary_only=True``, every
        alternative spelling of the word is coded and the distinct codes are
        returned comma-separated, in the order they were first produced.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The Haase Phonetik value as a numeric string

        Examples
        --------
        >>> pe = Haase()
        >>> pe.encode('Joachim')
        '9496'
        >>> pe.encode('Christoph')
        '4798293,8798293'
        >>> pe.encode('Jörg')
        '974'
        >>> pe.encode('Smith')
        '8692'
        >>> pe.encode('Schmidt')
        '8692,4692'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class
        .. versionchanged:: 0.6.0
            Made return a str only (comma-separated)

        """
        word = self._prepare(word)

        if self._primary_only:
            spellings = [word]
        else:
            spellings = self._spelling_variants(word)

        # Different spellings frequently collapse to the same code; keep
        # only the first occurrence of each so the primary code stays first.
        codes = []  # type: List[str]
        seen = set()  # type: Set[str]
        for spelling in spellings:
            code = self._haase_code(spelling)
            if code not in seen:
                seen.add(code)
                codes.append(code)
        return ','.join(codes)

    def _prepare(self, word: str) -> str:
        """Upper-case a word, expand umlauts and drop unsupported characters.

        Parameters
        ----------
        word : str
            The raw input word

        Returns
        -------
        str
            The word reduced to the characters found in ``self._uc_set``

        """
        # Compose first so that an umlaut is a single code point whether it
        # arrived precomposed or as letter + combining diaeresis. Expanding
        # must happen before NFKD, which would otherwise split the umlaut
        # and make the AE/OE/UE expansion impossible to match.
        word = unicode_normalize('NFC', word.upper())
        word = word.translate(self._umlaut_expansions)
        word = unicode_normalize('NFKD', word)
        # _uc_set is inherited from _Phonetic (presumably the upper-case
        # letters A-Z -- confirm against the base class).
        return ''.join(char for char in word if char in self._uc_set)

    def _spelling_variants(self, word: str) -> List[str]:
        """Return every alternative spelling of an already prepared word.

        The word is scanned left to right and split into segments. A segment
        is either a single letter or a known letter sequence paired with its
        alternative; the result is the cross product of all segments, with
        the unmodified spelling first.

        Parameters
        ----------
        word : str
            The prepared (upper-case, filtered) word

        Returns
        -------
        list
            All spellings, the original one first

        """
        segments = []  # type: List[Tuple[str, ...]]
        pos = 0

        # Word-initial CH is the only rule anchored at the start.
        if word.startswith('CH'):
            segments.append(('CH', 'SCH'))
            pos = 2

        while pos < len(word):
            trigraph = word[pos : pos + 3]
            rest = word[pos:]
            if word.startswith('ILLE', pos):
                segments.append(('ILLE', 'I'))
                pos += 4
            elif trigraph in self._trigraph_variants:
                segments.append((trigraph, self._trigraph_variants[trigraph]))
                pos += 3
            elif word.startswith('RB', pos):
                segments.append(('RB', 'RW'))
                pos += 2
            # The remaining rules only apply at the very end of the word.
            elif rest == 'EAU':
                segments.append(('EAU', 'O'))
                pos += 3
            elif rest == 'O':
                segments.append(('O', 'OW'))
                pos += 1
            elif rest == 'A':
                segments.append(('A', 'AR'))
                pos += 1
            else:
                segments.append((word[pos],))
                pos += 1

        return [''.join(choice) for choice in product(*segments)]

    def _haase_code(self, word: str) -> str:
        """Return the numeric code of a single spelling.

        Parameters
        ----------
        word : str
            One spelling of the prepared word

        Returns
        -------
        str
            The digits for that spelling, passed through
            ``self._delete_consecutive_repeats``

        """
        digits = []  # type: List[str]
        last = len(word) - 1
        for i, char in enumerate(word):
            # Empty string stands for "no neighbour"; it is never a member
            # of the (frozen)sets tested below.
            prev_char = word[i - 1] if i > 0 else ''
            next_char = word[i + 1] if i < last else ''

            if char in self._uc_v_set:
                digits.append('9')
            elif char in self._context_free_codes:
                digits.append(self._context_free_codes[char])
            elif char == 'P':
                digits.append('3' if next_char == 'H' else '1')
            elif char in {'D', 'T'}:
                if next_char in self._dt_sibilant_followers:
                    digits.append('8')
                else:
                    digits.append('2')
            elif char == 'C':
                # A word-initial C additionally hardens before L and R.
                if i == 0:
                    hard_followers = self._c_initial_hard_followers
                else:
                    hard_followers = self._c_medial_hard_followers
                if (
                    prev_char not in self._c_sibilant_predecessors
                    and next_char in hard_followers
                ):
                    digits.append('4')
                else:
                    digits.append('8')
            elif char == 'X':
                if prev_char in self._x_velar_predecessors:
                    digits.append('8')
                else:
                    digits.append('48')
            # Any other letter (e.g. H) contributes nothing to the code.

        return self._delete_consecutive_repeats(''.join(digits))


if __name__ == '__main__':
    # Run the docstring examples of this module as a self-test.
    from doctest import testmod

    testmod()


1			# Copyright 2018-2020 by Christopher C. Little.
2			# This file is part of Abydos.
3			#
4			# Abydos is free software: you can redistribute it and/or modify
5			# it under the terms of the GNU General Public License as published by
6			# the Free Software Foundation, either version 3 of the License, or
7			# (at your option) any later version.
8			#
9			# Abydos is distributed in the hope that it will be useful,
10			# but WITHOUT ANY WARRANTY; without even the implied warranty of
11			# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12			# GNU General Public License for more details.
13			#
14			# You should have received a copy of the GNU General Public License
15			# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17			"""abydos.phonetic._haase.
18
19	1		Haase Phonetik
20			"""
21
22			from itertools import product
23			from typing import List, Set, Tuple, Union, cast
24	1		from unicodedata import normalize as unicode_normalize
25
26			from ._phonetic import _Phonetic
27
28			__all__ = ['Haase']
29
30
31	1		class Haase(_Phonetic):
32	1		"""Haase Phonetik.
33
34	1		Based on the algorithm described at :cite:`Prante:2015`.
35
36	1		Based on the original :cite:`Haase:2000`.
37	1
38			.. versionadded:: 0.3.6
39	1		"""
40	1
41			_uc_v_set = set('AEIJOUY')
42	1
43			_alphabetic = dict(zip((ord(_) for _ in '123456789'), 'PTFKLNRSA'))
			0 ignored issues – show Comprehensibility Best Practice introduced 2019-02-15 07:24 UTC by Report Bug Copy Issue Report The variable `_` does not seem to be defined. Loading history...
44
45	1		def __init__(self, primary_only: bool = False) -> None:
46			"""Initialize Haase instance.
47
48			Parameters
49			----------
50			primary_only : bool
51			If True, only the primary code is returned
52
53
54			.. versionadded:: 0.4.0
55	1
56			"""
57	1		self._primary_only = primary_only
58
59	1		def encode_alpha(self, word: str) -> str:
60			"""Return the alphabetic Haase Phonetik code for a word.
61
62			Parameters
63			----------
64			word : str
65			The word to transform
66
67			Returns
68			-------
69			str
70			The alphabetic Haase Phonetik value
71	1
72			Examples
73	1		--------
74			>>> pe = Haase()
75			>>> pe.encode_alpha('Joachim')
76			'AKAN'
77			>>> pe.encode_alpha('Christoph')
78			'KRASTAF,SRASTAF'
79			>>> pe.encode_alpha('Jörg')
80			'ARK'
81			>>> pe.encode_alpha('Smith')
82			'SNAT'
83			>>> pe.encode_alpha('Schmidt')
84			'SNAT,KNAT'
85
86
87			.. versionadded:: 0.4.0
88			.. versionchanged:: 0.6.0
89			Made return a str only (comma-separated)
90
91			"""
92			return self.encode(word).translate(self._alphabetic)
93
94			def encode(self, word: str) -> str:
95			"""Return the Haase Phonetik (numeric output) code for a word.
96
97			While the output code is numeric, it is nevertheless a str.
98
99			Parameters
100			----------
101			word : str
102			The word to transform
103
104	1		Returns
105			-------
106			str
107			The Haase Phonetik value as a numeric string
108	1
109			Examples
110			--------
111			>>> pe = Haase()
112			>>> pe.encode('Joachim')
113			'9496'
114			>>> pe.encode('Christoph')
115			'4798293,8798293'
116			>>> pe.encode('Jörg')
117			'974'
118			>>> pe.encode('Smith')
119			'8692'
120			>>> pe.encode('Schmidt')
121			'8692,4692'
122
123
124			.. versionadded:: 0.3.0
125			.. versionchanged:: 0.3.6
126			Encapsulated in class
127			.. versionchanged:: 0.6.0
128			Made return a str only (comma-separated)
129
130			"""
131
132			def _after(word: str, pos: int, letters: Set[str]) -> bool:
133			"""Return True if word[pos] follows one of the supplied letters.
134
135			Parameters
136			----------
137			word : str
138			Word to modify
139			pos : int
140			Position to examine
141			letters : set
142			Letters to check for
143
144	1		Returns
145			-------
146			bool
147			True if word[pos] follows one of letters
148
149			.. versionadded:: 0.3.0
150
151			"""
152			if pos > 0 and word[pos - 1] in letters:
153			return True
154			return False
155
156			def _before(word: str, pos: int, letters: Set[str]) -> bool:
157			"""Return True if word[pos] precedes one of the supplied letters.
158
159			Parameters
160			----------
161			word : str
162			Word to modify
163			pos : int
164	1		Position to examine
165	1		letters : set
166	1		Letters to check for
167
168	1		Returns
169			-------
170			bool
171			True if word[pos] precedes one of letters
172
173			.. versionadded:: 0.3.0
174
175			"""
176			if pos + 1 < len(word) and word[pos + 1] in letters:
177			return True
178			return False
179
180			word = unicode_normalize('NFKD', word.upper())
181
182			word = word.replace('Ä', 'AE')
183			word = word.replace('Ö', 'OE')
184			word = word.replace('Ü', 'UE')
185			word = ''.join(c for c in word if c in self._uc_set)
186
187			variants = [] # type: List[Union[str, Tuple[str, ...]]]
188	1		if self._primary_only:
189	1		variants = [word]
190	1		else:
191			pos = 0
192	1		if word[:2] == 'CH':
193	1		variants.append(('CH', 'SCH'))
194			pos += 2
195	1		len_3_vars = {
196	1		'OWN': 'AUN',
197	1		'WSK': 'RSK',
198	1		'SCH': 'CH',
199			'GLI': 'LI',
200	1		'AUX': 'O',
201	1		'EUX': 'O',
202	1		}
203			while pos < len(word):
204	1		if word[pos : pos + 4] == 'ILLE':
205	1		variants.append(('ILLE', 'I'))
206	1		pos += 4
207	1		elif word[pos : pos + 3] in len_3_vars:
208	1		variants.append(
209			(word[pos : pos + 3], len_3_vars[word[pos : pos + 3]])
210			)
211			pos += 3
212			elif word[pos : pos + 2] == 'RB':
213			variants.append(('RB', 'RW'))
214			pos += 2
215			elif len(word[pos:]) == 3 and word[pos:] == 'EAU':
216	1		variants.append(('EAU', 'O'))
217	1		pos += 3
218	1		elif len(word[pos:]) == 1 and word[pos:] in {'A', 'O'}:
219	1		if word[pos:] == 'O':
220	1		variants.append(('O', 'OW'))
221	1		else:
222			variants.append(('A', 'AR'))
223			pos += 1
224	1		else:
225	1		variants.append((word[pos],))
226	1		pos += 1
227	1
228	1		variants = [''.join(letters) for letters in product(*variants)]
229	1
230	1		def _haase_code(word: str) -> str:
231	1		sdx = ''
232	1		for i in range(len(word)):
233	1	View Code Duplication	if word[i] in self._uc_v_set:
			0 ignored issues – show Duplication introduced 2018-11-04 08:02 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
234			sdx += '9'
235	1		elif word[i] == 'B':
236	1		sdx += '1'
237			elif word[i] == 'P':
238	1		if _before(word, i, {'H'}):
239	1		sdx += '3'
240			else:
241	1		sdx += '1'
242			elif word[i] in {'D', 'T'}:
243	1		if _before(word, i, {'C', 'S', 'Z'}):
244	1		sdx += '8'
245	1		else:
246	1		sdx += '2'
247	1		elif word[i] in {'F', 'V', 'W'}:
248	1		sdx += '3'
249	1		elif word[i] in {'G', 'K', 'Q'}:
250	1		sdx += '4'
251	1		elif word[i] == 'C':
252	1		if _after(word, i, {'S', 'Z'}):
253			sdx += '8'
254	1		elif i == 0:
255	1		if _before(
256	1		word,
257	1		i,
258			{'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'},
259	1		):
260	1		sdx += '4'
261	1		else:
262	1		sdx += '8'
263	1		elif _before(word, i, {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}):
264	1		sdx += '4'
265	1		else:
266	1		sdx += '8'
267	1		elif word[i] == 'X':
268	1		if _after(word, i, {'C', 'K', 'Q'}):
269			sdx += '8'
270			else:
271			sdx += '48'
272			elif word[i] == 'L':
273	1		sdx += '5'
274			elif word[i] in {'M', 'N'}:
275	1		sdx += '6'
276	1		elif word[i] == 'R':
277	1		sdx += '7'
278			elif word[i] in {'S', 'Z'}:
279	1		sdx += '8'
280	1
281	1		sdx = self._delete_consecutive_repeats(sdx)
282	1
283			return sdx
284	1
285	1		encoded = [_haase_code(word) for word in cast(List[str], variants)]
286	1		if len(encoded) > 1:
287	1		encoded_set = set() # type: Set[str]
288	1		encoded_single = []
289	1		for code in encoded:
290	1		if code not in encoded_set:
291	1		encoded_set.add(code)
292	1		encoded_single.append(code)
293			return ','.join(encoded_single)
294	1
295			return encoded[0]
296	1
297
298	1		if __name__ == '__main__':
299	1		import doctest
300	1
301			doctest.testmod()
302

chrislit / abydos

abydos.phonetic._haase.Haase.encode() F last analyzed 2020-12-31 20:10 UTC

Complexity

Size

Duplication

Code Coverage

Importance

How to fix Long Method Complexity

Long Method

Complexity

Duplication Side-by-Side

Filter issues like

abydos.phonetic._haase.Haase.encode() F
last analyzed 2020-12-31 20:10 UTC