abydos.phonetic._haase.Haase.encode()   F
last analyzed

Complexity

Conditions 39

Size

Total Lines 202
Code Lines 106

Duplication

Lines 47
Ratio 23.27 %

Code Coverage

Tests 87
CRAP Score 39

Importance

Changes 0
Metric Value
eloc 106
dl 47
loc 202
ccs 87
cts 87
cp 1
rs 0
c 0
b 0
f 0
cc 39
nop 2
crap 39

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

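Commonly applied refactorings include Extract Method and, when many parameters or temporary variables are involved, Replace Temp with Query, Introduce Parameter Object, and Preserve Whole Object.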

Complexity

Complex classes like abydos.phonetic._haase.Haase.encode() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# Copyright 2018-2020 by Christopher C. Little.
2
# This file is part of Abydos.
3
#
4
# Abydos is free software: you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation, either version 3 of the License, or
7
# (at your option) any later version.
8
#
9
# Abydos is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
# GNU General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17
"""abydos.phonetic._haase.
18
19 1
Haase Phonetik
20
"""
21
22
from itertools import product
23
from typing import List, Set, Tuple, Union, cast
24 1
from unicodedata import normalize as unicode_normalize
25
26
from ._phonetic import _Phonetic
27
28
__all__ = ['Haase']
29
30
31 1
class Haase(_Phonetic):
    """Haase Phonetik.

    Based on the algorithm described at :cite:`Prante:2015`.

    Based on the original :cite:`Haase:2000`.

    .. versionadded:: 0.3.6
    """

    _uc_v_set = set('AEIJOUY')

    # Translation table (code point of a digit -> letter) used by
    # encode_alpha to turn the numeric code into its alphabetic form.
    _alphabetic = {
        ord(digit): letter
        for digit, letter in zip('123456789', 'PTFKLNRSA')
    }

    # Three-letter sequences and the alternative spelling each one spawns.
    _haase_len_3_vars = {
        'OWN': 'AUN',
        'WSK': 'RSK',
        'SCH': 'CH',
        'GLI': 'LI',
        'AUX': 'O',
        'EUX': 'O',
    }

    # Word-final single letters and the (original, alternative) pair for each.
    _haase_final_vars = {'A': ('A', 'AR'), 'O': ('O', 'OW')}

    # Consonants whose digit does not depend on the neighbouring letters.
    _haase_plain_codes = {
        'B': '1',
        'F': '3',
        'V': '3',
        'W': '3',
        'G': '4',
        'K': '4',
        'Q': '4',
        'L': '5',
        'M': '6',
        'N': '6',
        'R': '7',
        'S': '8',
        'Z': '8',
    }

    def __init__(self, primary_only: bool = False) -> None:
        """Initialize Haase instance.

        Parameters
        ----------
        primary_only : bool
            If True, only the primary code is returned


        .. versionadded:: 0.4.0

        """
        self._primary_only = primary_only

    def encode_alpha(self, word: str) -> str:
        """Return the alphabetic Haase Phonetik code for a word.

        The numeric code from :meth:`encode` is computed first and each digit
        is then mapped to a letter; the separating commas are left untouched.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The alphabetic Haase Phonetik value

        Examples
        --------
        >>> pe = Haase()
        >>> pe.encode_alpha('Joachim')
        'AKAN'
        >>> pe.encode_alpha('Christoph')
        'KRASTAF,SRASTAF'
        >>> pe.encode_alpha('Jörg')
        'ARK'
        >>> pe.encode_alpha('Smith')
        'SNAT'
        >>> pe.encode_alpha('Schmidt')
        'SNAT,KNAT'


        .. versionadded:: 0.4.0
        .. versionchanged:: 0.6.0
            Made return a str only (comma-separated)

        """
        return self.encode(word).translate(self._alphabetic)

    def encode(self, word: str) -> str:
        """Return the Haase Phonetik (numeric output) code for a word.

        While the output code is numeric, it is nevertheless a str.

        The word is upper-cased and filtered down to the letters in
        ``self._uc_set``, expanded into its spelling variants (unless the
        instance was created with ``primary_only=True``), and every variant
        is encoded. Distinct codes are returned in first-seen order.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The Haase Phonetik value as a numeric string; multiple codes are
            joined with commas

        Examples
        --------
        >>> pe = Haase()
        >>> pe.encode('Joachim')
        '9496'
        >>> pe.encode('Christoph')
        '4798293,8798293'
        >>> pe.encode('Jörg')
        '974'
        >>> pe.encode('Smith')
        '8692'
        >>> pe.encode('Schmidt')
        '8692,4692'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class
        .. versionchanged:: 0.6.0
            Made return a str only (comma-separated)

        """
        word = unicode_normalize('NFKD', word.upper())

        # NOTE(review): NFKD has already split precomposed umlauts into a base
        # letter plus a combining mark, so these replacements probably never
        # match. They are kept so that behaviour stays unchanged -- confirm
        # the intended order before touching them.
        word = word.replace('Ä', 'AE')
        word = word.replace('Ö', 'OE')
        word = word.replace('Ü', 'UE')
        # _uc_set is not defined in this class; presumably inherited from
        # _Phonetic.
        word = ''.join(c for c in word if c in self._uc_set)

        # Encode every variant and keep the first occurrence of each code.
        seen = set()  # type: Set[str]
        unique_codes = []  # type: List[str]
        for variant in self._haase_variants(word):
            code = self._haase_code(variant)
            if code not in seen:
                seen.add(code)
                unique_codes.append(code)

        # There is always at least one variant, so joining a single code
        # yields that code unchanged.
        return ','.join(unique_codes)

    def _haase_variants(self, word: str) -> List[str]:
        """Return the spelling variants of an already-normalized word.

        Parameters
        ----------
        word : str
            Upper-case word as prepared by :meth:`encode`

        Returns
        -------
        list of str
            Only ``word`` itself when ``primary_only`` is set; otherwise
            every combination of the alternative spellings, with the
            unmodified word first

        """
        if self._primary_only:
            return [word]

        # Each entry holds the alternatives for one consecutive slice of word.
        parts = []  # type: List[Tuple[str, ...]]
        length = len(word)
        pos = 0

        if word.startswith('CH'):
            parts.append(('CH', 'SCH'))
            pos = 2

        while pos < length:
            remaining = length - pos
            trigram = word[pos : pos + 3]
            if word.startswith('ILLE', pos):
                parts.append(('ILLE', 'I'))
                pos += 4
            elif trigram in self._haase_len_3_vars:
                parts.append((trigram, self._haase_len_3_vars[trigram]))
                pos += 3
            elif word.startswith('RB', pos):
                parts.append(('RB', 'RW'))
                pos += 2
            elif remaining == 3 and trigram == 'EAU':
                # EAU only has an alternative at the very end of the word.
                parts.append(('EAU', 'O'))
                pos += 3
            elif remaining == 1 and word[pos] in self._haase_final_vars:
                parts.append(self._haase_final_vars[word[pos]])
                pos += 1
            else:
                parts.append((word[pos],))
                pos += 1

        # product() of no parts yields a single empty tuple, so an empty word
        # still produces exactly one (empty) variant.
        return [''.join(letters) for letters in product(*parts)]

    def _haase_code(self, word: str) -> str:
        """Return the numeric code of a single spelling variant.

        Parameters
        ----------
        word : str
            Upper-case spelling variant to encode

        Returns
        -------
        str
            The digit string after ``self._delete_consecutive_repeats`` has
            been applied to it

        """
        last = len(word) - 1
        digits = []  # type: List[str]

        for i, char in enumerate(word):
            # '' stands for "no neighbour"; it is in none of the sets below.
            prev_char = word[i - 1] if i > 0 else ''
            next_char = word[i + 1] if i < last else ''

            if char in self._uc_v_set:
                digits.append('9')
            elif char == 'P':
                digits.append('3' if next_char == 'H' else '1')
            elif char in {'D', 'T'}:
                digits.append('8' if next_char in {'C', 'S', 'Z'} else '2')
            elif char == 'C':
                # A word-initial C accepts two extra followers (L and R).
                if i == 0:
                    hard_followers = {
                        'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X',
                    }
                else:
                    hard_followers = {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}
                if prev_char in {'S', 'Z'}:
                    digits.append('8')
                elif next_char in hard_followers:
                    digits.append('4')
                else:
                    digits.append('8')
            elif char == 'X':
                digits.append('8' if prev_char in {'C', 'K', 'Q'} else '48')
            else:
                # Letters without an entry (e.g. H) contribute nothing.
                digits.append(self._haase_plain_codes.get(char, ''))

        # _delete_consecutive_repeats is not defined in this class;
        # presumably inherited from _Phonetic.
        return self._delete_consecutive_repeats(''.join(digits))
296 1
297
298 1
if __name__ == '__main__':
    # When executed as a script, run the doctest examples in this module.
    from doctest import testmod

    testmod()
302