| Total Complexity | 41 |
| Total Lines | 302 |
| Duplicated Lines | 15.56 % |
| Coverage | 100% |
| Changes | 0 | ||
Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
Complex classes like abydos.phonetic._haase often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
| 1 | # Copyright 2018-2020 by Christopher C. Little. |
||
| 2 | # This file is part of Abydos. |
||
| 3 | # |
||
| 4 | # Abydos is free software: you can redistribute it and/or modify |
||
| 5 | # it under the terms of the GNU General Public License as published by |
||
| 6 | # the Free Software Foundation, either version 3 of the License, or |
||
| 7 | # (at your option) any later version. |
||
| 8 | # |
||
| 9 | # Abydos is distributed in the hope that it will be useful, |
||
| 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
| 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
| 12 | # GNU General Public License for more details. |
||
| 13 | # |
||
| 14 | # You should have received a copy of the GNU General Public License |
||
| 15 | # along with Abydos. If not, see <http://www.gnu.org/licenses/>. |
||
| 16 | |||
| 17 | """abydos.phonetic._haase. |
||
| 18 | |||
| 19 | 1 | Haase Phonetik |
|
| 20 | """ |
||
| 21 | |||
| 22 | from itertools import product |
||
| 23 | from typing import List, Set, Tuple, Union, cast |
||
| 24 | 1 | from unicodedata import normalize as unicode_normalize |
|
| 25 | |||
| 26 | from ._phonetic import _Phonetic |
||
| 27 | |||
| 28 | __all__ = ['Haase'] |
||
| 29 | |||
| 30 | |||
class Haase(_Phonetic):
    """Haase Phonetik.

    Based on the algorithm described at :cite:`Prante:2015`.

    Based on the original :cite:`Haase:2000`.

    .. versionadded:: 0.3.6
    """

    # Letters that are coded as vowels ('9'); note that J and Y are included.
    _uc_v_set = set('AEIJOUY')

    # Translation table from the numeric code digits to the alphabetic code.
    _alphabetic = dict(zip((ord(_) for _ in '123456789'), 'PTFKLNRSA'))

    # Precomposed upper-case umlauts (U+00C4, U+00D6, U+00DC) and the
    # digraphs they are expanded to before encoding.
    _umlauts = {0x00C4: 'AE', 0x00D6: 'OE', 0x00DC: 'UE'}

    def __init__(self, primary_only: bool = False) -> None:
        """Initialize Haase instance.

        Parameters
        ----------
        primary_only : bool
            If True, only the primary code is returned


        .. versionadded:: 0.4.0

        """
        self._primary_only = primary_only

    def encode_alpha(self, word: str) -> str:
        """Return the alphabetic Haase Phonetik code for a word.

        The numeric code produced by :meth:`encode` is translated digit by
        digit into letters; the comma separating multiple codes is kept.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The alphabetic Haase Phonetik value

        Examples
        --------
        >>> pe = Haase()
        >>> pe.encode_alpha('Joachim')
        'AKAN'
        >>> pe.encode_alpha('Christoph')
        'KRASTAF,SRASTAF'
        >>> pe.encode_alpha('Jörg')
        'ARK'
        >>> pe.encode_alpha('Smith')
        'SNAT'
        >>> pe.encode_alpha('Schmidt')
        'SNAT,KNAT'


        .. versionadded:: 0.4.0
        .. versionchanged:: 0.6.0
            Made return a str only (comma-separated)

        """
        return self.encode(word).translate(self._alphabetic)

    def encode(self, word: str) -> str:
        """Return the Haase Phonetik (numeric output) code for a word.

        While the output code is numeric, it is nevertheless a str.

        Parameters
        ----------
        word : str
            The word to transform

        Returns
        -------
        str
            The Haase Phonetik value as a numeric string; when several
            distinct codes are produced they are joined by commas, in the
            order in which they were first generated

        Notes
        -----
        The umlauts Ä, Ö and Ü are expanded to AE, OE and UE *before* the
        word is decomposed with NFKD. Decomposing first would split each
        umlaut into a base letter plus a combining diaeresis, so the
        expansion could never match.

        Examples
        --------
        >>> pe = Haase()
        >>> pe.encode('Joachim')
        '9496'
        >>> pe.encode('Christoph')
        '4798293,8798293'
        >>> pe.encode('Jörg')
        '974'
        >>> pe.encode('Smith')
        '8692'
        >>> pe.encode('Schmidt')
        '8692,4692'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class
        .. versionchanged:: 0.6.0
            Made return a str only (comma-separated)

        """

        def _after(word: str, pos: int, letters: Set[str]) -> bool:
            """Return True if word[pos] follows one of the supplied letters.

            Parameters
            ----------
            word : str
                Word to examine
            pos : int
                Position to examine
            letters : set
                Letters to check for

            Returns
            -------
            bool
                True if word[pos] follows one of letters

            .. versionadded:: 0.3.0

            """
            return pos > 0 and word[pos - 1] in letters

        def _before(word: str, pos: int, letters: Set[str]) -> bool:
            """Return True if word[pos] precedes one of the supplied letters.

            Parameters
            ----------
            word : str
                Word to examine
            pos : int
                Position to examine
            letters : set
                Letters to check for

            Returns
            -------
            bool
                True if word[pos] precedes one of letters

            .. versionadded:: 0.3.0

            """
            return pos + 1 < len(word) and word[pos + 1] in letters

        # Compose first (NFC) so that umlauts given in decomposed form are
        # expanded too, then expand them, and only then strip the remaining
        # diacritics via NFKD.
        word = unicode_normalize('NFC', word.upper())
        word = word.translate(self._umlauts)
        word = unicode_normalize('NFKD', word)
        # _uc_set is not defined in this class; it is expected to be
        # inherited (presumably from _Phonetic).
        word = ''.join(c for c in word if c in self._uc_set)

        variants = []  # type: List[Union[str, Tuple[str, ...]]]
        if self._primary_only:
            # No alternative spellings: encode the cleaned word as-is.
            variants = [word]
        else:
            pos = 0
            # A word-initial CH may alternatively be read as SCH.
            if word[:2] == 'CH':
                variants.append(('CH', 'SCH'))
                pos += 2
            len_3_vars = {
                'OWN': 'AUN',
                'WSK': 'RSK',
                'SCH': 'CH',
                'GLI': 'LI',
                'AUX': 'O',
                'EUX': 'O',
            }
            # Split the word into segments, each a tuple of alternative
            # spellings; the longest pattern is tried first.
            while pos < len(word):
                trigram = word[pos : pos + 3]
                rest = word[pos:]
                if word[pos : pos + 4] == 'ILLE':
                    variants.append(('ILLE', 'I'))
                    pos += 4
                elif trigram in len_3_vars:
                    variants.append((trigram, len_3_vars[trigram]))
                    pos += 3
                elif word[pos : pos + 2] == 'RB':
                    variants.append(('RB', 'RW'))
                    pos += 2
                elif rest == 'EAU':
                    # Only a word-final EAU has the alternative O.
                    variants.append(('EAU', 'O'))
                    pos += 3
                elif rest == 'O':
                    # Word-final O.
                    variants.append(('O', 'OW'))
                    pos += 1
                elif rest == 'A':
                    # Word-final A.
                    variants.append(('A', 'AR'))
                    pos += 1
                else:
                    variants.append((word[pos],))
                    pos += 1

            # Every combination of segment alternatives is a candidate.
            variants = [''.join(letters) for letters in product(*variants)]

        # Letters that make a following-context C "hard" (coded '4'); the
        # word-initial set additionally contains L and R.
        c_hard_initial = {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'}
        c_hard_medial = {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}

        def _haase_code(variant: str) -> str:
            """Return the numeric code of a single spelling variant."""
            digits = []  # type: List[str]
            for i, char in enumerate(variant):
                if char in self._uc_v_set:
                    digits.append('9')
                elif char == 'B':
                    digits.append('1')
                elif char == 'P':
                    # PH is coded like F.
                    digits.append('3' if _before(variant, i, {'H'}) else '1')
                elif char in {'D', 'T'}:
                    if _before(variant, i, {'C', 'S', 'Z'}):
                        digits.append('8')
                    else:
                        digits.append('2')
                elif char in {'F', 'V', 'W'}:
                    digits.append('3')
                elif char in {'G', 'K', 'Q'}:
                    digits.append('4')
                elif char == 'C':
                    hard_context = c_hard_initial if i == 0 else c_hard_medial
                    if not _after(variant, i, {'S', 'Z'}) and _before(
                        variant, i, hard_context
                    ):
                        digits.append('4')
                    else:
                        digits.append('8')
                elif char == 'X':
                    if _after(variant, i, {'C', 'K', 'Q'}):
                        digits.append('8')
                    else:
                        digits.append('48')
                elif char == 'L':
                    digits.append('5')
                elif char in {'M', 'N'}:
                    digits.append('6')
                elif char == 'R':
                    digits.append('7')
                elif char in {'S', 'Z'}:
                    digits.append('8')
                # Any other letter (e.g. H) contributes nothing to the code.

            # _delete_consecutive_repeats is not defined in this class; it is
            # expected to be inherited (presumably from _Phonetic).
            return self._delete_consecutive_repeats(''.join(digits))

        encoded = [
            _haase_code(variant) for variant in cast(List[str], variants)
        ]

        # Drop duplicate codes while keeping first-seen order.
        seen = set()  # type: Set[str]
        unique_codes = []  # type: List[str]
        for code in encoded:
            if code not in seen:
                seen.add(code)
                unique_codes.append(code)
        return ','.join(unique_codes)
||
| 296 | 1 | ||
| 297 | |||
# When executed as a script, run the doctests embedded in this module.
if __name__ == '__main__':
    from doctest import testmod

    testmod()
||
| 302 |