| Conditions | 39 |
| Total Lines | 187 |
| Code Lines | 136 |
| Lines | 45 |
| Ratio | 24.06 % |
| Tests | 98 |
| CRAP Score | 39 |
| Changes | 0 |
Small methods make your code easier to understand, particularly when combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include:
- Extract Method
If many parameters/temporary variables are present:
- Replace Temp with Query
- Introduce Parameter Object
- Preserve Whole Object
Complex classes like abydos.phonetic._de.haase_phonetik() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
| 1 | # -*- coding: utf-8 -*- |
||
def haase_phonetik(word, primary_only=False):
    """Return the Haase Phonetik (numeric output) code(s) for a word.

    Based on the algorithm described at :cite:`Prante:2015`, which in turn
    is based on the original :cite:`Haase:2000`.

    The word is upper-cased, normalized and stripped down to the letters
    A-Z. Unless ``primary_only`` is set, alternative spellings of certain
    letter groups are generated and every resulting spelling is encoded;
    duplicate codes are dropped, keeping the first occurrence.

    Although each code consists only of digits, it is a str.

    :param str word: the word to transform
    :param bool primary_only: if True, only the primary code is returned
    :returns: the Haase Phonetik value(s) as numeric strings
    :rtype: tuple

    >>> haase_phonetik('Joachim')
    ('9496',)
    >>> haase_phonetik('Christoph')
    ('4798293', '8798293')
    >>> haase_phonetik('Jörg')
    ('974',)
    >>> haase_phonetik('Smith')
    ('8692',)
    >>> haase_phonetik('Schmidt')
    ('8692', '4692')
    """
    vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}
    alphabet = set('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    # Three-letter groups and the alternative spelling tried for each.
    trigram_alternates = {
        'OWN': 'AUN',
        'WSK': 'RSK',
        'SCH': 'CH',
        'GLI': 'LI',
        'AUX': 'O',
        'EUX': 'O',
    }

    # Letters whose digit does not depend on their neighbours. Letters that
    # appear neither here nor in the context-sensitive branches below (such
    # as H) contribute nothing to the code.
    fixed_digits = {
        'B': '1',
        'F': '3',
        'V': '3',
        'W': '3',
        'G': '4',
        'K': '4',
        'Q': '4',
        'L': '5',
        'M': '6',
        'N': '6',
        'R': '7',
        'S': '8',
        'Z': '8',
    }
    for vowel in vowels:
        fixed_digits[vowel] = '9'

    # Followers that give C the code 4: word-initial C accepts two more
    # letters (L and R) than C elsewhere in the word.
    hard_c_initial = {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'}
    hard_c_medial = {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}

    def _preceded_by(text, idx, letters):
        """Return True if the character before text[idx] is in letters."""
        return idx > 0 and text[idx - 1] in letters

    def _followed_by(text, idx, letters):
        """Return True if the character after text[idx] is in letters."""
        return idx + 1 < len(text) and text[idx + 1] in letters

    word = unicode_normalize('NFKD', text_type(word.upper()))
    # NOTE(review): NFKD decomposes precomposed umlauts into a base letter
    # plus a combining mark, so the three umlaut replacements below
    # presumably never match at this point -- confirm before relying on them.
    for source, target in (
        ('ß', 'SS'),
        ('Ä', 'AE'),
        ('Ö', 'OE'),
        ('Ü', 'UE'),
    ):
        word = word.replace(source, target)
    word = ''.join([char for char in word if char in alphabet])

    if primary_only:
        variants = [word]
    else:
        # Each entry is a tuple of the alternative spellings of one segment
        # of the word; the first item is always the original spelling, so
        # its length is the number of characters consumed.
        segments = []
        cursor = 0
        if word[:2] == 'CH':
            segments.append(('CH', 'SCH'))
            cursor = 2
        while cursor < len(word):
            rest = word[cursor:]
            trigram = rest[:3]
            if rest.startswith('ILLE'):
                choice = ('ILLE', 'I')
            elif trigram in trigram_alternates:
                choice = (trigram, trigram_alternates[trigram])
            elif rest.startswith('RB'):
                choice = ('RB', 'RW')
            elif rest == 'EAU':
                # only at the very end of the word
                choice = ('EAU', 'O')
            elif rest == 'O':
                # word-final O
                choice = ('O', 'OW')
            elif rest == 'A':
                # word-final A
                choice = ('A', 'AR')
            else:
                choice = (rest[0],)
            segments.append(choice)
            cursor += len(choice[0])

        variants = [''.join(parts) for parts in product(*segments)]

    def _encode(variant):
        """Return the digit code of a single spelling variant."""
        pieces = []
        for idx, char in enumerate(variant):
            if char in fixed_digits:
                pieces.append(fixed_digits[char])
            elif char == 'P':
                # PH is coded like F
                if _followed_by(variant, idx, {'H'}):
                    pieces.append('3')
                else:
                    pieces.append('1')
            elif char in {'D', 'T'}:
                if _followed_by(variant, idx, {'C', 'S', 'Z'}):
                    pieces.append('8')
                else:
                    pieces.append('2')
            elif char == 'C':
                if _preceded_by(variant, idx, {'S', 'Z'}):
                    pieces.append('8')
                else:
                    hard_followers = (
                        hard_c_initial if idx == 0 else hard_c_medial
                    )
                    if _followed_by(variant, idx, hard_followers):
                        pieces.append('4')
                    else:
                        pieces.append('8')
            elif char == 'X':
                if _preceded_by(variant, idx, {'C', 'K', 'Q'}):
                    pieces.append('8')
                else:
                    pieces.append('48')

        return _delete_consecutive_repeats(''.join(pieces))

    # Encode every variant, keeping only the first occurrence of each code.
    seen = set()
    unique_codes = []
    for variant in variants:
        code = _encode(variant)
        if code not in seen:
            seen.add(code)
            unique_codes.append(code)
    return tuple(unique_codes)
| 500 | |||
| 647 |