Conditions | 39 |
Total Lines | 187 |
Code Lines | 136 |
Comment Lines | 45 |
Comment Ratio | 24.06 % |
Tests | 98 |
CRAP Score | 39 |
Changes | 0 |
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
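Commonly applied refactorings include Extract Method.
If many parameters/temporary variables are present, consider Replace Temp with Query, Introduce Parameter Object, or Preserve Whole Object.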
Complex classes like abydos.phonetic._de.haase_phonetik() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
1 | # -*- coding: utf-8 -*- |
||
def haase_phonetik(word, primary_only=False):
    """Return the Haase Phonetik (numeric output) code for a word.

    Based on the algorithm described at :cite:`Prante:2015`.

    Based on the original :cite:`Haase:2000`.

    While the output code is numeric, it is nevertheless a str.

    The word is upper-cased, 'ß' is expanded to 'SS', the umlauts Ä/Ö/Ü are
    expanded to AE/OE/UE, and every character outside A-Z is dropped before
    encoding. Unless ``primary_only`` is set, alternative spellings of
    certain letter groups are encoded as well; duplicate codes are removed
    while keeping first-seen order.

    :param str word: the word to transform
    :param bool primary_only: if True, only the primary code is returned
    :returns: the distinct Haase Phonetik values as numeric strings, with
        the code of the word as spelled first
    :rtype: tuple

    >>> haase_phonetik('Joachim')
    ('9496',)
    >>> haase_phonetik('Christoph')
    ('4798293', '8798293')
    >>> haase_phonetik('Jörg')
    ('974',)
    >>> haase_phonetik('Smith')
    ('8692',)
    >>> haase_phonetik('Schmidt')
    ('8692', '4692')
    """

    def _after(term, i, letters):
        """Return True if term[i] follows one of the supplied letters."""
        return i > 0 and term[i - 1] in letters

    def _before(term, i, letters):
        """Return True if term[i] precedes one of the supplied letters."""
        return i + 1 < len(term) and term[i + 1] in letters

    _vowels = {'A', 'E', 'I', 'J', 'O', 'U', 'Y'}
    _uc_letters = frozenset('ABCDEFGHIJKLMNOPQRSTUVWXYZ')

    word = unicode_normalize('NFKD', text_type(word.upper()))
    word = word.replace('ß', 'SS')

    # NFKD has already split every umlaut into its base letter followed by
    # U+0308 (combining diaeresis), so the decomposed sequences are what must
    # be matched here; a precomposed 'Ä'/'Ö'/'Ü' can no longer occur in word.
    word = word.replace('A\u0308', 'AE')
    word = word.replace('O\u0308', 'OE')
    word = word.replace('U\u0308', 'UE')

    # Drop everything that is not a plain upper-case Latin letter (including
    # any combining marks left over from the normalization).
    word = ''.join(c for c in word if c in _uc_letters)

    if primary_only:
        variants = [word]
    else:
        # Split the word into segments; each segment is a tuple holding the
        # original spelling first, followed by its alternative (if any).
        segments = []
        pos = 0
        if word[:2] == 'CH':  # only a word-initial CH has a variant
            segments.append(('CH', 'SCH'))
            pos += 2
        len_3_vars = {
            'OWN': 'AUN',
            'WSK': 'RSK',
            'SCH': 'CH',
            'GLI': 'LI',
            'AUX': 'O',
            'EUX': 'O',
        }
        # A or O as the very last letter of the word gets a variant too.
        final_vars = {'A': 'AR', 'O': 'OW'}
        while pos < len(word):
            rest = word[pos:]
            if rest[:4] == 'ILLE':
                segments.append(('ILLE', 'I'))
                pos += 4
            elif rest[:3] in len_3_vars:
                segments.append((rest[:3], len_3_vars[rest[:3]]))
                pos += 3
            elif rest[:2] == 'RB':
                segments.append(('RB', 'RW'))
                pos += 2
            elif rest == 'EAU':  # word-final EAU only
                segments.append(('EAU', 'O'))
                pos += 3
            elif rest in final_vars:
                segments.append((rest, final_vars[rest]))
                pos += 1
            else:
                segments.append((word[pos],))
                pos += 1

        # The first combination yielded consists of every segment's original
        # spelling, so the word as spelled always comes first.
        variants = [''.join(letters) for letters in product(*segments)]

    # Letters that make a following C hard ('4'); L and R only count when
    # the C is the first letter of the word.
    _hard_c = {'A', 'H', 'K', 'O', 'Q', 'U', 'X'}
    _hard_c_initial = {'A', 'H', 'K', 'L', 'O', 'Q', 'R', 'U', 'X'}

    def _haase_code(term):
        """Return the digit string for a single spelling variant."""
        digits = []
        for i, char in enumerate(term):
            if char in _vowels:
                digits.append('9')
            elif char == 'B':
                digits.append('1')
            elif char == 'P':
                digits.append('3' if _before(term, i, {'H'}) else '1')
            elif char in {'D', 'T'}:
                if _before(term, i, {'C', 'S', 'Z'}):
                    digits.append('8')
                else:
                    digits.append('2')
            elif char in {'F', 'V', 'W'}:
                digits.append('3')
            elif char in {'G', 'K', 'Q'}:
                digits.append('4')
            elif char == 'C':
                if _after(term, i, {'S', 'Z'}):
                    digits.append('8')
                elif _before(term, i, _hard_c_initial if i == 0 else _hard_c):
                    digits.append('4')
                else:
                    digits.append('8')
            elif char == 'X':
                if _after(term, i, {'C', 'K', 'Q'}):
                    digits.append('8')
                else:
                    digits.append('48')
            elif char == 'L':
                digits.append('5')
            elif char in {'M', 'N'}:
                digits.append('6')
            elif char == 'R':
                digits.append('7')
            elif char in {'S', 'Z'}:
                digits.append('8')
            # 'H' matches no branch and therefore contributes no digit.

        return _delete_consecutive_repeats(''.join(digits))

    # Encode every variant and drop duplicate codes, keeping first-seen
    # order so that the code of the word as spelled stays in front.
    seen = set()
    unique_codes = []
    for variant in variants:
        code = _haase_code(variant)
        if code not in seen:
            seen.add(code)
            unique_codes.append(code)
    return tuple(unique_codes)
|
500 | |||
647 |