| Conditions | 17 |
| Total Lines | 91 |
| Code Lines | 46 |
| Lines | 0 |
| Ratio | 0 % |
| Tests | 40 |
| CRAP Score | 17 |
| Changes | 0 |
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include: Extract Method.
If many parameters/temporary variables are present: Replace Temp with Query, Introduce Parameter Object, or Preserve Whole Object.
Complex classes like abydos.stemmer._caumanns.caumanns() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
| 1 | # -*- coding: utf-8 -*- |
||
def caumanns(word):
    """Return Caumanns German stem.

    Jörg Caumanns' stemmer is described in his article in
    :cite:`Caumanns:1999`.

    This implementation is based on the GermanStemFilter described at
    :cite:`Lang:2013`.

    The word is lowercased and NFC-normalized, umlauts and ß are
    transliterated, doubled letters and a fixed set of letter groups are
    temporarily replaced by single placeholder characters, suffixes are
    stripped repeatedly, and finally the placeholders are expanded again.

    :param str word: the word to calculate the stem of
    :returns: word stem (the empty string if ``word`` is empty)
    :rtype: str

    >>> caumanns('lesen')
    'les'
    >>> caumanns('graues')
    'grau'
    >>> caumanns('buchstabieren')
    'buchstabier'
    """
    if not word:
        return ''

    # Remember the original capitalization: a trailing t/st is only stripped
    # from words that did not start with an upper-case letter.
    upper_initial = word[0].isupper()
    word = normalize('NFC', text_type(word.lower()))

    # # Part 2: Substitution (applied before the stripping of Part 1)
    # 1. Change umlauts to corresponding vowels & ß to ss
    umlauts = {ord('ä'): 'a', ord('ö'): 'o', ord('ü'): 'u'}
    word = word.translate(umlauts).replace('ß', 'ss')

    # 2. Change second of doubled characters to *
    # The comparison is made against the already rewritten previous
    # character, so a run such as 'aaa' becomes 'a*a'.
    marked = [word[0]]
    for char in word[1:]:
        marked.append('*' if char == marked[-1] else char)
    word = ''.join(marked)

    # 3. Replace sch, ch, ei, ie, ig, st with $, §, %, &, #, !
    # Each group becomes one character so that the suffix stripping below
    # treats it as a single unit. Order matters: 'sch' must precede 'ch'.
    placeholders = (
        ('sch', '$'),
        ('ch', '§'),
        ('ei', '%'),
        ('ie', '&'),
        ('ig', '#'),
        ('st', '!'),
    )
    for group, mark in placeholders:
        word = word.replace(group, mark)

    # # Part 1: Recursive Context-Free Stripping
    # 1. Remove the following 7 suffixes recursively
    while len(word) > 3:
        if (len(word) > 4 and word[-2:] in {'em', 'er'}) or (
            len(word) > 5 and word[-2:] == 'nd'
        ):
            word = word[:-2]
        elif (word[-1] in {'e', 's', 'n'}) or (
            not upper_initial and word[-1] in {'t', '!'}
        ):
            word = word[:-1]
        else:
            break

    # Additional optimizations:
    # Drop the doubled-letter marker left over after an 'erin' ending.
    if len(word) > 5 and word.endswith('erin*'):
        word = word[:-1]
    # A final z is rewritten to x.
    if word.endswith('z'):
        word = word[:-1] + 'x'

    # Reverse substitutions:
    for group, mark in placeholders:
        word = word.replace(mark, group)

    # Expand doubled: every * becomes a copy of the character before it
    # (the first character is always kept as-is).
    word = word[0] + ''.join(
        prev if char == '*' else char for prev, char in zip(word, word[1:])
    )

    # Finally, convert gege to ge (first occurrence only)
    if len(word) > 4:
        word = word.replace('gege', 'ge', 1)

    return word
|
| 125 | |||
| 131 |