| Conditions | 19 |
| Total Lines | 202 |
| Code Lines | 159 |
| Lines | 0 |
| Ratio | 0 % |
| Tests | 40 |
| CRAP Score | 19 |
| Changes | 0 | ||
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include: Extract Method.
If many parameters/temporary variables are present: Replace Temp with Query, Introduce Parameter Object, or Preserve Whole Object.
Complex classes like abydos.stemmer._paice_husk.paice_husk() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
| 1 | # -*- coding: utf-8 -*- |
||
# Vowel sets for the acceptability test: only a/e/i/o/u count as a leading
# vowel, while 'y' is additionally treated as a vowel after the first letter.
_PH_LEADING_VOWELS = frozenset('aeiou')
_PH_VOWELS = frozenset('aeiouy')

# Paice-Husk rule table, built once at import time instead of on every call.
# Outer key: suffix length; inner key: the suffix itself. Each rule is a
# tuple (only_intact, del_len, add_str, set_terminate):
#   only_intact   -- apply only if no rule has been accepted yet
#   del_len       -- number of trailing characters to delete
#   add_str       -- string to append after deletion (None for nothing)
#   set_terminate -- stop stemming once this rule is accepted
# A suffix maps either to a single rule or to a tuple of rules, which are
# tried in order.
_PH_RULE_TABLE = {
    6: {'ifiabl': (False, 6, None, True), 'plicat': (False, 4, 'y', True)},
    5: {
        'guish': (False, 5, 'ct', True),
        'sumpt': (False, 2, None, True),
        'istry': (False, 5, None, True),
    },
    4: {
        'ytic': (False, 3, 's', True),
        'ceed': (False, 2, 'ss', True),
        'hood': (False, 4, None, False),
        'lief': (False, 1, 'v', True),
        'verj': (False, 1, 't', True),
        'misj': (False, 2, 't', True),
        'iabl': (False, 4, 'y', True),
        'iful': (False, 4, 'y', True),
        'sion': (False, 4, 'j', False),
        'xion': (False, 4, 'ct', True),
        'ship': (False, 4, None, False),
        'ness': (False, 4, None, False),
        'ment': (False, 4, None, False),
        'ript': (False, 2, 'b', True),
        'orpt': (False, 2, 'b', True),
        'duct': (False, 1, None, True),
        'cept': (False, 2, 'iv', True),
        'olut': (False, 2, 'v', True),
        'sist': (False, 0, None, True),
    },
    3: {
        'ied': (False, 3, 'y', False),
        'eed': (False, 1, None, True),
        'ing': (False, 3, None, False),
        'iag': (False, 3, 'y', True),
        'ish': (False, 3, None, False),
        'fuj': (False, 1, 's', True),
        'hej': (False, 1, 'r', True),
        'abl': (False, 3, None, False),
        'ibl': (False, 3, None, True),
        'bil': (False, 2, 'l', False),
        'ful': (False, 3, None, False),
        'ial': (False, 3, None, False),
        'ual': (False, 3, None, False),
        'ium': (False, 3, None, True),
        'ism': (False, 3, None, False),
        'ion': (False, 3, None, False),
        'ian': (False, 3, None, False),
        'een': (False, 0, None, True),
        'ear': (False, 0, None, True),
        'ier': (False, 3, 'y', False),
        'ies': (False, 3, 'y', False),
        'sis': (False, 2, None, True),
        'ous': (False, 3, None, False),
        'ent': (False, 3, None, False),
        'ant': (False, 3, None, False),
        'ist': (False, 3, None, False),
        'iqu': (False, 3, None, True),
        'ogu': (False, 1, None, True),
        'siv': (False, 3, 'j', False),
        'eiv': (False, 0, None, True),
        'bly': (False, 1, None, False),
        'ily': (False, 3, 'y', False),
        'ply': (False, 0, None, True),
        'ogy': (False, 1, None, True),
        'phy': (False, 1, None, True),
        'omy': (False, 1, None, True),
        'opy': (False, 1, None, True),
        'ity': (False, 3, None, False),
        'ety': (False, 3, None, False),
        'lty': (False, 2, None, True),
        'ary': (False, 3, None, False),
        'ory': (False, 3, None, False),
        'ify': (False, 3, None, True),
        'ncy': (False, 2, 't', False),
        'acy': (False, 3, None, False),
    },
    2: {
        'ia': (True, 2, None, True),
        'bb': (False, 1, None, True),
        'ic': (False, 2, None, False),
        'nc': (False, 1, 't', False),
        'dd': (False, 1, None, True),
        'ed': (False, 2, None, False),
        'if': (False, 2, None, False),
        'ag': (False, 2, None, False),
        'gg': (False, 1, None, True),
        'th': (True, 2, None, True),
        'ij': (False, 1, 'd', True),
        'uj': (False, 1, 'd', True),
        'oj': (False, 1, 'd', True),
        'nj': (False, 1, 'd', True),
        'cl': (False, 1, None, True),
        'ul': (False, 2, None, True),
        'al': (False, 2, None, False),
        'll': (False, 1, None, True),
        'um': (True, 2, None, True),
        'mm': (False, 1, None, True),
        'an': (False, 2, None, False),
        'en': (False, 2, None, False),
        'nn': (False, 1, None, True),
        'pp': (False, 1, None, True),
        'er': (False, 2, None, False),
        'ar': (False, 2, None, True),
        'or': (False, 2, None, False),
        'ur': (False, 2, None, False),
        'rr': (False, 1, None, True),
        'tr': (False, 1, None, False),
        'is': (False, 2, None, False),
        'ss': (False, 0, None, True),
        'us': (True, 2, None, True),
        'at': (False, 2, None, False),
        'tt': (False, 1, None, True),
        'iv': (False, 2, None, False),
        'ly': (False, 2, None, False),
        'iz': (False, 2, None, False),
        'yz': (False, 1, 's', True),
    },
    1: {
        'a': (True, 1, None, True),
        'e': (False, 1, None, False),
        'i': ((True, 1, None, True), (False, 1, 'y', False)),
        'j': (False, 1, 's', True),
        's': ((True, 1, None, False), (False, 0, None, True)),
    },
}


def _ph_acceptable(stem):
    """Return True if stem is long enough to be kept as a result.

    A stem starting with a/e/i/o/u needs at least two characters; any
    other stem needs at least three characters and a vowel (including
    'y') somewhere after its first character.
    """
    if stem and stem[0] in _PH_LEADING_VOWELS:
        return len(stem) > 1
    return len(stem) > 2 and any(char in _PH_VOWELS for char in stem[1:])


def _ph_apply_rule(word, rule, intact):
    """Try a single rule and return (word, accepted, intact, terminate).

    When the rule is rejected (it requires an intact word that is no
    longer intact, or it would leave an unacceptable stem), the word and
    the intact flag come back unchanged and terminate is False.
    """
    only_intact, del_len, add_str, set_terminate = rule
    if only_intact and not intact:
        return word, False, intact, False

    # del_len may be 0 ("protect" rules); word[:-0] would empty the word.
    stem = word[:-del_len] if del_len else word
    if add_str:
        stem += add_str

    if _ph_acceptable(stem):
        return stem, True, False, set_terminate
    return word, False, intact, False


def paice_husk(word):
    """Return Paice-Husk stem.

    Implementation of the Paice-Husk Stemmer, also known as the Lancaster
    Stemmer, developed by Chris Paice, with the assistance of Gareth Husk.

    This is based on the algorithm's description in :cite:`Paice:1990`.

    Suffixes are looked up from longest (6 characters) to shortest (1
    character). The first accepted rule rewrites the word and the lookup
    restarts, until an accepted rule requests termination or no rule is
    accepted for the current word.

    :param str word: the word to stem
    :returns: the stemmed word
    :rtype: str

    >>> paice_husk('assumption')
    'assum'
    >>> paice_husk('verifiable')
    'ver'
    >>> paice_husk('fancies')
    'fant'
    >>> paice_husk('fanciful')
    'fancy'
    >>> paice_husk('torment')
    'tor'
    """
    terminate = False
    intact = True
    while not terminate:
        for n in range(6, 0, -1):
            entry = _PH_RULE_TABLE[n].get(word[-n:])
            if entry is None:
                continue

            # An entry is either one rule or a tuple of alternative rules.
            rules = entry if isinstance(entry[0], tuple) else (entry,)
            accept = False
            for rule in rules:
                word, accept, intact, terminate = _ph_apply_rule(
                    word, rule, intact
                )
                if accept:
                    break

            if accept:
                # Restart the suffix search on the rewritten word.
                break
        else:
            # No rule was accepted for any suffix length: stemming is done.
            break

    return word
|
| 233 | |||
| 239 |