| Conditions | 40 |
| Total Lines | 388 |
| Code Lines | 313 |
| Lines | 0 |
| Ratio | 0 % |
| Tests | 86 |
| CRAP Score | 40 |
| Changes | 0 | ||
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include:
- Extract Method
If many parameters/temporary variables are present:
- Replace Temp with Query
- Introduce Parameter Object
- Preserve Whole Object
Complex classes like abydos.stemmer._uealite.uealite() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
| 1 | # -*- coding: utf-8 -*- |
||
def uealite(
    word, max_word_length=20, max_acro_length=8, return_rule_no=False, var=None
):
    """Return UEA-Lite stem.

    The UEA-Lite stemmer is discussed in :cite:`Jenkins:2005`.

    This is chiefly based on the Java implementation of the algorithm, with
    variants based on the Perl implementation and Jason Adams' Ruby port.

    Java version: :cite:`Churchill:2005`
    Perl version: :cite:`Jenkins:2005`
    Ruby version: :cite:`Adams:2017`

    The word is first screened by a series of guards (problem words,
    over-long words, contractions, numbers, hyphenated or underscored
    tokens, acronyms and capitalised words); each guard returns the word
    (mostly unchanged) together with a rule number in the 90s. Remaining
    words are matched against a suffix table, longest suffix (7 characters)
    first, and finally against three generic fallbacks (``-ing(s)``,
    ``-ed(s)`` and a trailing ``s``). A word matched by nothing is returned
    unchanged with rule number 0.

    :param str word: the word to calculate the stem of
    :param int max_word_length: the maximum word length allowed; longer
        words are returned unstemmed (a falsy value disables the check)
    :param int max_acro_length: the maximum acronym length allowed (only
        consulted when ``var`` is 'Adams')
    :param bool return_rule_no: if True, returns the stem along with rule
        number
    :param str var: variant to use (set to 'Adams' to use Jason Adams' rules,
        or 'Perl' to use the original Perl set of rules)
    :returns: word stem, or a ``(stem, rule_no)`` tuple if ``return_rule_no``
        is True
    :rtype: str or (str, int)

    >>> uealite('readings')
    'read'
    >>> uealite('insulted')
    'insult'
    >>> uealite('cussed')
    'cuss'
    >>> uealite('fancies')
    'fancy'
    >>> uealite('eroded')
    'erode'
    """
    # Words that look like they carry a suffix but must never be stemmed.
    problem_words = {'is', 'as', 'this', 'has', 'was', 'during'}

    # rule table format:
    # top-level dictionary: length-of-suffix: dict-of-rules
    # dict-of-rules: suffix: (rule_no, suffix_length_to_delete,
    #                         suffix_to_append)
    # The table is rebuilt on every call because the 'Perl' and 'Adams'
    # variants below mutate it.
    rule_table = {
        7: {
            'titudes': (30, 1, None),
            'fulness': (34, 4, None),
            'ousness': (35, 4, None),
            'eadings': (40.7, 4, None),
            'oadings': (40.6, 4, None),
            'ealings': (42.4, 4, None),
            'ailings': (42.2, 4, None),
        },
        6: {
            'aceous': (1, 6, None),
            'aining': (24, 3, None),
            'acting': (25, 3, None),
            'ttings': (26, 5, None),
            'viding': (27, 3, 'e'),
            'ssings': (37, 4, None),
            'ulting': (38, 3, None),
            'eading': (40.7, 3, None),
            'oading': (40.6, 3, None),
            'edings': (40.5, 4, None),
            'ddings': (40.4, 5, None),
            'ldings': (40.3, 4, None),
            'rdings': (40.2, 4, None),
            'ndings': (40.1, 4, None),
            'llings': (41, 5, None),
            'ealing': (42.4, 3, None),
            'olings': (42.3, 4, None),
            'ailing': (42.2, 3, None),
            'elings': (42.1, 4, None),
            'mmings': (44.3, 5, None),
            'ngings': (45.2, 4, None),
            'ggings': (45.1, 5, None),
            'stings': (47, 4, None),
            'etings': (48.4, 4, None),
            'ntings': (48.2, 4, None),
            'irings': (54.4, 4, 'e'),
            'urings': (54.3, 4, 'e'),
            'ncings': (54.2, 4, 'e'),
            'things': (58.1, 1, None),
        },
        5: {
            'iases': (11.4, 2, None),
            'ained': (13.6, 2, None),
            'erned': (13.5, 2, None),
            'ifted': (14, 2, None),
            'ected': (15, 2, None),
            'vided': (16, 1, None),
            'erred': (19, 3, None),
            'urred': (20.5, 3, None),
            'lored': (20.4, 2, None),
            'eared': (20.3, 2, None),
            'tored': (20.2, 1, None),
            'noted': (22.4, 1, None),
            'leted': (22.3, 1, None),
            'anges': (23, 1, None),
            'tting': (26, 4, None),
            'ulted': (32, 2, None),
            'uming': (33, 3, 'e'),
            'rabed': (36.1, 1, None),
            'rebed': (36.1, 1, None),
            'ribed': (36.1, 1, None),
            'robed': (36.1, 1, None),
            'rubed': (36.1, 1, None),
            'ssing': (37, 3, None),
            'vings': (39, 4, 'e'),
            'eding': (40.5, 3, None),
            'dding': (40.4, 4, None),
            'lding': (40.3, 3, None),
            'rding': (40.2, 3, None),
            'nding': (40.1, 3, None),
            'dings': (40, 4, 'e'),
            'lling': (41, 4, None),
            'oling': (42.3, 3, None),
            'eling': (42.1, 3, None),
            'lings': (42, 4, 'e'),
            'mming': (44.3, 4, None),
            'rming': (44.2, 3, None),
            'lming': (44.1, 3, None),
            'mings': (44, 4, 'e'),
            'nging': (45.2, 3, None),
            'gging': (45.1, 4, None),
            'gings': (45, 4, 'e'),
            'aning': (46.6, 3, None),
            'ening': (46.5, 3, None),
            'gning': (46.4, 3, None),
            'nning': (46.3, 4, None),
            'oning': (46.2, 3, None),
            'rning': (46.1, 3, None),
            'sting': (47, 3, None),
            'eting': (48.4, 3, None),
            'pting': (48.3, 3, None),
            'nting': (48.2, 3, None),
            'cting': (48.1, 3, None),
            'tings': (48, 4, 'e'),
            'iring': (54.4, 3, 'e'),
            'uring': (54.3, 3, 'e'),
            'ncing': (54.2, 3, 'e'),
            'sings': (54, 4, 'e'),
            # 'lling': (55, 3, None),  # masked by 41
            'ating': (57, 3, 'e'),
            'thing': (58.1, 0, None),
        },
        4: {
            'eeds': (7, 1, None),
            'uses': (11.3, 1, None),
            'sses': (11.2, 2, None),
            'eses': (11.1, 2, 'is'),
            'tled': (12.5, 1, None),
            'pled': (12.4, 1, None),
            'bled': (12.3, 1, None),
            'eled': (12.2, 2, None),
            'lled': (12.1, 2, None),
            'ened': (13.7, 2, None),
            'rned': (13.4, 2, None),
            'nned': (13.3, 3, None),
            'oned': (13.2, 2, None),
            'gned': (13.1, 2, None),
            'ered': (20.1, 2, None),
            'reds': (20, 2, None),
            'tted': (21, 3, None),
            'uted': (22.2, 1, None),
            'ated': (22.1, 1, None),
            'ssed': (28, 2, None),
            'umed': (31, 1, None),
            'beds': (36, 3, None),
            'ving': (39, 3, 'e'),
            'ding': (40, 3, 'e'),
            'ling': (42, 3, 'e'),
            'nged': (43.2, 1, None),
            'gged': (43.1, 3, None),
            'ming': (44, 3, 'e'),
            'ging': (45, 3, 'e'),
            'ning': (46, 3, 'e'),
            'ting': (48, 3, 'e'),
            # 'ssed': (49, 2, None),  # masked by 28
            # 'lled': (53, 2, None),  # masked by 12.1
            'zing': (54.1, 3, 'e'),
            'sing': (54, 3, 'e'),
            'lves': (60.1, 3, 'f'),
            'aped': (61.3, 1, None),
            'uded': (61.2, 1, None),
            'oded': (61.1, 1, None),
            # 'ated': (61, 1, None),  # masked by 22.1
            'ones': (63.6, 1, None),
            'izes': (63.5, 1, None),
            'ures': (63.4, 1, None),
            'ines': (63.3, 1, None),
            'ides': (63.2, 1, None),
        },
        3: {
            'ces': (2, 1, None),
            'sis': (4, 0, None),
            'tis': (5, 0, None),
            'eed': (7, 0, None),
            'ued': (8, 1, None),
            'ues': (9, 1, None),
            'ees': (10, 1, None),
            'ses': (11, 1, None),
            'led': (12, 2, None),
            'ned': (13, 1, None),
            'ved': (17, 1, None),
            'ced': (18, 1, None),
            'red': (20, 1, None),
            'ted': (22, 2, None),
            'sed': (29, 1, None),
            'bed': (36, 2, None),
            'ged': (43, 1, None),
            'les': (50, 1, None),
            'tes': (51, 1, None),
            'zed': (52, 1, None),
            'ied': (56, 3, 'y'),
            'ies': (59, 3, 'y'),
            'ves': (60, 1, None),
            'pes': (63.8, 1, None),
            'mes': (63.7, 1, None),
            'ges': (63.1, 1, None),
            'ous': (65, 0, None),
            'ums': (66, 0, None),
        },
        2: {
            'cs': (3, 0, None),
            'ss': (6, 0, None),
            'es': (63, 2, None),
            'is': (64, 2, 'e'),
            'us': (67, 0, None),
        },
    }

    if var == 'Perl':
        # Suffix rules that the Perl variant does not use.
        perl_deletions = {
            7: ['eadings', 'oadings', 'ealings', 'ailings'],
            6: [
                'ttings',
                'ssings',
                'edings',
                'ddings',
                'ldings',
                'rdings',
                'ndings',
                'llings',
                'olings',
                'elings',
                'mmings',
                'ngings',
                'ggings',
                'stings',
                'etings',
                'ntings',
                'irings',
                'urings',
                'ncings',
                'things',
            ],
            5: ['vings', 'dings', 'lings', 'mings', 'gings', 'tings', 'sings'],
            4: ['eeds', 'reds', 'beds'],
        }

        # Delete the above rules from rule_table
        for suffix_len, suffixes in perl_deletions.items():
            for suffix in suffixes:
                del rule_table[suffix_len][suffix]

    elif var == 'Adams':
        # NOTE: the literals 22.10 and 63.10 are the floats 22.1 and 63.1,
        # so 'mided' reports the same rule number as 'ated' (22.1) and 'des'
        # the same as 'ges'/'kes' (63.1). They are kept as written so the
        # reported rule numbers do not change.
        adams_additions = {
            6: {'chited': (22.8, 1, None)},
            5: {
                'dying': (58.2, 4, 'ie'),
                'tying': (58.2, 4, 'ie'),
                'vited': (22.6, 1, None),
                'mited': (22.5, 1, None),
                'vided': (22.9, 1, None),  # replaces base rule 16
                'mided': (22.10, 1, None),
                'lying': (58.2, 4, 'ie'),
                'arred': (19.1, 3, None),
            },
            4: {
                'ited': (22.7, 2, None),
                'oked': (31.1, 1, None),
                'aked': (31.1, 1, None),
                'iked': (31.1, 1, None),
                'uked': (31.1, 1, None),
                'amed': (31, 1, None),
                'imed': (31, 1, None),
                'does': (31.2, 2, None),
            },
            3: {
                'oed': (31.3, 1, None),
                'oes': (31.2, 1, None),
                'kes': (63.1, 1, None),
                'des': (63.10, 1, None),
                'res': (63.9, 1, None),
            },
        }

        # Add the above additional rules to rule_table; rule_table is local
        # to this call, so it can be updated in place.
        for suffix_len, extra_rules in adams_additions.items():
            rule_table[suffix_len].update(extra_rules)
        # Add additional problem word
        problem_words.add('menses')

    def _stem_with_duplicate_character_check(word, del_len):
        """Strip del_len characters, then undouble a doubled final letter."""
        # A plural 's' after the suffix is removed together with it.
        if word[-1] == 's':
            del_len += 1
        stemmed_word = word[:-del_len]
        # e.g. 'hopp' (from 'hopping') -> 'hop'
        if re_match(r'.*(\w)\1$', stemmed_word):
            stemmed_word = stemmed_word[:-1]
        return stemmed_word

    def _stem(word):
        """Return (stem, rule_no) for word."""
        if not word:
            return word, 0
        if word in problem_words:
            return word, 90
        if max_word_length and len(word) > max_word_length:
            return word, 95

        if "'" in word:
            # Drop a possessive ending, then expand common contractions.
            stemmed_word = word
            if word[-2:] in {"'s", "'S"}:
                stemmed_word = word[:-2]
            if word[-1:] == "'":
                stemmed_word = word[:-1]
            stemmed_word = stemmed_word.replace("n't", 'not')
            stemmed_word = stemmed_word.replace("'ve", 'have')
            stemmed_word = stemmed_word.replace("'re", 'are')
            stemmed_word = stemmed_word.replace("'m", 'am')
            return stemmed_word, 94

        if word.isdigit():
            return word, 90.3

        # A hyphen anywhere but the first position marks a compound token.
        # str.find never returns an index >= len(word), so checking > 0 is
        # sufficient.
        hyphen = word.find('-')
        if hyphen > 0:
            if word[:hyphen].isalpha() and word[hyphen + 1 :].isalpha():
                return word, 90.2
            return word, 90.1

        if '_' in word:
            return word, 90

        # Pluralised acronym, e.g. 'CDs' -> 'CD'.
        if word[-1] == 's' and word[:-1].isupper():
            if var == 'Adams' and len(word) - 1 > max_acro_length:
                return word, 96
            return word[:-1], 91.1

        # Plain acronym.
        if word.isupper():
            if var == 'Adams' and len(word) > max_acro_length:
                return word, 96
            return word, 91

        # Two or more capitals anywhere in the word.
        if re_match(r'^.*[A-Z].*[A-Z].*$', word):
            return word, 92
        # Capitalised word (treated as a proper noun).
        if word[0].isupper():
            return word, 93
        # Adams only: very short words such as 'sing' or 'bled' are left
        # alone.
        if var == 'Adams' and re_match(r'^[a-z](|[rl])(ing|ed)$', word):
            return word, 97

        # Table lookup, longest suffix first. For words shorter than n the
        # slice word[-n:] is simply the whole word.
        for n in range(7, 1, -1):
            rule = rule_table[n].get(word[-n:])
            if rule is None:
                continue
            rule_no, del_len, add_str = rule
            stemmed_word = word[:-del_len] if del_len else word
            if add_str:
                stemmed_word += add_str
            return stemmed_word, rule_no

        # Generic fallbacks when no table rule matched.
        if re_match(r'.*\w\wings?$', word):  # rule 58
            return _stem_with_duplicate_character_check(word, 3), 58
        if re_match(r'.*\w\weds?$', word):  # rule 62
            return _stem_with_duplicate_character_check(word, 2), 62
        if word[-1] == 's':  # rule 68
            return word[:-1], 68

        return word, 0

    stem, rule_no = _stem(word)
    if return_rule_no:
        return stem, rule_no
    return stem
| 421 | |||
| 427 |