Conditions | 40 |
Total Lines | 388 |
Code Lines | 313 |
Duplicated Lines | 0 |
Duplication Ratio | 0 % |
Tests | 86 |
CRAP Score | 40 |
Changes | 0 |
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include: Extract Method.
If many parameters/temporary variables are present: Replace Temp with Query, Introduce Parameter Object, or Preserve Whole Object.
Complex classes like abydos.stemmer._uealite.uealite() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
1 | # -*- coding: utf-8 -*- |
||
# Words that are returned unchanged (reported as rule 90).
_UEALITE_PROBLEM_WORDS = frozenset(
    {'is', 'as', 'this', 'has', 'was', 'during'}
)

# rule table format:
# top-level dictionary: length-of-suffix: dict-of-rules
# dict-of-rules: suffix: (rule_no, suffix_length_to_delete,
#                         suffix_to_append)
# This is the default (Java-based) table; the 'Perl' and 'Adams' variants are
# derived from it once, below, and none of the tables is mutated afterwards.
_UEALITE_RULES = {
    7: {
        'titudes': (30, 1, None),
        'fulness': (34, 4, None),
        'ousness': (35, 4, None),
        'eadings': (40.7, 4, None),
        'oadings': (40.6, 4, None),
        'ealings': (42.4, 4, None),
        'ailings': (42.2, 4, None),
    },
    6: {
        'aceous': (1, 6, None),
        'aining': (24, 3, None),
        'acting': (25, 3, None),
        'ttings': (26, 5, None),
        'viding': (27, 3, 'e'),
        'ssings': (37, 4, None),
        'ulting': (38, 3, None),
        'eading': (40.7, 3, None),
        'oading': (40.6, 3, None),
        'edings': (40.5, 4, None),
        'ddings': (40.4, 5, None),
        'ldings': (40.3, 4, None),
        'rdings': (40.2, 4, None),
        'ndings': (40.1, 4, None),
        'llings': (41, 5, None),
        'ealing': (42.4, 3, None),
        'olings': (42.3, 4, None),
        'ailing': (42.2, 3, None),
        'elings': (42.1, 4, None),
        'mmings': (44.3, 5, None),
        'ngings': (45.2, 4, None),
        'ggings': (45.1, 5, None),
        'stings': (47, 4, None),
        'etings': (48.4, 4, None),
        'ntings': (48.2, 4, None),
        'irings': (54.4, 4, 'e'),
        'urings': (54.3, 4, 'e'),
        'ncings': (54.2, 4, 'e'),
        'things': (58.1, 1, None),
    },
    5: {
        'iases': (11.4, 2, None),
        'ained': (13.6, 2, None),
        'erned': (13.5, 2, None),
        'ifted': (14, 2, None),
        'ected': (15, 2, None),
        'vided': (16, 1, None),
        'erred': (19, 3, None),
        'urred': (20.5, 3, None),
        'lored': (20.4, 2, None),
        'eared': (20.3, 2, None),
        'tored': (20.2, 1, None),
        'noted': (22.4, 1, None),
        'leted': (22.3, 1, None),
        'anges': (23, 1, None),
        'tting': (26, 4, None),
        'ulted': (32, 2, None),
        'uming': (33, 3, 'e'),
        'rabed': (36.1, 1, None),
        'rebed': (36.1, 1, None),
        'ribed': (36.1, 1, None),
        'robed': (36.1, 1, None),
        'rubed': (36.1, 1, None),
        'ssing': (37, 3, None),
        'vings': (39, 4, 'e'),
        'eding': (40.5, 3, None),
        'dding': (40.4, 4, None),
        'lding': (40.3, 3, None),
        'rding': (40.2, 3, None),
        'nding': (40.1, 3, None),
        'dings': (40, 4, 'e'),
        'lling': (41, 4, None),
        'oling': (42.3, 3, None),
        'eling': (42.1, 3, None),
        'lings': (42, 4, 'e'),
        'mming': (44.3, 4, None),
        'rming': (44.2, 3, None),
        'lming': (44.1, 3, None),
        'mings': (44, 4, 'e'),
        'nging': (45.2, 3, None),
        'gging': (45.1, 4, None),
        'gings': (45, 4, 'e'),
        'aning': (46.6, 3, None),
        'ening': (46.5, 3, None),
        'gning': (46.4, 3, None),
        'nning': (46.3, 4, None),
        'oning': (46.2, 3, None),
        'rning': (46.1, 3, None),
        'sting': (47, 3, None),
        'eting': (48.4, 3, None),
        'pting': (48.3, 3, None),
        'nting': (48.2, 3, None),
        'cting': (48.1, 3, None),
        'tings': (48, 4, 'e'),
        'iring': (54.4, 3, 'e'),
        'uring': (54.3, 3, 'e'),
        'ncing': (54.2, 3, 'e'),
        'sings': (54, 4, 'e'),
        # 'lling': (55, 3, None),  # masked by 41
        'ating': (57, 3, 'e'),
        'thing': (58.1, 0, None),
    },
    4: {
        'eeds': (7, 1, None),
        'uses': (11.3, 1, None),
        'sses': (11.2, 2, None),
        'eses': (11.1, 2, 'is'),
        'tled': (12.5, 1, None),
        'pled': (12.4, 1, None),
        'bled': (12.3, 1, None),
        'eled': (12.2, 2, None),
        'lled': (12.1, 2, None),
        'ened': (13.7, 2, None),
        'rned': (13.4, 2, None),
        'nned': (13.3, 3, None),
        'oned': (13.2, 2, None),
        'gned': (13.1, 2, None),
        'ered': (20.1, 2, None),
        'reds': (20, 2, None),
        'tted': (21, 3, None),
        'uted': (22.2, 1, None),
        'ated': (22.1, 1, None),
        'ssed': (28, 2, None),
        'umed': (31, 1, None),
        'beds': (36, 3, None),
        'ving': (39, 3, 'e'),
        'ding': (40, 3, 'e'),
        'ling': (42, 3, 'e'),
        'nged': (43.2, 1, None),
        'gged': (43.1, 3, None),
        'ming': (44, 3, 'e'),
        'ging': (45, 3, 'e'),
        'ning': (46, 3, 'e'),
        'ting': (48, 3, 'e'),
        # 'ssed': (49, 2, None),  # masked by 28
        # 'lled': (53, 2, None),  # masked by 12.1
        'zing': (54.1, 3, 'e'),
        'sing': (54, 3, 'e'),
        'lves': (60.1, 3, 'f'),
        'aped': (61.3, 1, None),
        'uded': (61.2, 1, None),
        'oded': (61.1, 1, None),
        # 'ated': (61, 1, None),  # masked by 22.1
        'ones': (63.6, 1, None),
        'izes': (63.5, 1, None),
        'ures': (63.4, 1, None),
        'ines': (63.3, 1, None),
        'ides': (63.2, 1, None),
    },
    3: {
        'ces': (2, 1, None),
        'sis': (4, 0, None),
        'tis': (5, 0, None),
        'eed': (7, 0, None),
        'ued': (8, 1, None),
        'ues': (9, 1, None),
        'ees': (10, 1, None),
        'ses': (11, 1, None),
        'led': (12, 2, None),
        'ned': (13, 1, None),
        'ved': (17, 1, None),
        'ced': (18, 1, None),
        'red': (20, 1, None),
        'ted': (22, 2, None),
        'sed': (29, 1, None),
        'bed': (36, 2, None),
        'ged': (43, 1, None),
        'les': (50, 1, None),
        'tes': (51, 1, None),
        'zed': (52, 1, None),
        'ied': (56, 3, 'y'),
        'ies': (59, 3, 'y'),
        'ves': (60, 1, None),
        'pes': (63.8, 1, None),
        'mes': (63.7, 1, None),
        'ges': (63.1, 1, None),
        'ous': (65, 0, None),
        'ums': (66, 0, None),
    },
    2: {
        'cs': (3, 0, None),
        'ss': (6, 0, None),
        'es': (63, 2, None),
        'is': (64, 2, 'e'),
        'us': (67, 0, None),
    },
}

# Suffix rules that the 'Perl' variant removes from the default table,
# keyed by suffix length.
_UEALITE_PERL_DELETIONS = {
    7: ['eadings', 'oadings', 'ealings', 'ailings'],
    6: [
        'ttings',
        'ssings',
        'edings',
        'ddings',
        'ldings',
        'rdings',
        'ndings',
        'llings',
        'olings',
        'elings',
        'mmings',
        'ngings',
        'ggings',
        'stings',
        'etings',
        'ntings',
        'irings',
        'urings',
        'ncings',
        'things',
    ],
    5: ['vings', 'dings', 'lings', 'mings', 'gings', 'tings', 'sings'],
    4: ['eeds', 'reds', 'beds'],
}

# Suffix rules that the 'Adams' variant adds to (or overrides in) the default
# table, keyed by suffix length.
# NOTE: the float literals 22.10 and 63.10 are numerically equal to 22.1 and
# 63.1, so those rules report the same rule number as the existing 22.1 / 63.1
# rules. The values are kept as-is so that returned rule numbers do not change.
_UEALITE_ADAMS_ADDITIONS = {
    6: {'chited': (22.8, 1, None)},
    5: {
        'dying': (58.2, 4, 'ie'),
        'tying': (58.2, 4, 'ie'),
        'vited': (22.6, 1, None),
        'mited': (22.5, 1, None),
        'vided': (22.9, 1, None),
        'mided': (22.10, 1, None),
        'lying': (58.2, 4, 'ie'),
        'arred': (19.1, 3, None),
    },
    4: {
        'ited': (22.7, 2, None),
        'oked': (31.1, 1, None),
        'aked': (31.1, 1, None),
        'iked': (31.1, 1, None),
        'uked': (31.1, 1, None),
        'amed': (31, 1, None),
        'imed': (31, 1, None),
        'does': (31.2, 2, None),
    },
    3: {
        'oed': (31.3, 1, None),
        'oes': (31.2, 1, None),
        'kes': (63.1, 1, None),
        'des': (63.10, 1, None),
        'res': (63.9, 1, None),
    },
}

# The 'Adams' variant treats one additional word as a problem word.
_UEALITE_ADAMS_PROBLEM_WORDS = _UEALITE_PROBLEM_WORDS | {'menses'}


def _uealite_variant_tables():
    """Return the (Perl, Adams) rule tables derived from the default table.

    Each variant gets its own copy of every per-length dictionary, so the
    default table is never modified.
    """
    perl_rules = {n: dict(rules) for n, rules in _UEALITE_RULES.items()}
    for suffix_len, suffixes in _UEALITE_PERL_DELETIONS.items():
        for suffix in suffixes:
            del perl_rules[suffix_len][suffix]

    adams_rules = {n: dict(rules) for n, rules in _UEALITE_RULES.items()}
    for suffix_len, extra_rules in _UEALITE_ADAMS_ADDITIONS.items():
        adams_rules[suffix_len].update(extra_rules)

    return perl_rules, adams_rules


# Built once at import time instead of on every call to uealite().
_UEALITE_PERL_RULES, _UEALITE_ADAMS_RULES = _uealite_variant_tables()


def _uealite_strip_suffix_and_doubling(word, del_len):
    """Drop a suffix and, if the result ends in a doubled character, one more.

    :param str word: the word to shorten
    :param int del_len: length of the suffix, not counting a plural 's'
    :returns: the shortened word
    :rtype: str
    """
    # A trailing plural 's' is removed together with the suffix.
    if word[-1] == 's':
        del_len += 1
    stemmed_word = word[:-del_len]
    # e.g. 'hopp' (from 'hopped') -> 'hop'
    if re_match(r'.*(\w)\1$', stemmed_word):
        stemmed_word = stemmed_word[:-1]
    return stemmed_word


def _uealite_expand_apostrophes(word):
    """Strip a possessive ending and expand contractions in word.

    :param str word: a word containing an apostrophe
    :returns: the word without a trailing ``'s``/``'S``/``'`` and with
        ``n't``, ``'ve``, ``'re`` and ``'m`` replaced by ``not``, ``have``,
        ``are`` and ``am``
    :rtype: str
    """
    stemmed_word = word
    if word[-2:] in {"'s", "'S"}:
        stemmed_word = word[:-2]
    if word[-1:] == "'":
        stemmed_word = word[:-1]
    stemmed_word = stemmed_word.replace("n't", 'not')
    stemmed_word = stemmed_word.replace("'ve", 'have')
    stemmed_word = stemmed_word.replace("'re", 'are')
    stemmed_word = stemmed_word.replace("'m", 'am')
    return stemmed_word


def _uealite_stem(word, max_word_length, max_acro_length, var):
    """Return the (stem, rule number) pair for word.

    The checks run in a fixed order: words that are never stemmed (empty,
    problem words, over-long words), apostrophe handling, numbers, hyphenated
    or underscored tokens, acronyms and capitalized words, then the suffix
    rule table (longest suffix first), and finally the generic -ing/-ed/-s
    fallbacks. A rule number of 0 means that no rule applied.

    :param str word: the word to calculate the stem of
    :param int max_word_length: the maximum word length allowed
    :param int max_acro_length: the maximum acronym length allowed
    :param str var: variant to use ('Adams', 'Perl', or anything else for the
        default rules)
    :returns: the stem and the number of the rule that produced it
    :rtype: (str, int or float)
    """
    if not word:
        return word, 0

    is_adams = var == 'Adams'

    if is_adams:
        problem_words = _UEALITE_ADAMS_PROBLEM_WORDS
    else:
        problem_words = _UEALITE_PROBLEM_WORDS
    if word in problem_words:
        return word, 90
    # A falsy max_word_length disables the length limit.
    if max_word_length and len(word) > max_word_length:
        return word, 95

    if "'" in word:
        return _uealite_expand_apostrophes(word), 94

    if word.isdigit():
        return word, 90.3

    hyphen = word.find('-')
    if len(word) > hyphen > 0:
        if word[:hyphen].isalpha() and word[hyphen + 1 :].isalpha():
            return word, 90.2
        return word, 90.1
    if '_' in word:
        return word, 90
    if word[-1] == 's' and word[:-1].isupper():
        # Pluralized acronym, e.g. 'CDs'; only the Adams variant enforces
        # the acronym length limit.
        if is_adams and len(word) - 1 > max_acro_length:
            return word, 96
        return word[:-1], 91.1
    if word.isupper():
        if is_adams and len(word) > max_acro_length:
            return word, 96
        return word, 91
    if re_match(r'^.*[A-Z].*[A-Z].*$', word):
        return word, 92
    if word[0].isupper():
        return word, 93
    if is_adams and re_match(r'^[a-z](|[rl])(ing|ed)$', word):
        return word, 97

    if var == 'Perl':
        rule_table = _UEALITE_PERL_RULES
    elif is_adams:
        rule_table = _UEALITE_ADAMS_RULES
    else:
        rule_table = _UEALITE_RULES

    # Longest suffix wins. Every rule number in the tables is non-zero, so
    # the first hit is final and the fallbacks below are skipped.
    for suffix_len in range(7, 1, -1):
        rule = rule_table[suffix_len].get(word[-suffix_len:])
        if rule is None:
            continue
        rule_no, del_len, add_str = rule
        # A del_len of 0 keeps the whole word (word[:-0] would be empty).
        stemmed_word = word[:-del_len] if del_len else word
        if add_str:
            stemmed_word += add_str
        return stemmed_word, rule_no

    if re_match(r'.*\w\wings?$', word):  # rule 58
        return _uealite_strip_suffix_and_doubling(word, 3), 58
    if re_match(r'.*\w\weds?$', word):  # rule 62
        return _uealite_strip_suffix_and_doubling(word, 2), 62
    if word[-1] == 's':  # rule 68
        return word[:-1], 68

    return word, 0


def uealite(
    word, max_word_length=20, max_acro_length=8, return_rule_no=False, var=None
):
    """Return UEA-Lite stem.

    The UEA-Lite stemmer is discussed in :cite:`Jenkins:2005`.

    This is chiefly based on the Java implementation of the algorithm, with
    variants based on the Perl implementation and Jason Adams' Ruby port.

    Java version: :cite:`Churchill:2005`
    Perl version: :cite:`Jenkins:2005`
    Ruby version: :cite:`Adams:2017`

    :param str word: the word to calculate the stem of
    :param int max_word_length: the maximum word length allowed; longer words
        are returned unchanged (a falsy value disables the limit)
    :param int max_acro_length: the maximum acronym length allowed (only
        enforced by the 'Adams' variant)
    :param bool return_rule_no: if True, returns the stem along with rule
        number
    :param str var: variant to use (set to 'Adams' to use Jason Adams' rules,
        or 'Perl' to use the original Perl set of rules)
    :returns: word stem
    :rtype: str or (str, int)

    >>> uealite('readings')
    'read'
    >>> uealite('insulted')
    'insult'
    >>> uealite('cussed')
    'cuss'
    >>> uealite('fancies')
    'fancy'
    >>> uealite('eroded')
    'erode'
    """
    stem, rule_no = _uealite_stem(word, max_word_length, max_acro_length, var)
    if return_rule_no:
        return stem, rule_no
    return stem
421 | |||
427 |