Conditions | 127 |
Total Lines | 285 |
Code Lines | 218 |
Lines | 0 |
Ratio | 0 % |
Tests | 187 |
CRAP Score | 127 |
Changes | 0 |
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include: Extract Method.
If many parameters/temporary variables are present: Replace Temp with Query, Introduce Parameter Object, or Preserve Whole Object.
Complex classes like abydos.stemmer._porter2.Porter2.stem() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
1 | # -*- coding: utf-8 -*- |
||
def stem(self, word, early_english=False):
    """Return the Porter2 (Snowball English) stem.

    The word is lowercased and NFC-normalized, apostrophe-like characters
    are folded to U+0027, and the Porter2 steps 0 through 5 are then
    applied in order. Words found in the exception tables, and words of
    fewer than three letters, are returned early.

    Args:
        word (str): The word to stem
        early_english (bool): Set to True in order to remove -eth & -est
            (2nd & 3rd person singular verbal agreement suffixes)

    Returns:
        str: Word stem

    Examples:
        >>> stmr = Porter2()
        >>> stmr.stem('reading')
        'read'
        >>> stmr.stem('suspension')
        'suspens'
        >>> stmr.stem('elusiveness')
        'elus'

        >>> stmr.stem('eateth', early_english=True)
        'eat'

    """
    # lowercase, normalize, and compose
    word = normalize('NFC', text_type(word.lower()))
    # replace apostrophe-like characters (U+2018, U+2019 & U+201B) with
    # U+0027, per http://snowball.tartarus.org/texts/apostrophe.html
    for apostrophe in ('‘', '’', '‛'):
        word = word.replace(apostrophe, '\'')

    # Exceptions 1
    if word in self._exception1dict:
        return self._exception1dict[word]
    elif word in self._exception1set:
        return word

    # Words of one or two letters are left as they are
    if len(word) < 3:
        return word

    # Remove initial ', if present.
    while word and word[0] == '\'':
        word = word[1:]
        # Return word if what is left is shorter than 2
        if len(word) < 2:
            return word

    # Mark consonantal y as Y: an initial y, or a y that follows a vowel
    # (Y is then treated as a consonant, y as a vowel)
    if word[0] == 'y':
        word = 'Y' + word[1:]
    for i in range(1, len(word)):
        if word[i] == 'y' and word[i - 1] in self._vowels:
            word = word[:i] + 'Y' + word[i + 1 :]

    # The regions are computed once, on the word as it stands here; later
    # steps test "suffix is in R1/R2" as len(word[rN_start:]) >= len(suffix)
    r1_start = self._sb_r1(word, self._r1_prefixes)
    r2_start = self._sb_r2(word, self._r1_prefixes)

    # Step 0
    if word[-3:] == '\'s\'':
        word = word[:-3]
    elif word[-2:] == '\'s':
        word = word[:-2]
    elif word[-1:] == '\'':
        word = word[:-1]
    # Return word if it is now shorter than 3
    if len(word) < 3:
        return word

    # Step 1a
    if word[-4:] == 'sses':
        word = word[:-2]
    elif word[-3:] in {'ied', 'ies'}:
        if len(word) > 4:
            word = word[:-2]
        else:
            word = word[:-1]
    elif word[-2:] in {'us', 'ss'}:
        pass
    elif word[-1] == 's':
        # the vowel must not be the letter immediately before the s
        if self._sb_has_vowel(word[:-2]):
            word = word[:-1]

    # Exceptions 2
    if word in self._exception2set:
        return word

    # Step 1b
    step1b_flag = False
    if word[-5:] == 'eedly':
        if len(word[r1_start:]) >= 5:
            word = word[:-3]
    elif word[-5:] == 'ingly':
        if self._sb_has_vowel(word[:-5]):
            word = word[:-5]
            step1b_flag = True
    elif word[-4:] == 'edly':
        if self._sb_has_vowel(word[:-4]):
            word = word[:-4]
            step1b_flag = True
    elif word[-3:] == 'eed':
        if len(word[r1_start:]) >= 3:
            word = word[:-1]
    elif word[-3:] == 'ing':
        if self._sb_has_vowel(word[:-3]):
            word = word[:-3]
            step1b_flag = True
    elif word[-2:] == 'ed':
        if self._sb_has_vowel(word[:-2]):
            word = word[:-2]
            step1b_flag = True
    elif early_english:
        if word[-3:] == 'est':
            if self._sb_has_vowel(word[:-3]):
                word = word[:-3]
                step1b_flag = True
        elif word[-3:] == 'eth':
            if self._sb_has_vowel(word[:-3]):
                word = word[:-3]
                step1b_flag = True

    # Post-process the stem only if a suffix was actually deleted above
    if step1b_flag:
        if word[-2:] in {'at', 'bl', 'iz'}:
            word += 'e'
        elif word[-2:] in self._doubles:
            word = word[:-1]
        elif self._sb_short_word(word, self._r1_prefixes):
            word += 'e'

    # Step 1c
    if (
        len(word) > 2
        and word[-1] in {'Y', 'y'}
        and word[-2] not in self._vowels
    ):
        word = word[:-1] + 'i'

    # Step 2
    # Step 1b can leave a single-letter word (e.g. 'aed' -> 'a'); slicing
    # yields '' in that case instead of raising IndexError, so that no
    # Step 2 branch is taken.
    penult = word[-2:-1]
    if penult == 'a':
        if word[-7:] == 'ational':
            if len(word[r1_start:]) >= 7:
                word = word[:-5] + 'e'
        elif word[-6:] == 'tional':
            if len(word[r1_start:]) >= 6:
                word = word[:-2]
    elif penult == 'c':
        if word[-4:] in {'enci', 'anci'}:
            if len(word[r1_start:]) >= 4:
                word = word[:-1] + 'e'
    elif penult == 'e':
        if word[-4:] == 'izer':
            if len(word[r1_start:]) >= 4:
                word = word[:-1]
    elif penult == 'g':
        if word[-3:] == 'ogi':
            if (
                r1_start >= 1
                and len(word[r1_start:]) >= 3
                and word[-4] == 'l'
            ):
                word = word[:-1]
    elif penult == 'l':
        if word[-6:] == 'lessli':
            if len(word[r1_start:]) >= 6:
                word = word[:-2]
        elif word[-5:] in {'entli', 'fulli', 'ousli'}:
            if len(word[r1_start:]) >= 5:
                word = word[:-2]
        elif word[-4:] == 'abli':
            if len(word[r1_start:]) >= 4:
                word = word[:-1] + 'e'
        elif word[-4:] == 'alli':
            if len(word[r1_start:]) >= 4:
                word = word[:-2]
        elif word[-3:] == 'bli':
            if len(word[r1_start:]) >= 3:
                word = word[:-1] + 'e'
        elif word[-2:] == 'li':
            if (
                r1_start >= 1
                and len(word[r1_start:]) >= 2
                and word[-3] in self._li
            ):
                word = word[:-2]
    elif penult == 'o':
        if word[-7:] == 'ization':
            if len(word[r1_start:]) >= 7:
                word = word[:-5] + 'e'
        elif word[-5:] == 'ation':
            if len(word[r1_start:]) >= 5:
                word = word[:-3] + 'e'
        elif word[-4:] == 'ator':
            if len(word[r1_start:]) >= 4:
                word = word[:-2] + 'e'
    elif penult == 's':
        if word[-7:] in {'fulness', 'ousness', 'iveness'}:
            if len(word[r1_start:]) >= 7:
                word = word[:-4]
        elif word[-5:] == 'alism':
            if len(word[r1_start:]) >= 5:
                word = word[:-3]
    elif penult == 't':
        if word[-6:] == 'biliti':
            if len(word[r1_start:]) >= 6:
                word = word[:-5] + 'le'
        elif word[-5:] == 'aliti':
            if len(word[r1_start:]) >= 5:
                word = word[:-3]
        elif word[-5:] == 'iviti':
            if len(word[r1_start:]) >= 5:
                word = word[:-3] + 'e'

    # Step 3
    if word[-7:] == 'ational':
        if len(word[r1_start:]) >= 7:
            word = word[:-5] + 'e'
    elif word[-6:] == 'tional':
        if len(word[r1_start:]) >= 6:
            word = word[:-2]
    elif word[-5:] in {'alize', 'icate', 'iciti'}:
        if len(word[r1_start:]) >= 5:
            word = word[:-3]
    elif word[-5:] == 'ative':
        if len(word[r2_start:]) >= 5:
            word = word[:-5]
    elif word[-4:] == 'ical':
        if len(word[r1_start:]) >= 4:
            word = word[:-2]
    elif word[-4:] == 'ness':
        if len(word[r1_start:]) >= 4:
            word = word[:-4]
    elif word[-3:] == 'ful':
        if len(word[r1_start:]) >= 3:
            word = word[:-3]

    # Step 4
    # Suffixes are listed longest first; the first one that matches ends
    # the search, whether or not it lies in R2 and gets deleted.
    for suffix in (
        'ement',
        'ance',
        'ence',
        'able',
        'ible',
        'ment',
        'ant',
        'ent',
        'ism',
        'ate',
        'iti',
        'ous',
        'ive',
        'ize',
        'al',
        'er',
        'ic',
    ):
        if word[-len(suffix) :] == suffix:
            if len(word[r2_start:]) >= len(suffix):
                word = word[: -len(suffix)]
            break
    else:
        # No listed suffix matched: -ion is deleted only after s or t
        if word[-3:] == 'ion':
            if (
                len(word[r2_start:]) >= 3
                and len(word) >= 4
                and word[-4] in {'s', 't'}
            ):
                word = word[:-3]

    # Step 5
    # Slices are used instead of indexing so a very short word cannot
    # raise IndexError here.
    if word[-1:] == 'e':
        if len(word[r2_start:]) >= 1 or (
            len(word[r1_start:]) >= 1
            and not self._sb_ends_in_short_syllable(word[:-1])
        ):
            word = word[:-1]
    elif word[-2:] == 'll':
        if len(word[r2_start:]) >= 1:
            word = word[:-1]

    # Change 'Y' back to 'y' if it survived stemming
    return word.replace('Y', 'y')
|
372 | |||
406 |