Conditions | 127 |
Total Lines | 294 |
Code Lines | 218 |
Lines | 0 |
Ratio | 0 % |
Tests | 180 |
CRAP Score | 127 |
Changes | 0 |
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include: Extract Method.
If many parameters/temporary variables are present: Replace Temp with Query, Introduce Parameter Object, or Preserve Whole Object.
Complex code like the method abydos.stemmer._porter2.Porter2.stem() often does a lot of different things. To break it down, we need to identify a cohesive component within its enclosing class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
1 | # Copyright 2014-2020 by Christopher C. Little. |
||
def stem(self, word: str) -> str:
    """Return the Porter2 (Snowball English) stem.

    The word is lowercased and NFC-normalized, apostrophe-like characters
    are mapped to U+0027, and the Porter2 steps (0, 1a, 1b, 1c, 2, 3, 4
    and 5) are then applied in order. Words listed in the instance's
    exception tables, and words that are too short to stem, are returned
    early.

    Parameters
    ----------
    word : str
        The word to stem

    Returns
    -------
    str
        Word stem

    Examples
    --------
    >>> stmr = Porter2()
    >>> stmr.stem('reading')
    'read'
    >>> stmr.stem('suspension')
    'suspens'
    >>> stmr.stem('elusiveness')
    'elus'

    >>> stmr = Porter2(early_english=True)
    >>> stmr.stem('eateth')
    'eat'


    .. versionadded:: 0.1.0
    .. versionchanged:: 0.3.6
        Encapsulated in class

    """
    # lowercase, normalize, and compose
    word = normalize('NFC', word.lower())
    # replace apostrophe-like characters (U+2018, U+2019, U+201B) with
    # U+0027, per http://snowball.tartarus.org/texts/apostrophe.html
    for apostrophe in ('\u2018', '\u2019', '\u201b'):
        word = word.replace(apostrophe, "'")

    # Exceptions 1
    if word in self._exception1dict:
        return self._exception1dict[word]
    if word in self._exception1set:
        return word

    # Return word if stem is shorter than 3
    if len(word) < 3:
        return word

    # Remove initial ', if present.
    word = word.lstrip("'")
    # Return word if stem is shorter than 2
    if len(word) < 2:
        return word

    # Mark initial y, and y after a vowel, as Y
    # (Y will be C, y will be V)
    if word[0] == 'y':
        word = 'Y' + word[1:]
    for i in range(1, len(word)):
        if word[i] == 'y' and word[i - 1] in self._vowels:
            word = word[:i] + 'Y' + word[i + 1 :]

    # The region offsets are computed once, on the word as it stands
    # here, and reused by every later step even as the word shrinks.
    r1_start = self._sb_r1(word, self._r1_prefixes)
    r2_start = self._sb_r2(word, self._r1_prefixes)

    # Step 0
    if word.endswith("'s'"):
        word = word[:-3]
    elif word.endswith("'s"):
        word = word[:-2]
    elif word.endswith("'"):
        word = word[:-1]
    # Return word if stem is shorter than 3
    if len(word) < 3:
        return word

    # Step 1a
    if word.endswith('sses'):
        word = word[:-2]
    elif word.endswith(('ied', 'ies')):
        if len(word) > 4:
            word = word[:-2]
        else:
            word = word[:-1]
    elif word.endswith(('us', 'ss')):
        pass
    elif word.endswith('s'):
        if self._sb_has_vowel(word[:-2]):
            word = word[:-1]

    # Exceptions 2
    if word in self._exception2set:
        return word

    # Step 1b
    # A matching suffix ends the chain even when its condition fails, so
    # the inner tests must stay nested rather than be merged into the elif.
    step1b_flag = False
    if word.endswith('eedly'):
        if len(word[r1_start:]) >= 5:
            word = word[:-3]
    elif word.endswith('ingly'):
        if self._sb_has_vowel(word[:-5]):
            word = word[:-5]
            step1b_flag = True
    elif word.endswith('edly'):
        if self._sb_has_vowel(word[:-4]):
            word = word[:-4]
            step1b_flag = True
    elif word.endswith('eed'):
        if len(word[r1_start:]) >= 3:
            word = word[:-1]
    elif word.endswith('ing'):
        if self._sb_has_vowel(word[:-3]):
            word = word[:-3]
            step1b_flag = True
    elif word.endswith('ed'):
        if self._sb_has_vowel(word[:-2]):
            word = word[:-2]
            step1b_flag = True
    elif self._early_english:
        if word.endswith('est'):
            if self._sb_has_vowel(word[:-3]):
                word = word[:-3]
                step1b_flag = True
        elif word.endswith('eth'):
            if self._sb_has_vowel(word[:-3]):
                word = word[:-3]
                step1b_flag = True

    if step1b_flag:
        if word.endswith(('at', 'bl', 'iz')):
            word += 'e'
        elif word[-2:] in self._doubles:
            word = word[:-1]
        elif self._sb_short_word(word, self._r1_prefixes):
            word += 'e'

    # Step 1c
    if (
        len(word) > 2
        and word[-1] in {'Y', 'y'}
        and word[-2] not in self._vowels
    ):
        word = word[:-1] + 'i'

    # Step 2
    # Dispatch on the penultimate letter. A slice is used instead of
    # word[-2] because Step 1b may leave a single character, for which
    # indexing would raise IndexError; the slice is then '' and no branch
    # is taken.
    penultimate = word[-2:-1]
    if penultimate == 'a':
        if word.endswith('ational'):
            if len(word[r1_start:]) >= 7:
                word = word[:-5] + 'e'
        elif word.endswith('tional'):
            if len(word[r1_start:]) >= 6:
                word = word[:-2]
    elif penultimate == 'c':
        if word.endswith(('enci', 'anci')):
            if len(word[r1_start:]) >= 4:
                word = word[:-1] + 'e'
    elif penultimate == 'e':
        if word.endswith('izer'):
            if len(word[r1_start:]) >= 4:
                word = word[:-1]
    elif penultimate == 'g':
        if word.endswith('ogi'):
            if (
                r1_start >= 1
                and len(word[r1_start:]) >= 3
                and word[-4] == 'l'
            ):
                word = word[:-1]
    elif penultimate == 'l':
        if word.endswith('lessli'):
            if len(word[r1_start:]) >= 6:
                word = word[:-2]
        elif word.endswith(('entli', 'fulli', 'ousli')):
            if len(word[r1_start:]) >= 5:
                word = word[:-2]
        elif word.endswith('abli'):
            if len(word[r1_start:]) >= 4:
                word = word[:-1] + 'e'
        elif word.endswith('alli'):
            if len(word[r1_start:]) >= 4:
                word = word[:-2]
        elif word.endswith('bli'):
            if len(word[r1_start:]) >= 3:
                word = word[:-1] + 'e'
        elif word.endswith('li'):
            if (
                r1_start >= 1
                and len(word[r1_start:]) >= 2
                and word[-3] in self._li
            ):
                word = word[:-2]
    elif penultimate == 'o':
        if word.endswith('ization'):
            if len(word[r1_start:]) >= 7:
                word = word[:-5] + 'e'
        elif word.endswith('ation'):
            if len(word[r1_start:]) >= 5:
                word = word[:-3] + 'e'
        elif word.endswith('ator'):
            if len(word[r1_start:]) >= 4:
                word = word[:-2] + 'e'
    elif penultimate == 's':
        if word.endswith(('fulness', 'ousness', 'iveness')):
            if len(word[r1_start:]) >= 7:
                word = word[:-4]
        elif word.endswith('alism'):
            if len(word[r1_start:]) >= 5:
                word = word[:-3]
    elif penultimate == 't':
        if word.endswith('biliti'):
            if len(word[r1_start:]) >= 6:
                word = word[:-5] + 'le'
        elif word.endswith('aliti'):
            if len(word[r1_start:]) >= 5:
                word = word[:-3]
        elif word.endswith('iviti'):
            if len(word[r1_start:]) >= 5:
                word = word[:-3] + 'e'

    # Step 3
    if word.endswith('ational'):
        if len(word[r1_start:]) >= 7:
            word = word[:-5] + 'e'
    elif word.endswith('tional'):
        if len(word[r1_start:]) >= 6:
            word = word[:-2]
    elif word.endswith(('alize', 'icate', 'iciti')):
        if len(word[r1_start:]) >= 5:
            word = word[:-3]
    elif word.endswith('ative'):
        # unlike its neighbours, this suffix is tested against R2
        if len(word[r2_start:]) >= 5:
            word = word[:-5]
    elif word.endswith('ical'):
        if len(word[r1_start:]) >= 4:
            word = word[:-2]
    elif word.endswith('ness'):
        if len(word[r1_start:]) >= 4:
            word = word[:-4]
    elif word.endswith('ful'):
        if len(word[r1_start:]) >= 3:
            word = word[:-3]

    # Step 4
    # Suffixes are ordered longest first; the first one that matches ends
    # the search whether or not it lies in R2 and gets removed.
    for suffix in (
        'ement',
        'ance',
        'ence',
        'able',
        'ible',
        'ment',
        'ant',
        'ent',
        'ism',
        'ate',
        'iti',
        'ous',
        'ive',
        'ize',
        'al',
        'er',
        'ic',
    ):
        if word.endswith(suffix):
            if len(word[r2_start:]) >= len(suffix):
                word = word[: -len(suffix)]
            break
    else:
        # no listed suffix matched: 'ion' is removed only after s or t
        if word.endswith('ion'):
            if (
                len(word[r2_start:]) >= 3
                and len(word) >= 4
                and word[-4] in ('s', 't')
            ):
                word = word[:-3]

    # Step 5
    # endswith() is used instead of indexing so that a very short (or
    # empty) remainder cannot raise IndexError.
    if word.endswith('e'):
        if len(word[r2_start:]) >= 1 or (
            len(word[r1_start:]) >= 1
            and not self._sb_ends_in_short_syllable(word[:-1])
        ):
            word = word[:-1]
    elif word.endswith('l'):
        if len(word[r2_start:]) >= 1 and word.endswith('ll'):
            word = word[:-1]

    # Change 'Y' back to 'y' if it survived stemming
    return word.replace('Y', 'y')
|
386 | 1 | ||
392 |