Conditions | 127 |
Total Lines | 291 |
Code Lines | 218 |
Lines | 0 |
Ratio | 0 % |
Tests | 187 |
CRAP Score | 127 |
Changes | 0 |
Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.
For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.
Commonly applied refactorings include: Extract Method.
If many parameters/temporary variables are present: Replace Temp with Query, Introduce Parameter Object, or Preserve Whole Object.
Complex methods like abydos.stemmer._porter2.Porter2.stem() often do a lot of different things, which is usually a sign that the enclosing class does too much as well. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
1 | # -*- coding: utf-8 -*- |
||
def stem(self, word, early_english=False):
    """Return the Porter2 (Snowball English) stem.

    The word is lowercased, NFC-normalized, and has apostrophe-like
    characters mapped to U+0027 before the Porter2 steps 0-5 are applied
    in order.

    Parameters
    ----------
    word : str
        The word to stem
    early_english : bool
        Set to True in order to remove -eth & -est (2nd & 3rd person
        singular verbal agreement suffixes)

    Returns
    -------
    str
        Word stem

    Examples
    --------
    >>> stmr = Porter2()
    >>> stmr.stem('reading')
    'read'
    >>> stmr.stem('suspension')
    'suspens'
    >>> stmr.stem('elusiveness')
    'elus'

    >>> stmr.stem('eateth', early_english=True)
    'eat'

    """
    # lowercase, normalize, and compose
    word = normalize('NFC', text_type(word.lower()))
    # replace apostrophe-like characters (U+2018, U+2019, U+201B) with
    # U+0027, per http://snowball.tartarus.org/texts/apostrophe.html
    for mark in ('‘', '’', '‛'):
        word = word.replace(mark, '\'')

    # Exceptions 1
    if word in self._exception1dict:
        return self._exception1dict[word]
    if word in self._exception1set:
        return word

    # Return word if it is shorter than 3
    if len(word) < 3:
        return word

    # Remove initial ', if present.
    word = word.lstrip('\'')
    # Return word if what remains is shorter than 2
    if len(word) < 2:
        return word

    # Re-map consonantal y to Y (Y will be C, y will be V). The scan is
    # sequential on purpose: a y that follows a just-converted Y is not
    # preceded by a vowel and therefore stays lowercase.
    chars = list(word)
    if chars[0] == 'y':
        chars[0] = 'Y'
    for i in range(1, len(chars)):
        if chars[i] == 'y' and chars[i - 1] in self._vowels:
            chars[i] = 'Y'
    word = ''.join(chars)

    # Region offsets are computed once, before any suffix is removed;
    # every later "suffix is in R1/R2" test measures from these offsets.
    r1_start = self._sb_r1(word, self._r1_prefixes)
    r2_start = self._sb_r2(word, self._r1_prefixes)

    # Step 0
    if word[-3:] == '\'s\'':
        word = word[:-3]
    elif word[-2:] == '\'s':
        word = word[:-2]
    elif word[-1:] == '\'':
        word = word[:-1]
    # Return word if stem is shorter than 3
    if len(word) < 3:
        return word

    # Step 1a
    if word[-4:] == 'sses':
        word = word[:-2]
    elif word[-3:] in {'ied', 'ies'}:
        if len(word) > 4:
            word = word[:-2]
        else:
            word = word[:-1]
    elif word[-1] == 's' and word[-2:] not in {'us', 'ss'}:
        if self._sb_has_vowel(word[:-2]):
            word = word[:-1]

    # Exceptions 2
    if word in self._exception2set:
        return word

    # Step 1b
    step1b_flag = False
    if word[-5:] == 'eedly':
        if len(word[r1_start:]) >= 5:
            word = word[:-3]
    elif word[-3:] == 'eed':
        if len(word[r1_start:]) >= 3:
            word = word[:-1]
    else:
        # Longest suffix first; the first suffix that matches is the
        # only one considered, whether or not it ends up being removed.
        deletable = ('ingly', 'edly', 'ing', 'ed')
        if early_english:
            deletable += ('est', 'eth')
        for suffix in deletable:
            if word.endswith(suffix):
                remainder = word[: -len(suffix)]
                if self._sb_has_vowel(remainder):
                    word = remainder
                    step1b_flag = True
                break

    if step1b_flag:
        if word[-2:] in {'at', 'bl', 'iz'}:
            word += 'e'
        elif word[-2:] in self._doubles:
            word = word[:-1]
        elif self._sb_short_word(word, self._r1_prefixes):
            word += 'e'

    # Step 1c
    if (
        len(word) > 2
        and word[-1] in {'Y', 'y'}
        and word[-2] not in self._vowels
    ):
        word = word[:-1] + 'i'

    # Step 2
    # Each rule is (suffix, characters to strip, replacement). Rules that
    # can overlap are ordered longest-first, and only the first matching
    # suffix is considered. Matching uses slices rather than indexing so
    # that a stem reduced to a single character by Step 1b cannot raise
    # IndexError here.
    step2_rules = (
        ('ational', 5, 'e'),
        ('tional', 2, ''),
        ('enci', 1, 'e'),
        ('anci', 1, 'e'),
        ('izer', 1, ''),
        ('lessli', 2, ''),
        ('entli', 2, ''),
        ('fulli', 2, ''),
        ('ousli', 2, ''),
        ('abli', 1, 'e'),
        ('alli', 2, ''),
        ('bli', 1, 'e'),
        ('ization', 5, 'e'),
        ('ation', 3, 'e'),
        ('ator', 2, 'e'),
        ('fulness', 4, ''),
        ('ousness', 4, ''),
        ('iveness', 4, ''),
        ('alism', 3, ''),
        ('biliti', 5, 'le'),
        ('aliti', 3, ''),
        ('iviti', 3, 'e'),
    )
    for suffix, strip, replacement in step2_rules:
        if word.endswith(suffix):
            if len(word[r1_start:]) >= len(suffix):
                word = word[:-strip] + replacement
            break
    else:
        # The two suffixes with an extra condition on the preceding
        # letter; they are only tried when no rule above matched.
        if word[-3:] == 'ogi':
            if (
                r1_start >= 1
                and len(word[r1_start:]) >= 3
                and word[-4] == 'l'
            ):
                word = word[:-1]
        elif word[-2:] == 'li':
            if (
                r1_start >= 1
                and len(word[r1_start:]) >= 2
                and word[-3] in self._li
            ):
                word = word[:-2]

    # Step 3
    # Each rule is (suffix, region start, characters to strip,
    # replacement); only -ative is tested against R2 instead of R1.
    step3_rules = (
        ('ational', r1_start, 5, 'e'),
        ('tional', r1_start, 2, ''),
        ('alize', r1_start, 3, ''),
        ('icate', r1_start, 3, ''),
        ('iciti', r1_start, 3, ''),
        ('ative', r2_start, 5, ''),
        ('ical', r1_start, 2, ''),
        ('ness', r1_start, 4, ''),
        ('ful', r1_start, 3, ''),
    )
    for suffix, region_start, strip, replacement in step3_rules:
        if word.endswith(suffix):
            if len(word[region_start:]) >= len(suffix):
                word = word[:-strip] + replacement
            break

    # Step 4
    for suffix in (
        'ement',
        'ance',
        'ence',
        'able',
        'ible',
        'ment',
        'ant',
        'ent',
        'ism',
        'ate',
        'iti',
        'ous',
        'ive',
        'ize',
        'al',
        'er',
        'ic',
    ):
        if word.endswith(suffix):
            if len(word[r2_start:]) >= len(suffix):
                word = word[: -len(suffix)]
            break
    else:
        # -ion is only removed when it follows s or t
        if word[-3:] == 'ion':
            if (
                len(word[r2_start:]) >= 3
                and len(word) >= 4
                and word[-4] in ('s', 't')
            ):
                word = word[:-3]

    # Step 5
    if word[-1:] == 'e':
        if len(word[r2_start:]) >= 1 or (
            len(word[r1_start:]) >= 1
            and not self._sb_ends_in_short_syllable(word[:-1])
        ):
            word = word[:-1]
    elif word[-2:] == 'll' and len(word[r2_start:]) >= 1:
        word = word[:-1]

    # Change 'Y' back to 'y' if it survived stemming
    return word.replace('Y', 'y')
|
378 | |||
418 |