abydos.stemmer._porter2.Porter2.stem()   F
last analyzed

Complexity

Conditions 127

Size

Total Lines 294
Code Lines 218

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 180
CRAP Score 127

Importance

Changes 0
Metric Value
eloc 218
dl 0
loc 294
ccs 180
cts 180
cp 1
rs 0
c 0
b 0
f 0
cc 127
nop 2
crap 127

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

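Commonly applied refactorings include Extract Method and, when many parameters or temporary variables are involved, Replace Temp with Query, Introduce Parameter Object, and Preserve Whole Object.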

Complexity

Complex units like the method abydos.stemmer._porter2.Porter2.stem() often do a lot of different things. To break such a unit down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for fields, methods, or local steps that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# Copyright 2014-2020 by Christopher C. Little.
2
# This file is part of Abydos.
3
#
4
# Abydos is free software: you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation, either version 3 of the License, or
7
# (at your option) any later version.
8
#
9
# Abydos is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
# GNU General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17
"""abydos.stemmer._porter2.
18
19 1
Porter2 (Snowball English) stemmer
20
"""
21
22
from unicodedata import normalize
23
24 1
from ._snowball import _Snowball
25
26
# Public API of this module: only the Porter2 class is exported by a
# wildcard import.
__all__ = ['Porter2']
27
28
29
class Porter2(_Snowball):
    """Porter2 (Snowball English) stemmer.

    The Porter2 (Snowball English) stemmer is defined in :cite:`Porter:2002`.

    .. versionadded:: 0.3.6
    """

    # Doubled consonants that Step 1b reduces to a single letter
    _doubles = {'bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt'}
    # Letters that may precede the suffix -li for it to be removed in Step 2
    _li = {'c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't'}

    # R1 prefixes should be in order from longest to shortest to prevent
    # masking
    _r1_prefixes = ('commun', 'gener', 'arsen')
    # Whole words that are mapped directly to a fixed stem
    _exception1dict = {  # special changes:
        'skis': 'ski',
        'skies': 'sky',
        'dying': 'die',
        'lying': 'lie',
        'tying': 'tie',
        # special -LY cases:
        'idly': 'idl',
        'gently': 'gentl',
        'ugly': 'ugli',
        'early': 'earli',
        'only': 'onli',
        'singly': 'singl',
    }
    # Whole words that are returned unchanged
    _exception1set = {
        'sky',
        'news',
        'howe',
        'atlas',
        'cosmos',
        'bias',
        'andes',
    }
    # Words that are returned unchanged if they are the result of Step 1a
    _exception2set = {
        'inning',
        'outing',
        'canning',
        'herring',
        'earring',
        'proceed',
        'exceed',
        'succeed',
    }

    def __init__(self, early_english: bool = False) -> None:
        """Initialize Porter2 instance.

        Parameters
        ----------
        early_english : bool
            Set to True in order to remove -eth & -est (2nd & 3rd person
            singular verbal agreement suffixes)


        .. versionadded:: 0.4.0

        """
        self._early_english = early_english

    def stem(self, word: str) -> str:
        """Return the Porter2 (Snowball English) stem.

        The word is lowercased and NFC-normalized, apostrophe-like
        characters are mapped to U+0027, and the exception lists are
        consulted; otherwise Steps 0 through 5 of the algorithm are applied
        in order.

        Parameters
        ----------
        word : str
            The word to stem

        Returns
        -------
        str
            Word stem

        Examples
        --------
        >>> stmr = Porter2()
        >>> stmr.stem('reading')
        'read'
        >>> stmr.stem('suspension')
        'suspens'
        >>> stmr.stem('elusiveness')
        'elus'

        >>> stmr = Porter2(early_english=True)
        >>> stmr.stem('eateth')
        'eat'


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # lowercase, normalize, and compose
        word = normalize('NFC', word.lower())
        # replace apostrophe-like characters with U+0027, per
        # http://snowball.tartarus.org/texts/apostrophe.html
        # (U+2018 left quote, U+2019 right quote, U+201B reversed-9 quote)
        for mark in ('\u2018', '\u2019', '\u201b'):
            word = word.replace(mark, "'")

        # Exceptions 1
        if word in self._exception1dict:
            return self._exception1dict[word]
        if word in self._exception1set:
            return word

        # Return word if stem is shorter than 3
        if len(word) < 3:
            return word

        # Remove initial ', if present.
        while word and word[0] == "'":
            word = word[1:]
            # Return word if stem is shorter than 2
            if len(word) < 2:
                return word

        # Re-map vocalic Y to y (Y will be C, y will be V)
        if word[0] == 'y':
            word = 'Y' + word[1:]
        for i in range(1, len(word)):
            if word[i] == 'y' and word[i - 1] in self._vowels:
                word = word[:i] + 'Y' + word[i + 1 :]

        # The regions are computed once, before any suffix is removed.
        r1_start = self._sb_r1(word, self._r1_prefixes)
        r2_start = self._sb_r2(word, self._r1_prefixes)

        word = self._p2_step0(word)
        # Return word if stem is shorter than 3. The y -> Y marking above has
        # already been applied, so it must be undone here as well; otherwise
        # an input such as "y's" would be returned as 'Y'.
        if len(word) < 3:
            return word.replace('Y', 'y')

        word = self._p2_step1a(word)

        # Exceptions 2
        if word in self._exception2set:
            return word

        word = self._p2_step1b(word, r1_start)
        word = self._p2_step1c(word)
        word = self._p2_step2(word, r1_start)
        word = self._p2_step3(word, r1_start, r2_start)
        word = self._p2_step4(word, r2_start)
        word = self._p2_step5(word, r1_start, r2_start)

        # Change 'Y' back to 'y' if it survived stemming
        return word.replace('Y', 'y')

    @staticmethod
    def _p2_in_region(word: str, region_start: int, suffix_len: int) -> bool:
        """Return True if a suffix of suffix_len fits in word[region_start:]."""
        return len(word[region_start:]) >= suffix_len

    @staticmethod
    def _p2_step0(word: str) -> str:
        """Step 0: strip a trailing 's', 's, or ' (longest match only)."""
        if word[-3:] == "'s'":
            return word[:-3]
        if word[-2:] == "'s":
            return word[:-2]
        if word[-1:] == "'":
            return word[:-1]
        return word

    def _p2_step1a(self, word: str) -> str:
        """Step 1a: handle the plural-like endings -sses, -ied, -ies, -s."""
        if word[-4:] == 'sses':
            return word[:-2]
        if word[-3:] in {'ied', 'ies'}:
            # Longer words lose two letters (-> i), shorter ones one (-> ie).
            return word[:-2] if len(word) > 4 else word[:-1]
        if word[-2:] in {'us', 'ss'}:
            return word
        # A final -s is dropped only if a vowel occurs before the letter
        # that immediately precedes it.
        if word[-1:] == 's' and self._sb_has_vowel(word[:-2]):
            return word[:-1]
        return word

    def _p2_step1b(self, word: str, r1_start: int) -> str:
        """Step 1b: handle -eed(ly), -ed(ly), -ing(ly), and optionally -est/-eth."""
        # Set when a suffix was deleted and the stem may need repair below.
        removed = False
        if word[-5:] == 'eedly':
            if self._p2_in_region(word, r1_start, 5):
                word = word[:-3]
        elif word[-5:] == 'ingly':
            if self._sb_has_vowel(word[:-5]):
                word = word[:-5]
                removed = True
        elif word[-4:] == 'edly':
            if self._sb_has_vowel(word[:-4]):
                word = word[:-4]
                removed = True
        elif word[-3:] == 'eed':
            if self._p2_in_region(word, r1_start, 3):
                word = word[:-1]
        elif word[-3:] == 'ing':
            if self._sb_has_vowel(word[:-3]):
                word = word[:-3]
                removed = True
        elif word[-2:] == 'ed':
            if self._sb_has_vowel(word[:-2]):
                word = word[:-2]
                removed = True
        elif self._early_english:
            # Early-English verbal agreement suffixes, only when enabled.
            if word[-3:] in {'est', 'eth'} and self._sb_has_vowel(word[:-3]):
                word = word[:-3]
                removed = True

        if removed:
            if word[-2:] in {'at', 'bl', 'iz'}:
                word += 'e'
            elif word[-2:] in self._doubles:
                word = word[:-1]
            elif self._sb_short_word(word, self._r1_prefixes):
                word += 'e'
        return word

    def _p2_step1c(self, word: str) -> str:
        """Step 1c: turn a final y/Y into i when preceded by a non-vowel."""
        if (
            len(word) > 2
            and word[-1] in {'Y', 'y'}
            and word[-2] not in self._vowels
        ):
            return word[:-1] + 'i'
        return word

    def _p2_step2(self, word: str, r1_start: int) -> str:
        """Step 2: rewrite the longest matching derivational suffix within R1."""
        in_r1 = self._p2_in_region
        # Sliced rather than indexed: Step 1b may leave a one-character stem
        # (e.g. when -ed is stripped from a three-letter word), for which
        # word[-2] would raise IndexError.
        penult = word[-2:-1]
        if penult == 'a':
            if word[-7:] == 'ational':
                if in_r1(word, r1_start, 7):
                    word = word[:-5] + 'e'
            elif word[-6:] == 'tional':
                if in_r1(word, r1_start, 6):
                    word = word[:-2]
        elif penult == 'c':
            if word[-4:] in {'enci', 'anci'}:
                if in_r1(word, r1_start, 4):
                    word = word[:-1] + 'e'
        elif penult == 'e':
            if word[-4:] == 'izer':
                if in_r1(word, r1_start, 4):
                    word = word[:-1]
        elif penult == 'g':
            if word[-3:] == 'ogi':
                if (
                    r1_start >= 1
                    and in_r1(word, r1_start, 3)
                    and word[-4] == 'l'
                ):
                    word = word[:-1]
        elif penult == 'l':
            if word[-6:] == 'lessli':
                if in_r1(word, r1_start, 6):
                    word = word[:-2]
            elif word[-5:] in {'entli', 'fulli', 'ousli'}:
                if in_r1(word, r1_start, 5):
                    word = word[:-2]
            elif word[-4:] == 'abli':
                if in_r1(word, r1_start, 4):
                    word = word[:-1] + 'e'
            elif word[-4:] == 'alli':
                if in_r1(word, r1_start, 4):
                    word = word[:-2]
            elif word[-3:] == 'bli':
                if in_r1(word, r1_start, 3):
                    word = word[:-1] + 'e'
            elif word[-2:] == 'li':
                if (
                    r1_start >= 1
                    and in_r1(word, r1_start, 2)
                    and word[-3] in self._li
                ):
                    word = word[:-2]
        elif penult == 'o':
            if word[-7:] == 'ization':
                if in_r1(word, r1_start, 7):
                    word = word[:-5] + 'e'
            elif word[-5:] == 'ation':
                if in_r1(word, r1_start, 5):
                    word = word[:-3] + 'e'
            elif word[-4:] == 'ator':
                if in_r1(word, r1_start, 4):
                    word = word[:-2] + 'e'
        elif penult == 's':
            if word[-7:] in {'fulness', 'ousness', 'iveness'}:
                if in_r1(word, r1_start, 7):
                    word = word[:-4]
            elif word[-5:] == 'alism':
                if in_r1(word, r1_start, 5):
                    word = word[:-3]
        elif penult == 't':
            if word[-6:] == 'biliti':
                if in_r1(word, r1_start, 6):
                    word = word[:-5] + 'le'
            elif word[-5:] == 'aliti':
                if in_r1(word, r1_start, 5):
                    word = word[:-3]
            elif word[-5:] == 'iviti':
                if in_r1(word, r1_start, 5):
                    word = word[:-3] + 'e'
        return word

    def _p2_step3(self, word: str, r1_start: int, r2_start: int) -> str:
        """Step 3: rewrite or delete the longest matching suffix (R1; -ative in R2)."""
        in_region = self._p2_in_region
        if word[-7:] == 'ational':
            if in_region(word, r1_start, 7):
                word = word[:-5] + 'e'
        elif word[-6:] == 'tional':
            if in_region(word, r1_start, 6):
                word = word[:-2]
        elif word[-5:] in {'alize', 'icate', 'iciti'}:
            if in_region(word, r1_start, 5):
                word = word[:-3]
        elif word[-5:] == 'ative':
            # Unlike the other suffixes here, -ative is tested against R2.
            if in_region(word, r2_start, 5):
                word = word[:-5]
        elif word[-4:] == 'ical':
            if in_region(word, r1_start, 4):
                word = word[:-2]
        elif word[-4:] == 'ness':
            if in_region(word, r1_start, 4):
                word = word[:-4]
        elif word[-3:] == 'ful':
            if in_region(word, r1_start, 3):
                word = word[:-3]
        return word

    def _p2_step4(self, word: str, r2_start: int) -> str:
        """Step 4: delete the first matching suffix in the list if it lies in R2."""
        # Ordered so that longer suffixes are tried before their own tails
        # (e.g. 'ement' before 'ment' before 'ent').
        for suffix in (
            'ement',
            'ance',
            'ence',
            'able',
            'ible',
            'ment',
            'ant',
            'ent',
            'ism',
            'ate',
            'iti',
            'ous',
            'ive',
            'ize',
            'al',
            'er',
            'ic',
        ):
            if word[-len(suffix) :] == suffix:
                if self._p2_in_region(word, r2_start, len(suffix)):
                    word = word[: -len(suffix)]
                # The first match ends the search whether or not it was
                # removed.
                break
        else:
            # -ion is removed only when it follows s or t.
            if word[-3:] == 'ion':
                if (
                    self._p2_in_region(word, r2_start, 3)
                    and len(word) >= 4
                    and word[-4] in {'s', 't'}
                ):
                    word = word[:-3]
        return word

    def _p2_step5(self, word: str, r1_start: int, r2_start: int) -> str:
        """Step 5: drop a final e, or one l of a final ll, subject to R1/R2 tests."""
        # Slices are used instead of indices so that very short stems cannot
        # raise IndexError.
        if word[-1:] == 'e':
            if self._p2_in_region(word, r2_start, 1) or (
                self._p2_in_region(word, r1_start, 1)
                and not self._sb_ends_in_short_syllable(word[:-1])
            ):
                word = word[:-1]
        elif word[-1:] == 'l':
            if self._p2_in_region(word, r2_start, 1) and word[-2:-1] == 'l':
                word = word[:-1]
        return word
386 1
387
388
if __name__ == '__main__':
    # When executed as a script, run the doctest examples in this module.
    from doctest import testmod

    testmod()
392