abydos.stemmer._porter.Porter._m_degree()   A
last analyzed

Complexity

Conditions 4

Size

Total Lines 31
Code Lines 10

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 5
CRAP Score 4

Importance

Changes 0
Metric Value
eloc 10
dl 0
loc 31
ccs 5
cts 5
cp 1
rs 9.9
c 0
b 0
f 0
cc 4
nop 2
crap 4
# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17
"""abydos.stemmer._porter.
18
19 1
Porter stemmer
20
"""
21
22
from unicodedata import normalize
23
24 1
from ._stemmer import _Stemmer
25
26
__all__ = ['Porter']
27
28
29
class Porter(_Stemmer):
    """Porter stemmer.

    The Porter stemmer is described in :cite:`Porter:1980`.

    .. versionadded:: 0.3.6
    """

    # Lowercase 'y' counts as a vowel; consonantal y is temporarily
    # re-mapped to uppercase 'Y' inside stem() so that it is treated as a
    # consonant by the helpers below.
    _vowels = {'a', 'e', 'i', 'o', 'u', 'y'}

    # Step 2 rules as (suffix, replacement) pairs. The first suffix that
    # matches decides the outcome; the replacement is applied only when the
    # remaining stem has an m-degree greater than 0.
    _step2_rules = (
        ('ational', 'ate'),
        ('tional', 'tion'),
        ('enci', 'ence'),
        ('anci', 'ance'),
        ('izer', 'ize'),
        ('logi', 'log'),
        ('bli', 'ble'),
        ('alli', 'al'),
        ('entli', 'ent'),
        ('eli', 'e'),
        ('ousli', 'ous'),
        ('ization', 'ize'),
        ('ation', 'ate'),
        ('ator', 'ate'),
        ('alism', 'al'),
        ('iveness', 'ive'),
        ('fulness', 'ful'),
        ('ousness', 'ous'),
        ('aliti', 'al'),
        ('iviti', 'ive'),
        ('biliti', 'ble'),
    )

    # Step 3 rules, same format and semantics as the Step 2 rules.
    _step3_rules = (
        ('icate', 'ic'),
        ('ative', ''),
        ('alize', 'al'),
        ('iciti', 'ic'),
        ('ical', 'ic'),
        ('ful', ''),
        ('ness', ''),
    )

    # Step 4 rules as (suffix, number of trailing characters to strip).
    # For 'sion'/'tion' only 'ion' is stripped, so the measured stem keeps
    # its final 's' or 't'. Order matters: 'ement' must be tried before
    # 'ment', which must be tried before 'ent'.
    _step4_rules = (
        ('al', 2),
        ('ance', 4),
        ('ence', 4),
        ('er', 2),
        ('ic', 2),
        ('able', 4),
        ('ible', 4),
        ('ant', 3),
        ('ement', 5),
        ('ment', 4),
        ('ent', 3),
        ('sion', 3),
        ('tion', 3),
        ('ou', 2),
        ('ism', 3),
        ('ate', 3),
        ('iti', 3),
        ('ous', 3),
        ('ive', 3),
        ('ize', 3),
    )

    def _m_degree(self, term: str) -> int:
        """Return Porter helper function _m_degree value.

        m-degree is equal to the number of V to C transitions

        Parameters
        ----------
        term : str
            The word for which to calculate the m-degree

        Returns
        -------
        int
            The m-degree as defined in the Porter stemmer definition


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        mdeg = 0
        last_was_vowel = False
        for letter in term:
            is_vowel = letter in self._vowels
            # Count each place where a vowel is directly followed by a
            # non-vowel.
            if last_was_vowel and not is_vowel:
                mdeg += 1
            last_was_vowel = is_vowel
        return mdeg

    def _has_vowel(self, term: str) -> bool:
        """Return Porter helper function _has_vowel value.

        Parameters
        ----------
        term : str
            The word to scan for vowels

        Returns
        -------
        bool
            True iff a vowel exists in the term (as defined in the Porter
            stemmer definition)


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        return any(letter in self._vowels for letter in term)

    def _ends_in_doubled_cons(self, term: str) -> bool:
        """Return Porter helper function _ends_in_doubled_cons value.

        Parameters
        ----------
        term : str
            The word to check for a final doubled consonant

        Returns
        -------
        bool
            True iff the stem ends in a doubled consonant (as defined in the
            Porter stemmer definition)


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        return (
            len(term) > 1
            and term[-1] not in self._vowels
            and term[-2] == term[-1]
        )

    def _ends_in_cvc(self, term: str) -> bool:
        """Return Porter helper function _ends_in_cvc value.

        Parameters
        ----------
        term : str
            The word to scan for cvc

        Returns
        -------
        bool
            True iff the stem ends in cvc (as defined in the Porter stemmer
            definition)


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        if len(term) < 3:
            return False
        first, middle, last = term[-3], term[-2], term[-1]
        return (
            first not in self._vowels
            and middle in self._vowels
            and last not in self._vowels
            # A final w, x, or (consonantal) Y does not count.
            and last not in {'w', 'x', 'Y'}
        )

    def _replace_suffix(self, word: str, rules: tuple) -> str:
        """Apply the first matching (suffix, replacement) rule to a word.

        Parameters
        ----------
        word : str
            The word to transform
        rules : tuple
            Ordered (suffix, replacement) pairs

        Returns
        -------
        str
            The word with the first matching suffix replaced, provided the
            stem preceding that suffix has an m-degree greater than 0;
            otherwise the word unchanged. Once a suffix matches, no later
            rule is tried, even if the m-degree condition fails.

        """
        for suffix, replacement in rules:
            if word.endswith(suffix):
                stem = word[: -len(suffix)]
                if self._m_degree(stem) > 0:
                    return stem + replacement
                return word
        return word

    def __init__(self, early_english: bool = False) -> None:
        """Initialize Porter instance.

        Parameters
        ----------
        early_english : bool
            Set to True in order to remove -eth & -est (2nd & 3rd person
            singular verbal agreement suffixes)


        .. versionadded:: 0.4.0

        """
        self._early_english = early_english

    def stem(self, word: str) -> str:
        """Return Porter stem.

        Parameters
        ----------
        word : str
            The word to stem

        Returns
        -------
        str
            Word stem

        Examples
        --------
        >>> stmr = Porter()
        >>> stmr.stem('reading')
        'read'
        >>> stmr.stem('suspension')
        'suspens'
        >>> stmr.stem('elusiveness')
        'elus'

        >>> stmr = Porter(early_english=True)
        >>> stmr.stem('eateth')
        'eat'


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # lowercase, normalize, and compose
        word = normalize('NFC', word.lower())

        # Words of length 1 or 2 are returned without stemming
        if len(word) < 3:
            return word

        # Re-map consonantal y to Y (Y will be C, y will be V). The scan is
        # left to right, so a y that follows an already re-mapped Y stays a
        # vowel, because 'Y' is not in the vowel set.
        letters = list(word)
        if letters[0] == 'y':
            letters[0] = 'Y'
        for i in range(1, len(letters)):
            if letters[i] == 'y' and letters[i - 1] in self._vowels:
                letters[i] = 'Y'
        word = ''.join(letters)

        # Step 1a
        if word.endswith('s'):
            if word.endswith(('sses', 'ies')):
                word = word[:-2]
            elif not word.endswith('ss'):
                word = word[:-1]

        # Step 1b
        step1b_flag = False
        if word.endswith('eed'):
            if self._m_degree(word[:-3]) > 0:
                word = word[:-1]
        elif word.endswith('ed'):
            if self._has_vowel(word[:-2]):
                word = word[:-2]
                step1b_flag = True
        elif word.endswith('ing'):
            if self._has_vowel(word[:-3]):
                word = word[:-3]
                step1b_flag = True
        elif self._early_english:
            if word.endswith(('est', 'eth')):
                if self._has_vowel(word[:-3]):
                    word = word[:-3]
                    step1b_flag = True

        # Clean-up that only runs when Step 1b removed -ed/-ing/-est/-eth
        if step1b_flag:
            if word.endswith(('at', 'bl', 'iz')):
                word += 'e'
            elif self._ends_in_doubled_cons(word) and word[-1] not in {
                'l',
                's',
                'z',
            }:
                word = word[:-1]
            elif self._m_degree(word) == 1 and self._ends_in_cvc(word):
                word += 'e'

        # Step 1c
        if word.endswith(('Y', 'y')) and self._has_vowel(word[:-1]):
            word = word[:-1] + 'i'

        # Step 2
        word = self._replace_suffix(word, self._step2_rules)

        # Step 3
        word = self._replace_suffix(word, self._step3_rules)

        # Step 4: the first matching suffix is removed only when the
        # remaining stem has an m-degree greater than 1
        for suffix, cut in self._step4_rules:
            if word.endswith(suffix):
                if self._m_degree(word[:-cut]) > 1:
                    word = word[:-cut]
                break

        # Step 5a
        if word.endswith('e'):
            stem = word[:-1]
            degree = self._m_degree(stem)
            if degree > 1 or (degree == 1 and not self._ends_in_cvc(stem)):
                word = stem

        # Step 5b
        if word.endswith('ll') and self._m_degree(word) > 1:
            word = word[:-1]

        # Change 'Y' back to 'y' if it survived stemming
        return word.replace('Y', 'y')
400 1
401
402
if __name__ == '__main__':
    # Run the doctest examples embedded in this module's docstrings when
    # the file is executed directly as a script.
    from doctest import testmod

    testmod()
406