Completed
Pull Request — master (#141)
by Chris
11:04
created

abydos.stemmer._porter.porter()   A

Complexity

Conditions 1

Size

Total Lines 26
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 2
nop 2
dl 0
loc 26
ccs 2
cts 2
cp 1
crap 1
rs 10
c 0
b 0
f 0
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.stemmer._porter.
20
21
Porter stemmer
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from unicodedata import normalize
32
33 1
from six import text_type
34 1
from six.moves import range
35
36 1
from ._stemmer import _Stemmer
37
38 1
__all__ = ['Porter', 'porter']
39
40
41 1
class Porter(_Stemmer):
    """Porter stemmer.

    The Porter stemmer is described in :cite:`Porter:1980`.

    During stemming, a lowercase ``y`` is treated as a vowel and an uppercase
    ``Y`` marks a consonantal y; any surviving ``Y`` is lowercased again
    before the stem is returned.
    """

    _vowels = {'a', 'e', 'i', 'o', 'u', 'y'}

    # Suffix rules for Steps 2-4, as (ending, cut, replacement) triples:
    # when the word ends in ``ending``, its last ``cut`` characters are
    # dropped and ``replacement`` is appended, provided the remaining stem's
    # m-degree exceeds the step's threshold.
    #
    # Order matters: only the FIRST ending that matches is considered, even
    # if its m-degree condition then fails, so a longer ending must precede
    # any shorter ending it ends in (e.g. 'ational' before 'tional').
    _step2_rules = (
        ('ational', 7, 'ate'),
        ('tional', 6, 'tion'),
        ('enci', 4, 'ence'),
        ('anci', 4, 'ance'),
        ('izer', 4, 'ize'),
        ('logi', 4, 'log'),
        ('bli', 3, 'ble'),
        ('alli', 4, 'al'),
        ('entli', 5, 'ent'),
        ('eli', 3, 'e'),
        ('ousli', 5, 'ous'),
        ('ization', 7, 'ize'),
        ('ation', 5, 'ate'),
        ('ator', 4, 'ate'),
        ('alism', 5, 'al'),
        ('iveness', 7, 'ive'),
        ('fulness', 7, 'ful'),
        ('ousness', 7, 'ous'),
        ('aliti', 5, 'al'),
        ('iviti', 5, 'ive'),
        ('biliti', 6, 'ble'),
    )

    _step3_rules = (
        ('icate', 5, 'ic'),
        ('ative', 5, ''),
        ('alize', 5, 'al'),
        ('iciti', 5, 'ic'),
        ('ical', 4, 'ic'),
        ('ful', 3, ''),
        ('ness', 4, ''),
    )

    _step4_rules = (
        ('al', 2, ''),
        ('ance', 4, ''),
        ('ence', 4, ''),
        ('er', 2, ''),
        ('ic', 2, ''),
        ('able', 4, ''),
        ('ible', 4, ''),
        ('ant', 3, ''),
        ('ement', 5, ''),
        ('ment', 4, ''),
        ('ent', 3, ''),
        # -ion is only removed after s or t; that letter stays in the stem
        # and counts towards its m-degree, hence cut=3 for a 4-letter ending.
        ('sion', 3, ''),
        ('tion', 3, ''),
        ('ou', 2, ''),
        ('ism', 3, ''),
        ('ate', 3, ''),
        ('iti', 3, ''),
        ('ous', 3, ''),
        ('ive', 3, ''),
        ('ize', 3, ''),
    )

    def _m_degree(self, term):
        """Return Porter helper function _m_degree value.

        m-degree is equal to the number of V to C transitions

        Args:
            term (str): The word for which to calculate the m-degree

        Returns:
            int: The m-degree as defined in the Porter stemmer definition

        """
        mdeg = 0
        last_was_vowel = False
        for letter in term:
            if letter in self._vowels:
                last_was_vowel = True
            else:
                if last_was_vowel:
                    mdeg += 1
                last_was_vowel = False
        return mdeg

    def _has_vowel(self, term):
        """Return Porter helper function _has_vowel value.

        Args:
            term (str): The word to scan for vowels

        Returns:
            bool: True iff a vowel exists in the term (as defined in the Porter
                stemmer definition)

        """
        return any(letter in self._vowels for letter in term)

    def _ends_in_doubled_cons(self, term):
        """Return Porter helper function _ends_in_doubled_cons value.

        Args:
            term (str): The word to check for a final doubled consonant

        Returns:
            bool: True iff the stem ends in a doubled consonant (as defined in
                the Porter stemmer definition)

        """
        return (
            len(term) > 1
            and term[-1] not in self._vowels
            and term[-2] == term[-1]
        )

    def _ends_in_cvc(self, term):
        """Return Porter helper function _ends_in_cvc value.

        Args:
            term (str): The word to scan for cvc

        Returns:
            bool: True iff the stem ends in cvc (as defined in the Porter
                stemmer definition), where the final consonant is not w, x,
                or consonantal Y

        """
        return len(term) > 2 and (
            term[-1] not in self._vowels
            and term[-2] in self._vowels
            and term[-3] not in self._vowels
            and term[-1] not in {'w', 'x', 'Y'}
        )

    def _apply_rules(self, word, rules, min_m):
        """Apply the first matching suffix rule of one Porter step.

        Args:
            word (str): The word in its current state of stemming
            rules (tuple): (ending, cut, replacement) triples, in priority
                order
            min_m (int): The stem's m-degree must be strictly greater than
                this value for the rule to fire

        Returns:
            str: The rewritten word, or the word unchanged if no ending
                matches or the matching rule's m-degree condition fails

        """
        for ending, cut, replacement in rules:
            if word.endswith(ending):
                stem = word[:-cut]
                if self._m_degree(stem) > min_m:
                    return stem + replacement
                # A matched ending blocks all later rules of the step.
                return word
        return word

    def stem(self, word, early_english=False):
        """Return Porter stem.

        Args:
            word (str): The word to stem
            early_english (bool): Set to True in order to remove -eth & -est
                (2nd & 3rd person singular verbal agreement suffixes)

        Returns:
            str: Word stem

        Examples:
            >>> stmr = Porter()
            >>> stmr.stem('reading')
            'read'
            >>> stmr.stem('suspension')
            'suspens'
            >>> stmr.stem('elusiveness')
            'elus'

            >>> stmr.stem('eateth', early_english=True)
            'eat'

        """
        # lowercase, normalize, and compose
        word = normalize('NFC', text_type(word.lower()))

        # Words of fewer than 3 letters are returned as they are
        if len(word) < 3:
            return word

        # Re-map consonantal y to Y (Y will be C, y will be V): a y is
        # consonantal when it starts the word or follows a vowel. The check
        # reads already re-mapped letters, so a Y never counts as a vowel.
        letters = list(word)
        if letters[0] == 'y':
            letters[0] = 'Y'
        for i in range(1, len(letters)):
            if letters[i] == 'y' and letters[i - 1] in self._vowels:
                letters[i] = 'Y'
        word = ''.join(letters)

        # Step 1a
        if word[-1] == 's':
            if word[-4:] == 'sses' or word[-3:] == 'ies':
                word = word[:-2]
            elif word[-2:] != 'ss':
                word = word[:-1]

        # Step 1b
        step1b_flag = False
        if word[-3:] == 'eed':
            if self._m_degree(word[:-3]) > 0:
                word = word[:-1]
        else:
            if word[-2:] == 'ed':
                cut = 2
            elif word[-3:] == 'ing':
                cut = 3
            elif early_english and word[-3:] in {'est', 'eth'}:
                cut = 3
            else:
                cut = 0
            if cut and self._has_vowel(word[:-cut]):
                word = word[:-cut]
                step1b_flag = True

        # Step 1b clean-up, only after -ed/-ing (or -est/-eth) was removed
        if step1b_flag:
            if word[-2:] in {'at', 'bl', 'iz'}:
                word += 'e'
            elif self._ends_in_doubled_cons(word) and word[-1] not in {
                'l',
                's',
                'z',
            }:
                word = word[:-1]
            elif self._m_degree(word) == 1 and self._ends_in_cvc(word):
                word += 'e'

        # Step 1c
        if word[-1] in {'Y', 'y'} and self._has_vowel(word[:-1]):
            word = word[:-1] + 'i'

        # Step 2
        word = self._apply_rules(word, self._step2_rules, 0)

        # Step 3 (endings are matched exactly, never as substrings)
        word = self._apply_rules(word, self._step3_rules, 0)

        # Step 4
        word = self._apply_rules(word, self._step4_rules, 1)

        # Step 5a
        if word[-1] == 'e':
            stem_m = self._m_degree(word[:-1])
            if stem_m > 1 or (
                stem_m == 1 and not self._ends_in_cvc(word[:-1])
            ):
                word = word[:-1]

        # Step 5b
        if word[-2:] == 'll' and self._m_degree(word) > 1:
            word = word[:-1]

        # Change 'Y' back to 'y' if it survived stemming
        return word.replace('Y', 'y')
350
351
352 1
def porter(word, early_english=False):
    """Return Porter stem.

    This is a convenience wrapper that builds a :py:class:`Porter` instance
    and calls :py:meth:`Porter.stem` on it.

    Args:
        word (str): The word to stem
        early_english (bool): Set to True in order to remove -eth & -est
            (2nd & 3rd person singular verbal agreement suffixes)

    Returns:
        str: Word stem

    Examples:
        >>> porter('reading')
        'read'
        >>> porter('suspension')
        'suspens'
        >>> porter('elusiveness')
        'elus'

        >>> porter('eateth', early_english=True)
        'eat'

    """
    stemmer = Porter()
    return stemmer.stem(word, early_english=early_english)
378
379
380
if __name__ == '__main__':
    # Running the module directly executes the docstring examples above.
    from doctest import testmod

    testmod()
384