abydos.stemmer._porter2 - Code Metrics - Inspection of "Merge pull request #149 from chrislit/0.3.6" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( f43547...71985b )

by Chris

created 2018-11-17 08:52 UTC

abydos.stemmer._porter2 F

↳ Parent: Project

Complexity

Total Complexity

128

Size/Duplication

Total Lines	418
Duplicated Lines	0 %

Test Coverage

Coverage

100%

Importance

Changes

Metric	Value
eloc	267
dl	0
loc	418
ccs	203
cts	203
cp	1
rs	2
c	0
b	0
f	0
wmc	128

1 Method

Rating	Name	Duplication	Size	Complexity
F	Porter2.stem()	0	291	127

1 Function

Rating	Name	Duplication	Size	Complexity
A	porter2()	0	32	1

How to fix Complexity

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.stemmer._porter2.

Porter2 (Snowball English) stemmer
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from unicodedata import normalize

from six import text_type
from six.moves import range

from ._snowball import _Snowball

__all__ = ['Porter2', 'porter2']


class Porter2(_Snowball):
    """Porter2 (Snowball English) stemmer.

    The Porter2 (Snowball English) stemmer is defined in :cite:`Porter:2002`.
    """

    # Doubled consonant endings that Step 1b shortens by one letter
    _doubles = {'bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt'}
    # Letters that must immediately precede -li for Step 2 to delete it
    _li = {'c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't'}

    # R1 prefixes should be in order from longest to shortest to prevent
    # masking
    _r1_prefixes = ('commun', 'gener', 'arsen')
    # Words whose stem is looked up directly instead of being computed
    _exception1dict = {  # special changes:
        'skis': 'ski',
        'skies': 'sky',
        'dying': 'die',
        'lying': 'lie',
        'tying': 'tie',
        # special -LY cases:
        'idly': 'idl',
        'gently': 'gentl',
        'ugly': 'ugli',
        'early': 'earli',
        'only': 'onli',
        'singly': 'singl',
    }
    # Words returned unchanged before any stemming step runs
    _exception1set = {
        'sky',
        'news',
        'howe',
        'atlas',
        'cosmos',
        'bias',
        'andes',
    }
    # Words returned unchanged if they are what remains after Step 1a
    _exception2set = {
        'inning',
        'outing',
        'canning',
        'herring',
        'earring',
        'proceed',
        'exceed',
        'succeed',
    }

    def stem(self, word, early_english=False):
        """Return the Porter2 (Snowball English) stem.

        Parameters
        ----------
        word : str
            The word to stem
        early_english : bool
            Set to True in order to remove -est & -eth (2nd & 3rd person
            singular verbal agreement suffixes)

        Returns
        -------
        str
            Word stem

        Examples
        --------
        >>> stmr = Porter2()
        >>> stmr.stem('reading')
        'read'
        >>> stmr.stem('suspension')
        'suspens'
        >>> stmr.stem('elusiveness')
        'elus'

        >>> stmr.stem('eateth', early_english=True)
        'eat'

        """
        # lowercase, normalize, and compose
        word = normalize('NFC', text_type(word.lower()))
        # replace apostrophe-like characters (U+2019, U+2018, U+201B) with
        # U+0027, per http://snowball.tartarus.org/texts/apostrophe.html
        for apostrophe in ('\u2019', '\u2018', '\u201b'):
            word = word.replace(apostrophe, '\'')

        # Exceptions 1
        if word in self._exception1dict:
            return self._exception1dict[word]
        elif word in self._exception1set:
            return word

        # Return word if it is shorter than 3
        if len(word) < 3:
            return word

        # Remove initial ', if present.
        while word and word[0] == '\'':
            word = word[1:]
            # Return word if stem is shorter than 2
            if len(word) < 2:
                return word

        # Mark consonantal y (word-initial or directly after a vowel) as 'Y'
        # (Y will be C, y will be V)
        if word[0] == 'y':
            word = 'Y' + word[1:]
        for i in range(1, len(word)):
            if word[i] == 'y' and word[i - 1] in self._vowels:
                word = word[:i] + 'Y' + word[i + 1 :]

        # Region offsets are computed once; the steps below only trim or
        # rewrite the end of the word, and "suffix is in R1/R2" is tested as
        # len(word[rN_start:]) >= len(suffix).
        r1_start = self._sb_r1(word, self._r1_prefixes)
        r2_start = self._sb_r2(word, self._r1_prefixes)

        # Step 0
        if word[-3:] == '\'s\'':
            word = word[:-3]
        elif word[-2:] == '\'s':
            word = word[:-2]
        elif word[-1:] == '\'':
            word = word[:-1]
        # Return word if stem is shorter than 3
        if len(word) < 3:
            return word

        # Step 1a
        if word[-4:] == 'sses':
            word = word[:-2]
        elif word[-3:] in {'ied', 'ies'}:
            # -> 'i' when more than one letter precedes the suffix,
            # otherwise -> 'ie'
            if len(word) > 4:
                word = word[:-2]
            else:
                word = word[:-1]
        elif word[-2:] in {'us', 'ss'}:
            pass
        elif word[-1] == 's':
            # the vowel must not be the letter immediately before the s
            if self._sb_has_vowel(word[:-2]):
                word = word[:-1]

        # Exceptions 2
        if word in self._exception2set:
            return word

        # Step 1b
        step1b_flag = False  # True when a suffix was deleted outright
        if word[-5:] == 'eedly':
            if len(word[r1_start:]) >= 5:
                word = word[:-3]
        elif word[-5:] == 'ingly':
            if self._sb_has_vowel(word[:-5]):
                word = word[:-5]
                step1b_flag = True
        elif word[-4:] == 'edly':
            if self._sb_has_vowel(word[:-4]):
                word = word[:-4]
                step1b_flag = True
        elif word[-3:] == 'eed':
            if len(word[r1_start:]) >= 3:
                word = word[:-1]
        elif word[-3:] == 'ing':
            if self._sb_has_vowel(word[:-3]):
                word = word[:-3]
                step1b_flag = True
        elif word[-2:] == 'ed':
            if self._sb_has_vowel(word[:-2]):
                word = word[:-2]
                step1b_flag = True
        elif early_english:
            if word[-3:] == 'est':
                if self._sb_has_vowel(word[:-3]):
                    word = word[:-3]
                    step1b_flag = True
            elif word[-3:] == 'eth':
                if self._sb_has_vowel(word[:-3]):
                    word = word[:-3]
                    step1b_flag = True

        # Tidy the ending left behind by a Step 1b deletion
        if step1b_flag:
            if word[-2:] in {'at', 'bl', 'iz'}:
                word += 'e'
            elif word[-2:] in self._doubles:
                word = word[:-1]
            elif self._sb_short_word(word, self._r1_prefixes):
                word += 'e'

        # Step 1c
        if (
            len(word) > 2
            and word[-1] in {'Y', 'y'}
            and word[-2] not in self._vowels
        ):
            word = word[:-1] + 'i'

        # Step 2
        # A slice (not word[-2]) so that a one-character stem, which Step 1b
        # can leave if _sb_has_vowel accepts a single-letter remainder, falls
        # through this step instead of raising IndexError.
        penult = word[-2:-1]
        if penult == 'a':
            if word[-7:] == 'ational':
                if len(word[r1_start:]) >= 7:
                    word = word[:-5] + 'e'
            elif word[-6:] == 'tional':
                if len(word[r1_start:]) >= 6:
                    word = word[:-2]
        elif penult == 'c':
            if word[-4:] in {'enci', 'anci'}:
                if len(word[r1_start:]) >= 4:
                    word = word[:-1] + 'e'
        elif penult == 'e':
            if word[-4:] == 'izer':
                if len(word[r1_start:]) >= 4:
                    word = word[:-1]
        elif penult == 'g':
            if word[-3:] == 'ogi':
                if (
                    r1_start >= 1
                    and len(word[r1_start:]) >= 3
                    and word[-4] == 'l'
                ):
                    word = word[:-1]
        elif penult == 'l':
            if word[-6:] == 'lessli':
                if len(word[r1_start:]) >= 6:
                    word = word[:-2]
            elif word[-5:] in {'entli', 'fulli', 'ousli'}:
                if len(word[r1_start:]) >= 5:
                    word = word[:-2]
            elif word[-4:] == 'abli':
                if len(word[r1_start:]) >= 4:
                    word = word[:-1] + 'e'
            elif word[-4:] == 'alli':
                if len(word[r1_start:]) >= 4:
                    word = word[:-2]
            elif word[-3:] == 'bli':
                if len(word[r1_start:]) >= 3:
                    word = word[:-1] + 'e'
            elif word[-2:] == 'li':
                if (
                    r1_start >= 1
                    and len(word[r1_start:]) >= 2
                    and word[-3] in self._li
                ):
                    word = word[:-2]
        elif penult == 'o':
            if word[-7:] == 'ization':
                if len(word[r1_start:]) >= 7:
                    word = word[:-5] + 'e'
            elif word[-5:] == 'ation':
                if len(word[r1_start:]) >= 5:
                    word = word[:-3] + 'e'
            elif word[-4:] == 'ator':
                if len(word[r1_start:]) >= 4:
                    word = word[:-2] + 'e'
        elif penult == 's':
            if word[-7:] in {'fulness', 'ousness', 'iveness'}:
                if len(word[r1_start:]) >= 7:
                    word = word[:-4]
            elif word[-5:] == 'alism':
                if len(word[r1_start:]) >= 5:
                    word = word[:-3]
        elif penult == 't':
            if word[-6:] == 'biliti':
                if len(word[r1_start:]) >= 6:
                    word = word[:-5] + 'le'
            elif word[-5:] == 'aliti':
                if len(word[r1_start:]) >= 5:
                    word = word[:-3]
            elif word[-5:] == 'iviti':
                if len(word[r1_start:]) >= 5:
                    word = word[:-3] + 'e'

        # Step 3
        if word[-7:] == 'ational':
            if len(word[r1_start:]) >= 7:
                word = word[:-5] + 'e'
        elif word[-6:] == 'tional':
            if len(word[r1_start:]) >= 6:
                word = word[:-2]
        elif word[-5:] in {'alize', 'icate', 'iciti'}:
            if len(word[r1_start:]) >= 5:
                word = word[:-3]
        elif word[-5:] == 'ative':
            # the only Step 3 suffix tested against R2 rather than R1
            if len(word[r2_start:]) >= 5:
                word = word[:-5]
        elif word[-4:] == 'ical':
            if len(word[r1_start:]) >= 4:
                word = word[:-2]
        elif word[-4:] == 'ness':
            if len(word[r1_start:]) >= 4:
                word = word[:-4]
        elif word[-3:] == 'ful':
            if len(word[r1_start:]) >= 3:
                word = word[:-3]

        # Step 4
        # Suffixes are listed longest first; the first one that matches ends
        # the search whether or not it lies in R2 and gets deleted.
        for suffix in (
            'ement',
            'ance',
            'ence',
            'able',
            'ible',
            'ment',
            'ant',
            'ent',
            'ism',
            'ate',
            'iti',
            'ous',
            'ive',
            'ize',
            'al',
            'er',
            'ic',
        ):
            if word[-len(suffix) :] == suffix:
                if len(word[r2_start:]) >= len(suffix):
                    word = word[: -len(suffix)]
                break
        else:
            # Reached only when none of the suffixes above matched
            if word[-3:] == 'ion':
                if (
                    len(word[r2_start:]) >= 3
                    and len(word) >= 4
                    and word[-4] in {'s', 't'}
                ):
                    word = word[:-3]

        # Step 5
        if word[-1] == 'e':
            if len(word[r2_start:]) >= 1 or (
                len(word[r1_start:]) >= 1
                and not self._sb_ends_in_short_syllable(word[:-1])
            ):
                word = word[:-1]
        elif word[-1] == 'l':
            if len(word[r2_start:]) >= 1 and word[-2] == 'l':
                word = word[:-1]

        # Change 'Y' back to 'y' if it survived stemming
        return word.replace('Y', 'y')


def porter2(word, early_english=False):
    """Stem a word with the Porter2 (Snowball English) algorithm.

    Convenience wrapper: builds a :py:class:`Porter2` instance and returns
    the result of its :py:meth:`Porter2.stem` method.

    Parameters
    ----------
    word : str
        The word to stem
    early_english : bool
        Set to True in order to remove -est & -eth (2nd & 3rd person singular
        verbal agreement suffixes)

    Returns
    -------
    str
        Word stem

    Examples
    --------
    >>> porter2('reading')
    'read'
    >>> porter2('suspension')
    'suspens'
    >>> porter2('elusiveness')
    'elus'

    >>> porter2('eateth', early_english=True)
    'eat'

    """
    stemmer = Porter2()
    return stemmer.stem(word, early_english)


if __name__ == '__main__':
    # Executed as a script: run the doctest examples in this module.
    from doctest import testmod

    testmod()


1		# -*- coding: utf-8 -*-
2
3		# Copyright 2014-2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.stemmer._porter2.
20
21		Porter2 (Snowball English) stemmer
22		"""
23
24	1	from __future__ import (
25		absolute_import,
26		division,
27		print_function,
28		unicode_literals,
29		)
30
31	1	from unicodedata import normalize
32
33	1	from six import text_type
34	1	from six.moves import range
35
36	1	from ._snowball import _Snowball
37
38	1	__all__ = ['Porter2', 'porter2']
39
40
41	1	class Porter2(_Snowball):
		0 ignored issues – show Unused Code introduced 2018-11-10 01:42 UTC by Report Bug Copy Issue Report The variable `__class__` seems to be unused. Loading history...
42		"""Porter2 (Snowball English) stemmer.
43
44		The Porter2 (Snowball English) stemmer is defined in :cite:`Porter:2002`.
45		"""
46
47	1	_doubles = {'bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt'}
48	1	_li = {'c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't'}
49
50		# R1 prefixes should be in order from longest to shortest to prevent
51		# masking
52	1	_r1_prefixes = ('commun', 'gener', 'arsen')
53	1	_exception1dict = { # special changes:
54		'skis': 'ski',
55		'skies': 'sky',
56		'dying': 'die',
57		'lying': 'lie',
58		'tying': 'tie',
59		# special -LY cases:
60		'idly': 'idl',
61		'gently': 'gentl',
62		'ugly': 'ugli',
63		'early': 'earli',
64		'only': 'onli',
65		'singly': 'singl',
66		}
67	1	_exception1set = {
68		'sky',
69		'news',
70		'howe',
71		'atlas',
72		'cosmos',
73		'bias',
74		'andes',
75		}
76	1	_exception2set = {
77		'inning',
78		'outing',
79		'canning',
80		'herring',
81		'earring',
82		'proceed',
83		'exceed',
84		'succeed',
85		}
86
87	1	def stem(self, word, early_english=False):
		0 ignored issues – show Bug introduced 2018-11-05 04:21 UTC by Report Bug Copy Issue Report Parameters differ from overridden 'stem' method Loading history... best-practice introduced 2018-11-05 04:21 UTC by Report Bug Copy Issue Report Too many return statements (7/6) Loading history...
88		"""Return the Porter2 (Snowball English) stem.
89
90		Parameters
91		----------
92		word : str
93		The word to stem
94		early_english : bool
95		Set to True in order to remove -eth & -est (2nd & 3rd person
96		singular verbal agreement suffixes)
97
98		Returns
99		-------
100		str
101		Word stem
102
103		Examples
104		--------
105		>>> stmr = Porter2()
106		>>> stmr.stem('reading')
107		'read'
108		>>> stmr.stem('suspension')
109		'suspens'
110		>>> stmr.stem('elusiveness')
111		'elus'
112
113		>>> stmr.stem('eateth', early_english=True)
114		'eat'
115
116		"""
117		# lowercase, normalize, and compose
118	1	word = normalize('NFC', text_type(word.lower()))
119		# replace apostrophe-like characters with U+0027, per
120		# http://snowball.tartarus.org/texts/apostrophe.html
121	1	word = word.replace('’', '\'')
122	1	word = word.replace('’', '\'')
123
124		# Exceptions 1
125	1	if word in self._exception1dict:
126	1	return self._exception1dict[word]
127	1	elif word in self._exception1set:
128	1	return word
129
130		# Return word if stem is shorter than 3
131	1	if len(word) < 3:
132	1	return word
133
134		# Remove initial ', if present.
135	1	while word and word[0] == '\'':
136	1	word = word[1:]
137		# Return word if stem is shorter than 2
138	1	if len(word) < 2:
139	1	return word
140
141		# Re-map vocalic Y to y (Y will be C, y will be V)
142	1	if word[0] == 'y':
143	1	word = 'Y' + word[1:]
144	1	for i in range(1, len(word)):
145	1	if word[i] == 'y' and word[i - 1] in self._vowels:
146	1	word = word[:i] + 'Y' + word[i + 1 :]
147
148	1	r1_start = self._sb_r1(word, self._r1_prefixes)
149	1	r2_start = self._sb_r2(word, self._r1_prefixes)
150
151		# Step 0
152	1	if word[-3:] == '\'s\'':
153	1	word = word[:-3]
154	1	elif word[-2:] == '\'s':
155	1	word = word[:-2]
156	1	elif word[-1:] == '\'':
157	1	word = word[:-1]
158		# Return word if stem is shorter than 2
159	1	if len(word) < 3:
160	1	return word
161
162		# Step 1a
163	1	if word[-4:] == 'sses':
164	1	word = word[:-2]
165	1	elif word[-3:] in {'ied', 'ies'}:
166	1	if len(word) > 4:
167	1	word = word[:-2]
168		else:
169	1	word = word[:-1]
170	1	elif word[-2:] in {'us', 'ss'}:
171	1	pass
172	1	elif word[-1] == 's':
173	1	if self._sb_has_vowel(word[:-2]):
174	1	word = word[:-1]
175
176		# Exceptions 2
177	1	if word in self._exception2set:
178	1	return word
179
180		# Step 1b
181	1	step1b_flag = False
182	1	if word[-5:] == 'eedly':
183	1	if len(word[r1_start:]) >= 5:
184	1	word = word[:-3]
185	1	elif word[-5:] == 'ingly':
186	1	if self._sb_has_vowel(word[:-5]):
187	1	word = word[:-5]
188	1	step1b_flag = True
189	1	elif word[-4:] == 'edly':
190	1	if self._sb_has_vowel(word[:-4]):
191	1	word = word[:-4]
192	1	step1b_flag = True
193	1	elif word[-3:] == 'eed':
194	1	if len(word[r1_start:]) >= 3:
195	1	word = word[:-1]
196	1	elif word[-3:] == 'ing':
197	1	if self._sb_has_vowel(word[:-3]):
198	1	word = word[:-3]
199	1	step1b_flag = True
200	1	elif word[-2:] == 'ed':
201	1	if self._sb_has_vowel(word[:-2]):
202	1	word = word[:-2]
203	1	step1b_flag = True
204	1	elif early_english:
205	1	if word[-3:] == 'est':
206	1	if self._sb_has_vowel(word[:-3]):
207	1	word = word[:-3]
208	1	step1b_flag = True
209	1	elif word[-3:] == 'eth':
210	1	if self._sb_has_vowel(word[:-3]):
211	1	word = word[:-3]
212	1	step1b_flag = True
213
214	1	if step1b_flag:
215	1	if word[-2:] in {'at', 'bl', 'iz'}:
216	1	word += 'e'
217	1	elif word[-2:] in self._doubles:
218	1	word = word[:-1]
219	1	elif self._sb_short_word(word, self._r1_prefixes):
220	1	word += 'e'
221
222		# Step 1c
223	1	if (
224		len(word) > 2
		0 ignored issues – show Coding Style introduced 2018-11-05 04:21 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
225		and word[-1] in {'Y', 'y'}
		0 ignored issues – show Coding Style introduced 2018-11-05 04:21 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
226		and word[-2] not in self._vowels
		0 ignored issues – show Coding Style introduced 2018-11-05 04:21 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
227		):
228	1	word = word[:-1] + 'i'
229
230		# Step 2
231	1	if word[-2] == 'a':
232	1	if word[-7:] == 'ational':
233	1	if len(word[r1_start:]) >= 7:
234	1	word = word[:-5] + 'e'
235	1	elif word[-6:] == 'tional':
236	1	if len(word[r1_start:]) >= 6:
237	1	word = word[:-2]
238	1	elif word[-2] == 'c':
239	1	if word[-4:] in {'enci', 'anci'}:
240	1	if len(word[r1_start:]) >= 4:
241	1	word = word[:-1] + 'e'
242	1	elif word[-2] == 'e':
243	1	if word[-4:] == 'izer':
244	1	if len(word[r1_start:]) >= 4:
245	1	word = word[:-1]
246	1	elif word[-2] == 'g':
247	1	if word[-3:] == 'ogi':
248	1	if (
249		r1_start >= 1
		0 ignored issues – show Coding Style introduced 2018-11-05 04:21 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
250		and len(word[r1_start:]) >= 3
		0 ignored issues – show Coding Style introduced 2018-11-05 04:21 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
251		and word[-4] == 'l'
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
252		):
253	1	word = word[:-1]
254	1	elif word[-2] == 'l':
255	1	if word[-6:] == 'lessli':
256	1	if len(word[r1_start:]) >= 6:
257	1	word = word[:-2]
258	1	elif word[-5:] in {'entli', 'fulli', 'ousli'}:
259	1	if len(word[r1_start:]) >= 5:
260	1	word = word[:-2]
261	1	elif word[-4:] == 'abli':
262	1	if len(word[r1_start:]) >= 4:
263	1	word = word[:-1] + 'e'
264	1	elif word[-4:] == 'alli':
265	1	if len(word[r1_start:]) >= 4:
266	1	word = word[:-2]
267	1	elif word[-3:] == 'bli':
268	1	if len(word[r1_start:]) >= 3:
269	1	word = word[:-1] + 'e'
270	1	elif word[-2:] == 'li':
271	1	if (
272		r1_start >= 1
		0 ignored issues – show Coding Style introduced 2018-11-05 04:21 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
273		and len(word[r1_start:]) >= 2
		0 ignored issues – show Coding Style introduced 2018-11-05 04:21 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
274		and word[-3] in self._li
		0 ignored issues – show Coding Style introduced 2018-11-05 04:21 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
275		):
276	1	word = word[:-2]
277	1	elif word[-2] == 'o':
278	1	if word[-7:] == 'ization':
279	1	if len(word[r1_start:]) >= 7:
280	1	word = word[:-5] + 'e'
281	1	elif word[-5:] == 'ation':
282	1	if len(word[r1_start:]) >= 5:
283	1	word = word[:-3] + 'e'
284	1	elif word[-4:] == 'ator':
285	1	if len(word[r1_start:]) >= 4:
286	1	word = word[:-2] + 'e'
287	1	elif word[-2] == 's':
288	1	if word[-7:] in {'fulness', 'ousness', 'iveness'}:
289	1	if len(word[r1_start:]) >= 7:
290	1	word = word[:-4]
291	1	elif word[-5:] == 'alism':
292	1	if len(word[r1_start:]) >= 5:
293	1	word = word[:-3]
294	1	elif word[-2] == 't':
295	1	if word[-6:] == 'biliti':
296	1	if len(word[r1_start:]) >= 6:
297	1	word = word[:-5] + 'le'
298	1	elif word[-5:] == 'aliti':
299	1	if len(word[r1_start:]) >= 5:
300	1	word = word[:-3]
301	1	elif word[-5:] == 'iviti':
302	1	if len(word[r1_start:]) >= 5:
303	1	word = word[:-3] + 'e'
304
305		# Step 3
306	1	if word[-7:] == 'ational':
307	1	if len(word[r1_start:]) >= 7:
308	1	word = word[:-5] + 'e'
309	1	elif word[-6:] == 'tional':
310	1	if len(word[r1_start:]) >= 6:
311	1	word = word[:-2]
312	1	elif word[-5:] in {'alize', 'icate', 'iciti'}:
313	1	if len(word[r1_start:]) >= 5:
314	1	word = word[:-3]
315	1	elif word[-5:] == 'ative':
316	1	if len(word[r2_start:]) >= 5:
317	1	word = word[:-5]
318	1	elif word[-4:] == 'ical':
319	1	if len(word[r1_start:]) >= 4:
320	1	word = word[:-2]
321	1	elif word[-4:] == 'ness':
322	1	if len(word[r1_start:]) >= 4:
323	1	word = word[:-4]
324	1	elif word[-3:] == 'ful':
325	1	if len(word[r1_start:]) >= 3:
326	1	word = word[:-3]
327
328		# Step 4
329	1	for suffix in (
330		'ement',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
331		'ance',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
332		'ence',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
333		'able',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
334		'ible',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
335		'ment',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
336		'ant',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
337		'ent',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
338		'ism',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
339		'ate',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
340		'iti',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
341		'ous',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
342		'ive',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
343		'ize',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
344		'al',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
345		'er',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
346		'ic',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
347		):
348	1	if word[-len(suffix) :] == suffix:
349	1	if len(word[r2_start:]) >= len(suffix):
350	1	word = word[: -len(suffix)]
351	1	break
352		else:
353	1	if word[-3:] == 'ion':
354	1	if (
355		len(word[r2_start:]) >= 3
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
356		and len(word) >= 4
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
357		and word[-4] in tuple('st')
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
358		):
359	1	word = word[:-3]
360
361		# Step 5
362	1	if word[-1] == 'e':
363	1	if len(word[r2_start:]) >= 1 or (
364		len(word[r1_start:]) >= 1
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
365		and not self._sb_ends_in_short_syllable(word[:-1])
		0 ignored issues – show Coding Style introduced 2018-11-05 04:21 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
366		):
367	1	word = word[:-1]
368	1	elif word[-1] == 'l':
369	1	if len(word[r2_start:]) >= 1 and word[-2] == 'l':
370	1	word = word[:-1]
371
372		# Change 'Y' back to 'y' if it survived stemming
373	1	for i in range(0, len(word)):
374	1	if word[i] == 'Y':
375	1	word = word[:i] + 'y' + word[i + 1 :]
376
377	1	return word
378
379
380	1	def porter2(word, early_english=False):
381		"""Return the Porter2 (Snowball English) stem.
382
383		This is a wrapper for :py:meth:`Porter2.stem`.
384
385		Parameters
386		----------
387		word : str
388		The word to stem
389		early_english : bool
390		Set to True in order to remove -eth & -est (2nd & 3rd person singular
391		verbal agreement suffixes)
392
393		Returns
394		-------
395		str
396		Word stem
397
398		Examples
399		--------
400		>>> porter2('reading')
401		'read'
402		>>> porter2('suspension')
403		'suspens'
404		>>> porter2('elusiveness')
405		'elus'
406
407		>>> porter2('eateth', early_english=True)
408		'eat'
409
410		"""
411	1	return Porter2().stem(word, early_english)
412
413
414		if __name__ == '__main__':
415		import doctest
416
417		doctest.testmod()
418

chrislit / abydos

Push — master ( f43547...71985b )

abydos.stemmer._porter2 F

Complexity

Size/Duplication

Test Coverage

Importance

1 Method

1 Function

How to fix Complexity

Complexity

Duplication Side-by-Side

Filter issues like