abydos.stemmer._snowball_german.SnowballGerman.stem() - Code Metrics - Inspection of "0.3.6" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#141)

by Chris

created 2018-11-10 03:32 UTC

SnowballGerman.stem() F

↳ Parent: abydos.stemmer._snowball_german

Complexity

Conditions

Size

Total Lines	136
Code Lines	97

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	88
CRAP Score	60

Importance

Changes

Metric	Value
cc	60
eloc	97
nop	3
dl	0
loc	136
ccs	88
cts	88
cp	1
crap	60
rs	0
c	0
b	0
f	0

How to fix Long Method Complexity

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.stemmer._snowball_german.

Snowball German stemmer
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from unicodedata import normalize

from six.moves import range

from ._snowball import _Snowball

# Names exported by ``from ... import *``: the stemmer class and its
# function-style wrapper, both defined below.
__all__ = ['SnowballGerman', 'sb_german']


class SnowballGerman(_Snowball):
    """Snowball German stemmer.

    The Snowball German stemmer is defined at:
    http://snowball.tartarus.org/algorithms/german/stemmer.html
    """

    _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'ä', 'ö', 'ü'}
    _s_endings = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 'r', 't'}
    _st_endings = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 't'}

    # Step 1 suffixes in the order they are tried (longest first). The flag
    # says whether deleting that suffix also turns a trailing 'niss' into
    # 'nis'.
    _step1_suffixes = (
        ('ern', False),
        ('em', False),
        ('er', False),
        ('en', True),
        ('es', True),
        ('e', True),
    )

    # Step 2 suffixes in the order they are tried; 'st' is handled apart
    # because it has extra preconditions.
    _step2_suffixes = ('est', 'en', 'er')

    # Upper-case markers used to protect 'u' and 'y' between vowels.
    _protected = {'u': 'U', 'y': 'Y'}

    # Applied once at the very end: undo the 'U'/'Y' markers and strip the
    # umlauts. Built once here instead of on every call to stem().
    _postlude = {
        ord('U'): 'u',
        ord('Y'): 'y',
        ord('ä'): 'a',
        ord('ö'): 'o',
        ord('ü'): 'u',
    }

    def stem(self, word, alternate_vowels=False):
        """Return Snowball German stem.

        The word is lowercased, NFC-normalized and has 'ß' expanded to 'ss';
        'u' and 'y' between vowels are protected; suffixes are then removed
        in three steps, and finally the protection markers and umlauts are
        removed.

        Args:
            word (str): The word to stem
            alternate_vowels (bool): composes ae as ä, oe as ö, and ue as ü
                (except in 'que') before running the suffix-removal steps

        Returns:
            str: Word stem

        Examples:
            >>> stmr = SnowballGerman()
            >>> stmr.stem('lesen')
            'les'
            >>> stmr.stem('graues')
            'grau'
            >>> stmr.stem('buchstabieren')
            'buchstabi'

        """
        # lowercase, normalize, and compose
        word = normalize('NFC', word.lower())
        word = word.replace('ß', 'ss')

        word = self._protect_intervocalic(word)
        if alternate_vowels:
            word = self._compose_alternate_vowels(word)

        # Region offsets are computed once, on the prepared word, and are
        # reused unchanged while the word shrinks. R1 never starts before
        # index 3.
        r1_start = max(3, self._sb_r1(word))
        r2_start = self._sb_r2(word)

        word = self._strip_step1(word, r1_start)
        word = self._strip_step2(word, r1_start)
        word = self._strip_step3(word, r1_start, r2_start)

        # Change surviving 'Y'/'U' back to lowercase and remove umlauts
        return word.translate(self._postlude)

    @staticmethod
    def _suffix_in_region(word, region_start, length):
        """Return True if the last `length` characters lie in the region."""
        return len(word[region_start:]) >= length

    def _protect_intervocalic(self, word):
        """Upper-case every 'u' or 'y' that stands between two vowels."""
        chars = list(word)
        # Strictly left to right: a freshly written 'U'/'Y' is not in
        # _vowels, so it must not count as the left neighbour of the next
        # candidate.
        for i in range(2, len(chars)):
            if (
                chars[i] in self._vowels
                and chars[i - 2] in self._vowels
                and chars[i - 1] in self._protected
            ):
                chars[i - 1] = self._protected[chars[i - 1]]
        return ''.join(chars)

    @staticmethod
    def _compose_alternate_vowels(word):
        """Rewrite the digraphs ae, oe and ue as ä, ö and ü."""
        word = word.replace('ae', 'ä')
        word = word.replace('oe', 'ö')
        # Shield 'que' behind a placeholder so its 'ue' is not composed.
        # 'Q' cannot occur otherwise because the word was lowercased.
        word = word.replace('que', 'Q')
        word = word.replace('ue', 'ü')
        word = word.replace('Q', 'que')
        return word

    def _strip_step1(self, word, r1_start):
        """Step 1: remove ern/em/er/en/es/e, or s after a valid s-ending."""
        for suffix, fix_niss in self._step1_suffixes:
            if word.endswith(suffix):
                # Only the first (longest) matching suffix is considered,
                # whether or not it ends up being deleted.
                if self._suffix_in_region(word, r1_start, len(suffix)):
                    word = word[: -len(suffix)]
                    if fix_niss and word.endswith('niss'):
                        word = word[:-1]
                break
        else:
            if (
                word.endswith('s')
                and self._suffix_in_region(word, r1_start, 1)
                and len(word) >= 2
                and word[-2] in self._s_endings
            ):
                word = word[:-1]
        return word

    def _strip_step2(self, word, r1_start):
        """Step 2: remove est/en/er, or st after a valid st-ending."""
        for suffix in self._step2_suffixes:
            if word.endswith(suffix):
                if self._suffix_in_region(word, r1_start, len(suffix)):
                    word = word[: -len(suffix)]
                break
        else:
            if (
                word.endswith('st')
                and self._suffix_in_region(word, r1_start, 2)
                and len(word) >= 6
                and word[-3] in self._st_endings
            ):
                word = word[:-2]
        return word

    def _strip_step3(self, word, r1_start, r2_start):
        """Step 3: remove derivational suffixes found in R2."""
        in_r1 = self._suffix_in_region
        in_r2 = self._suffix_in_region

        if word.endswith('isch'):
            # Not removed when preceded by 'e'.
            if in_r2(word, r2_start, 4) and word[-5] != 'e':
                word = word[:-4]
        elif word.endswith(('lich', 'heit')):
            if in_r2(word, r2_start, 4):
                word = word[:-4]
                # A preceding er/en is also removed, but measured against R1.
                if word.endswith(('er', 'en')) and in_r1(word, r1_start, 2):
                    word = word[:-2]
        elif word.endswith('keit'):
            if in_r2(word, r2_start, 4):
                word = word[:-4]
                if word.endswith('lich') and in_r2(word, r2_start, 4):
                    word = word[:-4]
                elif word.endswith('ig') and in_r2(word, r2_start, 2):
                    word = word[:-2]
        elif word.endswith(('end', 'ung')):
            if in_r2(word, r2_start, 3):
                word = word[:-3]
                if (
                    word.endswith('ig')
                    and in_r2(word, r2_start, 2)
                    and word[-3] != 'e'
                ):
                    word = word[:-2]
        elif word.endswith(('ig', 'ik')):
            if in_r2(word, r2_start, 2) and word[-3] != 'e':
                word = word[:-2]
        return word


def sb_german(word, alternate_vowels=False):
    """Stem a German word with the Snowball German algorithm.

    Convenience function that builds a fresh :py:class:`SnowballGerman`
    and delegates to :py:meth:`SnowballGerman.stem`.

    Args:
        word (str): The word to stem
        alternate_vowels (bool): forwarded unchanged to
            :py:meth:`SnowballGerman.stem`; composes ae as ä, oe as ö, and
            ue as ü before running the algorithm

    Returns:
        str: Word stem

    Examples:
        >>> sb_german('lesen')
        'les'
        >>> sb_german('graues')
        'grau'
        >>> sb_german('buchstabieren')
        'buchstabi'

    """
    stemmer = SnowballGerman()
    return stemmer.stem(word, alternate_vowels)


# When executed as a script, run the doctests embedded in this module.
if __name__ == '__main__':
    from doctest import testmod

    testmod()


1		# -*- coding: utf-8 -*-
2
3		# Copyright 2014-2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.stemmer._snowball_german.
20
21		Snowball German stemmer
22		"""
23
24	1	from __future__ import (
25		absolute_import,
26		division,
27		print_function,
28		unicode_literals,
29		)
30
31	1	from unicodedata import normalize
32
33	1	from six.moves import range
34
35	1	from ._snowball import _Snowball
36
37	1	__all__ = ['SnowballGerman', 'sb_german']
38
39
40	1	class SnowballGerman(_Snowball):
		0 ignored issues – show Unused Code introduced 2018-11-05 04:21 UTC by Report Bug Copy Issue Report The variable `__class__` seems to be unused. Loading history...
41		"""Snowball German stemmer.
42
43		The Snowball German stemmer is defined at:
44		http://snowball.tartarus.org/algorithms/german/stemmer.html
45		"""
46
47	1	_vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'ä', 'ö', 'ü'}
48	1	_s_endings = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 'r', 't'}
49	1	_st_endings = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 't'}
50
51	1	def stem(self, word, alternate_vowels=False):
		0 ignored issues – show Bug introduced 2018-11-05 04:21 UTC by Report Bug Copy Issue Report Parameters differ from overridden 'stem' method Loading history...
52		"""Return Snowball German stem.
53
54		Args:
55		word (str): The word to stem
56		alternate_vowels (bool): composes ae as ä, oe as ö, and ue as ü
57		before running the algorithm
58
59		Returns:
60		str: Word stem
61
62		Examples:
63		>>> stmr = SnowballGerman()
64		>>> stmr.stem('lesen')
65		'les'
66		>>> stmr.stem('graues')
67		'grau'
68		>>> stmr.stem('buchstabieren')
69		'buchstabi'
70
71		"""
72		# lowercase, normalize, and compose
73	1	word = normalize('NFC', word.lower())
74	1	word = word.replace('ß', 'ss')
75
76	1	if len(word) > 2:
77	1	for i in range(2, len(word)):
78	1	if word[i] in self._vowels and word[i - 2] in self._vowels:
79	1	if word[i - 1] == 'u':
80	1	word = word[: i - 1] + 'U' + word[i:]
81	1	elif word[i - 1] == 'y':
82	1	word = word[: i - 1] + 'Y' + word[i:]
83
84	1	if alternate_vowels:
85	1	word = word.replace('ae', 'ä')
86	1	word = word.replace('oe', 'ö')
87	1	word = word.replace('que', 'Q')
88	1	word = word.replace('ue', 'ü')
89	1	word = word.replace('Q', 'que')
90
91	1	r1_start = max(3, self._sb_r1(word))
92	1	r2_start = self._sb_r2(word)
93
94		# Step 1
95	1	niss_flag = False
96	1	if word[-3:] == 'ern':
97	1	if len(word[r1_start:]) >= 3:
98	1	word = word[:-3]
99	1	elif word[-2:] == 'em':
100	1	if len(word[r1_start:]) >= 2:
101	1	word = word[:-2]
102	1	elif word[-2:] == 'er':
103	1	if len(word[r1_start:]) >= 2:
104	1	word = word[:-2]
105	1	elif word[-2:] == 'en':
106	1	if len(word[r1_start:]) >= 2:
107	1	word = word[:-2]
108	1	niss_flag = True
109	1	elif word[-2:] == 'es':
110	1	if len(word[r1_start:]) >= 2:
111	1	word = word[:-2]
112	1	niss_flag = True
113	1	elif word[-1:] == 'e':
114	1	if len(word[r1_start:]) >= 1:
115	1	word = word[:-1]
116	1	niss_flag = True
117	1	elif word[-1:] == 's':
118	1	if (
119		len(word[r1_start:]) >= 1
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
120		and len(word) >= 2
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
121		and word[-2] in self._s_endings
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
122		):
123	1	word = word[:-1]
124
125	1	if niss_flag and word[-4:] == 'niss':
126	1	word = word[:-1]
127
128		# Step 2
129	1	if word[-3:] == 'est':
130	1	if len(word[r1_start:]) >= 3:
131	1	word = word[:-3]
132	1	elif word[-2:] == 'en':
133	1	if len(word[r1_start:]) >= 2:
134	1	word = word[:-2]
135	1	elif word[-2:] == 'er':
136	1	if len(word[r1_start:]) >= 2:
137	1	word = word[:-2]
138	1	elif word[-2:] == 'st':
139	1	if (
140		len(word[r1_start:]) >= 2
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
141		and len(word) >= 6
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
142		and word[-3] in self._st_endings
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
143		):
144	1	word = word[:-2]
145
146		# Step 3
147	1	if word[-4:] == 'isch':
148	1	if len(word[r2_start:]) >= 4 and word[-5] != 'e':
149	1	word = word[:-4]
150	1	elif word[-4:] in {'lich', 'heit'}:
151	1	if len(word[r2_start:]) >= 4:
152	1	word = word[:-4]
153	1	if word[-2:] in {'er', 'en'} and len(word[r1_start:]) >= 2:
154	1	word = word[:-2]
155	1	elif word[-4:] == 'keit':
156	1	if len(word[r2_start:]) >= 4:
157	1	word = word[:-4]
158	1	if word[-4:] == 'lich' and len(word[r2_start:]) >= 4:
159	1	word = word[:-4]
160	1	elif word[-2:] == 'ig' and len(word[r2_start:]) >= 2:
161	1	word = word[:-2]
162	1	elif word[-3:] in {'end', 'ung'}:
163	1	if len(word[r2_start:]) >= 3:
164	1	word = word[:-3]
165	1	if (
166		word[-2:] == 'ig'
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
167		and len(word[r2_start:]) >= 2
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
168		and word[-3] != 'e'
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
169		):
170	1	word = word[:-2]
171	1	elif word[-2:] in {'ig', 'ik'}:
172	1	if len(word[r2_start:]) >= 2 and word[-3] != 'e':
173	1	word = word[:-2]
174
175		# Change 'Y' and 'U' back to lowercase if survived stemming
176	1	for i in range(0, len(word)):
177	1	if word[i] == 'Y':
178	1	word = word[:i] + 'y' + word[i + 1 :]
179	1	elif word[i] == 'U':
180	1	word = word[:i] + 'u' + word[i + 1 :]
181
182		# Remove umlauts
183	1	_umlauts = dict(zip((ord(_) for _ in 'äöü'), 'aou'))
		0 ignored issues – show Comprehensibility Best Practice introduced 2018-08-02 19:04 UTC by Report Bug Copy Issue Report The variable `_` does not seem to be defined. Loading history...
184	1	word = word.translate(_umlauts)
185
186	1	return word
187
188
189	1	def sb_german(word, alternate_vowels=False):
190		"""Return Snowball German stem.
191
192		This is a wrapper for :py:meth:`SnowballGerman.stem`.
193
194		Args:
195		word (str): The word to stem
196		alternate_vowels (bool): composes ae as ä, oe as ö, and ue as ü
197		before running the algorithm
198
199		Returns:
200		str: Word stem
201
202		Examples:
203		>>> sb_german('lesen')
204		'les'
205		>>> sb_german('graues')
206		'grau'
207		>>> sb_german('buchstabieren')
208		'buchstabi'
209
210		"""
211	1	return SnowballGerman().stem(word, alternate_vowels)
212
213
214		if __name__ == '__main__':
215		import doctest
216
217		doctest.testmod()
218

chrislit / abydos

Pull Request — master (#141)

SnowballGerman.stem() F

Complexity

Size

Duplication

Code Coverage

Importance

How to fix Long Method Complexity

Long Method

Complexity

Duplication Side-by-Side

Filter issues like