abydos.stemmer._snowball_german - Code Metrics - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

abydos.stemmer._snowball_german F
last analyzed 2020-12-31 20:10 UTC

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	206
Duplicated Lines	0 %

Test Coverage

Coverage

100%

Importance

Changes

Metric	Value
wmc	61
eloc	110
dl	0
loc	206
ccs	97
cts	97
cp	1
rs	3.52
c	0
b	0
f	0

2 Methods

Rating	Name	Duplication	Size	Complexity
A	SnowballGerman.__init__()	0	13	1
F	SnowballGerman.stem()	0	144	60

How to fix Complexity

# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.stemmer._snowball_german.

Snowball German stemmer
"""

from unicodedata import normalize

from ._snowball import _Snowball

__all__ = ['SnowballGerman']


class SnowballGerman(_Snowball):
    """Snowball German stemmer.

    The Snowball German stemmer is defined at:
    http://snowball.tartarus.org/algorithms/german/stemmer.html

    .. versionadded:: 0.3.6
    """

    _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'ä', 'ö', 'ü'}
    _s_endings = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 'r', 't'}
    _st_endings = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 't'}

    # Step 1 suffixes in the order they are tried (longest first). The flag
    # says whether, after the suffix is removed, a trailing 'niss' is
    # shortened to 'nis'.
    _step1_suffixes = (
        ('ern', False),
        ('em', False),
        ('er', False),
        ('en', True),
        ('es', True),
        ('e', True),
    )

    # Step 2 suffixes that need no preceding-letter check, longest first.
    _step2_suffixes = ('est', 'en', 'er')

    # 'u' and 'y' between two vowels are temporarily upper-cased so that the
    # later steps do not treat them as vowels.
    _glide_markers = {'u': 'U', 'y': 'Y'}

    # Final clean-up applied in one pass: undo the glide marking and strip
    # umlauts.
    _cleanup_table = str.maketrans('YUäöü', 'yuaou')

    def __init__(self, alternate_vowels: bool = False) -> None:
        """Initialize SnowballGerman instance.

        Parameters
        ----------
        alternate_vowels : bool
            Composes ae as ä, oe as ö, and ue as ü before running the algorithm


        .. versionadded:: 0.4.0

        """
        self._alternate_vowels = alternate_vowels

    @staticmethod
    def _in_region(word: str, start: int, size: int) -> bool:
        """Return True if at least ``size`` characters follow ``start``."""
        return len(word[start:]) >= size

    def _mark_glides(self, word: str) -> str:
        """Upper-case each 'u' or 'y' that sits between two vowels.

        The scan is strictly left to right and sees its own earlier
        replacements: once a letter has been upper-cased it no longer counts
        as a vowel for the positions that follow.
        """
        chars = list(word)
        for i in range(2, len(chars)):
            if (
                chars[i] in self._vowels
                and chars[i - 2] in self._vowels
                and chars[i - 1] in self._glide_markers
            ):
                chars[i - 1] = self._glide_markers[chars[i - 1]]
        return ''.join(chars)

    def _strip_step1(self, word: str, r1_start: int) -> str:
        """Apply step 1: remove ern/em/er/en/es/e/s when inside R1.

        Only the first matching suffix is considered; if it does not lie in
        R1 the word is returned unchanged rather than trying a shorter one.
        """
        for suffix, fix_niss in self._step1_suffixes:
            if word.endswith(suffix):
                if self._in_region(word, r1_start, len(suffix)):
                    word = word[: -len(suffix)]
                    if fix_niss and word[-4:] == 'niss':
                        word = word[:-1]
                return word

        # A bare 's' is removed only after a valid s-ending letter.
        if (
            word[-1:] == 's'
            and self._in_region(word, r1_start, 1)
            and len(word) >= 2
            and word[-2] in self._s_endings
        ):
            word = word[:-1]
        return word

    def _strip_step2(self, word: str, r1_start: int) -> str:
        """Apply step 2: remove est/en/er/st when inside R1.

        As in step 1, only the first matching suffix is considered.
        """
        for suffix in self._step2_suffixes:
            if word.endswith(suffix):
                if self._in_region(word, r1_start, len(suffix)):
                    word = word[: -len(suffix)]
                return word

        # 'st' needs a valid st-ending before it and a word of length >= 6
        # (i.e. at least three letters before that st-ending).
        if (
            word[-2:] == 'st'
            and self._in_region(word, r1_start, 2)
            and len(word) >= 6
            and word[-3] in self._st_endings
        ):
            word = word[:-2]
        return word

    def _strip_step3(self, word: str, r1_start: int, r2_start: int) -> str:
        """Apply step 3: remove derivational suffixes when inside R2."""
        in_region = self._in_region
        tail = word[-4:]

        if tail == 'isch':
            if in_region(word, r2_start, 4) and word[-5] != 'e':
                word = word[:-4]
        elif tail in {'lich', 'heit'}:
            if in_region(word, r2_start, 4):
                word = word[:-4]
                # A preceding er/en is removed too, but judged against R1.
                if word[-2:] in {'er', 'en'} and in_region(word, r1_start, 2):
                    word = word[:-2]
        elif tail == 'keit':
            if in_region(word, r2_start, 4):
                word = word[:-4]
                if word[-4:] == 'lich' and in_region(word, r2_start, 4):
                    word = word[:-4]
                elif word[-2:] == 'ig' and in_region(word, r2_start, 2):
                    word = word[:-2]
        elif word[-3:] in {'end', 'ung'}:
            if in_region(word, r2_start, 3):
                word = word[:-3]
                if (
                    word[-2:] == 'ig'
                    and in_region(word, r2_start, 2)
                    and word[-3] != 'e'
                ):
                    word = word[:-2]
        elif word[-2:] in {'ig', 'ik'}:
            if in_region(word, r2_start, 2) and word[-3] != 'e':
                word = word[:-2]
        return word

    def stem(self, word: str) -> str:
        """Return Snowball German stem.

        Parameters
        ----------
        word : str
            The word to stem

        Returns
        -------
        str
            Word stem

        Examples
        --------
        >>> stmr = SnowballGerman()
        >>> stmr.stem('lesen')
        'les'
        >>> stmr.stem('graues')
        'grau'
        >>> stmr.stem('buchstabieren')
        'buchstabi'


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # lowercase, normalize, and compose
        word = normalize('NFC', word.lower())
        word = word.replace('ß', 'ss')

        word = self._mark_glides(word)

        if self._alternate_vowels:
            word = word.replace('ae', 'ä')
            word = word.replace('oe', 'ö')
            # Shield 'que' with a placeholder so its 'ue' is not composed.
            word = word.replace('que', 'Q')
            word = word.replace('ue', 'ü')
            word = word.replace('Q', 'que')

        # Region offsets are computed once, on the prepared word, and reused
        # by every step; R1 is never allowed to start before index 3.
        r1_start = max(3, self._sb_r1(word))
        r2_start = self._sb_r2(word)

        word = self._strip_step1(word, r1_start)
        word = self._strip_step2(word, r1_start)
        word = self._strip_step3(word, r1_start, r2_start)

        # Lowercase any 'Y'/'U' that survived stemming and remove umlauts.
        return word.translate(self._cleanup_table)


if __name__ == '__main__':
    # Run the doctest examples embedded in this module's docstrings.
    from doctest import testmod

    testmod()


1		# Copyright 2014-2020 by Christopher C. Little.
2		# This file is part of Abydos.
3		#
4		# Abydos is free software: you can redistribute it and/or modify
5		# it under the terms of the GNU General Public License as published by
6		# the Free Software Foundation, either version 3 of the License, or
7		# (at your option) any later version.
8		#
9		# Abydos is distributed in the hope that it will be useful,
10		# but WITHOUT ANY WARRANTY; without even the implied warranty of
11		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12		# GNU General Public License for more details.
13		#
14		# You should have received a copy of the GNU General Public License
15		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17		"""abydos.stemmer._snowball_german.
18
19	1	Snowball German stemmer
20		"""
21
22		from unicodedata import normalize
23
24	1	from ._snowball import _Snowball
25
26		__all__ = ['SnowballGerman']
27
28
29		class SnowballGerman(_Snowball):
30		"""Snowball German stemmer.
31	1
32		The Snowball German stemmer is defined at:
33	1	http://snowball.tartarus.org/algorithms/german/stemmer.html
34
35	1	.. versionadded:: 0.3.6
36		"""
37	1
38	1	_vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'ä', 'ö', 'ü'}
39		_s_endings = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 'r', 't'}
40	1	_st_endings = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 't'}
41
42		def __init__(self, alternate_vowels: bool = False) -> None:
43	1	"""Initialize SnowballGerman instance.
44
45		Parameters
46		----------
47		alternate_vowels : bool
48		Composes ae as ä, oe as ö, and ue as ü before running the algorithm
49
50
51		.. versionadded:: 0.4.0
52	1
53	1	"""
54	1	self._alternate_vowels = alternate_vowels
55
56	1	def stem(self, word: str) -> str:
57		"""Return Snowball German stem.
58
59		Parameters
60		----------
61		word : str
62		The word to stem
63
64		Returns
65		-------
66		str
67		Word stem
68	1
69		Examples
70	1	--------
71		>>> stmr = SnowballGerman()
72		>>> stmr.stem('lesen')
73		'les'
74		>>> stmr.stem('graues')
75		'grau'
76		>>> stmr.stem('buchstabieren')
77		'buchstabi'
78
79
80		.. versionadded:: 0.1.0
81		.. versionchanged:: 0.3.6
82		Encapsulated in class
83
84		"""
85		# lowercase, normalize, and compose
86		word = normalize('NFC', word.lower())
87		word = word.replace('ß', 'ss')
88
89		if len(word) > 2:
90		for i in range(2, len(word)):
91		if word[i] in self._vowels and word[i - 2] in self._vowels:
92		if word[i - 1] == 'u':
93		word = word[: i - 1] + 'U' + word[i:]
94		elif word[i - 1] == 'y':
95		word = word[: i - 1] + 'Y' + word[i:]
96
97		if self._alternate_vowels:
98		word = word.replace('ae', 'ä')
99		word = word.replace('oe', 'ö')
100	1	word = word.replace('que', 'Q')
101	1	word = word.replace('ue', 'ü')
102		word = word.replace('Q', 'que')
103	1
104	1	r1_start = max(3, self._sb_r1(word))
105	1	r2_start = self._sb_r2(word)
106	1
107	1	# Step 1
108	1	niss_flag = False
109	1	if word[-3:] == 'ern':
110		if len(word[r1_start:]) >= 3:
111	1	word = word[:-3]
112	1	elif word[-2:] == 'em':
113	1	if len(word[r1_start:]) >= 2:
114	1	word = word[:-2]
115	1	elif word[-2:] == 'er':
116	1	if len(word[r1_start:]) >= 2:
117		word = word[:-2]
118	1	elif word[-2:] == 'en':
119	1	if len(word[r1_start:]) >= 2:
120		word = word[:-2]
121		niss_flag = True
122	1	elif word[-2:] == 'es':
123	1	if len(word[r1_start:]) >= 2:
124	1	word = word[:-2]
125	1	niss_flag = True
126	1	elif word[-1:] == 'e':
127	1	if len(word[r1_start:]) >= 1:
128	1	word = word[:-1]
129	1	niss_flag = True
130	1	elif word[-1:] == 's':
131	1	if (
132	1	len(word[r1_start:]) >= 1
133	1	and len(word) >= 2
134	1	and word[-2] in self._s_endings
135	1	):
136	1	word = word[:-1]
137	1
138	1	if niss_flag and word[-4:] == 'niss':
139	1	word = word[:-1]
140	1
141	1	# Step 2
142	1	if word[-3:] == 'est':
143	1	if len(word[r1_start:]) >= 3:
144	1	word = word[:-3]
145	1	elif word[-2:] == 'en':
146		if len(word[r1_start:]) >= 2:
147		word = word[:-2]
148		elif word[-2:] == 'er':
149		if len(word[r1_start:]) >= 2:
150	1	word = word[:-2]
151		elif word[-2:] == 'st':
152	1	if (
153	1	len(word[r1_start:]) >= 2
154		and len(word) >= 6
155		and word[-3] in self._st_endings
156	1	):
157	1	word = word[:-2]
158	1
159	1	# Step 3
160	1	if word[-4:] == 'isch':
161	1	if len(word[r2_start:]) >= 4 and word[-5] != 'e':
162	1	word = word[:-4]
163	1	elif word[-4:] in {'lich', 'heit'}:
164	1	if len(word[r2_start:]) >= 4:
165	1	word = word[:-4]
166	1	if word[-2:] in {'er', 'en'} and len(word[r1_start:]) >= 2:
167		word = word[:-2]
168		elif word[-4:] == 'keit':
169		if len(word[r2_start:]) >= 4:
170		word = word[:-4]
171	1	if word[-4:] == 'lich' and len(word[r2_start:]) >= 4:
172		word = word[:-4]
173		elif word[-2:] == 'ig' and len(word[r2_start:]) >= 2:
174	1	word = word[:-2]
175	1	elif word[-3:] in {'end', 'ung'}:
176	1	if len(word[r2_start:]) >= 3:
177	1	word = word[:-3]
178	1	if (
179	1	word[-2:] == 'ig'
180	1	and len(word[r2_start:]) >= 2
181	1	and word[-3] != 'e'
182	1	):
183	1	word = word[:-2]
184	1	elif word[-2:] in {'ig', 'ik'}:
185	1	if len(word[r2_start:]) >= 2 and word[-3] != 'e':
186	1	word = word[:-2]
187	1
188	1	# Change 'Y' and 'U' back to lowercase if survived stemming
189	1	for i in range(0, len(word)):
190	1	if word[i] == 'Y':
191	1	word = word[:i] + 'y' + word[i + 1 :]
192	1	elif word[i] == 'U':
193		word = word[:i] + 'u' + word[i + 1 :]
194
195		# Remove umlauts
196		_umlauts = dict(zip((ord(_) for _ in 'äöü'), 'aou'))
		0 ignored issues – show | Comprehensibility | Best Practice | introduced 2018-08-02 19:04 UTC by | Report Bug | Copy Issue Report | The variable `_` does not seem to be defined. | Loading history...
197	1	word = word.translate(_umlauts)
198	1
199	1	return word
200	1
201
202		if __name__ == '__main__':
203	1	import doctest
204	1
205		doctest.testmod()
206

chrislit / abydos

abydos.stemmer._snowball_german F last analyzed 2020-12-31 20:10 UTC

Complexity

Size/Duplication

Test Coverage

Importance

2 Methods

How to fix Complexity

Complexity

Duplication Side-by-Side

Filter issues like

abydos.stemmer._snowball_german F
last analyzed 2020-12-31 20:10 UTC