abydos.stemmer._snowball_german   F
last analyzed

Complexity

Total Complexity 61

Size/Duplication

Total Lines 206
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
wmc 61
eloc 110
dl 0
loc 206
ccs 97
cts 97
cp 1
rs 3.52
c 0
b 0
f 0

2 Methods

Rating   Name   Duplication   Size   Complexity  
A SnowballGerman.__init__() 0 13 1
F SnowballGerman.stem() 0 144 60

How to fix   Complexity   

Complexity

Complex classes like abydos.stemmer._snowball_german often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# Copyright 2014-2020 by Christopher C. Little.
2
# This file is part of Abydos.
3
#
4
# Abydos is free software: you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation, either version 3 of the License, or
7
# (at your option) any later version.
8
#
9
# Abydos is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
# GNU General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17
"""abydos.stemmer._snowball_german.
18
19 1
Snowball German stemmer
20
"""
21
22
from unicodedata import normalize
23
24 1
from ._snowball import _Snowball
25
26
# Names exported by ``from ... import *``; only the stemmer class is public.
__all__ = ['SnowballGerman']
27
28
29
class SnowballGerman(_Snowball):
    """Snowball German stemmer.

    The Snowball German stemmer is defined at:
    http://snowball.tartarus.org/algorithms/german/stemmer.html

    .. versionadded:: 0.3.6
    """

    _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'ä', 'ö', 'ü'}
    # Letters that must precede a final 's' for step 1 to remove it.
    _s_endings = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 'r', 't'}
    # Letters that must precede a final 'st' for step 2 to remove it.
    _st_endings = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 't'}
    # Maps each umlaut to its base vowel; applied to the finished stem.
    _umlaut_table = str.maketrans('äöü', 'aou')

    def __init__(self, alternate_vowels: bool = False) -> None:
        """Initialize SnowballGerman instance.

        Parameters
        ----------
        alternate_vowels : bool
            Composes ae as ä, oe as ö, and ue as ü before running the algorithm


        .. versionadded:: 0.4.0

        """
        self._alternate_vowels = alternate_vowels

    def stem(self, word: str) -> str:
        """Return Snowball German stem.

        Parameters
        ----------
        word : str
            The word to stem

        Returns
        -------
        str
            Word stem

        Examples
        --------
        >>> stmr = SnowballGerman()
        >>> stmr.stem('lesen')
        'les'
        >>> stmr.stem('graues')
        'grau'
        >>> stmr.stem('buchstabieren')
        'buchstabi'


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # lowercase, normalize, and compose
        word = normalize('NFC', word.lower())
        word = word.replace('ß', 'ss')

        word = self._mark_u_y_between_vowels(word)

        if self._alternate_vowels:
            word = word.replace('ae', 'ä')
            word = word.replace('oe', 'ö')
            # Shield 'que' behind the placeholder 'Q' so that its 'ue' is not
            # composed into 'ü'; the placeholder is expanded again afterwards.
            word = word.replace('que', 'Q')
            word = word.replace('ue', 'ü')
            word = word.replace('Q', 'que')

        # Region offsets are computed once, on the word as it stands before
        # any suffix is removed. R1 is never allowed to start before index 3.
        r1_start = max(3, self._sb_r1(word))
        r2_start = self._sb_r2(word)

        word = self._strip_step1_suffix(word, r1_start)
        word = self._strip_step2_suffix(word, r1_start)
        word = self._strip_step3_suffix(word, r1_start, r2_start)

        # Change 'Y' and 'U' back to lowercase if survived stemming
        word = word.replace('Y', 'y').replace('U', 'u')

        # Remove umlauts
        return word.translate(self._umlaut_table)

    def _mark_u_y_between_vowels(self, word: str) -> str:
        """Upper-case every 'u' or 'y' that stands between two vowels.

        The scan runs left to right and sees its own earlier substitutions:
        an already upper-cased letter is not in ``_vowels`` and so does not
        count as a neighbouring vowel for later positions.

        Parameters
        ----------
        word : str
            The lowercased word

        Returns
        -------
        str
            The word with the affected letters replaced by 'U' / 'Y'

        """
        letters = list(word)
        for pos in range(2, len(letters)):
            if (
                letters[pos] in self._vowels
                and letters[pos - 2] in self._vowels
            ):
                if letters[pos - 1] == 'u':
                    letters[pos - 1] = 'U'
                elif letters[pos - 1] == 'y':
                    letters[pos - 1] = 'Y'
        return ''.join(letters)

    def _strip_step1_suffix(self, word: str, r1_start: int) -> str:
        """Apply step 1: remove one of ern, em, er, en, es, e, s.

        Only the first suffix (in the order above) that the word ends with
        is considered; it is removed only if it lies entirely at or after
        ``r1_start``. If the suffix does not fit, no other suffix is tried.

        Parameters
        ----------
        word : str
            The word being stemmed
        r1_start : int
            Offset at which the R1 region begins

        Returns
        -------
        str
            The word after step 1

        """
        r1_len = len(word[r1_start:])

        # Group (a): plain deletion.
        for suffix in ('ern', 'em', 'er'):
            if word.endswith(suffix):
                if r1_len >= len(suffix):
                    return word[: -len(suffix)]
                return word

        # Group (b): deletion, then 'niss' -> 'nis' on what remains.
        for suffix in ('en', 'es', 'e'):
            if word.endswith(suffix):
                if r1_len < len(suffix):
                    return word
                word = word[: -len(suffix)]
                if word.endswith('niss'):
                    word = word[:-1]
                return word

        # Group (c): 's' is removed only after a valid s-ending letter.
        if (
            word.endswith('s')
            and r1_len >= 1
            and len(word) >= 2
            and word[-2] in self._s_endings
        ):
            return word[:-1]
        return word

    def _strip_step2_suffix(self, word: str, r1_start: int) -> str:
        """Apply step 2: remove one of est, en, er, st.

        As in step 1, only the first matching suffix is considered and it
        must lie entirely at or after ``r1_start``.

        Parameters
        ----------
        word : str
            The word being stemmed
        r1_start : int
            Offset at which the R1 region begins

        Returns
        -------
        str
            The word after step 2

        """
        r1_len = len(word[r1_start:])

        for suffix in ('est', 'en', 'er'):
            if word.endswith(suffix):
                if r1_len >= len(suffix):
                    return word[: -len(suffix)]
                return word

        # 'st' additionally needs a valid st-ending letter before it and a
        # word of at least six letters.
        if (
            word.endswith('st')
            and r1_len >= 2
            and len(word) >= 6
            and word[-3] in self._st_endings
        ):
            return word[:-2]
        return word

    def _strip_step3_suffix(
        self, word: str, r1_start: int, r2_start: int
    ) -> str:
        """Apply step 3: remove derivational suffixes found in R2.

        Handles isch, lich, heit, keit, end, ung, ig and ik, including the
        secondary suffix that may be removed after lich/heit/keit/end/ung.
        Region lengths are re-measured after each removal because the word
        shrinks while the region offsets stay fixed.

        Parameters
        ----------
        word : str
            The word being stemmed
        r1_start : int
            Offset at which the R1 region begins
        r2_start : int
            Offset at which the R2 region begins

        Returns
        -------
        str
            The word after step 3

        """
        if word[-4:] == 'isch':
            # Not removed when preceded by 'e'.
            if len(word[r2_start:]) >= 4 and word[-5] != 'e':
                word = word[:-4]
        elif word[-4:] in {'lich', 'heit'}:
            if len(word[r2_start:]) >= 4:
                word = word[:-4]
                # A preceding er/en only needs to be in R1.
                if word[-2:] in {'er', 'en'} and len(word[r1_start:]) >= 2:
                    word = word[:-2]
        elif word[-4:] == 'keit':
            if len(word[r2_start:]) >= 4:
                word = word[:-4]
                if word[-4:] == 'lich' and len(word[r2_start:]) >= 4:
                    word = word[:-4]
                elif word[-2:] == 'ig' and len(word[r2_start:]) >= 2:
                    word = word[:-2]
        elif word[-3:] in {'end', 'ung'}:
            if len(word[r2_start:]) >= 3:
                word = word[:-3]
                if (
                    word[-2:] == 'ig'
                    and len(word[r2_start:]) >= 2
                    and word[-3] != 'e'
                ):
                    word = word[:-2]
        elif word[-2:] in {'ig', 'ik'}:
            if len(word[r2_start:]) >= 2 and word[-3] != 'e':
                word = word[:-2]
        return word
200 1
201
202
if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in this module.
    from doctest import testmod

    testmod()
206