SnowballDutch._undouble()   A
last analyzed

Complexity

Conditions 4

Size

Total Lines 26
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 5
CRAP Score 4

Importance

Changes 0
Metric Value
eloc 7
dl 0
loc 26
ccs 5
cts 5
cp 1
rs 10
c 0
b 0
f 0
cc 4
nop 2
crap 4
1
# Copyright 2014-2020 by Christopher C. Little.
2
# This file is part of Abydos.
3
#
4
# Abydos is free software: you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation, either version 3 of the License, or
7
# (at your option) any later version.
8
#
9
# Abydos is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
# GNU General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17
"""abydos.stemmer._snowball_dutch.
18
19 1
Snowball Dutch stemmer
20
"""
21
22
from unicodedata import normalize
23
24 1
from ._snowball import _Snowball
25
26
# Public API of this module: only the stemmer class is exported.
__all__ = ['SnowballDutch']
27
28
29
class SnowballDutch(_Snowball):
    """Snowball Dutch stemmer.

    The Snowball Dutch stemmer is defined at:
    http://snowball.tartarus.org/algorithms/dutch/stemmer.html

    .. versionadded:: 0.3.6
    """

    _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'è'}
    _not_s_endings = {'a', 'e', 'i', 'j', 'o', 'u', 'y', 'è'}
    # Translation table (code point -> plain vowel) used with str.translate
    # to strip umlauts and acute accents.
    _accented = {
        ord(accented): plain
        for accented, plain in zip('äëïöüáéíóú', 'aeiouaeiou')
    }

    def _undouble(self, word: str) -> str:
        """Undouble endings -kk, -dd, and -tt.

        Parameters
        ----------
        word : str
            The word to stem

        Returns
        -------
        str
            The word with doubled endings undoubled


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        if word.endswith(('dd', 'kk', 'tt')):
            return word[:-1]
        return word

    def _remove_en_ending(
        self, word: str, r1_start: int, length: int
    ) -> str:
        """Delete a final -en/-ene if it is a valid en-ending.

        The caller must already have checked that ``word`` ends in the
        suffix. The suffix is deleted (and the result undoubled) only if it
        lies within R1, is preceded by a non-vowel, and is not preceded by
        'gem'.

        Parameters
        ----------
        word : str
            The word, ending in 'en' or 'ene'
        r1_start : int
            Index at which region R1 starts
        length : int
            Length of the suffix to delete (2 for 'en', 3 for 'ene')

        Returns
        -------
        str
            The word with the suffix removed, or the word unchanged

        """
        if (
            len(word[r1_start:]) >= length
            and word[-(length + 1)] not in self._vowels
            and word[-(length + 3) : -length] != 'gem'
        ):
            return self._undouble(word[:-length])
        return word

    def _remove_e_ending(self, word: str, r1_start: int) -> str:
        """Delete a final -e if in R1 and preceded by a non-vowel (step 2).

        Parameters
        ----------
        word : str
            The word to process
        r1_start : int
            Index at which region R1 starts

        Returns
        -------
        str
            The word with the final 'e' removed and the ending undoubled,
            or the word unchanged

        """
        if (
            word.endswith('e')
            and len(word[r1_start:]) >= 1
            and word[-2] not in self._vowels
        ):
            return self._undouble(word[:-1])
        return word

    def stem(self, word: str) -> str:
        """Return Snowball Dutch stem.

        Parameters
        ----------
        word : str
            The word to stem

        Returns
        -------
        str
            Word stem

        Examples
        --------
        >>> stmr = SnowballDutch()
        >>> stmr.stem('lezen')
        'lez'
        >>> stmr.stem('opschorting')
        'opschort'
        >>> stmr.stem('ongrijpbaarheid')
        'ongrijp'


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # lowercase, compose (NFC) so accented vowels are single code
        # points, then filter umlauts & acutes out
        word = normalize('NFC', word.lower())
        word = word.translate(self._accented)

        # Prelude: put initial y, y after a vowel, and i between vowels into
        # upper case; the upper-case forms are not in _vowels, so they are
        # treated as consonants below. Marks made earlier in the scan are
        # visible to later positions.
        chars = list(word)
        for i, char in enumerate(chars):
            if char == 'y':
                if i == 0 or chars[i - 1] in self._vowels:
                    chars[i] = 'Y'
            elif (
                char == 'i'
                # An 'i' needs a real neighbour on both sides; without the
                # lower bound, index -1 would wrap around to the last letter.
                and 0 < i < len(chars) - 1
                and chars[i - 1] in self._vowels
                and chars[i + 1] in self._vowels
            ):
                chars[i] = 'I'
        word = ''.join(chars)

        # R1 is adjusted so that the region before it holds at least three
        # letters. NOTE(review): assumes _sb_r1/_sb_r2 return start indices
        # of R1/R2 -- they are defined on the base class, not shown here.
        r1_start = max(3, self._sb_r1(word))
        r2_start = self._sb_r2(word)

        # Step 1
        if word.endswith('heden'):
            if len(word[r1_start:]) >= 5:
                word = word[:-3] + 'id'
        elif word.endswith('ene'):
            word = self._remove_en_ending(word, r1_start, 3)
        elif word.endswith('en'):
            word = self._remove_en_ending(word, r1_start, 2)
        elif word.endswith('se'):
            if (
                len(word[r1_start:]) >= 2
                and word[-3] not in self._not_s_endings
            ):
                word = word[:-2]
        elif word.endswith('s'):
            if (
                len(word[r1_start:]) >= 1
                and word[-2] not in self._not_s_endings
            ):
                word = word[:-1]

        # Step 2
        without_e = self._remove_e_ending(word, r1_start)
        # Removing the 'e' always shortens the word, so inequality means
        # the deletion happened.
        e_removed = without_e != word
        word = without_e

        # Step 3a
        if word.endswith('heid'):
            if len(word[r2_start:]) >= 4 and word[-5] != 'c':
                word = word[:-4]
                # Treat a preceding 'en' as in step 1
                if word.endswith('en'):
                    word = self._remove_en_ending(word, r1_start, 2)

        # Step 3b
        if word.endswith('lijk'):
            if len(word[r2_start:]) >= 4:
                # Delete, then repeat step 2
                word = self._remove_e_ending(word[:-4], r1_start)
        elif word.endswith('baar'):
            if len(word[r2_start:]) >= 4:
                word = word[:-4]
        elif word.endswith(('end', 'ing')):
            if len(word[r2_start:]) >= 3:
                word = word[:-3]
                if (
                    word.endswith('ig')
                    and len(word[r2_start:]) >= 2
                    and word[-3] != 'e'
                ):
                    word = word[:-2]
                else:
                    word = self._undouble(word)
        elif word.endswith('bar'):
            # Only deleted if step 2 actually removed an 'e'
            if len(word[r2_start:]) >= 3 and e_removed:
                word = word[:-3]
        elif word.endswith('ig'):
            if len(word[r2_start:]) >= 2 and word[-3] != 'e':
                word = word[:-2]

        # Step 4: collapse a doubled a/e/o/u between a non-vowel and a
        # final non-vowel other than 'I'
        if (
            len(word) >= 4
            and word[-3] == word[-2]
            and word[-2] in {'a', 'e', 'o', 'u'}
            and word[-4] not in self._vowels
            and word[-1] not in self._vowels
            and word[-1] != 'I'
        ):
            word = word[:-2] + word[-1]

        # Change 'Y' and 'I' back to lowercase if survived stemming; the
        # word was lowercased at the start, so any upper-case Y/I is a mark
        # from the prelude.
        return word.replace('Y', 'y').replace('I', 'i')
214
215
216
if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in this module.
    from doctest import testmod

    testmod()
220