abydos.stemmer._snowball_dutch.SnowballDutch._undouble() - Code Metrics - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

SnowballDutch._undouble() A
last analyzed 2020-12-31 20:10 UTC

↳ Parent: abydos.stemmer._snowball_dutch

Complexity

Conditions

Size

Total Lines	26
Code Lines	7

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	5
CRAP Score	4

Importance

Changes

Metric	Value
eloc	7
dl	0
loc	26
ccs	5
cts	5
cp	1
rs	10
c	0
b	0
f	0
cc	4
nop	2
crap	4

# Copyright 2014-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.stemmer._snowball_dutch.

Snowball Dutch stemmer
"""

from unicodedata import normalize

from ._snowball import _Snowball

__all__ = ['SnowballDutch']


class SnowballDutch(_Snowball):
    """Snowball Dutch stemmer.

    The Snowball Dutch stemmer is defined at:
    http://snowball.tartarus.org/algorithms/dutch/stemmer.html

    .. versionadded:: 0.3.6
    """

    # Letters treated as vowels when testing the context of a suffix.
    _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'è'}
    # A final -s/-se is only removed when NOT preceded by one of these.
    _not_s_endings = {'a', 'e', 'i', 'j', 'o', 'u', 'y', 'è'}
    # Translation table (code point -> plain letter) that strips umlauts
    # and acute accents from vowels.
    _accented = {
        ord(src): dst for src, dst in zip('äëïöüáéíóú', 'aeiouaeiou')
    }

    def _undouble(self, word: str) -> str:
        """Undouble endings -kk, -dd, and -tt.

        Parameters
        ----------
        word : str
          The word to stem

        Returns
        -------
        str
            The word with doubled endings undoubled


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        if word[-2:] in {'dd', 'kk', 'tt'}:
            return word[:-1]
        return word

    def _ends_in_removable_en(self, word: str, r1_start: int) -> bool:
        """Return True if word ends in an -en that may be deleted.

        Parameters
        ----------
        word : str
            The (partially stemmed) word
        r1_start : int
            Index at which the R1 region starts

        Returns
        -------
        bool
            True if the word ends in -en, the -en lies within R1, it is
            preceded by a non-vowel, and it is not preceded by -gem

        """
        return (
            word.endswith('en')
            and len(word[r1_start:]) >= 2
            and word[-3] not in self._vowels
            and word[-5:-2] != 'gem'
        )

    def _ends_in_removable_e(self, word: str, r1_start: int) -> bool:
        """Return True if word ends in an -e that may be deleted.

        Parameters
        ----------
        word : str
            The (partially stemmed) word
        r1_start : int
            Index at which the R1 region starts

        Returns
        -------
        bool
            True if the word ends in -e, the -e lies within R1, and it is
            preceded by a non-vowel

        """
        return (
            word.endswith('e')
            and len(word[r1_start:]) >= 1
            and word[-2] not in self._vowels
        )

    def stem(self, word: str) -> str:
        """Return Snowball Dutch stem.

        Parameters
        ----------
        word : str
            The word to stem

        Returns
        -------
        str
            Word stem

        Examples
        --------
        >>> stmr = SnowballDutch()
        >>> stmr.stem('lezen')
        'lez'
        >>> stmr.stem('opschorting')
        'opschort'
        >>> stmr.stem('ongrijpbaarheid')
        'ongrijp'


        .. versionadded:: 0.1.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        # Lowercase and compose (NFC), then strip umlauts & acute accents
        word = normalize('NFC', word.lower())
        word = word.translate(self._accented)

        # Prelude: upper-case an initial y, a y after a vowel, and an i
        # between two vowels, so that they are treated as consonants below.
        chars = list(word)
        last = len(chars) - 1
        for i, char in enumerate(chars):
            if char == 'y':
                if i == 0 or chars[i - 1] in self._vowels:
                    chars[i] = 'Y'
            elif (
                char == 'i'
                # 0 < i: an initial i has no preceding vowel; without this
                # bound, index -1 would wrap around to the last character.
                and 0 < i < last
                and chars[i - 1] in self._vowels
                and chars[i + 1] in self._vowels
            ):
                chars[i] = 'I'
        word = ''.join(chars)

        # Region starts are computed once and kept fixed while suffixes are
        # removed; R1 is adjusted so that it starts no earlier than index 3.
        r1_start = max(3, self._sb_r1(word))
        r2_start = self._sb_r2(word)

        # Step 1
        if word.endswith('heden'):
            if len(word[r1_start:]) >= 5:
                word = word[:-3] + 'id'
        elif word.endswith('ene'):
            if (
                len(word[r1_start:]) >= 3
                and word[-4] not in self._vowels
                and word[-6:-3] != 'gem'
            ):
                word = self._undouble(word[:-3])
        elif word.endswith('en'):
            if self._ends_in_removable_en(word, r1_start):
                word = self._undouble(word[:-2])
        elif word.endswith('se'):
            if (
                len(word[r1_start:]) >= 2
                and word[-3] not in self._not_s_endings
            ):
                word = word[:-2]
        elif word.endswith('s'):
            if (
                len(word[r1_start:]) >= 1
                and word[-2] not in self._not_s_endings
            ):
                word = word[:-1]

        # Step 2
        e_removed = self._ends_in_removable_e(word, r1_start)
        if e_removed:
            word = self._undouble(word[:-1])

        # Step 3a
        if (
            word.endswith('heid')
            and len(word[r2_start:]) >= 4
            and word[-5] != 'c'
        ):
            word = word[:-4]
            if self._ends_in_removable_en(word, r1_start):
                word = self._undouble(word[:-2])

        # Step 3b
        if word.endswith('lijk'):
            if len(word[r2_start:]) >= 4:
                word = word[:-4]
                # Repeat step 2
                if self._ends_in_removable_e(word, r1_start):
                    word = self._undouble(word[:-1])
        elif word.endswith('baar'):
            if len(word[r2_start:]) >= 4:
                word = word[:-4]
        elif word.endswith(('end', 'ing')):
            if len(word[r2_start:]) >= 3:
                word = word[:-3]
                if (
                    word.endswith('ig')
                    and len(word[r2_start:]) >= 2
                    and word[-3] != 'e'
                ):
                    word = word[:-2]
                else:
                    word = self._undouble(word)
        elif word.endswith('bar'):
            # -bar is only removed if step 2 actually deleted an -e
            if len(word[r2_start:]) >= 3 and e_removed:
                word = word[:-3]
        elif word.endswith('ig'):
            if len(word[r2_start:]) >= 2 and word[-3] != 'e':
                word = word[:-2]

        # Step 4: undouble aa/ee/oo/uu between two non-vowels
        # (the final letter must also not be the marker 'I')
        if (
            len(word) >= 4
            and word[-3] == word[-2]
            and word[-2] in {'a', 'e', 'o', 'u'}
            and word[-4] not in self._vowels
            and word[-1] not in self._vowels
            and word[-1] != 'I'
        ):
            word = word[:-2] + word[-1]

        # Change 'Y' and 'I' back to lowercase if they survived stemming
        return word.replace('Y', 'y').replace('I', 'i')


if __name__ == '__main__':
    # When executed as a script, run the doctest examples in this module.
    from doctest import testmod

    testmod()


1		# Copyright 2014-2020 by Christopher C. Little.
2		# This file is part of Abydos.
3		#
4		# Abydos is free software: you can redistribute it and/or modify
5		# it under the terms of the GNU General Public License as published by
6		# the Free Software Foundation, either version 3 of the License, or
7		# (at your option) any later version.
8		#
9		# Abydos is distributed in the hope that it will be useful,
10		# but WITHOUT ANY WARRANTY; without even the implied warranty of
11		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12		# GNU General Public License for more details.
13		#
14		# You should have received a copy of the GNU General Public License
15		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17		"""abydos.stemmer._snowball_dutch.
18
19	1	Snowball Dutch stemmer
20		"""
21
22		from unicodedata import normalize
23
24	1	from ._snowball import _Snowball
25
26		__all__ = ['SnowballDutch']
27
28
29		class SnowballDutch(_Snowball):
30		"""Snowball Dutch stemmer.
31	1
32		The Snowball Dutch stemmer is defined at:
33	1	http://snowball.tartarus.org/algorithms/dutch/stemmer.html
34
35	1	.. versionadded:: 0.3.6
36	1	"""
37
38	1	_vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'è'}
39	1	_not_s_endings = {'a', 'e', 'i', 'j', 'o', 'u', 'y', 'è'}
40		_accented = dict(zip((ord(_) for _ in 'äëïöüáéíóú'), 'aeiouaeiou'))
		0 ignored issues – show Comprehensibility Best Practice introduced 2018-08-02 19:04 UTC by Report Bug Copy Issue Report The variable `_` does not seem to be defined. Loading history...
41	1
42		def _undouble(self, word: str) -> str:
43		"""Undouble endings -kk, -dd, and -tt.
44	1
45		Parameters
46		----------
47		word : str
48		The word to stem
49
50		Returns
51		-------
52		str
53	1	The word with doubled endings undoubled
54	1
55	1
56		.. versionadded:: 0.1.0
57	1	.. versionchanged:: 0.3.6
58		Encapsulated in class
59
60		"""
61		if (
62		len(word) > 1
63		and word[-1] == word[-2]
64		and word[-1] in {'d', 'k', 't'}
65		):
66		return word[:-1]
67		return word
68
69		def stem(self, word: str) -> str:
70		"""Return Snowball Dutch stem.
71
72		Parameters
73		----------
74		word : str
75		The word to stem
76	1
77		Returns
78		-------
79		str
80		Word stem
81	1
82	1	Examples
83		--------
84	1	>>> stmr = SnowballDutch()
85		>>> stmr.stem('lezen')
86		'lez'
87		>>> stmr.stem('opschorting')
88		'opschort'
89		>>> stmr.stem('ongrijpbaarheid')
90		'ongrijp'
91
92
93		.. versionadded:: 0.1.0
94		.. versionchanged:: 0.3.6
95		Encapsulated in class
96
97		"""
98		# lowercase, normalize, decompose, filter umlauts & acutes out, and
99		# compose
100		word = normalize('NFC', word.lower())
101		word = word.translate(self._accented)
102
103		for i in range(len(word)):
104		if i == 0 and word[0] == 'y':
105		word = 'Y' + word[1:]
106		elif word[i] == 'y' and word[i - 1] in self._vowels:
107		word = word[:i] + 'Y' + word[i + 1 :]
108		elif (
109		word[i] == 'i'
110		and word[i - 1] in self._vowels
111		and i + 1 < len(word)
112		and word[i + 1] in self._vowels
113		):
114		word = word[:i] + 'I' + word[i + 1 :]
115	1
116	1	r1_start = max(3, self._sb_r1(word))
117		r2_start = self._sb_r2(word)
118	1
119	1	# Step 1
120	1	if word[-5:] == 'heden':
121	1	if len(word[r1_start:]) >= 5:
122	1	word = word[:-3] + 'id'
123	1	elif word[-3:] == 'ene':
124		if len(word[r1_start:]) >= 3 and (
125		word[-4] not in self._vowels and word[-6:-3] != 'gem'
126		):
127		word = self._undouble(word[:-3])
128		elif word[-2:] == 'en':
129	1	if len(word[r1_start:]) >= 2 and (
130		word[-3] not in self._vowels and word[-5:-2] != 'gem'
131	1	):
132	1	word = self._undouble(word[:-2])
133		elif word[-2:] == 'se':
134		if (
135	1	len(word[r1_start:]) >= 2
136	1	and word[-3] not in self._not_s_endings
137	1	):
138	1	word = word[:-2]
139	1	elif word[-1:] == 's':
140		if (
141		len(word[r1_start:]) >= 1
142	1	and word[-2] not in self._not_s_endings
143	1	):
144	1	word = word[:-1]
145
146		# Step 2
147	1	e_removed = False
148	1	if word[-1:] == 'e':
149	1	if len(word[r1_start:]) >= 1 and word[-2] not in self._vowels:
150		word = self._undouble(word[:-1])
151		e_removed = True
152
153	1	# Step 3a
154	1	if word[-4:] == 'heid':
155	1	if len(word[r2_start:]) >= 4 and word[-5] != 'c':
156		word = word[:-4]
157		if word[-2:] == 'en':
158		if len(word[r1_start:]) >= 2 and (
159	1	word[-3] not in self._vowels and word[-5:-2] != 'gem'
160		):
161		word = self._undouble(word[:-2])
162	1
163	1	# Step 3b
164	1	if word[-4:] == 'lijk':
165	1	if len(word[r2_start:]) >= 4:
166	1	word = word[:-4]
167		# Repeat step 2
168		if word[-1:] == 'e':
169	1	if (
170	1	len(word[r1_start:]) >= 1
171	1	and word[-2] not in self._vowels
172	1	):
173	1	word = self._undouble(word[:-1])
174		elif word[-4:] == 'baar':
175		if len(word[r2_start:]) >= 4:
176	1	word = word[:-4]
177		elif word[-3:] in ('end', 'ing'):
178		if len(word[r2_start:]) >= 3:
179	1	word = word[:-3]
180	1	if (
181	1	word[-2:] == 'ig'
182		and len(word[r2_start:]) >= 2
183	1	and word[-3] != 'e'
184	1	):
185		word = word[:-2]
186		else:
187		word = self._undouble(word)
188	1	elif word[-3:] == 'bar':
189	1	if len(word[r2_start:]) >= 3 and e_removed:
190	1	word = word[:-3]
191	1	elif word[-2:] == 'ig':
192	1	if len(word[r2_start:]) >= 2 and word[-3] != 'e':
193	1	word = word[:-2]
194	1
195	1	# Step 4
196		if (
197		len(word) >= 4
198		and word[-3] == word[-2]
199		and word[-2] in {'a', 'e', 'o', 'u'}
200	1	and word[-4] not in self._vowels
201		and word[-1] not in self._vowels
202	1	and word[-1] != 'I'
203	1	):
204	1	word = word[:-2] + word[-1]
205	1
206	1	# Change 'Y' and 'U' back to lowercase if survived stemming
207	1	for i in range(0, len(word)):
208	1	if word[i] == 'Y':
209		word = word[:i] + 'y' + word[i + 1 :]
210		elif word[i] == 'I':
211	1	word = word[:i] + 'i' + word[i + 1 :]
212
213		return word
214
215
216		if __name__ == '__main__':
217		import doctest
218
219		doctest.testmod()
220

chrislit / abydos

SnowballDutch._undouble() A last analyzed 2020-12-31 20:10 UTC

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like

SnowballDutch._undouble() A
last analyzed 2020-12-31 20:10 UTC