abydos.stemmer._snowball_dutch - Code Metrics - Inspection of "Merge pull request #149 from chrislit/0.3.6" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( f43547...71985b )

by Chris

created 2018-11-17 08:52 UTC

abydos.stemmer._snowball_dutch F

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	248
Duplicated Lines	0 %

Test Coverage

Coverage

100%

Importance

Changes

Metric	Value
eloc	116
dl	0
loc	248
ccs	84
cts	84
cp	1
rs	2.96
c	0
b	0
f	0
wmc	68

1 Function

Rating	Name	Duplication	Size	Complexity
A	sb_dutch()	0	26	1

2 Methods

Rating	Name	Duplication	Size	Complexity
A	SnowballDutch._undouble()	0	21	4
F	SnowballDutch.stem()	0	140	63

How to fix Complexity

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.stemmer._snowball_dutch.

Snowball Dutch stemmer
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from unicodedata import normalize

from six import text_type
from six.moves import range

from ._snowball import _Snowball

__all__ = ['SnowballDutch', 'sb_dutch']


class SnowballDutch(_Snowball):
    """Snowball Dutch stemmer.

    The Snowball Dutch stemmer is defined at:
    http://snowball.tartarus.org/algorithms/dutch/stemmer.html
    """

    # Characters treated as vowels when marking 'Y'/'I' and testing suffix
    # contexts.
    _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'è'}
    # A trailing -s / -se is only removed when NOT preceded by one of these.
    _not_s_endings = {'a', 'e', 'i', 'j', 'o', 'u', 'y', 'è'}
    # Translation table (code point -> plain vowel) for umlauted and
    # acute-accented vowels; used with str.translate in stem().
    _accented = dict(zip((ord(_) for _ in 'äëïöüáéíóú'), 'aeiouaeiou'))

    def _undouble(self, word):
        """Undouble endings -kk, -dd, and -tt.

        Parameters
        ----------
        word : str
            The word to stem

        Returns
        -------
        str
            The word with doubled endings undoubled

        """
        if (
            len(word) > 1
            and word[-1] == word[-2]
            and word[-1] in {'d', 'k', 't'}
        ):
            return word[:-1]
        return word

    def stem(self, word):
        """Return Snowball Dutch stem.

        Parameters
        ----------
        word : str
            The word to stem

        Returns
        -------
        str
            Word stem

        Examples
        --------
        >>> stmr = SnowballDutch()
        >>> stmr.stem('lezen')
        'lez'
        >>> stmr.stem('opschorting')
        'opschort'
        >>> stmr.stem('ongrijpbaarheid')
        'ongrijp'

        """
        # Lowercase, NFC-normalize, then map umlauted and acute-accented
        # vowels onto their plain counterparts.
        word = normalize('NFC', text_type(word.lower()))
        word = word.translate(self._accented)

        # Prelude: mark an initial y, a y after a vowel, and an i between two
        # vowels as upper case so that later steps treat them as consonants.
        # The word is rewritten in place, so a freshly marked 'Y'/'I' no
        # longer counts as a vowel for the character that follows it.
        for i in range(len(word)):
            if i == 0:
                # The first character has no predecessor: only the
                # "initial y" rule can apply. (Testing word[i - 1] here would
                # wrap around to the last character of the word.)
                if word[0] == 'y':
                    word = 'Y' + word[1:]
            elif word[i] == 'y' and word[i - 1] in self._vowels:
                word = word[:i] + 'Y' + word[i + 1 :]
            elif (
                word[i] == 'i'
                and word[i - 1] in self._vowels
                and i + 1 < len(word)
                and word[i + 1] in self._vowels
            ):
                word = word[:i] + 'I' + word[i + 1 :]

        # R1 is never allowed to start before index 3. _sb_r1/_sb_r2 are not
        # defined in this class (presumably inherited from _Snowball).
        r1_start = max(3, self._sb_r1(word))
        r2_start = self._sb_r2(word)

        # Step 1: -heden -> -heid; strip -ene/-en (valid en-ending) and
        # -se/-s (valid s-ending), each only when the suffix lies in R1.
        if word[-5:] == 'heden':
            if len(word[r1_start:]) >= 5:
                word = word[:-3] + 'id'
        elif word[-3:] == 'ene':
            if len(word[r1_start:]) >= 3 and (
                word[-4] not in self._vowels and word[-6:-3] != 'gem'
            ):
                word = self._undouble(word[:-3])
        elif word[-2:] == 'en':
            if len(word[r1_start:]) >= 2 and (
                word[-3] not in self._vowels and word[-5:-2] != 'gem'
            ):
                word = self._undouble(word[:-2])
        elif word[-2:] == 'se':
            if (
                len(word[r1_start:]) >= 2
                and word[-3] not in self._not_s_endings
            ):
                word = word[:-2]
        elif word[-1:] == 's':
            if (
                len(word[r1_start:]) >= 1
                and word[-2] not in self._not_s_endings
            ):
                word = word[:-1]

        # Step 2: strip a final -e in R1 that follows a non-vowel. Remember
        # whether this happened, because the -bar rule in step 3b needs it.
        e_removed = False
        if word[-1:] == 'e':
            if len(word[r1_start:]) >= 1 and word[-2] not in self._vowels:
                word = self._undouble(word[:-1])
                e_removed = True

        # Step 3a: strip -heid in R2 unless preceded by c, then treat a
        # now-final -en as in step 1.
        if word[-4:] == 'heid':
            if len(word[r2_start:]) >= 4 and word[-5] != 'c':
                word = word[:-4]
                if word[-2:] == 'en':
                    if len(word[r1_start:]) >= 2 and (
                        word[-3] not in self._vowels and word[-5:-2] != 'gem'
                    ):
                        word = self._undouble(word[:-2])

        # Step 3b: derivational suffixes, each required to lie in R2.
        if word[-4:] == 'lijk':
            if len(word[r2_start:]) >= 4:
                word = word[:-4]
                # Repeat step 2
                if word[-1:] == 'e':
                    if (
                        len(word[r1_start:]) >= 1
                        and word[-2] not in self._vowels
                    ):
                        word = self._undouble(word[:-1])
        elif word[-4:] == 'baar':
            if len(word[r2_start:]) >= 4:
                word = word[:-4]
        elif word[-3:] in ('end', 'ing'):
            if len(word[r2_start:]) >= 3:
                word = word[:-3]
                if (
                    word[-2:] == 'ig'
                    and len(word[r2_start:]) >= 2
                    and word[-3] != 'e'
                ):
                    word = word[:-2]
                else:
                    word = self._undouble(word)
        elif word[-3:] == 'bar':
            # -bar is only removed when step 2 actually removed an -e.
            if len(word[r2_start:]) >= 3 and e_removed:
                word = word[:-3]
        elif word[-2:] == 'ig':
            if len(word[r2_start:]) >= 2 and word[-3] != 'e':
                word = word[:-2]

        # Step 4: undouble a vowel. If the word ends
        # non-vowel + doubled a/e/o/u + non-vowel (other than the 'I'
        # marker), drop one of the doubled vowels.
        if (
            len(word) >= 4
            and word[-3] == word[-2]
            and word[-2] in {'a', 'e', 'o', 'u'}
            and word[-4] not in self._vowels
            and word[-1] not in self._vowels
            and word[-1] != 'I'
        ):
            word = word[:-2] + word[-1]

        # Change 'Y' and 'I' back to lowercase if they survived stemming
        return word.replace('Y', 'y').replace('I', 'i')


def sb_dutch(word):
    """Return Snowball Dutch stem.

    Convenience function: builds a fresh :py:class:`SnowballDutch` and
    delegates to :py:meth:`SnowballDutch.stem`.

    Parameters
    ----------
    word : str
        The word to stem

    Returns
    -------
    str
        Word stem

    Examples
    --------
    >>> sb_dutch('lezen')
    'lez'
    >>> sb_dutch('opschorting')
    'opschort'
    >>> sb_dutch('ongrijpbaarheid')
    'ongrijp'

    """
    stemmer = SnowballDutch()
    return stemmer.stem(word)


if __name__ == '__main__':
    # When executed directly as a script, run the doctest examples embedded
    # in this module's docstrings.
    from doctest import testmod

    testmod()


1		# -*- coding: utf-8 -*-
2
3		# Copyright 2014-2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.stemmer._snowball_dutch.
20
21		Snowball Dutch stemmer
22		"""
23
24	1	from __future__ import (
25		absolute_import,
26		division,
27		print_function,
28		unicode_literals,
29		)
30
31	1	from unicodedata import normalize
32
33	1	from six import text_type
34	1	from six.moves import range
35
36	1	from ._snowball import _Snowball
37
38	1	__all__ = ['SnowballDutch', 'sb_dutch']
39
40
41	1	class SnowballDutch(_Snowball):
		0 ignored issues – show Unused Code introduced 2018-11-05 04:21 UTC by Report Bug Copy Issue Report The variable `__class__` seems to be unused. Loading history...
42		"""Snowball Dutch stemmer.
43
44		The Snowball Dutch stemmer is defined at:
45		http://snowball.tartarus.org/algorithms/dutch/stemmer.html
46		"""
47
48	1	_vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'è'}
49	1	_not_s_endings = {'a', 'e', 'i', 'j', 'o', 'u', 'y', 'è'}
50	1	_accented = dict(zip((ord(_) for _ in 'äëïöüáéíóú'), 'aeiouaeiou'))
		0 ignored issues – show Comprehensibility Best Practice introduced 2018-08-02 19:04 UTC by Report Bug Copy Issue Report The variable `_` does not seem to be defined. Loading history...
51
52	1	def _undouble(self, word):
		0 ignored issues – show Coding Style introduced 2018-11-05 04:21 UTC by Report Bug Copy Issue Report This method could be written as a function/class method. If a method does not access any attributes of the class, it could also be implemented as a function or static method. This can help improve readability. For example class Foo: def some_method(self, x, y): return x + y; could be written as class Foo: @classmethod def some_method(cls, x, y): return x + y; Loading history...
53		"""Undouble endings -kk, -dd, and -tt.
54
55		Parameters
56		----------
57		word : str
58		The word to stem
59
60		Returns
61		-------
62		str
63		The word with doubled endings undoubled
64
65		"""
66	1	if (
67		len(word) > 1
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
68		and word[-1] == word[-2]
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
69		and word[-1] in {'d', 'k', 't'}
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
70		):
71	1	return word[:-1]
72	1	return word
73
74	1	def stem(self, word):
		0 ignored issues – show Bug introduced 2018-11-05 04:21 UTC by Report Bug Copy Issue Report Parameters differ from overridden 'stem' method Loading history...
75		"""Return Snowball Dutch stem.
76
77		Parameters
78		----------
79		word : str
80		The word to stem
81
82		Returns
83		-------
84		str
85		Word stem
86
87		Examples
88		--------
89		>>> stmr = SnowballDutch()
90		>>> stmr.stem('lezen')
91		'lez'
92		>>> stmr.stem('opschorting')
93		'opschort'
94		>>> stmr.stem('ongrijpbaarheid')
95		'ongrijp'
96
97		"""
98		# lowercase, normalize, decompose, filter umlauts & acutes out, and
99		# compose
100	1	word = normalize('NFC', text_type(word.lower()))
101	1	word = word.translate(self._accented)
102
103	1	for i in range(len(word)):
		0 ignored issues – show unused-code introduced 2018-08-02 19:04 UTC by Report Bug Copy Issue Report Consider using enumerate instead of iterating with range and len Loading history...
104	1	if i == 0 and word[0] == 'y':
105	1	word = 'Y' + word[1:]
106	1	elif word[i] == 'y' and word[i - 1] in self._vowels:
107	1	word = word[:i] + 'Y' + word[i + 1 :]
108	1	elif (
109		word[i] == 'i'
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
110		and word[i - 1] in self._vowels
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
111		and i + 1 < len(word)
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
112		and word[i + 1] in self._vowels
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
113		):
114	1	word = word[:i] + 'I' + word[i + 1 :]
115
116	1	r1_start = max(3, self._sb_r1(word))
117	1	r2_start = self._sb_r2(word)
118
119		# Step 1
120	1	if word[-5:] == 'heden':
121	1	if len(word[r1_start:]) >= 5:
122	1	word = word[:-3] + 'id'
123	1	elif word[-3:] == 'ene':
124	1	if len(word[r1_start:]) >= 3 and (
125		word[-4] not in self._vowels and word[-6:-3] != 'gem'
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
126		):
127	1	word = self._undouble(word[:-3])
128	1	elif word[-2:] == 'en':
129	1	if len(word[r1_start:]) >= 2 and (
130		word[-3] not in self._vowels and word[-5:-2] != 'gem'
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
131		):
132	1	word = self._undouble(word[:-2])
133	1	elif word[-2:] == 'se':
134	1	if (
135		len(word[r1_start:]) >= 2
		0 ignored issues – show Coding Style introduced 2018-11-05 04:21 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
136		and word[-3] not in self._not_s_endings
		0 ignored issues – show Coding Style introduced 2018-11-05 04:21 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
137		):
138	1	word = word[:-2]
139	1	elif word[-1:] == 's':
140	1	if (
141		len(word[r1_start:]) >= 1
		0 ignored issues – show Coding Style introduced 2018-11-05 04:21 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
142		and word[-2] not in self._not_s_endings
		0 ignored issues – show Coding Style introduced 2018-11-05 04:21 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
143		):
144	1	word = word[:-1]
145
146		# Step 2
147	1	e_removed = False
148	1	if word[-1:] == 'e':
149	1	if len(word[r1_start:]) >= 1 and word[-2] not in self._vowels:
150	1	word = self._undouble(word[:-1])
151	1	e_removed = True
152
153		# Step 3a
154	1	if word[-4:] == 'heid':
155	1	if len(word[r2_start:]) >= 4 and word[-5] != 'c':
156	1	word = word[:-4]
157	1	if word[-2:] == 'en':
158	1	if len(word[r1_start:]) >= 2 and (
159		word[-3] not in self._vowels and word[-5:-2] != 'gem'
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
160		):
161	1	word = self._undouble(word[:-2])
162
163		# Step 3b
164	1	if word[-4:] == 'lijk':
165	1	if len(word[r2_start:]) >= 4:
166	1	word = word[:-4]
167		# Repeat step 2
168	1	if word[-1:] == 'e':
169	1	if (
170		len(word[r1_start:]) >= 1
		0 ignored issues – show Coding Style introduced 2018-11-05 04:21 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
171		and word[-2] not in self._vowels
		0 ignored issues – show Coding Style introduced 2018-11-05 04:21 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
172		):
173	1	word = self._undouble(word[:-1])
174	1	elif word[-4:] == 'baar':
175	1	if len(word[r2_start:]) >= 4:
176	1	word = word[:-4]
177	1	elif word[-3:] in ('end', 'ing'):
178	1	if len(word[r2_start:]) >= 3:
179	1	word = word[:-3]
180	1	if (
181		word[-2:] == 'ig'
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
182		and len(word[r2_start:]) >= 2
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
183		and word[-3] != 'e'
		0 ignored issues – show Coding Style introduced 2018-11-05 04:21 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
184		):
185	1	word = word[:-2]
186		else:
187	1	word = self._undouble(word)
188	1	elif word[-3:] == 'bar':
189	1	if len(word[r2_start:]) >= 3 and e_removed:
190	1	word = word[:-3]
191	1	elif word[-2:] == 'ig':
192	1	if len(word[r2_start:]) >= 2 and word[-3] != 'e':
193	1	word = word[:-2]
194
195		# Step 4
196	1	if (
197		len(word) >= 4
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history... best-practice introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Too many boolean expressions in if statement (6/5) Loading history...
198		and word[-3] == word[-2]
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
199		and word[-2] in {'a', 'e', 'o', 'u'}
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
200		and word[-4] not in self._vowels
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
201		and word[-1] not in self._vowels
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
202		and word[-1] != 'I'
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
203		):
204	1	word = word[:-2] + word[-1]
205
206		# Change 'Y' and 'I' back to lowercase if survived stemming
207	1	for i in range(0, len(word)):
208	1	if word[i] == 'Y':
209	1	word = word[:i] + 'y' + word[i + 1 :]
210	1	elif word[i] == 'I':
211	1	word = word[:i] + 'i' + word[i + 1 :]
212
213	1	return word
214
215
216	1	def sb_dutch(word):
217		"""Return Snowball Dutch stem.
218
219		This is a wrapper for :py:meth:`SnowballDutch.stem`.
220
221		Parameters
222		----------
223		word : str
224		The word to stem
225
226		Returns
227		-------
228		str
229		Word stem
230
231		Examples
232		--------
233		>>> sb_dutch('lezen')
234		'lez'
235		>>> sb_dutch('opschorting')
236		'opschort'
237		>>> sb_dutch('ongrijpbaarheid')
238		'ongrijp'
239
240		"""
241	1	return SnowballDutch().stem(word)
242
243
244		if __name__ == '__main__':
245		import doctest
246
247		doctest.testmod()
248

chrislit / abydos

Push — master ( f43547...71985b )

abydos.stemmer._snowball_dutch F

Complexity

Size/Duplication

Test Coverage

Importance

1 Function

2 Methods

How to fix Complexity

Complexity

Duplication Side-by-Side

Filter issues like