abydos.stemmer._snowball_german.sb_german() - Code Metrics - Inspection of "Merge pull request #149 from chrislit/0.3.6" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( f43547...71985b )

by Chris

created 2018-11-17 08:52 UTC

abydos.stemmer._snowball_german.sb_german() A

↳ Parent: abydos.stemmer._snowball_german

Complexity

Conditions

Size

Total Lines	28
Code Lines	2

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	2
CRAP Score	1

Importance

Changes

Metric	Value
cc	1
eloc	2
nop	2
dl	0
loc	28
ccs	2
cts	2
cp	1
crap	1
rs	10
c	0
b	0
f	0

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.stemmer._snowball_german.

Snowball German stemmer
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from unicodedata import normalize

from six.moves import range

from ._snowball import _Snowball

__all__ = ['SnowballGerman', 'sb_german']


class SnowballGerman(_Snowball):
    """Stemmer implementing the Snowball algorithm for German.

    The algorithm is specified at:
    http://snowball.tartarus.org/algorithms/german/stemmer.html
    """

    # Letters counted as vowels when marking u/y that sit between vowels
    _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'ä', 'ö', 'ü'}
    # Letters that must precede a final "s" for step 1 to drop it
    _s_endings = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 'r', 't'}
    # Letters that must precede a final "st" for step 2 to drop it
    _st_endings = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 't'}

    def stem(self, word, alternate_vowels=False):
        """Return the Snowball German stem of a word.

        Parameters
        ----------
        word : str
            The word to stem
        alternate_vowels : bool
            If True, ae/oe/ue are rewritten as ä/ö/ü (ue is left alone when
            it is part of "que") before the suffix-stripping steps run

        Returns
        -------
        str
            Word stem

        Examples
        --------
        >>> stmr = SnowballGerman()
        >>> stmr.stem('lesen')
        'les'
        >>> stmr.stem('graues')
        'grau'
        >>> stmr.stem('buchstabieren')
        'buchstabi'

        """
        # Prelude: lowercase, compose to NFC, and expand eszett
        word = normalize('NFC', word.lower()).replace('ß', 'ss')

        # Upper-case every u or y that has a vowel on both sides. The scan
        # reads the already-updated characters, so a freshly marked 'U'/'Y'
        # no longer counts as a vowel for its right-hand neighbour.
        marked = {'u': 'U', 'y': 'Y'}
        chars = list(word)
        for pos in range(1, len(chars) - 1):
            middle = chars[pos]
            if (
                middle in marked
                and chars[pos + 1] in self._vowels
                and chars[pos - 1] in self._vowels
            ):
                chars[pos] = marked[middle]
        word = ''.join(chars)

        if alternate_vowels:
            # 'Q' is a temporary placeholder that shields "que" from the
            # ue -> ü rewrite; the last pair restores it.
            for old, new in (
                ('ae', 'ä'),
                ('oe', 'ö'),
                ('que', 'Q'),
                ('ue', 'ü'),
                ('Q', 'que'),
            ):
                word = word.replace(old, new)

        # Region offsets are computed once and reused while the word shrinks
        r1 = max(3, self._sb_r1(word))
        r2 = self._sb_r2(word)

        # Step 1: first matching suffix wins; the flag records whether an
        # e/en/es ending was actually removed (enables the "niss" fix-up).
        strip_niss = False
        for suffix, sets_flag in (
            ('ern', False),
            ('em', False),
            ('er', False),
            ('en', True),
            ('es', True),
            ('e', True),
        ):
            if word.endswith(suffix):
                if len(word[r1:]) >= len(suffix):
                    word = word[: -len(suffix)]
                    strip_niss = sets_flag
                break
        else:
            # Reached only when none of the suffixes above matched
            if (
                word.endswith('s')
                and len(word[r1:]) >= 1
                and len(word) >= 2
                and word[-2] in self._s_endings
            ):
                word = word[:-1]

        if strip_niss and word.endswith('niss'):
            word = word[:-1]

        # Step 2: same first-match-wins scheme as step 1
        for suffix in ('est', 'en', 'er'):
            if word.endswith(suffix):
                if len(word[r1:]) >= len(suffix):
                    word = word[: -len(suffix)]
                break
        else:
            if (
                word.endswith('st')
                and len(word[r1:]) >= 2
                and len(word) >= 6
                and word[-3] in self._st_endings
            ):
                word = word[:-2]

        # Step 3: derivational suffixes, checked against the R2 region
        if word.endswith('isch'):
            if len(word[r2:]) >= 4 and word[-5] != 'e':
                word = word[:-4]
        elif word.endswith(('lich', 'heit')):
            if len(word[r2:]) >= 4:
                word = word[:-4]
                if word.endswith(('er', 'en')) and len(word[r1:]) >= 2:
                    word = word[:-2]
        elif word.endswith('keit'):
            if len(word[r2:]) >= 4:
                word = word[:-4]
                if word.endswith('lich') and len(word[r2:]) >= 4:
                    word = word[:-4]
                elif word.endswith('ig') and len(word[r2:]) >= 2:
                    word = word[:-2]
        elif word.endswith(('end', 'ung')):
            if len(word[r2:]) >= 3:
                word = word[:-3]
                if (
                    word.endswith('ig')
                    and len(word[r2:]) >= 2
                    and word[-3] != 'e'
                ):
                    word = word[:-2]
        elif word.endswith(('ig', 'ik')):
            if len(word[r2:]) >= 2 and word[-3] != 'e':
                word = word[:-2]

        # Postlude: undo any surviving U/Y markers, then drop umlauts
        word = word.replace('Y', 'y').replace('U', 'u')
        return word.translate({ord('ä'): 'a', ord('ö'): 'o', ord('ü'): 'u'})


def sb_german(word, alternate_vowels=False):
    """Return the Snowball German stem of a word.

    Convenience function: builds a fresh :py:class:`SnowballGerman` and
    delegates to :py:meth:`SnowballGerman.stem`.

    Parameters
    ----------
    word : str
        The word to stem
    alternate_vowels : bool
        Composes ae as ä, oe as ö, and ue as ü before running the algorithm

    Returns
    -------
    str
        Word stem

    Examples
    --------
    >>> sb_german('lesen')
    'les'
    >>> sb_german('graues')
    'grau'
    >>> sb_german('buchstabieren')
    'buchstabi'

    """
    stemmer = SnowballGerman()
    return stemmer.stem(word, alternate_vowels)


# When executed directly as a script, run the doctests embedded in this
# module's docstrings.
if __name__ == '__main__':
    from doctest import testmod

    testmod()


1		# -*- coding: utf-8 -*-
2
3		# Copyright 2014-2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.stemmer._snowball_german.
20
21		Snowball German stemmer
22		"""
23
24	1	from __future__ import (
25		absolute_import,
26		division,
27		print_function,
28		unicode_literals,
29		)
30
31	1	from unicodedata import normalize
32
33	1	from six.moves import range
34
35	1	from ._snowball import _Snowball
36
37	1	__all__ = ['SnowballGerman', 'sb_german']
38
39
40	1	class SnowballGerman(_Snowball):
		0 ignored issues – show Unused Code introduced 2018-11-05 04:21 UTC by Report Bug Copy Issue Report The variable `__class__` seems to be unused. Loading history...
41		"""Snowball German stemmer.
42
43		The Snowball German stemmer is defined at:
44		http://snowball.tartarus.org/algorithms/german/stemmer.html
45		"""
46
47	1	_vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'ä', 'ö', 'ü'}
48	1	_s_endings = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 'r', 't'}
49	1	_st_endings = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 't'}
50
51	1	def stem(self, word, alternate_vowels=False):
		0 ignored issues – show Bug introduced 2018-11-05 04:21 UTC by Report Bug Copy Issue Report Parameters differ from overridden 'stem' method Loading history...
52		"""Return Snowball German stem.
53
54		Parameters
55		----------
56		word : str
57		The word to stem
58		alternate_vowels : bool
59		Composes ae as ä, oe as ö, and ue as ü before running the algorithm
60
61		Returns
62		-------
63		str
64		Word stem
65
66		Examples
67		--------
68		>>> stmr = SnowballGerman()
69		>>> stmr.stem('lesen')
70		'les'
71		>>> stmr.stem('graues')
72		'grau'
73		>>> stmr.stem('buchstabieren')
74		'buchstabi'
75
76		"""
77		# lowercase, normalize, and compose
78	1	word = normalize('NFC', word.lower())
79	1	word = word.replace('ß', 'ss')
80
81	1	if len(word) > 2:
82	1	for i in range(2, len(word)):
83	1	if word[i] in self._vowels and word[i - 2] in self._vowels:
84	1	if word[i - 1] == 'u':
85	1	word = word[: i - 1] + 'U' + word[i:]
86	1	elif word[i - 1] == 'y':
87	1	word = word[: i - 1] + 'Y' + word[i:]
88
89	1	if alternate_vowels:
90	1	word = word.replace('ae', 'ä')
91	1	word = word.replace('oe', 'ö')
92	1	word = word.replace('que', 'Q')
93	1	word = word.replace('ue', 'ü')
94	1	word = word.replace('Q', 'que')
95
96	1	r1_start = max(3, self._sb_r1(word))
97	1	r2_start = self._sb_r2(word)
98
99		# Step 1
100	1	niss_flag = False
101	1	if word[-3:] == 'ern':
102	1	if len(word[r1_start:]) >= 3:
103	1	word = word[:-3]
104	1	elif word[-2:] == 'em':
105	1	if len(word[r1_start:]) >= 2:
106	1	word = word[:-2]
107	1	elif word[-2:] == 'er':
108	1	if len(word[r1_start:]) >= 2:
109	1	word = word[:-2]
110	1	elif word[-2:] == 'en':
111	1	if len(word[r1_start:]) >= 2:
112	1	word = word[:-2]
113	1	niss_flag = True
114	1	elif word[-2:] == 'es':
115	1	if len(word[r1_start:]) >= 2:
116	1	word = word[:-2]
117	1	niss_flag = True
118	1	elif word[-1:] == 'e':
119	1	if len(word[r1_start:]) >= 1:
120	1	word = word[:-1]
121	1	niss_flag = True
122	1	elif word[-1:] == 's':
123	1	if (
124		len(word[r1_start:]) >= 1
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
125		and len(word) >= 2
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
126		and word[-2] in self._s_endings
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
127		):
128	1	word = word[:-1]
129
130	1	if niss_flag and word[-4:] == 'niss':
131	1	word = word[:-1]
132
133		# Step 2
134	1	if word[-3:] == 'est':
135	1	if len(word[r1_start:]) >= 3:
136	1	word = word[:-3]
137	1	elif word[-2:] == 'en':
138	1	if len(word[r1_start:]) >= 2:
139	1	word = word[:-2]
140	1	elif word[-2:] == 'er':
141	1	if len(word[r1_start:]) >= 2:
142	1	word = word[:-2]
143	1	elif word[-2:] == 'st':
144	1	if (
145		len(word[r1_start:]) >= 2
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
146		and len(word) >= 6
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
147		and word[-3] in self._st_endings
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
148		):
149	1	word = word[:-2]
150
151		# Step 3
152	1	if word[-4:] == 'isch':
153	1	if len(word[r2_start:]) >= 4 and word[-5] != 'e':
154	1	word = word[:-4]
155	1	elif word[-4:] in {'lich', 'heit'}:
156	1	if len(word[r2_start:]) >= 4:
157	1	word = word[:-4]
158	1	if word[-2:] in {'er', 'en'} and len(word[r1_start:]) >= 2:
159	1	word = word[:-2]
160	1	elif word[-4:] == 'keit':
161	1	if len(word[r2_start:]) >= 4:
162	1	word = word[:-4]
163	1	if word[-4:] == 'lich' and len(word[r2_start:]) >= 4:
164	1	word = word[:-4]
165	1	elif word[-2:] == 'ig' and len(word[r2_start:]) >= 2:
166	1	word = word[:-2]
167	1	elif word[-3:] in {'end', 'ung'}:
168	1	if len(word[r2_start:]) >= 3:
169	1	word = word[:-3]
170	1	if (
171		word[-2:] == 'ig'
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
172		and len(word[r2_start:]) >= 2
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
173		and word[-3] != 'e'
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
174		):
175	1	word = word[:-2]
176	1	elif word[-2:] in {'ig', 'ik'}:
177	1	if len(word[r2_start:]) >= 2 and word[-3] != 'e':
178	1	word = word[:-2]
179
180		# Change 'Y' and 'U' back to lowercase if survived stemming
181	1	for i in range(0, len(word)):
182	1	if word[i] == 'Y':
183	1	word = word[:i] + 'y' + word[i + 1 :]
184	1	elif word[i] == 'U':
185	1	word = word[:i] + 'u' + word[i + 1 :]
186
187		# Remove umlauts
188	1	_umlauts = dict(zip((ord(_) for _ in 'äöü'), 'aou'))
		0 ignored issues – show Comprehensibility Best Practice introduced 2018-08-02 19:04 UTC by Report Bug Copy Issue Report The variable `_` does not seem to be defined. Loading history...
189	1	word = word.translate(_umlauts)
190
191	1	return word
192
193
194	1	def sb_german(word, alternate_vowels=False):
195		"""Return Snowball German stem.
196
197		This is a wrapper for :py:meth:`SnowballGerman.stem`.
198
199		Parameters
200		----------
201		word : str
202		The word to stem
203		alternate_vowels : bool
204		Composes ae as ä, oe as ö, and ue as ü before running the algorithm
205
206		Returns
207		-------
208		str
209		Word stem
210
211		Examples
212		--------
213		>>> sb_german('lesen')
214		'les'
215		>>> sb_german('graues')
216		'grau'
217		>>> sb_german('buchstabieren')
218		'buchstabi'
219
220		"""
221	1	return SnowballGerman().stem(word, alternate_vowels)
222
223
224		if __name__ == '__main__':
225		import doctest
226
227		doctest.testmod()
228

chrislit / abydos

Push — master ( f43547...71985b )

abydos.stemmer._snowball_german.sb_german() A

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like