abydos.stemmer._clef - Code Metrics - Inspection of "78a222a9f7d8976f6744d263e3d6d01a2a991c27" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Branch — master (78a222)

by Chris

created 2018-10-26 11:30 UTC

abydos.stemmer._clef A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	178
Duplicated Lines	0 %

Test Coverage

Coverage

100%

Importance

Changes

Metric	Value
wmc	35
eloc	73
dl	0
loc	178
ccs	59
cts	59
cp	1
rs	9.6
c	0
b	0
f	0

3 Functions

Rating	Name	Size	Complexity
B	clef_german()	36	7
F	clef_german_plus()	49	15
D	clef_swedish()	46	13

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.stemmer._clef.

The stemmer._clef module defines CLEF stemmers for:

    - German
    - German plus
    - Swedish
"""

from __future__ import unicode_literals

from unicodedata import normalize

from six import text_type

__all__ = ['clef_german', 'clef_german_plus', 'clef_swedish']


def clef_german(word):
    """Stem a German word with the light CLEF German stemmer.

    The algorithm follows :cite:`Savoy:2005`: the word is lowercased and
    NFC-composed, umlauted vowels are replaced by their plain forms, and
    then at most one plural/inflection suffix is stripped.

    :param str word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> clef_german('lesen')
    'lese'
    >>> clef_german('graues')
    'grau'
    >>> clef_german('buchstabieren')
    'buchstabier'
    """
    # Lowercase first, then compose so that each umlaut is one code point
    # and can be hit by the translation table below.
    stem = normalize('NFC', text_type(word.lower()))

    # Map umlauted vowels onto their unaccented counterparts.
    plain_vowel = {ord('ä'): 'a', ord('ö'): 'o', ord('ü'): 'u'}
    stem = stem.translate(plain_vowel)

    # Index of the final character; all length thresholds are expressed
    # relative to this value.
    last = len(stem) - 1

    # Words of four characters or fewer are never stemmed.
    if last <= 3:
        return stem

    # Longest suffix first; only one suffix is ever removed.
    if last > 5 and stem.endswith('nen'):
        return stem[:-3]
    if last > 4 and stem.endswith(('en', 'se', 'es', 'er')):
        return stem[:-2]
    if stem.endswith(('e', 'n', 'r', 's')):
        return stem[:-1]
    return stem


def clef_german_plus(word):
    """Stem a German word with the 'CLEF German stemmer plus'.

    The algorithm follows :cite:`Savoy:2005`: the word is lowercased and
    NFC-composed, accented vowels are replaced by their plain forms, and
    then two successive suffix-stripping steps are applied, each removing
    at most one suffix.

    :param str word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> clef_german_plus('lesen')
    'les'
    >>> clef_german_plus('graues')
    'grau'
    >>> clef_german_plus('buchstabieren')
    'buchstabi'
    """
    # Consonants that must precede a final 's' (step 1) or 'st' (step 2)
    # for that ending to be stripped.
    st_preceders = frozenset('bdfghklmnt')

    # Lowercase first, then compose so that each accented vowel is one
    # code point and can be hit by the translation table below.
    stem = normalize('NFC', text_type(word.lower()))

    # Build the accent-stripping table: every accented variant maps to its
    # base vowel.
    accent_map = {}
    for plain, accented in (
        ('a', 'äàáâ'),
        ('o', 'öòóô'),
        ('i', 'ïìíî'),
        ('u', 'üùúû'),
    ):
        for char in accented:
            accent_map[ord(char)] = plain
    stem = stem.translate(accent_map)

    # Step 1: decide how many trailing characters to drop (longest
    # candidate first), then drop them.
    last = len(stem) - 1
    cut = 0
    if last > 4 and stem.endswith('ern'):
        cut = 3
    elif last > 3 and stem.endswith(('em', 'en', 'er', 'es')):
        cut = 2
    elif last > 2 and (
        stem.endswith('e')
        or (stem.endswith('s') and stem[-2] in st_preceders)
    ):
        cut = 1
    if cut:
        stem = stem[:-cut]

    # Step 2: same idea, applied to the result of step 1.
    last = len(stem) - 1
    cut = 0
    if last > 4 and stem.endswith('est'):
        cut = 3
    elif last > 3 and (
        stem.endswith(('er', 'en'))
        or (stem.endswith('st') and stem[-3] in st_preceders)
    ):
        cut = 2
    if cut:
        stem = stem[:-cut]

    return stem


def clef_swedish(word):
    """Stem a Swedish word with the light CLEF Swedish stemmer.

    The algorithm follows :cite:`Savoy:2005`: an optional final 's' is
    dropped, then the longest matching suffix (if the word is long enough
    for it) is removed. The word is used as given; no case folding or
    Unicode normalization is performed.

    :param str word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> clef_swedish('undervisa')
    'undervis'
    >>> clef_swedish('suspension')
    'suspensio'
    >>> clef_swedish('visshet')
    'viss'
    """
    # Candidate suffixes grouped by length, longest first.
    suffixes_by_size = (
        (5, ('elser', 'heten')),
        (4, ('arne', 'erna', 'ande', 'else', 'aste', 'orna', 'aren')),
        (3, ('are', 'ast', 'het')),
        (2, ('ar', 'er', 'or', 'en', 'at', 'te', 'et')),
        (1, ('a', 'e', 'n', 't')),
    )

    # Index of the final character; thresholds are relative to this.
    last = len(word) - 1

    # Drop a trailing 's' before looking for the other suffixes.
    if last > 3 and word.endswith('s'):
        word = word[:-1]
        last -= 1

    # A suffix of length k is only removed when the last index exceeds
    # k + 1; the first (longest) match wins.
    for size, endings in suffixes_by_size:
        if last > size + 1 and word.endswith(endings):
            return word[:-size]
    return word


if __name__ == '__main__':
    # When run as a script, execute the doctest examples embedded in the
    # docstrings of this module.
    from doctest import testmod

    testmod()


1		# -*- coding: utf-8 -*-
2
3		# Copyright 2014-2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.stemmer._clef.
20
21		The stemmer._clef module defines CLEF stemmers for:
22
23		- German
24		- German plus
25		- Swedish
26		"""
27
28	1	from __future__ import unicode_literals
29
30	1	from unicodedata import normalize
31
32	1	from six import text_type
33
34	1	__all__ = ['clef_german', 'clef_german_plus', 'clef_swedish']
35
36
37	1	def clef_german(word):
38		"""Return CLEF German stem.
39
40		The CLEF German stemmer is defined at :cite:`Savoy:2005`.
41
42		:param str word: the word to calculate the stem of
43		:returns: word stem
44		:rtype: str
45
46		>>> clef_german('lesen')
47		'lese'
48		>>> clef_german('graues')
49		'grau'
50		>>> clef_german('buchstabieren')
51		'buchstabier'
52		"""
53		# lowercase, normalize, and compose
54	1	word = normalize('NFC', text_type(word.lower()))
55
56		# remove umlauts
57	1	_umlauts = dict(zip((ord(_) for _ in 'äöü'), 'aou'))
		0 ignored issues – show Comprehensibility Best Practice introduced 2018-08-02 19:04 UTC by Report Bug Copy Issue Report The variable `_` does not seem to be defined. Loading history...
58	1	word = word.translate(_umlauts)
59
60		# remove plurals
61	1	wlen = len(word) - 1
62
63	1	if wlen > 3:
64	1	if wlen > 5:
65	1	if word[-3:] == 'nen':
66	1	return word[:-3]
67	1	if wlen > 4:
68	1	if word[-2:] in {'en', 'se', 'es', 'er'}:
69	1	return word[:-2]
70	1	if word[-1] in {'e', 'n', 'r', 's'}:
71	1	return word[:-1]
72	1	return word
73
74
75	1	def clef_german_plus(word):
76		"""Return 'CLEF German stemmer plus' stem.
77
78		The CLEF German stemmer plus is defined at :cite:`Savoy:2005`.
79
80		:param str word: the word to calculate the stem of
81		:returns: word stem
82		:rtype: str
83
84		>>> clef_german_plus('lesen')
85		'les'
86		>>> clef_german_plus('graues')
87		'grau'
88		>>> clef_german_plus('buchstabieren')
89		'buchstabi'
90		"""
91	1	_st_ending = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 't'}
92
93		# lowercase, normalize, and compose
94	1	word = normalize('NFC', text_type(word.lower()))
95
96		# remove umlauts
97	1	_accents = dict(
98		zip((ord(_) for _ in 'äàáâöòóôïìíîüùúû'), 'aaaaooooiiiiuuuu')
		0 ignored issues – show Comprehensibility Best Practice introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report The variable `_` does not seem to be defined. Loading history...
99		)
100	1	word = word.translate(_accents)
101
102		# Step 1
103	1	wlen = len(word) - 1
104	1	if wlen > 4 and word[-3:] == 'ern':
105	1	word = word[:-3]
106	1	elif wlen > 3 and word[-2:] in {'em', 'en', 'er', 'es'}:
107	1	word = word[:-2]
108	1	elif wlen > 2 and (
109		word[-1] == 'e' or (word[-1] == 's' and word[-2] in _st_ending)
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
110		):
111	1	word = word[:-1]
112
113		# Step 2
114	1	wlen = len(word) - 1
115	1	if wlen > 4 and word[-3:] == 'est':
116	1	word = word[:-3]
117	1	elif wlen > 3 and (
118		word[-2:] in {'er', 'en'}
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
119		or (word[-2:] == 'st' and word[-3] in _st_ending)
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
120		):
121	1	word = word[:-2]
122
123	1	return word
124
125
126	1	def clef_swedish(word):
127		"""Return CLEF Swedish stem.
128
129		The CLEF Swedish stemmer is defined at :cite:`Savoy:2005`.
130
131		:param str word: the word to calculate the stem of
132		:returns: word stem
133		:rtype: str
134
135		>>> clef_swedish('undervisa')
136		'undervis'
137		>>> clef_swedish('suspension')
138		'suspensio'
139		>>> clef_swedish('visshet')
140		'viss'
141		"""
142	1	wlen = len(word) - 1
143
144	1	if wlen > 3 and word[-1] == 's':
145	1	word = word[:-1]
146	1	wlen -= 1
147
148	1	if wlen > 6:
149	1	if word[-5:] in {'elser', 'heten'}:
150	1	return word[:-5]
151	1	if wlen > 5:
152	1	if word[-4:] in {
153		'arne',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
154		'erna',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
155		'ande',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
156		'else',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
157		'aste',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
158		'orna',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
159		'aren',
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
160		}:
161	1	return word[:-4]
162	1	if wlen > 4:
163	1	if word[-3:] in {'are', 'ast', 'het'}:
164	1	return word[:-3]
165	1	if wlen > 3:
166	1	if word[-2:] in {'ar', 'er', 'or', 'en', 'at', 'te', 'et'}:
167	1	return word[:-2]
168	1	if wlen > 2:
169	1	if word[-1] in {'a', 'e', 'n', 't'}:
170	1	return word[:-1]
171	1	return word
172
173
174		if __name__ == '__main__':
175		import doctest
176
177		doctest.testmod()
178

chrislit / abydos

Branch — master (78a222)

abydos.stemmer._clef A

Complexity

Size/Duplication

Test Coverage

Importance

3 Functions

Duplication Side-by-Side

Filter issues like