abydos.stemmer._caumanns.caumanns() - Code Metrics - Inspection of "78a222a9f7d8976f6744d263e3d6d01a2a991c27" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Branch — master (78a222)

by Chris

created 2018-10-26 11:30 UTC

abydos.stemmer._caumanns.caumanns() F

↳ Parent: abydos.stemmer._caumanns

Complexity

Conditions

Size

Total Lines	91
Code Lines	46

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	40
CRAP Score	17

Importance

Changes

Metric	Value
eloc	46
dl	0
loc	91
ccs	40
cts	40
cp	1
rs	1.8
c	0
b	0
f	0
cc	17
nop	1
crap	17

How to fix Long Method Complexity

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.stemmer._caumanns.

The stemmer._caumanns module defines the Caumanns German stemmer.
"""

from __future__ import unicode_literals

from unicodedata import normalize

from six import text_type
from six.moves import range

__all__ = ['caumanns']


# Code points of the German umlauts mapped to their plain base vowels.
_UMLAUTS = {ord('ä'): 'a', ord('ö'): 'o', ord('ü'): 'u'}

# Letter groups that are treated as single units while suffixes are stripped,
# each paired with the one-character placeholder standing in for it.
# Order matters going forward: 'sch' has to be replaced before 'ch'.
_PLACEHOLDERS = (
    ('sch', '$'),
    ('ch', '§'),
    ('ei', '%'),
    ('ie', '&'),
    ('ig', '#'),
    ('st', '!'),
)


def _mark_doubled(word):
    """Replace the second of two equal adjacent characters with ``*``.

    Each character is compared with the previously *emitted* character, so
    a run such as ``aaa`` becomes ``a*a``.

    :param str word: a non-empty word
    :returns: the word with doubled characters marked
    :rtype: str
    """
    marked = [word[0]]
    for char in word[1:]:
        marked.append('*' if marked[-1] == char else char)
    return ''.join(marked)


def _strip_suffixes(word, upper_initial):
    """Repeatedly strip inflectional suffixes from the end of the word.

    Two-character suffixes (em, er, nd) are tried before single-character
    ones (e, s, n, and, for words that were not capitalized, t and the
    ``st`` placeholder ``!``). Stripping stops as soon as the word is down
    to three characters or no suffix matches.

    :param str word: the word in its substituted (placeholder) form
    :param bool upper_initial: whether the original word was capitalized
    :returns: the word without the stripped suffixes
    :rtype: str
    """
    while len(word) > 3:
        length = len(word)
        if (length > 4 and word.endswith(('em', 'er'))) or (
            length > 5 and word.endswith('nd')
        ):
            word = word[:-2]
        elif word.endswith(('e', 's', 'n')) or (
            not upper_initial and word.endswith(('t', '!'))
        ):
            word = word[:-1]
        else:
            break
    return word


def _expand_doubled(word):
    """Turn each ``*`` marker back into a copy of the preceding character.

    :param str word: a non-empty word possibly containing ``*`` markers
    :returns: the word with the markers expanded
    :rtype: str
    """
    return word[0] + ''.join(
        prev if char == '*' else char for prev, char in zip(word, word[1:])
    )


def caumanns(word):
    """Return Caumanns German stem.

    Jörg Caumanns' stemmer is described in his article in
    :cite:`Caumanns:1999`.

    This implementation is based on the GermanStemFilter described at
    :cite:`Lang:2013`.

    The word is lowercased and NFC-normalized, umlauts and ß are folded to
    plain letters, doubled letters and a few letter groups are replaced by
    one-character placeholders, suffixes are stripped, and finally the
    placeholders are expanded again.

    :param str word: the word to calculate the stem of
    :returns: word stem (the empty string for empty input)
    :rtype: str

    >>> caumanns('lesen')
    'les'
    >>> caumanns('graues')
    'grau'
    >>> caumanns('buchstabieren')
    'buchstabier'
    """
    if not word:
        return ''

    # Capitalization has to be recorded before lowercasing: a final t/st is
    # only stripped from words that did not start with a capital letter.
    upper_initial = word[0].isupper()
    word = normalize('NFC', text_type(word.lower()))

    # Substitution: fold umlauts to their base vowels and ß to ss, ...
    word = word.translate(_UMLAUTS).replace('ß', 'ss')
    # ... mark the second of doubled characters with *, ...
    word = _mark_doubled(word)
    # ... and replace sch, ch, ei, ie, ig, st with $, §, %, &, #, !
    for letters, placeholder in _PLACEHOLDERS:
        word = word.replace(letters, placeholder)

    # Context-free stripping of the suffixes em, er, nd, e, s, n, t (and st)
    word = _strip_suffixes(word, upper_initial)

    # Additional optimizations:
    # drop the doubling marker left behind after '-erin'
    if len(word) > 5 and word.endswith('erin*'):
        word = word[:-1]
    # a stem-final z becomes x
    if word[-1] == 'z':
        word = word[:-1] + 'x'

    # Reverse substitutions, then expand the doubled-character markers
    for letters, placeholder in _PLACEHOLDERS:
        word = word.replace(placeholder, letters)
    word = _expand_doubled(word)

    # Finally, convert the first gege to ge
    if len(word) > 4:
        word = word.replace('gege', 'ge', 1)

    return word


if __name__ == '__main__':
    # When run as a script, execute the docstring examples as a self-test.
    from doctest import testmod

    testmod()


1		# -- coding: utf-8 --
2
3		# Copyright 2014-2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.stemmer._caumanns.
20
21		The stemmer._caumanns module defines the Caumanns German stemmer.
22		"""
23
24	1	from __future__ import unicode_literals
25
26	1	from unicodedata import normalize
27
28	1	from six import text_type
29	1	from six.moves import range
30
31	1	__all__ = ['caumanns']
32
33
34	1	def caumanns(word):
35		"""Return Caumanns German stem.
36
37		Jörg Caumanns' stemmer is described in his article in
38		:cite:`Caumanns:1999`.
39
40		This implementation is based on the GermanStemFilter described at
41		:cite:`Lang:2013`.
42
43		:param str word: the word to calculate the stem of
44		:returns: word stem
45		:rtype: str
46
47		>>> caumanns('lesen')
48		'les'
49		>>> caumanns('graues')
50		'grau'
51		>>> caumanns('buchstabieren')
52		'buchstabier'
53		"""
54	1	if not word:
55	1	return ''
56
57	1	upper_initial = word[0].isupper()
58	1	word = normalize('NFC', text_type(word.lower()))
59
60		# # Part 2: Substitution
61		# 1. Change umlauts to corresponding vowels & ß to ss
62	1	_umlauts = dict(zip((ord(_) for _ in 'äöü'), 'aou'))
		0 ignored issues – show Comprehensibility Best Practice introduced 2018-08-02 19:04 UTC by Report Bug Copy Issue Report The variable `_` does not seem to be defined. Loading history...
63	1	word = word.translate(_umlauts)
64	1	word = word.replace('ß', 'ss')
65
66		# 2. Change second of doubled characters to *
67	1	new_word = word[0]
68	1	for i in range(1, len(word)):
69	1	if new_word[i - 1] == word[i]:
70	1	new_word += '*'
71		else:
72	1	new_word += word[i]
73	1	word = new_word
74
75		# 3. Replace sch, ch, ei, ie with $, §, %, &
76	1	word = word.replace('sch', '$')
77	1	word = word.replace('ch', '§')
78	1	word = word.replace('ei', '%')
79	1	word = word.replace('ie', '&')
80	1	word = word.replace('ig', '#')
81	1	word = word.replace('st', '!')
82
83		# # Part 1: Recursive Context-Free Stripping
84		# 1. Remove the following 7 suffixes recursively
85	1	while len(word) > 3:
86	1	if (len(word) > 4 and word[-2:] in {'em', 'er'}) or (
87		len(word) > 5 and word[-2:] == 'nd'
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
88		):
89	1	word = word[:-2]
90	1	elif (word[-1] in {'e', 's', 'n'}) or (
91		not upper_initial and word[-1] in {'t', '!'}
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
92		):
93	1	word = word[:-1]
94		else:
95	1	break
96
97		# Additional optimizations:
98	1	if len(word) > 5 and word[-5:] == 'erin*':
99	1	word = word[:-1]
100	1	if word[-1] == 'z':
101	1	word = word[:-1] + 'x'
102
103		# Reverse substitutions:
104	1	word = word.replace('$', 'sch')
105	1	word = word.replace('§', 'ch')
106	1	word = word.replace('%', 'ei')
107	1	word = word.replace('&', 'ie')
108	1	word = word.replace('#', 'ig')
109	1	word = word.replace('!', 'st')
110
111		# Expand doubled
112	1	word = ''.join(
113		[word[0]]
114		+ [
115		word[i - 1] if word[i] == '*' else word[i]
116		for i in range(1, len(word))
117		]
118		)
119
120		# Finally, convert gege to ge
121	1	if len(word) > 4:
122	1	word = word.replace('gege', 'ge', 1)
123
124	1	return word
125
126
127		if __name__ == '__main__':
128		import doctest
129
130		doctest.testmod()
131

chrislit / abydos

Branch — master (78a222)

abydos.stemmer._caumanns.caumanns() F

Complexity

Size

Duplication

Code Coverage

Importance

How to fix Long Method Complexity

Long Method

Complexity

Duplication Side-by-Side

Filter issues like