abydos.stemmer._caumanns.caumanns() - Code Metrics - Inspection of "Merge pull request #149 from chrislit/0.3.6" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( f43547...71985b )

by Chris

created 2018-11-17 08:52 UTC

abydos.stemmer._caumanns.caumanns() A

↳ Parent: abydos.stemmer._caumanns

Complexity

Conditions

Size

Total Lines	26
Code Lines	2

Duplication

Lines	0
Ratio	0 %

Code Coverage

Tests	2
CRAP Score	1

Importance

Changes

Metric	Value
cc	1
eloc	2
nop	1
dl	0
loc	26
ccs	2
cts	2
cp	1
crap	1
rs	10
c	0
b	0
f	0

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.stemmer._caumanns.

Caumanns German stemmer
"""

from __future__ import (
    absolute_import,
    division,
    print_function,
    unicode_literals,
)

from unicodedata import normalize

from six import text_type
from six.moves import range

from ._stemmer import _Stemmer

__all__ = ['Caumanns', 'caumanns']


class Caumanns(_Stemmer):
    """Caumanns German stemmer.

    The algorithm was published by Jörg Caumanns; see
    :cite:`Caumanns:1999`.

    The steps implemented here follow the GermanStemFilter described at
    :cite:`Lang:2013`.
    """

    # Translation table (code point -> replacement) mapping the three
    # lowercase umlauts to their plain vowels.
    _umlauts = {ord('ä'): 'a', ord('ö'): 'o', ord('ü'): 'u'}

    def stem(self, word):
        """Return the Caumanns German stem of a word.

        Parameters
        ----------
        word : str
            The word to stem

        Returns
        -------
        str
            Word stem, derived from the lowercased word; the empty string
            when ``word`` is empty

        Examples
        --------
        >>> stmr = Caumanns()
        >>> stmr.stem('lesen')
        'les'
        >>> stmr.stem('graues')
        'grau'
        >>> stmr.stem('buchstabieren')
        'buchstabier'

        """
        if not word:
            return ''

        # Capitalization has to be recorded before lowercasing: a final
        # 't'/'st' is only stripped from words that were not capitalized.
        upper_initial = word[0].isupper()
        word = normalize('NFC', text_type(word.lower()))

        # --- Substitution phase ---
        # Umlauts become their base vowels and ß becomes ss.
        word = word.translate(self._umlauts).replace('ß', 'ss')

        # The second character of every doubled pair is masked with '*'.
        # The comparison is made against the already-masked output, so a
        # run of three equal characters becomes e.g. 'a*a'.
        marked = [word[0]]
        for char in word[1:]:
            marked.append('*' if marked[-1] == char else char)
        word = ''.join(marked)

        # Multi-character clusters are collapsed into single placeholder
        # symbols so the stripping phase treats each as one character.
        # The order matters: 'sch' must be handled before 'ch'.
        digraphs = (
            ('sch', '$'),
            ('ch', '§'),
            ('ei', '%'),
            ('ie', '&'),
            ('ig', '#'),
            ('st', '!'),
        )
        for cluster, symbol in digraphs:
            word = word.replace(cluster, symbol)

        # --- Stripping phase ---
        # Suffixes are removed repeatedly until none applies or the word
        # has shrunk to three characters.
        while len(word) > 3:
            length = len(word)
            last = word[-1]
            strip_two = (length > 4 and word.endswith(('em', 'er'))) or (
                length > 5 and word.endswith('nd')
            )
            strip_one = last in ('e', 's', 'n') or (
                not upper_initial and last in ('t', '!')
            )
            if strip_two:
                word = word[:-2]
            elif strip_one:
                word = word[:-1]
            else:
                break

        # --- Additional optimizations ---
        # Drop the masked second 'n' of a trailing 'erinn'.
        if len(word) > 5 and word.endswith('erin*'):
            word = word[:-1]
        # A final 'z' is rewritten as 'x'.
        if word[-1] == 'z':
            word = word[:-1] + 'x'

        # --- Undo the substitutions ---
        for cluster, symbol in digraphs:
            word = word.replace(symbol, cluster)

        # Each '*' is replaced by the character that precedes it in the
        # (not yet expanded) word, restoring the doubled letters.
        expanded = [word[0]]
        for before, current in zip(word, word[1:]):
            expanded.append(before if current == '*' else current)
        word = ''.join(expanded)

        # Finally, reduce the first 'gege' to 'ge'.
        if len(word) > 4:
            word = word.replace('gege', 'ge', 1)

        return word


def caumanns(word):
    """Return the Caumanns German stem of a word.

    Convenience function that creates a :py:class:`Caumanns` instance and
    delegates to :py:meth:`Caumanns.stem`.

    Parameters
    ----------
    word : str
        The word to stem

    Returns
    -------
    str
        Word stem

    Examples
    --------
    >>> caumanns('lesen')
    'les'
    >>> caumanns('graues')
    'grau'
    >>> caumanns('buchstabieren')
    'buchstabier'

    """
    stemmer = Caumanns()
    return stemmer.stem(word)


if __name__ == '__main__':
    # When executed as a script, run the examples embedded in this
    # module's docstrings as doctests.
    from doctest import testmod

    testmod()


1		# -*- coding: utf-8 -*-
2
3		# Copyright 2014-2018 by Christopher C. Little.
4		# This file is part of Abydos.
5		#
6		# Abydos is free software: you can redistribute it and/or modify
7		# it under the terms of the GNU General Public License as published by
8		# the Free Software Foundation, either version 3 of the License, or
9		# (at your option) any later version.
10		#
11		# Abydos is distributed in the hope that it will be useful,
12		# but WITHOUT ANY WARRANTY; without even the implied warranty of
13		# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14		# GNU General Public License for more details.
15		#
16		# You should have received a copy of the GNU General Public License
17		# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19	1	"""abydos.stemmer._caumanns.
20
21		Caumanns German stemmer
22		"""
23
24	1	from __future__ import (
25		absolute_import,
26		division,
27		print_function,
28		unicode_literals,
29		)
30
31	1	from unicodedata import normalize
32
33	1	from six import text_type
34	1	from six.moves import range
35
36	1	from ._stemmer import _Stemmer
37
38	1	__all__ = ['Caumanns', 'caumanns']
39
40
41	1	class Caumanns(_Stemmer):
		0 ignored issues – show Unused Code introduced 2018-11-05 04:21 UTC by Report Bug Copy Issue Report The variable `__class__` seems to be unused. Loading history...
42		"""Caumanns stemmer.
43
44		Jörg Caumanns' stemmer is described in his article in
45		:cite:`Caumanns:1999`.
46
47		This implementation is based on the GermanStemFilter described at
48		:cite:`Lang:2013`.
49		"""
50
51	1	_umlauts = dict(zip((ord(_) for _ in 'äöü'), 'aou'))
		0 ignored issues – show Comprehensibility Best Practice introduced 2018-08-02 19:04 UTC by Report Bug Copy Issue Report The variable `_` does not seem to be defined. Loading history...
52
53	1	def stem(self, word):
		0 ignored issues – show Bug introduced 2018-11-05 04:21 UTC by Report Bug Copy Issue Report Parameters differ from overridden 'stem' method Loading history...
54		"""Return Caumanns German stem.
55
56		Parameters
57		----------
58		word : str
59		The word to stem
60
61		Returns
62		-------
63		str
64		Word stem
65
66		Examples
67		--------
68		>>> stmr = Caumanns()
69		>>> stmr.stem('lesen')
70		'les'
71		>>> stmr.stem('graues')
72		'grau'
73		>>> stmr.stem('buchstabieren')
74		'buchstabier'
75
76		"""
77	1	if not word:
78	1	return ''
79
80	1	upper_initial = word[0].isupper()
81	1	word = normalize('NFC', text_type(word.lower()))
82
83		# # Part 2: Substitution
84		# 1. Change umlauts to corresponding vowels & ß to ss
85	1	word = word.translate(self._umlauts)
86	1	word = word.replace('ß', 'ss')
87
88		# 2. Change second of doubled characters to *
89	1	new_word = word[0]
90	1	for i in range(1, len(word)):
91	1	if new_word[i - 1] == word[i]:
92	1	new_word += '*'
93		else:
94	1	new_word += word[i]
95	1	word = new_word
96
97		# 3. Replace sch, ch, ei, ie with $, §, %, &
98	1	word = word.replace('sch', '$')
99	1	word = word.replace('ch', '§')
100	1	word = word.replace('ei', '%')
101	1	word = word.replace('ie', '&')
102	1	word = word.replace('ig', '#')
103	1	word = word.replace('st', '!')
104
105		# # Part 1: Recursive Context-Free Stripping
106		# 1. Remove the following 7 suffixes recursively
107	1	while len(word) > 3:
108	1	if (len(word) > 4 and word[-2:] in {'em', 'er'}) or (
109		len(word) > 5 and word[-2:] == 'nd'
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
110		):
111	1	word = word[:-2]
112	1	elif (word[-1] in {'e', 's', 'n'}) or (
113		not upper_initial and word[-1] in {'t', '!'}
		0 ignored issues – show Coding Style introduced 2018-10-24 06:00 UTC by Report Bug Copy Issue Report Wrong hanging indentation before block (add 4 spaces). Loading history...
114		):
115	1	word = word[:-1]
116		else:
117	1	break
118
119		# Additional optimizations:
120	1	if len(word) > 5 and word[-5:] == 'erin*':
121	1	word = word[:-1]
122	1	if word[-1] == 'z':
123	1	word = word[:-1] + 'x'
124
125		# Reverse substitutions:
126	1	word = word.replace('$', 'sch')
127	1	word = word.replace('§', 'ch')
128	1	word = word.replace('%', 'ei')
129	1	word = word.replace('&', 'ie')
130	1	word = word.replace('#', 'ig')
131	1	word = word.replace('!', 'st')
132
133		# Expand doubled
134	1	word = ''.join(
135		[word[0]]
136		+ [
137		word[i - 1] if word[i] == '*' else word[i]
138		for i in range(1, len(word))
139		]
140		)
141
142		# Finally, convert gege to ge
143	1	if len(word) > 4:
144	1	word = word.replace('gege', 'ge', 1)
145
146	1	return word
147
148
149	1	def caumanns(word):
150		"""Return Caumanns German stem.
151
152		This is a wrapper for :py:meth:`Caumanns.stem`.
153
154		Parameters
155		----------
156		word : str
157		The word to stem
158
159		Returns
160		-------
161		str
162		Word stem
163
164		Examples
165		--------
166		>>> caumanns('lesen')
167		'les'
168		>>> caumanns('graues')
169		'grau'
170		>>> caumanns('buchstabieren')
171		'buchstabier'
172
173		"""
174	1	return Caumanns().stem(word)
175
176
177		if __name__ == '__main__':
178		import doctest
179
180		doctest.testmod()
181

chrislit / abydos

Push — master ( f43547...71985b )

abydos.stemmer._caumanns.caumanns() A

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like