Completed
Branch master (78a222)
by Chris
14:36
created

abydos.stemmer._clef   A

Complexity

Total Complexity 35

Size/Duplication

Total Lines 178
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
wmc 35
eloc 73
dl 0
loc 178
ccs 59
cts 59
cp 1
rs 9.6
c 0
b 0
f 0

3 Functions

Rating   Name   Duplication   Size   Complexity  
B clef_german() 0 36 7
F clef_german_plus() 0 49 15
D clef_swedish() 0 46 13
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
"""abydos.stemmer._clef.

The stemmer._clef module defines CLEF stemmers for:

    - German
    - German plus
    - Swedish
"""
27
28 1
from __future__ import unicode_literals
29
30 1
from unicodedata import normalize
31
32 1
from six import text_type
33
34 1
# Public names exported by ``from abydos.stemmer._clef import *``: the three
# CLEF stemmer functions defined in this module.
__all__ = ['clef_german', 'clef_german_plus', 'clef_swedish']
35
36
37 1
def clef_german(word):
    """Return CLEF German stem.

    The CLEF German stemmer is defined at :cite:`Savoy:2005`.

    The word is lowercased and NFC-normalized, the umlauts ä, ö, and ü are
    replaced by a, o, and u, and then at most one suffix is removed. Longer
    suffixes are tried first, and a suffix is only removed when the word is
    long enough for it.

    :param str word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> clef_german('lesen')
    'lese'
    >>> clef_german('graues')
    'grau'
    >>> clef_german('buchstabieren')
    'buchstabier'
    """
    # lowercase, normalize, and compose
    word = normalize('NFC', text_type(word.lower()))

    # remove umlauts
    _umlauts = dict(zip((ord(char) for char in 'äöü'), 'aou'))
    word = word.translate(_umlauts)

    # remove plurals; wlen is the index of the last character, so every
    # threshold below is one less than the corresponding word length
    wlen = len(word) - 1

    if wlen > 5 and word[-3:] == 'nen':
        return word[:-3]
    if wlen > 4 and word[-2:] in {'en', 'se', 'es', 'er'}:
        return word[:-2]
    if wlen > 3 and word[-1] in {'e', 'n', 'r', 's'}:
        return word[:-1]
    return word
73
74
75 1
def clef_german_plus(word):
    """Return 'CLEF German stemmer plus' stem.

    The CLEF German stemmer plus is defined at :cite:`Savoy:2005`.

    The word is lowercased and NFC-normalized, accented vowels are replaced
    by their unaccented forms, and then two suffix-removal steps are applied
    one after the other. Each step removes at most one suffix.

    :param str word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> clef_german_plus('lesen')
    'les'
    >>> clef_german_plus('graues')
    'grau'
    >>> clef_german_plus('buchstabieren')
    'buchstabi'
    """
    # a final 's' (step 1) or 'st' (step 2) is only removed when it follows
    # one of these consonants
    _st_ending = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 't'}

    # lowercase, normalize, and compose
    word = normalize('NFC', text_type(word.lower()))

    # remove umlauts and the other accents listed below
    _accents = dict(
        zip((ord(char) for char in 'äàáâöòóôïìíîüùúû'), 'aaaaooooiiiiuuuu')
    )
    word = word.translate(_accents)

    # Step 1 (wlen is the index of the last character)
    wlen = len(word) - 1
    if wlen > 4 and word[-3:] == 'ern':
        word = word[:-3]
    elif wlen > 3 and word[-2:] in {'em', 'en', 'er', 'es'}:
        word = word[:-2]
    elif wlen > 2 and word[-1] == 'e':
        word = word[:-1]
    elif wlen > 2 and word[-1] == 's' and word[-2] in _st_ending:
        word = word[:-1]

    # Step 2 (length is re-measured because step 1 may have shortened word)
    wlen = len(word) - 1
    if wlen > 4 and word[-3:] == 'est':
        word = word[:-3]
    elif wlen > 3 and word[-2:] in {'er', 'en'}:
        word = word[:-2]
    elif wlen > 3 and word[-2:] == 'st' and word[-3] in _st_ending:
        word = word[:-2]

    return word
124
125
126 1
def clef_swedish(word):
    """Return CLEF Swedish stem.

    The CLEF Swedish stemmer is defined at :cite:`Savoy:2005`.

    A final 's' is stripped first (from sufficiently long words), then at
    most one further suffix is removed, trying the longest suffixes first.
    The word is used as given: no lowercasing or Unicode normalization is
    applied here.

    :param str word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> clef_swedish('undervisa')
    'undervis'
    >>> clef_swedish('suspension')
    'suspensio'
    >>> clef_swedish('visshet')
    'viss'
    """
    # wlen is the index of the last character
    wlen = len(word) - 1

    if wlen > 3 and word[-1] == 's':
        word = word[:-1]
        wlen -= 1

    # (suffix length, removable suffixes), ordered longest first so that only
    # the longest applicable suffix is removed
    _endings = (
        (5, {'elser', 'heten'}),
        (4, {'arne', 'erna', 'ande', 'else', 'aste', 'orna', 'aren'}),
        (3, {'are', 'ast', 'het'}),
        (2, {'ar', 'er', 'or', 'en', 'at', 'te', 'et'}),
        (1, {'a', 'e', 'n', 't'}),
    )

    for end_len, suffixes in _endings:
        # a suffix of length k is only removed when wlen > k + 1
        if wlen > end_len + 1 and word[-end_len:] in suffixes:
            return word[:-end_len]
    return word
172
173
174
if __name__ == '__main__':
    # When executed as a script, run the doctest examples embedded in this
    # module's docstrings.
    import doctest

    doctest.testmod()
178