Completed: branch master (78a222), created by Chris at 14:36

abydos.stemmer._schinke.schinke() (rated F)

Complexity

Conditions 14

Size

Total Lines 197
Code Lines 152

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 40
CRAP Score 14

Importance

Changes 0
Metric Value
eloc 152
dl 0
loc 197
ccs 40
cts 40
cp 1
rs 2.52
c 0
b 0
f 0
cc 14
nop 1
crap 14

How to fix

Long Method

Small methods make your code easier to understand, particularly when combined with a good name. Moreover, if a method is small, finding a good name for it is usually much easier.

For example, if you find yourself adding comments to a method's body, that is usually a sign that the commented part should be extracted into a new method, with the comment serving as a starting point for naming it.

Commonly applied refactorings include Extract Method.
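Applied to schinke(), one obvious candidate is the commented "Rule 3" block that handles the '-que' enclitic. A minimal sketch of that extraction, assuming a hypothetical helper name (_strip_que is not part of Abydos):

def _strip_que(word, keep_que):
    # Rule 3 of the Schinke stemmer: handle the '-que' enclitic. Returns the
    # (possibly shortened) word plus a flag saying whether the word should be
    # returned unstemmed for both the noun and the verb forms.
    if word[-3:] == 'que':
        if word[:-3] in keep_que or word == 'que':
            return word, True
        return word[:-3], False
    return word, False


# Inside schinke(), the Rule 3 block would then collapse to:
#     word, unstemmed = _strip_que(word, keep_que)
#     if unstemmed:
#         return {'n': word, 'v': word}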

Complexity

Complex functions like abydos.stemmer._schinke.schinke() often do a lot of different things. To break such a function down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for fields/methods (or, in a long function, groups of data and statements that are always used together) that share the same prefixes or suffixes.

Once you have determined which pieces belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster.
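Here, the noun and verb passes each pair an ending table with a longest-match stripping loop, and those pairs are cohesive components. A minimal sketch for the noun pass, using a hypothetical class name (_SchinkeNounStemmer is not part of Abydos):

class _SchinkeNounStemmer(object):
    # Hypothetical extraction: the noun ending table and the longest-match
    # stripping loop form one cohesive component.
    ENDINGS = {
        4: {'ibus'},
        3: {'ius'},
        2: {'is', 'nt', 'ae', 'os', 'am', 'ud', 'as', 'um', 'em', 'us', 'es', 'ia'},
        1: {'a', 'e', 'i', 'o', 'u'},
    }

    def stem(self, word):
        # Rule 4: strip the longest matching ending, but only when at least
        # two characters of stem would remain; otherwise keep the word as is.
        for endlen in range(4, 0, -1):
            if word[-endlen:] in self.ENDINGS[endlen]:
                return word[:-endlen] if len(word) - 2 >= endlen else word
        return word

A matching verb-side component would own v_endings_strip and v_endings_alter, leaving schinke() itself as normalization plus two delegating calls. The full listing being analyzed follows.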

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.stemmer._schinke.

The stemmer.schinke module defines the Schinke Latin stemmer.
"""

from __future__ import unicode_literals

from unicodedata import normalize

from six import text_type
from six.moves import range

__all__ = ['schinke']


def schinke(word):
    """Return the stem of a word according to the Schinke stemmer.

    This is defined in :cite:`Schinke:1996`.

    :param str word: the word to stem
    :returns: a dict of the noun- and verb-stemmed word
    :rtype: dict

    >>> schinke('atque')
    {'n': 'atque', 'v': 'atque'}
    >>> schinke('census')
    {'n': 'cens', 'v': 'censu'}
    >>> schinke('virum')
    {'n': 'uir', 'v': 'uiru'}
    >>> schinke('populusque')
    {'n': 'popul', 'v': 'populu'}
    >>> schinke('senatus')
    {'n': 'senat', 'v': 'senatu'}
    """
    word = normalize('NFKD', text_type(word.lower()))
    word = ''.join(
        c
        for c in word
        if c
        in {
            'a',
            'b',
            'c',
            'd',
            'e',
            'f',
            'g',
            'h',
            'i',
            'j',
            'k',
            'l',
            'm',
            'n',
            'o',
            'p',
            'q',
            'r',
            's',
            't',
            'u',
            'v',
            'w',
            'x',
            'y',
            'z',
        }
    )

    # Rule 2
    word = word.replace('j', 'i').replace('v', 'u')

    # Rule 3
    keep_que = {
        'at',
        'quo',
        'ne',
        'ita',
        'abs',
        'aps',
        'abus',
        'adae',
        'adus',
        'deni',
        'de',
        'sus',
        'obli',
        'perae',
        'plenis',
        'quando',
        'quis',
        'quae',
        'cuius',
        'cui',
        'quem',
        'quam',
        'qua',
        'qui',
        'quorum',
        'quarum',
        'quibus',
        'quos',
        'quas',
        'quotusquis',
        'quous',
        'ubi',
        'undi',
        'us',
        'uter',
        'uti',
        'utro',
        'utribi',
        'tor',
        'co',
        'conco',
        'contor',
        'detor',
        'deco',
        'exco',
        'extor',
        'obtor',
        'optor',
        'retor',
        'reco',
        'attor',
        'inco',
        'intor',
        'praetor',
    }
    if word[-3:] == 'que':
        # This diverges from the paper by also returning 'que' itself unstemmed
        if word[:-3] in keep_que or word == 'que':
            return {'n': word, 'v': word}
        else:
            word = word[:-3]

    # Base case will mean returning the words as is
    noun = word
    verb = word

    # Rule 4
    n_endings = {
        4: {'ibus'},
        3: {'ius'},
        2: {
            'is',
            'nt',
            'ae',
            'os',
            'am',
            'ud',
            'as',
            'um',
            'em',
            'us',
            'es',
            'ia',
        },
        1: {'a', 'e', 'i', 'o', 'u'},
    }
    for endlen in range(4, 0, -1):
        if word[-endlen:] in n_endings[endlen]:
            if len(word) - 2 >= endlen:
                noun = word[:-endlen]
            else:
                noun = word
            break

    v_endings_strip = {
        6: {},
        5: {},
        4: {'mini', 'ntur', 'stis'},
        3: {'mur', 'mus', 'ris', 'sti', 'tis', 'tur'},
        2: {'ns', 'nt', 'ri'},
        1: {'m', 'r', 's', 't'},
    }
    v_endings_alter = {
        6: {'iuntur'},
        5: {'beris', 'erunt', 'untur'},
        4: {'iunt'},
        3: {'bor', 'ero', 'unt'},
        2: {'bo'},
        1: {},
    }
    for endlen in range(6, 0, -1):
        if word[-endlen:] in v_endings_strip[endlen]:
            if len(word) - 2 >= endlen:
                verb = word[:-endlen]
            else:
                verb = word
            break
        if word[-endlen:] in v_endings_alter[endlen]:
            if word[-endlen:] in {'iuntur', 'erunt', 'untur', 'iunt', 'unt'}:
                new_word = word[:-endlen] + 'i'
                addlen = 1
            elif word[-endlen:] in {'beris', 'bor', 'bo'}:
                new_word = word[:-endlen] + 'bi'
                addlen = 2
            else:
                new_word = word[:-endlen] + 'eri'
                addlen = 3

            # Technically this diverges from the paper by considering the
            # length of the stem without the new suffix
            if len(new_word) >= 2 + addlen:
                verb = new_word
            else:
                verb = word
            break

    return {'n': noun, 'v': verb}


if __name__ == '__main__':
    import doctest

    doctest.testmod()
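For reference, the function is exercised exactly as the doctests show. A minimal usage sketch, assuming the private module path seen in the listing is importable:

from abydos.stemmer._schinke import schinke

# 'census': the noun pass strips the two-letter ending 'us', while the verb
# pass only strips the final 's', matching the doctest above.
print(schinke('census'))  # {'n': 'cens', 'v': 'censu'}

# 'virum': Rule 2 first maps v -> u (and j -> i), so both stems start 'uir'.
print(schinke('virum'))   # {'n': 'uir', 'v': 'uiru'}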