Completed
Branch master (78a222)
by Chris
14:36
created

abydos.stemmer._schinke   A

Complexity

Total Complexity 14

Size/Duplication

Total Lines 237
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
wmc 14
eloc 161
dl 0
loc 237
ccs 46
cts 46
cp 1
rs 10
c 0
b 0
f 0

1 Function

Rating   Name   Duplication   Size   Complexity  
F schinke() 0 197 14
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.stemmer._schinke.
20
21
The stemmer.schinke module defines the Schinke Latin stemmer.
22
"""
23
24 1
from __future__ import unicode_literals
25
26 1
from unicodedata import normalize
27
28 1
from six import text_type
29 1
from six.moves import range
30
31 1
__all__ = ['schinke']
32
33
34 1
# Lower-case ASCII letters; any other character is dropped before stemming.
_SCHINKE_LETTERS = frozenset('abcdefghijklmnopqrstuvwxyz')

# Rule 3: prefixes for which a trailing 'que' belongs to the word itself
# (e.g. 'atque'), so the word is returned unstemmed.
_SCHINKE_KEEP_QUE = frozenset(
    {
        'at',
        'quo',
        'ne',
        'ita',
        'abs',
        'aps',
        'abus',
        'adae',
        'adus',
        'deni',
        'de',
        'sus',
        'obli',
        'perae',
        'plenis',
        'quando',
        'quis',
        'quae',
        'cuius',
        'cui',
        'quem',
        'quam',
        'qua',
        'qui',
        'quorum',
        'quarum',
        'quibus',
        'quos',
        'quas',
        'quotusquis',
        'quous',
        'ubi',
        'undi',
        'us',
        'uter',
        'uti',
        'utro',
        'utribi',
        'tor',
        'co',
        'conco',
        'contor',
        'detor',
        'deco',
        'exco',
        'extor',
        'obtor',
        'optor',
        'retor',
        'reco',
        'attor',
        'inco',
        'intor',
        'praetor',
    }
)

# Rule 4: noun endings as (length, endings), ordered longest first so that
# the longest matching ending wins.
_SCHINKE_NOUN_ENDINGS = (
    (4, frozenset({'ibus'})),
    (3, frozenset({'ius'})),
    (
        2,
        frozenset(
            {
                'is',
                'nt',
                'ae',
                'os',
                'am',
                'ud',
                'as',
                'um',
                'em',
                'us',
                'es',
                'ia',
            }
        ),
    ),
    (1, frozenset({'a', 'e', 'i', 'o', 'u'})),
)

# Verb endings as (length, endings that are simply removed, endings that are
# replaced -> their replacement), ordered longest first.
_SCHINKE_VERB_ENDINGS = (
    (6, frozenset(), {'iuntur': 'i'}),
    (5, frozenset(), {'beris': 'bi', 'erunt': 'i', 'untur': 'i'}),
    (4, frozenset({'mini', 'ntur', 'stis'}), {'iunt': 'i'}),
    (
        3,
        frozenset({'mur', 'mus', 'ris', 'sti', 'tis', 'tur'}),
        {'bor': 'bi', 'ero': 'eri', 'unt': 'i'},
    ),
    (2, frozenset({'ns', 'nt', 'ri'}), {'bo': 'bi'}),
    (1, frozenset({'m', 'r', 's', 't'}), {}),
)


def schinke(word):
    """Return the stem of a word according to the Schinke stemmer.

    This is defined in :cite:`Schinke:1996`.

    The word is lower-cased, NFKD-normalized and reduced to the letters a-z;
    'j' is then mapped to 'i' and 'v' to 'u'. A trailing 'que' is removed
    unless the word is one of the listed exceptions, in which case the word
    is returned as-is for both keys. Finally the longest matching noun ending
    and the longest matching verb ending are stripped (or, for some verb
    endings, replaced), provided at least two characters of stem remain.

    :param str word: the word to stem; byte strings are decoded as UTF-8
        before stemming
    :returns: a dict of the noun- and verb-stemmed word
    :rtype: dict

    >>> schinke('atque')
    {'n': 'atque', 'v': 'atque'}
    >>> schinke('census')
    {'n': 'cens', 'v': 'censu'}
    >>> schinke('virum')
    {'n': 'uir', 'v': 'uiru'}
    >>> schinke('populusque')
    {'n': 'popul', 'v': 'populu'}
    >>> schinke('senatus')
    {'n': 'senat', 'v': 'senatu'}
    """
    # Decode byte strings explicitly: converting bytes with str() on
    # Python 3 would yield the repr ("b'...'") and leak a spurious leading
    # 'b' into the stem.
    if isinstance(word, bytes):
        word = word.decode('utf-8')

    # NFKD splits accented letters into base letter + combining mark; the
    # filter below then discards the marks along with all other non-letters.
    word = normalize('NFKD', word.lower())
    word = ''.join(c for c in word if c in _SCHINKE_LETTERS)

    # Rule 2
    word = word.replace('j', 'i').replace('v', 'u')

    # Rule 3
    if word[-3:] == 'que':
        # This diverges from the paper by also returning 'que' itself unstemmed
        if word[:-3] in _SCHINKE_KEEP_QUE or word == 'que':
            return {'n': word, 'v': word}
        word = word[:-3]

    # Base case will mean returning the words as is
    noun = word
    verb = word

    # Rule 4: only the longest matching noun ending is considered; it is
    # removed only when at least two characters would remain.
    for endlen, endings in _SCHINKE_NOUN_ENDINGS:
        if word[-endlen:] in endings:
            if len(word) - endlen >= 2:
                noun = word[:-endlen]
            break

    # Verb endings: at each length, plain removal is tested before
    # replacement, and the first match ends the search.
    for endlen, strip_endings, alter_endings in _SCHINKE_VERB_ENDINGS:
        ending = word[-endlen:]
        if ending in strip_endings:
            if len(word) - endlen >= 2:
                verb = word[:-endlen]
            break
        if ending in alter_endings:
            stem = word[:-endlen]
            # Technically this diverges from the paper by considering the
            # length of the stem without the new suffix
            if len(stem) >= 2:
                verb = stem + alter_endings[ending]
            break

    return {'n': noun, 'v': verb}
231
232
233
if __name__ == '__main__':
    # Run the doctest examples embedded in this module's docstrings when the
    # file is executed directly as a script.
    import doctest

    doctest.testmod()
237