abydos.stemmer._schinke   A
last analyzed

Complexity

Total Complexity 15

Size/Duplication

Total Lines 300
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
wmc 15
eloc 169
dl 0
loc 300
ccs 51
cts 51
cp 1
rs 10
c 0
b 0
f 0

2 Methods

Rating   Name   Duplication   Size   Complexity  
F Schinke.stem_dict() 0 124 14
A Schinke.stem() 0 37 1
1
# Copyright 2014-2020 by Christopher C. Little.
2
# This file is part of Abydos.
3
#
4
# Abydos is free software: you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation, either version 3 of the License, or
7
# (at your option) any later version.
8
#
9
# Abydos is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
# GNU General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17
"""abydos.stemmer._schinke.

Schinke Latin stemmer.
"""
21
22
from typing import Dict
23
from unicodedata import normalize
24 1
25
from ._stemmer import _Stemmer
26
27
__all__ = ['Schinke']
28
29
30
class Schinke(_Stemmer):
    """Schinke stemmer.

    This is defined in :cite:`Schinke:1996`.

    Each word yields two stems: a noun stem (``'n'``) and a verb stem
    (``'v'``), computed independently from the same normalized word.

    .. versionadded:: 0.3.6
    """

    # Letters retained by normalization; every other character is dropped.
    _latin_alphabet = frozenset('abcdefghijklmnopqrstuvwxyz')

    # If the part of a word preceding a final 'que' is in this set, the word
    # is returned unstemmed (Rule 3).
    _keep_que = {
        'at',
        'quo',
        'ne',
        'ita',
        'abs',
        'aps',
        'abus',
        'adae',
        'adus',
        'deni',
        'de',
        'sus',
        'obli',
        'perae',
        'plenis',
        'quando',
        'quis',
        'quae',
        'cuius',
        'cui',
        'quem',
        'quam',
        'qua',
        'qui',
        'quorum',
        'quarum',
        'quibus',
        'quos',
        'quas',
        'quotusquis',
        'quous',
        'ubi',
        'undi',
        'us',
        'uter',
        'uti',
        'utro',
        'utribi',
        'tor',
        'co',
        'conco',
        'contor',
        'detor',
        'deco',
        'exco',
        'extor',
        'obtor',
        'optor',
        'retor',
        'reco',
        'attor',
        'inco',
        'intor',
        'praetor',
    }

    # Noun endings keyed by their length; longer endings are tried first.
    _n_endings = {
        4: {'ibus'},
        3: {'ius'},
        2: {
            'is',
            'nt',
            'ae',
            'os',
            'am',
            'ud',
            'as',
            'um',
            'em',
            'us',
            'es',
            'ia',
        },
        1: {'a', 'e', 'i', 'o', 'u'},
    }

    # Verb endings that are simply removed, keyed by their length.
    _v_endings_strip = {
        6: set(),
        5: set(),
        4: {'mini', 'ntur', 'stis'},
        3: {'mur', 'mus', 'ris', 'sti', 'tis', 'tur'},
        2: {'ns', 'nt', 'ri'},
        1: {'m', 'r', 's', 't'},
    }
    # Verb endings that are replaced by another suffix, keyed by their length.
    _v_endings_alter = {
        6: {'iuntur'},
        5: {'beris', 'erunt', 'untur'},
        4: {'iunt'},
        3: {'bor', 'ero', 'unt'},
        2: {'bo'},
        1: set(),
    }
    # Replacement suffix for each altered verb ending. Any ending listed in
    # _v_endings_alter but absent here falls back to 'eri' (see _verb_stem).
    _v_alter_replacements = {
        'iuntur': 'i',
        'erunt': 'i',
        'untur': 'i',
        'iunt': 'i',
        'unt': 'i',
        'beris': 'bi',
        'bor': 'bi',
        'bo': 'bi',
        'ero': 'eri',
    }

    def stem(self, word: str) -> str:
        """Return the stem of a word according to the Schinke stemmer.

        Parameters
        ----------
        word : str
            The word to stem

        Returns
        -------
        str
            The noun stem and the verb stem, joined by a comma

        Examples
        --------
        >>> stmr = Schinke()
        >>> stmr.stem('atque')
        'atque,atque'
        >>> stmr.stem('census')
        'cens,censu'
        >>> stmr.stem('virum')
        'uir,uiru'
        >>> stmr.stem('populusque')
        'popul,populu'
        >>> stmr.stem('senatus')
        'senat,senatu'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class
        .. versionchanged:: 0.6.0
            Made return a str with the noun then verb stem, comma-separated

        """
        nv = self.stem_dict(word)
        return '{0},{1}'.format(nv['n'], nv['v'])

    def stem_dict(self, word: str) -> Dict[str, str]:
        """Return the stem of a word according to the Schinke stemmer.

        Parameters
        ----------
        word : str
            The word to stem

        Returns
        -------
        dict
            Word stems in a dictionary: key ``'n'`` holds the noun stem and
            key ``'v'`` holds the verb stem

        Examples
        --------
        >>> stmr = Schinke()
        >>> stmr.stem_dict('atque')
        {'n': 'atque', 'v': 'atque'}
        >>> stmr.stem_dict('census')
        {'n': 'cens', 'v': 'censu'}
        >>> stmr.stem_dict('virum')
        {'n': 'uir', 'v': 'uiru'}
        >>> stmr.stem_dict('populusque')
        {'n': 'popul', 'v': 'populu'}
        >>> stmr.stem_dict('senatus')
        {'n': 'senat', 'v': 'senatu'}


        .. versionadded:: 0.6.0

        """
        word = self._prepare(word)

        # Rule 3
        if word[-3:] == 'que':
            # This diverges from the paper by also returning 'que' itself
            #  unstemmed
            if word[:-3] in self._keep_que or word == 'que':
                return {'n': word, 'v': word}
            word = word[:-3]

        # Rule 4: the noun and verb stems are derived independently
        return {'n': self._noun_stem(word), 'v': self._verb_stem(word)}

    def _prepare(self, word: str) -> str:
        """Lowercase, decompose, filter, and respell a word before stemming.

        Parameters
        ----------
        word : str
            The raw input word

        Returns
        -------
        str
            The word lowercased and NFKD-normalized, reduced to the letters
            a-z, with 'j' rewritten as 'i' and 'v' as 'u'

        """
        word = normalize('NFKD', word.lower())
        word = ''.join(c for c in word if c in self._latin_alphabet)

        # Rule 2
        return word.replace('j', 'i').replace('v', 'u')

    def _noun_stem(self, word: str) -> str:
        """Return the noun stem of an already-prepared word.

        Parameters
        ----------
        word : str
            The prepared word (after any 'que' removal)

        Returns
        -------
        str
            The word without its longest matching noun ending, or the word
            unchanged if nothing matches or fewer than two characters would
            remain

        """
        for endlen in range(4, 0, -1):
            if word[-endlen:] in self._n_endings[endlen]:
                # Only the longest matching ending is considered; it is
                # removed only if at least two characters are left over.
                if len(word) - 2 >= endlen:
                    return word[:-endlen]
                return word
        return word

    def _verb_stem(self, word: str) -> str:
        """Return the verb stem of an already-prepared word.

        Parameters
        ----------
        word : str
            The prepared word (after any 'que' removal)

        Returns
        -------
        str
            The word with its longest matching verb ending removed or
            replaced, or the word unchanged if nothing matches or fewer than
            two characters would precede the ending

        """
        for endlen in range(6, 0, -1):
            ending = word[-endlen:]

            # At each length, plain removal is checked before replacement.
            if ending in self._v_endings_strip[endlen]:
                if len(word) - 2 >= endlen:
                    return word[:-endlen]
                return word

            if ending in self._v_endings_alter[endlen]:
                replacement = self._v_alter_replacements.get(ending, 'eri')
                base = word[:-endlen]
                # Technically this diverges from the paper by considering the
                # length of the stem without the new suffix
                if len(base) >= 2:
                    return base + replacement
                return word
        return word
294
295
296
# When executed as a script, run the doctests embedded in this module.
if __name__ == '__main__':
    from doctest import testmod

    testmod()
300