Completed
Push — master ( fc7245...01ba1b )
by Chris
11:55
created

abydos.stemmer.schinke()   F

Complexity

Conditions 14

Size

Total Lines 95
Code Lines 65

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 14
eloc 65
nop 1
dl 0
loc 95
rs 3.6
c 0
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method and Decompose Conditional.

Complexity

Complex functions like abydos.stemmer.schinke() often do a lot of different things. To break such a function down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# -*- coding: utf-8 -*-
0 ignored issues
show
coding-style introduced by
Too many lines in module (2345/1000)
Loading history...
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19
"""abydos.stemmer.
20
21
The stemmer module defines word stemmers including:
22
23
    - the Lovins stemmer
24
    - the Porter and Porter2 (Snowball English) stemmers
25
    - Snowball stemmers for German, Dutch, Norwegian, Swedish, and Danish
26
    - CLEF German, German plus, and Swedish stemmers
27
    - Caumann's German stemmer
28
    - UEA-Lite Stemmer
29
    - Paice-Husk Stemmer
30
"""
31
32
from __future__ import unicode_literals
33
34
import re
35
import unicodedata
36
37
from six import text_type
38
from six.moves import range
39
40
41
def lovins(word):
    """Return Lovins stem.

    Lovins stemmer

    The Lovins stemmer is described in Julie Beth Lovins's article at:
    http://www.mt-archive.info/MT-1968-Lovins.pdf

    The algorithm runs in three phases: (1) strip the longest matching
    suffix (up to 11 characters) whose associated context condition holds,
    (2) undouble a trailing doubled consonant, and (3) apply the recoding
    (respelling) rules.

    :param word: the word to stem
    :returns: word stem
    :rtype: string

    >>> lovins('reading')
    'read'
    >>> lovins('suspension')
    'suspens'
    >>> lovins('elusiveness')
    'elus'
    """
    # pylint: disable=too-many-branches, too-many-locals

    # lowercase, normalize, and compose
    word = unicodedata.normalize('NFC', text_type(word.lower()))

    # Each condition receives the whole word and the length of the
    # candidate suffix; word[-suffix_len-1] is the last letter of the stem.

    def cond_b(word, suffix_len):
        """Return Lovins' condition B (minimum stem length 3)."""
        return len(word)-suffix_len >= 3

    def cond_c(word, suffix_len):
        """Return Lovins' condition C (minimum stem length 4)."""
        return len(word)-suffix_len >= 4

    def cond_d(word, suffix_len):
        """Return Lovins' condition D (minimum stem length 5)."""
        return len(word)-suffix_len >= 5

    def cond_e(word, suffix_len):
        """Return Lovins' condition E (do not remove ending after e)."""
        return word[-suffix_len-1] != 'e'

    def cond_f(word, suffix_len):
        """Return Lovins' condition F."""
        return (len(word)-suffix_len >= 3 and
                word[-suffix_len-1] != 'e')

    def cond_g(word, suffix_len):
        """Return Lovins' condition G."""
        return (len(word)-suffix_len >= 3 and
                word[-suffix_len-1] == 'f')

    def cond_h(word, suffix_len):
        """Return Lovins' condition H (stem ends in t or ll)."""
        return (word[-suffix_len-1] == 't' or
                word[-suffix_len-2:-suffix_len] == 'll')

    def cond_i(word, suffix_len):
        """Return Lovins' condition I."""
        return word[-suffix_len-1] not in {'e', 'o'}

    def cond_j(word, suffix_len):
        """Return Lovins' condition J."""
        return word[-suffix_len-1] not in {'a', 'e'}

    def cond_k(word, suffix_len):
        """Return Lovins' condition K (min length 3; ends i, l, or u*e)."""
        return (len(word)-suffix_len >= 3 and
                (word[-suffix_len-1] in {'i', 'l'} or
                 (word[-suffix_len-3] == 'u' and word[-suffix_len-1] == 'e')))

    def cond_l(word, suffix_len):
        """Return Lovins' condition L.

        'Do not remove ending after u, x or s, unless s follows o.'
        """
        # BUG FIX: previously a single character was compared against the
        # two-character string 'os', which can never be equal; the intended
        # test is whether the stem ends in 'os'.
        return (word[-suffix_len-1] not in {'s', 'u', 'x'} or
                word[-suffix_len-2:-suffix_len] == 'os')

    def cond_m(word, suffix_len):
        """Return Lovins' condition M."""
        return word[-suffix_len-1] not in {'a', 'c', 'e', 'm'}

    def cond_n(word, suffix_len):
        """Return Lovins' condition N (min length 4, or 3 if not after s)."""
        if len(word)-suffix_len >= 3:
            if word[-suffix_len-3] == 's':
                if len(word)-suffix_len >= 4:
                    return True
            else:
                return True
        return False

    def cond_o(word, suffix_len):
        """Return Lovins' condition O."""
        return word[-suffix_len-1] in {'i', 'l'}

    def cond_p(word, suffix_len):
        """Return Lovins' condition P."""
        return word[-suffix_len-1] != 'c'

    def cond_q(word, suffix_len):
        """Return Lovins' condition Q."""
        return (len(word)-suffix_len >= 3 and
                word[-suffix_len-1] not in {'l', 'n'})

    def cond_r(word, suffix_len):
        """Return Lovins' condition R."""
        return word[-suffix_len-1] in {'n', 'r'}

    def cond_s(word, suffix_len):
        """Return Lovins' condition S (ends dr, or t not after t)."""
        return (word[-suffix_len-2:-suffix_len] == 'dr' or
                (word[-suffix_len-1] == 't' and
                 word[-suffix_len-2:-suffix_len] != 'tt'))

    def cond_t(word, suffix_len):
        """Return Lovins' condition T (ends s or t, not after o)."""
        return (word[-suffix_len-1] in {'s', 't'} and
                word[-suffix_len-2:-suffix_len] != 'ot')

    def cond_u(word, suffix_len):
        """Return Lovins' condition U."""
        return word[-suffix_len-1] in {'l', 'm', 'n', 'r'}

    def cond_v(word, suffix_len):
        """Return Lovins' condition V."""
        return word[-suffix_len-1] == 'c'

    def cond_w(word, suffix_len):
        """Return Lovins' condition W."""
        return word[-suffix_len-1] not in {'s', 'u'}

    def cond_x(word, suffix_len):
        """Return Lovins' condition X (ends i, l, or u*e)."""
        # BUG FIX: previously a three-character slice was compared against
        # the single character 'u', which can never be equal; the intended
        # test is the same u*e pattern as cond_k (a single-character slice
        # is used here so short stems cannot raise IndexError).
        return (word[-suffix_len-1] in {'i', 'l'} or
                (word[-suffix_len-3:-suffix_len-2] == 'u' and
                 word[-suffix_len-1] == 'e'))

    def cond_y(word, suffix_len):
        """Return Lovins' condition Y (stem ends in in)."""
        return word[-suffix_len-2:-suffix_len] == 'in'

    def cond_z(word, suffix_len):
        """Return Lovins' condition Z."""
        return word[-suffix_len-1] != 'f'

    def cond_aa(word, suffix_len):
        """Return Lovins' condition AA."""
        return (word[-suffix_len-1] in {'d', 'f', 'l', 't'} or
                word[-suffix_len-2:-suffix_len] in {'ph', 'th', 'er', 'or',
                                                    'es'})

    def cond_bb(word, suffix_len):
        """Return Lovins' condition BB."""
        return (len(word)-suffix_len >= 3 and
                word[-suffix_len-3:-suffix_len] != 'met' and
                word[-suffix_len-4:-suffix_len] != 'ryst')

    def cond_cc(word, suffix_len):
        """Return Lovins' condition CC."""
        return word[-suffix_len-1] == 'l'

    # Suffix -> removal condition (None means removal is unconditional,
    # apart from the global minimum stem length of 2).
    suffix = {'alistically': cond_b, 'arizability': None,
              'izationally': cond_b, 'antialness': None,
              'arisations': None, 'arizations': None, 'entialness': None,
              'allically': cond_c, 'antaneous': None, 'antiality': None,
              'arisation': None, 'arization': None, 'ationally': cond_b,
              'ativeness': None, 'eableness': cond_e, 'entations': None,
              'entiality': None, 'entialize': None, 'entiation': None,
              'ionalness': None, 'istically': None, 'itousness': None,
              'izability': None, 'izational': None, 'ableness': None,
              'arizable': None, 'entation': None, 'entially': None,
              'eousness': None, 'ibleness': None, 'icalness': None,
              'ionalism': None, 'ionality': None, 'ionalize': None,
              'iousness': None, 'izations': None, 'lessness': None,
              'ability': None, 'aically': None, 'alistic': cond_b,
              'alities': None, 'ariness': cond_e, 'aristic': None,
              'arizing': None, 'ateness': None, 'atingly': None,
              'ational': cond_b, 'atively': None, 'ativism': None,
              'elihood': cond_e, 'encible': None, 'entally': None,
              'entials': None, 'entiate': None, 'entness': None,
              'fulness': None, 'ibility': None, 'icalism': None,
              'icalist': None, 'icality': None, 'icalize': None,
              'ication': cond_g, 'icianry': None, 'ination': None,
              'ingness': None, 'ionally': None, 'isation': None,
              'ishness': None, 'istical': None, 'iteness': None,
              'iveness': None, 'ivistic': None, 'ivities': None,
              'ization': cond_f, 'izement': None, 'oidally': None,
              'ousness': None, 'aceous': None, 'acious': cond_b,
              'action': cond_g, 'alness': None, 'ancial': None,
              'ancies': None, 'ancing': cond_b, 'ariser': None,
              'arized': None, 'arizer': None, 'atable': None,
              'ations': cond_b, 'atives': None, 'eature': cond_z,
              'efully': None, 'encies': None, 'encing': None,
              'ential': None, 'enting': cond_c, 'entist': None,
              'eously': None, 'ialist': None, 'iality': None,
              'ialize': None, 'ically': None, 'icance': None,
              'icians': None, 'icists': None, 'ifully': None,
              'ionals': None, 'ionate': cond_d, 'ioning': None,
              'ionist': None, 'iously': None, 'istics': None,
              'izable': cond_e, 'lessly': None, 'nesses': None,
              'oidism': None, 'acies': None, 'acity': None,
              'aging': cond_b, 'aical': None, 'alist': None,
              'alism': cond_b, 'ality': None, 'alize': None,
              'allic': cond_bb, 'anced': cond_b, 'ances': cond_b,
              'antic': cond_c, 'arial': None, 'aries': None,
              'arily': None, 'arity': cond_b, 'arize': None,
              'aroid': None, 'ately': None, 'ating': cond_i,
              'ation': cond_b, 'ative': None, 'ators': None,
              'atory': None, 'ature': cond_e, 'early': cond_y,
              'ehood': None, 'eless': None, 'elity': None,
              'ement': None, 'enced': None, 'ences': None,
              'eness': cond_e, 'ening': cond_e, 'ental': None,
              'ented': cond_c, 'ently': None, 'fully': None,
              'ially': None, 'icant': None, 'ician': None,
              'icide': None, 'icism': None, 'icist': None,
              'icity': None, 'idine': cond_i, 'iedly': None,
              'ihood': None, 'inate': None, 'iness': None,
              'ingly': cond_b, 'inism': cond_j, 'inity': cond_cc,
              'ional': None, 'ioned': None, 'ished': None,
              'istic': None, 'ities': None, 'itous': None,
              'ively': None, 'ivity': None, 'izers': cond_f,
              'izing': cond_f, 'oidal': None, 'oides': None,
              'otide': None, 'ously': None, 'able': None, 'ably': None,
              'ages': cond_b, 'ally': cond_b, 'ance': cond_b, 'ancy': cond_b,
              'ants': cond_b, 'aric': None, 'arly': cond_k, 'ated': cond_i,
              'ates': None, 'atic': cond_b, 'ator': None, 'ealy': cond_y,
              'edly': cond_e, 'eful': None, 'eity': None, 'ence': None,
              'ency': None, 'ened': cond_e, 'enly': cond_e, 'eous': None,
              'hood': None, 'ials': None, 'ians': None, 'ible': None,
              'ibly': None, 'ical': None, 'ides': cond_l, 'iers': None,
              'iful': None, 'ines': cond_m, 'ings': cond_n, 'ions': cond_b,
              'ious': None, 'isms': cond_b, 'ists': None, 'itic': cond_h,
              'ized': cond_f, 'izer': cond_f, 'less': None, 'lily': None,
              'ness': None, 'ogen': None, 'ward': None, 'wise': None,
              'ying': cond_b, 'yish': None, 'acy': None, 'age': cond_b,
              'aic': None, 'als': cond_bb, 'ant': cond_b, 'ars': cond_o,
              'ary': cond_f, 'ata': None, 'ate': None, 'eal': cond_y,
              'ear': cond_y, 'ely': cond_e, 'ene': cond_e, 'ent': cond_c,
              'ery': cond_e, 'ese': None, 'ful': None, 'ial': None,
              'ian': None, 'ics': None, 'ide': cond_l, 'ied': None,
              'ier': None, 'ies': cond_p, 'ily': None, 'ine': cond_m,
              'ing': cond_n, 'ion': cond_q, 'ish': cond_c, 'ism': cond_b,
              'ist': None, 'ite': cond_aa, 'ity': None, 'ium': None,
              'ive': None, 'ize': cond_f, 'oid': None, 'one': cond_r,
              'ous': None, 'ae': None, 'al': cond_bb, 'ar': cond_x,
              'as': cond_b, 'ed': cond_e, 'en': cond_f, 'es': cond_e,
              'ia': None, 'ic': None, 'is': None, 'ly': cond_b,
              'on': cond_s, 'or': cond_t, 'um': cond_u, 'us': cond_v,
              'yl': cond_r, '\'s': None, 's\'': None, 'a': None,
              'e': None, 'i': None, 'o': None, 's': cond_w, 'y': cond_b}

    # Phase 1: remove the longest matching suffix whose condition holds,
    # leaving a stem of at least 2 letters.
    for suffix_len in range(11, 0, -1):
        ending = word[-suffix_len:]
        if (ending in suffix and
                len(word)-suffix_len >= 2 and
                (suffix[ending] is None or
                 suffix[ending](word, suffix_len))):
            word = word[:-suffix_len]
            break

    def recode9(stem):
        """Return Lovins' conditional recode rule 9."""
        if stem[-3:-2] in {'a', 'i', 'o'}:
            return stem
        return stem[:-2]+'l'

    def recode24(stem):
        """Return Lovins' conditional recode rule 24."""
        if stem[-4:-3] == 's':
            return stem
        return stem[:-1]+'s'

    def recode28(stem):
        """Return Lovins' conditional recode rule 28."""
        if stem[-4:-3] in {'p', 't'}:
            return stem
        return stem[:-1]+'s'

    def recode30(stem):
        """Return Lovins' conditional recode rule 30."""
        if stem[-4:-3] == 'm':
            return stem
        return stem[:-1]+'s'

    def recode32(stem):
        """Return Lovins' conditional recode rule 32."""
        if stem[-3:-2] == 'n':
            return stem
        return stem[:-1]+'s'

    # Phase 2: undouble a trailing doubled consonant.
    if word[-2:] in {'bb', 'dd', 'gg', 'll', 'mm', 'nn', 'pp', 'rr', 'ss',
                     'tt'}:
        word = word[:-1]

    # Phase 3: recoding rules; a callable replacement is a conditional rule.
    recode = (('iev', 'ief'),
              ('uct', 'uc'),
              ('umpt', 'um'),
              ('rpt', 'rb'),
              ('urs', 'ur'),
              ('istr', 'ister'),
              ('metr', 'meter'),
              ('olv', 'olut'),
              ('ul', recode9),
              ('bex', 'bic'),
              ('dex', 'dic'),
              ('pex', 'pic'),
              ('tex', 'tic'),
              ('ax', 'ac'),
              ('ex', 'ec'),
              ('ix', 'ic'),
              ('lux', 'luc'),
              ('uad', 'uas'),
              ('vad', 'vas'),
              ('cid', 'cis'),
              ('lid', 'lis'),
              ('erid', 'eris'),
              ('pand', 'pans'),
              ('end', recode24),
              ('ond', 'ons'),
              ('lud', 'lus'),
              ('rud', 'rus'),
              ('her', recode28),
              ('mit', 'mis'),
              ('ent', recode30),
              ('ert', 'ers'),
              ('et', recode32),
              ('yt', 'ys'),
              ('yz', 'ys'))

    # NB: deliberately no break -- later rules are checked against the
    # already-recoded word, matching the published behavior.
    for ending, replacement in recode:
        if word.endswith(ending):
            if callable(replacement):
                word = replacement(word)
            else:
                word = word[:-len(ending)] + replacement

    return word
375
376
377
def _m_degree(term, vowels):
378
    """Return Porter helper function _m_degree value.
379
380
    m-degree is equal to the number of V to C transitions
381
382
    :param term: the word for which to calculate the m-degree
383
    :param vowels: the set of vowels in the language
384
    :returns: the m-degree as defined in the Porter stemmer definition
385
    """
386
    mdeg = 0
387
    last_was_vowel = False
388
    for letter in term:
389
        if letter in vowels:
390
            last_was_vowel = True
391
        else:
392
            if last_was_vowel:
393
                mdeg += 1
394
            last_was_vowel = False
395
    return mdeg
396
397
398
def _sb_has_vowel(term, vowels):
399
    """Return Porter helper function _sb_has_vowel value.
400
401
    :param term: the word to scan for vowels
402
    :param vowels: the set of vowels in the language
403
    :returns: true iff a vowel exists in the term (as defined in the Porter
404
        stemmer definition)
405
    """
406
    for letter in term:
407
        if letter in vowels:
408
            return True
409
    return False
410
411
412
def _ends_in_doubled_cons(term, vowels):
413
    """Return Porter helper function _ends_in_doubled_cons value.
414
415
    :param term: the word to check for a final doubled consonant
416
    :param vowels: the set of vowels in the language
417
    :returns: true iff the stem ends in a doubled consonant (as defined in the
418
        Porter stemmer definition)
419
    """
420
    if len(term) > 1 and term[-1] not in vowels and term[-2] == term[-1]:
421
        return True
422
    return False
423
424
425
def _ends_in_cvc(term, vowels):
426
    """Return Porter helper function _ends_in_cvc value.
427
428
    :param term: the word to scan for cvc
429
    :param vowels: the set of vowels in the language
430
    :returns: true iff the stem ends in cvc (as defined in the Porter stemmer
431
        definition)
432
    """
433
    if len(term) > 2 and (term[-1] not in vowels and
434
                          term[-2] in vowels and
435
                          term[-3] not in vowels and
436
                          term[-1] not in tuple('wxY')):
437
        return True
438
    return False
439
440
441
def porter(word, early_english=False):
    """Return Porter stem.

    The Porter stemmer is defined at:
    http://snowball.tartarus.org/algorithms/porter/stemmer.html

    :param word: the word to calculate the stem of
    :param early_english: set to True in order to remove -eth & -est (2nd & 3rd
        person singular verbal agreement suffixes)
    :returns: word stem
    :rtype: str

    >>> porter('reading')
    'read'
    >>> porter('suspension')
    'suspens'
    >>> porter('elusiveness')
    'elus'

    >>> porter('eateth', early_english=True)
    'eat'
    """
    # pylint: disable=too-many-branches

    # lowercase, normalize, and compose
    word = unicodedata.normalize('NFC', text_type(word.lower()))

    # Return word unstemmed if it is shorter than 3 characters
    if len(word) < 3:
        return word

    _vowels = {'a', 'e', 'i', 'o', 'u', 'y'}
    # Re-map consonantal y to Y (Y will be C, y will be V)
    # NB: this must be applied left to right, since remapping a 'y' to 'Y'
    # makes an immediately following 'y' no longer vowel-preceded; mutating
    # a list avoids rebuilding the string on every hit.
    if word[0] == 'y':
        word = 'Y' + word[1:]
    chars = list(word)
    for i in range(1, len(chars)):
        if chars[i] == 'y' and chars[i-1] in _vowels:
            chars[i] = 'Y'
    word = ''.join(chars)

    # Step 1a: plural reduction
    if word[-1] == 's':
        if word[-4:] == 'sses':
            word = word[:-2]
        elif word[-3:] == 'ies':
            word = word[:-2]
        elif word[-2:] == 'ss':
            pass
        else:
            word = word[:-1]

    # Step 1b: -eed/-ed/-ing (and optionally -est/-eth) removal
    step1b_flag = False
    if word[-3:] == 'eed':
        if _m_degree(word[:-3], _vowels) > 0:
            word = word[:-1]
    elif word[-2:] == 'ed':
        if _sb_has_vowel(word[:-2], _vowels):
            word = word[:-2]
            step1b_flag = True
    elif word[-3:] == 'ing':
        if _sb_has_vowel(word[:-3], _vowels):
            word = word[:-3]
            step1b_flag = True
    elif early_english:
        if word[-3:] == 'est':
            if _sb_has_vowel(word[:-3], _vowels):
                word = word[:-3]
                step1b_flag = True
        elif word[-3:] == 'eth':
            if _sb_has_vowel(word[:-3], _vowels):
                word = word[:-3]
                step1b_flag = True

    # cleanup after a step-1b removal: restore -e or undouble a consonant
    if step1b_flag:
        if word[-2:] in {'at', 'bl', 'iz'}:
            word += 'e'
        elif (_ends_in_doubled_cons(word, _vowels) and
              word[-1] not in {'l', 's', 'z'}):
            word = word[:-1]
        elif _m_degree(word, _vowels) == 1 and _ends_in_cvc(word, _vowels):
            word += 'e'

    # Step 1c: terminal y -> i
    if word[-1] in {'Y', 'y'} and _sb_has_vowel(word[:-1], _vowels):
        word = word[:-1] + 'i'

    # Step 2: double-suffix reduction, dispatched on the penult letter
    if len(word) > 1:
        if word[-2] == 'a':
            if word[-7:] == 'ational':
                if _m_degree(word[:-7], _vowels) > 0:
                    word = word[:-5] + 'e'
            elif word[-6:] == 'tional':
                if _m_degree(word[:-6], _vowels) > 0:
                    word = word[:-2]
        elif word[-2] == 'c':
            if word[-4:] in {'enci', 'anci'}:
                if _m_degree(word[:-4], _vowels) > 0:
                    word = word[:-1] + 'e'
        elif word[-2] == 'e':
            if word[-4:] == 'izer':
                if _m_degree(word[:-4], _vowels) > 0:
                    word = word[:-1]
        elif word[-2] == 'g':
            if word[-4:] == 'logi':
                if _m_degree(word[:-4], _vowels) > 0:
                    word = word[:-1]
        elif word[-2] == 'l':
            if word[-3:] == 'bli':
                if _m_degree(word[:-3], _vowels) > 0:
                    word = word[:-1] + 'e'
            elif word[-4:] == 'alli':
                if _m_degree(word[:-4], _vowels) > 0:
                    word = word[:-2]
            elif word[-5:] == 'entli':
                if _m_degree(word[:-5], _vowels) > 0:
                    word = word[:-2]
            elif word[-3:] == 'eli':
                if _m_degree(word[:-3], _vowels) > 0:
                    word = word[:-2]
            elif word[-5:] == 'ousli':
                if _m_degree(word[:-5], _vowels) > 0:
                    word = word[:-2]
        elif word[-2] == 'o':
            if word[-7:] == 'ization':
                if _m_degree(word[:-7], _vowels) > 0:
                    word = word[:-5] + 'e'
            elif word[-5:] == 'ation':
                if _m_degree(word[:-5], _vowels) > 0:
                    word = word[:-3] + 'e'
            elif word[-4:] == 'ator':
                if _m_degree(word[:-4], _vowels) > 0:
                    word = word[:-2] + 'e'
        elif word[-2] == 's':
            if word[-5:] == 'alism':
                if _m_degree(word[:-5], _vowels) > 0:
                    word = word[:-3]
            elif word[-7:] in {'iveness', 'fulness', 'ousness'}:
                if _m_degree(word[:-7], _vowels) > 0:
                    word = word[:-4]
        elif word[-2] == 't':
            if word[-5:] == 'aliti':
                if _m_degree(word[:-5], _vowels) > 0:
                    word = word[:-3]
            elif word[-5:] == 'iviti':
                if _m_degree(word[:-5], _vowels) > 0:
                    word = word[:-3] + 'e'
            elif word[-6:] == 'biliti':
                if _m_degree(word[:-6], _vowels) > 0:
                    word = word[:-5] + 'le'

    # Step 3: -ic-, -full, -ness etc. reduction
    if word[-5:] == 'icate':
        if _m_degree(word[:-5], _vowels) > 0:
            word = word[:-3]
    elif word[-5:] == 'ative':
        if _m_degree(word[:-5], _vowels) > 0:
            word = word[:-5]
    elif word[-5:] in {'alize', 'iciti'}:
        if _m_degree(word[:-5], _vowels) > 0:
            word = word[:-3]
    elif word[-4:] == 'ical':
        if _m_degree(word[:-4], _vowels) > 0:
            word = word[:-2]
    elif word[-3:] == 'ful':
        if _m_degree(word[:-3], _vowels) > 0:
            word = word[:-3]
    elif word[-4:] == 'ness':
        if _m_degree(word[:-4], _vowels) > 0:
            word = word[:-4]

    # Step 4: remove remaining suffixes when m > 1
    if word[-2:] == 'al':
        if _m_degree(word[:-2], _vowels) > 1:
            word = word[:-2]
    elif word[-4:] == 'ance':
        if _m_degree(word[:-4], _vowels) > 1:
            word = word[:-4]
    elif word[-4:] == 'ence':
        if _m_degree(word[:-4], _vowels) > 1:
            word = word[:-4]
    elif word[-2:] == 'er':
        if _m_degree(word[:-2], _vowels) > 1:
            word = word[:-2]
    elif word[-2:] == 'ic':
        if _m_degree(word[:-2], _vowels) > 1:
            word = word[:-2]
    elif word[-4:] == 'able':
        if _m_degree(word[:-4], _vowels) > 1:
            word = word[:-4]
    elif word[-4:] == 'ible':
        if _m_degree(word[:-4], _vowels) > 1:
            word = word[:-4]
    elif word[-3:] == 'ant':
        if _m_degree(word[:-3], _vowels) > 1:
            word = word[:-3]
    elif word[-5:] == 'ement':
        if _m_degree(word[:-5], _vowels) > 1:
            word = word[:-5]
    elif word[-4:] == 'ment':
        if _m_degree(word[:-4], _vowels) > 1:
            word = word[:-4]
    elif word[-3:] == 'ent':
        if _m_degree(word[:-3], _vowels) > 1:
            word = word[:-3]
    elif word[-4:] in {'sion', 'tion'}:
        # only -ion is removed; the preceding s/t stays with the stem
        if _m_degree(word[:-3], _vowels) > 1:
            word = word[:-3]
    elif word[-2:] == 'ou':
        if _m_degree(word[:-2], _vowels) > 1:
            word = word[:-2]
    elif word[-3:] == 'ism':
        if _m_degree(word[:-3], _vowels) > 1:
            word = word[:-3]
    elif word[-3:] == 'ate':
        if _m_degree(word[:-3], _vowels) > 1:
            word = word[:-3]
    elif word[-3:] == 'iti':
        if _m_degree(word[:-3], _vowels) > 1:
            word = word[:-3]
    elif word[-3:] == 'ous':
        if _m_degree(word[:-3], _vowels) > 1:
            word = word[:-3]
    elif word[-3:] == 'ive':
        if _m_degree(word[:-3], _vowels) > 1:
            word = word[:-3]
    elif word[-3:] == 'ize':
        if _m_degree(word[:-3], _vowels) > 1:
            word = word[:-3]

    # Step 5a: remove a final -e
    if word[-1] == 'e':
        if _m_degree(word[:-1], _vowels) > 1:
            word = word[:-1]
        elif (_m_degree(word[:-1], _vowels) == 1 and
              not _ends_in_cvc(word[:-1], _vowels)):
            word = word[:-1]

    # Step 5b: -ll -> -l
    if word[-2:] == 'll' and _m_degree(word, _vowels) > 1:
        word = word[:-1]

    # Change 'Y' back to 'y' if it survived stemming
    # (one replace() pass instead of the index loop flagged by the linter)
    return word.replace('Y', 'y')
689
690
691
def _sb_r1(term, vowels, r1_prefixes=None):
692
    """Return the R1 region, as defined in the Porter2 specification."""
693
    vowel_found = False
694
    if hasattr(r1_prefixes, '__iter__'):
695
        for prefix in r1_prefixes:
696
            if term[:len(prefix)] == prefix:
697
                return len(prefix)
698
699
    for i in range(len(term)):
0 ignored issues
show
unused-code introduced by
Consider using enumerate instead of iterating with range and len
Loading history...
700
        if not vowel_found and term[i] in vowels:
701
            vowel_found = True
702
        elif vowel_found and term[i] not in vowels:
703
            return i + 1
704
    return len(term)
705
706
707
def _sb_r2(term, vowels, r1_prefixes=None):
    """Return the R2 region, as defined in the Porter2 specification.

    R2 is simply R1 computed again, starting from the beginning of R1.
    """
    r1 = _sb_r1(term, vowels, r1_prefixes)
    return r1 + _sb_r1(term[r1:], vowels)
711
712
713
def _sb_ends_in_short_syllable(term, vowels, codanonvowels):
714
    """Return True iff term ends in a short syllable.
715
716
    (...according to the Porter2 specification.)
717
718
    NB: This is akin to the CVC test from the Porter stemmer. The description
719
    is unfortunately poor/ambiguous.
720
    """
721
    if not term:
722
        return False
723
    if len(term) == 2:
724
        if term[-2] in vowels and term[-1] not in vowels:
725
            return True
726
    elif len(term) >= 3:
727
        if ((term[-3] not in vowels and term[-2] in vowels and
728
             term[-1] in codanonvowels)):
729
            return True
730
    return False
731
732
733
def _sb_short_word(term, vowels, codanonvowels, r1_prefixes=None):
    """Return True iff term is a short word.

    (...according to the Porter2 specification: R1 is empty and the word
    ends in a short syllable.)

    :param term: the word to test
    :param vowels: the set of vowels in the language
    :param codanonvowels: consonants that may close a short syllable
    :param r1_prefixes: optional iterable of prefixes that fix R1's start
    :returns: True iff the term is a short word
    """
    # idiom: return the boolean expression directly
    return (_sb_r1(term, vowels, r1_prefixes) == len(term) and
            _sb_ends_in_short_syllable(term, vowels, codanonvowels))
742
743
744
def porter2(word, early_english=False):
    """Return the Porter2 (Snowball English) stem.

    The Porter2 (Snowball English) stemmer is defined at:
    http://snowball.tartarus.org/algorithms/english/stemmer.html

    :param word: the word to calculate the stem of
    :param early_english: set to True in order to remove -eth & -est (2nd & 3rd
        person singular verbal agreement suffixes)
    :returns: word stem
    :rtype: str

    >>> porter2('reading')
    'read'
    >>> porter2('suspension')
    'suspens'
    >>> porter2('elusiveness')
    'elus'

    >>> porter2('eateth', early_english=True)
    'eat'
    """
    # pylint: disable=too-many-branches
    # pylint: disable=too-many-return-statements

    _vowels = {'a', 'e', 'i', 'o', 'u', 'y'}
    # consonants that may validly end a "short syllable" (w, x, Y excluded)
    _codanonvowels = {"'", 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm',
                      'n', 'p', 'q', 'r', 's', 't', 'v', 'z'}
    _doubles = {'bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt'}
    # letters that may validly precede a removable -li suffix
    _li = {'c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't'}

    # R1 prefixes should be in order from longest to shortest to prevent
    # masking
    _r1_prefixes = ('commun', 'gener', 'arsen')
    _exception1dict = {  # special changes:
        'skis': 'ski', 'skies': 'sky', 'dying': 'die',
        'lying': 'lie', 'tying': 'tie',
        # special -LY cases:
        'idly': 'idl', 'gently': 'gentl', 'ugly': 'ugli',
        'early': 'earli', 'only': 'onli', 'singly': 'singl'}
    _exception1set = {'sky', 'news', 'howe', 'atlas', 'cosmos', 'bias',
                      'andes'}
    _exception2set = {'inning', 'outing', 'canning', 'herring', 'earring',
                      'proceed', 'exceed', 'succeed'}

    # lowercase, normalize, and compose
    word = unicodedata.normalize('NFC', text_type(word.lower()))
    # replace apostrophe-like characters with U+0027, per
    # http://snowball.tartarus.org/texts/apostrophe.html
    # NOTE(review): the two replace() calls below look identical; one was
    # presumably intended for a different apostrophe codepoint (e.g. U+02BC)
    # -- verify against the upstream source.
    word = word.replace('’', '\'')
    word = word.replace('’', '\'')

    # Exceptions 1: irregular forms are mapped (or returned unchanged)
    if word in _exception1dict:
        return _exception1dict[word]
    elif word in _exception1set:
        return word

    # Return word if stem is shorter than 3
    if len(word) < 3:
        return word

    # Remove initial ', if present.
    while word and word[0] == '\'':
        word = word[1:]
        # Return word if stem is shorter than 2
        if len(word) < 2:
            return word
    # Re-map vocalic Y to y (Y will be C, y will be V)
    if word[0] == 'y':
        word = 'Y' + word[1:]
    for i in range(1, len(word)):
        if word[i] == 'y' and word[i-1] in _vowels:
            word = word[:i] + 'Y' + word[i+1:]

    # R1/R2 are fixed here, before any suffix removal; the later
    # len(word[r1_start:]) >= n tests check that a suffix lies inside R1/R2
    r1_start = _sb_r1(word, _vowels, _r1_prefixes)
    r2_start = _sb_r2(word, _vowels, _r1_prefixes)

    # Step 0: strip possessive endings ('s', 's, ')
    if word[-3:] == '\'s\'':
        word = word[:-3]
    elif word[-2:] == '\'s':
        word = word[:-2]
    elif word[-1:] == '\'':
        word = word[:-1]
    # Return word if stem is shorter than 3
    if len(word) < 3:
        return word

    # Step 1a: plural endings
    if word[-4:] == 'sses':
        word = word[:-2]
    elif word[-3:] in {'ied', 'ies'}:
        if len(word) > 4:
            word = word[:-2]
        else:
            word = word[:-1]
    elif word[-2:] in {'us', 'ss'}:
        # -us and -ss are deliberately left unchanged
        pass
    elif word[-1] == 's':
        # word[:-2] excludes the letter immediately before the 's'
        # (Porter2: delete if there is a vowel not immediately before it)
        if _sb_has_vowel(word[:-2], _vowels):
            word = word[:-1]

    # Exceptions 2: these words pass through unchanged after step 1a
    if word in _exception2set:
        return word

    # Step 1b: verbal endings (-ed, -ing, ...); step1b_flag marks that a
    # bare verbal suffix was removed and the stem may need repair below
    step1b_flag = False
    if word[-5:] == 'eedly':
        if len(word[r1_start:]) >= 5:
            word = word[:-3]
    elif word[-5:] == 'ingly':
        if _sb_has_vowel(word[:-5], _vowels):
            word = word[:-5]
            step1b_flag = True
    elif word[-4:] == 'edly':
        if _sb_has_vowel(word[:-4], _vowels):
            word = word[:-4]
            step1b_flag = True
    elif word[-3:] == 'eed':
        if len(word[r1_start:]) >= 3:
            word = word[:-1]
    elif word[-3:] == 'ing':
        if _sb_has_vowel(word[:-3], _vowels):
            word = word[:-3]
            step1b_flag = True
    elif word[-2:] == 'ed':
        if _sb_has_vowel(word[:-2], _vowels):
            word = word[:-2]
            step1b_flag = True
    elif early_english:
        # archaic 2nd/3rd person singular endings, only on request
        if word[-3:] == 'est':
            if _sb_has_vowel(word[:-3], _vowels):
                word = word[:-3]
                step1b_flag = True
        elif word[-3:] == 'eth':
            if _sb_has_vowel(word[:-3], _vowels):
                word = word[:-3]
                step1b_flag = True

    if step1b_flag:
        # repair the stem after suffix removal: restore -e, undouble, or
        # add -e to a short word
        if word[-2:] in {'at', 'bl', 'iz'}:
            word += 'e'
        elif word[-2:] in _doubles:
            word = word[:-1]
        elif _sb_short_word(word, _vowels, _codanonvowels, _r1_prefixes):
            word += 'e'

    # Step 1c: final y/Y after a consonant becomes i
    if ((len(word) > 2 and word[-1] in {'Y', 'y'} and
         word[-2] not in _vowels)):
        word = word[:-1] + 'i'

    # Step 2: suffix replacements, branched on the penultimate letter to
    # narrow the comparisons
    if word[-2] == 'a':
        if word[-7:] == 'ational':
            if len(word[r1_start:]) >= 7:
                word = word[:-5] + 'e'
        elif word[-6:] == 'tional':
            if len(word[r1_start:]) >= 6:
                word = word[:-2]
    elif word[-2] == 'c':
        if word[-4:] in {'enci', 'anci'}:
            if len(word[r1_start:]) >= 4:
                word = word[:-1] + 'e'
    elif word[-2] == 'e':
        if word[-4:] == 'izer':
            if len(word[r1_start:]) >= 4:
                word = word[:-1]
    elif word[-2] == 'g':
        if word[-3:] == 'ogi':
            if ((r1_start >= 1 and len(word[r1_start:]) >= 3 and
                 word[-4] == 'l')):
                word = word[:-1]
    elif word[-2] == 'l':
        if word[-6:] == 'lessli':
            if len(word[r1_start:]) >= 6:
                word = word[:-2]
        elif word[-5:] in {'entli', 'fulli', 'ousli'}:
            if len(word[r1_start:]) >= 5:
                word = word[:-2]
        elif word[-4:] == 'abli':
            if len(word[r1_start:]) >= 4:
                word = word[:-1] + 'e'
        elif word[-4:] == 'alli':
            if len(word[r1_start:]) >= 4:
                word = word[:-2]
        elif word[-3:] == 'bli':
            if len(word[r1_start:]) >= 3:
                word = word[:-1] + 'e'
        elif word[-2:] == 'li':
            # -li is removed only after a valid li-ending letter
            if ((r1_start >= 1 and len(word[r1_start:]) >= 2 and
                 word[-3] in _li)):
                word = word[:-2]
    elif word[-2] == 'o':
        if word[-7:] == 'ization':
            if len(word[r1_start:]) >= 7:
                word = word[:-5] + 'e'
        elif word[-5:] == 'ation':
            if len(word[r1_start:]) >= 5:
                word = word[:-3] + 'e'
        elif word[-4:] == 'ator':
            if len(word[r1_start:]) >= 4:
                word = word[:-2] + 'e'
    elif word[-2] == 's':
        if word[-7:] in {'fulness', 'ousness', 'iveness'}:
            if len(word[r1_start:]) >= 7:
                word = word[:-4]
        elif word[-5:] == 'alism':
            if len(word[r1_start:]) >= 5:
                word = word[:-3]
    elif word[-2] == 't':
        if word[-6:] == 'biliti':
            if len(word[r1_start:]) >= 6:
                word = word[:-5] + 'le'
        elif word[-5:] == 'aliti':
            if len(word[r1_start:]) >= 5:
                word = word[:-3]
        elif word[-5:] == 'iviti':
            if len(word[r1_start:]) >= 5:
                word = word[:-3] + 'e'

    # Step 3: further derivational suffixes, longest match first
    if word[-7:] == 'ational':
        if len(word[r1_start:]) >= 7:
            word = word[:-5] + 'e'
    elif word[-6:] == 'tional':
        if len(word[r1_start:]) >= 6:
            word = word[:-2]
    elif word[-5:] in {'alize', 'icate', 'iciti'}:
        if len(word[r1_start:]) >= 5:
            word = word[:-3]
    elif word[-5:] == 'ative':
        # note: -ative requires R2, not R1
        if len(word[r2_start:]) >= 5:
            word = word[:-5]
    elif word[-4:] == 'ical':
        if len(word[r1_start:]) >= 4:
            word = word[:-2]
    elif word[-4:] == 'ness':
        if len(word[r1_start:]) >= 4:
            word = word[:-4]
    elif word[-3:] == 'ful':
        if len(word[r1_start:]) >= 3:
            word = word[:-3]

    # Step 4: residual suffixes deleted in R2, tried longest first
    for suffix in ('ement', 'ance', 'ence', 'able', 'ible', 'ment', 'ant',
                   'ent', 'ism', 'ate', 'iti', 'ous', 'ive', 'ize', 'al', 'er',
                   'ic'):
        if word[-len(suffix):] == suffix:
            if len(word[r2_start:]) >= len(suffix):
                word = word[:-len(suffix)]
            break
    else:
        # for-else: only reached when no suffix in the tuple matched
        if word[-3:] == 'ion':
            if ((len(word[r2_start:]) >= 3 and len(word) >= 4 and
                 word[-4] in tuple('st'))):
                word = word[:-3]

    # Step 5: final -e / -l deletion
    if word[-1] == 'e':
        if (len(word[r2_start:]) >= 1 or
                (len(word[r1_start:]) >= 1 and
                 not _sb_ends_in_short_syllable(word[:-1], _vowels,
                                                _codanonvowels))):
            word = word[:-1]
    elif word[-1] == 'l':
        if len(word[r2_start:]) >= 1 and word[-2] == 'l':
            word = word[:-1]

    # Change 'Y' back to 'y' if it survived stemming
    for i in range(0, len(word)):
        if word[i] == 'Y':
            word = word[:i] + 'y' + word[i+1:]

    return word
1022
1023
1024
def sb_german(word, alternate_vowels=False):
    """Return Snowball German stem.

    The Snowball German stemmer is defined at:
    http://snowball.tartarus.org/algorithms/german/stemmer.html

    :param word: the word to calculate the stem of
    :param alternate_vowels: composes ae as ä, oe as ö, and ue as ü before
        running the algorithm
    :returns: word stem
    :rtype: str

    >>> sb_german('lesen')
    'les'
    >>> sb_german('graues')
    'grau'
    >>> sb_german('buchstabieren')
    'buchstabi'
    """
    # pylint: disable=too-many-branches

    _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'ä', 'ö', 'ü'}
    # letters that may precede a deletable final -s
    _s_endings = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 'r', 't'}
    # letters that may precede a deletable final -st
    _st_endings = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 't'}

    # lowercase, normalize, and compose
    word = unicodedata.normalize('NFC', word.lower())
    word = word.replace('ß', 'ss')

    # Mark u and y between vowels uppercase (U/Y) so they are treated as
    # consonants (the uppercase forms are not in _vowels); they are
    # lowered again after stemming.
    if len(word) > 2:
        for i in range(2, len(word)):
            if word[i] in _vowels and word[i-2] in _vowels:
                if word[i-1] == 'u':
                    word = word[:i-1] + 'U' + word[i:]
                elif word[i-1] == 'y':
                    word = word[:i-1] + 'Y' + word[i:]

    if alternate_vowels:
        word = word.replace('ae', 'ä')
        word = word.replace('oe', 'ö')
        # 'que' is temporarily hidden as 'Q' so its 'ue' is not composed
        # to 'ü', then restored afterwards
        word = word.replace('que', 'Q')
        word = word.replace('ue', 'ü')
        word = word.replace('Q', 'que')

    # R1 must begin no earlier than position 3
    r1_start = max(3, _sb_r1(word, _vowels))
    r2_start = _sb_r2(word, _vowels)

    # Step 1: declensional endings; niss_flag records removal of an
    # ending after which a remaining 'niss' drops its final s
    niss_flag = False
    if word[-3:] == 'ern':
        if len(word[r1_start:]) >= 3:
            word = word[:-3]
    elif word[-2:] == 'em':
        if len(word[r1_start:]) >= 2:
            word = word[:-2]
    elif word[-2:] == 'er':
        if len(word[r1_start:]) >= 2:
            word = word[:-2]
    elif word[-2:] == 'en':
        if len(word[r1_start:]) >= 2:
            word = word[:-2]
            niss_flag = True
    elif word[-2:] == 'es':
        if len(word[r1_start:]) >= 2:
            word = word[:-2]
            niss_flag = True
    elif word[-1:] == 'e':
        if len(word[r1_start:]) >= 1:
            word = word[:-1]
            niss_flag = True
    elif word[-1:] == 's':
        if ((len(word[r1_start:]) >= 1 and len(word) >= 2 and
             word[-2] in _s_endings)):
            word = word[:-1]

    if niss_flag and word[-4:] == 'niss':
        word = word[:-1]

    # Step 2: verbal/comparative endings
    if word[-3:] == 'est':
        if len(word[r1_start:]) >= 3:
            word = word[:-3]
    elif word[-2:] == 'en':
        if len(word[r1_start:]) >= 2:
            word = word[:-2]
    elif word[-2:] == 'er':
        if len(word[r1_start:]) >= 2:
            word = word[:-2]
    elif word[-2:] == 'st':
        if ((len(word[r1_start:]) >= 2 and len(word) >= 6 and
             word[-3] in _st_endings)):
            word = word[:-2]

    # Step 3: derivational suffixes (checked against R2)
    if word[-4:] == 'isch':
        if len(word[r2_start:]) >= 4 and word[-5] != 'e':
            word = word[:-4]
    elif word[-4:] in {'lich', 'heit'}:
        if len(word[r2_start:]) >= 4:
            word = word[:-4]
            if ((word[-2:] in {'er', 'en'} and
                 len(word[r1_start:]) >= 2)):
                word = word[:-2]
    elif word[-4:] == 'keit':
        if len(word[r2_start:]) >= 4:
            word = word[:-4]
            if word[-4:] == 'lich' and len(word[r2_start:]) >= 4:
                word = word[:-4]
            elif word[-2:] == 'ig' and len(word[r2_start:]) >= 2:
                word = word[:-2]
    elif word[-3:] in {'end', 'ung'}:
        if len(word[r2_start:]) >= 3:
            word = word[:-3]
            if ((word[-2:] == 'ig' and len(word[r2_start:]) >= 2 and
                 word[-3] != 'e')):
                word = word[:-2]
    elif word[-2:] in {'ig', 'ik'}:
        if len(word[r2_start:]) >= 2 and word[-3] != 'e':
            word = word[:-2]

    # Change 'Y' and 'U' back to lowercase if survived stemming
    for i in range(0, len(word)):
        if word[i] == 'Y':
            word = word[:i] + 'y' + word[i+1:]
        elif word[i] == 'U':
            word = word[:i] + 'u' + word[i+1:]

    # Remove umlauts (ä/ö/ü -> a/o/u) via a single translate pass
    _umlauts = dict(zip((ord(_) for _ in 'äöü'), 'aou'))
    word = word.translate(_umlauts)

    return word
1156
1157
1158
def sb_dutch(word):
    """Return Snowball Dutch stem.

    The Snowball Dutch stemmer is defined at:
    http://snowball.tartarus.org/algorithms/dutch/stemmer.html

    :param word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> sb_dutch('lezen')
    'lez'
    >>> sb_dutch('opschorting')
    'opschort'
    >>> sb_dutch('ongrijpbaarheid')
    'ongrijp'
    """
    # pylint: disable=too-many-branches

    _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'è'}
    # a final -s/-se is removed only when NOT preceded by one of these
    _not_s_endings = {'a', 'e', 'i', 'j', 'o', 'u', 'y', 'è'}

    def _undouble(word):
        """Undouble endings -kk, -dd, and -tt."""
        if ((len(word) > 1 and word[-1] == word[-2] and
             word[-1] in {'d', 'k', 't'})):
            return word[:-1]
        return word

    # lowercase, normalize, decompose, filter umlauts & acutes out, and compose
    word = unicodedata.normalize('NFC', text_type(word.lower()))
    _accented = dict(zip((ord(_) for _ in 'äëïöüáéíóú'), 'aeiouaeiou'))
    word = word.translate(_accented)

    # Mark initial y, y after a vowel, and i between vowels uppercase
    # (Y/I) so they are treated as consonants; lowered again at the end.
    for i in range(len(word)):
        if i == 0 and word[0] == 'y':
            word = 'Y' + word[1:]
        elif word[i] == 'y' and word[i-1] in _vowels:
            word = word[:i] + 'Y' + word[i+1:]
        elif (word[i] == 'i' and word[i-1] in _vowels and i+1 < len(word) and
              word[i+1] in _vowels):
            word = word[:i] + 'I' + word[i+1:]

    # R1 must begin no earlier than position 3
    r1_start = max(3, _sb_r1(word, _vowels))
    r2_start = _sb_r2(word, _vowels)

    # Step 1: plural/inflectional endings
    if word[-5:] == 'heden':
        # -heden is replaced by -heid (remove 'den', append 'id')
        if len(word[r1_start:]) >= 5:
            word = word[:-3] + 'id'
    elif word[-3:] == 'ene':
        if ((len(word[r1_start:]) >= 3 and
             (word[-4] not in _vowels and word[-6:-3] != 'gem'))):
            word = _undouble(word[:-3])
    elif word[-2:] == 'en':
        if ((len(word[r1_start:]) >= 2 and
             (word[-3] not in _vowels and word[-5:-2] != 'gem'))):
            word = _undouble(word[:-2])
    elif word[-2:] == 'se':
        if len(word[r1_start:]) >= 2 and word[-3] not in _not_s_endings:
            word = word[:-2]
    elif word[-1:] == 's':
        if len(word[r1_start:]) >= 1 and word[-2] not in _not_s_endings:
            word = word[:-1]

    # Step 2: delete final -e after a consonant; remember this for -bar
    e_removed = False
    if word[-1:] == 'e':
        if len(word[r1_start:]) >= 1 and word[-2] not in _vowels:
            word = _undouble(word[:-1])
            e_removed = True

    # Step 3a: -heid (unless preceded by c), then a following -en
    if word[-4:] == 'heid':
        if len(word[r2_start:]) >= 4 and word[-5] != 'c':
            word = word[:-4]
            if word[-2:] == 'en':
                if ((len(word[r1_start:]) >= 2 and
                     (word[-3] not in _vowels and word[-5:-2] != 'gem'))):
                    word = _undouble(word[:-2])

    # Step 3b: derivational suffixes
    if word[-4:] == 'lijk':
        if len(word[r2_start:]) >= 4:
            word = word[:-4]
            # Repeat step 2
            if word[-1:] == 'e':
                if len(word[r1_start:]) >= 1 and word[-2] not in _vowels:
                    word = _undouble(word[:-1])
    elif word[-4:] == 'baar':
        if len(word[r2_start:]) >= 4:
            word = word[:-4]
    elif word[-3:] in ('end', 'ing'):
        if len(word[r2_start:]) >= 3:
            word = word[:-3]
            if ((word[-2:] == 'ig' and len(word[r2_start:]) >= 2 and
                 word[-3] != 'e')):
                word = word[:-2]
            else:
                word = _undouble(word)
    elif word[-3:] == 'bar':
        # -bar is removed only if an -e was deleted in step 2
        if len(word[r2_start:]) >= 3 and e_removed:
            word = word[:-3]
    elif word[-2:] == 'ig':
        if len(word[r2_start:]) >= 2 and word[-3] != 'e':
            word = word[:-2]

    # Step 4: undouble vowel -- C + doubled a/e/o/u + consonant loses one
    # of the doubled vowels (but never before final I)
    if ((len(word) >= 4 and
         word[-3] == word[-2] and word[-2] in {'a', 'e', 'o', 'u'} and
         word[-4] not in _vowels and
         word[-1] not in _vowels and word[-1] != 'I')):
        word = word[:-2] + word[-1]

    # Change 'Y' and 'I' back to lowercase if survived stemming
    for i in range(0, len(word)):
        if word[i] == 'Y':
            word = word[:i] + 'y' + word[i+1:]
        elif word[i] == 'I':
            word = word[:i] + 'i' + word[i+1:]

    return word
1280
1281
1282
def sb_norwegian(word):
    """Return Snowball Norwegian stem.

    The Snowball Norwegian stemmer is defined at:
    http://snowball.tartarus.org/algorithms/norwegian/stemmer.html

    :param word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> sb_norwegian('lese')
    'les'
    >>> sb_norwegian('suspensjon')
    'suspensjon'
    >>> sb_norwegian('sikkerhet')
    'sikker'
    """
    _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'å', 'æ', 'ø'}
    # letters that may precede a deletable final -s
    _s_endings = {'b', 'c', 'd', 'f', 'g', 'h', 'j', 'l', 'm', 'n', 'o', 'p',
                  'r', 't', 'v', 'y', 'z'}
    # lowercase, normalize, and compose
    word = unicodedata.normalize('NFC', text_type(word.lower()))

    # R1 starts no earlier than position 3 and no later than the word end
    r1_start = min(max(3, _sb_r1(word, _vowels)), len(word))

    # Step 1: suffixes are matched within R1 but removed from the full
    # word; candidates are tried longest first
    _r1 = word[r1_start:]
    if _r1[-7:] == 'hetenes':
        word = word[:-7]
    elif _r1[-6:] in {'hetene', 'hetens'}:
        word = word[:-6]
    elif _r1[-5:] in {'heten', 'heter', 'endes'}:
        word = word[:-5]
    elif _r1[-4:] in {'ande', 'ende', 'edes', 'enes', 'erte'}:
        if word[-4:] == 'erte':
            # -erte is mapped to -er rather than deleted
            word = word[:-2]
        else:
            word = word[:-4]
    elif _r1[-3:] in {'ede', 'ane', 'ene', 'ens', 'ers', 'ets', 'het', 'ast',
                      'ert'}:
        if word[-3:] == 'ert':
            # -ert is mapped to -er rather than deleted
            word = word[:-1]
        else:
            word = word[:-3]
    elif _r1[-2:] in {'en', 'ar', 'er', 'as', 'es', 'et'}:
        word = word[:-2]
    elif _r1[-1:] in {'a', 'e'}:
        word = word[:-1]
    elif _r1[-1:] == 's':
        # -s is dropped only after a valid s-ending, or after k preceded
        # by a non-vowel
        if (((len(word) > 1 and word[-2] in _s_endings) or
             (len(word) > 2 and word[-2] == 'k' and word[-3] not in _vowels))):
            word = word[:-1]

    # Step 2: drop the final t of -dt / -vt (when in R1)
    if word[r1_start:][-2:] in {'dt', 'vt'}:
        word = word[:-1]

    # Step 3: derivational suffixes, longest first
    _r1 = word[r1_start:]
    if _r1[-7:] == 'hetslov':
        word = word[:-7]
    elif _r1[-4:] in {'eleg', 'elig', 'elov', 'slov'}:
        word = word[:-4]
    elif _r1[-3:] in {'leg', 'eig', 'lig', 'els', 'lov'}:
        word = word[:-3]
    elif _r1[-2:] == 'ig':
        word = word[:-2]

    return word
1351
1352
1353
def sb_swedish(word):
    """Return Snowball Swedish stem.

    The Snowball Swedish stemmer is defined at:
    http://snowball.tartarus.org/algorithms/swedish/stemmer.html

    :param word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> sb_swedish('undervisa')
    'undervis'
    >>> sb_swedish('suspension')
    'suspension'
    >>> sb_swedish('visshet')
    'viss'
    """
    _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'ä', 'å', 'ö'}
    # letters that may precede a deletable final -s
    _s_endings = {'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n',
                  'o', 'p', 'r', 't', 'v', 'y'}

    # lowercase, normalize, and compose
    word = unicodedata.normalize('NFC', text_type(word.lower()))

    # R1 starts no earlier than position 3 and no later than the word end
    r1_start = min(max(3, _sb_r1(word, _vowels)), len(word))

    # Step 1: suffixes are matched within R1 but removed from the full
    # word; candidates are tried longest first
    _r1 = word[r1_start:]
    if _r1[-7:] == 'heterna':
        word = word[:-7]
    elif _r1[-6:] == 'hetens':
        word = word[:-6]
    elif _r1[-5:] in {'anden', 'heten', 'heter', 'arnas', 'ernas', 'ornas',
                      'andes', 'arens', 'andet'}:
        word = word[:-5]
    elif _r1[-4:] in {'arna', 'erna', 'orna', 'ande', 'arne', 'aste', 'aren',
                      'ades', 'erns'}:
        word = word[:-4]
    elif _r1[-3:] in {'ade', 'are', 'ern', 'ens', 'het', 'ast'}:
        word = word[:-3]
    elif _r1[-2:] in {'ad', 'en', 'ar', 'er', 'or', 'as', 'es', 'at'}:
        word = word[:-2]
    elif _r1[-1:] in {'a', 'e'}:
        word = word[:-1]
    elif _r1[-1:] == 's':
        # -s is dropped only after a valid s-ending
        if len(word) > 1 and word[-2] in _s_endings:
            word = word[:-1]

    # Step 2: shorten a final double-consonant cluster by one letter
    if word[r1_start:][-2:] in {'dd', 'gd', 'nn', 'dt', 'gt', 'kt', 'tt'}:
        word = word[:-1]

    # Step 3: derivational suffixes; -fullt/-löst only lose the final t
    _r1 = word[r1_start:]
    if _r1[-5:] == 'fullt':
        word = word[:-1]
    elif _r1[-4:] == 'löst':
        word = word[:-1]
    elif _r1[-3:] in {'lig', 'els'}:
        word = word[:-3]
    elif _r1[-2:] == 'ig':
        word = word[:-2]

    return word
1417
1418
1419
def sb_danish(word):
    """Return Snowball Danish stem.

    The Snowball Danish stemmer is defined at:
    http://snowball.tartarus.org/algorithms/danish/stemmer.html

    :param word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> sb_danish('underviser')
    'undervis'
    >>> sb_danish('suspension')
    'suspension'
    >>> sb_danish('sikkerhed')
    'sikker'
    """
    _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'å', 'æ', 'ø'}
    # letters that may precede a deletable final -s
    _s_endings = {'a', 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n',
                  'o', 'p', 'r', 't', 'v', 'y', 'z', 'å'}

    # lowercase, normalize, and compose
    word = unicodedata.normalize('NFC', text_type(word.lower()))

    # R1 starts no earlier than position 3 and no later than the word end
    r1_start = min(max(3, _sb_r1(word, _vowels)), len(word))

    # Step 1: suffixes are matched within R1 but removed from the full
    # word; candidates are tried longest first
    _r1 = word[r1_start:]
    if _r1[-7:] == 'erendes':
        word = word[:-7]
    elif _r1[-6:] in {'erende', 'hedens'}:
        word = word[:-6]
    elif _r1[-5:] in {'ethed', 'erede', 'heden', 'heder', 'endes', 'ernes',
                      'erens', 'erets'}:
        word = word[:-5]
    elif _r1[-4:] in {'ered', 'ende', 'erne', 'eren', 'erer', 'heds', 'enes',
                      'eres', 'eret'}:
        word = word[:-4]
    elif _r1[-3:] in {'hed', 'ene', 'ere', 'ens', 'ers', 'ets'}:
        word = word[:-3]
    elif _r1[-2:] in {'en', 'er', 'es', 'et'}:
        word = word[:-2]
    elif _r1[-1:] == 'e':
        word = word[:-1]
    elif _r1[-1:] == 's':
        # -s is dropped only after a valid s-ending
        if len(word) > 1 and word[-2] in _s_endings:
            word = word[:-1]

    # Step 2: shorten a final consonant cluster by one letter
    if word[r1_start:][-2:] in {'gd', 'dt', 'gt', 'kt'}:
        word = word[:-1]

    # Step 3: -igst loses its -st (tested on the whole word)
    if word[-4:] == 'igst':
        word = word[:-2]

    # then derivational suffixes within R1; removing one of these
    # triggers a repeat of step 2
    _r1 = word[r1_start:]
    repeat_step2 = False
    if _r1[-4:] == 'elig':
        word = word[:-4]
        repeat_step2 = True
    elif _r1[-4:] == 'løst':
        # -løst only loses its final t
        word = word[:-1]
    elif _r1[-3:] in {'lig', 'els'}:
        word = word[:-3]
        repeat_step2 = True
    elif _r1[-2:] == 'ig':
        word = word[:-2]
        repeat_step2 = True

    if repeat_step2:
        if word[r1_start:][-2:] in {'gd', 'dt', 'gt', 'kt'}:
            word = word[:-1]

    # Step 4: undouble a final double consonant (when its tail is in R1)
    if ((len(word[r1_start:]) >= 1 and len(word) >= 2 and
         word[-1] == word[-2] and word[-1] not in _vowels)):
        word = word[:-1]

    return word
1499
1500
1501
def clef_german(word):
    """Return CLEF German stem.

    The CLEF German stemmer is defined at:
    http://members.unine.ch/jacques.savoy/clef/germanStemmer.txt

    :param word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> clef_german('lesen')
    'lese'
    >>> clef_german('graues')
    'grau'
    >>> clef_german('buchstabieren')
    'buchstabier'
    """
    # lowercase, normalize, and compose
    word = unicodedata.normalize('NFC', text_type(word.lower()))

    # fold umlauted vowels to their plain counterparts in one pass
    word = word.translate(dict(zip((ord(ch) for ch in 'äöü'), 'aou')))

    # strip plural endings, longest candidates first; the length guards
    # keep very short words untouched
    last = len(word) - 1
    if last > 3:
        if last > 5 and word.endswith('nen'):
            return word[:-3]
        if last > 4 and word.endswith(('en', 'se', 'es', 'er')):
            return word[:-2]
        if word[-1] in {'e', 'n', 'r', 's'}:
            return word[:-1]
    return word
1538
1539
1540
def clef_german_plus(word):
    """Return 'CLEF German stemmer plus' stem.

    The CLEF German stemmer plus is defined at:
    http://members.unine.ch/jacques.savoy/clef/germanStemmerPlus.txt

    :param word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> clef_german_plus('lesen')
    'les'
    >>> clef_german_plus('graues')
    'grau'
    >>> clef_german_plus('buchstabieren')
    'buchstabi'
    """
    # letters that may precede a deletable -s/-st ending
    _st_ending = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 't'}

    # lowercase, normalize, and compose
    word = unicodedata.normalize('NFC', text_type(word.lower()))

    # fold accented/umlauted vowels to their plain counterparts
    word = word.translate(dict(zip((ord(ch) for ch in 'äàáâöòóôïìíîüùúû'),
                                   'aaaaooooiiiiuuuu')))

    # Step 1: declensional endings, longest first, guarded by word length
    last = len(word) - 1
    if last > 4 and word.endswith('ern'):
        word = word[:-3]
    elif last > 3 and word.endswith(('em', 'en', 'er', 'es')):
        word = word[:-2]
    elif last > 2 and (word[-1] == 'e' or
                       (word[-1] == 's' and word[-2] in _st_ending)):
        word = word[:-1]

    # Step 2: verbal/comparative endings on the result of step 1
    last = len(word) - 1
    if last > 4 and word.endswith('est'):
        word = word[:-3]
    elif last > 3 and (word.endswith(('er', 'en')) or
                       (word.endswith('st') and word[-3] in _st_ending)):
        word = word[:-2]

    return word
1586
1587
1588
def clef_swedish(word):
    """Return CLEF Swedish stem.

    The CLEF Swedish stemmer is defined at:
    http://members.unine.ch/jacques.savoy/clef/swedishStemmer.txt

    :param word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> clef_swedish('undervisa')
    'undervis'
    >>> clef_swedish('suspension')
    'suspensio'
    >>> clef_swedish('visshet')
    'viss'
    """
    last = len(word) - 1

    # Drop a final plural 's' before suffix matching.
    if last > 3 and word[-1] == 's':
        word = word[:-1]
        last -= 1

    # (minimum final index, suffix length, suffixes) tried longest-first;
    # the first rule that matches fires and ends stemming.
    rules = ((6, 5, {'elser', 'heten'}),
             (5, 4, {'arne', 'erna', 'ande', 'else', 'aste', 'orna',
                     'aren'}),
             (4, 3, {'are', 'ast', 'het'}),
             (3, 2, {'ar', 'er', 'or', 'en', 'at', 'te', 'et'}),
             (2, 1, {'a', 'e', 'n', 't'}))

    for min_index, cut, suffixes in rules:
        if last > min_index and word[-cut:] in suffixes:
            return word[:-cut]
    return word
1628
1629
1630
def caumanns(word):
    """Return Caumanns German stem.

    Jörg Caumanns' stemmer is described in his article at:
    https://refubium.fu-berlin.de/bitstream/handle/fub188/18405/tr-b-99-16.pdf

    This implementation is based on the GermanStemFilter described at:
    http://www.evelix.ch/unternehmen/Blog/evelix/2013/11/11/inner-workings-of-the-german-analyzer-in-lucene

    :param word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> caumanns('lesen')
    'les'
    >>> caumanns('graues')
    'grau'
    >>> caumanns('buchstabieren')
    'buchstabier'
    """
    if not word:
        return ''

    upper_initial = word[0].isupper()
    word = unicodedata.normalize('NFC', text_type(word.lower()))

    # # Part 2: Substitution
    # 1. Map umlauts to their base vowels & ß to ss
    word = word.translate({ord(src): dst for src, dst in zip('äöü', 'aou')})
    word = word.replace('ß', 'ss')

    # 2. Mark the second of each doubled character with '*'
    marked = word[0]
    for char in word[1:]:
        marked += '*' if marked[-1] == char else char
    word = marked

    # 3. Protect digraphs & suffix clusters behind placeholder symbols
    _placeholders = (('sch', '$'), ('ch', '§'), ('ei', '%'), ('ie', '&'),
                     ('ig', '#'), ('st', '!'))
    for plain, mark in _placeholders:
        word = word.replace(plain, mark)

    # # Part 1: Recursive Context-Free Stripping
    # 1. Strip the following suffixes repeatedly, longest first
    while len(word) > 3:
        if ((len(word) > 4 and word[-2:] in {'em', 'er'}) or
                (len(word) > 5 and word[-2:] == 'nd')):
            word = word[:-2]
        elif (word[-1] in {'e', 's', 'n'} or
              (not upper_initial and word[-1] in {'t', '!'})):
            word = word[:-1]
        else:
            break

    # Additional optimizations:
    if len(word) > 5 and word[-5:] == 'erin*':
        word = word[:-1]
    if word[-1] == 'z':
        word = word[:-1] + 'x'

    # Undo the placeholder substitutions (same order as above):
    for plain, mark in _placeholders:
        word = word.replace(mark, plain)

    # Expand '*' marks back into doubled characters
    word = word[0] + ''.join(word[i - 1] if word[i] == '*' else word[i]
                             for i in range(1, len(word)))

    # Finally, convert gege to ge
    if len(word) > 4:
        word = word.replace('gege', 'ge', 1)

    return word
1714
1715
1716
def uealite(word, max_word_length=20, max_acro_length=8, return_rule_no=False,
            var=None):
    """Return UEA-Lite stem.

    The UEA-Lite stemmer is discussed in:
    Jenkins, Marie-Claire and Dan Smith. 2005. "Conservative stemming for
    search and indexing."
    http://lemur.cmp.uea.ac.uk/Research/stemmer/stemmer25feb.pdf

    This is chiefly based on the Java implementation of the algorithm, with
    variants based on the Perl implementation and Jason Adams' Ruby port.

    Java version: http://lemur.cmp.uea.ac.uk/Research/stemmer/UEAstem.java
    Perl version: http://lemur.cmp.uea.ac.uk/Research/stemmer/UEAstem.pl
    Ruby version: https://github.com/ealdent/uea-stemmer

    :param word: the word to calculate the stem of
    :param max_word_length: the maximum word length allowed; longer words
        are returned unstemmed (rule 95)
    :param max_acro_length: the maximum acronym length allowed; only
        enforced by the 'Adams' variant (rule 96)
    :param return_rule_no: if True, returns the stem along with rule number
    :param var: variant to use (set to 'Adams' to use Jason Adams' rules,
                or 'Perl' to use the original Perl set of rules)
    :returns: word stem
    :rtype: str or tuple(str, int)

    >>> uealite('readings')
    'read'
    >>> uealite('insulted')
    'insult'
    >>> uealite('cussed')
    'cuss'
    """
    # Words that must never be stemmed
    problem_words = {'is', 'as', 'this', 'has', 'was', 'during'}

    # rule table format:
    # top-level dictionary: length-of-suffix: dict-of-rules
    # dict-of-rules: suffix: (rule_no, suffix_length_to_delete,
    #                         suffix_to_append)
    rule_table = {7: {'titudes': (30, 1, None),
                      'fulness': (34, 4, None),
                      'ousness': (35, 4, None),
                      'eadings': (40.7, 4, None),
                      'oadings': (40.6, 4, None),
                      'ealings': (42.4, 4, None),
                      'ailings': (42.2, 4, None),
                      },
                  6: {'aceous': (1, 6, None),
                      'aining': (24, 3, None),
                      'acting': (25, 3, None),
                      'ttings': (26, 5, None),
                      'viding': (27, 3, 'e'),
                      'ssings': (37, 4, None),
                      'ulting': (38, 3, None),
                      'eading': (40.7, 3, None),
                      'oading': (40.6, 3, None),
                      'edings': (40.5, 4, None),
                      'ddings': (40.4, 5, None),
                      'ldings': (40.3, 4, None),
                      'rdings': (40.2, 4, None),
                      'ndings': (40.1, 4, None),
                      'llings': (41, 5, None),
                      'ealing': (42.4, 3, None),
                      'olings': (42.3, 4, None),
                      'ailing': (42.2, 3, None),
                      'elings': (42.1, 4, None),
                      'mmings': (44.3, 5, None),
                      'ngings': (45.2, 4, None),
                      'ggings': (45.1, 5, None),
                      'stings': (47, 4, None),
                      'etings': (48.4, 4, None),
                      'ntings': (48.2, 4, None),
                      'irings': (54.4, 4, 'e'),
                      'urings': (54.3, 4, 'e'),
                      'ncings': (54.2, 4, 'e'),
                      'things': (58.1, 1, None),
                      },
                  5: {'iases': (11.4, 2, None),
                      'ained': (13.6, 2, None),
                      'erned': (13.5, 2, None),
                      'ifted': (14, 2, None),
                      'ected': (15, 2, None),
                      'vided': (16, 1, None),
                      'erred': (19, 3, None),
                      'urred': (20.5, 3, None),
                      'lored': (20.4, 2, None),
                      'eared': (20.3, 2, None),
                      'tored': (20.2, 1, None),
                      'noted': (22.4, 1, None),
                      'leted': (22.3, 1, None),
                      'anges': (23, 1, None),
                      'tting': (26, 4, None),
                      'ulted': (32, 2, None),
                      'uming': (33, 3, 'e'),
                      'rabed': (36.1, 1, None),
                      'rebed': (36.1, 1, None),
                      'ribed': (36.1, 1, None),
                      'robed': (36.1, 1, None),
                      'rubed': (36.1, 1, None),
                      'ssing': (37, 3, None),
                      'vings': (39, 4, 'e'),
                      'eding': (40.5, 3, None),
                      'dding': (40.4, 4, None),
                      'lding': (40.3, 3, None),
                      'rding': (40.2, 3, None),
                      'nding': (40.1, 3, None),
                      'dings': (40, 4, 'e'),
                      'lling': (41, 4, None),
                      'oling': (42.3, 3, None),
                      'eling': (42.1, 3, None),
                      'lings': (42, 4, 'e'),
                      'mming': (44.3, 4, None),
                      'rming': (44.2, 3, None),
                      'lming': (44.1, 3, None),
                      'mings': (44, 4, 'e'),
                      'nging': (45.2, 3, None),
                      'gging': (45.1, 4, None),
                      'gings': (45, 4, 'e'),
                      'aning': (46.6, 3, None),
                      'ening': (46.5, 3, None),
                      'gning': (46.4, 3, None),
                      'nning': (46.3, 4, None),
                      'oning': (46.2, 3, None),
                      'rning': (46.1, 3, None),
                      'sting': (47, 3, None),
                      'eting': (48.4, 3, None),
                      'pting': (48.3, 3, None),
                      'nting': (48.2, 3, None),
                      'cting': (48.1, 3, None),
                      'tings': (48, 4, 'e'),
                      'iring': (54.4, 3, 'e'),
                      'uring': (54.3, 3, 'e'),
                      'ncing': (54.2, 3, 'e'),
                      'sings': (54, 4, 'e'),
                      # 'lling': (55, 3, None),  # masked by 41
                      'ating': (57, 3, 'e'),
                      'thing': (58.1, 0, None),
                      },
                  4: {'eeds': (7, 1, None),
                      'uses': (11.3, 1, None),
                      'sses': (11.2, 2, None),
                      'eses': (11.1, 2, 'is'),
                      'tled': (12.5, 1, None),
                      'pled': (12.4, 1, None),
                      'bled': (12.3, 1, None),
                      'eled': (12.2, 2, None),
                      'lled': (12.1, 2, None),
                      'ened': (13.7, 2, None),
                      'rned': (13.4, 2, None),
                      'nned': (13.3, 3, None),
                      'oned': (13.2, 2, None),
                      'gned': (13.1, 2, None),
                      'ered': (20.1, 2, None),
                      'reds': (20, 2, None),
                      'tted': (21, 3, None),
                      'uted': (22.2, 1, None),
                      'ated': (22.1, 1, None),
                      'ssed': (28, 2, None),
                      'umed': (31, 1, None),
                      'beds': (36, 3, None),
                      'ving': (39, 3, 'e'),
                      'ding': (40, 3, 'e'),
                      'ling': (42, 3, 'e'),
                      'nged': (43.2, 1, None),
                      'gged': (43.1, 3, None),
                      'ming': (44, 3, 'e'),
                      'ging': (45, 3, 'e'),
                      'ning': (46, 3, 'e'),
                      'ting': (48, 3, 'e'),
                      # 'ssed': (49, 2, None),  # masked by 28
                      # 'lled': (53, 2, None),  # masked by 12.1
                      'zing': (54.1, 3, 'e'),
                      'sing': (54, 3, 'e'),
                      'lves': (60.1, 3, 'f'),
                      'aped': (61.3, 1, None),
                      'uded': (61.2, 1, None),
                      'oded': (61.1, 1, None),
                      # 'ated': (61, 1, None),  # masked by 22.1
                      'ones': (63.6, 1, None),
                      'izes': (63.5, 1, None),
                      'ures': (63.4, 1, None),
                      'ines': (63.3, 1, None),
                      'ides': (63.2, 1, None),
                      },
                  3: {'ces': (2, 1, None),
                      'sis': (4, 0, None),
                      'tis': (5, 0, None),
                      'eed': (7, 0, None),
                      'ued': (8, 1, None),
                      'ues': (9, 1, None),
                      'ees': (10, 1, None),
                      'ses': (11, 1, None),
                      'led': (12, 2, None),
                      'ned': (13, 1, None),
                      'ved': (17, 1, None),
                      'ced': (18, 1, None),
                      'red': (20, 1, None),
                      'ted': (22, 2, None),
                      'sed': (29, 1, None),
                      'bed': (36, 2, None),
                      'ged': (43, 1, None),
                      'les': (50, 1, None),
                      'tes': (51, 1, None),
                      'zed': (52, 1, None),
                      'ied': (56, 3, 'y'),
                      'ies': (59, 3, 'y'),
                      'ves': (60, 1, None),
                      'pes': (63.8, 1, None),
                      'mes': (63.7, 1, None),
                      'ges': (63.1, 1, None),
                      'ous': (65, 0, None),
                      'ums': (66, 0, None),
                      },
                  2: {'cs': (3, 0, None),
                      'ss': (6, 0, None),
                      'es': (63, 2, None),
                      'is': (64, 2, 'e'),
                      'us': (67, 0, None),
                      }}

    if var == 'Perl':
        perl_deletions = {7: ['eadings', 'oadings', 'ealings', 'ailings'],
                          6: ['ttings', 'ssings', 'edings', 'ddings',
                              'ldings', 'rdings', 'ndings', 'llings',
                              'olings', 'elings', 'mmings', 'ngings',
                              'ggings', 'stings', 'etings', 'ntings',
                              'irings', 'urings', 'ncings', 'things'],
                          5: ['vings', 'dings', 'lings', 'mings', 'gings',
                              'tings', 'sings'],
                          4: ['eeds', 'reds', 'beds']}

        # Delete the above rules from rule_table
        for del_len in perl_deletions:
            for term in perl_deletions[del_len]:
                del rule_table[del_len][term]

    elif var == 'Adams':
        adams_additions = {6: {'chited': (22.8, 1, None)},
                           5: {'dying': (58.2, 4, 'ie'),
                               'tying': (58.2, 4, 'ie'),
                               'vited': (22.6, 1, None),
                               'mited': (22.5, 1, None),
                               'vided': (22.9, 1, None),
                               'mided': (22.10, 1, None),
                               'lying': (58.2, 4, 'ie'),
                               'arred': (19.1, 3, None),
                               },
                           4: {'ited': (22.7, 2, None),
                               'oked': (31.1, 1, None),
                               'aked': (31.1, 1, None),
                               'iked': (31.1, 1, None),
                               'uked': (31.1, 1, None),
                               'amed': (31, 1, None),
                               'imed': (31, 1, None),
                               'does': (31.2, 2, None),
                               },
                           3: {'oed': (31.3, 1, None),
                               'oes': (31.2, 1, None),
                               'kes': (63.1, 1, None),
                               'des': (63.10, 1, None),
                               'res': (63.9, 1, None),
                               }}

        # Add the above additional rules to rule_table
        for del_len in adams_additions:
            rule_table[del_len] = dict(rule_table[del_len],
                                       **adams_additions[del_len])
        # Add additional problem word
        problem_words.add('menses')

    def _stem_with_duplicate_character_check(word, del_len):
        # For rules 58 & 62: strip the suffix (plus any trailing 's') and
        # undouble a remaining doubled final letter.
        if word[-1] == 's':
            del_len += 1
        stemmed_word = word[:-del_len]
        if re.match(r'.*(\w)\1$', stemmed_word):
            stemmed_word = stemmed_word[:-1]
        return stemmed_word

    def _stem(word):
        # Return (stem, rule_no) for a single word.
        stemmed_word = word
        rule_no = 0

        # Guard clauses: words left unstemmed, with diagnostic rule numbers
        if not word:
            return word, 0
        if word in problem_words:
            return word, 90
        if max_word_length and len(word) > max_word_length:
            return word, 95

        if "'" in word:
            # Strip possessives, then expand common contractions.
            if word[-2:] in {"'s", "'S"}:
                stemmed_word = word[:-2]
            if word[-1:] == "'":
                stemmed_word = word[:-1]
            stemmed_word = stemmed_word.replace("n't", 'not')
            stemmed_word = stemmed_word.replace("'ve", 'have')
            stemmed_word = stemmed_word.replace("'re", 'are')
            stemmed_word = stemmed_word.replace("'m", 'am')
            return stemmed_word, 94

        if word.isdigit():
            return word, 90.3

        hyphen = word.find('-')
        if 0 < hyphen < len(word):
            # Hyphenated: distinguish purely-alphabetic compounds
            if word[:hyphen].isalpha() and word[hyphen+1:].isalpha():
                return word, 90.2
            return word, 90.1
        if '_' in word:
            return word, 90
        if word[-1] == 's' and word[:-1].isupper():
            # Plural acronym: strip the 's' only
            if var == 'Adams' and len(word)-1 > max_acro_length:
                return word, 96
            return word[:-1], 91.1
        if word.isupper():
            # Acronym: leave unstemmed
            if var == 'Adams' and len(word) > max_acro_length:
                return word, 96
            return word, 91
        if re.match(r'^.*[A-Z].*[A-Z].*$', word):
            # Multiple embedded capitals: leave unstemmed
            return word, 92
        if word[0].isupper():
            # Initial capital (proper noun): leave unstemmed
            return word, 93
        if var == 'Adams' and re.match(r'^[a-z]{1}(|[rl])(ing|ed)$', word):
            return word, 97

        # Try the rule table, longest suffix first.
        for suffix_len in range(7, 1, -1):
            if word[-suffix_len:] in rule_table[suffix_len]:
                rule_no, del_len, add_str = rule_table[suffix_len][
                    word[-suffix_len:]]
                if del_len:
                    stemmed_word = word[:-del_len]
                else:
                    stemmed_word = word
                if add_str:
                    stemmed_word += add_str
                break

        if not rule_no:
            # Fallback rules 58, 62, & 68
            if re.match(r'.*\w\wings?$', word):  # rule 58
                stemmed_word = _stem_with_duplicate_character_check(word, 3)
                rule_no = 58
            elif re.match(r'.*\w\weds?$', word):  # rule 62
                stemmed_word = _stem_with_duplicate_character_check(word, 2)
                rule_no = 62
            elif word[-1] == 's':  # rule 68
                stemmed_word = word[:-1]
                rule_no = 68

        return stemmed_word, rule_no

    stem, rule_no = _stem(word)
    if return_rule_no:
        return stem, rule_no
    return stem
2062
2063
2064
def paice_husk(word):
    """Return Paice-Husk stem.

    Implementation of the Paice-Husk Stemmer, also known as the Lancaster
    Stemmer, developed by Chris Paice, with the assistance of Gareth Husk

    This is based on the algorithm's description in:
    Paice, Chris D. 1990. "Another stemmer." ACM SIGIR Forum 24:3, Fall 1990.
    56-61. doi:10.1145/101306.101310.

    :param word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> paice_husk('maximum')
    'maxim'
    >>> paice_husk('crying')
    'cry'
    >>> paice_husk('splitting')
    'split'
    """
    # Rule format: suffix: (intact_only, del_len, add_str, set_terminate).
    # An entry may instead be a tuple of such rules, tried in order.
    rule_table = {6: {'ifiabl': (False, 6, None, True),
                      'plicat': (False, 4, 'y', True)},
                  5: {'guish': (False, 5, 'ct', True),
                      'sumpt': (False, 2, None, True),
                      'istry': (False, 5, None, True)},
                  4: {'ytic': (False, 3, 's', True),
                      'ceed': (False, 2, 'ss', True),
                      'hood': (False, 4, None, False),
                      'lief': (False, 1, 'v', True),
                      'verj': (False, 1, 't', True),
                      'misj': (False, 2, 't', True),
                      'iabl': (False, 4, 'y', True),
                      'iful': (False, 4, 'y', True),
                      'sion': (False, 4, 'j', False),
                      'xion': (False, 4, 'ct', True),
                      'ship': (False, 4, None, False),
                      'ness': (False, 4, None, False),
                      'ment': (False, 4, None, False),
                      'ript': (False, 2, 'b', True),
                      'orpt': (False, 2, 'b', True),
                      'duct': (False, 1, None, True),
                      'cept': (False, 2, 'iv', True),
                      'olut': (False, 2, 'v', True),
                      'sist': (False, 0, None, True)},
                  3: {'ied': (False, 3, 'y', False),
                      'eed': (False, 1, None, True),
                      'ing': (False, 3, None, False),
                      'iag': (False, 3, 'y', True),
                      'ish': (False, 3, None, False),
                      'fuj': (False, 1, 's', True),
                      'hej': (False, 1, 'r', True),
                      'abl': (False, 3, None, False),
                      'ibl': (False, 3, None, True),
                      'bil': (False, 2, 'l', False),
                      'ful': (False, 3, None, False),
                      'ial': (False, 3, None, False),
                      'ual': (False, 3, None, False),
                      'ium': (False, 3, None, True),
                      'ism': (False, 3, None, False),
                      'ion': (False, 3, None, False),
                      'ian': (False, 3, None, False),
                      'een': (False, 0, None, True),
                      'ear': (False, 0, None, True),
                      'ier': (False, 3, 'y', False),
                      'ies': (False, 3, 'y', False),
                      'sis': (False, 2, None, True),
                      'ous': (False, 3, None, False),
                      'ent': (False, 3, None, False),
                      'ant': (False, 3, None, False),
                      'ist': (False, 3, None, False),
                      'iqu': (False, 3, None, True),
                      'ogu': (False, 1, None, True),
                      'siv': (False, 3, 'j', False),
                      'eiv': (False, 0, None, True),
                      'bly': (False, 1, None, False),
                      'ily': (False, 3, 'y', False),
                      'ply': (False, 0, None, True),
                      'ogy': (False, 1, None, True),
                      'phy': (False, 1, None, True),
                      'omy': (False, 1, None, True),
                      'opy': (False, 1, None, True),
                      'ity': (False, 3, None, False),
                      'ety': (False, 3, None, False),
                      'lty': (False, 2, None, True),
                      'ary': (False, 3, None, False),
                      'ory': (False, 3, None, False),
                      'ify': (False, 3, None, True),
                      'ncy': (False, 2, 't', False),
                      'acy': (False, 3, None, False)},
                  2: {'ia': (True, 2, None, True),
                      'bb': (False, 1, None, True),
                      'ic': (False, 2, None, False),
                      'nc': (False, 1, 't', False),
                      'dd': (False, 1, None, True),
                      'ed': (False, 2, None, False),
                      'if': (False, 2, None, False),
                      'ag': (False, 2, None, False),
                      'gg': (False, 1, None, True),
                      'th': (True, 2, None, True),
                      'ij': (False, 1, 'd', True),
                      'uj': (False, 1, 'd', True),
                      'oj': (False, 1, 'd', True),
                      'nj': (False, 1, 'd', True),
                      'cl': (False, 1, None, True),
                      'ul': (False, 2, None, True),
                      'al': (False, 2, None, False),
                      'll': (False, 1, None, True),
                      'um': (True, 2, None, True),
                      'mm': (False, 1, None, True),
                      'an': (False, 2, None, False),
                      'en': (False, 2, None, False),
                      'nn': (False, 1, None, True),
                      'pp': (False, 1, None, True),
                      'er': (False, 2, None, False),
                      'ar': (False, 2, None, True),
                      'or': (False, 2, None, False),
                      'ur': (False, 2, None, False),
                      'rr': (False, 1, None, True),
                      'tr': (False, 1, None, False),
                      'is': (False, 2, None, False),
                      'ss': (False, 0, None, True),
                      'us': (True, 2, None, True),
                      'at': (False, 2, None, False),
                      'tt': (False, 1, None, True),
                      'iv': (False, 2, None, False),
                      'ly': (False, 2, None, False),
                      'iz': (False, 2, None, False),
                      'yz': (False, 1, 's', True)},
                  1: {'a': (True, 1, None, True),
                      'e': (False, 1, None, False),
                      'i': ((True, 1, None, True), (False, 1, 'y', False)),
                      'j': (False, 1, 's', True),
                      's': ((True, 1, None, False), (False, 0, None, True))}}

    def _has_vowel(word):
        # 'y' counts as a vowel for the acceptability check.
        for char in word:
            if char in {'a', 'e', 'i', 'o', 'u', 'y'}:
                return True
        return False

    def _acceptable(word):
        # A stem starting with a vowel needs >= 2 letters; one starting
        # with a consonant needs >= 3 letters and a vowel past the first.
        if word and word[0] in {'a', 'e', 'i', 'o', 'u'}:
            return len(word) > 1
        return len(word) > 2 and _has_vowel(word[1:])

    def _apply_rule(word, rule, intact, terminate):
        # Try to apply one rule; return (word, accepted, intact, terminate).
        old_word = word
        intact_only, del_len, add_str, set_terminate = rule

        if intact_only and not intact:
            # Rule fires only on words that have never been stemmed.
            return word, False, intact, terminate
        if del_len:
            word = word[:-del_len]
        if add_str:
            word += add_str

        if _acceptable(word):
            return word, True, False, set_terminate
        # Reject the change: restore the word, leave the flags untouched.
        return old_word, False, intact, terminate

    terminate = False
    intact = True
    while not terminate:
        for suffix_len in range(6, 0, -1):
            if word[-suffix_len:] in rule_table[suffix_len]:
                entry = rule_table[suffix_len][word[-suffix_len:]]
                # Normalize a bare rule (first element is a bool) to a
                # 1-tuple so both entry shapes take the same path.
                rules = entry if isinstance(entry[0], tuple) else (entry,)
                accept = False
                for rule in rules:
                    (word, accept, intact,
                     terminate) = _apply_rule(word, rule, intact, terminate)
                    if accept:
                        break
                if accept:
                    break
        else:
            # No suffix of any length matched & was accepted: stemming done.
            break

    return word
2244
2245
2246
def schinke(word):
    """Return the stem of a word according to the Schinke stemmer.

    The Schinke stemmer is a Latin stemmer that produces two stems for each
    word: a noun stem and a verb stem. Both are returned in a dict under the
    keys 'n' and 'v' respectively.

    Source:
    Schinke, Robyn, Mark Greengrass, Alexander M. Robertson, and Peter Willett.
    1996. "A Stemming Algorithm for Latin Text Databases." Journal of
    Documentation, 52(2). 172--187.
    doi:10.1108/eb026966

    :param str word: the word to stem
    :returns: a dict with the noun stem under key 'n' and the verb stem under
        key 'v'
    :rtype: dict
    """
    # Rule 1: lowercase, decompose, and keep only the letters a-z
    word = unicodedata.normalize('NFKD', text_type(word.lower()))
    word = ''.join(c for c in word if c in
                   {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',
                    'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
                    'y', 'z'})

    # Rule 2: normalize orthographic variants (j -> i, v -> u)
    word = word.replace('j', 'i').replace('v', 'u')

    # Rule 3: strip an enclitic '-que' unless the remainder is on the
    # keep list (words where '-que' is part of the word itself)
    keep_que = {'at', 'quo', 'ne', 'ita', 'abs', 'aps', 'abus', 'adae', 'adus',
                'deni', 'de', 'sus', 'obli', 'perae', 'plenis', 'quando',
                'quis', 'quae', 'cuius', 'cui', 'quem', 'quam', 'qua', 'qui',
                'quorum', 'quarum', 'quibus', 'quos', 'quas', 'quotusquis',
                'quous', 'ubi', 'undi', 'us', 'uter', 'uti', 'utro', 'utribi',
                'tor', 'co', 'conco', 'contor', 'detor', 'deco', 'exco',
                'extor', 'obtor', 'optor', 'retor', 'reco', 'attor', 'inco',
                'intor', 'praetor'}
    if word[-3:] == 'que':
        # This diverges from the paper by also returning 'que' itself unstemmed
        if word[:-3] in keep_que or word == 'que':
            return {'n': word, 'v': word}
        word = word[:-3]

    # Base case: if no suffix rule fires, the word itself is returned
    noun = word
    verb = word

    # Rule 4 (noun): strip the longest matching noun suffix, but only if
    # at least two characters remain after stripping
    n_endings = {4: {'ibus'},
                 3: {'ius'},
                 2: {'is', 'nt', 'ae', 'os', 'am', 'ud', 'as', 'um', 'em',
                     'us', 'es', 'ia'},
                 1: {'a', 'e', 'i', 'o', 'u'}}
    for endlen in range(4, 0, -1):
        if word[-endlen:] in n_endings[endlen]:
            if len(word)-2 >= endlen:
                noun = word[:-endlen]
            break

    # Rule 5 (verb): suffixes are either stripped outright or replaced by
    # a normalized form ('i', 'bi', or 'eri'), longest match first
    v_endings_strip = {6: {},
                       5: {},
                       4: {'mini', 'ntur', 'stis'},
                       3: {'mur', 'mus', 'ris', 'sti', 'tis', 'tur'},
                       2: {'ns', 'nt', 'ri'},
                       1: {'m', 'r', 's', 't'}}
    v_endings_alter = {6: {'iuntur'},
                       5: {'beris', 'erunt', 'untur'},
                       4: {'iunt'},
                       3: {'bor', 'ero', 'unt'},
                       2: {'bo'},
                       1: {}}
    for endlen in range(6, 0, -1):
        if word[-endlen:] in v_endings_strip[endlen]:
            if len(word)-2 >= endlen:
                verb = word[:-endlen]
            break
        if word[-endlen:] in v_endings_alter[endlen]:
            if word[-endlen:] in {'iuntur', 'erunt', 'untur', 'iunt', 'unt'}:
                new_word = word[:-endlen]+'i'
                addlen = 1
            elif word[-endlen:] in {'beris', 'bor', 'bo'}:
                new_word = word[:-endlen]+'bi'
                addlen = 2
            else:
                new_word = word[:-endlen]+'eri'
                addlen = 3

            # Technically this diverges from the paper by considering the
            # length of the stem without the new suffix
            if len(new_word) >= 2+addlen:
                verb = new_word
            break

    return {'n': noun, 'v': verb}
2341
2342
2343
# When run as a script, execute this module's doctests.
if __name__ == '__main__':
    import doctest
    doctest.testmod()
2346