# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.stemmer.

The stemmer module defines word stemmers including:

    - the Lovins stemmer
    - the Porter and Porter2 (Snowball English) stemmers
    - Snowball stemmers for German, Dutch, Norwegian, Swedish, and Danish
    - CLEF German, German plus, and Swedish stemmers
    - Caumann's German stemmer
    - UEA-Lite Stemmer
    - Paice-Husk Stemmer
    - Schinke Latin stemmer
    - S stemmer
"""

from __future__ import unicode_literals

import re
import unicodedata

from six import text_type
from six.moves import range
def lovins(word):
    """Return Lovins stem.

    Lovins stemmer

    The Lovins stemmer is described in Julie Beth Lovins's article at:
    http://www.mt-archive.info/MT-1968-Lovins.pdf

    :param word: the word to stem
    :returns: word stem
    :rtype: string

    >>> lovins('reading')
    'read'
    >>> lovins('suspension')
    'suspens'
    >>> lovins('elusiveness')
    'elus'
    """
    # pylint: disable=too-many-branches, too-many-locals

    # lowercase, normalize, and compose
    word = unicodedata.normalize('NFC', text_type(word.lower()))

    # Each condition receives the word and the length of the candidate
    # suffix; it examines the characters just before the suffix, i.e. the
    # end of the stem that would remain after removal.

    def cond_b(word, suffix_len):
        """Return Lovins' condition B."""
        return len(word)-suffix_len >= 3

    def cond_c(word, suffix_len):
        """Return Lovins' condition C."""
        return len(word)-suffix_len >= 4

    def cond_d(word, suffix_len):
        """Return Lovins' condition D."""
        return len(word)-suffix_len >= 5

    def cond_e(word, suffix_len):
        """Return Lovins' condition E."""
        return word[-suffix_len-1] != 'e'

    def cond_f(word, suffix_len):
        """Return Lovins' condition F."""
        return (len(word)-suffix_len >= 3 and
                word[-suffix_len-1] != 'e')

    def cond_g(word, suffix_len):
        """Return Lovins' condition G."""
        return (len(word)-suffix_len >= 3 and
                word[-suffix_len-1] == 'f')

    def cond_h(word, suffix_len):
        """Return Lovins' condition H."""
        return (word[-suffix_len-1] == 't' or
                word[-suffix_len-2:-suffix_len] == 'll')

    def cond_i(word, suffix_len):
        """Return Lovins' condition I."""
        return word[-suffix_len-1] not in {'e', 'o'}

    def cond_j(word, suffix_len):
        """Return Lovins' condition J."""
        return word[-suffix_len-1] not in {'a', 'e'}

    def cond_k(word, suffix_len):
        """Return Lovins' condition K."""
        return (len(word)-suffix_len >= 3 and
                (word[-suffix_len-1] in {'i', 'l'} or
                 (word[-suffix_len-3] == 'u' and word[-suffix_len-1] == 'e')))

    def cond_l(word, suffix_len):
        """Return Lovins' condition L.

        Per the paper: do not remove the ending after u, x or s,
        unless the s follows o.
        """
        # BUG FIX: the previous code compared a single character against the
        # two-character string 'os', which can never be true; the intended
        # check is whether the stem's final 's' is preceded by 'o'.
        return (word[-suffix_len-1] not in {'s', 'u', 'x'} or
                word[-suffix_len-2:-suffix_len] == 'os')

    def cond_m(word, suffix_len):
        """Return Lovins' condition M."""
        return word[-suffix_len-1] not in {'a', 'c', 'e', 'm'}

    def cond_n(word, suffix_len):
        """Return Lovins' condition N."""
        if len(word)-suffix_len >= 3:
            if word[-suffix_len-3] == 's':
                # after an 's', a minimum stem length of 4 is required
                if len(word)-suffix_len >= 4:
                    return True
            else:
                return True
        return False

    def cond_o(word, suffix_len):
        """Return Lovins' condition O."""
        return word[-suffix_len-1] in {'i', 'l'}

    def cond_p(word, suffix_len):
        """Return Lovins' condition P."""
        return word[-suffix_len-1] != 'c'

    def cond_q(word, suffix_len):
        """Return Lovins' condition Q."""
        return (len(word)-suffix_len >= 3 and
                word[-suffix_len-1] not in {'l', 'n'})

    def cond_r(word, suffix_len):
        """Return Lovins' condition R."""
        return word[-suffix_len-1] in {'n', 'r'}

    def cond_s(word, suffix_len):
        """Return Lovins' condition S."""
        return (word[-suffix_len-2:-suffix_len] == 'dr' or
                (word[-suffix_len-1] == 't' and
                 word[-suffix_len-2:-suffix_len] != 'tt'))

    def cond_t(word, suffix_len):
        """Return Lovins' condition T."""
        return (word[-suffix_len-1] in {'s', 't'} and
                word[-suffix_len-2:-suffix_len] != 'ot')

    def cond_u(word, suffix_len):
        """Return Lovins' condition U."""
        return word[-suffix_len-1] in {'l', 'm', 'n', 'r'}

    def cond_v(word, suffix_len):
        """Return Lovins' condition V."""
        return word[-suffix_len-1] == 'c'

    def cond_w(word, suffix_len):
        """Return Lovins' condition W."""
        return word[-suffix_len-1] not in {'s', 'u'}

    def cond_x(word, suffix_len):
        """Return Lovins' condition X.

        Per the paper: the stem ends in l, i, or u-?-e
        (an 'e' with a 'u' two characters before it).
        """
        # BUG FIX: the previous code compared a three-character slice against
        # the single character 'u', which can never match; the u-?-e test
        # must mirror the one in condition K. A one-character slice is used
        # (rather than an index) so short stems cannot raise IndexError.
        return (word[-suffix_len-1] in {'i', 'l'} or
                (word[-suffix_len-3:-suffix_len-2] == 'u' and
                 word[-suffix_len-1] == 'e'))

    def cond_y(word, suffix_len):
        """Return Lovins' condition Y."""
        return word[-suffix_len-2:-suffix_len] == 'in'

    def cond_z(word, suffix_len):
        """Return Lovins' condition Z."""
        return word[-suffix_len-1] != 'f'

    def cond_aa(word, suffix_len):
        """Return Lovins' condition AA."""
        return (word[-suffix_len-1] in {'d', 'f', 'l', 't'} or
                word[-suffix_len-2:-suffix_len] in {'ph', 'th', 'er', 'or',
                                                    'es'})

    def cond_bb(word, suffix_len):
        """Return Lovins' condition BB."""
        return (len(word)-suffix_len >= 3 and
                word[-suffix_len-3:-suffix_len] != 'met' and
                word[-suffix_len-4:-suffix_len] != 'ryst')

    def cond_cc(word, suffix_len):
        """Return Lovins' condition CC."""
        return word[-suffix_len-1] == 'l'

    # Lovins' Appendix A: each ending maps to the condition function gating
    # its removal (None means the ending is removed unconditionally, subject
    # only to the minimum stem length of 2 enforced below).
    suffix = {'alistically': cond_b, 'arizability': None,
              'izationally': cond_b, 'antialness': None,
              'arisations': None, 'arizations': None, 'entialness': None,
              'allically': cond_c, 'antaneous': None, 'antiality': None,
              'arisation': None, 'arization': None, 'ationally': cond_b,
              'ativeness': None, 'eableness': cond_e, 'entations': None,
              'entiality': None, 'entialize': None, 'entiation': None,
              'ionalness': None, 'istically': None, 'itousness': None,
              'izability': None, 'izational': None, 'ableness': None,
              'arizable': None, 'entation': None, 'entially': None,
              'eousness': None, 'ibleness': None, 'icalness': None,
              'ionalism': None, 'ionality': None, 'ionalize': None,
              'iousness': None, 'izations': None, 'lessness': None,
              'ability': None, 'aically': None, 'alistic': cond_b,
              'alities': None, 'ariness': cond_e, 'aristic': None,
              'arizing': None, 'ateness': None, 'atingly': None,
              'ational': cond_b, 'atively': None, 'ativism': None,
              'elihood': cond_e, 'encible': None, 'entally': None,
              'entials': None, 'entiate': None, 'entness': None,
              'fulness': None, 'ibility': None, 'icalism': None,
              'icalist': None, 'icality': None, 'icalize': None,
              'ication': cond_g, 'icianry': None, 'ination': None,
              'ingness': None, 'ionally': None, 'isation': None,
              'ishness': None, 'istical': None, 'iteness': None,
              'iveness': None, 'ivistic': None, 'ivities': None,
              'ization': cond_f, 'izement': None, 'oidally': None,
              'ousness': None, 'aceous': None, 'acious': cond_b,
              'action': cond_g, 'alness': None, 'ancial': None,
              'ancies': None, 'ancing': cond_b, 'ariser': None,
              'arized': None, 'arizer': None, 'atable': None,
              'ations': cond_b, 'atives': None, 'eature': cond_z,
              'efully': None, 'encies': None, 'encing': None,
              'ential': None, 'enting': cond_c, 'entist': None,
              'eously': None, 'ialist': None, 'iality': None,
              'ialize': None, 'ically': None, 'icance': None,
              'icians': None, 'icists': None, 'ifully': None,
              'ionals': None, 'ionate': cond_d, 'ioning': None,
              'ionist': None, 'iously': None, 'istics': None,
              'izable': cond_e, 'lessly': None, 'nesses': None,
              'oidism': None, 'acies': None, 'acity': None,
              'aging': cond_b, 'aical': None, 'alist': None,
              'alism': cond_b, 'ality': None, 'alize': None,
              'allic': cond_bb, 'anced': cond_b, 'ances': cond_b,
              'antic': cond_c, 'arial': None, 'aries': None,
              'arily': None, 'arity': cond_b, 'arize': None,
              'aroid': None, 'ately': None, 'ating': cond_i,
              'ation': cond_b, 'ative': None, 'ators': None,
              'atory': None, 'ature': cond_e, 'early': cond_y,
              'ehood': None, 'eless': None, 'elity': None,
              'ement': None, 'enced': None, 'ences': None,
              'eness': cond_e, 'ening': cond_e, 'ental': None,
              'ented': cond_c, 'ently': None, 'fully': None,
              'ially': None, 'icant': None, 'ician': None,
              'icide': None, 'icism': None, 'icist': None,
              'icity': None, 'idine': cond_i, 'iedly': None,
              'ihood': None, 'inate': None, 'iness': None,
              'ingly': cond_b, 'inism': cond_j, 'inity': cond_cc,
              'ional': None, 'ioned': None, 'ished': None,
              'istic': None, 'ities': None, 'itous': None,
              'ively': None, 'ivity': None, 'izers': cond_f,
              'izing': cond_f, 'oidal': None, 'oides': None,
              'otide': None, 'ously': None, 'able': None, 'ably': None,
              'ages': cond_b, 'ally': cond_b, 'ance': cond_b, 'ancy': cond_b,
              'ants': cond_b, 'aric': None, 'arly': cond_k, 'ated': cond_i,
              'ates': None, 'atic': cond_b, 'ator': None, 'ealy': cond_y,
              'edly': cond_e, 'eful': None, 'eity': None, 'ence': None,
              'ency': None, 'ened': cond_e, 'enly': cond_e, 'eous': None,
              'hood': None, 'ials': None, 'ians': None, 'ible': None,
              'ibly': None, 'ical': None, 'ides': cond_l, 'iers': None,
              'iful': None, 'ines': cond_m, 'ings': cond_n, 'ions': cond_b,
              'ious': None, 'isms': cond_b, 'ists': None, 'itic': cond_h,
              'ized': cond_f, 'izer': cond_f, 'less': None, 'lily': None,
              'ness': None, 'ogen': None, 'ward': None, 'wise': None,
              'ying': cond_b, 'yish': None, 'acy': None, 'age': cond_b,
              'aic': None, 'als': cond_bb, 'ant': cond_b, 'ars': cond_o,
              'ary': cond_f, 'ata': None, 'ate': None, 'eal': cond_y,
              'ear': cond_y, 'ely': cond_e, 'ene': cond_e, 'ent': cond_c,
              'ery': cond_e, 'ese': None, 'ful': None, 'ial': None,
              'ian': None, 'ics': None, 'ide': cond_l, 'ied': None,
              'ier': None, 'ies': cond_p, 'ily': None, 'ine': cond_m,
              'ing': cond_n, 'ion': cond_q, 'ish': cond_c, 'ism': cond_b,
              'ist': None, 'ite': cond_aa, 'ity': None, 'ium': None,
              'ive': None, 'ize': cond_f, 'oid': None, 'one': cond_r,
              'ous': None, 'ae': None, 'al': cond_bb, 'ar': cond_x,
              'as': cond_b, 'ed': cond_e, 'en': cond_f, 'es': cond_e,
              'ia': None, 'ic': None, 'is': None, 'ly': cond_b,
              'on': cond_s, 'or': cond_t, 'um': cond_u, 'us': cond_v,
              'yl': cond_r, '\'s': None, 's\'': None, 'a': None,
              'e': None, 'i': None, 'o': None, 's': cond_w, 'y': cond_b}

    # Remove the single longest matching ending whose condition holds,
    # leaving a stem of at least 2 characters.
    for suffix_len in range(11, 0, -1):
        ending = word[-suffix_len:]
        if (ending in suffix and
                len(word)-suffix_len >= 2 and
                (suffix[ending] is None or
                 suffix[ending](word, suffix_len))):
            word = word[:-suffix_len]
            break

    def recode9(stem):
        """Return Lovins' conditional recode rule 9."""
        if stem[-3:-2] in {'a', 'i', 'o'}:
            return stem
        return stem[:-2]+'l'

    def recode24(stem):
        """Return Lovins' conditional recode rule 24."""
        if stem[-4:-3] == 's':
            return stem
        return stem[:-1]+'s'

    def recode28(stem):
        """Return Lovins' conditional recode rule 28."""
        if stem[-4:-3] in {'p', 't'}:
            return stem
        return stem[:-1]+'s'

    def recode30(stem):
        """Return Lovins' conditional recode rule 30."""
        if stem[-4:-3] == 'm':
            return stem
        return stem[:-1]+'s'

    def recode32(stem):
        """Return Lovins' conditional recode rule 32."""
        if stem[-3:-2] == 'n':
            return stem
        return stem[:-1]+'s'

    # Recode rule 1: undouble a terminal double consonant
    if word[-2:] in {'bb', 'dd', 'gg', 'll', 'mm', 'nn', 'pp', 'rr', 'ss',
                     'tt'}:
        word = word[:-1]

    # Lovins' Appendix B transformations; callables are the conditional rules
    recode = (('iev', 'ief'),
              ('uct', 'uc'),
              ('umpt', 'um'),
              ('rpt', 'rb'),
              ('urs', 'ur'),
              ('istr', 'ister'),
              ('metr', 'meter'),
              ('olv', 'olut'),
              ('ul', recode9),
              ('bex', 'bic'),
              ('dex', 'dic'),
              ('pex', 'pic'),
              ('tex', 'tic'),
              ('ax', 'ac'),
              ('ex', 'ec'),
              ('ix', 'ic'),
              ('lux', 'luc'),
              ('uad', 'uas'),
              ('vad', 'vas'),
              ('cid', 'cis'),
              ('lid', 'lis'),
              ('erid', 'eris'),
              ('pand', 'pans'),
              ('end', recode24),
              ('ond', 'ons'),
              ('lud', 'lus'),
              ('rud', 'rus'),
              ('her', recode28),
              ('mit', 'mis'),
              ('ent', recode30),
              ('ert', 'ers'),
              ('et', recode32),
              ('yt', 'ys'),
              ('yz', 'ys'))

    for ending, replacement in recode:
        if word.endswith(ending):
            if callable(replacement):
                word = replacement(word)
            else:
                word = word[:-len(ending)] + replacement

    return word
def _m_degree(term, vowels):
380
    """Return Porter helper function _m_degree value.
381
382
    m-degree is equal to the number of V to C transitions
383
384
    :param term: the word for which to calculate the m-degree
385
    :param vowels: the set of vowels in the language
386
    :returns: the m-degree as defined in the Porter stemmer definition
387
    """
388
    mdeg = 0
389
    last_was_vowel = False
390
    for letter in term:
391
        if letter in vowels:
392
            last_was_vowel = True
393
        else:
394
            if last_was_vowel:
395
                mdeg += 1
396
            last_was_vowel = False
397
    return mdeg
398
399
400
def _sb_has_vowel(term, vowels):
401
    """Return Porter helper function _sb_has_vowel value.
402
403
    :param term: the word to scan for vowels
404
    :param vowels: the set of vowels in the language
405
    :returns: true iff a vowel exists in the term (as defined in the Porter
406
        stemmer definition)
407
    """
408
    for letter in term:
409
        if letter in vowels:
410
            return True
411
    return False
412
413
414
def _ends_in_doubled_cons(term, vowels):
415
    """Return Porter helper function _ends_in_doubled_cons value.
416
417
    :param term: the word to check for a final doubled consonant
418
    :param vowels: the set of vowels in the language
419
    :returns: true iff the stem ends in a doubled consonant (as defined in the
420
        Porter stemmer definition)
421
    """
422
    if len(term) > 1 and term[-1] not in vowels and term[-2] == term[-1]:
423
        return True
424
    return False
425
426
427
def _ends_in_cvc(term, vowels):
428
    """Return Porter helper function _ends_in_cvc value.
429
430
    :param term: the word to scan for cvc
431
    :param vowels: the set of vowels in the language
432
    :returns: true iff the stem ends in cvc (as defined in the Porter stemmer
433
        definition)
434
    """
435
    if len(term) > 2 and (term[-1] not in vowels and
436
                          term[-2] in vowels and
437
                          term[-3] not in vowels and
438
                          term[-1] not in tuple('wxY')):
439
        return True
440
    return False
441
442
443
def porter(word, early_english=False):
    """Return Porter stem.

    The Porter stemmer is defined at:
    http://snowball.tartarus.org/algorithms/porter/stemmer.html

    :param word: the word to calculate the stem of
    :param early_english: set to True in order to remove -eth & -est (2nd & 3rd
        person singular verbal agreement suffixes)
    :returns: word stem
    :rtype: str

    >>> porter('reading')
    'read'
    >>> porter('suspension')
    'suspens'
    >>> porter('elusiveness')
    'elus'

    >>> porter('eateth', early_english=True)
    'eat'
    """
    # pylint: disable=too-many-branches

    # lowercase, normalize, and compose
    word = unicodedata.normalize('NFC', text_type(word.lower()))

    # Return word unstemmed if it has fewer than 3 letters
    if len(word) < 3:
        return word

    _vowels = {'a', 'e', 'i', 'o', 'u', 'y'}
    # Re-map consonantal y to Y (Y will be C, y will be V)
    if word[0] == 'y':
        word = 'Y' + word[1:]
    for i in range(1, len(word)):
        if word[i] == 'y' and word[i-1] in _vowels:
            word = word[:i] + 'Y' + word[i+1:]

    # Step 1a: plurals
    if word[-1] == 's':
        if word[-4:] == 'sses':
            word = word[:-2]
        elif word[-3:] == 'ies':
            word = word[:-2]
        elif word[-2:] == 'ss':
            pass
        else:
            word = word[:-1]

    # Step 1b: -eed, -ed, -ing (plus early English -est/-eth)
    step1b_flag = False
    if word[-3:] == 'eed':
        if _m_degree(word[:-3], _vowels) > 0:
            word = word[:-1]
    elif word[-2:] == 'ed':
        if _sb_has_vowel(word[:-2], _vowels):
            word = word[:-2]
            step1b_flag = True
    elif word[-3:] == 'ing':
        if _sb_has_vowel(word[:-3], _vowels):
            word = word[:-3]
            step1b_flag = True
    elif early_english:
        if word[-3:] == 'est':
            if _sb_has_vowel(word[:-3], _vowels):
                word = word[:-3]
                step1b_flag = True
        elif word[-3:] == 'eth':
            if _sb_has_vowel(word[:-3], _vowels):
                word = word[:-3]
                step1b_flag = True

    # Step 1b cleanup: restore an 'e' or undouble, when a suffix was removed
    if step1b_flag:
        if word[-2:] in {'at', 'bl', 'iz'}:
            word += 'e'
        elif (_ends_in_doubled_cons(word, _vowels) and
              word[-1] not in {'l', 's', 'z'}):
            word = word[:-1]
        elif _m_degree(word, _vowels) == 1 and _ends_in_cvc(word, _vowels):
            word += 'e'

    # Step 1c: terminal y -> i
    if word[-1] in {'Y', 'y'} and _sb_has_vowel(word[:-1], _vowels):
        word = word[:-1] + 'i'

    # Step 2: double-suffix reductions, dispatched on the penultimate letter
    if len(word) > 1:
        if word[-2] == 'a':
            if word[-7:] == 'ational':
                if _m_degree(word[:-7], _vowels) > 0:
                    word = word[:-5] + 'e'
            elif word[-6:] == 'tional':
                if _m_degree(word[:-6], _vowels) > 0:
                    word = word[:-2]
        elif word[-2] == 'c':
            if word[-4:] in {'enci', 'anci'}:
                if _m_degree(word[:-4], _vowels) > 0:
                    word = word[:-1] + 'e'
        elif word[-2] == 'e':
            if word[-4:] == 'izer':
                if _m_degree(word[:-4], _vowels) > 0:
                    word = word[:-1]
        elif word[-2] == 'g':
            if word[-4:] == 'logi':
                if _m_degree(word[:-4], _vowels) > 0:
                    word = word[:-1]
        elif word[-2] == 'l':
            if word[-3:] == 'bli':
                if _m_degree(word[:-3], _vowels) > 0:
                    word = word[:-1] + 'e'
            elif word[-4:] == 'alli':
                if _m_degree(word[:-4], _vowels) > 0:
                    word = word[:-2]
            elif word[-5:] == 'entli':
                if _m_degree(word[:-5], _vowels) > 0:
                    word = word[:-2]
            elif word[-3:] == 'eli':
                if _m_degree(word[:-3], _vowels) > 0:
                    word = word[:-2]
            elif word[-5:] == 'ousli':
                if _m_degree(word[:-5], _vowels) > 0:
                    word = word[:-2]
        elif word[-2] == 'o':
            if word[-7:] == 'ization':
                if _m_degree(word[:-7], _vowels) > 0:
                    word = word[:-5] + 'e'
            elif word[-5:] == 'ation':
                if _m_degree(word[:-5], _vowels) > 0:
                    word = word[:-3] + 'e'
            elif word[-4:] == 'ator':
                if _m_degree(word[:-4], _vowels) > 0:
                    word = word[:-2] + 'e'
        elif word[-2] == 's':
            if word[-5:] == 'alism':
                if _m_degree(word[:-5], _vowels) > 0:
                    word = word[:-3]
            elif word[-7:] in {'iveness', 'fulness', 'ousness'}:
                if _m_degree(word[:-7], _vowels) > 0:
                    word = word[:-4]
        elif word[-2] == 't':
            if word[-5:] == 'aliti':
                if _m_degree(word[:-5], _vowels) > 0:
                    word = word[:-3]
            elif word[-5:] == 'iviti':
                if _m_degree(word[:-5], _vowels) > 0:
                    word = word[:-3] + 'e'
            elif word[-6:] == 'biliti':
                if _m_degree(word[:-6], _vowels) > 0:
                    word = word[:-5] + 'le'

    # Step 3: -ic-, -full, -ness etc.
    if word[-5:] == 'icate':
        if _m_degree(word[:-5], _vowels) > 0:
            word = word[:-3]
    elif word[-5:] == 'ative':
        if _m_degree(word[:-5], _vowels) > 0:
            word = word[:-5]
    elif word[-5:] in {'alize', 'iciti'}:
        if _m_degree(word[:-5], _vowels) > 0:
            word = word[:-3]
    elif word[-4:] == 'ical':
        if _m_degree(word[:-4], _vowels) > 0:
            word = word[:-2]
    elif word[-3:] == 'ful':
        if _m_degree(word[:-3], _vowels) > 0:
            word = word[:-3]
    elif word[-4:] == 'ness':
        if _m_degree(word[:-4], _vowels) > 0:
            word = word[:-4]

    # Step 4: remove remaining suffixes when m > 1
    if word[-2:] == 'al':
        if _m_degree(word[:-2], _vowels) > 1:
            word = word[:-2]
    elif word[-4:] == 'ance':
        if _m_degree(word[:-4], _vowels) > 1:
            word = word[:-4]
    elif word[-4:] == 'ence':
        if _m_degree(word[:-4], _vowels) > 1:
            word = word[:-4]
    elif word[-2:] == 'er':
        if _m_degree(word[:-2], _vowels) > 1:
            word = word[:-2]
    elif word[-2:] == 'ic':
        if _m_degree(word[:-2], _vowels) > 1:
            word = word[:-2]
    elif word[-4:] == 'able':
        if _m_degree(word[:-4], _vowels) > 1:
            word = word[:-4]
    elif word[-4:] == 'ible':
        if _m_degree(word[:-4], _vowels) > 1:
            word = word[:-4]
    elif word[-3:] == 'ant':
        if _m_degree(word[:-3], _vowels) > 1:
            word = word[:-3]
    elif word[-5:] == 'ement':
        if _m_degree(word[:-5], _vowels) > 1:
            word = word[:-5]
    elif word[-4:] == 'ment':
        if _m_degree(word[:-4], _vowels) > 1:
            word = word[:-4]
    elif word[-3:] == 'ent':
        if _m_degree(word[:-3], _vowels) > 1:
            word = word[:-3]
    elif word[-4:] in {'sion', 'tion'}:
        # only 'ion' is removed; the preceding s/t stays with the stem
        if _m_degree(word[:-3], _vowels) > 1:
            word = word[:-3]
    elif word[-2:] == 'ou':
        if _m_degree(word[:-2], _vowels) > 1:
            word = word[:-2]
    elif word[-3:] == 'ism':
        if _m_degree(word[:-3], _vowels) > 1:
            word = word[:-3]
    elif word[-3:] == 'ate':
        if _m_degree(word[:-3], _vowels) > 1:
            word = word[:-3]
    elif word[-3:] == 'iti':
        if _m_degree(word[:-3], _vowels) > 1:
            word = word[:-3]
    elif word[-3:] == 'ous':
        if _m_degree(word[:-3], _vowels) > 1:
            word = word[:-3]
    elif word[-3:] == 'ive':
        if _m_degree(word[:-3], _vowels) > 1:
            word = word[:-3]
    elif word[-3:] == 'ize':
        if _m_degree(word[:-3], _vowels) > 1:
            word = word[:-3]

    # Step 5a: remove a final 'e'
    if word[-1] == 'e':
        if _m_degree(word[:-1], _vowels) > 1:
            word = word[:-1]
        elif (_m_degree(word[:-1], _vowels) == 1 and
              not _ends_in_cvc(word[:-1], _vowels)):
            word = word[:-1]

    # Step 5b: undouble a final 'll'
    if word[-2:] == 'll' and _m_degree(word, _vowels) > 1:
        word = word[:-1]

    # Change 'Y' back to 'y' if it survived stemming
    # (str.replace handles all occurrences in one pass, replacing the
    # lint-flagged manual index loop)
    word = word.replace('Y', 'y')

    return word
def _sb_r1(term, vowels, r1_prefixes=None):
694
    """Return the R1 region, as defined in the Porter2 specification."""
695
    vowel_found = False
696
    if hasattr(r1_prefixes, '__iter__'):
697
        for prefix in r1_prefixes:
698
            if term[:len(prefix)] == prefix:
699
                return len(prefix)
700
701
    for i in range(len(term)):
0 ignored issues
show
unused-code introduced by
Consider using enumerate instead of iterating with range and len
Loading history...
702
        if not vowel_found and term[i] in vowels:
703
            vowel_found = True
704
        elif vowel_found and term[i] not in vowels:
705
            return i + 1
706
    return len(term)
707
708
709
def _sb_r2(term, vowels, r1_prefixes=None):
    """Return the R2 region, as defined in the Porter2 specification."""
    # R2 is found by applying the R1 rule a second time, starting from R1
    start = _sb_r1(term, vowels, r1_prefixes)
    return start + _sb_r1(term[start:], vowels)
def _sb_ends_in_short_syllable(term, vowels, codanonvowels):
716
    """Return True iff term ends in a short syllable.
717
718
    (...according to the Porter2 specification.)
719
720
    NB: This is akin to the CVC test from the Porter stemmer. The description
721
    is unfortunately poor/ambiguous.
722
    """
723
    if not term:
724
        return False
725
    if len(term) == 2:
726
        if term[-2] in vowels and term[-1] not in vowels:
727
            return True
728
    elif len(term) >= 3:
729
        if ((term[-3] not in vowels and term[-2] in vowels and
730
             term[-1] in codanonvowels)):
731
            return True
732
    return False
733
734
735
def _sb_short_word(term, vowels, codanonvowels, r1_prefixes=None):
    """Return True iff term is a short word.

    (...according to the Porter2 specification.)
    """
    # a word is short when R1 is empty and it ends in a short syllable
    r1_start = _sb_r1(term, vowels, r1_prefixes)
    return (r1_start == len(term) and
            _sb_ends_in_short_syllable(term, vowels, codanonvowels))
def porter2(word, early_english=False):
    """Return the Porter2 (Snowball English) stem.

    The Porter2 (Snowball English) stemmer is defined at:
    http://snowball.tartarus.org/algorithms/english/stemmer.html

    :param word: the word to calculate the stem of
    :param early_english: set to True in order to remove -eth & -est (2nd & 3rd
        person singular verbal agreement suffixes)
    :returns: word stem
    :rtype: str

    >>> porter2('reading')
    'read'
    >>> porter2('suspension')
    'suspens'
    >>> porter2('elusiveness')
    'elus'

    >>> porter2('eateth', early_english=True)
    'eat'
    """
    # pylint: disable=too-many-branches
    # pylint: disable=too-many-return-statements

    _vowels = {'a', 'e', 'i', 'o', 'u', 'y'}
    _codanonvowels = {"'", 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm',
                      'n', 'p', 'q', 'r', 's', 't', 'v', 'z'}
    _doubles = {'bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt'}
    _li = {'c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't'}

    # R1 prefixes should be in order from longest to shortest to prevent
    # masking
    _r1_prefixes = ('commun', 'gener', 'arsen')
    _exception1dict = {  # special changes:
        'skis': 'ski', 'skies': 'sky', 'dying': 'die',
        'lying': 'lie', 'tying': 'tie',
        # special -LY cases:
        'idly': 'idl', 'gently': 'gentl', 'ugly': 'ugli',
        'early': 'earli', 'only': 'onli', 'singly': 'singl'}
    _exception1set = {'sky', 'news', 'howe', 'atlas', 'cosmos', 'bias',
                      'andes'}
    _exception2set = {'inning', 'outing', 'canning', 'herring', 'earring',
                      'proceed', 'exceed', 'succeed'}

    # lowercase, normalize, and compose
    word = unicodedata.normalize('NFC', text_type(word.lower()))
    # replace apostrophe-like characters with U+0027, per
    # http://snowball.tartarus.org/texts/apostrophe.html
    word = word.replace('’', '\'')

    # Exceptions 1
    if word in _exception1dict:
        return _exception1dict[word]
    if word in _exception1set:
        return word

    # Return word if stem is shorter than 3
    if len(word) < 3:
        return word

    # Remove initial ', if present.
    while word and word[0] == '\'':
        word = word[1:]
        # Return word if stem is shorter than 2
        if len(word) < 2:
            return word

    # Re-map vocalic Y to y (Y will be C, y will be V)
    if word[0] == 'y':
        word = 'Y' + word[1:]
    for i in range(1, len(word)):
        if word[i] == 'y' and word[i-1] in _vowels:
            word = word[:i] + 'Y' + word[i+1:]

    r1_start = _sb_r1(word, _vowels, _r1_prefixes)
    r2_start = _sb_r2(word, _vowels, _r1_prefixes)

    # Step 0: strip -'s', -'s, and -'
    if word[-3:] == '\'s\'':
        word = word[:-3]
    elif word[-2:] == '\'s':
        word = word[:-2]
    elif word[-1:] == '\'':
        word = word[:-1]
    # Return word if stem is shorter than 3
    if len(word) < 3:
        return word

    # Step 1a
    if word[-4:] == 'sses':
        word = word[:-2]
    elif word[-3:] in {'ied', 'ies'}:
        if len(word) > 4:
            word = word[:-2]
        else:
            word = word[:-1]
    elif word[-2:] in {'us', 'ss'}:
        # -us and -ss are left unchanged
        pass
    elif word[-1] == 's':
        # delete -s when the preceding part contains a vowel that is not
        # immediately before the s
        if _sb_has_vowel(word[:-2], _vowels):
            word = word[:-1]

    # Exceptions 2
    if word in _exception2set:
        return word

    # Step 1b
    step1b_flag = False
    if word[-5:] == 'eedly':
        if len(word[r1_start:]) >= 5:
            word = word[:-3]
    elif word[-5:] == 'ingly':
        if _sb_has_vowel(word[:-5], _vowels):
            word = word[:-5]
            step1b_flag = True
    elif word[-4:] == 'edly':
        if _sb_has_vowel(word[:-4], _vowels):
            word = word[:-4]
            step1b_flag = True
    elif word[-3:] == 'eed':
        if len(word[r1_start:]) >= 3:
            word = word[:-1]
    elif word[-3:] == 'ing':
        if _sb_has_vowel(word[:-3], _vowels):
            word = word[:-3]
            step1b_flag = True
    elif word[-2:] == 'ed':
        if _sb_has_vowel(word[:-2], _vowels):
            word = word[:-2]
            step1b_flag = True
    elif early_english:
        if word[-3:] == 'est':
            if _sb_has_vowel(word[:-3], _vowels):
                word = word[:-3]
                step1b_flag = True
        elif word[-3:] == 'eth':
            if _sb_has_vowel(word[:-3], _vowels):
                word = word[:-3]
                step1b_flag = True

    if step1b_flag:
        if word[-2:] in {'at', 'bl', 'iz'}:
            word += 'e'
        elif word[-2:] in _doubles:
            word = word[:-1]
        elif _sb_short_word(word, _vowels, _codanonvowels, _r1_prefixes):
            word += 'e'

    # Step 1c: -y/-Y preceded by a non-vowel (and not at word start) -> -i
    if ((len(word) > 2 and word[-1] in {'Y', 'y'} and
         word[-2] not in _vowels)):
        word = word[:-1] + 'i'

    # Step 2
    # NB: word[-2:-1] slicing (rather than word[-2] indexing) guards
    # against IndexError when step 1b has shortened the word to a single
    # character (e.g. 'aing' -> 'a'); an empty slice matches no branch.
    if word[-2:-1] == 'a':
        if word[-7:] == 'ational':
            if len(word[r1_start:]) >= 7:
                word = word[:-5] + 'e'
        elif word[-6:] == 'tional':
            if len(word[r1_start:]) >= 6:
                word = word[:-2]
    elif word[-2:-1] == 'c':
        if word[-4:] in {'enci', 'anci'}:
            if len(word[r1_start:]) >= 4:
                word = word[:-1] + 'e'
    elif word[-2:-1] == 'e':
        if word[-4:] == 'izer':
            if len(word[r1_start:]) >= 4:
                word = word[:-1]
    elif word[-2:-1] == 'g':
        if word[-3:] == 'ogi':
            if ((r1_start >= 1 and len(word[r1_start:]) >= 3 and
                 word[-4] == 'l')):
                word = word[:-1]
    elif word[-2:-1] == 'l':
        if word[-6:] == 'lessli':
            if len(word[r1_start:]) >= 6:
                word = word[:-2]
        elif word[-5:] in {'entli', 'fulli', 'ousli'}:
            if len(word[r1_start:]) >= 5:
                word = word[:-2]
        elif word[-4:] == 'abli':
            if len(word[r1_start:]) >= 4:
                word = word[:-1] + 'e'
        elif word[-4:] == 'alli':
            if len(word[r1_start:]) >= 4:
                word = word[:-2]
        elif word[-3:] == 'bli':
            if len(word[r1_start:]) >= 3:
                word = word[:-1] + 'e'
        elif word[-2:] == 'li':
            if ((r1_start >= 1 and len(word[r1_start:]) >= 2 and
                 word[-3] in _li)):
                word = word[:-2]
    elif word[-2:-1] == 'o':
        if word[-7:] == 'ization':
            if len(word[r1_start:]) >= 7:
                word = word[:-5] + 'e'
        elif word[-5:] == 'ation':
            if len(word[r1_start:]) >= 5:
                word = word[:-3] + 'e'
        elif word[-4:] == 'ator':
            if len(word[r1_start:]) >= 4:
                word = word[:-2] + 'e'
    elif word[-2:-1] == 's':
        if word[-7:] in {'fulness', 'ousness', 'iveness'}:
            if len(word[r1_start:]) >= 7:
                word = word[:-4]
        elif word[-5:] == 'alism':
            if len(word[r1_start:]) >= 5:
                word = word[:-3]
    elif word[-2:-1] == 't':
        if word[-6:] == 'biliti':
            if len(word[r1_start:]) >= 6:
                word = word[:-5] + 'le'
        elif word[-5:] == 'aliti':
            if len(word[r1_start:]) >= 5:
                word = word[:-3]
        elif word[-5:] == 'iviti':
            if len(word[r1_start:]) >= 5:
                word = word[:-3] + 'e'

    # Step 3
    if word[-7:] == 'ational':
        if len(word[r1_start:]) >= 7:
            word = word[:-5] + 'e'
    elif word[-6:] == 'tional':
        if len(word[r1_start:]) >= 6:
            word = word[:-2]
    elif word[-5:] in {'alize', 'icate', 'iciti'}:
        if len(word[r1_start:]) >= 5:
            word = word[:-3]
    elif word[-5:] == 'ative':
        # the only step-3 suffix that requires R2 rather than R1
        if len(word[r2_start:]) >= 5:
            word = word[:-5]
    elif word[-4:] == 'ical':
        if len(word[r1_start:]) >= 4:
            word = word[:-2]
    elif word[-4:] == 'ness':
        if len(word[r1_start:]) >= 4:
            word = word[:-4]
    elif word[-3:] == 'ful':
        if len(word[r1_start:]) >= 3:
            word = word[:-3]

    # Step 4: delete the longest matching suffix found in R2
    for suffix in ('ement', 'ance', 'ence', 'able', 'ible', 'ment', 'ant',
                   'ent', 'ism', 'ate', 'iti', 'ous', 'ive', 'ize', 'al', 'er',
                   'ic'):
        if word[-len(suffix):] == suffix:
            if len(word[r2_start:]) >= len(suffix):
                word = word[:-len(suffix)]
            break
    else:
        # -ion is deleted only when preceded by s or t
        if word[-3:] == 'ion':
            if ((len(word[r2_start:]) >= 3 and len(word) >= 4 and
                 word[-4] in {'s', 't'})):
                word = word[:-3]

    # Step 5
    if word[-1] == 'e':
        if (len(word[r2_start:]) >= 1 or
                (len(word[r1_start:]) >= 1 and
                 not _sb_ends_in_short_syllable(word[:-1], _vowels,
                                                _codanonvowels))):
            word = word[:-1]
    elif word[-1] == 'l':
        if len(word[r2_start:]) >= 1 and word[-2] == 'l':
            word = word[:-1]

    # Change 'Y' back to 'y' if it survived stemming
    word = word.replace('Y', 'y')

    return word
1026
def sb_german(word, alternate_vowels=False):
    """Return Snowball German stem.

    The Snowball German stemmer is defined at:
    http://snowball.tartarus.org/algorithms/german/stemmer.html

    :param word: the word to calculate the stem of
    :param alternate_vowels: composes ae as ä, oe as ö, and ue as ü before
        running the algorithm
    :returns: word stem
    :rtype: str

    >>> sb_german('lesen')
    'les'
    >>> sb_german('graues')
    'grau'
    >>> sb_german('buchstabieren')
    'buchstabi'
    """
    # pylint: disable=too-many-branches

    _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'ä', 'ö', 'ü'}
    _s_endings = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 'r', 't'}
    _st_endings = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 't'}

    # lowercase, normalize, and compose
    word = unicodedata.normalize('NFC', word.lower())
    word = word.replace('ß', 'ss')

    # Mark u and y between vowels as consonantal by upper-casing them,
    # so the region computations below do not treat them as vowels.
    if len(word) > 2:
        for i in range(2, len(word)):
            if word[i] in _vowels and word[i-2] in _vowels:
                if word[i-1] == 'u':
                    word = word[:i-1] + 'U' + word[i:]
                elif word[i-1] == 'y':
                    word = word[:i-1] + 'Y' + word[i:]

    if alternate_vowels:
        # Compose ae/oe/ue into umlauts, temporarily shielding 'que' so
        # that its 'ue' is not converted.
        word = word.replace('ae', 'ä')
        word = word.replace('oe', 'ö')
        word = word.replace('que', 'Q')
        word = word.replace('ue', 'ü')
        word = word.replace('Q', 'que')

    # Per the spec, R1 must begin no earlier than position 3.
    r1_start = max(3, _sb_r1(word, _vowels))
    r2_start = _sb_r2(word, _vowels)

    # Step 1: strip declensional endings (within R1)
    niss_flag = False
    if word[-3:] == 'ern':
        if len(word[r1_start:]) >= 3:
            word = word[:-3]
    elif word[-2:] == 'em':
        if len(word[r1_start:]) >= 2:
            word = word[:-2]
    elif word[-2:] == 'er':
        if len(word[r1_start:]) >= 2:
            word = word[:-2]
    elif word[-2:] == 'en':
        if len(word[r1_start:]) >= 2:
            word = word[:-2]
            niss_flag = True
    elif word[-2:] == 'es':
        if len(word[r1_start:]) >= 2:
            word = word[:-2]
            niss_flag = True
    elif word[-1:] == 'e':
        if len(word[r1_start:]) >= 1:
            word = word[:-1]
            niss_flag = True
    elif word[-1:] == 's':
        # -s is removed only after a valid s-ending consonant
        if ((len(word[r1_start:]) >= 1 and len(word) >= 2 and
             word[-2] in _s_endings)):
            word = word[:-1]

    # If -e/-en/-es was removed, also undouble a resulting -niss to -nis.
    if niss_flag and word[-4:] == 'niss':
        word = word[:-1]

    # Step 2: strip verbal/comparative endings (within R1)
    if word[-3:] == 'est':
        if len(word[r1_start:]) >= 3:
            word = word[:-3]
    elif word[-2:] == 'en':
        if len(word[r1_start:]) >= 2:
            word = word[:-2]
    elif word[-2:] == 'er':
        if len(word[r1_start:]) >= 2:
            word = word[:-2]
    elif word[-2:] == 'st':
        # -st only after a valid st-ending and a minimum word length
        if ((len(word[r1_start:]) >= 2 and len(word) >= 6 and
             word[-3] in _st_endings)):
            word = word[:-2]

    # Step 3: strip derivational suffixes (within R2)
    if word[-4:] == 'isch':
        if len(word[r2_start:]) >= 4 and word[-5] != 'e':
            word = word[:-4]
    elif word[-4:] in {'lich', 'heit'}:
        if len(word[r2_start:]) >= 4:
            word = word[:-4]
            # also strip a preceding -er/-en in R1
            if ((word[-2:] in {'er', 'en'} and
                 len(word[r1_start:]) >= 2)):
                word = word[:-2]
    elif word[-4:] == 'keit':
        if len(word[r2_start:]) >= 4:
            word = word[:-4]
            # also strip a preceding -lich or -ig in R2
            if word[-4:] == 'lich' and len(word[r2_start:]) >= 4:
                word = word[:-4]
            elif word[-2:] == 'ig' and len(word[r2_start:]) >= 2:
                word = word[:-2]
    elif word[-3:] in {'end', 'ung'}:
        if len(word[r2_start:]) >= 3:
            word = word[:-3]
            # also strip a preceding -ig (not after e) in R2
            if ((word[-2:] == 'ig' and len(word[r2_start:]) >= 2 and
                 word[-3] != 'e')):
                word = word[:-2]
    elif word[-2:] in {'ig', 'ik'}:
        if len(word[r2_start:]) >= 2 and word[-3] != 'e':
            word = word[:-2]

    # Change 'Y' and 'U' back to lowercase if survived stemming
    for i in range(0, len(word)):
        if word[i] == 'Y':
            word = word[:i] + 'y' + word[i+1:]
        elif word[i] == 'U':
            word = word[:i] + 'u' + word[i+1:]

    # Remove umlauts
    _umlauts = dict(zip((ord(_) for _ in 'äöü'), 'aou'))
    word = word.translate(_umlauts)

    return word
1160
def sb_dutch(word):
    """Return Snowball Dutch stem.

    The Snowball Dutch stemmer is defined at:
    http://snowball.tartarus.org/algorithms/dutch/stemmer.html

    :param word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> sb_dutch('lezen')
    'lez'
    >>> sb_dutch('opschorting')
    'opschort'
    >>> sb_dutch('ongrijpbaarheid')
    'ongrijp'
    """
    # pylint: disable=too-many-branches

    _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'è'}
    _not_s_endings = {'a', 'e', 'i', 'j', 'o', 'u', 'y', 'è'}

    def _undouble(word):
        """Undouble endings -kk, -dd, and -tt."""
        if ((len(word) > 1 and word[-1] == word[-2] and
             word[-1] in {'d', 'k', 't'})):
            return word[:-1]
        return word

    # lowercase, normalize, decompose, filter umlauts & acutes out, and compose
    word = unicodedata.normalize('NFC', text_type(word.lower()))
    _accented = dict(zip((ord(_) for _ in 'äëïöüáéíóú'), 'aeiouaeiou'))
    word = word.translate(_accented)

    # Mark initial y, y after a vowel, and i between vowels as consonantal
    # by upper-casing them.
    for i in range(len(word)):
        if i == 0 and word[0] == 'y':
            word = 'Y' + word[1:]
        elif word[i] == 'y' and word[i-1] in _vowels:
            word = word[:i] + 'Y' + word[i+1:]
        # NOTE(review): at i == 0 the word[i-1] below wraps to the LAST
        # character, so an initial 'i' can be treated as intervocalic when
        # the word ends in a vowel — confirm against the Snowball spec.
        elif (word[i] == 'i' and word[i-1] in _vowels and i+1 < len(word) and
              word[i+1] in _vowels):
            word = word[:i] + 'I' + word[i+1:]

    # Per the spec, R1 must begin no earlier than position 3.
    r1_start = max(3, _sb_r1(word, _vowels))
    r2_start = _sb_r2(word, _vowels)

    # Step 1: strip plural/inflectional endings (within R1)
    if word[-5:] == 'heden':
        if len(word[r1_start:]) >= 5:
            # -heden is replaced by -heid
            word = word[:-3] + 'id'
    elif word[-3:] == 'ene':
        if ((len(word[r1_start:]) >= 3 and
             (word[-4] not in _vowels and word[-6:-3] != 'gem'))):
            word = _undouble(word[:-3])
    elif word[-2:] == 'en':
        if ((len(word[r1_start:]) >= 2 and
             (word[-3] not in _vowels and word[-5:-2] != 'gem'))):
            word = _undouble(word[:-2])
    elif word[-2:] == 'se':
        if len(word[r1_start:]) >= 2 and word[-3] not in _not_s_endings:
            word = word[:-2]
    elif word[-1:] == 's':
        if len(word[r1_start:]) >= 1 and word[-2] not in _not_s_endings:
            word = word[:-1]

    # Step 2: delete a final -e after a non-vowel (recorded for step 3b)
    e_removed = False
    if word[-1:] == 'e':
        if len(word[r1_start:]) >= 1 and word[-2] not in _vowels:
            word = _undouble(word[:-1])
            e_removed = True

    # Step 3a: -heid (not after c), then retry the -en rule
    if word[-4:] == 'heid':
        if len(word[r2_start:]) >= 4 and word[-5] != 'c':
            word = word[:-4]
            if word[-2:] == 'en':
                if ((len(word[r1_start:]) >= 2 and
                     (word[-3] not in _vowels and word[-5:-2] != 'gem'))):
                    word = _undouble(word[:-2])

    # Step 3b: derivational suffixes (within R2)
    if word[-4:] == 'lijk':
        if len(word[r2_start:]) >= 4:
            word = word[:-4]
            # Repeat step 2
            if word[-1:] == 'e':
                if len(word[r1_start:]) >= 1 and word[-2] not in _vowels:
                    word = _undouble(word[:-1])
    elif word[-4:] == 'baar':
        if len(word[r2_start:]) >= 4:
            word = word[:-4]
    elif word[-3:] in ('end', 'ing'):
        if len(word[r2_start:]) >= 3:
            word = word[:-3]
            # also strip a preceding -ig (not after e), else undouble
            if ((word[-2:] == 'ig' and len(word[r2_start:]) >= 2 and
                 word[-3] != 'e')):
                word = word[:-2]
            else:
                word = _undouble(word)
    elif word[-3:] == 'bar':
        # -bar is removed only if step 2 actually removed an -e
        if len(word[r2_start:]) >= 3 and e_removed:
            word = word[:-3]
    elif word[-2:] == 'ig':
        if len(word[r2_start:]) >= 2 and word[-3] != 'e':
            word = word[:-2]

    # Step 4: undouble the vowel in a CVVC ending (e.g. -aan -> -an)
    if ((len(word) >= 4 and
         word[-3] == word[-2] and word[-2] in {'a', 'e', 'o', 'u'} and
         word[-4] not in _vowels and
         word[-1] not in _vowels and word[-1] != 'I')):
        word = word[:-2] + word[-1]

    # Change 'Y' and 'I' back to lowercase if survived stemming
    for i in range(0, len(word)):
        if word[i] == 'Y':
            word = word[:i] + 'y' + word[i+1:]
        elif word[i] == 'I':
            word = word[:i] + 'i' + word[i+1:]

    return word
1284
def sb_norwegian(word):
    """Return Snowball Norwegian stem.

    The Snowball Norwegian stemmer is defined at:
    http://snowball.tartarus.org/algorithms/norwegian/stemmer.html

    :param word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> sb_norwegian('lese')
    'les'
    >>> sb_norwegian('suspensjon')
    'suspensjon'
    >>> sb_norwegian('sikkerhet')
    'sikker'
    """
    _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'å', 'æ', 'ø'}
    _s_endings = {'b', 'c', 'd', 'f', 'g', 'h', 'j', 'l', 'm', 'n', 'o', 'p',
                  'r', 't', 'v', 'y', 'z'}
    # lowercase, normalize, and compose
    word = unicodedata.normalize('NFC', text_type(word.lower()))

    # R1 starts no earlier than position 3 and no later than the word's end.
    r1_start = min(max(3, _sb_r1(word, _vowels)), len(word))

    # Step 1: strip the longest matching inflectional ending within R1
    _r1 = word[r1_start:]
    if _r1[-7:] == 'hetenes':
        word = word[:-7]
    elif _r1[-6:] in {'hetene', 'hetens'}:
        word = word[:-6]
    elif _r1[-5:] in {'heten', 'heter', 'endes'}:
        word = word[:-5]
    elif _r1[-4:] in {'ande', 'ende', 'edes', 'enes', 'erte'}:
        # -erte is replaced by -er rather than deleted
        if word[-4:] == 'erte':
            word = word[:-2]
        else:
            word = word[:-4]
    elif _r1[-3:] in {'ede', 'ane', 'ene', 'ens', 'ers', 'ets', 'het', 'ast',
                      'ert'}:
        # -ert is replaced by -er rather than deleted
        if word[-3:] == 'ert':
            word = word[:-1]
        else:
            word = word[:-3]
    elif _r1[-2:] in {'en', 'ar', 'er', 'as', 'es', 'et'}:
        word = word[:-2]
    elif _r1[-1:] in {'a', 'e'}:
        word = word[:-1]
    elif _r1[-1:] == 's':
        # -s is deleted only after a valid s-ending, or after k preceded
        # by a non-vowel
        if (((len(word) > 1 and word[-2] in _s_endings) or
             (len(word) > 2 and word[-2] == 'k' and word[-3] not in _vowels))):
            word = word[:-1]

    # Step 2: shorten -dt/-vt within R1 to -d/-v
    if word[r1_start:][-2:] in {'dt', 'vt'}:
        word = word[:-1]

    # Step 3: strip derivational endings within R1
    _r1 = word[r1_start:]
    if _r1[-7:] == 'hetslov':
        word = word[:-7]
    elif _r1[-4:] in {'eleg', 'elig', 'elov', 'slov'}:
        word = word[:-4]
    elif _r1[-3:] in {'leg', 'eig', 'lig', 'els', 'lov'}:
        word = word[:-3]
    elif _r1[-2:] == 'ig':
        word = word[:-2]

    return word
1355
def sb_swedish(word):
    """Return Snowball Swedish stem.

    The Snowball Swedish stemmer is defined at:
    http://snowball.tartarus.org/algorithms/swedish/stemmer.html

    :param word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> sb_swedish('undervisa')
    'undervis'
    >>> sb_swedish('suspension')
    'suspension'
    >>> sb_swedish('visshet')
    'viss'
    """
    _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'ä', 'å', 'ö'}
    _s_endings = {'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n',
                  'o', 'p', 'r', 't', 'v', 'y'}

    # lowercase, normalize, and compose
    word = unicodedata.normalize('NFC', text_type(word.lower()))

    # R1 starts no earlier than position 3 and no later than the word's end.
    r1_start = min(max(3, _sb_r1(word, _vowels)), len(word))

    # Step 1: strip the longest matching inflectional ending within R1
    _r1 = word[r1_start:]
    if _r1[-7:] == 'heterna':
        word = word[:-7]
    elif _r1[-6:] == 'hetens':
        word = word[:-6]
    elif _r1[-5:] in {'anden', 'heten', 'heter', 'arnas', 'ernas', 'ornas',
                      'andes', 'arens', 'andet'}:
        word = word[:-5]
    elif _r1[-4:] in {'arna', 'erna', 'orna', 'ande', 'arne', 'aste', 'aren',
                      'ades', 'erns'}:
        word = word[:-4]
    elif _r1[-3:] in {'ade', 'are', 'ern', 'ens', 'het', 'ast'}:
        word = word[:-3]
    elif _r1[-2:] in {'ad', 'en', 'ar', 'er', 'or', 'as', 'es', 'at'}:
        word = word[:-2]
    elif _r1[-1:] in {'a', 'e'}:
        word = word[:-1]
    elif _r1[-1:] == 's':
        # -s is deleted only after a valid s-ending
        if len(word) > 1 and word[-2] in _s_endings:
            word = word[:-1]

    # Step 2: shorten a double-consonant ending within R1 by one character
    if word[r1_start:][-2:] in {'dd', 'gd', 'nn', 'dt', 'gt', 'kt', 'tt'}:
        word = word[:-1]

    # Step 3: adjectival suffixes within R1
    _r1 = word[r1_start:]
    if _r1[-5:] == 'fullt':
        # -fullt is shortened to -full
        word = word[:-1]
    elif _r1[-4:] == 'löst':
        # -löst is shortened to -lös
        word = word[:-1]
    elif _r1[-3:] in {'lig', 'els'}:
        word = word[:-3]
    elif _r1[-2:] == 'ig':
        word = word[:-2]

    return word
1421
def sb_danish(word):
    """Return Snowball Danish stem.

    The Snowball Danish stemmer is defined at:
    http://snowball.tartarus.org/algorithms/danish/stemmer.html

    :param word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> sb_danish('underviser')
    'undervis'
    >>> sb_danish('suspension')
    'suspension'
    >>> sb_danish('sikkerhed')
    'sikker'
    """
    _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'å', 'æ', 'ø'}
    _s_endings = {'a', 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n',
                  'o', 'p', 'r', 't', 'v', 'y', 'z', 'å'}

    # lowercase, normalize, and compose
    word = unicodedata.normalize('NFC', text_type(word.lower()))

    # R1 starts no earlier than position 3 and no later than the word's end.
    r1_start = min(max(3, _sb_r1(word, _vowels)), len(word))

    # Step 1: strip the longest matching inflectional ending within R1
    _r1 = word[r1_start:]
    if _r1[-7:] == 'erendes':
        word = word[:-7]
    elif _r1[-6:] in {'erende', 'hedens'}:
        word = word[:-6]
    elif _r1[-5:] in {'ethed', 'erede', 'heden', 'heder', 'endes', 'ernes',
                      'erens', 'erets'}:
        word = word[:-5]
    elif _r1[-4:] in {'ered', 'ende', 'erne', 'eren', 'erer', 'heds', 'enes',
                      'eres', 'eret'}:
        word = word[:-4]
    elif _r1[-3:] in {'hed', 'ene', 'ere', 'ens', 'ers', 'ets'}:
        word = word[:-3]
    elif _r1[-2:] in {'en', 'er', 'es', 'et'}:
        word = word[:-2]
    elif _r1[-1:] == 'e':
        word = word[:-1]
    elif _r1[-1:] == 's':
        # -s is deleted only after a valid s-ending
        if len(word) > 1 and word[-2] in _s_endings:
            word = word[:-1]

    # Step 2: shorten -gd/-dt/-gt/-kt within R1 by one character
    if word[r1_start:][-2:] in {'gd', 'dt', 'gt', 'kt'}:
        word = word[:-1]

    # Step 3
    # -igst is first shortened to -ig (checked on the whole word)
    if word[-4:] == 'igst':
        word = word[:-2]

    _r1 = word[r1_start:]
    repeat_step2 = False
    if _r1[-4:] == 'elig':
        word = word[:-4]
        repeat_step2 = True
    elif _r1[-4:] == 'løst':
        # -løst is shortened to -løs
        word = word[:-1]
    elif _r1[-3:] in {'lig', 'els'}:
        word = word[:-3]
        repeat_step2 = True
    elif _r1[-2:] == 'ig':
        word = word[:-2]
        repeat_step2 = True

    # After deleting -elig/-lig/-els/-ig, step 2 is applied once more.
    if repeat_step2:
        if word[r1_start:][-2:] in {'gd', 'dt', 'gt', 'kt'}:
            word = word[:-1]

    # Step 4: undouble a final double consonant within R1
    if ((len(word[r1_start:]) >= 1 and len(word) >= 2 and
         word[-1] == word[-2] and word[-1] not in _vowels)):
        word = word[:-1]

    return word
1503
def clef_german(word):
    """Return CLEF German stem.

    The CLEF German stemmer is defined at:
    http://members.unine.ch/jacques.savoy/clef/germanStemmer.txt

    :param word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> clef_german('lesen')
    'lese'
    >>> clef_german('graues')
    'grau'
    >>> clef_german('buchstabieren')
    'buchstabier'
    """
    # lowercase, normalize, and compose
    word = unicodedata.normalize('NFC', text_type(word.lower()))

    # strip umlauts before suffix removal
    umlaut_map = dict(zip((ord(char) for char in 'äöü'), 'aou'))
    word = word.translate(umlaut_map)

    # remove plural suffixes, longest first, subject to minimum lengths
    last = len(word) - 1
    if last > 3:
        if last > 5 and word.endswith('nen'):
            return word[:-3]
        if last > 4 and word[-2:] in {'en', 'se', 'es', 'er'}:
            return word[:-2]
        if word[-1] in {'e', 'n', 'r', 's'}:
            return word[:-1]
    return word
1542
def clef_german_plus(word):
    """Return 'CLEF German stemmer plus' stem.

    The CLEF German stemmer plus is defined at:
    http://members.unine.ch/jacques.savoy/clef/germanStemmerPlus.txt

    :param word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> clef_german_plus('lesen')
    'les'
    >>> clef_german_plus('graues')
    'grau'
    >>> clef_german_plus('buchstabieren')
    'buchstabi'
    """
    _st_ending = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 't'}

    # lowercase, normalize, and compose
    word = unicodedata.normalize('NFC', text_type(word.lower()))

    # strip umlauts and acute/grave/circumflex accents
    accent_map = dict(zip((ord(char) for char in 'äàáâöòóôïìíîüùúû'),
                          'aaaaooooiiiiuuuu'))
    word = word.translate(accent_map)

    # Step 1: declensional endings, subject to minimum word lengths
    last = len(word) - 1
    if last > 4 and word.endswith('ern'):
        word = word[:-3]
    elif last > 3 and word[-2:] in {'em', 'en', 'er', 'es'}:
        word = word[:-2]
    elif last > 2 and (word.endswith('e') or
                       (word.endswith('s') and word[-2] in _st_ending)):
        word = word[:-1]

    # Step 2: verbal/comparative endings on the shortened word
    last = len(word) - 1
    if last > 4 and word.endswith('est'):
        word = word[:-3]
    elif last > 3 and (word[-2:] in {'er', 'en'} or
                       (word.endswith('st') and word[-3] in _st_ending)):
        word = word[:-2]

    return word
1590
def clef_swedish(word):
    """Return CLEF Swedish stem.

    The CLEF Swedish stemmer is defined at:
    http://members.unine.ch/jacques.savoy/clef/swedishStemmer.txt

    :param word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> clef_swedish('undervisa')
    'undervis'
    >>> clef_swedish('suspension')
    'suspensio'
    >>> clef_swedish('visshet')
    'viss'
    """
    # strip a final plural -s first
    if len(word) > 4 and word[-1] == 's':
        word = word[:-1]

    # suffix groups, longest suffixes first; each group only applies to
    # words strictly longer than its minimum length
    suffix_groups = (
        (7, {'elser', 'heten'}),
        (6, {'arne', 'erna', 'ande', 'else', 'aste', 'orna', 'aren'}),
        (5, {'are', 'ast', 'het'}),
        (4, {'ar', 'er', 'or', 'en', 'at', 'te', 'et'}),
        (3, {'a', 'e', 'n', 't'}),
    )
    for min_len, suffixes in suffix_groups:
        if len(word) > min_len:
            cut = min_len - 2  # suffix length for this group
            if word[-cut:] in suffixes:
                return word[:-cut]
    return word
1630
1631
1632
def caumanns(word):
    """Return Caumanns German stem.

    Jörg Caumanns' stemmer is described in his article at:
    https://refubium.fu-berlin.de/bitstream/handle/fub188/18405/tr-b-99-16.pdf

    This implementation is based on the GermanStemFilter described at:
    http://www.evelix.ch/unternehmen/Blog/evelix/2013/11/11/inner-workings-of-the-german-analyzer-in-lucene

    :param word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> caumanns('lesen')
    'les'
    >>> caumanns('graues')
    'grau'
    >>> caumanns('buchstabieren')
    'buchstabier'
    """
    if not word:
        return ''

    upper_initial = word[0].isupper()
    word = unicodedata.normalize('NFC', text_type(word.lower()))

    # # Part 2: Substitution
    # 1. Map umlauts to their base vowels & ß to ss
    word = word.translate({ord(umlaut): base
                           for umlaut, base in zip('äöü', 'aou')})
    word = word.replace('ß', 'ss')

    # 2. Mark the second of two identical adjacent characters with '*'
    # (comparison is against the already-marked text, so 'lll' -> 'l*l')
    marked = [word[0]]
    for char in word[1:]:
        marked.append('*' if marked[-1] == char else char)
    word = ''.join(marked)

    # 3. Encode multi-letter sequences as single placeholder symbols
    for sequence, symbol in (('sch', '$'), ('ch', '§'), ('ei', '%'),
                             ('ie', '&'), ('ig', '#'), ('st', '!')):
        word = word.replace(sequence, symbol)

    # # Part 1: Recursive Context-Free Stripping
    # 1. Strip the seven inflectional suffixes, one per pass
    while len(word) > 3:
        if ((len(word) > 4 and word[-2:] in {'em', 'er'}) or
                (len(word) > 5 and word[-2:] == 'nd')):
            word = word[:-2]
        elif (word[-1] in {'e', 's', 'n'} or
              (not upper_initial and word[-1] in {'t', '!'})):
            word = word[:-1]
        else:
            break

    # Additional optimizations:
    if len(word) > 5 and word[-5:] == 'erin*':
        word = word[:-1]
    if word[-1] == 'z':
        word = word[:-1] + 'x'

    # Reverse the placeholder substitutions:
    for symbol, sequence in (('$', 'sch'), ('§', 'ch'), ('%', 'ei'),
                             ('&', 'ie'), ('#', 'ig'), ('!', 'st')):
        word = word.replace(symbol, sequence)

    # Expand each '*' back into the character it doubled
    # (lookup is into the pre-expansion string, as in the original)
    expanded = [word[0]]
    for i in range(1, len(word)):
        expanded.append(word[i-1] if word[i] == '*' else word[i])
    word = ''.join(expanded)

    # Finally, convert gege to ge
    if len(word) > 4:
        word = word.replace('gege', 'ge', 1)

    return word
1716
1717
1718
def uealite(word, max_word_length=20, max_acro_length=8, return_rule_no=False,
            var=None):
    """Return UEA-Lite stem.

    The UEA-Lite stemmer is discussed in:
    Jenkins, Marie-Claire and Dan Smith. 2005. "Conservative stemming for
    search and indexing."
    http://lemur.cmp.uea.ac.uk/Research/stemmer/stemmer25feb.pdf

    This is chiefly based on the Java implementation of the algorithm, with
    variants based on the Perl implementation and Jason Adams' Ruby port.

    Java version: http://lemur.cmp.uea.ac.uk/Research/stemmer/UEAstem.java
    Perl version: http://lemur.cmp.uea.ac.uk/Research/stemmer/UEAstem.pl
    Ruby version: https://github.com/ealdent/uea-stemmer

    :param word: the word to calculate the stem of
    :param max_word_length: the maximum word length allowed (longer words
        are returned unstemmed with rule 95)
    :param max_acro_length: the maximum acronym length allowed (only
        checked by the 'Adams' variant, which returns rule 96 beyond it)
    :param return_rule_no: if True, returns the stem along with rule number
    :param var: variant to use (set to 'Adams' to use Jason Adams' rules,
                or 'Perl' to use the original Perl set of rules)
    :returns: word stem
    :rtype: str or tuple(str, int)
    """
    # words always returned unstemmed (rule 90)
    problem_words = {'is', 'as', 'this', 'has', 'was', 'during'}

    # rule table format:
    # top-level dictionary: length-of-suffix: dict-of-rules
    # dict-of-rules: suffix: (rule_no, suffix_length_to_delete,
    #                         suffix_to_append)
    rule_table = {7: {'titudes': (30, 1, None),
                      'fulness': (34, 4, None),
                      'ousness': (35, 4, None),
                      'eadings': (40.7, 4, None),
                      'oadings': (40.6, 4, None),
                      'ealings': (42.4, 4, None),
                      'ailings': (42.2, 4, None),
                      },
                  6: {'aceous': (1, 6, None),
                      'aining': (24, 3, None),
                      'acting': (25, 3, None),
                      'ttings': (26, 5, None),
                      'viding': (27, 3, 'e'),
                      'ssings': (37, 4, None),
                      'ulting': (38, 3, None),
                      'eading': (40.7, 3, None),
                      'oading': (40.6, 3, None),
                      'edings': (40.5, 4, None),
                      'ddings': (40.4, 5, None),
                      'ldings': (40.3, 4, None),
                      'rdings': (40.2, 4, None),
                      'ndings': (40.1, 4, None),
                      'llings': (41, 5, None),
                      'ealing': (42.4, 3, None),
                      'olings': (42.3, 4, None),
                      'ailing': (42.2, 3, None),
                      'elings': (42.1, 4, None),
                      'mmings': (44.3, 5, None),
                      'ngings': (45.2, 4, None),
                      'ggings': (45.1, 5, None),
                      'stings': (47, 4, None),
                      'etings': (48.4, 4, None),
                      'ntings': (48.2, 4, None),
                      'irings': (54.4, 4, 'e'),
                      'urings': (54.3, 4, 'e'),
                      'ncings': (54.2, 4, 'e'),
                      'things': (58.1, 1, None),
                      },
                  5: {'iases': (11.4, 2, None),
                      'ained': (13.6, 2, None),
                      'erned': (13.5, 2, None),
                      'ifted': (14, 2, None),
                      'ected': (15, 2, None),
                      'vided': (16, 1, None),
                      'erred': (19, 3, None),
                      'urred': (20.5, 3, None),
                      'lored': (20.4, 2, None),
                      'eared': (20.3, 2, None),
                      'tored': (20.2, 1, None),
                      'noted': (22.4, 1, None),
                      'leted': (22.3, 1, None),
                      'anges': (23, 1, None),
                      'tting': (26, 4, None),
                      'ulted': (32, 2, None),
                      'uming': (33, 3, 'e'),
                      'rabed': (36.1, 1, None),
                      'rebed': (36.1, 1, None),
                      'ribed': (36.1, 1, None),
                      'robed': (36.1, 1, None),
                      'rubed': (36.1, 1, None),
                      'ssing': (37, 3, None),
                      'vings': (39, 4, 'e'),
                      'eding': (40.5, 3, None),
                      'dding': (40.4, 4, None),
                      'lding': (40.3, 3, None),
                      'rding': (40.2, 3, None),
                      'nding': (40.1, 3, None),
                      'dings': (40, 4, 'e'),
                      'lling': (41, 4, None),
                      'oling': (42.3, 3, None),
                      'eling': (42.1, 3, None),
                      'lings': (42, 4, 'e'),
                      'mming': (44.3, 4, None),
                      'rming': (44.2, 3, None),
                      'lming': (44.1, 3, None),
                      'mings': (44, 4, 'e'),
                      'nging': (45.2, 3, None),
                      'gging': (45.1, 4, None),
                      'gings': (45, 4, 'e'),
                      'aning': (46.6, 3, None),
                      'ening': (46.5, 3, None),
                      'gning': (46.4, 3, None),
                      'nning': (46.3, 4, None),
                      'oning': (46.2, 3, None),
                      'rning': (46.1, 3, None),
                      'sting': (47, 3, None),
                      'eting': (48.4, 3, None),
                      'pting': (48.3, 3, None),
                      'nting': (48.2, 3, None),
                      'cting': (48.1, 3, None),
                      'tings': (48, 4, 'e'),
                      'iring': (54.4, 3, 'e'),
                      'uring': (54.3, 3, 'e'),
                      'ncing': (54.2, 3, 'e'),
                      'sings': (54, 4, 'e'),
                      # 'lling': (55, 3, None),  # masked by 41
                      'ating': (57, 3, 'e'),
                      'thing': (58.1, 0, None),
                      },
                  4: {'eeds': (7, 1, None),
                      'uses': (11.3, 1, None),
                      'sses': (11.2, 2, None),
                      'eses': (11.1, 2, 'is'),
                      'tled': (12.5, 1, None),
                      'pled': (12.4, 1, None),
                      'bled': (12.3, 1, None),
                      'eled': (12.2, 2, None),
                      'lled': (12.1, 2, None),
                      'ened': (13.7, 2, None),
                      'rned': (13.4, 2, None),
                      'nned': (13.3, 3, None),
                      'oned': (13.2, 2, None),
                      'gned': (13.1, 2, None),
                      'ered': (20.1, 2, None),
                      'reds': (20, 2, None),
                      'tted': (21, 3, None),
                      'uted': (22.2, 1, None),
                      'ated': (22.1, 1, None),
                      'ssed': (28, 2, None),
                      'umed': (31, 1, None),
                      'beds': (36, 3, None),
                      'ving': (39, 3, 'e'),
                      'ding': (40, 3, 'e'),
                      'ling': (42, 3, 'e'),
                      'nged': (43.2, 1, None),
                      'gged': (43.1, 3, None),
                      'ming': (44, 3, 'e'),
                      'ging': (45, 3, 'e'),
                      'ning': (46, 3, 'e'),
                      'ting': (48, 3, 'e'),
                      # 'ssed': (49, 2, None),  # masked by 28
                      # 'lled': (53, 2, None),  # masked by 12.1
                      'zing': (54.1, 3, 'e'),
                      'sing': (54, 3, 'e'),
                      'lves': (60.1, 3, 'f'),
                      'aped': (61.3, 1, None),
                      'uded': (61.2, 1, None),
                      'oded': (61.1, 1, None),
                      # 'ated': (61, 1, None),  # masked by 22.1
                      'ones': (63.6, 1, None),
                      'izes': (63.5, 1, None),
                      'ures': (63.4, 1, None),
                      'ines': (63.3, 1, None),
                      'ides': (63.2, 1, None),
                      },
                  3: {'ces': (2, 1, None),
                      'sis': (4, 0, None),
                      'tis': (5, 0, None),
                      'eed': (7, 0, None),
                      'ued': (8, 1, None),
                      'ues': (9, 1, None),
                      'ees': (10, 1, None),
                      'ses': (11, 1, None),
                      'led': (12, 2, None),
                      'ned': (13, 1, None),
                      'ved': (17, 1, None),
                      'ced': (18, 1, None),
                      'red': (20, 1, None),
                      'ted': (22, 2, None),
                      'sed': (29, 1, None),
                      'bed': (36, 2, None),
                      'ged': (43, 1, None),
                      'les': (50, 1, None),
                      'tes': (51, 1, None),
                      'zed': (52, 1, None),
                      'ied': (56, 3, 'y'),
                      'ies': (59, 3, 'y'),
                      'ves': (60, 1, None),
                      'pes': (63.8, 1, None),
                      'mes': (63.7, 1, None),
                      'ges': (63.1, 1, None),
                      'ous': (65, 0, None),
                      'ums': (66, 0, None),
                      },
                  2: {'cs': (3, 0, None),
                      'ss': (6, 0, None),
                      'es': (63, 2, None),
                      'is': (64, 2, 'e'),
                      'us': (67, 0, None),
                      }}

    if var == 'Perl':
        # The Perl variant simply lacks some of the Java variant's rules.
        perl_deletions = {7: ['eadings', 'oadings', 'ealings', 'ailings'],
                          6: ['ttings', 'ssings', 'edings', 'ddings',
                              'ldings', 'rdings', 'ndings', 'llings',
                              'olings', 'elings', 'mmings', 'ngings',
                              'ggings', 'stings', 'etings', 'ntings',
                              'irings', 'urings', 'ncings', 'things'],
                          5: ['vings', 'dings', 'lings', 'mings', 'gings',
                              'tings', 'sings'],
                          4: ['eeds', 'reds', 'beds']}

        # Delete the above rules from rule_table
        for del_len in perl_deletions:
            for term in perl_deletions[del_len]:
                del rule_table[del_len][term]

    elif var == 'Adams':
        # NOTE(review): the literals 22.10 and 63.10 below are the floats
        # 22.1 and 63.1, so these rule numbers collide with rules 22.1 and
        # 63.1 when reported via return_rule_no — confirm against the Ruby
        # port's intended numbering.
        adams_additions = {6: {'chited': (22.8, 1, None)},
                           5: {'dying': (58.2, 4, 'ie'),
                               'tying': (58.2, 4, 'ie'),
                               'vited': (22.6, 1, None),
                               'mited': (22.5, 1, None),
                               'vided': (22.9, 1, None),
                               'mided': (22.10, 1, None),
                               'lying': (58.2, 4, 'ie'),
                               'arred': (19.1, 3, None),
                               },
                           4: {'ited': (22.7, 2, None),
                               'oked': (31.1, 1, None),
                               'aked': (31.1, 1, None),
                               'iked': (31.1, 1, None),
                               'uked': (31.1, 1, None),
                               'amed': (31, 1, None),
                               'imed': (31, 1, None),
                               'does': (31.2, 2, None),
                               },
                           3: {'oed': (31.3, 1, None),
                               'oes': (31.2, 1, None),
                               'kes': (63.1, 1, None),
                               'des': (63.10, 1, None),
                               'res': (63.9, 1, None),
                               }}

        # Add the above additional rules to rule_table
        for del_len in adams_additions:
            rule_table[del_len] = dict(rule_table[del_len],
                                       **adams_additions[del_len])
        # Add additional problem word
        problem_words.add('menses')

    def _stem_with_duplicate_character_check(word, del_len):
        """Delete del_len chars (one more for a final -s), undoubling the end."""
        if word[-1] == 's':
            del_len += 1
        stemmed_word = word[:-del_len]
        # if the stem now ends in a doubled letter, drop one of the pair
        if re.match(r'.*(\w)\1$', stemmed_word):
            stemmed_word = stemmed_word[:-1]
        return stemmed_word

    def _stem(word):
        """Return (stem, rule_no) for a single word."""
        stemmed_word = word
        rule_no = 0

        # guard clauses: empty input, stop-list words, over-long words
        if not word:
            return word, 0
        if word in problem_words:
            return word, 90
        if max_word_length and len(word) > max_word_length:
            return word, 95

        # contractions and possessives (rule 94)
        if "'" in word:
            if word[-2:] in {"'s", "'S"}:
                stemmed_word = word[:-2]
            if word[-1:] == "'":
                stemmed_word = word[:-1]
            stemmed_word = stemmed_word.replace("n't", 'not')
            stemmed_word = stemmed_word.replace("'ve", 'have')
            stemmed_word = stemmed_word.replace("'re", 'are')
            stemmed_word = stemmed_word.replace("'m", 'am')
            return stemmed_word, 94

        # 90-series rules: numbers, hyphenations, underscores, acronyms,
        # capitalized words — all returned (nearly) unstemmed
        if word.isdigit():
            return word, 90.3
        else:
            hyphen = word.find('-')
            if hyphen > 0 and hyphen < len(word):
                if word[:hyphen].isalpha() and word[hyphen+1:].isalpha():
                    return word, 90.2
                else:
                    return word, 90.1
            elif '_' in word:
                return word, 90
            elif word[-1] == 's' and word[:-1].isupper():
                # plural acronym: strip the -s only
                if var == 'Adams' and len(word)-1 > max_acro_length:
                    return word, 96
                return word[:-1], 91.1
            elif word.isupper():
                if var == 'Adams' and len(word) > max_acro_length:
                    return word, 96
                return word, 91
            elif re.match(r'^.*[A-Z].*[A-Z].*$', word):
                return word, 92
            elif word[0].isupper():
                return word, 93
            elif var == 'Adams' and re.match(r'^[a-z]{1}(|[rl])(ing|ed)$',
                                             word):
                return word, 97

        # try the longest matching suffix first (7 down to 2 characters)
        for n in range(7, 1, -1):
            if word[-n:] in rule_table[n]:
                rule_no, del_len, add_str = rule_table[n][word[-n:]]
                if del_len:
                    stemmed_word = word[:-del_len]
                else:
                    stemmed_word = word
                if add_str:
                    stemmed_word += add_str
                break

        # fallback rules when no table entry matched
        if not rule_no:
            if re.match(r'.*\w\wings?$', word):  # rule 58
                stemmed_word = _stem_with_duplicate_character_check(word, 3)
                rule_no = 58
            elif re.match(r'.*\w\weds?$', word):  # rule 62
                stemmed_word = _stem_with_duplicate_character_check(word, 2)
                rule_no = 62
            elif word[-1] == 's':  # rule 68
                stemmed_word = word[:-1]
                rule_no = 68

        return stemmed_word, rule_no

    stem, rule_no = _stem(word)
    if return_rule_no:
        return stem, rule_no
    return stem
2064
2065
2066
def paice_husk(word):
    """Return Paice-Husk stem.

    Implementation of the Paice-Husk Stemmer, also known as the Lancaster
    Stemmer, developed by Chris Paice, with the assistance of Gareth Husk

    This is based on the algorithm's description in:
    Paice, Chris D. 1990. "Another stemmer." ACM SIGIR Forum 24:3, Fall 1990.
    56-61. doi:10.1145/101306.101310.

    :param word: the word to calculate the stem of
    :returns: word stem
    :rtype: str
    """
    # rule table format:
    # top-level dictionary: length-of-suffix: dict-of-rules
    # dict-of-rules: suffix: (intact_only, suffix_length_to_delete,
    #                         suffix_to_append, terminate_after)
    # an entry may instead be a pair of such 4-tuples, tried in order
    # (see the len(...) < 4 test in the main loop below)
    rule_table = {6: {'ifiabl': (False, 6, None, True),
                      'plicat': (False, 4, 'y', True)},
                  5: {'guish': (False, 5, 'ct', True),
                      'sumpt': (False, 2, None, True),
                      'istry': (False, 5, None, True)},
                  4: {'ytic': (False, 3, 's', True),
                      'ceed': (False, 2, 'ss', True),
                      'hood': (False, 4, None, False),
                      'lief': (False, 1, 'v', True),
                      'verj': (False, 1, 't', True),
                      'misj': (False, 2, 't', True),
                      'iabl': (False, 4, 'y', True),
                      'iful': (False, 4, 'y', True),
                      'sion': (False, 4, 'j', False),
                      'xion': (False, 4, 'ct', True),
                      'ship': (False, 4, None, False),
                      'ness': (False, 4, None, False),
                      'ment': (False, 4, None, False),
                      'ript': (False, 2, 'b', True),
                      'orpt': (False, 2, 'b', True),
                      'duct': (False, 1, None, True),
                      'cept': (False, 2, 'iv', True),
                      'olut': (False, 2, 'v', True),
                      'sist': (False, 0, None, True)},
                  3: {'ied': (False, 3, 'y', False),
                      'eed': (False, 1, None, True),
                      'ing': (False, 3, None, False),
                      'iag': (False, 3, 'y', True),
                      'ish': (False, 3, None, False),
                      'fuj': (False, 1, 's', True),
                      'hej': (False, 1, 'r', True),
                      'abl': (False, 3, None, False),
                      'ibl': (False, 3, None, True),
                      'bil': (False, 2, 'l', False),
                      'ful': (False, 3, None, False),
                      'ial': (False, 3, None, False),
                      'ual': (False, 3, None, False),
                      'ium': (False, 3, None, True),
                      'ism': (False, 3, None, False),
                      'ion': (False, 3, None, False),
                      'ian': (False, 3, None, False),
                      'een': (False, 0, None, True),
                      'ear': (False, 0, None, True),
                      'ier': (False, 3, 'y', False),
                      'ies': (False, 3, 'y', False),
                      'sis': (False, 2, None, True),
                      'ous': (False, 3, None, False),
                      'ent': (False, 3, None, False),
                      'ant': (False, 3, None, False),
                      'ist': (False, 3, None, False),
                      'iqu': (False, 3, None, True),
                      'ogu': (False, 1, None, True),
                      'siv': (False, 3, 'j', False),
                      'eiv': (False, 0, None, True),
                      'bly': (False, 1, None, False),
                      'ily': (False, 3, 'y', False),
                      'ply': (False, 0, None, True),
                      'ogy': (False, 1, None, True),
                      'phy': (False, 1, None, True),
                      'omy': (False, 1, None, True),
                      'opy': (False, 1, None, True),
                      'ity': (False, 3, None, False),
                      'ety': (False, 3, None, False),
                      'lty': (False, 2, None, True),
                      'ary': (False, 3, None, False),
                      'ory': (False, 3, None, False),
                      'ify': (False, 3, None, True),
                      'ncy': (False, 2, 't', False),
                      'acy': (False, 3, None, False)},
                  2: {'ia': (True, 2, None, True),
                      'bb': (False, 1, None, True),
                      'ic': (False, 2, None, False),
                      'nc': (False, 1, 't', False),
                      'dd': (False, 1, None, True),
                      'ed': (False, 2, None, False),
                      'if': (False, 2, None, False),
                      'ag': (False, 2, None, False),
                      'gg': (False, 1, None, True),
                      'th': (True, 2, None, True),
                      'ij': (False, 1, 'd', True),
                      'uj': (False, 1, 'd', True),
                      'oj': (False, 1, 'd', True),
                      'nj': (False, 1, 'd', True),
                      'cl': (False, 1, None, True),
                      'ul': (False, 2, None, True),
                      'al': (False, 2, None, False),
                      'll': (False, 1, None, True),
                      'um': (True, 2, None, True),
                      'mm': (False, 1, None, True),
                      'an': (False, 2, None, False),
                      'en': (False, 2, None, False),
                      'nn': (False, 1, None, True),
                      'pp': (False, 1, None, True),
                      'er': (False, 2, None, False),
                      'ar': (False, 2, None, True),
                      'or': (False, 2, None, False),
                      'ur': (False, 2, None, False),
                      'rr': (False, 1, None, True),
                      'tr': (False, 1, None, False),
                      'is': (False, 2, None, False),
                      'ss': (False, 0, None, True),
                      'us': (True, 2, None, True),
                      'at': (False, 2, None, False),
                      'tt': (False, 1, None, True),
                      'iv': (False, 2, None, False),
                      'ly': (False, 2, None, False),
                      'iz': (False, 2, None, False),
                      'yz': (False, 1, 's', True)},
                  1: {'a': (True, 1, None, True),
                      'e': (False, 1, None, False),
                      'i': ((True, 1, None, True), (False, 1, 'y', False)),
                      'j': (False, 1, 's', True),
                      's': ((True, 1, None, False), (False, 0, None, True))}}

    def _has_vowel(word):
        """Return True if word contains a vowel (or 'y')."""
        for char in word:
            if char in {'a', 'e', 'i', 'o', 'u', 'y'}:
                return True
        return False

    def _acceptable(word):
        """Return True if word is an acceptable stem (Paice's length rules)."""
        # a stem starting with a vowel needs 2+ chars; otherwise 3+ chars
        # with a vowel (or 'y') somewhere after the first character
        if word and word[0] in {'a', 'e', 'i', 'o', 'u'}:
            return len(word) > 1
        return len(word) > 2 and _has_vowel(word[1:])

    def _apply_rule(word, rule, intact):
        """Apply one rule; return (word, accepted, intact, terminate)."""
        old_word = word
        only_intact, del_len, add_str, set_terminate = rule
        # print(word, word[-n:], rule)

        if (not only_intact) or (intact and only_intact):
            if del_len:
                word = word[:-del_len]
            if add_str:
                word += add_str
        else:
            # intact-only rule on a modified word: reject, keeping the
            # enclosing scope's current terminate flag
            return word, False, intact, terminate

        if _acceptable(word):
            return word, True, False, set_terminate
        else:
            # resulting stem unacceptable: restore the word and reject
            return old_word, False, intact, terminate

    terminate = False
    intact = True
    while not terminate:
        # try the longest matching suffix first (6 down to 1 characters)
        for n in range(6, 0, -1):
            if word[-n:] in rule_table[n]:
                accept = False
                # length < 4 means a pair of rules rather than one 4-tuple
                if len(rule_table[n][word[-n:]]) < 4:
                    for rule in rule_table[n][word[-n:]]:
                        (word, accept, intact,
                         terminate) = _apply_rule(word, rule, intact)
                        if accept:
                            break
                else:
                    rule = rule_table[n][word[-n:]]
                    (word, accept, intact,
                     terminate) = _apply_rule(word, rule, intact)

                if accept:
                    break
        else:
            # no suffix of any length matched: stemming is finished
            break

    return word
2246
2247
2248
def schinke(word):
    """Return the stem of a word according to the Schinke stemmer.

    Source:
    Schinke, Robyn, Mark Greengrass, Alexander M. Robertson, and Peter Willett.
    1996. "A Stemming Algorithm for Latin Text Databases." Journal of
    Documentation, 52(2). 172--187.
    doi:10.1108/eb026966

    :param word: the Latin word to stem
    :returns: a dict with the noun stem under key 'n' and the verb stem
        under key 'v'
    """
    # Rule 1: lowercase, decompose diacritics, and keep only a-z
    word = unicodedata.normalize('NFKD', text_type(word.lower()))
    word = ''.join(ch for ch in word if 'a' <= ch <= 'z')

    # Rule 2: normalize consonantal i/u spellings
    word = word.replace('j', 'i').replace('v', 'u')

    # Rule 3: strip the enclitic '-que' unless the remainder is on the
    # keep list (words for which '-que' is not an enclitic)
    keep_que = {'at', 'quo', 'ne', 'ita', 'abs', 'aps', 'abus', 'adae', 'adus',
                'deni', 'de', 'sus', 'obli', 'perae', 'plenis', 'quando',
                'quis', 'quae', 'cuius', 'cui', 'quem', 'quam', 'qua', 'qui',
                'quorum', 'quarum', 'quibus', 'quos', 'quas', 'quotusquis',
                'quous', 'ubi', 'undi', 'us', 'uter', 'uti', 'utro', 'utribi',
                'tor', 'co', 'conco', 'contor', 'detor', 'deco', 'exco',
                'extor', 'obtor', 'optor', 'retor', 'reco', 'attor', 'inco',
                'intor', 'praetor'}
    if word.endswith('que'):
        # This diverges from the paper by also returning 'que' itself unstemmed
        if word == 'que' or word[:-3] in keep_que:
            return {'n': word, 'v': word}
        word = word[:-3]

    # Default to the word itself when no ending matches or the stem would
    # be too short
    noun = word
    verb = word

    # Rule 4: strip the longest matching noun ending, keeping a stem of at
    # least two characters
    n_endings = {4: {'ibus'},
                 3: {'ius'},
                 2: {'is', 'nt', 'ae', 'os', 'am', 'ud', 'as', 'um', 'em',
                     'us', 'es', 'ia'},
                 1: {'a', 'e', 'i', 'o', 'u'}}
    for size in range(4, 0, -1):
        if word[-size:] in n_endings[size]:
            if len(word) - 2 >= size:
                noun = word[:-size]
            break

    # Verb endings that are simply removed
    v_strip = {4: {'mini', 'ntur', 'stis'},
               3: {'mur', 'mus', 'ris', 'sti', 'tis', 'tur'},
               2: {'ns', 'nt', 'ri'},
               1: {'m', 'r', 's', 't'}}
    # Verb endings that are replaced by a shorter suffix
    v_alter = {6: {'iuntur'},
               5: {'beris', 'erunt', 'untur'},
               4: {'iunt'},
               3: {'bor', 'ero', 'unt'},
               2: {'bo'}}
    for size in range(6, 0, -1):
        ending = word[-size:]
        if ending in v_strip.get(size, ()):
            if len(word) - 2 >= size:
                verb = word[:-size]
            break
        if ending in v_alter.get(size, ()):
            if ending in {'iuntur', 'erunt', 'untur', 'iunt', 'unt'}:
                suffix = 'i'
            elif ending in {'beris', 'bor', 'bo'}:
                suffix = 'bi'
            else:
                suffix = 'eri'
            candidate = word[:-size] + suffix

            # Technically this diverges from the paper by considering the
            # length of the stem without the new suffix
            if len(candidate) >= 2 + len(suffix):
                verb = candidate
            break

    return {'n': noun, 'v': verb}
2343
2344
2345
def s_stemmer(word):
    """Return the S-stemmed form of a word.

    The S stemmer is defined by:
    Harman, Donna. 1991. "How Effective Is Suffixing?" Journal of the American
    Society for Information Science, 42(1). 7--15.
    doi:10.1002/(SICI)1097-4571(199101)42:1%3C7::AID-ASI2%3E3.0.CO;2-P

    :param str word: the word to stem
    :returns: the word with its plural suffix removed, if any rule applies
    :rtype: str
    """
    lowered = word.lower()
    # Rule 1: -ies -> -y, unless preceded by 'e' or 'a'
    if lowered[-3:] == 'ies' and lowered[-4:-3] not in {'e', 'a'}:
        # The replacement 'y'/'Y' matches the case of the final letter.
        # (Parentheses are required here: without them the conditional
        # expression returned the bare string 'y' for lowercase input.)
        return word[:-3] + ('Y' if word[-1:].isupper() else 'y')
    # Rule 2: -es -> -e, unless preceded by 'a', 'e', or 'o'
    if lowered[-2:] == 'es' and lowered[-3:-2] not in {'a', 'e', 'o'}:
        return word[:-1]
    # Rule 3: -s removed, unless preceded by 'u' or 's'
    if lowered[-1:] == 's' and lowered[-2:-1] not in {'u', 's'}:
        return word[:-1]
    return word
2364
2365
2366
if __name__ == '__main__':
    # Run this module's doctests when it is executed as a script.
    import doctest
    doctest.testmod()
2369