Completed
Push — master ( 76e221...14a933 )
by Chris
08:59
created

abydos.stemmer.uealite()   F

Complexity

Conditions 41

Size

Total Lines 344
Code Lines 286

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 41
eloc 286
nop 5
dl 0
loc 344
rs 0
c 0
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method and Replace Temp with Query.

Complexity

Complex classes like abydos.stemmer.uealite() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19
"""abydos.stemmer.
20
21
The stemmer module defines word stemmers including:
22
23
    - the Lovins stemmer
24
    - the Porter and Porter2 (Snowball English) stemmers
25
    - Snowball stemmers for German, Dutch, Norwegian, Swedish, and Danish
26
    - CLEF German, German plus, and Swedish stemmers
27
#     - Caumanns' German stemmer
28
"""
29
30
from __future__ import unicode_literals
31
32
import re
33
import unicodedata
34
35
from six import text_type
36
from six.moves import range
37
38
39
def lovins(word):
    """Return Lovins stem.

    Lovins stemmer

    The Lovins stemmer is described in Julie Beth Lovins's article at:
    http://www.mt-archive.info/MT-1968-Lovins.pdf

    It removes the longest matching suffix (up to 11 characters) whose
    associated condition holds, undoubles a final doubled consonant, and
    finally applies Lovins' recoding transformations.

    :param word: the word to stem
    :returns: word stem
    :rtype: string

    >>> lovins('reading')
    'read'
    >>> lovins('suspension')
    'suspens'
    >>> lovins('elusiveness')
    'elus'
    >>> lovins('nucleoside')
    'nucleos'
    """
    # pylint: disable=too-many-branches, too-many-locals

    # lowercase, normalize, and compose
    word = unicodedata.normalize('NFC', text_type(word.lower()))

    # Each cond_* function implements one of Lovins' conditions A-CC.
    # Each receives the whole word and the length of the candidate suffix
    # and reports whether the suffix may be removed.

    def cond_b(word, suffix_len):
        """Return Lovins' condition B: minimum stem length 3."""
        return len(word)-suffix_len >= 3

    def cond_c(word, suffix_len):
        """Return Lovins' condition C: minimum stem length 4."""
        return len(word)-suffix_len >= 4

    def cond_d(word, suffix_len):
        """Return Lovins' condition D: minimum stem length 5."""
        return len(word)-suffix_len >= 5

    def cond_e(word, suffix_len):
        """Return Lovins' condition E: do not remove after 'e'."""
        return word[-suffix_len-1] != 'e'

    def cond_f(word, suffix_len):
        """Return Lovins' condition F: stem length 3 and not after 'e'."""
        return (len(word)-suffix_len >= 3 and
                word[-suffix_len-1] != 'e')

    def cond_g(word, suffix_len):
        """Return Lovins' condition G: stem length 3 and after 'f'."""
        return (len(word)-suffix_len >= 3 and
                word[-suffix_len-1] == 'f')

    def cond_h(word, suffix_len):
        """Return Lovins' condition H: after 't' or 'll'."""
        return (word[-suffix_len-1] == 't' or
                word[-suffix_len-2:-suffix_len] == 'll')

    def cond_i(word, suffix_len):
        """Return Lovins' condition I: not after 'e' or 'o'."""
        return word[-suffix_len-1] not in {'e', 'o'}

    def cond_j(word, suffix_len):
        """Return Lovins' condition J: not after 'a' or 'e'."""
        return word[-suffix_len-1] not in {'a', 'e'}

    def cond_k(word, suffix_len):
        """Return Lovins' condition K: stem length 3, after 'i'/'l'/'u?e'."""
        return (len(word)-suffix_len >= 3 and
                (word[-suffix_len-1] in {'i', 'l'} or
                 (word[-suffix_len-3] == 'u' and word[-suffix_len-1] == 'e')))

    def cond_l(word, suffix_len):
        """Return Lovins' condition L.

        Do not remove after 'u', 'x', or 's' -- unless the 's' is itself
        preceded by 'o' (i.e. the stem ends in 'os').  The 'os' test must
        compare the two-character slice before the suffix, not a single
        character.
        """
        return (word[-suffix_len-1] not in {'s', 'u', 'x'} or
                word[-suffix_len-2:-suffix_len] == 'os')

    def cond_m(word, suffix_len):
        """Return Lovins' condition M: not after 'a'/'c'/'e'/'m'."""
        return word[-suffix_len-1] not in {'a', 'c', 'e', 'm'}

    def cond_n(word, suffix_len):
        """Return Lovins' condition N: stem length 3 (4 if it ends ...s??)."""
        if len(word)-suffix_len >= 3:
            if word[-suffix_len-3] == 's':
                if len(word)-suffix_len >= 4:
                    return True
            else:
                return True
        return False

    def cond_o(word, suffix_len):
        """Return Lovins' condition O: after 'i' or 'l'."""
        return word[-suffix_len-1] in {'i', 'l'}

    def cond_p(word, suffix_len):
        """Return Lovins' condition P: not after 'c'."""
        return word[-suffix_len-1] != 'c'

    def cond_q(word, suffix_len):
        """Return Lovins' condition Q: stem length 3, not after 'l'/'n'."""
        return (len(word)-suffix_len >= 3 and
                word[-suffix_len-1] not in {'l', 'n'})

    def cond_r(word, suffix_len):
        """Return Lovins' condition R: after 'n' or 'r'."""
        return word[-suffix_len-1] in {'n', 'r'}

    def cond_s(word, suffix_len):
        """Return Lovins' condition S: after 'dr' or 't' (not 'tt')."""
        return (word[-suffix_len-2:-suffix_len] == 'dr' or
                (word[-suffix_len-1] == 't' and
                 word[-suffix_len-2:-suffix_len] != 'tt'))

    def cond_t(word, suffix_len):
        """Return Lovins' condition T: after 's' or 't' (not 'ot')."""
        return (word[-suffix_len-1] in {'s', 't'} and
                word[-suffix_len-2:-suffix_len] != 'ot')

    def cond_u(word, suffix_len):
        """Return Lovins' condition U: after 'l'/'m'/'n'/'r'."""
        return word[-suffix_len-1] in {'l', 'm', 'n', 'r'}

    def cond_v(word, suffix_len):
        """Return Lovins' condition V: after 'c'."""
        return word[-suffix_len-1] == 'c'

    def cond_w(word, suffix_len):
        """Return Lovins' condition W: not after 's' or 'u'."""
        return word[-suffix_len-1] not in {'s', 'u'}

    def cond_x(word, suffix_len):
        """Return Lovins' condition X.

        After 'i', 'l', or 'u?e' (an 'e' whose position minus two holds
        'u').  The 'u' must be tested as a single character; a one-char
        slice is used so that short words cannot raise IndexError.
        """
        return (word[-suffix_len-1] in {'i', 'l'} or
                (word[-suffix_len-3:-suffix_len-2] == 'u' and
                 word[-suffix_len-1] == 'e'))

    def cond_y(word, suffix_len):
        """Return Lovins' condition Y: after 'in'."""
        return word[-suffix_len-2:-suffix_len] == 'in'

    def cond_z(word, suffix_len):
        """Return Lovins' condition Z: not after 'f'."""
        return word[-suffix_len-1] != 'f'

    def cond_aa(word, suffix_len):
        """Return Lovins' condition AA: after d/f/l/t or ph/th/er/or/es."""
        return (word[-suffix_len-1] in {'d', 'f', 'l', 't'} or
                word[-suffix_len-2:-suffix_len] in {'ph', 'th', 'er', 'or',
                                                    'es'})

    def cond_bb(word, suffix_len):
        """Return Lovins' condition BB: stem length 3, not met/ryst."""
        return (len(word)-suffix_len >= 3 and
                word[-suffix_len-3:-suffix_len] != 'met' and
                word[-suffix_len-4:-suffix_len] != 'ryst')

    def cond_cc(word, suffix_len):
        """Return Lovins' condition CC: after 'l'."""
        return word[-suffix_len-1] == 'l'

    # Lovins' suffix list: each suffix maps to its removal condition
    # (None means removal is unconditional, apart from the global
    # minimum stem length of 2 enforced below).
    suffix = {'alistically': cond_b, 'arizability': None,
              'izationally': cond_b, 'antialness': None,
              'arisations': None, 'arizations': None, 'entialness': None,
              'allically': cond_c, 'antaneous': None, 'antiality': None,
              'arisation': None, 'arization': None, 'ationally': cond_b,
              'ativeness': None, 'eableness': cond_e, 'entations': None,
              'entiality': None, 'entialize': None, 'entiation': None,
              'ionalness': None, 'istically': None, 'itousness': None,
              'izability': None, 'izational': None, 'ableness': None,
              'arizable': None, 'entation': None, 'entially': None,
              'eousness': None, 'ibleness': None, 'icalness': None,
              'ionalism': None, 'ionality': None, 'ionalize': None,
              'iousness': None, 'izations': None, 'lessness': None,
              'ability': None, 'aically': None, 'alistic': cond_b,
              'alities': None, 'ariness': cond_e, 'aristic': None,
              'arizing': None, 'ateness': None, 'atingly': None,
              'ational': cond_b, 'atively': None, 'ativism': None,
              'elihood': cond_e, 'encible': None, 'entally': None,
              'entials': None, 'entiate': None, 'entness': None,
              'fulness': None, 'ibility': None, 'icalism': None,
              'icalist': None, 'icality': None, 'icalize': None,
              'ication': cond_g, 'icianry': None, 'ination': None,
              'ingness': None, 'ionally': None, 'isation': None,
              'ishness': None, 'istical': None, 'iteness': None,
              'iveness': None, 'ivistic': None, 'ivities': None,
              'ization': cond_f, 'izement': None, 'oidally': None,
              'ousness': None, 'aceous': None, 'acious': cond_b,
              'action': cond_g, 'alness': None, 'ancial': None,
              'ancies': None, 'ancing': cond_b, 'ariser': None,
              'arized': None, 'arizer': None, 'atable': None,
              'ations': cond_b, 'atives': None, 'eature': cond_z,
              'efully': None, 'encies': None, 'encing': None,
              'ential': None, 'enting': cond_c, 'entist': None,
              'eously': None, 'ialist': None, 'iality': None,
              'ialize': None, 'ically': None, 'icance': None,
              'icians': None, 'icists': None, 'ifully': None,
              'ionals': None, 'ionate': cond_d, 'ioning': None,
              'ionist': None, 'iously': None, 'istics': None,
              'izable': cond_e, 'lessly': None, 'nesses': None,
              'oidism': None, 'acies': None, 'acity': None,
              'aging': cond_b, 'aical': None, 'alist': None,
              'alism': cond_b, 'ality': None, 'alize': None,
              'allic': cond_bb, 'anced': cond_b, 'ances': cond_b,
              'antic': cond_c, 'arial': None, 'aries': None,
              'arily': None, 'arity': cond_b, 'arize': None,
              'aroid': None, 'ately': None, 'ating': cond_i,
              'ation': cond_b, 'ative': None, 'ators': None,
              'atory': None, 'ature': cond_e, 'early': cond_y,
              'ehood': None, 'eless': None, 'elity': None,
              'ement': None, 'enced': None, 'ences': None,
              'eness': cond_e, 'ening': cond_e, 'ental': None,
              'ented': cond_c, 'ently': None, 'fully': None,
              'ially': None, 'icant': None, 'ician': None,
              'icide': None, 'icism': None, 'icist': None,
              'icity': None, 'idine': cond_i, 'iedly': None,
              'ihood': None, 'inate': None, 'iness': None,
              'ingly': cond_b, 'inism': cond_j, 'inity': cond_cc,
              'ional': None, 'ioned': None, 'ished': None,
              'istic': None, 'ities': None, 'itous': None,
              'ively': None, 'ivity': None, 'izers': cond_f,
              'izing': cond_f, 'oidal': None, 'oides': None,
              'otide': None, 'ously': None, 'able': None, 'ably': None,
              'ages': cond_b, 'ally': cond_b, 'ance': cond_b, 'ancy': cond_b,
              'ants': cond_b, 'aric': None, 'arly': cond_k, 'ated': cond_i,
              'ates': None, 'atic': cond_b, 'ator': None, 'ealy': cond_y,
              'edly': cond_e, 'eful': None, 'eity': None, 'ence': None,
              'ency': None, 'ened': cond_e, 'enly': cond_e, 'eous': None,
              'hood': None, 'ials': None, 'ians': None, 'ible': None,
              'ibly': None, 'ical': None, 'ides': cond_l, 'iers': None,
              'iful': None, 'ines': cond_m, 'ings': cond_n, 'ions': cond_b,
              'ious': None, 'isms': cond_b, 'ists': None, 'itic': cond_h,
              'ized': cond_f, 'izer': cond_f, 'less': None, 'lily': None,
              'ness': None, 'ogen': None, 'ward': None, 'wise': None,
              'ying': cond_b, 'yish': None, 'acy': None, 'age': cond_b,
              'aic': None, 'als': cond_bb, 'ant': cond_b, 'ars': cond_o,
              'ary': cond_f, 'ata': None, 'ate': None, 'eal': cond_y,
              'ear': cond_y, 'ely': cond_e, 'ene': cond_e, 'ent': cond_c,
              'ery': cond_e, 'ese': None, 'ful': None, 'ial': None,
              'ian': None, 'ics': None, 'ide': cond_l, 'ied': None,
              'ier': None, 'ies': cond_p, 'ily': None, 'ine': cond_m,
              'ing': cond_n, 'ion': cond_q, 'ish': cond_c, 'ism': cond_b,
              'ist': None, 'ite': cond_aa, 'ity': None, 'ium': None,
              'ive': None, 'ize': cond_f, 'oid': None, 'one': cond_r,
              'ous': None, 'ae': None, 'al': cond_bb, 'ar': cond_x,
              'as': cond_b, 'ed': cond_e, 'en': cond_f, 'es': cond_e,
              'ia': None, 'ic': None, 'is': None, 'ly': cond_b,
              'on': cond_s, 'or': cond_t, 'um': cond_u, 'us': cond_v,
              'yl': cond_r, '\'s': None, 's\'': None, 'a': None,
              'e': None, 'i': None, 'o': None, 's': cond_w, 'y': cond_b}

    # Remove the longest suffix (checked longest-first) whose condition
    # holds and whose removal leaves a stem of at least 2 characters.
    for suffix_len in range(11, 0, -1):
        ending = word[-suffix_len:]
        if (ending in suffix and
                len(word)-suffix_len >= 2 and
                (suffix[ending] is None or
                 suffix[ending](word, suffix_len))):
            word = word[:-suffix_len]
            break

    # Conditional recoding rules; each fires only when its guard letter
    # is absent at the tested position.
    def recode9(stem):
        """Return Lovins' conditional recode rule 9."""
        if stem[-3:-2] in {'a', 'i', 'o'}:
            return stem
        return stem[:-2]+'l'

    def recode24(stem):
        """Return Lovins' conditional recode rule 24."""
        if stem[-4:-3] == 's':
            return stem
        return stem[:-1]+'s'

    def recode28(stem):
        """Return Lovins' conditional recode rule 28."""
        if stem[-4:-3] in {'p', 't'}:
            return stem
        return stem[:-1]+'s'

    def recode30(stem):
        """Return Lovins' conditional recode rule 30."""
        if stem[-4:-3] == 'm':
            return stem
        return stem[:-1]+'s'

    def recode32(stem):
        """Return Lovins' conditional recode rule 32."""
        if stem[-3:-2] == 'n':
            return stem
        return stem[:-1]+'s'

    # Undouble a final doubled consonant (rule 1 of the recoding phase).
    if word[-2:] in {'bb', 'dd', 'gg', 'll', 'mm', 'nn', 'pp', 'rr', 'ss',
                     'tt'}:
        word = word[:-1]

    # Recoding transformations: strings are unconditional replacements,
    # callables are the conditional rules defined above.
    recode = (('iev', 'ief'),
              ('uct', 'uc'),
              ('umpt', 'um'),
              ('rpt', 'rb'),
              ('urs', 'ur'),
              ('istr', 'ister'),
              ('metr', 'meter'),
              ('olv', 'olut'),
              ('ul', recode9),
              ('bex', 'bic'),
              ('dex', 'dic'),
              ('pex', 'pic'),
              ('tex', 'tic'),
              ('ax', 'ac'),
              ('ex', 'ec'),
              ('ix', 'ic'),
              ('lux', 'luc'),
              ('uad', 'uas'),
              ('vad', 'vas'),
              ('cid', 'cis'),
              ('lid', 'lis'),
              ('erid', 'eris'),
              ('pand', 'pans'),
              ('end', recode24),
              ('ond', 'ons'),
              ('lud', 'lus'),
              ('rud', 'rus'),
              ('her', recode28),
              ('mit', 'mis'),
              ('ent', recode30),
              ('ert', 'ers'),
              ('et', recode32),
              ('yt', 'ys'),
              ('yz', 'ys'))

    for ending, replacement in recode:
        if word.endswith(ending):
            if callable(replacement):
                word = replacement(word)
            else:
                word = word[:-len(ending)] + replacement

    return word
373
374
375
def _m_degree(term, vowels):
376
    """Return Porter helper function _m_degree value.
377
378
    m-degree is equal to the number of V to C transitions
379
380
    :param term: the word for which to calculate the m-degree
381
    :param vowels: the set of vowels in the language
382
    :returns: the m-degree as defined in the Porter stemmer definition
383
    """
384
    mdeg = 0
385
    last_was_vowel = False
386
    for letter in term:
387
        if letter in vowels:
388
            last_was_vowel = True
389
        else:
390
            if last_was_vowel:
391
                mdeg += 1
392
            last_was_vowel = False
393
    return mdeg
394
395
396
def _sb_has_vowel(term, vowels):
397
    """Return Porter helper function _sb_has_vowel value.
398
399
    :param term: the word to scan for vowels
400
    :param vowels: the set of vowels in the language
401
    :returns: true iff a vowel exists in the term (as defined in the Porter
402
        stemmer definition)
403
    """
404
    for letter in term:
405
        if letter in vowels:
406
            return True
407
    return False
408
409
410
def _ends_in_doubled_cons(term, vowels):
411
    """Return Porter helper function _ends_in_doubled_cons value.
412
413
    :param term: the word to check for a final doubled consonant
414
    :param vowels: the set of vowels in the language
415
    :returns: true iff the stem ends in a doubled consonant (as defined in the
416
        Porter stemmer definition)
417
    """
418
    if len(term) > 1 and term[-1] not in vowels and term[-2] == term[-1]:
419
        return True
420
    return False
421
422
423
def _ends_in_cvc(term, vowels):
424
    """Return Porter helper function _ends_in_cvc value.
425
426
    :param term: the word to scan for cvc
427
    :param vowels: the set of vowels in the language
428
    :returns: true iff the stem ends in cvc (as defined in the Porter stemmer
429
        definition)
430
    """
431
    if len(term) > 2 and (term[-1] not in vowels and
432
                          term[-2] in vowels and
433
                          term[-3] not in vowels and
434
                          term[-1] not in tuple('wxY')):
435
        return True
436
    return False
437
438
439
def porter(word, early_english=False):
    """Return Porter stem.

    The Porter stemmer is defined at:
    http://snowball.tartarus.org/algorithms/porter/stemmer.html

    The algorithm applies its five suffix-stripping steps in sequence;
    within each step at most one rule (the first whose suffix matches)
    fires.

    :param word: the word to calculate the stem of
    :param early_english: set to True in order to remove -eth & -est (2nd & 3rd
        person singular verbal agreement suffixes)
    :returns: word stem
    :rtype: str

    >>> porter('reading')
    'read'
    >>> porter('suspension')
    'suspens'
    >>> porter('elusiveness')
    'elus'

    >>> porter('eateth', early_english=True)
    'eat'
    """
    # pylint: disable=too-many-branches

    # lowercase, normalize, and compose
    word = unicodedata.normalize('NFC', text_type(word.lower()))

    # Return word if stem is shorter than 2
    if len(word) < 3:
        return word

    _vowels = {'a', 'e', 'i', 'o', 'u', 'y'}
    # Re-map consonantal y to Y (Y will be C, y will be V)
    # A leading 'y', or a 'y' following a vowel, acts as a consonant.
    if word[0] == 'y':
        word = 'Y' + word[1:]
    for i in range(1, len(word)):
        if word[i] == 'y' and word[i-1] in _vowels:
            word = word[:i] + 'Y' + word[i+1:]

    # Step 1a: plural endings (-sses, -ies, -ss, -s)
    if word[-1] == 's':
        if word[-4:] == 'sses':
            word = word[:-2]
        elif word[-3:] == 'ies':
            word = word[:-2]
        elif word[-2:] == 'ss':
            pass
        else:
            word = word[:-1]

    # Step 1b: -eed, -ed, -ing (and -est/-eth in early-English mode);
    # step1b_flag records that -ed/-ing (or -est/-eth) was removed, which
    # enables the cleanup rules that follow.
    step1b_flag = False
    if word[-3:] == 'eed':
        if _m_degree(word[:-3], _vowels) > 0:
            word = word[:-1]
    elif word[-2:] == 'ed':
        if _sb_has_vowel(word[:-2], _vowels):
            word = word[:-2]
            step1b_flag = True
    elif word[-3:] == 'ing':
        if _sb_has_vowel(word[:-3], _vowels):
            word = word[:-3]
            step1b_flag = True
    elif early_english:
        if word[-3:] == 'est':
            if _sb_has_vowel(word[:-3], _vowels):
                word = word[:-3]
                step1b_flag = True
        elif word[-3:] == 'eth':
            if _sb_has_vowel(word[:-3], _vowels):
                word = word[:-3]
                step1b_flag = True

    # Cleanup after -ed/-ing removal: restore 'e' after -at/-bl/-iz,
    # undouble a final consonant (except l/s/z), or add 'e' to a short
    # (m == 1, CVC) stem.
    if step1b_flag:
        if word[-2:] in {'at', 'bl', 'iz'}:
            word += 'e'
        elif (_ends_in_doubled_cons(word, _vowels) and
              word[-1] not in {'l', 's', 'z'}):
            word = word[:-1]
        elif _m_degree(word, _vowels) == 1 and _ends_in_cvc(word, _vowels):
            word += 'e'

    # Step 1c: final y -> i when the stem contains a vowel
    if word[-1] in {'Y', 'y'} and _sb_has_vowel(word[:-1], _vowels):
        word = word[:-1] + 'i'

    # Step 2: map double suffixes to single ones when the stem's
    # m-degree is > 0; dispatched on the penultimate letter.
    if len(word) > 1:
        if word[-2] == 'a':
            if word[-7:] == 'ational':
                if _m_degree(word[:-7], _vowels) > 0:
                    word = word[:-5] + 'e'
            elif word[-6:] == 'tional':
                if _m_degree(word[:-6], _vowels) > 0:
                    word = word[:-2]
        elif word[-2] == 'c':
            if word[-4:] in {'enci', 'anci'}:
                if _m_degree(word[:-4], _vowels) > 0:
                    word = word[:-1] + 'e'
        elif word[-2] == 'e':
            if word[-4:] == 'izer':
                if _m_degree(word[:-4], _vowels) > 0:
                    word = word[:-1]
        elif word[-2] == 'g':
            if word[-4:] == 'logi':
                if _m_degree(word[:-4], _vowels) > 0:
                    word = word[:-1]
        elif word[-2] == 'l':
            if word[-3:] == 'bli':
                if _m_degree(word[:-3], _vowels) > 0:
                    word = word[:-1] + 'e'
            elif word[-4:] == 'alli':
                if _m_degree(word[:-4], _vowels) > 0:
                    word = word[:-2]
            elif word[-5:] == 'entli':
                if _m_degree(word[:-5], _vowels) > 0:
                    word = word[:-2]
            elif word[-3:] == 'eli':
                if _m_degree(word[:-3], _vowels) > 0:
                    word = word[:-2]
            elif word[-5:] == 'ousli':
                if _m_degree(word[:-5], _vowels) > 0:
                    word = word[:-2]
        elif word[-2] == 'o':
            if word[-7:] == 'ization':
                if _m_degree(word[:-7], _vowels) > 0:
                    word = word[:-5] + 'e'
            elif word[-5:] == 'ation':
                if _m_degree(word[:-5], _vowels) > 0:
                    word = word[:-3] + 'e'
            elif word[-4:] == 'ator':
                if _m_degree(word[:-4], _vowels) > 0:
                    word = word[:-2] + 'e'
        elif word[-2] == 's':
            if word[-5:] == 'alism':
                if _m_degree(word[:-5], _vowels) > 0:
                    word = word[:-3]
            elif word[-7:] in {'iveness', 'fulness', 'ousness'}:
                if _m_degree(word[:-7], _vowels) > 0:
                    word = word[:-4]
        elif word[-2] == 't':
            if word[-5:] == 'aliti':
                if _m_degree(word[:-5], _vowels) > 0:
                    word = word[:-3]
            elif word[-5:] == 'iviti':
                if _m_degree(word[:-5], _vowels) > 0:
                    word = word[:-3] + 'e'
            elif word[-6:] == 'biliti':
                if _m_degree(word[:-6], _vowels) > 0:
                    word = word[:-5] + 'le'

    # Step 3: strip -ic-/-ful/-ness style suffixes when m(stem) > 0
    if word[-5:] == 'icate':
        if _m_degree(word[:-5], _vowels) > 0:
            word = word[:-3]
    elif word[-5:] == 'ative':
        if _m_degree(word[:-5], _vowels) > 0:
            word = word[:-5]
    elif word[-5:] in {'alize', 'iciti'}:
        if _m_degree(word[:-5], _vowels) > 0:
            word = word[:-3]
    elif word[-4:] == 'ical':
        if _m_degree(word[:-4], _vowels) > 0:
            word = word[:-2]
    elif word[-3:] == 'ful':
        if _m_degree(word[:-3], _vowels) > 0:
            word = word[:-3]
    elif word[-4:] == 'ness':
        if _m_degree(word[:-4], _vowels) > 0:
            word = word[:-4]

    # Step 4: strip remaining single suffixes when m(stem) > 1
    if word[-2:] == 'al':
        if _m_degree(word[:-2], _vowels) > 1:
            word = word[:-2]
    elif word[-4:] == 'ance':
        if _m_degree(word[:-4], _vowels) > 1:
            word = word[:-4]
    elif word[-4:] == 'ence':
        if _m_degree(word[:-4], _vowels) > 1:
            word = word[:-4]
    elif word[-2:] == 'er':
        if _m_degree(word[:-2], _vowels) > 1:
            word = word[:-2]
    elif word[-2:] == 'ic':
        if _m_degree(word[:-2], _vowels) > 1:
            word = word[:-2]
    elif word[-4:] == 'able':
        if _m_degree(word[:-4], _vowels) > 1:
            word = word[:-4]
    elif word[-4:] == 'ible':
        if _m_degree(word[:-4], _vowels) > 1:
            word = word[:-4]
    elif word[-3:] == 'ant':
        if _m_degree(word[:-3], _vowels) > 1:
            word = word[:-3]
    elif word[-5:] == 'ement':
        if _m_degree(word[:-5], _vowels) > 1:
            word = word[:-5]
    elif word[-4:] == 'ment':
        if _m_degree(word[:-4], _vowels) > 1:
            word = word[:-4]
    elif word[-3:] == 'ent':
        if _m_degree(word[:-3], _vowels) > 1:
            word = word[:-3]
    elif word[-4:] in {'sion', 'tion'}:
        # The Porter rule is (m>1) (s|t)ION -> (s|t): only '-ion' is
        # removed, hence the [:-3] slices here.
        if _m_degree(word[:-3], _vowels) > 1:
            word = word[:-3]
    elif word[-2:] == 'ou':
        if _m_degree(word[:-2], _vowels) > 1:
            word = word[:-2]
    elif word[-3:] == 'ism':
        if _m_degree(word[:-3], _vowels) > 1:
            word = word[:-3]
    elif word[-3:] == 'ate':
        if _m_degree(word[:-3], _vowels) > 1:
            word = word[:-3]
    elif word[-3:] == 'iti':
        if _m_degree(word[:-3], _vowels) > 1:
            word = word[:-3]
    elif word[-3:] == 'ous':
        if _m_degree(word[:-3], _vowels) > 1:
            word = word[:-3]
    elif word[-3:] == 'ive':
        if _m_degree(word[:-3], _vowels) > 1:
            word = word[:-3]
    elif word[-3:] == 'ize':
        if _m_degree(word[:-3], _vowels) > 1:
            word = word[:-3]

    # Step 5a: drop a final 'e' when m > 1, or when m == 1 and the stem
    # does not end CVC
    if word[-1] == 'e':
        if _m_degree(word[:-1], _vowels) > 1:
            word = word[:-1]
        elif (_m_degree(word[:-1], _vowels) == 1 and
              not _ends_in_cvc(word[:-1], _vowels)):
            word = word[:-1]

    # Step 5b: undouble a final 'll' when m > 1
    if word[-2:] == 'll' and _m_degree(word, _vowels) > 1:
        word = word[:-1]

    # Change 'Y' back to 'y' if it survived stemming
    for i in range(len(word)):
        if word[i] == 'Y':
            word = word[:i] + 'y' + word[i+1:]

    return word
687
688
689
def _sb_r1(term, vowels, r1_prefixes=None):
690
    """Return the R1 region, as defined in the Porter2 specification."""
691
    vowel_found = False
692
    if hasattr(r1_prefixes, '__iter__'):
693
        for prefix in r1_prefixes:
694
            if term[:len(prefix)] == prefix:
695
                return len(prefix)
696
697
    for i in range(len(term)):
698
        if not vowel_found and term[i] in vowels:
699
            vowel_found = True
700
        elif vowel_found and term[i] not in vowels:
701
            return i + 1
702
    return len(term)
703
704
705
def _sb_r2(term, vowels, r1_prefixes=None):
    """Return the R2 region, as defined in the Porter2 specification."""
    # R2 is the R1 of the R1 region: run the R1 scan a second time on the
    # tail of the word, offset by where R1 begins.
    r1 = _sb_r1(term, vowels, r1_prefixes)
    return r1 + _sb_r1(term[r1:], vowels)
709
710
711
def _sb_ends_in_short_syllable(term, vowels, codanonvowels):
712
    """Return True iff term ends in a short syllable.
713
714
    (...according to the Porter2 specification.)
715
716
    NB: This is akin to the CVC test from the Porter stemmer. The description
717
    is unfortunately poor/ambiguous.
718
    """
719
    if not term:
720
        return False
721
    if len(term) == 2:
722
        if term[-2] in vowels and term[-1] not in vowels:
723
            return True
724
    elif len(term) >= 3:
725
        if ((term[-3] not in vowels and term[-2] in vowels and
726
             term[-1] in codanonvowels)):
727
            return True
728
    return False
729
730
731
def _sb_short_word(term, vowels, codanonvowels, r1_prefixes=None):
    """Return True iff term is a short word.

    (...according to the Porter2 specification.)
    """
    # A word is short when R1 is empty (starts at the end of the word)
    # and the word itself ends in a short syllable.
    return (_sb_r1(term, vowels, r1_prefixes) == len(term) and
            _sb_ends_in_short_syllable(term, vowels, codanonvowels))
740
741
742
def porter2(word, early_english=False):
    """Return the Porter2 (Snowball English) stem.

    The Porter2 (Snowball English) stemmer is defined at:
    http://snowball.tartarus.org/algorithms/english/stemmer.html

    :param word: the word to calculate the stem of
    :param early_english: set to True in order to remove -eth & -est (2nd & 3rd
        person singular verbal agreement suffixes)
    :returns: word stem
    :rtype: str

    >>> porter2('reading')
    'read'
    >>> porter2('suspension')
    'suspens'
    >>> porter2('elusiveness')
    'elus'

    >>> porter2('eateth', early_english=True)
    'eat'
    """
    # pylint: disable=too-many-branches
    # pylint: disable=too-many-return-statements

    _vowels = {'a', 'e', 'i', 'o', 'u', 'y'}
    # consonants that may end a "short syllable" (excludes w, x, Y)
    _codanonvowels = {"'", 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm',
                      'n', 'p', 'q', 'r', 's', 't', 'v', 'z'}
    _doubles = {'bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt'}
    # letters that may precede a deletable -li suffix
    _li = {'c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't'}

    # R1 prefixes should be in order from longest to shortest to prevent
    # masking
    _r1_prefixes = ('commun', 'gener', 'arsen')
    _exception1dict = {  # special changes:
        'skis': 'ski', 'skies': 'sky', 'dying': 'die',
        'lying': 'lie', 'tying': 'tie',
        # special -LY cases:
        'idly': 'idl', 'gently': 'gentl', 'ugly': 'ugli',
        'early': 'earli', 'only': 'onli', 'singly': 'singl'}
    # invariant words: returned unchanged
    _exception1set = {'sky', 'news', 'howe', 'atlas', 'cosmos', 'bias',
                      'andes'}
    # words left untouched after step 1a
    _exception2set = {'inning', 'outing', 'canning', 'herring', 'earring',
                      'proceed', 'exceed', 'succeed'}

    # lowercase, normalize, and compose
    word = unicodedata.normalize('NFC', text_type(word.lower()))
    # replace apostrophe-like characters with U+0027, per
    # http://snowball.tartarus.org/texts/apostrophe.html
    word = word.replace('’', '\'')
    # NOTE(review): the next replace appears identical to the previous one;
    # presumably it targeted a second apostrophe-like codepoint (e.g. U+02BC)
    # that was lost in transcription — confirm against upstream.
    word = word.replace('’', '\'')

    # Exceptions 1
    if word in _exception1dict:
        return _exception1dict[word]
    elif word in _exception1set:
        return word

    # Return word if stem is shorter than 3
    if len(word) < 3:
        return word

    # Remove initial ', if present.
    while word and word[0] == '\'':
        word = word[1:]
        # Return word if stem is shorter than 2
        if len(word) < 2:
            return word

    # Re-map vocalic Y to y (Y will be C, y will be V)
    if word[0] == 'y':
        word = 'Y' + word[1:]
    for i in range(1, len(word)):
        if word[i] == 'y' and word[i-1] in _vowels:
            word = word[:i] + 'Y' + word[i+1:]

    r1_start = _sb_r1(word, _vowels, _r1_prefixes)
    r2_start = _sb_r2(word, _vowels, _r1_prefixes)

    # Step 0: strip possessive endings
    if word[-3:] == '\'s\'':
        word = word[:-3]
    elif word[-2:] == '\'s':
        word = word[:-2]
    elif word[-1:] == '\'':
        word = word[:-1]
    # Return word if stem is shorter than 3
    if len(word) < 3:
        return word

    # Step 1a: plural/verbal -s endings
    if word[-4:] == 'sses':
        word = word[:-2]
    elif word[-3:] in {'ied', 'ies'}:
        if len(word) > 4:
            word = word[:-2]
        else:
            word = word[:-1]
    elif word[-2:] in {'us', 'ss'}:
        # -us and -ss are protected: leave untouched
        pass
    elif word[-1] == 's':
        # delete -s only if a vowel occurs before the letter preceding it
        if _sb_has_vowel(word[:-2], _vowels):
            word = word[:-1]

    # Exceptions 2
    if word in _exception2set:
        return word

    # Step 1b: -ed/-ing (and early-English -est/-eth) endings.
    # `len(word[r1_start:]) >= n` tests that the n-letter suffix lies
    # entirely within R1.
    step1b_flag = False
    if word[-5:] == 'eedly':
        if len(word[r1_start:]) >= 5:
            word = word[:-3]
    elif word[-5:] == 'ingly':
        if _sb_has_vowel(word[:-5], _vowels):
            word = word[:-5]
            step1b_flag = True
    elif word[-4:] == 'edly':
        if _sb_has_vowel(word[:-4], _vowels):
            word = word[:-4]
            step1b_flag = True
    elif word[-3:] == 'eed':
        if len(word[r1_start:]) >= 3:
            word = word[:-1]
    elif word[-3:] == 'ing':
        if _sb_has_vowel(word[:-3], _vowels):
            word = word[:-3]
            step1b_flag = True
    elif word[-2:] == 'ed':
        if _sb_has_vowel(word[:-2], _vowels):
            word = word[:-2]
            step1b_flag = True
    elif early_english:
        if word[-3:] == 'est':
            if _sb_has_vowel(word[:-3], _vowels):
                word = word[:-3]
                step1b_flag = True
        elif word[-3:] == 'eth':
            if _sb_has_vowel(word[:-3], _vowels):
                word = word[:-3]
                step1b_flag = True

    # post-deletion repair: restore -e or undouble final consonant
    if step1b_flag:
        if word[-2:] in {'at', 'bl', 'iz'}:
            word += 'e'
        elif word[-2:] in _doubles:
            word = word[:-1]
        elif _sb_short_word(word, _vowels, _codanonvowels, _r1_prefixes):
            word += 'e'

    # Step 1c: replace suffixal y/Y by i when preceded by a non-vowel
    if ((len(word) > 2 and word[-1] in {'Y', 'y'} and
         word[-2] not in _vowels)):
        word = word[:-1] + 'i'

    # Step 2: dispatch on the penultimate letter of the word, then test
    # the longest matching suffix against R1
    if word[-2] == 'a':
        if word[-7:] == 'ational':
            if len(word[r1_start:]) >= 7:
                word = word[:-5] + 'e'
        elif word[-6:] == 'tional':
            if len(word[r1_start:]) >= 6:
                word = word[:-2]
    elif word[-2] == 'c':
        if word[-4:] in {'enci', 'anci'}:
            if len(word[r1_start:]) >= 4:
                word = word[:-1] + 'e'
    elif word[-2] == 'e':
        if word[-4:] == 'izer':
            if len(word[r1_start:]) >= 4:
                word = word[:-1]
    elif word[-2] == 'g':
        if word[-3:] == 'ogi':
            if ((r1_start >= 1 and len(word[r1_start:]) >= 3 and
                 word[-4] == 'l')):
                word = word[:-1]
    elif word[-2] == 'l':
        if word[-6:] == 'lessli':
            if len(word[r1_start:]) >= 6:
                word = word[:-2]
        elif word[-5:] in {'entli', 'fulli', 'ousli'}:
            if len(word[r1_start:]) >= 5:
                word = word[:-2]
        elif word[-4:] == 'abli':
            if len(word[r1_start:]) >= 4:
                word = word[:-1] + 'e'
        elif word[-4:] == 'alli':
            if len(word[r1_start:]) >= 4:
                word = word[:-2]
        elif word[-3:] == 'bli':
            if len(word[r1_start:]) >= 3:
                word = word[:-1] + 'e'
        elif word[-2:] == 'li':
            # -li deleted only after a valid -li preceding letter
            if ((r1_start >= 1 and len(word[r1_start:]) >= 2 and
                 word[-3] in _li)):
                word = word[:-2]
    elif word[-2] == 'o':
        if word[-7:] == 'ization':
            if len(word[r1_start:]) >= 7:
                word = word[:-5] + 'e'
        elif word[-5:] == 'ation':
            if len(word[r1_start:]) >= 5:
                word = word[:-3] + 'e'
        elif word[-4:] == 'ator':
            if len(word[r1_start:]) >= 4:
                word = word[:-2] + 'e'
    elif word[-2] == 's':
        if word[-7:] in {'fulness', 'ousness', 'iveness'}:
            if len(word[r1_start:]) >= 7:
                word = word[:-4]
        elif word[-5:] == 'alism':
            if len(word[r1_start:]) >= 5:
                word = word[:-3]
    elif word[-2] == 't':
        if word[-6:] == 'biliti':
            if len(word[r1_start:]) >= 6:
                word = word[:-5] + 'le'
        elif word[-5:] == 'aliti':
            if len(word[r1_start:]) >= 5:
                word = word[:-3]
        elif word[-5:] == 'iviti':
            if len(word[r1_start:]) >= 5:
                word = word[:-3] + 'e'

    # Step 3: further suffix stripping (-ative requires R2)
    if word[-7:] == 'ational':
        if len(word[r1_start:]) >= 7:
            word = word[:-5] + 'e'
    elif word[-6:] == 'tional':
        if len(word[r1_start:]) >= 6:
            word = word[:-2]
    elif word[-5:] in {'alize', 'icate', 'iciti'}:
        if len(word[r1_start:]) >= 5:
            word = word[:-3]
    elif word[-5:] == 'ative':
        if len(word[r2_start:]) >= 5:
            word = word[:-5]
    elif word[-4:] == 'ical':
        if len(word[r1_start:]) >= 4:
            word = word[:-2]
    elif word[-4:] == 'ness':
        if len(word[r1_start:]) >= 4:
            word = word[:-4]
    elif word[-3:] == 'ful':
        if len(word[r1_start:]) >= 3:
            word = word[:-3]

    # Step 4: delete remaining derivational suffixes found in R2;
    # suffix list is ordered longest-first so the first match wins
    for suffix in ('ement', 'ance', 'ence', 'able', 'ible', 'ment', 'ant',
                   'ent', 'ism', 'ate', 'iti', 'ous', 'ive', 'ize', 'al', 'er',
                   'ic'):
        if word[-len(suffix):] == suffix:
            if len(word[r2_start:]) >= len(suffix):
                word = word[:-len(suffix)]
            break
    else:
        # -ion is deleted only after s or t
        if word[-3:] == 'ion':
            if ((len(word[r2_start:]) >= 3 and len(word) >= 4 and
                 word[-4] in tuple('st'))):
                word = word[:-3]

    # Step 5: drop final -e or one -l of a double l
    if word[-1] == 'e':
        if (len(word[r2_start:]) >= 1 or
                (len(word[r1_start:]) >= 1 and
                 not _sb_ends_in_short_syllable(word[:-1], _vowels,
                                                _codanonvowels))):
            word = word[:-1]
    elif word[-1] == 'l':
        if len(word[r2_start:]) >= 1 and word[-2] == 'l':
            word = word[:-1]

    # Change 'Y' back to 'y' if it survived stemming
    for i in range(0, len(word)):
        if word[i] == 'Y':
            word = word[:i] + 'y' + word[i+1:]

    return word
1020
1021
1022
def sb_german(word, alternate_vowels=False):
    """Return Snowball German stem.

    The Snowball German stemmer is defined at:
    http://snowball.tartarus.org/algorithms/german/stemmer.html

    :param word: the word to calculate the stem of
    :param alternate_vowels: composes ae as ä, oe as ö, and ue as ü before
        running the algorithm
    :returns: word stem
    :rtype: str

    >>> sb_german('lesen')
    'les'
    >>> sb_german('graues')
    'grau'
    >>> sb_german('buchstabieren')
    'buchstabi'
    """
    # pylint: disable=too-many-branches

    _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'ä', 'ö', 'ü'}
    # letters after which a final -s may be removed
    _s_endings = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 'r', 't'}
    # letters after which a final -st may be removed
    _st_endings = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 't'}

    # lowercase, normalize, and compose
    word = unicodedata.normalize('NFC', word.lower())
    word = word.replace('ß', 'ss')

    # Mark u and y between vowels as consonants (uppercase placeholders)
    if len(word) > 2:
        for i in range(2, len(word)):
            if word[i] in _vowels and word[i-2] in _vowels:
                if word[i-1] == 'u':
                    word = word[:i-1] + 'U' + word[i:]
                elif word[i-1] == 'y':
                    word = word[:i-1] + 'Y' + word[i:]

    if alternate_vowels:
        # 'que' is shielded via a placeholder so its 'ue' is not composed
        word = word.replace('ae', 'ä')
        word = word.replace('oe', 'ö')
        word = word.replace('que', 'Q')
        word = word.replace('ue', 'ü')
        word = word.replace('Q', 'que')

    # R1 start is adjusted so that the region before it has at least
    # 3 characters, per the German spec
    r1_start = max(3, _sb_r1(word, _vowels))
    r2_start = _sb_r2(word, _vowels)

    # Step 1: strip inflectional suffixes; `len(word[r1_start:]) >= n`
    # tests that the n-letter suffix lies entirely within R1
    niss_flag = False
    if word[-3:] == 'ern':
        if len(word[r1_start:]) >= 3:
            word = word[:-3]
    elif word[-2:] == 'em':
        if len(word[r1_start:]) >= 2:
            word = word[:-2]
    elif word[-2:] == 'er':
        if len(word[r1_start:]) >= 2:
            word = word[:-2]
    elif word[-2:] == 'en':
        if len(word[r1_start:]) >= 2:
            word = word[:-2]
            niss_flag = True
    elif word[-2:] == 'es':
        if len(word[r1_start:]) >= 2:
            word = word[:-2]
            niss_flag = True
    elif word[-1:] == 'e':
        if len(word[r1_start:]) >= 1:
            word = word[:-1]
            niss_flag = True
    elif word[-1:] == 's':
        if ((len(word[r1_start:]) >= 1 and len(word) >= 2 and
             word[-2] in _s_endings)):
            word = word[:-1]

    # after an e/en/es deletion, a remaining -niss loses its final s
    if niss_flag and word[-4:] == 'niss':
        word = word[:-1]

    # Step 2: strip verbal/comparative suffixes (also within R1)
    if word[-3:] == 'est':
        if len(word[r1_start:]) >= 3:
            word = word[:-3]
    elif word[-2:] == 'en':
        if len(word[r1_start:]) >= 2:
            word = word[:-2]
    elif word[-2:] == 'er':
        if len(word[r1_start:]) >= 2:
            word = word[:-2]
    elif word[-2:] == 'st':
        if ((len(word[r1_start:]) >= 2 and len(word) >= 6 and
             word[-3] in _st_endings)):
            word = word[:-2]

    # Step 3: strip derivational suffixes (within R2), with follow-up
    # deletions of preceding particles
    if word[-4:] == 'isch':
        if len(word[r2_start:]) >= 4 and word[-5] != 'e':
            word = word[:-4]
    elif word[-4:] in {'lich', 'heit'}:
        if len(word[r2_start:]) >= 4:
            word = word[:-4]
            if ((word[-2:] in {'er', 'en'} and
                 len(word[r1_start:]) >= 2)):
                word = word[:-2]
    elif word[-4:] == 'keit':
        if len(word[r2_start:]) >= 4:
            word = word[:-4]
            if word[-4:] == 'lich' and len(word[r2_start:]) >= 4:
                word = word[:-4]
            elif word[-2:] == 'ig' and len(word[r2_start:]) >= 2:
                word = word[:-2]
    elif word[-3:] in {'end', 'ung'}:
        if len(word[r2_start:]) >= 3:
            word = word[:-3]
            if ((word[-2:] == 'ig' and len(word[r2_start:]) >= 2 and
                 word[-3] != 'e')):
                word = word[:-2]
    elif word[-2:] in {'ig', 'ik'}:
        if len(word[r2_start:]) >= 2 and word[-3] != 'e':
            word = word[:-2]

    # Change 'Y' and 'U' back to lowercase if survived stemming
    for i in range(0, len(word)):
        if word[i] == 'Y':
            word = word[:i] + 'y' + word[i+1:]
        elif word[i] == 'U':
            word = word[:i] + 'u' + word[i+1:]

    # Remove umlauts (the '_' below is the generator loop variable)
    _umlauts = dict(zip((ord(_) for _ in 'äöü'), 'aou'))
    word = word.translate(_umlauts)

    return word
1154
1155
1156
def sb_dutch(word):
    """Return Snowball Dutch stem.

    The Snowball Dutch stemmer is defined at:
    http://snowball.tartarus.org/algorithms/dutch/stemmer.html

    :param word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> sb_dutch('lezen')
    'lez'
    >>> sb_dutch('opschorting')
    'opschort'
    >>> sb_dutch('ongrijpbaarheid')
    'ongrijp'
    """
    # pylint: disable=too-many-branches

    _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'è'}
    # letters after which a final -s may NOT be removed
    _not_s_endings = {'a', 'e', 'i', 'j', 'o', 'u', 'y', 'è'}

    def _undouble(word):
        """Undouble endings -kk, -dd, and -tt."""
        if ((len(word) > 1 and word[-1] == word[-2] and
             word[-1] in {'d', 'k', 't'})):
            return word[:-1]
        return word

    # lowercase, normalize, decompose, filter umlauts & acutes out, and compose
    # (the '_' below is the generator loop variable)
    word = unicodedata.normalize('NFC', text_type(word.lower()))
    _accented = dict(zip((ord(_) for _ in 'äëïöüáéíóú'), 'aeiouaeiou'))
    word = word.translate(_accented)

    # Mark consonantal y and intervocalic i with uppercase placeholders
    for i in range(len(word)):
        if i == 0 and word[0] == 'y':
            word = 'Y' + word[1:]
        elif word[i] == 'y' and word[i-1] in _vowels:
            word = word[:i] + 'Y' + word[i+1:]
        elif (word[i] == 'i' and word[i-1] in _vowels and i+1 < len(word) and
              word[i+1] in _vowels):
            word = word[:i] + 'I' + word[i+1:]

    # R1 start is adjusted so the region before it has at least 3 chars
    r1_start = max(3, _sb_r1(word, _vowels))
    r2_start = _sb_r2(word, _vowels)

    # Step 1: suffixes must lie inside R1; -en(e) is not removed after a
    # vowel or after 'gem'
    if word[-5:] == 'heden':
        if len(word[r1_start:]) >= 5:
            word = word[:-3] + 'id'
    elif word[-3:] == 'ene':
        if ((len(word[r1_start:]) >= 3 and
             (word[-4] not in _vowels and word[-6:-3] != 'gem'))):
            word = _undouble(word[:-3])
    elif word[-2:] == 'en':
        if ((len(word[r1_start:]) >= 2 and
             (word[-3] not in _vowels and word[-5:-2] != 'gem'))):
            word = _undouble(word[:-2])
    elif word[-2:] == 'se':
        if len(word[r1_start:]) >= 2 and word[-3] not in _not_s_endings:
            word = word[:-2]
    elif word[-1:] == 's':
        if len(word[r1_start:]) >= 1 and word[-2] not in _not_s_endings:
            word = word[:-1]

    # Step 2: delete suffix -e after a non-vowel; remember for step 3b
    e_removed = False
    if word[-1:] == 'e':
        if len(word[r1_start:]) >= 1 and word[-2] not in _vowels:
            word = _undouble(word[:-1])
            e_removed = True

    # Step 3a: -heid (not after c), then possibly a following -en
    if word[-4:] == 'heid':
        if len(word[r2_start:]) >= 4 and word[-5] != 'c':
            word = word[:-4]
            if word[-2:] == 'en':
                if ((len(word[r1_start:]) >= 2 and
                     (word[-3] not in _vowels and word[-5:-2] != 'gem'))):
                    word = _undouble(word[:-2])

    # Step 3b: derivational suffixes (within R2)
    if word[-4:] == 'lijk':
        if len(word[r2_start:]) >= 4:
            word = word[:-4]
            # Repeat step 2
            if word[-1:] == 'e':
                if len(word[r1_start:]) >= 1 and word[-2] not in _vowels:
                    word = _undouble(word[:-1])
    elif word[-4:] == 'baar':
        if len(word[r2_start:]) >= 4:
            word = word[:-4]
    elif word[-3:] in ('end', 'ing'):
        if len(word[r2_start:]) >= 3:
            word = word[:-3]
            if ((word[-2:] == 'ig' and len(word[r2_start:]) >= 2 and
                 word[-3] != 'e')):
                word = word[:-2]
            else:
                word = _undouble(word)
    elif word[-3:] == 'bar':
        # -bar only removed if an -e was removed in step 2
        if len(word[r2_start:]) >= 3 and e_removed:
            word = word[:-3]
    elif word[-2:] == 'ig':
        if len(word[r2_start:]) >= 2 and word[-3] != 'e':
            word = word[:-2]

    # Step 4: undouble vowel in C-VV-C endings (but not before I)
    if ((len(word) >= 4 and
         word[-3] == word[-2] and word[-2] in {'a', 'e', 'o', 'u'} and
         word[-4] not in _vowels and
         word[-1] not in _vowels and word[-1] != 'I')):
        word = word[:-2] + word[-1]

    # Change 'Y' and 'I' back to lowercase if survived stemming
    for i in range(0, len(word)):
        if word[i] == 'Y':
            word = word[:i] + 'y' + word[i+1:]
        elif word[i] == 'I':
            word = word[:i] + 'i' + word[i+1:]

    return word
1278
1279
1280
def sb_norwegian(word):
    """Return Snowball Norwegian stem.

    The Snowball Norwegian stemmer is defined at:
    http://snowball.tartarus.org/algorithms/norwegian/stemmer.html

    :param word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> sb_norwegian('lese')
    'les'
    >>> sb_norwegian('suspensjon')
    'suspensjon'
    >>> sb_norwegian('sikkerhet')
    'sikker'
    """
    _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'å', 'æ', 'ø'}
    # letters after which a final -s may be removed
    _s_endings = {'b', 'c', 'd', 'f', 'g', 'h', 'j', 'l', 'm', 'n', 'o', 'p',
                  'r', 't', 'v', 'y', 'z'}
    # lowercase, normalize, and compose
    word = unicodedata.normalize('NFC', text_type(word.lower()))

    # R1 starts no earlier than position 3 and no later than the word's end
    r1_start = min(max(3, _sb_r1(word, _vowels)), len(word))

    # Step 1: suffix match is tested on the R1 slice but removed from the
    # full word; -erte/-ert map to -er rather than being deleted outright
    _r1 = word[r1_start:]
    if _r1[-7:] == 'hetenes':
        word = word[:-7]
    elif _r1[-6:] in {'hetene', 'hetens'}:
        word = word[:-6]
    elif _r1[-5:] in {'heten', 'heter', 'endes'}:
        word = word[:-5]
    elif _r1[-4:] in {'ande', 'ende', 'edes', 'enes', 'erte'}:
        if word[-4:] == 'erte':
            word = word[:-2]
        else:
            word = word[:-4]
    elif _r1[-3:] in {'ede', 'ane', 'ene', 'ens', 'ers', 'ets', 'het', 'ast',
                      'ert'}:
        if word[-3:] == 'ert':
            word = word[:-1]
        else:
            word = word[:-3]
    elif _r1[-2:] in {'en', 'ar', 'er', 'as', 'es', 'et'}:
        word = word[:-2]
    elif _r1[-1:] in {'a', 'e'}:
        word = word[:-1]
    elif _r1[-1:] == 's':
        # -s removed after an s-ending letter, or after k not preceded
        # by a vowel
        if (((len(word) > 1 and word[-2] in _s_endings) or
             (len(word) > 2 and word[-2] == 'k' and word[-3] not in _vowels))):
            word = word[:-1]

    # Step 2: -dt/-vt in R1 lose the final t
    if word[r1_start:][-2:] in {'dt', 'vt'}:
        word = word[:-1]

    # Step 3: derivational suffixes (re-slice R1 since word may have shrunk)
    _r1 = word[r1_start:]
    if _r1[-7:] == 'hetslov':
        word = word[:-7]
    elif _r1[-4:] in {'eleg', 'elig', 'elov', 'slov'}:
        word = word[:-4]
    elif _r1[-3:] in {'leg', 'eig', 'lig', 'els', 'lov'}:
        word = word[:-3]
    elif _r1[-2:] == 'ig':
        word = word[:-2]

    return word
1349
1350
1351
def sb_swedish(word):
    """Return Snowball Swedish stem.

    The Snowball Swedish stemmer is defined at:
    http://snowball.tartarus.org/algorithms/swedish/stemmer.html

    :param word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> sb_swedish('undervisa')
    'undervis'
    >>> sb_swedish('suspension')
    'suspension'
    >>> sb_swedish('visshet')
    'viss'
    """
    _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'ä', 'å', 'ö'}
    # letters after which a final -s may be removed
    _s_endings = {'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n',
                  'o', 'p', 'r', 't', 'v', 'y'}

    # lowercase, normalize, and compose
    word = unicodedata.normalize('NFC', text_type(word.lower()))

    # R1 starts no earlier than position 3 and no later than the word's end
    r1_start = min(max(3, _sb_r1(word, _vowels)), len(word))

    # Step 1: suffix match is tested on the R1 slice but removed from the
    # full word; longest suffixes are tried first
    _r1 = word[r1_start:]
    if _r1[-7:] == 'heterna':
        word = word[:-7]
    elif _r1[-6:] == 'hetens':
        word = word[:-6]
    elif _r1[-5:] in {'anden', 'heten', 'heter', 'arnas', 'ernas', 'ornas',
                      'andes', 'arens', 'andet'}:
        word = word[:-5]
    elif _r1[-4:] in {'arna', 'erna', 'orna', 'ande', 'arne', 'aste', 'aren',
                      'ades', 'erns'}:
        word = word[:-4]
    elif _r1[-3:] in {'ade', 'are', 'ern', 'ens', 'het', 'ast'}:
        word = word[:-3]
    elif _r1[-2:] in {'ad', 'en', 'ar', 'er', 'or', 'as', 'es', 'at'}:
        word = word[:-2]
    elif _r1[-1:] in {'a', 'e'}:
        word = word[:-1]
    elif _r1[-1:] == 's':
        if len(word) > 1 and word[-2] in _s_endings:
            word = word[:-1]

    # Step 2: these consonant pairs in R1 lose their final letter
    if word[r1_start:][-2:] in {'dd', 'gd', 'nn', 'dt', 'gt', 'kt', 'tt'}:
        word = word[:-1]

    # Step 3: derivational suffixes (re-slice R1 since word may have shrunk);
    # -fullt/-löst only lose the trailing t
    _r1 = word[r1_start:]
    if _r1[-5:] == 'fullt':
        word = word[:-1]
    elif _r1[-4:] == 'löst':
        word = word[:-1]
    elif _r1[-3:] in {'lig', 'els'}:
        word = word[:-3]
    elif _r1[-2:] == 'ig':
        word = word[:-2]

    return word
1415
1416
1417
def sb_danish(word):
    """Return Snowball Danish stem.

    The Snowball Danish stemmer is defined at:
    http://snowball.tartarus.org/algorithms/danish/stemmer.html

    :param word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> sb_danish('underviser')
    'undervis'
    >>> sb_danish('suspension')
    'suspension'
    >>> sb_danish('sikkerhed')
    'sikker'
    """
    _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'å', 'æ', 'ø'}
    # letters after which a final -s may be removed
    _s_endings = {'a', 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n',
                  'o', 'p', 'r', 't', 'v', 'y', 'z', 'å'}

    # lowercase, normalize, and compose
    word = unicodedata.normalize('NFC', text_type(word.lower()))

    # R1 starts no earlier than position 3 and no later than the word's end
    r1_start = min(max(3, _sb_r1(word, _vowels)), len(word))

    # Step 1: suffix match is tested on the R1 slice but removed from the
    # full word; longest suffixes are tried first
    _r1 = word[r1_start:]
    if _r1[-7:] == 'erendes':
        word = word[:-7]
    elif _r1[-6:] in {'erende', 'hedens'}:
        word = word[:-6]
    elif _r1[-5:] in {'ethed', 'erede', 'heden', 'heder', 'endes', 'ernes',
                      'erens', 'erets'}:
        word = word[:-5]
    elif _r1[-4:] in {'ered', 'ende', 'erne', 'eren', 'erer', 'heds', 'enes',
                      'eres', 'eret'}:
        word = word[:-4]
    elif _r1[-3:] in {'hed', 'ene', 'ere', 'ens', 'ers', 'ets'}:
        word = word[:-3]
    elif _r1[-2:] in {'en', 'er', 'es', 'et'}:
        word = word[:-2]
    elif _r1[-1:] == 'e':
        word = word[:-1]
    elif _r1[-1:] == 's':
        if len(word) > 1 and word[-2] in _s_endings:
            word = word[:-1]

    # Step 2: these consonant pairs in R1 lose their final letter
    if word[r1_start:][-2:] in {'gd', 'dt', 'gt', 'kt'}:
        word = word[:-1]

    # Step 3: -igst loses -st unconditionally, then derivational suffixes
    # (removal of -elig/-lig/-els/-ig triggers a repeat of step 2)
    if word[-4:] == 'igst':
        word = word[:-2]

    _r1 = word[r1_start:]
    repeat_step2 = False
    if _r1[-4:] == 'elig':
        word = word[:-4]
        repeat_step2 = True
    elif _r1[-4:] == 'løst':
        # -løst only loses the trailing t
        word = word[:-1]
    elif _r1[-3:] in {'lig', 'els'}:
        word = word[:-3]
        repeat_step2 = True
    elif _r1[-2:] == 'ig':
        word = word[:-2]
        repeat_step2 = True

    if repeat_step2:
        if word[r1_start:][-2:] in {'gd', 'dt', 'gt', 'kt'}:
            word = word[:-1]

    # Step 4: undouble a final double consonant reaching into R1
    if ((len(word[r1_start:]) >= 1 and len(word) >= 2 and
         word[-1] == word[-2] and word[-1] not in _vowels)):
        word = word[:-1]

    return word
1497
1498
1499
def clef_german(word):
    """Return CLEF German stem.

    The CLEF German stemmer is defined at:
    http://members.unine.ch/jacques.savoy/clef/germanStemmer.txt

    :param word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> clef_german('lesen')
    'lese'
    >>> clef_german('graues')
    'grau'
    >>> clef_german('buchstabieren')
    'buchstabier'
    """
    # lowercase, normalize, and compose
    word = unicodedata.normalize('NFC', text_type(word.lower()))

    # fold umlauts onto their base vowels
    word = word.translate({ord('ä'): 'a', ord('ö'): 'o', ord('ü'): 'u'})

    # strip plural endings, gated by the (length - 1) threshold the CLEF
    # stemmer uses; longest candidate suffix is tried first
    last = len(word) - 1
    if last > 3:
        if last > 5 and word.endswith('nen'):
            return word[:-3]
        if last > 4 and word[-2:] in {'en', 'se', 'es', 'er'}:
            return word[:-2]
        if word[-1] in {'e', 'n', 'r', 's'}:
            return word[:-1]
    return word
1536
1537
1538
def clef_german_plus(word):
    """Return 'CLEF German stemmer plus' stem.

    The CLEF German stemmer plus is defined at:
    http://members.unine.ch/jacques.savoy/clef/germanStemmerPlus.txt

    :param word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> clef_german_plus('lesen')
    'les'
    >>> clef_german_plus('graues')
    'grau'
    >>> clef_german_plus('buchstabieren')
    'buchstabi'
    """
    # letters that may precede a removable -s/-st
    _st_ending = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 't'}

    # lowercase, normalize, and compose
    word = unicodedata.normalize('NFC', text_type(word.lower()))

    # fold umlauts and accented vowels onto their base vowels
    _accents = dict(zip((ord(char) for char in 'äàáâöòóôïìíîüùúû'),
                        'aaaaooooiiiiuuuu'))
    word = word.translate(_accents)

    # Step 1: strip one inflectional suffix, gated by (length - 1)
    last = len(word) - 1
    if last > 4 and word.endswith('ern'):
        word = word[:-3]
    elif last > 3 and word[-2:] in {'em', 'en', 'er', 'es'}:
        word = word[:-2]
    elif last > 2 and (word.endswith('e') or
                       (word.endswith('s') and word[-2] in _st_ending)):
        word = word[:-1]

    # Step 2: strip a second suffix from the shortened word
    last = len(word) - 1
    if last > 4 and word.endswith('est'):
        word = word[:-3]
    elif last > 3 and (word[-2:] in {'er', 'en'} or
                       (word.endswith('st') and word[-3] in _st_ending)):
        word = word[:-2]

    return word
1584
1585
1586
def clef_swedish(word):
    """Return CLEF Swedish stem.

    The CLEF Swedish stemmer is defined at:
    http://members.unine.ch/jacques.savoy/clef/swedishStemmer.txt

    :param word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> clef_swedish('undervisa')
    'undervis'
    >>> clef_swedish('suspension')
    'suspensio'
    >>> clef_swedish('visshet')
    'viss'
    """
    last = len(word) - 1

    # drop a trailing -s first, if the word is long enough
    if last > 3 and word.endswith('s'):
        word = word[:-1]
        last -= 1

    # try suffix groups longest-first; each group has a minimum
    # (length - 1) threshold and suffixes of a single uniform size
    suffix_groups = (
        (6, 5, {'elser', 'heten'}),
        (5, 4, {'arne', 'erna', 'ande', 'else', 'aste', 'orna', 'aren'}),
        (4, 3, {'are', 'ast', 'het'}),
        (3, 2, {'ar', 'er', 'or', 'en', 'at', 'te', 'et'}),
        (2, 1, {'a', 'e', 'n', 't'}),
    )
    for threshold, size, endings in suffix_groups:
        if last > threshold and word[-size:] in endings:
            return word[:-size]
    return word
1626
1627
1628
def caumanns(word):
    """Return Caumanns German stem.

    Jörg Caumanns' stemmer is described in his article at:
    http://edocs.fu-berlin.de/docs/servlets/MCRFileNodeServlet/FUDOCS_derivate_000000000350/tr-b-99-16.pdf

    This implementation is based on the GermanStemFilter described at:
    http://www.evelix.ch/unternehmen/Blog/evelix/2013/11/11/inner-workings-of-the-german-analyzer-in-lucene

    :param word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> caumanns('lesen')
    'les'
    >>> caumanns('graues')
    'grau'
    >>> caumanns('buchstabieren')
    'buchstabier'
    """
    if not word:
        return ''

    upper_initial = word[0].isupper()
    word = unicodedata.normalize('NFC', text_type(word.lower()))

    # # Part 2: Substitution
    # 1. Map umlauts to their base vowels & ß to ss
    word = word.translate({ord(umlaut): vowel
                           for umlaut, vowel in zip('äöü', 'aou')})
    word = word.replace('ß', 'ss')

    # 2. Mark the second of each doubled character with '*'
    marked = [word[0]]
    for char in word[1:]:
        marked.append('*' if marked[-1] == char else char)
    word = ''.join(marked)

    # 3. Encode digraphs/trigraphs as single placeholders so the suffix
    # stripping below treats them as atomic units (sch, ch, ei, ie, ig, st)
    substitutions = (('sch', '$'), ('ch', '§'), ('ei', '%'), ('ie', '&'),
                     ('ig', '#'), ('st', '!'))
    for plain, token in substitutions:
        word = word.replace(plain, token)

    # # Part 1: Recursive Context-Free Stripping
    # 1. Strip the following 7 suffixes as long as any applies
    while len(word) > 3:
        tail = word[-2:]
        if (tail in {'em', 'er'} and len(word) > 4) or \
                (tail == 'nd' and len(word) > 5):
            word = word[:-2]
        elif word[-1] in {'e', 's', 'n'} or \
                (word[-1] in {'t', '!'} and not upper_initial):
            word = word[:-1]
        else:
            break

    # Additional optimizations:
    if len(word) > 5 and word[-5:] == 'erin*':
        word = word[:-1]
    if word[-1] == 'z':
        word = word[:-1] + 'x'

    # Undo the placeholder encoding:
    for plain, token in substitutions:
        word = word.replace(token, plain)

    # Re-expand the '*' doubling markers
    expanded = [word[0]]
    for i in range(1, len(word)):
        expanded.append(word[i-1] if word[i] == '*' else word[i])
    word = ''.join(expanded)

    # Finally, collapse gege to ge
    if len(word) > 4:
        word = word.replace('gege', 'ge', 1)

    return word


def uealite(word, max_word_length=20, max_acro_length=8, return_rule_no=False,
            var=None):
    """Return UEA-Lite stem.

    The UEA-Lite stemmer is discussed in:
    Jenkins, Marie-Claire and Dan Smith. 2005. "Conservative stemming for
    search and indexing."
    http://lemur.cmp.uea.ac.uk/Research/stemmer/stemmer25feb.pdf

    This is chiefly based on the Java implementation of the algorithm, with
    variants based on the Perl implementation and Jason Adams' Ruby port.

    Java version: http://lemur.cmp.uea.ac.uk/Research/stemmer/UEAstem.java
    Perl version: http://lemur.cmp.uea.ac.uk/Research/stemmer/UEAstem.pl
    Ruby version: https://github.com/ealdent/uea-stemmer

    :param word: the word to calculate the stem of
    :param max_word_length: the maximum word length allowed (longer words are
        returned unstemmed, as rule 95)
    :param max_acro_length: the maximum acronym length allowed (only checked
        by the 'Adams' variant)
    :param return_rule_no: if True, returns the stem along with rule number
    :param var: variant to use (set to 'Adams' to use Jason Adams' rules,
                or 'Perl' to use the original Perl set of rules)
    :returns: word stem
    :rtype: str or tuple(str, int) -- note that some rule numbers are
        non-integer (e.g. 90.3, 42.1)
    """
    # Words that the suffix rules would mangle; returned unstemmed (rule 90).
    problem_words = {'is', 'as', 'this', 'has', 'was', 'during'}

    # rule table format:
    # top-level dictionary: length-of-suffix: dict-of-rules
    # dict-of-rules: suffix: (rule_no, suffix_length_to_delete,
    #                         suffix_to_append)
    rule_table = {7: {'titudes': (30, 1, None),
                      'fulness': (34, 4, None),
                      'ousness': (35, 4, None),
                      'eadings': (40.7, 4, None),
                      'oadings': (40.6, 4, None),
                      'ealings': (42.4, 4, None),
                      'ailings': (42.2, 4, None),
                      },
                  6: {'aceous': (1, 6, None),
                      'aining': (24, 3, None),
                      'acting': (25, 3, None),
                      'ttings': (26, 5, None),
                      'viding': (27, 3, 'e'),
                      'ssings': (37, 4, None),
                      'ulting': (38, 3, None),
                      'eading': (40.7, 3, None),
                      'oading': (40.6, 3, None),
                      'edings': (40.5, 4, None),
                      'ddings': (40.4, 5, None),
                      'ldings': (40.3, 4, None),
                      'rdings': (40.2, 4, None),
                      'ndings': (40.1, 4, None),
                      'llings': (41, 5, None),
                      'ealing': (42.4, 3, None),
                      'olings': (42.3, 4, None),
                      'ailing': (42.2, 3, None),
                      'elings': (42.1, 4, None),
                      'mmings': (44.3, 5, None),
                      'ngings': (45.2, 4, None),
                      'ggings': (45.1, 5, None),
                      'stings': (47, 4, None),
                      'etings': (48.4, 4, None),
                      'ntings': (48.2, 4, None),
                      'irings': (54.4, 4, 'e'),
                      'urings': (54.3, 4, 'e'),
                      'ncings': (54.2, 4, 'e'),
                      'things': (58.1, 1, None),
                      },
                  5: {'iases': (11.4, 2, None),
                      'ained': (13.6, 2, None),
                      'erned': (13.5, 2, None),
                      'ifted': (14, 2, None),
                      'ected': (15, 2, None),
                      'vided': (16, 1, None),
                      'erred': (19, 3, None),
                      'urred': (20.5, 3, None),
                      'lored': (20.4, 2, None),
                      'eared': (20.3, 2, None),
                      'tored': (20.2, 1, None),
                      'noted': (22.4, 1, None),
                      'leted': (22.3, 1, None),
                      'anges': (23, 1, None),
                      'tting': (26, 4, None),
                      'ulted': (32, 2, None),
                      'uming': (33, 3, 'e'),
                      'rabed': (36.1, 1, None),
                      'rebed': (36.1, 1, None),
                      'ribed': (36.1, 1, None),
                      'robed': (36.1, 1, None),
                      'rubed': (36.1, 1, None),
                      'ssing': (37, 3, None),
                      'vings': (39, 4, 'e'),
                      'eding': (40.5, 3, None),
                      'dding': (40.4, 4, None),
                      'lding': (40.3, 3, None),
                      'rding': (40.2, 3, None),
                      'nding': (40.1, 3, None),
                      'dings': (40, 4, 'e'),
                      'lling': (41, 4, None),
                      'oling': (42.3, 3, None),
                      'eling': (42.1, 3, None),
                      'lings': (42, 4, 'e'),
                      'mming': (44.3, 4, None),
                      'rming': (44.2, 3, None),
                      'lming': (44.1, 3, None),
                      'mings': (44, 4, 'e'),
                      'nging': (45.2, 3, None),
                      'gging': (45.1, 4, None),
                      'gings': (45, 4, 'e'),
                      'aning': (46.6, 3, None),
                      'ening': (46.5, 3, None),
                      'gning': (46.4, 3, None),
                      'nning': (46.3, 4, None),
                      'oning': (46.2, 3, None),
                      'rning': (46.1, 3, None),
                      'sting': (47, 3, None),
                      'eting': (48.4, 3, None),
                      'pting': (48.3, 3, None),
                      'nting': (48.2, 3, None),
                      'cting': (48.1, 3, None),
                      'tings': (48, 4, 'e'),
                      'iring': (54.4, 3, 'e'),
                      'uring': (54.3, 3, 'e'),
                      'ncing': (54.2, 3, 'e'),
                      'sings': (54, 4, 'e'),
                      # 'lling': (55, 3, None),  # masked by 41
                      'ating': (57, 3, 'e'),
                      'thing': (58.1, 0, None),
                      },
                  4: {'eeds': (7, 1, None),
                      'uses': (11.3, 1, None),
                      'sses': (11.2, 2, None),
                      'eses': (11.1, 2, 'is'),
                      'tled': (12.5, 1, None),
                      'pled': (12.4, 1, None),
                      'bled': (12.3, 1, None),
                      'eled': (12.2, 2, None),
                      'lled': (12.1, 2, None),
                      'ened': (13.7, 2, None),
                      'rned': (13.4, 2, None),
                      'nned': (13.3, 3, None),
                      'oned': (13.2, 2, None),
                      'gned': (13.1, 2, None),
                      'ered': (20.1, 2, None),
                      'reds': (20, 2, None),
                      'tted': (21, 3, None),
                      'uted': (22.2, 1, None),
                      'ated': (22.1, 1, None),
                      'ssed': (28, 2, None),
                      'umed': (31, 1, None),
                      'beds': (36, 3, None),
                      'ving': (39, 3, 'e'),
                      'ding': (40, 3, 'e'),
                      'ling': (42, 3, 'e'),
                      'nged': (43.2, 1, None),
                      'gged': (43.1, 3, None),
                      'ming': (44, 3, 'e'),
                      'ging': (45, 3, 'e'),
                      'ning': (46, 3, 'e'),
                      'ting': (48, 3, 'e'),
                      # 'ssed': (49, 2, None),  # masked by 28
                      # 'lled': (53, 2, None),  # masked by 12.1
                      'zing': (54.1, 3, 'e'),
                      'sing': (54, 3, 'e'),
                      'lves': (60.1, 3, 'f'),
                      'aped': (61.3, 1, None),
                      'uded': (61.2, 1, None),
                      'oded': (61.1, 1, None),
                      # 'ated': (61, 1, None),  # masked by 22.1
                      'ones': (63.6, 1, None),
                      'izes': (63.5, 1, None),
                      'ures': (63.4, 1, None),
                      'ines': (63.3, 1, None),
                      'ides': (63.2, 1, None),
                      },
                  3: {'ces': (2, 1, None),
                      'sis': (4, 0, None),
                      'tis': (5, 0, None),
                      'eed': (7, 0, None),
                      'ued': (8, 1, None),
                      'ues': (9, 1, None),
                      'ees': (10, 1, None),
                      'ses': (11, 1, None),
                      'led': (12, 2, None),
                      'ned': (13, 1, None),
                      'ved': (17, 1, None),
                      'ced': (18, 1, None),
                      'red': (20, 1, None),
                      'ted': (22, 2, None),
                      'sed': (29, 1, None),
                      'bed': (36, 2, None),
                      'ged': (43, 1, None),
                      'les': (50, 1, None),
                      'tes': (51, 1, None),
                      'zed': (52, 1, None),
                      'ied': (56, 3, 'y'),
                      'ies': (59, 3, 'y'),
                      'ves': (60, 1, None),
                      'pes': (63.8, 1, None),
                      'mes': (63.7, 1, None),
                      'ges': (63.1, 1, None),
                      'ous': (65, 0, None),
                      'ums': (66, 0, None),
                      },
                  2: {'cs': (3, 0, None),
                      'ss': (6, 0, None),
                      'es': (63, 2, None),
                      'is': (64, 2, 'e'),
                      'us': (67, 0, None),
                      }}

    if var == 'Perl':
        # The original Perl implementation lacks a number of the rules in
        # the Java table above; remove them to match its behavior.
        perl_deletions = {7: ['eadings', 'oadings', 'ealings', 'ailings'],
                          6: ['ttings', 'ssings', 'edings', 'ddings',
                              'ldings', 'rdings', 'ndings', 'llings',
                              'olings', 'elings', 'mmings', 'ngings',
                              'ggings', 'stings', 'etings', 'ntings',
                              'irings', 'urings', 'ncings', 'things'],
                          5: ['vings', 'dings', 'lings', 'mings', 'gings',
                              'tings', 'sings'],
                          4: ['eeds', 'reds', 'beds']}

        # Delete the above rules from rule_table
        for del_len in perl_deletions:
            for term in perl_deletions[del_len]:
                del rule_table[del_len][term]

    elif var == 'Adams':
        # Jason Adams' Ruby port adds these rules on top of the Java table.
        adams_additions = {6: {'chited': (22.8, 1, None)},
                           5: {'dying': (58.2, 4, 'ie'),
                               'tying': (58.2, 4, 'ie'),
                               'vited': (22.6, 1, None),
                               'mited': (22.5, 1, None),
                               'vided': (22.9, 1, None),
                               'mided': (22.10, 1, None),
                               'lying': (58.2, 4, 'ie'),
                               'arred': (19.1, 3, None),
                               },
                           4: {'ited': (22.7, 2, None),
                               'oked': (31.1, 1, None),
                               'aked': (31.1, 1, None),
                               'iked': (31.1, 1, None),
                               'uked': (31.1, 1, None),
                               'amed': (31, 1, None),
                               'imed': (31, 1, None),
                               'does': (31.2, 2, None),
                               },
                           3: {'oed': (31.3, 1, None),
                               'oes': (31.2, 1, None),
                               'kes': (63.1, 1, None),
                               'des': (63.10, 1, None),
                               'res': (63.9, 1, None),
                               }}

        # Add the above additional rules to rule_table
        for del_len in adams_additions:
            rule_table[del_len] = dict(rule_table[del_len],
                                       **adams_additions[del_len])
        # Add additional problem word
        problem_words.add('menses')

    def _stem_with_duplicate_character_check(word, del_length):
        """Strip del_length chars (one extra for a plural -s), undoubling.

        If the remaining stem ends in a doubled word character (e.g. 'tt'),
        one of the pair is dropped.
        """
        if word[-1] == 's':
            del_length += 1
        stemmed_word = word[:-del_length]
        if re.match(r'.*(\w)\1$', stemmed_word):
            stemmed_word = stemmed_word[:-1]
        return stemmed_word

    def _stem(word):
        """Return a (stem, rule_no) tuple for word.

        Rule numbers >= 90 mark special handling (problem words, numbers,
        hyphenated/underscored terms, acronyms, capitalized words,
        contractions) rather than an entry from rule_table; 0 means no rule
        fired.
        """
        stemmed_word = word
        rule_no = 0

        if not word:
            return word, 0
        if word in problem_words:
            return word, 90
        if max_word_length and len(word) > max_word_length:
            # Overlong words are returned unstemmed (rule 95).
            return word, 95

        if "'" in word:
            # Possessives and common contractions (rule 94).
            if word[-2:] in {"'s", "'S"}:
                stemmed_word = word[:-2]
            if word[-1:] == "'":
                stemmed_word = word[:-1]
            stemmed_word = stemmed_word.replace("n't", 'not')
            stemmed_word = stemmed_word.replace("'ve", 'have')
            stemmed_word = stemmed_word.replace("'re", 'are')
            stemmed_word = stemmed_word.replace("'m", 'am')
            return stemmed_word, 94

        if word.isdigit():
            # Pure numbers are left alone (rule 90.3).
            return word, 90.3
        else:
            hyphen = word.find('-')
            # NOTE(review): for a found hyphen, hyphen < len(word) always
            # holds, so the second comparison is redundant but harmless.
            if hyphen > 0 and hyphen < len(word):
                if word[:hyphen].isalpha() and word[hyphen+1:].isalpha():
                    return word, 90.2
                else:
                    return word, 90.1
            elif '_' in word:
                return word, 90
            elif word[-1] == 's' and word[:-1].isupper():
                # Plural acronym, e.g. 'DVDs' -> 'DVD' (rule 91.1); the
                # Adams variant leaves overlong acronyms alone (rule 96).
                if var == 'Adams' and len(word)-1 > max_acro_length:
                    return word, 96
                return word[:-1], 91.1
            elif word.isupper():
                # Acronym (rule 91); same Adams length check as above.
                if var == 'Adams' and len(word) > max_acro_length:
                    return word, 96
                return word, 91
            elif re.match(r'^.*[A-Z].*[A-Z].*$', word):
                # Two or more internal capitals, e.g. camel case (rule 92).
                return word, 92
            elif word[0].isupper():
                # Single leading capital: treat as proper noun (rule 93).
                return word, 93
            elif var == 'Adams' and re.match(r'^[a-z]{1}(|[rl])(ing|ed)$',
                                             word):
                # Adams: very short -ing/-ed forms are not stemmed (rule 97)
                return word, 97

        # Apply the suffix rules, preferring the longest matching suffix.
        for n in range(7, 1, -1):
            if word[-n:] in rule_table[n]:
                rule_no, del_length, add_str = rule_table[n][word[-n:]]
                if del_length:
                    stemmed_word = word[:-del_length]
                if add_str:
                    stemmed_word += add_str
                break

        if not rule_no:
            # Fall back to the generic -ing/-ed/-s default rules.
            if re.match(r'.*\w\wings?$', word):  # rule 58
                stemmed_word = _stem_with_duplicate_character_check(word, 3)
                rule_no = 58
            elif re.match(r'.*\w\weds?$', word):  # rule 62
                stemmed_word = _stem_with_duplicate_character_check(word, 2)
                rule_no = 62
            elif word[-1] == 's':  # rule 68
                stemmed_word = word[:-1]
                rule_no = 68

        return stemmed_word, rule_no

    stem, rule_no = _stem(word)
    if return_rule_no:
        return stem, rule_no
    return stem


def lancaster(word):
2061
    """Return Lancaster stem.
2062
2063
    Implementation of the Lancaster Stemming Algorithm, developed by
2064
    Chris Paice, with the assistance of Gareth Husk
2065
2066
    Arguments:
2067
    word -- the word to calculate the stem of
2068
2069
    Description:
2070
    The Lancaster Stemming Algorithm, described at:
2071
    http://wayback.archive.org/web/20140826000545/http://www.comp.lancs.ac.uk/computing/research/stemming/Links/paice.htm
2072
2073
    Based on the Paice & Husk's original Pascal reference implementation:
2074
    http://wayback.archive.org/web/20150104225538/http://www.comp.lancs.ac.uk/computing/research/stemming/Files/Pascal.zip
2075
    """
2076
    _lancaster_rules = ('ai*2.', 'a*1.', 'bb1.', 'city3s.', 'ci2>', 'cn1t>',
2077
                        'dd1.', 'dei3y>', 'deec2ss.', 'dee1.', 'de2>',
2078
                        'dooh4>', 'e1>', 'feil1v.', 'fi2>', 'gni3>', 'gai3y.',
2079
                        'ga2>', 'gg1.', 'ht*2.', 'hsiug5ct.', 'hsi3>', 'i*1.',
2080
                        'i1y>', 'ji1d.', 'juf1s.', 'ju1d.', 'jo1d.', 'jeh1r.',
2081
                        'jrev1t.', 'jsim2t.', 'jn1d.', 'j1s.', 'lbaifi6.',
2082
                        'lbai4y.', 'lba3>', 'lbi3.', 'lib2l>', 'lc1.',
2083
                        'lufi4y.', 'luf3>', 'lu2.', 'lai3>', 'lau3>', 'la2>',
2084
                        'll1.', 'mui3.', 'mu*2.', 'msi3>', 'mm1.', 'nois4j>',
2085
                        'noix4ct.', 'noi3>', 'nai3>', 'na2>', 'nee0.', 'ne2>',
2086
                        'nn1.', 'pihs4>', 'pp1.', 're2>', 'rae0.', 'ra2.',
2087
                        'ro2>', 'ru2>', 'rr1.', 'rt1>', 'rei3y>', 'sei3y>',
2088
                        'sis2.', 'si2>', 'ssen4>', 'ss0.', 'suo3>', 'su*2.',
2089
                        's*1>', 's0.', 'tacilp4y.', 'ta2>', 'tnem4>', 'tne3>',
2090
                        'tna3>', 'tpir2b.', 'tpro2b.', 'tcud1.', 'tpmus2.',
2091
                        'tpec2iv.', 'tulo2v.', 'tsis0.', 'tsi3>', 'tt1.',
2092
                        'uqi3.', 'ugo1.', 'vis3j>', 'vie0.', 'vi2>', 'ylb1>',
2093
                        'yli3y>', 'ylp0.', 'yl2>', 'ygo1.', 'yhp1.', 'ymo1.',
2094
                        'ypo1.', 'yti3>', 'yte3>', 'ytl2.', 'yrtsi5.',
2095
                        'yra3>', 'yro3>', 'yfi3.', 'ycn2t>', 'yca3>', 'zi2>',
2096
                        'zy1s.')
2097
2098
    _rule_table = []
2099
    _rule_index = {'a': -1, 'b': -1, 'c': -1, 'd': -1, 'e': -1, 'f': -1,
2100
                   'g': -1, 'h': -1, 'i': -1, 'j': -1, 'k': -1, 'l': -1,
2101
                   'm': -1, 'n': -1, 'o': -1, 'p': -1, 'q': -1, 'r': -1,
2102
                   's': -1, 't': -1, 'u': -1, 'v': -1, 'w': -1, 'x': -1,
2103
                   'y': -1, 'z': -1}
2104
2105
    def read_rules(stem_rules=_lancaster_rules):
2106
        """Read the rules table.
2107
2108
        read_rules reads in stemming rules from a text file and enter them
2109
        into _rule_table. _rule_index is set up to provide faster access to
2110
        relevant rules.
2111
        """
2112
        for rule in stem_rules:
2113
            _rule_table.append(rule)
2114
            if _rule_index[rule[0]] == -1:
0 ignored issues
show
Comprehensibility Best Practice introduced by
The variable _rule_index does not seem to be defined.
Loading history...
2115
                _rule_index[rule[0]] = len(_rule_table)-1
2116
2117
    def stemmers(word):
2118
        """Reduce a word.
2119
2120
        stemmers takes the specified word and reduces it to a set by
2121
        referring to _rule_table
2122
        """
2123
        # TODO: This looks very incomplete.
2124
        return word
2125