Completed
Pull Request — master (#120)
by Chris
12:34
created

abydos.stemmer.snowball.porter2()   F

Complexity

Conditions 127

Size

Total Lines 274
Code Lines 211

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 127
eloc 211
nop 2
dl 0
loc 274
rs 0
c 0
b 0
f 0

How to fix   Long Method    Complexity   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method.

Complexity

Complex units like the function abydos.stemmer.snowball.porter2() often do a lot of different things. To break such a unit down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for fields/methods (or, within a function, variables and steps) that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# -*- coding: utf-8 -*-
0 ignored issues
show
coding-style introduced by
Too many lines in module (1158/1000)
Loading history...
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19
"""abydos.stemmer.snowball.
20
21
The stemmer.snowball module defines the stemmers:
22
23
    - Porter
24
    - Porter2 (Snowball English)
25
    - Snowball German
26
    - Snowball Dutch
27
    - Snowball Norwegian
28
    - Snowball Swedish
29
    - Snowball Danish
30
"""
31
32
from __future__ import unicode_literals
33
34
from unicodedata import normalize
35
36
from six import text_type
37
from six.moves import range
38
39
__all__ = ['porter', 'porter2', 'sb_danish', 'sb_dutch', 'sb_german',
40
           'sb_norwegian', 'sb_swedish']
41
42
43
def _m_degree(term, vowels):
    """Return the Porter m-degree of a term.

    The m-degree is the number of vowel-to-consonant transitions found
    while scanning the term from left to right.

    :param str term: the word for which to calculate the m-degree
    :param set vowels: the set of vowels in the language
    :returns: the m-degree as defined in the Porter stemmer definition
    :rtype: int
    """
    transitions = 0
    prev_is_vowel = False
    for char in term:
        is_vowel = char in vowels
        # Count only the moment a vowel run is closed by a consonant
        if prev_is_vowel and not is_vowel:
            transitions += 1
        prev_is_vowel = is_vowel
    return transitions
63
64
65
def _sb_has_vowel(term, vowels):
    """Report whether a term contains at least one vowel.

    :param str term: the word to scan for vowels
    :param set vowels: the set of vowels in the language
    :returns: true iff a vowel exists in the term (as defined in the Porter
        stemmer definition)
    :rtype: bool
    """
    return any(char in vowels for char in term)
78
79
80
def _ends_in_doubled_cons(term, vowels):
    """Report whether a term ends in a doubled consonant.

    :param str term: the word to check for a final doubled consonant
    :param set vowels: the set of vowels in the language
    :returns: true iff the stem ends in a doubled consonant (as defined in the
        Porter stemmer definition)
    :rtype: bool
    """
    if len(term) < 2:
        return False
    final = term[-1]
    return final not in vowels and term[-2] == final
90
91
92
def _ends_in_cvc(term, vowels):
    """Report whether a term ends in consonant-vowel-consonant.

    The final consonant must additionally not be one of w, x, or Y.

    :param str term: the word to scan for cvc
    :param set vowels: the set of vowels in the language
    :returns: true iff the stem ends in cvc (as defined in the Porter stemmer
        definition)
    :rtype: bool
    """
    if len(term) < 3:
        return False
    first, middle, last = term[-3], term[-2], term[-1]
    return (first not in vowels and
            middle in vowels and
            last not in vowels and
            last not in {'w', 'x', 'Y'})
105
106
107
def porter(word, early_english=False):
    """Return Porter stem.

    The Porter stemmer is described in :cite:`Porter:1980`.

    Steps 2, 3 and 4 are driven by ordered (suffix, replacement) tables: the
    first suffix in a table that matches the word decides the outcome, and
    the rewrite is applied only if the remaining stem has a large enough
    m-degree. Where one suffix is the tail of another (e.g. 'ational' and
    'tional'), the longer one is listed first.

    :param str word: the word to calculate the stem of
    :param bool early_english: set to True in order to remove -eth & -est
        (2nd & 3rd person singular verbal agreement suffixes)
    :returns: word stem
    :rtype: str

    >>> porter('reading')
    'read'
    >>> porter('suspension')
    'suspens'
    >>> porter('elusiveness')
    'elus'

    >>> porter('eateth', early_english=True)
    'eat'
    """
    vowels = {'a', 'e', 'i', 'o', 'u', 'y'}

    # Step 2 rewrites, applied when the stem's m-degree is > 0
    step2_rules = (
        ('ational', 'ate'), ('tional', 'tion'),
        ('enci', 'ence'), ('anci', 'ance'),
        ('izer', 'ize'),
        ('logi', 'log'),
        ('bli', 'ble'), ('alli', 'al'), ('entli', 'ent'), ('eli', 'e'),
        ('ousli', 'ous'),
        ('ization', 'ize'), ('ation', 'ate'), ('ator', 'ate'),
        ('alism', 'al'), ('iveness', 'ive'), ('fulness', 'ful'),
        ('ousness', 'ous'),
        ('aliti', 'al'), ('iviti', 'ive'), ('biliti', 'ble'),
    )
    # Step 3 rewrites, applied when the stem's m-degree is > 0
    step3_rules = (
        ('icate', 'ic'), ('ative', ''), ('alize', 'al'), ('iciti', 'ic'),
        ('ical', 'ic'), ('ful', ''), ('ness', ''),
    )
    # Step 4 deletions, applied when the stem's m-degree is > 1
    step4_rules = tuple((suffix, '') for suffix in (
        'al', 'ance', 'ence', 'er', 'ic', 'able', 'ible', 'ant', 'ement',
        'ment', 'ent', 'ou', 'ism', 'ate', 'iti', 'ous', 'ive', 'ize'))

    def _rewrite_suffix(term, rules, min_degree):
        """Apply the first rule whose suffix matches term.

        The rewrite happens only if the m-degree of the stem (term without
        the suffix) exceeds min_degree; a matching rule that fails this
        test still ends the search, leaving term unchanged.
        """
        for suffix, replacement in rules:
            if term.endswith(suffix):
                stem = term[:-len(suffix)]
                if _m_degree(stem, vowels) > min_degree:
                    return stem + replacement
                return term
        return term

    # lowercase, normalize, and compose
    word = normalize('NFC', text_type(word.lower()))

    # Words shorter than 3 letters are returned unchanged
    if len(word) < 3:
        return word

    # Re-map consonantal y to Y (Y will be C, y will be V): an initial y,
    # and any y that directly follows a vowel. The scan is sequential, so a
    # freshly written Y already counts as a consonant for the next letter.
    letters = list(word)
    if letters[0] == 'y':
        letters[0] = 'Y'
    for i in range(1, len(letters)):
        if letters[i] == 'y' and letters[i-1] in vowels:
            letters[i] = 'Y'
    word = ''.join(letters)

    # Step 1a
    if word.endswith(('sses', 'ies')):
        word = word[:-2]
    elif word.endswith('s') and not word.endswith('ss'):
        word = word[:-1]

    # Step 1b
    tidy_stem = False
    if word.endswith('eed'):
        if _m_degree(word[:-3], vowels) > 0:
            word = word[:-1]
    else:
        verbal_suffixes = ['ed', 'ing']
        if early_english:
            verbal_suffixes += ['est', 'eth']
        for suffix in verbal_suffixes:
            if word.endswith(suffix):
                stem = word[:-len(suffix)]
                if _sb_has_vowel(stem, vowels):
                    word = stem
                    tidy_stem = True
                break

    # Clean-up after a successful -ed/-ing (or -est/-eth) removal
    if tidy_stem:
        if word[-2:] in {'at', 'bl', 'iz'}:
            word += 'e'
        elif (_ends_in_doubled_cons(word, vowels) and
              word[-1] not in {'l', 's', 'z'}):
            word = word[:-1]
        elif _m_degree(word, vowels) == 1 and _ends_in_cvc(word, vowels):
            word += 'e'

    # Step 1c
    if word[-1] in {'Y', 'y'} and _sb_has_vowel(word[:-1], vowels):
        word = word[:-1] + 'i'

    # Steps 2 and 3
    word = _rewrite_suffix(word, step2_rules, 0)
    word = _rewrite_suffix(word, step3_rules, 0)

    # Step 4
    if word[-4:] in {'sion', 'tion'}:
        # Only 'ion' is removed; the preceding s/t is part of the stem
        # whose m-degree is measured.
        if _m_degree(word[:-3], vowels) > 1:
            word = word[:-3]
    else:
        word = _rewrite_suffix(word, step4_rules, 1)

    # Step 5a
    if word[-1] == 'e':
        stem = word[:-1]
        degree = _m_degree(stem, vowels)
        if degree > 1 or (degree == 1 and not _ends_in_cvc(stem, vowels)):
            word = stem

    # Step 5b
    if word[-2:] == 'll' and _m_degree(word, vowels) > 1:
        word = word[:-1]

    # Change 'Y' back to 'y' if it survived stemming
    return word.replace('Y', 'y')
352
353
354
def _sb_r1(term, vowels, r1_prefixes=None):
    """Return the start index of the R1 region of a term.

    If r1_prefixes is iterable and the term begins with one of its entries,
    the length of the first such prefix is returned. Otherwise the result is
    the index just past the first non-vowel that follows a vowel, or the
    length of the term if there is no such position.
    """
    if hasattr(r1_prefixes, '__iter__'):
        for special in r1_prefixes:
            if term[:len(special)] == special:
                return len(special)

    seen_vowel = False
    for index, letter in enumerate(term):
        if letter in vowels:
            seen_vowel = True
        elif seen_vowel:
            return index + 1
    return len(term)
368
369
370
def _sb_r2(term, vowels, r1_prefixes=None):
    """Return the start index of the R2 region of a term.

    R2 is located by applying the R1 rule a second time to the part of the
    term that begins at R1 (this second pass ignores r1_prefixes).
    """
    offset = _sb_r1(term, vowels, r1_prefixes)
    remainder = term[offset:]
    return offset + _sb_r1(remainder, vowels)
374
375
376
def _sb_ends_in_short_syllable(term, vowels, codanonvowels):
    """Return True iff term ends in a short syllable.

    (...according to the Porter2 specification.)

    A two-letter term qualifies when it is a vowel followed by a non-vowel.
    A longer term qualifies when its last three letters are a non-vowel, a
    vowel, and a member of codanonvowels. Shorter terms never qualify.

    NB: This is akin to the CVC test from the Porter stemmer.
    """
    size = len(term)
    if size < 2:
        return False
    if size == 2:
        return term[0] in vowels and term[1] not in vowels
    return (term[-3] not in vowels and
            term[-2] in vowels and
            term[-1] in codanonvowels)
394
395
396
def _sb_short_word(term, vowels, codanonvowels, r1_prefixes=None):
    """Return True iff term is a short word.

    (...according to the Porter2 specification.)

    A term is short when its R1 region is empty (R1 starts at the end of
    the term) and the term ends in a short syllable.
    """
    r1_is_empty = _sb_r1(term, vowels, r1_prefixes) == len(term)
    return r1_is_empty and _sb_ends_in_short_syllable(term, vowels,
                                                      codanonvowels)
405
406
407
def porter2(word, early_english=False):
    """Return the Porter2 (Snowball English) stem.

    The Porter2 (Snowball English) stemmer is defined in :cite:`Porter:2002`.

    Steps 2, 3 and 4 are driven by ordered (suffix, replacement) tables: the
    first suffix that matches the word decides the outcome, and the rewrite
    is applied only if the suffix lies inside the required region (R1 or
    R2). Where one suffix is the tail of another, the longer one is listed
    first. Suffix matching uses ``str.endswith`` rather than indexing, so a
    stem that earlier steps reduced to a single letter (e.g. 'aed' -> 'a')
    passes through without raising IndexError.

    :param str word: the word to calculate the stem of
    :param bool early_english: set to True in order to remove -eth & -est
        (2nd & 3rd person singular verbal agreement suffixes)
    :returns: word stem
    :rtype: str

    >>> porter2('reading')
    'read'
    >>> porter2('suspension')
    'suspens'
    >>> porter2('elusiveness')
    'elus'

    >>> porter2('eateth', early_english=True)
    'eat'
    """
    vowels = {'a', 'e', 'i', 'o', 'u', 'y'}
    codanonvowels = {"'", 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm',
                     'n', 'p', 'q', 'r', 's', 't', 'v', 'z'}
    doubles = {'bb', 'dd', 'ff', 'gg', 'mm', 'nn', 'pp', 'rr', 'tt'}
    valid_li = {'c', 'd', 'e', 'g', 'h', 'k', 'm', 'n', 'r', 't'}

    # R1 prefixes should be in order from longest to shortest to prevent
    # masking
    r1_prefixes = ('commun', 'gener', 'arsen')
    exception1dict = {  # special changes:
        'skis': 'ski', 'skies': 'sky', 'dying': 'die',
        'lying': 'lie', 'tying': 'tie',
        # special -LY cases:
        'idly': 'idl', 'gently': 'gentl', 'ugly': 'ugli',
        'early': 'earli', 'only': 'onli', 'singly': 'singl'}
    exception1set = {'sky', 'news', 'howe', 'atlas', 'cosmos', 'bias',
                     'andes'}
    exception2set = {'inning', 'outing', 'canning', 'herring', 'earring',
                     'proceed', 'exceed', 'succeed'}

    # Step 2 rewrites (suffix must lie in R1); 'ogi' and 'li' carry extra
    # conditions and are handled separately after this table.
    step2_rules = (
        ('ational', 'ate'), ('tional', 'tion'),
        ('enci', 'ence'), ('anci', 'ance'),
        ('izer', 'ize'),
        ('lessli', 'less'), ('entli', 'ent'), ('fulli', 'ful'),
        ('ousli', 'ous'), ('abli', 'able'), ('alli', 'al'), ('bli', 'ble'),
        ('ization', 'ize'), ('ation', 'ate'), ('ator', 'ate'),
        ('fulness', 'ful'), ('ousness', 'ous'), ('iveness', 'ive'),
        ('alism', 'al'),
        ('biliti', 'ble'), ('aliti', 'al'), ('iviti', 'ive'),
    )
    # Step 3 rewrites (suffix must lie in R1); 'ative' needs R2 and is
    # handled separately.
    step3_rules = (
        ('ational', 'ate'), ('tional', 'tion'),
        ('alize', 'al'), ('icate', 'ic'), ('iciti', 'ic'),
        ('ical', 'ic'), ('ness', ''), ('ful', ''),
    )
    # Step 4 deletions (suffix must lie in R2); 'ion' carries an extra
    # condition and is handled separately.
    step4_rules = tuple((suffix, '') for suffix in (
        'ement', 'ance', 'ence', 'able', 'ible', 'ment', 'ant', 'ent',
        'ism', 'ate', 'iti', 'ous', 'ive', 'ize', 'al', 'er', 'ic'))

    def _swap_suffix(term, rules, region_start):
        """Apply the first rule whose suffix matches term.

        Returns (term, matched). The rewrite happens only if the suffix
        lies wholly at or after region_start; a matching rule that fails
        this test still reports matched=True, which ends the search.
        """
        for suffix, replacement in rules:
            if term.endswith(suffix):
                if len(term[region_start:]) >= len(suffix):
                    term = term[:-len(suffix)] + replacement
                return term, True
        return term, False

    # lowercase, normalize, and compose
    word = normalize('NFC', text_type(word.lower()))
    # replace apostrophe-like characters (U+2018, U+2019, U+201B) with
    # U+0027, per http://snowball.tartarus.org/texts/apostrophe.html
    for mark in ('\u2018', '\u2019', '\u201b'):
        word = word.replace(mark, '\'')

    # Exceptions 1
    if word in exception1dict:
        return exception1dict[word]
    if word in exception1set:
        return word

    # Words shorter than 3 letters are returned unchanged
    if len(word) < 3:
        return word

    # Remove initial ', if present.
    while word and word[0] == '\'':
        word = word[1:]
        # Return word if what remains is shorter than 2 letters
        if len(word) < 2:
            return word

    # Re-map consonantal y to Y (Y will be C, y will be V): an initial y,
    # and any y that directly follows a vowel. The scan is sequential, so a
    # freshly written Y already counts as a consonant for the next letter.
    letters = list(word)
    if letters[0] == 'y':
        letters[0] = 'Y'
    for i in range(1, len(letters)):
        if letters[i] == 'y' and letters[i-1] in vowels:
            letters[i] = 'Y'
    word = ''.join(letters)

    # Regions are computed once, before any suffix is removed
    r1_start = _sb_r1(word, vowels, r1_prefixes)
    r2_start = _sb_r2(word, vowels, r1_prefixes)

    # Step 0
    if word[-3:] == '\'s\'':
        word = word[:-3]
    elif word[-2:] == '\'s':
        word = word[:-2]
    elif word[-1:] == '\'':
        word = word[:-1]
    # Return word if what remains is shorter than 3 letters
    if len(word) < 3:
        return word

    # Step 1a
    if word[-4:] == 'sses':
        word = word[:-2]
    elif word[-3:] in {'ied', 'ies'}:
        if len(word) > 4:
            word = word[:-2]
        else:
            word = word[:-1]
    elif word[-2:] in {'us', 'ss'}:
        pass
    elif word[-1] == 's':
        # The vowel must not be the letter immediately before the s
        if _sb_has_vowel(word[:-2], vowels):
            word = word[:-1]

    # Exceptions 2
    if word in exception2set:
        return word

    # Step 1b
    tidy_stem = False
    if word.endswith('eedly'):
        if len(word[r1_start:]) >= 5:
            word = word[:-3]
    elif word.endswith('eed'):
        if len(word[r1_start:]) >= 3:
            word = word[:-1]
    else:
        verbal_suffixes = ['ingly', 'edly', 'ing', 'ed']
        if early_english:
            verbal_suffixes += ['est', 'eth']
        for suffix in verbal_suffixes:
            if word.endswith(suffix):
                stem = word[:-len(suffix)]
                if _sb_has_vowel(stem, vowels):
                    word = stem
                    tidy_stem = True
                break

    # Clean-up after a successful suffix removal in Step 1b
    if tidy_stem:
        if word[-2:] in {'at', 'bl', 'iz'}:
            word += 'e'
        elif word[-2:] in doubles:
            word = word[:-1]
        elif _sb_short_word(word, vowels, codanonvowels, r1_prefixes):
            word += 'e'

    # Step 1c
    if ((len(word) > 2 and word[-1] in {'Y', 'y'} and
         word[-2] not in vowels)):
        word = word[:-1] + 'i'

    # Step 2
    word, matched = _swap_suffix(word, step2_rules, r1_start)
    if not matched:
        if word.endswith('ogi'):
            # The R1 test guarantees at least 4 letters, so [-4] is safe
            if ((r1_start >= 1 and len(word[r1_start:]) >= 3 and
                 word[-4] == 'l')):
                word = word[:-1]
        elif word.endswith('li'):
            # The R1 test guarantees at least 3 letters, so [-3] is safe
            if ((r1_start >= 1 and len(word[r1_start:]) >= 2 and
                 word[-3] in valid_li)):
                word = word[:-2]

    # Step 3
    if word.endswith('ative'):
        if len(word[r2_start:]) >= 5:
            word = word[:-5]
    else:
        word, _ = _swap_suffix(word, step3_rules, r1_start)

    # Step 4
    word, matched = _swap_suffix(word, step4_rules, r2_start)
    if not matched and word.endswith('ion'):
        if ((len(word[r2_start:]) >= 3 and len(word) >= 4 and
             word[-4] in {'s', 't'})):
            word = word[:-3]

    # Step 5
    if word[-1:] == 'e':
        if (len(word[r2_start:]) >= 1 or
                (len(word[r1_start:]) >= 1 and
                 not _sb_ends_in_short_syllable(word[:-1], vowels,
                                                codanonvowels))):
            word = word[:-1]
    elif word[-1:] == 'l':
        if len(word[r2_start:]) >= 1 and word[-2] == 'l':
            word = word[:-1]

    # Change 'Y' back to 'y' if it survived stemming
    return word.replace('Y', 'y')
681
682
683
def sb_german(word, alternate_vowels=False):
    """Return Snowball German stem.

    The Snowball German stemmer is defined at:
    http://snowball.tartarus.org/algorithms/german/stemmer.html

    :param str word: the word to calculate the stem of
    :param bool alternate_vowels: composes ae as ä, oe as ö, and ue as ü before
        running the algorithm
    :returns: word stem
    :rtype: str

    >>> sb_german('lesen')
    'les'
    >>> sb_german('graues')
    'grau'
    >>> sb_german('buchstabieren')
    'buchstabi'
    """
    vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'ä', 'ö', 'ü'}
    s_endings = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 'r', 't'}
    st_endings = {'b', 'd', 'f', 'g', 'h', 'k', 'l', 'm', 'n', 't'}

    # lowercase, normalize, and compose (coerced to text like the other
    # stemmers in this module)
    word = normalize('NFC', text_type(word.lower()))
    word = word.replace('ß', 'ss')

    # Upper-case u and y standing between two vowels so that they are
    # treated as consonants. The scan is sequential: a letter already
    # upper-cased no longer counts as a vowel for the following position.
    letters = list(word)
    for i in range(2, len(letters)):
        if letters[i] in vowels and letters[i-2] in vowels:
            if letters[i-1] == 'u':
                letters[i-1] = 'U'
            elif letters[i-1] == 'y':
                letters[i-1] = 'Y'
    word = ''.join(letters)

    if alternate_vowels:
        word = word.replace('ae', 'ä')
        word = word.replace('oe', 'ö')
        # Shield 'que' so that its 'ue' is not composed into ü
        word = word.replace('que', 'Q')
        word = word.replace('ue', 'ü')
        word = word.replace('Q', 'que')

    # R1 is pushed right so that at least 3 letters precede it; R2 is
    # computed from the unadjusted R1.
    r1_start = max(3, _sb_r1(word, vowels))
    r2_start = _sb_r2(word, vowels)

    def _in_r1(term, length):
        """Return True iff the last `length` letters of term lie in R1."""
        return len(term[r1_start:]) >= length

    def _in_r2(term, length):
        """Return True iff the last `length` letters of term lie in R2."""
        return len(term[r2_start:]) >= length

    # Step 1: the first matching suffix wins; the flag marks the suffixes
    # after whose removal a trailing 'niss' is shortened to 'nis'.
    niss_flag = False
    for suffix, allows_niss in (('ern', False), ('em', False),
                                ('er', False), ('en', True),
                                ('es', True), ('e', True)):
        if word.endswith(suffix):
            if _in_r1(word, len(suffix)):
                word = word[:-len(suffix)]
                niss_flag = allows_niss
            break
    else:
        if ((word.endswith('s') and _in_r1(word, 1) and len(word) >= 2 and
             word[-2] in s_endings)):
            word = word[:-1]

    if niss_flag and word[-4:] == 'niss':
        word = word[:-1]

    # Step 2
    for suffix in ('est', 'en', 'er'):
        if word.endswith(suffix):
            if _in_r1(word, len(suffix)):
                word = word[:-len(suffix)]
            break
    else:
        if ((word.endswith('st') and _in_r1(word, 2) and len(word) >= 6 and
             word[-3] in st_endings)):
            word = word[:-2]

    # Step 3
    if word[-4:] == 'isch':
        if _in_r2(word, 4) and word[-5] != 'e':
            word = word[:-4]
    elif word[-4:] in {'lich', 'heit'}:
        if _in_r2(word, 4):
            word = word[:-4]
            if word[-2:] in {'er', 'en'} and _in_r1(word, 2):
                word = word[:-2]
    elif word[-4:] == 'keit':
        if _in_r2(word, 4):
            word = word[:-4]
            if word[-4:] == 'lich' and _in_r2(word, 4):
                word = word[:-4]
            elif word[-2:] == 'ig' and _in_r2(word, 2):
                word = word[:-2]
    elif word[-3:] in {'end', 'ung'}:
        if _in_r2(word, 3):
            word = word[:-3]
            if ((word[-2:] == 'ig' and _in_r2(word, 2) and
                 word[-3] != 'e')):
                word = word[:-2]
    elif word[-2:] in {'ig', 'ik'}:
        if _in_r2(word, 2) and word[-3] != 'e':
            word = word[:-2]

    # Change 'Y' and 'U' back to lowercase if survived stemming
    word = word.replace('Y', 'y').replace('U', 'u')

    # Remove umlauts
    umlauts = {ord('ä'): 'a', ord('ö'): 'o', ord('ü'): 'u'}
    return word.translate(umlauts)
813
814
815
def sb_dutch(word):
    """Return Snowball Dutch stem.

    The Snowball Dutch stemmer is defined at:
    http://snowball.tartarus.org/algorithms/dutch/stemmer.html

    The word is lowercased and NFC-normalized, and diaereses/acute accents
    on vowels are stripped before stemming. While stemming, a 'y' or 'i'
    acting as a consonant is temporarily marked as 'Y' or 'I'; any marker
    that survives is lowercased again before the stem is returned.

    :param str word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> sb_dutch('lezen')
    'lez'
    >>> sb_dutch('opschorting')
    'opschort'
    >>> sb_dutch('ongrijpbaarheid')
    'ongrijp'
    """
    _vowels = {'a', 'e', 'i', 'o', 'u', 'y', 'è'}
    _not_s_endings = {'a', 'e', 'i', 'j', 'o', 'u', 'y', 'è'}

    def _undouble(word):
        """Undouble endings -kk, -dd, and -tt."""
        if ((len(word) > 1 and word[-1] == word[-2] and
             word[-1] in {'d', 'k', 't'})):
            return word[:-1]
        return word

    # lowercase and NFC-normalize, then strip diaereses & acutes from vowels
    word = normalize('NFC', text_type(word.lower()))
    _accented = dict(zip((ord(char) for char in 'äëïöüáéíóú'),
                         'aeiouaeiou'))
    word = word.translate(_accented)

    # Prelude: mark initial y, y after a vowel, and i between vowels as
    # uppercase so that later steps treat them as consonants.
    for i in range(len(word)):
        if word[i] == 'y':
            if i == 0 or word[i-1] in _vowels:
                word = word[:i] + 'Y' + word[i+1:]
        elif (word[i] == 'i' and 0 < i < len(word) - 1 and
              word[i-1] in _vowels and word[i+1] in _vowels):
            # The i > 0 guard matters: word[i-1] at i == 0 would wrap around
            # to the last letter and could mark an initial 'i' by mistake.
            word = word[:i] + 'I' + word[i+1:]

    # R1 is forced to start no earlier than index 3
    r1_start = max(3, _sb_r1(word, _vowels))
    r2_start = _sb_r2(word, _vowels)

    # Step 1
    if word[-5:] == 'heden':
        if len(word[r1_start:]) >= 5:
            # -heden becomes -heid
            word = word[:-3] + 'id'
    elif word[-3:] == 'ene':
        if ((len(word[r1_start:]) >= 3 and
             (word[-4] not in _vowels and word[-6:-3] != 'gem'))):
            word = _undouble(word[:-3])
    elif word[-2:] == 'en':
        if ((len(word[r1_start:]) >= 2 and
             (word[-3] not in _vowels and word[-5:-2] != 'gem'))):
            word = _undouble(word[:-2])
    elif word[-2:] == 'se':
        if len(word[r1_start:]) >= 2 and word[-3] not in _not_s_endings:
            word = word[:-2]
    elif word[-1:] == 's':
        if len(word[r1_start:]) >= 1 and word[-2] not in _not_s_endings:
            word = word[:-1]

    # Step 2
    # e_removed is consulted by the -bar rule in step 3b
    e_removed = False
    if word[-1:] == 'e':
        if len(word[r1_start:]) >= 1 and word[-2] not in _vowels:
            word = _undouble(word[:-1])
            e_removed = True

    # Step 3a
    if word[-4:] == 'heid':
        if len(word[r2_start:]) >= 4 and word[-5] != 'c':
            word = word[:-4]
            # a preceding -en is handled as in step 1
            if word[-2:] == 'en':
                if ((len(word[r1_start:]) >= 2 and
                     (word[-3] not in _vowels and word[-5:-2] != 'gem'))):
                    word = _undouble(word[:-2])

    # Step 3b
    if word[-4:] == 'lijk':
        if len(word[r2_start:]) >= 4:
            word = word[:-4]
            # Repeat step 2
            if word[-1:] == 'e':
                if len(word[r1_start:]) >= 1 and word[-2] not in _vowels:
                    word = _undouble(word[:-1])
    elif word[-4:] == 'baar':
        if len(word[r2_start:]) >= 4:
            word = word[:-4]
    elif word[-3:] in ('end', 'ing'):
        if len(word[r2_start:]) >= 3:
            word = word[:-3]
            if ((word[-2:] == 'ig' and len(word[r2_start:]) >= 2 and
                 word[-3] != 'e')):
                word = word[:-2]
            else:
                word = _undouble(word)
    elif word[-3:] == 'bar':
        # only removed when step 2 actually deleted an -e
        if len(word[r2_start:]) >= 3 and e_removed:
            word = word[:-3]
    elif word[-2:] == 'ig':
        if len(word[r2_start:]) >= 2 and word[-3] != 'e':
            word = word[:-2]

    # Step 4
    # non-vowel + doubled a/e/o/u + non-vowel other than I: drop one vowel
    if ((len(word) >= 4 and
         word[-3] == word[-2] and word[-2] in {'a', 'e', 'o', 'u'} and
         word[-4] not in _vowels and
         word[-1] not in _vowels and word[-1] != 'I')):
        word = word[:-2] + word[-1]

    # Change 'Y' and 'I' back to lowercase if survived stemming
    word = word.replace('Y', 'y').replace('I', 'i')

    return word
935
936
937
def sb_norwegian(word):
    """Return Snowball Norwegian stem.

    The Snowball Norwegian stemmer is defined at:
    http://snowball.tartarus.org/algorithms/norwegian/stemmer.html

    The word is lowercased and NFC-normalized, then three suffix-removal
    steps are applied, each matching only inside the R1 region.

    :param str word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> sb_norwegian('lese')
    'les'
    >>> sb_norwegian('suspensjon')
    'suspensjon'
    >>> sb_norwegian('sikkerhet')
    'sikker'
    """
    vowel_set = {'a', 'e', 'i', 'o', 'u', 'y', 'å', 'æ', 'ø'}
    s_preceders = {'b', 'c', 'd', 'f', 'g', 'h', 'j', 'l', 'm', 'n', 'o', 'p',
                   'r', 't', 'v', 'y', 'z'}

    # Suffix tables, longest first, so the first hit is the longest match
    step1_suffixes = (
        (7, {'hetenes'}),
        (6, {'hetene', 'hetens'}),
        (5, {'heten', 'heter', 'endes'}),
        (4, {'ande', 'ende', 'edes', 'enes', 'erte'}),
        (3, {'ede', 'ane', 'ene', 'ens', 'ers', 'ets', 'het', 'ast',
             'ert'}),
        (2, {'en', 'ar', 'er', 'as', 'es', 'et'}),
        (1, {'a', 'e'}),
    )
    # -erte and -ert are cut back to -er instead of being removed entirely
    partial_cut = {'erte': 2, 'ert': 1}
    step3_suffixes = (
        (7, {'hetslov'}),
        (4, {'eleg', 'elig', 'elov', 'slov'}),
        (3, {'leg', 'eig', 'lig', 'els', 'lov'}),
        (2, {'ig'}),
    )

    # lowercase, normalize, and compose
    word = normalize('NFC', text_type(word.lower()))

    # R1 starts no earlier than index 3 and no later than the end of the word
    r1_start = min(max(3, _sb_r1(word, vowel_set)), len(word))

    # Step 1
    region = word[r1_start:]
    for size, endings in step1_suffixes:
        tail = region[-size:]
        if tail in endings:
            word = word[:-partial_cut.get(tail, size)]
            break
    else:
        # no listed suffix matched: a final -s goes only after a valid
        # s-ending, or after a 'k' that does not follow a vowel
        if region[-1:] == 's':
            after_s_ending = len(word) > 1 and word[-2] in s_preceders
            after_k = (len(word) > 2 and word[-2] == 'k' and
                       word[-3] not in vowel_set)
            if after_s_ending or after_k:
                word = word[:-1]

    # Step 2
    if word[r1_start:][-2:] in {'dt', 'vt'}:
        word = word[:-1]

    # Step 3
    region = word[r1_start:]
    for size, endings in step3_suffixes:
        if region[-size:] in endings:
            word = word[:-size]
            break

    return word
1006
1007
1008
def sb_swedish(word):
    """Return Snowball Swedish stem.

    The Snowball Swedish stemmer is defined at:
    http://snowball.tartarus.org/algorithms/swedish/stemmer.html

    The word is lowercased and NFC-normalized, then three suffix-removal
    steps are applied, each matching only inside the R1 region.

    :param str word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> sb_swedish('undervisa')
    'undervis'
    >>> sb_swedish('suspension')
    'suspension'
    >>> sb_swedish('visshet')
    'viss'
    """
    vowel_set = {'a', 'e', 'i', 'o', 'u', 'y', 'ä', 'å', 'ö'}
    s_preceders = {'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n',
                   'o', 'p', 'r', 't', 'v', 'y'}

    # Step 1 suffixes, longest first; every match is deleted in full
    step1_suffixes = (
        (7, {'heterna'}),
        (6, {'hetens'}),
        (5, {'anden', 'heten', 'heter', 'arnas', 'ernas', 'ornas',
             'andes', 'arens', 'andet'}),
        (4, {'arna', 'erna', 'orna', 'ande', 'arne', 'aste', 'aren',
             'ades', 'erns'}),
        (3, {'ade', 'are', 'ern', 'ens', 'het', 'ast'}),
        (2, {'ad', 'en', 'ar', 'er', 'or', 'as', 'es', 'at'}),
        (1, {'a', 'e'}),
    )
    # Step 3 rules: (suffix length, suffixes, number of letters to drop)
    step3_rules = (
        (5, {'fullt'}, 1),
        (4, {'löst'}, 1),
        (3, {'lig', 'els'}, 3),
        (2, {'ig'}, 2),
    )

    # lowercase, normalize, and compose
    word = normalize('NFC', text_type(word.lower()))

    # R1 starts no earlier than index 3 and no later than the end of the word
    r1_start = min(max(3, _sb_r1(word, vowel_set)), len(word))

    # Step 1
    region = word[r1_start:]
    for size, endings in step1_suffixes:
        if region[-size:] in endings:
            word = word[:-size]
            break
    else:
        # no listed suffix matched: drop a final -s after a valid s-ending
        if region[-1:] == 's':
            if len(word) > 1 and word[-2] in s_preceders:
                word = word[:-1]

    # Step 2
    if word[r1_start:][-2:] in {'dd', 'gd', 'nn', 'dt', 'gt', 'kt', 'tt'}:
        word = word[:-1]

    # Step 3
    region = word[r1_start:]
    for size, endings, drop in step3_rules:
        if region[-size:] in endings:
            word = word[:-drop]
            break

    return word
1072
1073
1074
def sb_danish(word):
    """Return Snowball Danish stem.

    The Snowball Danish stemmer is defined at:
    http://snowball.tartarus.org/algorithms/danish/stemmer.html

    The word is lowercased and NFC-normalized, then four steps are applied:
    suffix removal, consonant-pair trimming, further suffix removal (which
    may repeat the trimming), and undoubling of a final double consonant.

    :param str word: the word to calculate the stem of
    :returns: word stem
    :rtype: str

    >>> sb_danish('underviser')
    'undervis'
    >>> sb_danish('suspension')
    'suspension'
    >>> sb_danish('sikkerhed')
    'sikker'
    """
    vowel_set = {'a', 'e', 'i', 'o', 'u', 'y', 'å', 'æ', 'ø'}
    s_preceders = {'a', 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n',
                   'o', 'p', 'r', 't', 'v', 'y', 'z', 'å'}

    # Step 1 suffixes, longest first; every match is deleted in full
    step1_suffixes = (
        (7, {'erendes'}),
        (6, {'erende', 'hedens'}),
        (5, {'ethed', 'erede', 'heden', 'heder', 'endes', 'ernes',
             'erens', 'erets'}),
        (4, {'ered', 'ende', 'erne', 'eren', 'erer', 'heds', 'enes',
             'eres', 'eret'}),
        (3, {'hed', 'ene', 'ere', 'ens', 'ers', 'ets'}),
        (2, {'en', 'er', 'es', 'et'}),
        (1, {'e'}),
    )
    # Endings whose last letter is dropped in step 2
    step2_endings = {'gd', 'dt', 'gt', 'kt'}
    # Step 3 rules:
    # (suffix length, suffixes, letters to drop, repeat step 2 afterwards)
    step3_rules = (
        (4, {'elig'}, 4, True),
        (4, {'løst'}, 1, False),
        (3, {'lig', 'els'}, 3, True),
        (2, {'ig'}, 2, True),
    )

    # lowercase, normalize, and compose
    word = normalize('NFC', text_type(word.lower()))

    # R1 starts no earlier than index 3 and no later than the end of the word
    r1_start = min(max(3, _sb_r1(word, vowel_set)), len(word))

    # Step 1
    region = word[r1_start:]
    for size, endings in step1_suffixes:
        if region[-size:] in endings:
            word = word[:-size]
            break
    else:
        # no listed suffix matched: drop a final -s after a valid s-ending
        if region[-1:] == 's':
            if len(word) > 1 and word[-2] in s_preceders:
                word = word[:-1]

    # Step 2
    if word[r1_start:][-2:] in step2_endings:
        word = word[:-1]

    # Step 3
    # -igst loses its -st wherever it occurs; this test is not limited to R1
    if word[-4:] == 'igst':
        word = word[:-2]

    region = word[r1_start:]
    for size, endings, drop, redo_step2 in step3_rules:
        if region[-size:] in endings:
            word = word[:-drop]
            if redo_step2 and word[r1_start:][-2:] in step2_endings:
                word = word[:-1]
            break

    # Step 4
    # a doubled final consonant reaching into R1 is reduced to one
    if ((word[r1_start:] and len(word) >= 2 and
         word[-2] == word[-1] and word[-1] not in vowel_set)):
        word = word[:-1]

    return word
1154
1155
1156
if __name__ == '__main__':
    # Run the doctests embedded in this module when executed as a script.
    from doctest import testmod
    testmod()
1159