Completed
Branch master (78a222)
by Chris
14:36
created

abydos.stemmer._lovins   A

Complexity

Total Complexity 18

Size/Duplication

Total Lines 743
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
wmc 18
eloc 468
dl 0
loc 743
ccs 104
cts 104
cp 1
rs 10
c 0
b 0
f 0

1 Function

Rating   Name   Duplication   Size   Complexity  
F lovins() 0 703 18
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.stemmer._lovins.
20
21
The stemmer._lovins module implements the Lovins stemmer.
22
"""
23
24 1
from __future__ import unicode_literals
25
26 1
from unicodedata import normalize
27
28 1
from six import text_type
29 1
from six.moves import range
30
31 1
__all__ = ['lovins']
32
33
34 1
# ---------------------------------------------------------------------------
# Lovins' context-sensitive conditions.
#
# Every helper receives ``stem``: the word with the candidate suffix already
# removed.  ``lovins`` only tries suffixes that leave at least two characters,
# so ``stem[-1]`` and ``stem[-2:]`` are always safe.  Anything that looks
# further back is either guarded by a length check or uses a slice (which
# yields '' instead of raising on short stems).
#
# A helper returns True when the suffix may be stripped.
# ---------------------------------------------------------------------------


def _cond_b(stem):
    """Return Lovins' condition B: stem has at least 3 characters."""
    return len(stem) >= 3


def _cond_c(stem):
    """Return Lovins' condition C: stem has at least 4 characters."""
    return len(stem) >= 4


def _cond_d(stem):
    """Return Lovins' condition D: stem has at least 5 characters."""
    return len(stem) >= 5


def _cond_e(stem):
    """Return Lovins' condition E: stem does not end in 'e'."""
    return stem[-1] != 'e'


def _cond_f(stem):
    """Return Lovins' condition F: length >= 3 and stem not ending in 'e'."""
    return len(stem) >= 3 and stem[-1] != 'e'


def _cond_g(stem):
    """Return Lovins' condition G: length >= 3 and stem ending in 'f'."""
    return len(stem) >= 3 and stem[-1] == 'f'


def _cond_h(stem):
    """Return Lovins' condition H: stem ends in 't' or 'll'."""
    return stem[-1] == 't' or stem[-2:] == 'll'


def _cond_i(stem):
    """Return Lovins' condition I: stem does not end in 'e' or 'o'."""
    return stem[-1] not in {'e', 'o'}


def _cond_j(stem):
    """Return Lovins' condition J: stem does not end in 'a' or 'e'."""
    return stem[-1] not in {'a', 'e'}


def _cond_k(stem):
    """Return Lovins' condition K: length >= 3 and stem ends in i, l or u*e."""
    # stem[-3] is safe: the length test short-circuits first.
    return len(stem) >= 3 and (
        stem[-1] in {'i', 'l'} or (stem[-3] == 'u' and stem[-1] == 'e')
    )


def _cond_l(stem):
    """Return Lovins' condition L: not after s, u or x -- except after 'os'."""
    # The exception must look at the last TWO characters; comparing a single
    # character with 'os' can never be true.
    return stem[-1] not in {'s', 'u', 'x'} or stem[-2:] == 'os'


def _cond_m(stem):
    """Return Lovins' condition M: stem does not end in a, c, e or m."""
    return stem[-1] not in {'a', 'c', 'e', 'm'}


def _cond_n(stem):
    """Return Lovins' condition N: length >= 3, or >= 4 if it ends in s**."""
    return len(stem) >= 3 and (stem[-3] != 's' or len(stem) >= 4)


def _cond_o(stem):
    """Return Lovins' condition O: stem ends in 'i' or 'l'."""
    return stem[-1] in {'i', 'l'}


def _cond_p(stem):
    """Return Lovins' condition P: stem does not end in 'c'."""
    return stem[-1] != 'c'


def _cond_q(stem):
    """Return Lovins' condition Q: length >= 3, stem not ending in l or n."""
    return len(stem) >= 3 and stem[-1] not in {'l', 'n'}


def _cond_r(stem):
    """Return Lovins' condition R: stem ends in 'n' or 'r'."""
    return stem[-1] in {'n', 'r'}


def _cond_s(stem):
    """Return Lovins' condition S: stem ends in 'dr', or in 't' but not 'tt'."""
    return stem[-2:] == 'dr' or (stem[-1] == 't' and stem[-2:] != 'tt')


def _cond_t(stem):
    """Return Lovins' condition T: stem ends in 's' or 't', but not 'ot'."""
    return stem[-1] in {'s', 't'} and stem[-2:] != 'ot'


def _cond_u(stem):
    """Return Lovins' condition U: stem ends in l, m, n or r."""
    return stem[-1] in {'l', 'm', 'n', 'r'}


def _cond_v(stem):
    """Return Lovins' condition V: stem ends in 'c'."""
    return stem[-1] == 'c'


def _cond_w(stem):
    """Return Lovins' condition W: stem does not end in 's' or 'u'."""
    return stem[-1] not in {'s', 'u'}


def _cond_x(stem):
    """Return Lovins' condition X: stem ends in i, l or u*e."""
    # Only the single character three from the end has to be 'u'.  A slice is
    # used rather than an index because the stem may be just two characters
    # long here (there is no length precondition on X).
    return stem[-1] in {'i', 'l'} or (
        stem[-3:-2] == 'u' and stem[-1] == 'e'
    )


def _cond_y(stem):
    """Return Lovins' condition Y: stem ends in 'in'."""
    return stem[-2:] == 'in'


def _cond_z(stem):
    """Return Lovins' condition Z: stem does not end in 'f'."""
    return stem[-1] != 'f'


def _cond_aa(stem):
    """Return Lovins' condition AA: stem ends in d, f, l, t, ph, th, er, or, es."""
    return stem[-1] in {'d', 'f', 'l', 't'} or stem[-2:] in {
        'ph',
        'th',
        'er',
        'or',
        'es',
    }


def _cond_bb(stem):
    """Return Lovins' condition BB: length >= 3, not ending in met or ryst."""
    return len(stem) >= 3 and stem[-3:] != 'met' and stem[-4:] != 'ryst'


def _cond_cc(stem):
    """Return Lovins' condition CC: stem ends in 'l'."""
    return stem[-1] == 'l'


# Suffix -> condition.  ``None`` means the suffix is stripped unconditionally
# (subject only to the two-character minimum stem enforced by ``lovins``).
_SUFFIXES = {
    'alistically': _cond_b,
    'arizability': None,
    'izationally': _cond_b,
    'antialness': None,
    'arisations': None,
    'arizations': None,
    'entialness': None,
    'allically': _cond_c,
    'antaneous': None,
    'antiality': None,
    'arisation': None,
    'arization': None,
    'ationally': _cond_b,
    'ativeness': None,
    'eableness': _cond_e,
    'entations': None,
    'entiality': None,
    'entialize': None,
    'entiation': None,
    'ionalness': None,
    'istically': None,
    'itousness': None,
    'izability': None,
    'izational': None,
    'ableness': None,
    'arizable': None,
    'entation': None,
    'entially': None,
    'eousness': None,
    'ibleness': None,
    'icalness': None,
    'ionalism': None,
    'ionality': None,
    'ionalize': None,
    'iousness': None,
    'izations': None,
    'lessness': None,
    'ability': None,
    'aically': None,
    'alistic': _cond_b,
    'alities': None,
    'ariness': _cond_e,
    'aristic': None,
    'arizing': None,
    'ateness': None,
    'atingly': None,
    'ational': _cond_b,
    'atively': None,
    'ativism': None,
    'elihood': _cond_e,
    'encible': None,
    'entally': None,
    'entials': None,
    'entiate': None,
    'entness': None,
    'fulness': None,
    'ibility': None,
    'icalism': None,
    'icalist': None,
    'icality': None,
    'icalize': None,
    'ication': _cond_g,
    'icianry': None,
    'ination': None,
    'ingness': None,
    'ionally': None,
    'isation': None,
    'ishness': None,
    'istical': None,
    'iteness': None,
    'iveness': None,
    'ivistic': None,
    'ivities': None,
    'ization': _cond_f,
    'izement': None,
    'oidally': None,
    'ousness': None,
    'aceous': None,
    'acious': _cond_b,
    'action': _cond_g,
    'alness': None,
    'ancial': None,
    'ancies': None,
    'ancing': _cond_b,
    'ariser': None,
    'arized': None,
    'arizer': None,
    'atable': None,
    'ations': _cond_b,
    'atives': None,
    'eature': _cond_z,
    'efully': None,
    'encies': None,
    'encing': None,
    'ential': None,
    'enting': _cond_c,
    'entist': None,
    'eously': None,
    'ialist': None,
    'iality': None,
    'ialize': None,
    'ically': None,
    'icance': None,
    'icians': None,
    'icists': None,
    'ifully': None,
    'ionals': None,
    'ionate': _cond_d,
    'ioning': None,
    'ionist': None,
    'iously': None,
    'istics': None,
    'izable': _cond_e,
    'lessly': None,
    'nesses': None,
    'oidism': None,
    'acies': None,
    'acity': None,
    'aging': _cond_b,
    'aical': None,
    'alist': None,
    'alism': _cond_b,
    'ality': None,
    'alize': None,
    'allic': _cond_bb,
    'anced': _cond_b,
    'ances': _cond_b,
    'antic': _cond_c,
    'arial': None,
    'aries': None,
    'arily': None,
    'arity': _cond_b,
    'arize': None,
    'aroid': None,
    'ately': None,
    'ating': _cond_i,
    'ation': _cond_b,
    'ative': None,
    'ators': None,
    'atory': None,
    'ature': _cond_e,
    'early': _cond_y,
    'ehood': None,
    'eless': None,
    'elity': None,
    'ement': None,
    'enced': None,
    'ences': None,
    'eness': _cond_e,
    'ening': _cond_e,
    'ental': None,
    'ented': _cond_c,
    'ently': None,
    'fully': None,
    'ially': None,
    'icant': None,
    'ician': None,
    'icide': None,
    'icism': None,
    'icist': None,
    'icity': None,
    'idine': _cond_i,
    'iedly': None,
    'ihood': None,
    'inate': None,
    'iness': None,
    'ingly': _cond_b,
    'inism': _cond_j,
    'inity': _cond_cc,
    'ional': None,
    'ioned': None,
    'ished': None,
    'istic': None,
    'ities': None,
    'itous': None,
    'ively': None,
    'ivity': None,
    'izers': _cond_f,
    'izing': _cond_f,
    'oidal': None,
    'oides': None,
    'otide': None,
    'ously': None,
    'able': None,
    'ably': None,
    'ages': _cond_b,
    'ally': _cond_b,
    'ance': _cond_b,
    'ancy': _cond_b,
    'ants': _cond_b,
    'aric': None,
    'arly': _cond_k,
    'ated': _cond_i,
    'ates': None,
    'atic': _cond_b,
    'ator': None,
    'ealy': _cond_y,
    'edly': _cond_e,
    'eful': None,
    'eity': None,
    'ence': None,
    'ency': None,
    'ened': _cond_e,
    'enly': _cond_e,
    'eous': None,
    'hood': None,
    'ials': None,
    'ians': None,
    'ible': None,
    'ibly': None,
    'ical': None,
    'ides': _cond_l,
    'iers': None,
    'iful': None,
    'ines': _cond_m,
    'ings': _cond_n,
    'ions': _cond_b,
    'ious': None,
    'isms': _cond_b,
    'ists': None,
    'itic': _cond_h,
    'ized': _cond_f,
    'izer': _cond_f,
    'less': None,
    'lily': None,
    'ness': None,
    'ogen': None,
    'ward': None,
    'wise': None,
    'ying': _cond_b,
    'yish': None,
    'acy': None,
    'age': _cond_b,
    'aic': None,
    'als': _cond_bb,
    'ant': _cond_b,
    'ars': _cond_o,
    'ary': _cond_f,
    'ata': None,
    'ate': None,
    'eal': _cond_y,
    'ear': _cond_y,
    'ely': _cond_e,
    'ene': _cond_e,
    'ent': _cond_c,
    'ery': _cond_e,
    'ese': None,
    'ful': None,
    'ial': None,
    'ian': None,
    'ics': None,
    'ide': _cond_l,
    'ied': None,
    'ier': None,
    'ies': _cond_p,
    'ily': None,
    'ine': _cond_m,
    'ing': _cond_n,
    'ion': _cond_q,
    'ish': _cond_c,
    'ism': _cond_b,
    'ist': None,
    'ite': _cond_aa,
    'ity': None,
    'ium': None,
    'ive': None,
    'ize': _cond_f,
    'oid': None,
    'one': _cond_r,
    'ous': None,
    'ae': None,
    'al': _cond_bb,
    'ar': _cond_x,
    'as': _cond_b,
    'ed': _cond_e,
    'en': _cond_f,
    'es': _cond_e,
    'ia': None,
    'ic': None,
    'is': None,
    'ly': _cond_b,
    'on': _cond_s,
    'or': _cond_t,
    'um': _cond_u,
    'us': _cond_v,
    'yl': _cond_r,
    '\'s': None,
    's\'': None,
    'a': None,
    'e': None,
    'i': None,
    'o': None,
    's': _cond_w,
    'y': _cond_b,
}

# Longest suffix in the table; the search never needs to look further back.
_MAX_SUFFIX_LEN = max(len(_ending) for _ending in _SUFFIXES)

# Final double consonants that are reduced to a single letter.
_DOUBLED = frozenset(
    ('bb', 'dd', 'gg', 'll', 'mm', 'nn', 'pp', 'rr', 'ss', 'tt')
)

# Recoding rules as (ending, replacement, blockers).  A rule rewrites
# ``ending`` to ``replacement`` unless the single character immediately before
# ``ending`` is listed in ``blockers``.  Blockers are tuples, not strings, so
# that an empty "preceding character" (the word IS the ending) never matches.
_RECODE = (
    ('iev', 'ief', ()),
    ('uct', 'uc', ()),
    ('umpt', 'um', ()),
    ('rpt', 'rb', ()),
    ('urs', 'ur', ()),
    ('istr', 'ister', ()),
    ('metr', 'meter', ()),
    ('olv', 'olut', ()),
    ('ul', 'l', ('a', 'i', 'o')),  # rule 9
    ('bex', 'bic', ()),
    ('dex', 'dic', ()),
    ('pex', 'pic', ()),
    ('tex', 'tic', ()),
    ('ax', 'ac', ()),
    ('ex', 'ec', ()),
    ('ix', 'ic', ()),
    ('lux', 'luc', ()),
    ('uad', 'uas', ()),
    ('vad', 'vas', ()),
    ('cid', 'cis', ()),
    ('lid', 'lis', ()),
    ('erid', 'eris', ()),
    ('pand', 'pans', ()),
    ('end', 'ens', ('s',)),  # rule 24
    ('ond', 'ons', ()),
    ('lud', 'lus', ()),
    ('rud', 'rus', ()),
    ('her', 'hes', ('p', 't')),  # rule 28
    ('mit', 'mis', ()),
    ('ent', 'ens', ('m',)),  # rule 30
    ('ert', 'ers', ()),
    ('et', 'es', ('n',)),  # rule 32
    ('yt', 'ys', ()),
    ('yz', 'ys', ()),
)


def lovins(word):
    """Return Lovins stem.

    Lovins stemmer

    The Lovins stemmer is described in Julie Beth Lovins's article
    :cite:`Lovins:1968`.

    The word is lowercased and NFC-normalized, then processed in three steps:

    1. the longest listed suffix whose condition holds, and which leaves a
       stem of at least two characters, is removed;
    2. a final doubled consonant (bb, dd, gg, ll, mm, nn, pp, rr, ss, tt) is
       reduced to a single letter;
    3. the recoding rules are tried in order and every rule that matches is
       applied, so a later rule may act on the output of an earlier one.

    :param str word: the word to stem
    :returns: word stem
    :rtype: str

    >>> lovins('reading')
    'read'
    >>> lovins('suspension')
    'suspens'
    >>> lovins('elusiveness')
    'elus'
    """
    # lowercase, normalize, and compose
    word = normalize('NFC', text_type(word.lower()))

    # Step 1: longest-match suffix removal.  Starting at len(word) - 2
    # enforces the two-character minimum stem; words of length <= 2 produce
    # an empty range and are left alone.
    for suffix_len in range(min(_MAX_SUFFIX_LEN, len(word) - 2), 0, -1):
        ending = word[-suffix_len:]
        if ending not in _SUFFIXES:
            continue
        condition = _SUFFIXES[ending]
        stem = word[:-suffix_len]
        if condition is None or condition(stem):
            word = stem
            break

    # Step 2: undouble a final double consonant.
    if word[-2:] in _DOUBLED:
        word = word[:-1]

    # Step 3: recoding.  No early exit -- all rules are visited in order.
    for ending, replacement, blockers in _RECODE:
        if word.endswith(ending):
            cut = len(ending)
            preceding = word[-cut - 1 : -cut]
            if preceding not in blockers:
                word = word[:-cut] + replacement

    return word
737
738
739
if __name__ == '__main__':
    # Running the module as a script executes the doctests embedded in its
    # docstrings.
    import doctest

    doctest.testmod()
743