Passed
Push — master ( c2a3b6...15a61d )
by Chris
01:00 queued 14s
created

abydos.stemmer._uea_lite.uealite()   A

Complexity

Conditions 1

Size

Total Lines 56
Code Lines 13

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 9
CRAP Score 1

Importance

Changes 0
Metric Value
eloc 13
dl 0
loc 56
ccs 9
cts 9
cp 1
rs 9.75
c 0
b 0
f 0
cc 1
nop 5
crap 1

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

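Commonly applied refactorings include Extract Method and, when a method has many parameters or temporary variables, Replace Temp with Query, Introduce Parameter Object, and Preserve Whole Object.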

1
# Copyright 2018-2020 by Christopher C. Little.
2
# This file is part of Abydos.
3
#
4
# Abydos is free software: you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation, either version 3 of the License, or
7
# (at your option) any later version.
8
#
9
# Abydos is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
# GNU General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17
"""abydos.stemmer._uea_lite.
18
19 1
UEA-Lite stemmer
20
"""
21
22
from re import match as re_match
23
24 1
from ._stemmer import _Stemmer
25
26
__all__ = ['UEALite']
27
28
29
class UEALite(_Stemmer):
30
    """UEA-Lite stemmer.
31 1
32
    The UEA-Lite stemmer is discussed in :cite:`Jenkins:2005`.
33 1
34
    This is chiefly based on the Java implementation of the algorithm, with
35 1
    variants based on the Perl implementation and Jason Adams' Ruby port.
36
37 1
    Java version: :cite:`Churchill:2005`
38 1
    Perl version: :cite:`Jenkins:2005`
39
    Ruby version: :cite:`Adams:2017`
40 1
41
    .. versionadded:: 0.3.6
42
    """
43 1
44
    _problem_words = {'is', 'as', 'this', 'has', 'was', 'during'}
45
46
    # rule table format:
47
    # top-level dictionary: length-of-suffix: dict-of-rules
48
    # dict-of-rules: suffix: (rule_no, suffix_length_to_delete,
49
    #                         suffix_to_append)
50
    _standard_rule_table = {
51
        7: {
52
            'titudes': (30, 1, None),
53
            'fulness': (34, 4, None),
54
            'ousness': (35, 4, None),
55
            'eadings': (40.7, 4, None),
56
            'oadings': (40.6, 4, None),
57
            'ealings': (42.4, 4, None),
58 1
            'ailings': (42.2, 4, None),
59
        },
60
        6: {
61
            'aceous': (1, 6, None),
62
            'aining': (24, 3, None),
63
            'acting': (25, 3, None),
64 1
            'ttings': (26, 5, None),
65
            'viding': (27, 3, 'e'),
66
            'ssings': (37, 4, None),
67
            'ulting': (38, 3, None),
68
            'eading': (40.7, 3, None),
69
            'oading': (40.6, 3, None),
70
            'edings': (40.5, 4, None),
71
            'ddings': (40.4, 5, None),
72
            'ldings': (40.3, 4, None),
73
            'rdings': (40.2, 4, None),
74
            'ndings': (40.1, 4, None),
75
            'llings': (41, 5, None),
76
            'ealing': (42.4, 3, None),
77
            'olings': (42.3, 4, None),
78
            'ailing': (42.2, 3, None),
79
            'elings': (42.1, 4, None),
80
            'mmings': (44.3, 5, None),
81
            'ngings': (45.2, 4, None),
82
            'ggings': (45.1, 5, None),
83
            'stings': (47, 4, None),
84
            'etings': (48.4, 4, None),
85
            'ntings': (48.2, 4, None),
86
            'irings': (54.4, 4, 'e'),
87
            'urings': (54.3, 4, 'e'),
88
            'ncings': (54.2, 4, 'e'),
89
            'things': (58.1, 1, None),
90
        },
91
        5: {
92
            'iases': (11.4, 2, None),
93
            'ained': (13.6, 2, None),
94
            'erned': (13.5, 2, None),
95
            'ifted': (14, 2, None),
96
            'ected': (15, 2, None),
97
            'vided': (16, 1, None),
98
            'erred': (19, 3, None),
99
            'urred': (20.5, 3, None),
100
            'lored': (20.4, 2, None),
101
            'eared': (20.3, 2, None),
102
            'tored': (20.2, 1, None),
103
            'noted': (22.4, 1, None),
104
            'leted': (22.3, 1, None),
105
            'anges': (23, 1, None),
106
            'tting': (26, 4, None),
107
            'ulted': (32, 2, None),
108
            'uming': (33, 3, 'e'),
109
            'rabed': (36.1, 1, None),
110
            'rebed': (36.1, 1, None),
111
            'ribed': (36.1, 1, None),
112
            'robed': (36.1, 1, None),
113
            'rubed': (36.1, 1, None),
114
            'ssing': (37, 3, None),
115
            'vings': (39, 4, 'e'),
116
            'eding': (40.5, 3, None),
117
            'dding': (40.4, 4, None),
118
            'lding': (40.3, 3, None),
119
            'rding': (40.2, 3, None),
120
            'nding': (40.1, 3, None),
121
            'dings': (40, 4, 'e'),
122
            'lling': (41, 4, None),
123
            'oling': (42.3, 3, None),
124
            'eling': (42.1, 3, None),
125
            'lings': (42, 4, 'e'),
126
            'mming': (44.3, 4, None),
127
            'rming': (44.2, 3, None),
128
            'lming': (44.1, 3, None),
129
            'mings': (44, 4, 'e'),
130
            'nging': (45.2, 3, None),
131
            'gging': (45.1, 4, None),
132
            'gings': (45, 4, 'e'),
133
            'aning': (46.6, 3, None),
134
            'ening': (46.5, 3, None),
135
            'gning': (46.4, 3, None),
136
            'nning': (46.3, 4, None),
137
            'oning': (46.2, 3, None),
138
            'rning': (46.1, 3, None),
139
            'sting': (47, 3, None),
140
            'eting': (48.4, 3, None),
141
            'pting': (48.3, 3, None),
142
            'nting': (48.2, 3, None),
143
            'cting': (48.1, 3, None),
144
            'tings': (48, 4, 'e'),
145
            'iring': (54.4, 3, 'e'),
146
            'uring': (54.3, 3, 'e'),
147
            'ncing': (54.2, 3, 'e'),
148
            'sings': (54, 4, 'e'),
149
            # 'lling': (55, 3, None),  # masked by 41
150
            'ating': (57, 3, 'e'),
151
            'thing': (58.1, 0, None),
152
        },
153
        4: {
154
            'eeds': (7, 1, None),
155
            'uses': (11.3, 1, None),
156
            'sses': (11.2, 2, None),
157
            'eses': (11.1, 2, 'is'),
158
            'tled': (12.5, 1, None),
159
            'pled': (12.4, 1, None),
160
            'bled': (12.3, 1, None),
161
            'eled': (12.2, 2, None),
162
            'lled': (12.1, 2, None),
163
            'ened': (13.7, 2, None),
164
            'rned': (13.4, 2, None),
165
            'nned': (13.3, 3, None),
166
            'oned': (13.2, 2, None),
167
            'gned': (13.1, 2, None),
168
            'ered': (20.1, 2, None),
169
            'reds': (20, 2, None),
170
            'tted': (21, 3, None),
171
            'uted': (22.2, 1, None),
172
            'ated': (22.1, 1, None),
173
            'ssed': (28, 2, None),
174
            'umed': (31, 1, None),
175
            'beds': (36, 3, None),
176
            'ving': (39, 3, 'e'),
177
            'ding': (40, 3, 'e'),
178
            'ling': (42, 3, 'e'),
179
            'nged': (43.2, 1, None),
180
            'gged': (43.1, 3, None),
181
            'ming': (44, 3, 'e'),
182
            'ging': (45, 3, 'e'),
183
            'ning': (46, 3, 'e'),
184
            'ting': (48, 3, 'e'),
185
            # 'ssed': (49, 2, None),  # masked by 28
186
            # 'lled': (53, 2, None),  # masked by 12.1
187
            'zing': (54.1, 3, 'e'),
188
            'sing': (54, 3, 'e'),
189
            'lves': (60.1, 3, 'f'),
190
            'aped': (61.3, 1, None),
191
            'uded': (61.2, 1, None),
192
            'oded': (61.1, 1, None),
193
            # 'ated': (61, 1, None),  # masked by 22.1
194
            'ones': (63.6, 1, None),
195
            'izes': (63.5, 1, None),
196
            'ures': (63.4, 1, None),
197
            'ines': (63.3, 1, None),
198
            'ides': (63.2, 1, None),
199
        },
200
        3: {
201
            'ces': (2, 1, None),
202
            'sis': (4, 0, None),
203
            'tis': (5, 0, None),
204
            'eed': (7, 0, None),
205
            'ued': (8, 1, None),
206
            'ues': (9, 1, None),
207
            'ees': (10, 1, None),
208
            'ses': (11, 1, None),
209
            'led': (12, 2, None),
210
            'ned': (13, 1, None),
211
            'ved': (17, 1, None),
212
            'ced': (18, 1, None),
213
            'red': (20, 1, None),
214
            'ted': (22, 2, None),
215
            'sed': (29, 1, None),
216
            'bed': (36, 2, None),
217
            'ged': (43, 1, None),
218
            'les': (50, 1, None),
219
            'tes': (51, 1, None),
220
            'zed': (52, 1, None),
221
            'ied': (56, 3, 'y'),
222
            'ies': (59, 3, 'y'),
223
            'ves': (60, 1, None),
224
            'pes': (63.8, 1, None),
225
            'mes': (63.7, 1, None),
226
            'ges': (63.1, 1, None),
227
            'ous': (65, 0, None),
228
            'ums': (66, 0, None),
229
        },
230
        2: {
231
            'cs': (3, 0, None),
232
            'ss': (6, 0, None),
233
            'es': (63, 2, None),
234
            'is': (64, 2, 'e'),
235
            'us': (67, 0, None),
236
        },
237
    }
238
239
    _perl_rule_table = {
240
        7: {
241
            'titudes': (30, 1, None),
242
            'fulness': (34, 4, None),
243
            'ousness': (35, 4, None),
244
        },
245
        6: {
246
            'aceous': (1, 6, None),
247
            'aining': (24, 3, None),
248
            'acting': (25, 3, None),
249
            'viding': (27, 3, 'e'),
250
            'ulting': (38, 3, None),
251
            'eading': (40.7, 3, None),
252
            'oading': (40.6, 3, None),
253 1
            'ealing': (42.4, 3, None),
254
            'ailing': (42.2, 3, None),
255
        },
256
        5: {
257
            'iases': (11.4, 2, None),
258
            'ained': (13.6, 2, None),
259
            'erned': (13.5, 2, None),
260
            'ifted': (14, 2, None),
261
            'ected': (15, 2, None),
262
            'vided': (16, 1, None),
263
            'erred': (19, 3, None),
264
            'urred': (20.5, 3, None),
265
            'lored': (20.4, 2, None),
266
            'eared': (20.3, 2, None),
267
            'tored': (20.2, 1, None),
268
            'noted': (22.4, 1, None),
269
            'leted': (22.3, 1, None),
270
            'anges': (23, 1, None),
271
            'tting': (26, 4, None),
272
            'ulted': (32, 2, None),
273
            'uming': (33, 3, 'e'),
274
            'rabed': (36.1, 1, None),
275
            'rebed': (36.1, 1, None),
276
            'ribed': (36.1, 1, None),
277
            'robed': (36.1, 1, None),
278
            'rubed': (36.1, 1, None),
279
            'ssing': (37, 3, None),
280
            'eding': (40.5, 3, None),
281
            'dding': (40.4, 4, None),
282
            'lding': (40.3, 3, None),
283
            'rding': (40.2, 3, None),
284
            'nding': (40.1, 3, None),
285
            'lling': (41, 4, None),
286
            'oling': (42.3, 3, None),
287
            'eling': (42.1, 3, None),
288
            'mming': (44.3, 4, None),
289
            'rming': (44.2, 3, None),
290
            'lming': (44.1, 3, None),
291
            'nging': (45.2, 3, None),
292
            'gging': (45.1, 4, None),
293
            'aning': (46.6, 3, None),
294
            'ening': (46.5, 3, None),
295
            'gning': (46.4, 3, None),
296
            'nning': (46.3, 4, None),
297
            'oning': (46.2, 3, None),
298
            'rning': (46.1, 3, None),
299
            'sting': (47, 3, None),
300
            'eting': (48.4, 3, None),
301
            'pting': (48.3, 3, None),
302
            'nting': (48.2, 3, None),
303
            'cting': (48.1, 3, None),
304
            'iring': (54.4, 3, 'e'),
305
            'uring': (54.3, 3, 'e'),
306
            'ncing': (54.2, 3, 'e'),
307
            # 'lling': (55, 3, None),  # masked by 41
308
            'ating': (57, 3, 'e'),
309
            'thing': (58.1, 0, None),
310
        },
311
        4: {
312
            'uses': (11.3, 1, None),
313
            'sses': (11.2, 2, None),
314
            'eses': (11.1, 2, 'is'),
315
            'tled': (12.5, 1, None),
316
            'pled': (12.4, 1, None),
317
            'bled': (12.3, 1, None),
318
            'eled': (12.2, 2, None),
319
            'lled': (12.1, 2, None),
320
            'ened': (13.7, 2, None),
321
            'rned': (13.4, 2, None),
322
            'nned': (13.3, 3, None),
323
            'oned': (13.2, 2, None),
324
            'gned': (13.1, 2, None),
325
            'ered': (20.1, 2, None),
326
            'tted': (21, 3, None),
327
            'uted': (22.2, 1, None),
328
            'ated': (22.1, 1, None),
329
            'ssed': (28, 2, None),
330
            'umed': (31, 1, None),
331
            'ving': (39, 3, 'e'),
332
            'ding': (40, 3, 'e'),
333
            'ling': (42, 3, 'e'),
334
            'nged': (43.2, 1, None),
335
            'gged': (43.1, 3, None),
336
            'ming': (44, 3, 'e'),
337
            'ging': (45, 3, 'e'),
338
            'ning': (46, 3, 'e'),
339
            'ting': (48, 3, 'e'),
340
            # 'ssed': (49, 2, None),  # masked by 28
341
            # 'lled': (53, 2, None),  # masked by 12.1
342
            'zing': (54.1, 3, 'e'),
343
            'sing': (54, 3, 'e'),
344
            'lves': (60.1, 3, 'f'),
345
            'aped': (61.3, 1, None),
346
            'uded': (61.2, 1, None),
347
            'oded': (61.1, 1, None),
348
            # 'ated': (61, 1, None),  # masked by 22.1
349
            'ones': (63.6, 1, None),
350
            'izes': (63.5, 1, None),
351
            'ures': (63.4, 1, None),
352
            'ines': (63.3, 1, None),
353
            'ides': (63.2, 1, None),
354
        },
355
        3: {
356
            'ces': (2, 1, None),
357
            'sis': (4, 0, None),
358
            'tis': (5, 0, None),
359
            'eed': (7, 0, None),
360
            'ued': (8, 1, None),
361
            'ues': (9, 1, None),
362
            'ees': (10, 1, None),
363
            'ses': (11, 1, None),
364
            'led': (12, 2, None),
365
            'ned': (13, 1, None),
366
            'ved': (17, 1, None),
367
            'ced': (18, 1, None),
368
            'red': (20, 1, None),
369
            'ted': (22, 2, None),
370
            'sed': (29, 1, None),
371
            'bed': (36, 2, None),
372
            'ged': (43, 1, None),
373
            'les': (50, 1, None),
374
            'tes': (51, 1, None),
375
            'zed': (52, 1, None),
376
            'ied': (56, 3, 'y'),
377
            'ies': (59, 3, 'y'),
378
            'ves': (60, 1, None),
379
            'pes': (63.8, 1, None),
380
            'mes': (63.7, 1, None),
381
            'ges': (63.1, 1, None),
382
            'ous': (65, 0, None),
383
            'ums': (66, 0, None),
384
        },
385
        2: {
386
            'cs': (3, 0, None),
387
            'ss': (6, 0, None),
388
            'es': (63, 2, None),
389
            'is': (64, 2, 'e'),
390
            'us': (67, 0, None),
391
        },
392
    }
393
394
    _adams_rule_table = {
395
        7: {
396
            'titudes': (30, 1, None),
397
            'fulness': (34, 4, None),
398
            'ousness': (35, 4, None),
399
            'eadings': (40.7, 4, None),
400
            'oadings': (40.6, 4, None),
401
            'ealings': (42.4, 4, None),
402
            'ailings': (42.2, 4, None),
403
        },
404
        6: {
405
            'aceous': (1, 6, None),
406
            'aining': (24, 3, None),
407
            'acting': (25, 3, None),
408 1
            'ttings': (26, 5, None),
409
            'viding': (27, 3, 'e'),
410
            'ssings': (37, 4, None),
411
            'ulting': (38, 3, None),
412
            'eading': (40.7, 3, None),
413
            'oading': (40.6, 3, None),
414
            'edings': (40.5, 4, None),
415
            'ddings': (40.4, 5, None),
416
            'ldings': (40.3, 4, None),
417
            'rdings': (40.2, 4, None),
418
            'ndings': (40.1, 4, None),
419
            'llings': (41, 5, None),
420
            'ealing': (42.4, 3, None),
421
            'olings': (42.3, 4, None),
422
            'ailing': (42.2, 3, None),
423
            'elings': (42.1, 4, None),
424
            'mmings': (44.3, 5, None),
425
            'ngings': (45.2, 4, None),
426
            'ggings': (45.1, 5, None),
427
            'stings': (47, 4, None),
428
            'etings': (48.4, 4, None),
429
            'ntings': (48.2, 4, None),
430
            'irings': (54.4, 4, 'e'),
431
            'urings': (54.3, 4, 'e'),
432
            'ncings': (54.2, 4, 'e'),
433
            'things': (58.1, 1, None),
434
            'chited': (22.8, 1, None),
435
        },
436
        5: {
437
            'iases': (11.4, 2, None),
438
            'ained': (13.6, 2, None),
439
            'erned': (13.5, 2, None),
440
            'ifted': (14, 2, None),
441
            'ected': (15, 2, None),
442
            # 'vided': (16, 1, None),
443
            'erred': (19, 3, None),
444
            'urred': (20.5, 3, None),
445
            'lored': (20.4, 2, None),
446
            'eared': (20.3, 2, None),
447
            'tored': (20.2, 1, None),
448
            'noted': (22.4, 1, None),
449
            'leted': (22.3, 1, None),
450
            'anges': (23, 1, None),
451
            'tting': (26, 4, None),
452
            'ulted': (32, 2, None),
453
            'uming': (33, 3, 'e'),
454
            'rabed': (36.1, 1, None),
455
            'rebed': (36.1, 1, None),
456
            'ribed': (36.1, 1, None),
457
            'robed': (36.1, 1, None),
458
            'rubed': (36.1, 1, None),
459
            'ssing': (37, 3, None),
460
            'vings': (39, 4, 'e'),
461
            'eding': (40.5, 3, None),
462
            'dding': (40.4, 4, None),
463
            'lding': (40.3, 3, None),
464
            'rding': (40.2, 3, None),
465
            'nding': (40.1, 3, None),
466
            'dings': (40, 4, 'e'),
467
            'lling': (41, 4, None),
468
            'oling': (42.3, 3, None),
469
            'eling': (42.1, 3, None),
470
            'lings': (42, 4, 'e'),
471
            'mming': (44.3, 4, None),
472
            'rming': (44.2, 3, None),
473
            'lming': (44.1, 3, None),
474
            'mings': (44, 4, 'e'),
475
            'nging': (45.2, 3, None),
476
            'gging': (45.1, 4, None),
477
            'gings': (45, 4, 'e'),
478
            'aning': (46.6, 3, None),
479
            'ening': (46.5, 3, None),
480
            'gning': (46.4, 3, None),
481
            'nning': (46.3, 4, None),
482
            'oning': (46.2, 3, None),
483
            'rning': (46.1, 3, None),
484
            'sting': (47, 3, None),
485
            'eting': (48.4, 3, None),
486
            'pting': (48.3, 3, None),
487
            'nting': (48.2, 3, None),
488
            'cting': (48.1, 3, None),
489
            'tings': (48, 4, 'e'),
490
            'iring': (54.4, 3, 'e'),
491
            'uring': (54.3, 3, 'e'),
492
            'ncing': (54.2, 3, 'e'),
493
            'sings': (54, 4, 'e'),
494
            # 'lling': (55, 3, None),  # masked by 41
495
            'ating': (57, 3, 'e'),
496
            'thing': (58.1, 0, None),
497
            'dying': (58.2, 4, 'ie'),
498
            'tying': (58.2, 4, 'ie'),
499
            'vited': (22.6, 1, None),
500
            'mited': (22.5, 1, None),
501
            'vided': (22.9, 1, None),
502
            'mided': (22.10, 1, None),
503
            'lying': (58.2, 4, 'ie'),
504
            'arred': (19.1, 3, None),
505
        },
506
        4: {
507
            'eeds': (7, 1, None),
508
            'uses': (11.3, 1, None),
509
            'sses': (11.2, 2, None),
510
            'eses': (11.1, 2, 'is'),
511
            'tled': (12.5, 1, None),
512
            'pled': (12.4, 1, None),
513
            'bled': (12.3, 1, None),
514
            'eled': (12.2, 2, None),
515
            'lled': (12.1, 2, None),
516
            'ened': (13.7, 2, None),
517
            'rned': (13.4, 2, None),
518
            'nned': (13.3, 3, None),
519
            'oned': (13.2, 2, None),
520
            'gned': (13.1, 2, None),
521
            'ered': (20.1, 2, None),
522
            'reds': (20, 2, None),
523
            'tted': (21, 3, None),
524
            'uted': (22.2, 1, None),
525
            'ated': (22.1, 1, None),
526
            'ssed': (28, 2, None),
527
            'umed': (31, 1, None),
528
            'beds': (36, 3, None),
529
            'ving': (39, 3, 'e'),
530
            'ding': (40, 3, 'e'),
531
            'ling': (42, 3, 'e'),
532
            'nged': (43.2, 1, None),
533
            'gged': (43.1, 3, None),
534
            'ming': (44, 3, 'e'),
535
            'ging': (45, 3, 'e'),
536
            'ning': (46, 3, 'e'),
537
            'ting': (48, 3, 'e'),
538
            # 'ssed': (49, 2, None),  # masked by 28
539
            # 'lled': (53, 2, None),  # masked by 12.1
540
            'zing': (54.1, 3, 'e'),
541
            'sing': (54, 3, 'e'),
542
            'lves': (60.1, 3, 'f'),
543
            'aped': (61.3, 1, None),
544
            'uded': (61.2, 1, None),
545
            'oded': (61.1, 1, None),
546
            # 'ated': (61, 1, None),  # masked by 22.1
547
            'ones': (63.6, 1, None),
548
            'izes': (63.5, 1, None),
549
            'ures': (63.4, 1, None),
550
            'ines': (63.3, 1, None),
551
            'ides': (63.2, 1, None),
552
            'ited': (22.7, 2, None),
553
            'oked': (31.1, 1, None),
554
            'aked': (31.1, 1, None),
555
            'iked': (31.1, 1, None),
556
            'uked': (31.1, 1, None),
557
            'amed': (31, 1, None),
558
            'imed': (31, 1, None),
559
            'does': (31.2, 2, None),
560
        },
561
        3: {
562
            'ces': (2, 1, None),
563
            'sis': (4, 0, None),
564
            'tis': (5, 0, None),
565
            'eed': (7, 0, None),
566
            'ued': (8, 1, None),
567
            'ues': (9, 1, None),
568
            'ees': (10, 1, None),
569
            'ses': (11, 1, None),
570
            'led': (12, 2, None),
571
            'ned': (13, 1, None),
572
            'ved': (17, 1, None),
573
            'ced': (18, 1, None),
574
            'red': (20, 1, None),
575
            'ted': (22, 2, None),
576
            'sed': (29, 1, None),
577
            'bed': (36, 2, None),
578
            'ged': (43, 1, None),
579
            'les': (50, 1, None),
580
            'tes': (51, 1, None),
581
            'zed': (52, 1, None),
582
            'ied': (56, 3, 'y'),
583
            'ies': (59, 3, 'y'),
584
            'ves': (60, 1, None),
585
            'pes': (63.8, 1, None),
586
            'mes': (63.7, 1, None),
587
            'ges': (63.1, 1, None),
588
            'ous': (65, 0, None),
589
            'ums': (66, 0, None),
590
            'oed': (31.3, 1, None),
591
            'oes': (31.2, 1, None),
592
            'kes': (63.1, 1, None),
593
            'des': (63.10, 1, None),
594
            'res': (63.9, 1, None),
595
        },
596
        2: {
597
            'cs': (3, 0, None),
598
            'ss': (6, 0, None),
599
            'es': (63, 2, None),
600
            'is': (64, 2, 'e'),
601
            'us': (67, 0, None),
602
        },
603
    }
604
605
    _rules = {
606
        'standard': _standard_rule_table,
607
        'Adams': _adams_rule_table,
608
        'Perl': _perl_rule_table,
609
    }
610
611
    def __init__(
612
        self,
613
        max_word_length=20,
614
        max_acro_length=8,
615
        return_rule_no=False,
616
        var='standard',
617
    ):
618
        """Initialize UEALite instance.
619 1
620
        Parameters
621
        ----------
622
        max_word_length : int
623
            The maximum word length allowed
624
        max_acro_length : int
625 1
            The maximum acronym length allowed
626
        return_rule_no : bool
627
            If True, returns the stem along with rule number
628
        var : str
629
            Variant rules to use:
630
631
                - ``standard`` to use the original (Java-version) rules
632
                - ``Adams`` to use Jason Adams' rules
633
                - ``Perl`` to use the original Perl rules
634
635
636
        .. versionadded:: 0.4.0
637
638
        """
639
        self._max_word_length = max_word_length
640
        self._max_acro_length = max_acro_length
641
        self._return_rule_no = return_rule_no
642
        self._var = var
643
644
    def stem(self, word):
645
        """Return UEA-Lite stem.
646
647
        Parameters
648
        ----------
649
        word : str
650
            The word to stem
651
652
        Returns
653 1
        -------
654 1
        str or (str, int)
655 1
            Word stem
656 1
657
        Examples
658 1
        --------
659
        >>> stmr = UEALite()
660
        >>> stmr.stem('readings')
661
        'read'
662
        >>> stmr.stem('insulted')
663
        'insult'
664
        >>> stmr.stem('cussed')
665
        'cuss'
666
        >>> stmr.stem('fancies')
667
        'fancy'
668
        >>> stmr.stem('eroded')
669
        'erode'
670
671
672
        .. versionadded:: 0.1.0
673
        .. versionchanged:: 0.3.6
674
            Encapsulated in class
675
676
        """
677
678
        def _stem_with_duplicate_character_check(word, del_len):
679
            if word[-1] == 's':
680
                del_len += 1
681
            stemmed_word = word[:-del_len]
682
            if re_match(r'.*(\w)\1$', stemmed_word):
683
                stemmed_word = stemmed_word[:-1]
684
            return stemmed_word
685
686
        def _stem(word):
687
            stemmed_word = word
688
            rule_no = 0
689
690
            if not word:
691 1
                return word, 0
692 1
            if word in self._problem_words or (
693 1
                word == 'menses' and self._var == 'Adams'
694 1
            ):
695 1
                return word, 90
696 1
            if self._max_word_length and len(word) > self._max_word_length:
697 1
                return word, 95
698
699 1
            if "'" in word:
700 1
                if word[-2:] in {"'s", "'S"}:
701 1
                    stemmed_word = word[:-2]
702
                if word[-1:] == "'":
703 1
                    stemmed_word = word[:-1]
704 1
                stemmed_word = stemmed_word.replace("n't", 'not')
705 1
                stemmed_word = stemmed_word.replace("'ve", 'have')
706
                stemmed_word = stemmed_word.replace("'re", 'are')
707
                stemmed_word = stemmed_word.replace("'m", 'am')
708 1
                return stemmed_word, 94
709 1
710 1
            if word.isdigit():
711
                return word, 90.3
712 1
            else:
713 1
                hyphen = word.find('-')
714 1
                if len(word) > hyphen > 0:
715 1
                    if (
716 1
                        word[:hyphen].isalpha()
717 1
                        and word[hyphen + 1 :].isalpha()
718 1
                    ):
719 1
                        return word, 90.2
720 1
                    else:
721 1
                        return word, 90.1
722
                elif '_' in word:
723 1
                    return word, 90
724 1
                elif word[-1] == 's' and word[:-1].isupper():
725
                    if (
726 1
                        self._var == 'Adams'
727 1
                        and len(word) - 1 > self._max_acro_length
728 1
                    ):
729
                        return word, 96
730
                    return word[:-1], 91.1
731
                elif word.isupper():
732 1
                    if (
733
                        self._var == 'Adams'
734 1
                        and len(word) > self._max_acro_length
735 1
                    ):
736 1
                        return word, 96
737 1
                    return word, 91
738 1
                elif re_match(r'^.*[A-Z].*[A-Z].*$', word):
739
                    return word, 92
740
                elif word[0].isupper():
741
                    return word, 93
742 1
                elif self._var == 'Adams' and re_match(
743 1
                    r'^[a-z](|[rl])(ing|ed)$', word
744 1
                ):
745 1
                    return word, 97
746
747
            for n in range(7, 1, -1):
748
                if word[-n:] in self._rules[self._var][n]:
749 1
                    rule_no, del_len, add_str = self._rules[self._var][n][
750 1
                        word[-n:]
751 1
                    ]
752 1
                    if del_len:
753 1
                        stemmed_word = word[:-del_len]
754 1
                    else:
755 1
                        stemmed_word = word
756
                    if add_str:
757
                        stemmed_word += add_str
758 1
                    break
759
760 1
            if not rule_no:
761 1
                if re_match(r'.*\w\wings?$', word):  # rule 58
762 1
                    stemmed_word = _stem_with_duplicate_character_check(
763
                        word, 3
764
                    )
765 1
                    rule_no = 58
766 1
                elif re_match(r'.*\w\weds?$', word):  # rule 62
767
                    stemmed_word = _stem_with_duplicate_character_check(
768 1
                        word, 2
769 1
                    )
770 1
                    rule_no = 62
771 1
                elif word[-1] == 's':  # rule 68
772
                    stemmed_word = word[:-1]
773 1
                    rule_no = 68
774 1
775 1
            return stemmed_word, rule_no
776
777
        stem, rule_no = _stem(word)
778 1
        if self._return_rule_no:
779 1
            return stem, rule_no
780 1
        return stem
781
782
783 1
if __name__ == '__main__':
    # Run the docstring examples in this module when executed as a script.
    from doctest import testmod

    testmod()
787