abydos.stemmer._uea_lite   A
last analyzed

Complexity

Total Complexity 38

Size/Duplication

Total Lines 820
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
wmc 38
eloc 627
dl 0
loc 820
ccs 95
cts 95
cp 1
rs 9.333
c 0
b 0
f 0

3 Methods

Rating   Name   Duplication   Size   Complexity  
A UEALite.__init__() 0 28 1
F UEALite._stem_and_rule() 0 136 36
A UEALite.stem() 0 36 1
1
# Copyright 2018-2020 by Christopher C. Little.
2
# This file is part of Abydos.
3
#
4
# Abydos is free software: you can redistribute it and/or modify
5
# it under the terms of the GNU General Public License as published by
6
# the Free Software Foundation, either version 3 of the License, or
7
# (at your option) any later version.
8
#
9
# Abydos is distributed in the hope that it will be useful,
10
# but WITHOUT ANY WARRANTY; without even the implied warranty of
11
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
# GNU General Public License for more details.
13
#
14
# You should have received a copy of the GNU General Public License
15
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17
"""abydos.stemmer._uea_lite.
18
19 1
UEA-Lite stemmer
20
"""
21
22
from re import match as re_match
23
from typing import Dict, Optional, Tuple
24 1
25
from ._stemmer import _Stemmer
26
27
__all__ = ['UEALite']
28
29
30
class UEALite(_Stemmer):
31 1
    """UEA-Lite stemmer.
32
33 1
    The UEA-Lite stemmer is discussed in :cite:`Jenkins:2005`.
34
35 1
    This is chiefly based on the Java implementation of the algorithm, with
36
    variants based on the Perl implementation and Jason Adams' Ruby port.
37 1
38 1
    Java version: :cite:`Churchill:2005`
39
    Perl version: :cite:`Jenkins:2005`
40 1
    Ruby version: :cite:`Adams:2017`
41
42
    .. versionadded:: 0.3.6
43 1
    """
44
45
    _problem_words = {'is', 'as', 'this', 'has', 'was', 'during'}
46
47
    # rule table format:
48
    # top-level dictionary: length-of-suffix: dict-of-rules
49
    # dict-of-rules: suffix: (rule_no, suffix_length_to_delete,
50
    #                         suffix_to_append)
51
    _standard_rule_table = {
52
        7: {
53
            'titudes': (30, 1, None),
54
            'fulness': (34, 4, None),
55
            'ousness': (35, 4, None),
56
            'eadings': (40.7, 4, None),
57
            'oadings': (40.6, 4, None),
58 1
            'ealings': (42.4, 4, None),
59
            'ailings': (42.2, 4, None),
60
        },
61
        6: {
62
            'aceous': (1, 6, None),
63
            'aining': (24, 3, None),
64 1
            'acting': (25, 3, None),
65
            'ttings': (26, 5, None),
66
            'viding': (27, 3, 'e'),
67
            'ssings': (37, 4, None),
68
            'ulting': (38, 3, None),
69
            'eading': (40.7, 3, None),
70
            'oading': (40.6, 3, None),
71
            'edings': (40.5, 4, None),
72
            'ddings': (40.4, 5, None),
73
            'ldings': (40.3, 4, None),
74
            'rdings': (40.2, 4, None),
75
            'ndings': (40.1, 4, None),
76
            'llings': (41, 5, None),
77
            'ealing': (42.4, 3, None),
78
            'olings': (42.3, 4, None),
79
            'ailing': (42.2, 3, None),
80
            'elings': (42.1, 4, None),
81
            'mmings': (44.3, 5, None),
82
            'ngings': (45.2, 4, None),
83
            'ggings': (45.1, 5, None),
84
            'stings': (47, 4, None),
85
            'etings': (48.4, 4, None),
86
            'ntings': (48.2, 4, None),
87
            'irings': (54.4, 4, 'e'),
88
            'urings': (54.3, 4, 'e'),
89
            'ncings': (54.2, 4, 'e'),
90
            'things': (58.1, 1, None),
91
        },
92
        5: {
93
            'iases': (11.4, 2, None),
94
            'ained': (13.6, 2, None),
95
            'erned': (13.5, 2, None),
96
            'ifted': (14, 2, None),
97
            'ected': (15, 2, None),
98
            'vided': (16, 1, None),
99
            'erred': (19, 3, None),
100
            'urred': (20.5, 3, None),
101
            'lored': (20.4, 2, None),
102
            'eared': (20.3, 2, None),
103
            'tored': (20.2, 1, None),
104
            'noted': (22.4, 1, None),
105
            'leted': (22.3, 1, None),
106
            'anges': (23, 1, None),
107
            'tting': (26, 4, None),
108
            'ulted': (32, 2, None),
109
            'uming': (33, 3, 'e'),
110
            'rabed': (36.1, 1, None),
111
            'rebed': (36.1, 1, None),
112
            'ribed': (36.1, 1, None),
113
            'robed': (36.1, 1, None),
114
            'rubed': (36.1, 1, None),
115
            'ssing': (37, 3, None),
116
            'vings': (39, 4, 'e'),
117
            'eding': (40.5, 3, None),
118
            'dding': (40.4, 4, None),
119
            'lding': (40.3, 3, None),
120
            'rding': (40.2, 3, None),
121
            'nding': (40.1, 3, None),
122
            'dings': (40, 4, 'e'),
123
            'lling': (41, 4, None),
124
            'oling': (42.3, 3, None),
125
            'eling': (42.1, 3, None),
126
            'lings': (42, 4, 'e'),
127
            'mming': (44.3, 4, None),
128
            'rming': (44.2, 3, None),
129
            'lming': (44.1, 3, None),
130
            'mings': (44, 4, 'e'),
131
            'nging': (45.2, 3, None),
132
            'gging': (45.1, 4, None),
133
            'gings': (45, 4, 'e'),
134
            'aning': (46.6, 3, None),
135
            'ening': (46.5, 3, None),
136
            'gning': (46.4, 3, None),
137
            'nning': (46.3, 4, None),
138
            'oning': (46.2, 3, None),
139
            'rning': (46.1, 3, None),
140
            'sting': (47, 3, None),
141
            'eting': (48.4, 3, None),
142
            'pting': (48.3, 3, None),
143
            'nting': (48.2, 3, None),
144
            'cting': (48.1, 3, None),
145
            'tings': (48, 4, 'e'),
146
            'iring': (54.4, 3, 'e'),
147
            'uring': (54.3, 3, 'e'),
148
            'ncing': (54.2, 3, 'e'),
149
            'sings': (54, 4, 'e'),
150
            # 'lling': (55, 3, None),  # masked by 41
151
            'ating': (57, 3, 'e'),
152
            'thing': (58.1, 0, None),
153
        },
154
        4: {
155
            'eeds': (7, 1, None),
156
            'uses': (11.3, 1, None),
157
            'sses': (11.2, 2, None),
158
            'eses': (11.1, 2, 'is'),
159
            'tled': (12.5, 1, None),
160
            'pled': (12.4, 1, None),
161
            'bled': (12.3, 1, None),
162
            'eled': (12.2, 2, None),
163
            'lled': (12.1, 2, None),
164
            'ened': (13.7, 2, None),
165
            'rned': (13.4, 2, None),
166
            'nned': (13.3, 3, None),
167
            'oned': (13.2, 2, None),
168
            'gned': (13.1, 2, None),
169
            'ered': (20.1, 2, None),
170
            'reds': (20, 2, None),
171
            'tted': (21, 3, None),
172
            'uted': (22.2, 1, None),
173
            'ated': (22.1, 1, None),
174
            'ssed': (28, 2, None),
175
            'umed': (31, 1, None),
176
            'beds': (36, 3, None),
177
            'ving': (39, 3, 'e'),
178
            'ding': (40, 3, 'e'),
179
            'ling': (42, 3, 'e'),
180
            'nged': (43.2, 1, None),
181
            'gged': (43.1, 3, None),
182
            'ming': (44, 3, 'e'),
183
            'ging': (45, 3, 'e'),
184
            'ning': (46, 3, 'e'),
185
            'ting': (48, 3, 'e'),
186
            # 'ssed': (49, 2, None),  # masked by 28
187
            # 'lled': (53, 2, None),  # masked by 12.1
188
            'zing': (54.1, 3, 'e'),
189
            'sing': (54, 3, 'e'),
190
            'lves': (60.1, 3, 'f'),
191
            'aped': (61.3, 1, None),
192
            'uded': (61.2, 1, None),
193
            'oded': (61.1, 1, None),
194
            # 'ated': (61, 1, None),  # masked by 22.1
195
            'ones': (63.6, 1, None),
196
            'izes': (63.5, 1, None),
197
            'ures': (63.4, 1, None),
198
            'ines': (63.3, 1, None),
199
            'ides': (63.2, 1, None),
200
        },
201
        3: {
202
            'ces': (2, 1, None),
203
            'sis': (4, 0, None),
204
            'tis': (5, 0, None),
205
            'eed': (7, 0, None),
206
            'ued': (8, 1, None),
207
            'ues': (9, 1, None),
208
            'ees': (10, 1, None),
209
            'ses': (11, 1, None),
210
            'led': (12, 2, None),
211
            'ned': (13, 1, None),
212
            'ved': (17, 1, None),
213
            'ced': (18, 1, None),
214
            'red': (20, 1, None),
215
            'ted': (22, 2, None),
216
            'sed': (29, 1, None),
217
            'bed': (36, 2, None),
218
            'ged': (43, 1, None),
219
            'les': (50, 1, None),
220
            'tes': (51, 1, None),
221
            'zed': (52, 1, None),
222
            'ied': (56, 3, 'y'),
223
            'ies': (59, 3, 'y'),
224
            'ves': (60, 1, None),
225
            'pes': (63.8, 1, None),
226
            'mes': (63.7, 1, None),
227
            'ges': (63.1, 1, None),
228
            'ous': (65, 0, None),
229
            'ums': (66, 0, None),
230
        },
231
        2: {
232
            'cs': (3, 0, None),
233
            'ss': (6, 0, None),
234
            'es': (63, 2, None),
235
            'is': (64, 2, 'e'),
236
            'us': (67, 0, None),
237
        },
238
    }  # type: Dict[int, Dict[str, Tuple[float, int, Optional[str]]]]
239
240
    _perl_rule_table = {
241
        7: {
242
            'titudes': (30, 1, None),
243
            'fulness': (34, 4, None),
244
            'ousness': (35, 4, None),
245
        },
246
        6: {
247
            'aceous': (1, 6, None),
248
            'aining': (24, 3, None),
249
            'acting': (25, 3, None),
250
            'viding': (27, 3, 'e'),
251
            'ulting': (38, 3, None),
252
            'eading': (40.7, 3, None),
253 1
            'oading': (40.6, 3, None),
254
            'ealing': (42.4, 3, None),
255
            'ailing': (42.2, 3, None),
256
        },
257
        5: {
258
            'iases': (11.4, 2, None),
259
            'ained': (13.6, 2, None),
260
            'erned': (13.5, 2, None),
261
            'ifted': (14, 2, None),
262
            'ected': (15, 2, None),
263
            'vided': (16, 1, None),
264
            'erred': (19, 3, None),
265
            'urred': (20.5, 3, None),
266
            'lored': (20.4, 2, None),
267
            'eared': (20.3, 2, None),
268
            'tored': (20.2, 1, None),
269
            'noted': (22.4, 1, None),
270
            'leted': (22.3, 1, None),
271
            'anges': (23, 1, None),
272
            'tting': (26, 4, None),
273
            'ulted': (32, 2, None),
274
            'uming': (33, 3, 'e'),
275
            'rabed': (36.1, 1, None),
276
            'rebed': (36.1, 1, None),
277
            'ribed': (36.1, 1, None),
278
            'robed': (36.1, 1, None),
279
            'rubed': (36.1, 1, None),
280
            'ssing': (37, 3, None),
281
            'eding': (40.5, 3, None),
282
            'dding': (40.4, 4, None),
283
            'lding': (40.3, 3, None),
284
            'rding': (40.2, 3, None),
285
            'nding': (40.1, 3, None),
286
            'lling': (41, 4, None),
287
            'oling': (42.3, 3, None),
288
            'eling': (42.1, 3, None),
289
            'mming': (44.3, 4, None),
290
            'rming': (44.2, 3, None),
291
            'lming': (44.1, 3, None),
292
            'nging': (45.2, 3, None),
293
            'gging': (45.1, 4, None),
294
            'aning': (46.6, 3, None),
295
            'ening': (46.5, 3, None),
296
            'gning': (46.4, 3, None),
297
            'nning': (46.3, 4, None),
298
            'oning': (46.2, 3, None),
299
            'rning': (46.1, 3, None),
300
            'sting': (47, 3, None),
301
            'eting': (48.4, 3, None),
302
            'pting': (48.3, 3, None),
303
            'nting': (48.2, 3, None),
304
            'cting': (48.1, 3, None),
305
            'iring': (54.4, 3, 'e'),
306
            'uring': (54.3, 3, 'e'),
307
            'ncing': (54.2, 3, 'e'),
308
            # 'lling': (55, 3, None),  # masked by 41
309
            'ating': (57, 3, 'e'),
310
            'thing': (58.1, 0, None),
311
        },
312
        4: {
313
            'uses': (11.3, 1, None),
314
            'sses': (11.2, 2, None),
315
            'eses': (11.1, 2, 'is'),
316
            'tled': (12.5, 1, None),
317
            'pled': (12.4, 1, None),
318
            'bled': (12.3, 1, None),
319
            'eled': (12.2, 2, None),
320
            'lled': (12.1, 2, None),
321
            'ened': (13.7, 2, None),
322
            'rned': (13.4, 2, None),
323
            'nned': (13.3, 3, None),
324
            'oned': (13.2, 2, None),
325
            'gned': (13.1, 2, None),
326
            'ered': (20.1, 2, None),
327
            'tted': (21, 3, None),
328
            'uted': (22.2, 1, None),
329
            'ated': (22.1, 1, None),
330
            'ssed': (28, 2, None),
331
            'umed': (31, 1, None),
332
            'ving': (39, 3, 'e'),
333
            'ding': (40, 3, 'e'),
334
            'ling': (42, 3, 'e'),
335
            'nged': (43.2, 1, None),
336
            'gged': (43.1, 3, None),
337
            'ming': (44, 3, 'e'),
338
            'ging': (45, 3, 'e'),
339
            'ning': (46, 3, 'e'),
340
            'ting': (48, 3, 'e'),
341
            # 'ssed': (49, 2, None),  # masked by 28
342
            # 'lled': (53, 2, None),  # masked by 12.1
343
            'zing': (54.1, 3, 'e'),
344
            'sing': (54, 3, 'e'),
345
            'lves': (60.1, 3, 'f'),
346
            'aped': (61.3, 1, None),
347
            'uded': (61.2, 1, None),
348
            'oded': (61.1, 1, None),
349
            # 'ated': (61, 1, None),  # masked by 22.1
350
            'ones': (63.6, 1, None),
351
            'izes': (63.5, 1, None),
352
            'ures': (63.4, 1, None),
353
            'ines': (63.3, 1, None),
354
            'ides': (63.2, 1, None),
355
        },
356
        3: {
357
            'ces': (2, 1, None),
358
            'sis': (4, 0, None),
359
            'tis': (5, 0, None),
360
            'eed': (7, 0, None),
361
            'ued': (8, 1, None),
362
            'ues': (9, 1, None),
363
            'ees': (10, 1, None),
364
            'ses': (11, 1, None),
365
            'led': (12, 2, None),
366
            'ned': (13, 1, None),
367
            'ved': (17, 1, None),
368
            'ced': (18, 1, None),
369
            'red': (20, 1, None),
370
            'ted': (22, 2, None),
371
            'sed': (29, 1, None),
372
            'bed': (36, 2, None),
373
            'ged': (43, 1, None),
374
            'les': (50, 1, None),
375
            'tes': (51, 1, None),
376
            'zed': (52, 1, None),
377
            'ied': (56, 3, 'y'),
378
            'ies': (59, 3, 'y'),
379
            'ves': (60, 1, None),
380
            'pes': (63.8, 1, None),
381
            'mes': (63.7, 1, None),
382
            'ges': (63.1, 1, None),
383
            'ous': (65, 0, None),
384
            'ums': (66, 0, None),
385
        },
386
        2: {
387
            'cs': (3, 0, None),
388
            'ss': (6, 0, None),
389
            'es': (63, 2, None),
390
            'is': (64, 2, 'e'),
391
            'us': (67, 0, None),
392
        },
393
    }  # type: Dict[int, Dict[str, Tuple[float, int, Optional[str]]]]
394
395
    _adams_rule_table = {
396
        7: {
397
            'titudes': (30, 1, None),
398
            'fulness': (34, 4, None),
399
            'ousness': (35, 4, None),
400
            'eadings': (40.7, 4, None),
401
            'oadings': (40.6, 4, None),
402
            'ealings': (42.4, 4, None),
403
            'ailings': (42.2, 4, None),
404
        },
405
        6: {
406
            'aceous': (1, 6, None),
407
            'aining': (24, 3, None),
408 1
            'acting': (25, 3, None),
409
            'ttings': (26, 5, None),
410
            'viding': (27, 3, 'e'),
411
            'ssings': (37, 4, None),
412
            'ulting': (38, 3, None),
413
            'eading': (40.7, 3, None),
414
            'oading': (40.6, 3, None),
415
            'edings': (40.5, 4, None),
416
            'ddings': (40.4, 5, None),
417
            'ldings': (40.3, 4, None),
418
            'rdings': (40.2, 4, None),
419
            'ndings': (40.1, 4, None),
420
            'llings': (41, 5, None),
421
            'ealing': (42.4, 3, None),
422
            'olings': (42.3, 4, None),
423
            'ailing': (42.2, 3, None),
424
            'elings': (42.1, 4, None),
425
            'mmings': (44.3, 5, None),
426
            'ngings': (45.2, 4, None),
427
            'ggings': (45.1, 5, None),
428
            'stings': (47, 4, None),
429
            'etings': (48.4, 4, None),
430
            'ntings': (48.2, 4, None),
431
            'irings': (54.4, 4, 'e'),
432
            'urings': (54.3, 4, 'e'),
433
            'ncings': (54.2, 4, 'e'),
434
            'things': (58.1, 1, None),
435
            'chited': (22.8, 1, None),
436
        },
437
        5: {
438
            'iases': (11.4, 2, None),
439
            'ained': (13.6, 2, None),
440
            'erned': (13.5, 2, None),
441
            'ifted': (14, 2, None),
442
            'ected': (15, 2, None),
443
            # 'vided': (16, 1, None),
444
            'erred': (19, 3, None),
445
            'urred': (20.5, 3, None),
446
            'lored': (20.4, 2, None),
447
            'eared': (20.3, 2, None),
448
            'tored': (20.2, 1, None),
449
            'noted': (22.4, 1, None),
450
            'leted': (22.3, 1, None),
451
            'anges': (23, 1, None),
452
            'tting': (26, 4, None),
453
            'ulted': (32, 2, None),
454
            'uming': (33, 3, 'e'),
455
            'rabed': (36.1, 1, None),
456
            'rebed': (36.1, 1, None),
457
            'ribed': (36.1, 1, None),
458
            'robed': (36.1, 1, None),
459
            'rubed': (36.1, 1, None),
460
            'ssing': (37, 3, None),
461
            'vings': (39, 4, 'e'),
462
            'eding': (40.5, 3, None),
463
            'dding': (40.4, 4, None),
464
            'lding': (40.3, 3, None),
465
            'rding': (40.2, 3, None),
466
            'nding': (40.1, 3, None),
467
            'dings': (40, 4, 'e'),
468
            'lling': (41, 4, None),
469
            'oling': (42.3, 3, None),
470
            'eling': (42.1, 3, None),
471
            'lings': (42, 4, 'e'),
472
            'mming': (44.3, 4, None),
473
            'rming': (44.2, 3, None),
474
            'lming': (44.1, 3, None),
475
            'mings': (44, 4, 'e'),
476
            'nging': (45.2, 3, None),
477
            'gging': (45.1, 4, None),
478
            'gings': (45, 4, 'e'),
479
            'aning': (46.6, 3, None),
480
            'ening': (46.5, 3, None),
481
            'gning': (46.4, 3, None),
482
            'nning': (46.3, 4, None),
483
            'oning': (46.2, 3, None),
484
            'rning': (46.1, 3, None),
485
            'sting': (47, 3, None),
486
            'eting': (48.4, 3, None),
487
            'pting': (48.3, 3, None),
488
            'nting': (48.2, 3, None),
489
            'cting': (48.1, 3, None),
490
            'tings': (48, 4, 'e'),
491
            'iring': (54.4, 3, 'e'),
492
            'uring': (54.3, 3, 'e'),
493
            'ncing': (54.2, 3, 'e'),
494
            'sings': (54, 4, 'e'),
495
            # 'lling': (55, 3, None),  # masked by 41
496
            'ating': (57, 3, 'e'),
497
            'thing': (58.1, 0, None),
498
            'dying': (58.2, 4, 'ie'),
499
            'tying': (58.2, 4, 'ie'),
500
            'vited': (22.6, 1, None),
501
            'mited': (22.5, 1, None),
502
            'vided': (22.9, 1, None),
503
            'mided': (22.10, 1, None),
504
            'lying': (58.2, 4, 'ie'),
505
            'arred': (19.1, 3, None),
506
        },
507
        4: {
508
            'eeds': (7, 1, None),
509
            'uses': (11.3, 1, None),
510
            'sses': (11.2, 2, None),
511
            'eses': (11.1, 2, 'is'),
512
            'tled': (12.5, 1, None),
513
            'pled': (12.4, 1, None),
514
            'bled': (12.3, 1, None),
515
            'eled': (12.2, 2, None),
516
            'lled': (12.1, 2, None),
517
            'ened': (13.7, 2, None),
518
            'rned': (13.4, 2, None),
519
            'nned': (13.3, 3, None),
520
            'oned': (13.2, 2, None),
521
            'gned': (13.1, 2, None),
522
            'ered': (20.1, 2, None),
523
            'reds': (20, 2, None),
524
            'tted': (21, 3, None),
525
            'uted': (22.2, 1, None),
526
            'ated': (22.1, 1, None),
527
            'ssed': (28, 2, None),
528
            'umed': (31, 1, None),
529
            'beds': (36, 3, None),
530
            'ving': (39, 3, 'e'),
531
            'ding': (40, 3, 'e'),
532
            'ling': (42, 3, 'e'),
533
            'nged': (43.2, 1, None),
534
            'gged': (43.1, 3, None),
535
            'ming': (44, 3, 'e'),
536
            'ging': (45, 3, 'e'),
537
            'ning': (46, 3, 'e'),
538
            'ting': (48, 3, 'e'),
539
            # 'ssed': (49, 2, None),  # masked by 28
540
            # 'lled': (53, 2, None),  # masked by 12.1
541
            'zing': (54.1, 3, 'e'),
542
            'sing': (54, 3, 'e'),
543
            'lves': (60.1, 3, 'f'),
544
            'aped': (61.3, 1, None),
545
            'uded': (61.2, 1, None),
546
            'oded': (61.1, 1, None),
547
            # 'ated': (61, 1, None),  # masked by 22.1
548
            'ones': (63.6, 1, None),
549
            'izes': (63.5, 1, None),
550
            'ures': (63.4, 1, None),
551
            'ines': (63.3, 1, None),
552
            'ides': (63.2, 1, None),
553
            'ited': (22.7, 2, None),
554
            'oked': (31.1, 1, None),
555
            'aked': (31.1, 1, None),
556
            'iked': (31.1, 1, None),
557
            'uked': (31.1, 1, None),
558
            'amed': (31, 1, None),
559
            'imed': (31, 1, None),
560
            'does': (31.2, 2, None),
561
        },
562
        3: {
563
            'ces': (2, 1, None),
564
            'sis': (4, 0, None),
565
            'tis': (5, 0, None),
566
            'eed': (7, 0, None),
567
            'ued': (8, 1, None),
568
            'ues': (9, 1, None),
569
            'ees': (10, 1, None),
570
            'ses': (11, 1, None),
571
            'led': (12, 2, None),
572
            'ned': (13, 1, None),
573
            'ved': (17, 1, None),
574
            'ced': (18, 1, None),
575
            'red': (20, 1, None),
576
            'ted': (22, 2, None),
577
            'sed': (29, 1, None),
578
            'bed': (36, 2, None),
579
            'ged': (43, 1, None),
580
            'les': (50, 1, None),
581
            'tes': (51, 1, None),
582
            'zed': (52, 1, None),
583
            'ied': (56, 3, 'y'),
584
            'ies': (59, 3, 'y'),
585
            'ves': (60, 1, None),
586
            'pes': (63.8, 1, None),
587
            'mes': (63.7, 1, None),
588
            'ges': (63.1, 1, None),
589
            'ous': (65, 0, None),
590
            'ums': (66, 0, None),
591
            'oed': (31.3, 1, None),
592
            'oes': (31.2, 1, None),
593
            'kes': (63.1, 1, None),
594
            'des': (63.10, 1, None),
595
            'res': (63.9, 1, None),
596
        },
597
        2: {
598
            'cs': (3, 0, None),
599
            'ss': (6, 0, None),
600
            'es': (63, 2, None),
601
            'is': (64, 2, 'e'),
602
            'us': (67, 0, None),
603
        },
604
    }  # type: Dict[int, Dict[str, Tuple[float, int, Optional[str]]]]
605
606
    _rules = {
607
        'standard': _standard_rule_table,
608
        'Adams': _adams_rule_table,
609
        'Perl': _perl_rule_table,
610
    }  # type: Dict[str, Dict[int, Dict[str, Tuple[float, int, Optional[str]]]]]
611
612
    def __init__(
613
        self,
614
        max_word_length: int = 20,
615
        max_acro_length: int = 8,
616
        var: str = 'standard',
617
    ) -> None:
618
        """Initialize UEALite instance.
619 1
620
        Parameters
621
        ----------
622
        max_word_length : int
623
            The maximum word length allowed
624
        max_acro_length : int
625 1
            The maximum acronym length allowed
626
        var : str
627
            Variant rules to use:
628
629
                - ``standard`` to use the original (Java-version) rules
630
                - ``Adams`` to use Jason Adams' rules
631
                - ``Perl`` to use the original Perl rules
632
633
634
        .. versionadded:: 0.4.0
635
636
        """
637
        self._max_word_length = max_word_length
638
        self._max_acro_length = max_acro_length
639
        self._var = var
640
641
    def stem(self, word: str) -> str:
642
        """Return UEA-Lite stem.
643
644
        Parameters
645
        ----------
646
        word : str
647
            The word to stem
648
649
        Returns
650
        -------
651
        str or (str, int)
652
            Word stem
653 1
654 1
        Examples
655 1
        --------
656 1
        >>> stmr = UEALite()
657
        >>> stmr.stem('readings')
658 1
        'read'
659
        >>> stmr.stem('insulted')
660
        'insult'
661
        >>> stmr.stem('cussed')
662
        'cuss'
663
        >>> stmr.stem('fancies')
664
        'fancy'
665
        >>> stmr.stem('eroded')
666
        'erode'
667
668
669
        .. versionadded:: 0.1.0
670
        .. versionchanged:: 0.3.6
671
            Encapsulated in class
672
        .. versionchanged:: 0.6.0
673
            Made return a str exclusively
674
675
        """
676
        return self._stem_and_rule(word)[0]
677
678
    def _stem_and_rule(self, word: str) -> Tuple[str, float]:
679
        """Return UEA-Lite stem.
680
681
        Parameters
682
        ----------
683
        word : str
684
            The word to stem
685
686
        Returns
687
        -------
688
        (str, float)
689
            Word stem
690
691 1
        Examples
692 1
        --------
693 1
        >>> stmr = UEALite()
694 1
        >>> stmr._stem_and_rule('readings')
695 1
        ('read', 40.7)
696 1
        >>> stmr._stem_and_rule('insulted')
697 1
        ('insult', 32)
698
        >>> stmr._stem_and_rule('cussed')
699 1
        ('cuss', 28)
700 1
        >>> stmr._stem_and_rule('fancies')
701 1
        ('fancy', 59)
702
        >>> stmr._stem_and_rule('eroded')
703 1
        ('erode', 61.1)
704 1
705 1
706
        .. versionadded:: 0.6.0
707
708 1
        """
709 1
710 1
        def _stem_with_duplicate_character_check(
711
            word: str, del_len: int
712 1
        ) -> str:
713 1
            if word[-1] == 's':
714 1
                del_len += 1
715 1
            stemmed_word = word[:-del_len]
716 1
            if re_match(r'.*(\w)\1$', stemmed_word):
717 1
                stemmed_word = stemmed_word[:-1]
718 1
            return stemmed_word
719 1
720 1
        def _stem(word: str) -> Tuple[str, float]:
721 1
            stemmed_word = word
722
            rule_no = 0.0
723 1
724 1
            if not word:
725
                return word, 0
726 1
            if word in self._problem_words or (
727 1
                word == 'menses' and self._var == 'Adams'
728 1
            ):
729
                return word, 90
730
            if self._max_word_length and len(word) > self._max_word_length:
731
                return word, 95
732 1
733
            if "'" in word:
734 1
                if word[-2:] in {"'s", "'S"}:
735 1
                    stemmed_word = word[:-2]
736 1
                if word[-1:] == "'":
737 1
                    stemmed_word = word[:-1]
738 1
                stemmed_word = stemmed_word.replace("n't", 'not')
739
                stemmed_word = stemmed_word.replace("'ve", 'have')
740
                stemmed_word = stemmed_word.replace("'re", 'are')
741
                stemmed_word = stemmed_word.replace("'m", 'am')
742 1
                return stemmed_word, 94
743 1
744 1
            if word.isdigit():
745 1
                return word, 90.3
746
            else:
747
                hyphen = word.find('-')
748
                if len(word) > hyphen > 0:
749 1
                    if (
750 1
                        word[:hyphen].isalpha()
751 1
                        and word[hyphen + 1 :].isalpha()
752 1
                    ):
753 1
                        return word, 90.2
754 1
                    else:
755 1
                        return word, 90.1
756
                elif '_' in word:
757
                    return word, 90
758 1
                elif word[-1] == 's' and word[:-1].isupper():
759
                    if (
760 1
                        self._var == 'Adams'
761 1
                        and len(word) - 1 > self._max_acro_length
762 1
                    ):
763
                        return word, 96
764
                    return word[:-1], 91.1
765 1
                elif word.isupper():
766 1
                    if (
767
                        self._var == 'Adams'
768 1
                        and len(word) > self._max_acro_length
769 1
                    ):
770 1
                        return word, 96
771 1
                    return word, 91
772
                elif re_match(r'^.*[A-Z].*[A-Z].*$', word):
773 1
                    return word, 92
774 1
                elif word[0].isupper():
775 1
                    return word, 93
776
                elif self._var == 'Adams' and re_match(
777
                    r'^[a-z](|[rl])(ing|ed)$', word
778 1
                ):
779 1
                    return word, 97
780 1
781
            for n in range(7, 1, -1):
782
                if word[-n:] in self._rules[self._var][n]:
783 1
                    rule_no, del_len, add_str = self._rules[self._var][n][
784 1
                        word[-n:]
785 1
                    ]
786 1
                    if del_len:
787
                        stemmed_word = word[:-del_len]
788 1
                    else:
789
                        stemmed_word = word
790 1
                    if add_str:
791 1
                        stemmed_word += add_str
792 1
                    break
793 1
794
            if not rule_no:
795
                if re_match(r'.*\w\wings?$', word):  # rule 58
796 1
                    stemmed_word = _stem_with_duplicate_character_check(
797
                        word, 3
798
                    )
799
                    rule_no = 58
800
                elif re_match(r'.*\w\weds?$', word):  # rule 62
801
                    stemmed_word = _stem_with_duplicate_character_check(
802 1
                        word, 2
803
                    )
804
                    rule_no = 62
805
                elif word[-1] == 's':  # rule 68
806
                    stemmed_word = word[:-1]
807
                    rule_no = 68
808
809
            return stemmed_word, rule_no
810
811
        stem, rule_no = _stem(word)
812
813
        return stem, rule_no
814
815
816
# When executed as a script, run the doctest examples embedded in this module.
if __name__ == '__main__':
    from doctest import testmod

    testmod()
820