Completed
Push — master ( f43547...71985b )
by Chris
12:00 queued 10s
created

abydos.stemmer._uea_lite.uealite()   A

Complexity

Conditions 1

Size

Total Lines 48
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 2
CRAP Score 1

Importance

Changes 0
Metric Value
cc 1
eloc 8
nop 5
dl 0
loc 48
ccs 2
cts 2
cp 1
crap 1
rs 10
c 0
b 0
f 0
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.stemmer._uea_lite.
20
21
UEA-Lite stemmer
22
"""
23
24 1
from __future__ import (
25
    absolute_import,
26
    division,
27
    print_function,
28
    unicode_literals,
29
)
30
31 1
from re import match as re_match
32
33 1
from six.moves import range
34
35 1
from ._stemmer import _Stemmer
36
37 1
__all__ = ['UEALite', 'uealite']
38
39
40 1
class UEALite(_Stemmer):
    """UEA-Lite stemmer.

    The UEA-Lite stemmer is discussed in :cite:`Jenkins:2005`.

    This is chiefly based on the Java implementation of the algorithm, with
    variants based on the Perl implementation and Jason Adams' Ruby port.

    Java version: :cite:`Churchill:2005`
    Perl version: :cite:`Jenkins:2005`
    Ruby version: :cite:`Adams:2017`
    """

    # Words that are always returned unchanged (reported as rule 90).
    _problem_words = {'is', 'as', 'this', 'has', 'was', 'during'}

    # rule table format:
    # top-level dictionary: length-of-suffix: dict-of-rules
    # dict-of-rules: suffix: (rule_no, suffix_length_to_delete,
    #                         suffix_to_append)
    #
    # Rule numbers are ints or floats (e.g. 40.7); a deletion length of 0
    # means the word is kept whole, and a suffix_to_append of None means
    # nothing is appended.
    _standard_rule_table = {
        7: {
            'titudes': (30, 1, None),
            'fulness': (34, 4, None),
            'ousness': (35, 4, None),
            'eadings': (40.7, 4, None),
            'oadings': (40.6, 4, None),
            'ealings': (42.4, 4, None),
            'ailings': (42.2, 4, None),
        },
        6: {
            'aceous': (1, 6, None),
            'aining': (24, 3, None),
            'acting': (25, 3, None),
            'ttings': (26, 5, None),
            'viding': (27, 3, 'e'),
            'ssings': (37, 4, None),
            'ulting': (38, 3, None),
            'eading': (40.7, 3, None),
            'oading': (40.6, 3, None),
            'edings': (40.5, 4, None),
            'ddings': (40.4, 5, None),
            'ldings': (40.3, 4, None),
            'rdings': (40.2, 4, None),
            'ndings': (40.1, 4, None),
            'llings': (41, 5, None),
            'ealing': (42.4, 3, None),
            'olings': (42.3, 4, None),
            'ailing': (42.2, 3, None),
            'elings': (42.1, 4, None),
            'mmings': (44.3, 5, None),
            'ngings': (45.2, 4, None),
            'ggings': (45.1, 5, None),
            'stings': (47, 4, None),
            'etings': (48.4, 4, None),
            'ntings': (48.2, 4, None),
            'irings': (54.4, 4, 'e'),
            'urings': (54.3, 4, 'e'),
            'ncings': (54.2, 4, 'e'),
            'things': (58.1, 1, None),
        },
        5: {
            'iases': (11.4, 2, None),
            'ained': (13.6, 2, None),
            'erned': (13.5, 2, None),
            'ifted': (14, 2, None),
            'ected': (15, 2, None),
            'vided': (16, 1, None),
            'erred': (19, 3, None),
            'urred': (20.5, 3, None),
            'lored': (20.4, 2, None),
            'eared': (20.3, 2, None),
            'tored': (20.2, 1, None),
            'noted': (22.4, 1, None),
            'leted': (22.3, 1, None),
            'anges': (23, 1, None),
            'tting': (26, 4, None),
            'ulted': (32, 2, None),
            'uming': (33, 3, 'e'),
            'rabed': (36.1, 1, None),
            'rebed': (36.1, 1, None),
            'ribed': (36.1, 1, None),
            'robed': (36.1, 1, None),
            'rubed': (36.1, 1, None),
            'ssing': (37, 3, None),
            'vings': (39, 4, 'e'),
            'eding': (40.5, 3, None),
            'dding': (40.4, 4, None),
            'lding': (40.3, 3, None),
            'rding': (40.2, 3, None),
            'nding': (40.1, 3, None),
            'dings': (40, 4, 'e'),
            'lling': (41, 4, None),
            'oling': (42.3, 3, None),
            'eling': (42.1, 3, None),
            'lings': (42, 4, 'e'),
            'mming': (44.3, 4, None),
            'rming': (44.2, 3, None),
            'lming': (44.1, 3, None),
            'mings': (44, 4, 'e'),
            'nging': (45.2, 3, None),
            'gging': (45.1, 4, None),
            'gings': (45, 4, 'e'),
            'aning': (46.6, 3, None),
            'ening': (46.5, 3, None),
            'gning': (46.4, 3, None),
            'nning': (46.3, 4, None),
            'oning': (46.2, 3, None),
            'rning': (46.1, 3, None),
            'sting': (47, 3, None),
            'eting': (48.4, 3, None),
            'pting': (48.3, 3, None),
            'nting': (48.2, 3, None),
            'cting': (48.1, 3, None),
            'tings': (48, 4, 'e'),
            'iring': (54.4, 3, 'e'),
            'uring': (54.3, 3, 'e'),
            'ncing': (54.2, 3, 'e'),
            'sings': (54, 4, 'e'),
            # 'lling': (55, 3, None),  # masked by 41
            'ating': (57, 3, 'e'),
            'thing': (58.1, 0, None),
        },
        4: {
            'eeds': (7, 1, None),
            'uses': (11.3, 1, None),
            'sses': (11.2, 2, None),
            'eses': (11.1, 2, 'is'),
            'tled': (12.5, 1, None),
            'pled': (12.4, 1, None),
            'bled': (12.3, 1, None),
            'eled': (12.2, 2, None),
            'lled': (12.1, 2, None),
            'ened': (13.7, 2, None),
            'rned': (13.4, 2, None),
            'nned': (13.3, 3, None),
            'oned': (13.2, 2, None),
            'gned': (13.1, 2, None),
            'ered': (20.1, 2, None),
            'reds': (20, 2, None),
            'tted': (21, 3, None),
            'uted': (22.2, 1, None),
            'ated': (22.1, 1, None),
            'ssed': (28, 2, None),
            'umed': (31, 1, None),
            'beds': (36, 3, None),
            'ving': (39, 3, 'e'),
            'ding': (40, 3, 'e'),
            'ling': (42, 3, 'e'),
            'nged': (43.2, 1, None),
            'gged': (43.1, 3, None),
            'ming': (44, 3, 'e'),
            'ging': (45, 3, 'e'),
            'ning': (46, 3, 'e'),
            'ting': (48, 3, 'e'),
            # 'ssed': (49, 2, None),  # masked by 28
            # 'lled': (53, 2, None),  # masked by 12.1
            'zing': (54.1, 3, 'e'),
            'sing': (54, 3, 'e'),
            'lves': (60.1, 3, 'f'),
            'aped': (61.3, 1, None),
            'uded': (61.2, 1, None),
            'oded': (61.1, 1, None),
            # 'ated': (61, 1, None),  # masked by 22.1
            'ones': (63.6, 1, None),
            'izes': (63.5, 1, None),
            'ures': (63.4, 1, None),
            'ines': (63.3, 1, None),
            'ides': (63.2, 1, None),
        },
        3: {
            'ces': (2, 1, None),
            'sis': (4, 0, None),
            'tis': (5, 0, None),
            'eed': (7, 0, None),
            'ued': (8, 1, None),
            'ues': (9, 1, None),
            'ees': (10, 1, None),
            'ses': (11, 1, None),
            'led': (12, 2, None),
            'ned': (13, 1, None),
            'ved': (17, 1, None),
            'ced': (18, 1, None),
            'red': (20, 1, None),
            'ted': (22, 2, None),
            'sed': (29, 1, None),
            'bed': (36, 2, None),
            'ged': (43, 1, None),
            'les': (50, 1, None),
            'tes': (51, 1, None),
            'zed': (52, 1, None),
            'ied': (56, 3, 'y'),
            'ies': (59, 3, 'y'),
            'ves': (60, 1, None),
            'pes': (63.8, 1, None),
            'mes': (63.7, 1, None),
            'ges': (63.1, 1, None),
            'ous': (65, 0, None),
            'ums': (66, 0, None),
        },
        2: {
            'cs': (3, 0, None),
            'ss': (6, 0, None),
            'es': (63, 2, None),
            'is': (64, 2, 'e'),
            'us': (67, 0, None),
        },
    }

    # Rule table selected by var='Perl' (same format as above).
    _perl_rule_table = {
        7: {
            'titudes': (30, 1, None),
            'fulness': (34, 4, None),
            'ousness': (35, 4, None),
        },
        6: {
            'aceous': (1, 6, None),
            'aining': (24, 3, None),
            'acting': (25, 3, None),
            'viding': (27, 3, 'e'),
            'ulting': (38, 3, None),
            'eading': (40.7, 3, None),
            'oading': (40.6, 3, None),
            'ealing': (42.4, 3, None),
            'ailing': (42.2, 3, None),
        },
        5: {
            'iases': (11.4, 2, None),
            'ained': (13.6, 2, None),
            'erned': (13.5, 2, None),
            'ifted': (14, 2, None),
            'ected': (15, 2, None),
            'vided': (16, 1, None),
            'erred': (19, 3, None),
            'urred': (20.5, 3, None),
            'lored': (20.4, 2, None),
            'eared': (20.3, 2, None),
            'tored': (20.2, 1, None),
            'noted': (22.4, 1, None),
            'leted': (22.3, 1, None),
            'anges': (23, 1, None),
            'tting': (26, 4, None),
            'ulted': (32, 2, None),
            'uming': (33, 3, 'e'),
            'rabed': (36.1, 1, None),
            'rebed': (36.1, 1, None),
            'ribed': (36.1, 1, None),
            'robed': (36.1, 1, None),
            'rubed': (36.1, 1, None),
            'ssing': (37, 3, None),
            'eding': (40.5, 3, None),
            'dding': (40.4, 4, None),
            'lding': (40.3, 3, None),
            'rding': (40.2, 3, None),
            'nding': (40.1, 3, None),
            'lling': (41, 4, None),
            'oling': (42.3, 3, None),
            'eling': (42.1, 3, None),
            'mming': (44.3, 4, None),
            'rming': (44.2, 3, None),
            'lming': (44.1, 3, None),
            'nging': (45.2, 3, None),
            'gging': (45.1, 4, None),
            'aning': (46.6, 3, None),
            'ening': (46.5, 3, None),
            'gning': (46.4, 3, None),
            'nning': (46.3, 4, None),
            'oning': (46.2, 3, None),
            'rning': (46.1, 3, None),
            'sting': (47, 3, None),
            'eting': (48.4, 3, None),
            'pting': (48.3, 3, None),
            'nting': (48.2, 3, None),
            'cting': (48.1, 3, None),
            'iring': (54.4, 3, 'e'),
            'uring': (54.3, 3, 'e'),
            'ncing': (54.2, 3, 'e'),
            # 'lling': (55, 3, None),  # masked by 41
            'ating': (57, 3, 'e'),
            'thing': (58.1, 0, None),
        },
        4: {
            'uses': (11.3, 1, None),
            'sses': (11.2, 2, None),
            'eses': (11.1, 2, 'is'),
            'tled': (12.5, 1, None),
            'pled': (12.4, 1, None),
            'bled': (12.3, 1, None),
            'eled': (12.2, 2, None),
            'lled': (12.1, 2, None),
            'ened': (13.7, 2, None),
            'rned': (13.4, 2, None),
            'nned': (13.3, 3, None),
            'oned': (13.2, 2, None),
            'gned': (13.1, 2, None),
            'ered': (20.1, 2, None),
            'tted': (21, 3, None),
            'uted': (22.2, 1, None),
            'ated': (22.1, 1, None),
            'ssed': (28, 2, None),
            'umed': (31, 1, None),
            'ving': (39, 3, 'e'),
            'ding': (40, 3, 'e'),
            'ling': (42, 3, 'e'),
            'nged': (43.2, 1, None),
            'gged': (43.1, 3, None),
            'ming': (44, 3, 'e'),
            'ging': (45, 3, 'e'),
            'ning': (46, 3, 'e'),
            'ting': (48, 3, 'e'),
            # 'ssed': (49, 2, None),  # masked by 28
            # 'lled': (53, 2, None),  # masked by 12.1
            'zing': (54.1, 3, 'e'),
            'sing': (54, 3, 'e'),
            'lves': (60.1, 3, 'f'),
            'aped': (61.3, 1, None),
            'uded': (61.2, 1, None),
            'oded': (61.1, 1, None),
            # 'ated': (61, 1, None),  # masked by 22.1
            'ones': (63.6, 1, None),
            'izes': (63.5, 1, None),
            'ures': (63.4, 1, None),
            'ines': (63.3, 1, None),
            'ides': (63.2, 1, None),
        },
        3: {
            'ces': (2, 1, None),
            'sis': (4, 0, None),
            'tis': (5, 0, None),
            'eed': (7, 0, None),
            'ued': (8, 1, None),
            'ues': (9, 1, None),
            'ees': (10, 1, None),
            'ses': (11, 1, None),
            'led': (12, 2, None),
            'ned': (13, 1, None),
            'ved': (17, 1, None),
            'ced': (18, 1, None),
            'red': (20, 1, None),
            'ted': (22, 2, None),
            'sed': (29, 1, None),
            'bed': (36, 2, None),
            'ged': (43, 1, None),
            'les': (50, 1, None),
            'tes': (51, 1, None),
            'zed': (52, 1, None),
            'ied': (56, 3, 'y'),
            'ies': (59, 3, 'y'),
            'ves': (60, 1, None),
            'pes': (63.8, 1, None),
            'mes': (63.7, 1, None),
            'ges': (63.1, 1, None),
            'ous': (65, 0, None),
            'ums': (66, 0, None),
        },
        2: {
            'cs': (3, 0, None),
            'ss': (6, 0, None),
            'es': (63, 2, None),
            'is': (64, 2, 'e'),
            'us': (67, 0, None),
        },
    }

    # Rule table selected by var='Adams' (same format as above).
    _adams_rule_table = {
        7: {
            'titudes': (30, 1, None),
            'fulness': (34, 4, None),
            'ousness': (35, 4, None),
            'eadings': (40.7, 4, None),
            'oadings': (40.6, 4, None),
            'ealings': (42.4, 4, None),
            'ailings': (42.2, 4, None),
        },
        6: {
            'aceous': (1, 6, None),
            'aining': (24, 3, None),
            'acting': (25, 3, None),
            'ttings': (26, 5, None),
            'viding': (27, 3, 'e'),
            'ssings': (37, 4, None),
            'ulting': (38, 3, None),
            'eading': (40.7, 3, None),
            'oading': (40.6, 3, None),
            'edings': (40.5, 4, None),
            'ddings': (40.4, 5, None),
            'ldings': (40.3, 4, None),
            'rdings': (40.2, 4, None),
            'ndings': (40.1, 4, None),
            'llings': (41, 5, None),
            'ealing': (42.4, 3, None),
            'olings': (42.3, 4, None),
            'ailing': (42.2, 3, None),
            'elings': (42.1, 4, None),
            'mmings': (44.3, 5, None),
            'ngings': (45.2, 4, None),
            'ggings': (45.1, 5, None),
            'stings': (47, 4, None),
            'etings': (48.4, 4, None),
            'ntings': (48.2, 4, None),
            'irings': (54.4, 4, 'e'),
            'urings': (54.3, 4, 'e'),
            'ncings': (54.2, 4, 'e'),
            'things': (58.1, 1, None),
            'chited': (22.8, 1, None),
        },
        5: {
            'iases': (11.4, 2, None),
            'ained': (13.6, 2, None),
            'erned': (13.5, 2, None),
            'ifted': (14, 2, None),
            'ected': (15, 2, None),
            # 'vided': (16, 1, None),  # this table maps 'vided' to 22.9
            'erred': (19, 3, None),
            'urred': (20.5, 3, None),
            'lored': (20.4, 2, None),
            'eared': (20.3, 2, None),
            'tored': (20.2, 1, None),
            'noted': (22.4, 1, None),
            'leted': (22.3, 1, None),
            'anges': (23, 1, None),
            'tting': (26, 4, None),
            'ulted': (32, 2, None),
            'uming': (33, 3, 'e'),
            'rabed': (36.1, 1, None),
            'rebed': (36.1, 1, None),
            'ribed': (36.1, 1, None),
            'robed': (36.1, 1, None),
            'rubed': (36.1, 1, None),
            'ssing': (37, 3, None),
            'vings': (39, 4, 'e'),
            'eding': (40.5, 3, None),
            'dding': (40.4, 4, None),
            'lding': (40.3, 3, None),
            'rding': (40.2, 3, None),
            'nding': (40.1, 3, None),
            'dings': (40, 4, 'e'),
            'lling': (41, 4, None),
            'oling': (42.3, 3, None),
            'eling': (42.1, 3, None),
            'lings': (42, 4, 'e'),
            'mming': (44.3, 4, None),
            'rming': (44.2, 3, None),
            'lming': (44.1, 3, None),
            'mings': (44, 4, 'e'),
            'nging': (45.2, 3, None),
            'gging': (45.1, 4, None),
            'gings': (45, 4, 'e'),
            'aning': (46.6, 3, None),
            'ening': (46.5, 3, None),
            'gning': (46.4, 3, None),
            'nning': (46.3, 4, None),
            'oning': (46.2, 3, None),
            'rning': (46.1, 3, None),
            'sting': (47, 3, None),
            'eting': (48.4, 3, None),
            'pting': (48.3, 3, None),
            'nting': (48.2, 3, None),
            'cting': (48.1, 3, None),
            'tings': (48, 4, 'e'),
            'iring': (54.4, 3, 'e'),
            'uring': (54.3, 3, 'e'),
            'ncing': (54.2, 3, 'e'),
            'sings': (54, 4, 'e'),
            # 'lling': (55, 3, None),  # masked by 41
            'ating': (57, 3, 'e'),
            'thing': (58.1, 0, None),
            'dying': (58.2, 4, 'ie'),
            'tying': (58.2, 4, 'ie'),
            'vited': (22.6, 1, None),
            'mited': (22.5, 1, None),
            'vided': (22.9, 1, None),
            # NOTE: the literal 22.10 is the float 22.1, so the rule number
            # reported here is the same value as the one for 'ated'.
            'mided': (22.10, 1, None),
            'lying': (58.2, 4, 'ie'),
            'arred': (19.1, 3, None),
        },
        4: {
            'eeds': (7, 1, None),
            'uses': (11.3, 1, None),
            'sses': (11.2, 2, None),
            'eses': (11.1, 2, 'is'),
            'tled': (12.5, 1, None),
            'pled': (12.4, 1, None),
            'bled': (12.3, 1, None),
            'eled': (12.2, 2, None),
            'lled': (12.1, 2, None),
            'ened': (13.7, 2, None),
            'rned': (13.4, 2, None),
            'nned': (13.3, 3, None),
            'oned': (13.2, 2, None),
            'gned': (13.1, 2, None),
            'ered': (20.1, 2, None),
            'reds': (20, 2, None),
            'tted': (21, 3, None),
            'uted': (22.2, 1, None),
            'ated': (22.1, 1, None),
            'ssed': (28, 2, None),
            'umed': (31, 1, None),
            'beds': (36, 3, None),
            'ving': (39, 3, 'e'),
            'ding': (40, 3, 'e'),
            'ling': (42, 3, 'e'),
            'nged': (43.2, 1, None),
            'gged': (43.1, 3, None),
            'ming': (44, 3, 'e'),
            'ging': (45, 3, 'e'),
            'ning': (46, 3, 'e'),
            'ting': (48, 3, 'e'),
            # 'ssed': (49, 2, None),  # masked by 28
            # 'lled': (53, 2, None),  # masked by 12.1
            'zing': (54.1, 3, 'e'),
            'sing': (54, 3, 'e'),
            'lves': (60.1, 3, 'f'),
            'aped': (61.3, 1, None),
            'uded': (61.2, 1, None),
            'oded': (61.1, 1, None),
            # 'ated': (61, 1, None),  # masked by 22.1
            'ones': (63.6, 1, None),
            'izes': (63.5, 1, None),
            'ures': (63.4, 1, None),
            'ines': (63.3, 1, None),
            'ides': (63.2, 1, None),
            'ited': (22.7, 2, None),
            'oked': (31.1, 1, None),
            'aked': (31.1, 1, None),
            'iked': (31.1, 1, None),
            'uked': (31.1, 1, None),
            'amed': (31, 1, None),
            'imed': (31, 1, None),
            'does': (31.2, 2, None),
        },
        3: {
            'ces': (2, 1, None),
            'sis': (4, 0, None),
            'tis': (5, 0, None),
            'eed': (7, 0, None),
            'ued': (8, 1, None),
            'ues': (9, 1, None),
            'ees': (10, 1, None),
            'ses': (11, 1, None),
            'led': (12, 2, None),
            'ned': (13, 1, None),
            'ved': (17, 1, None),
            'ced': (18, 1, None),
            'red': (20, 1, None),
            'ted': (22, 2, None),
            'sed': (29, 1, None),
            'bed': (36, 2, None),
            'ged': (43, 1, None),
            'les': (50, 1, None),
            'tes': (51, 1, None),
            'zed': (52, 1, None),
            'ied': (56, 3, 'y'),
            'ies': (59, 3, 'y'),
            'ves': (60, 1, None),
            'pes': (63.8, 1, None),
            'mes': (63.7, 1, None),
            'ges': (63.1, 1, None),
            'ous': (65, 0, None),
            'ums': (66, 0, None),
            'oed': (31.3, 1, None),
            'oes': (31.2, 1, None),
            'kes': (63.1, 1, None),
            # NOTE: the literal 63.10 is the float 63.1, so the rule number
            # reported here is the same value as for 'ges' and 'kes'.
            'des': (63.10, 1, None),
            'res': (63.9, 1, None),
        },
        2: {
            'cs': (3, 0, None),
            'ss': (6, 0, None),
            'es': (63, 2, None),
            'is': (64, 2, 'e'),
            'us': (67, 0, None),
        },
    }

    # Maps the ``var`` argument of :py:meth:`stem` to its rule table.
    _rules = {
        'standard': _standard_rule_table,
        'Adams': _adams_rule_table,
        'Perl': _perl_rule_table,
    }

    def stem(
        self,
        word,
        max_word_length=20,
        max_acro_length=8,
        return_rule_no=False,
        var='standard',
    ):
        """Return UEA-Lite stem.

        Parameters
        ----------
        word : str
            The word to stem
        max_word_length : int
            The maximum word length allowed; longer words are returned
            unchanged (rule 95). A falsy value disables the check.
        max_acro_length : int
            The maximum acronym length allowed; only consulted when ``var``
            is ``Adams``
        return_rule_no : bool
            If True, returns the stem along with rule number
        var : str
            Variant rules to use:

                - ``standard`` to use the default rules
                - ``Adams`` to use Jason Adams' rules
                - ``Perl`` to use the original Perl rules

        Returns
        -------
        str or (str, int or float)
            Word stem, or a tuple of the stem and the number of the rule
            that produced it (0 if no rule applied) when ``return_rule_no``
            is True

        Raises
        ------
        KeyError
            If ``var`` is not one of the known variants and the word is not
            handled before the suffix tables are consulted

        Examples
        --------
        >>> UEALite().stem('readings')
        'read'
        >>> UEALite().stem('insulted')
        'insult'
        >>> UEALite().stem('cussed')
        'cuss'
        >>> UEALite().stem('fancies')
        'fancy'
        >>> UEALite().stem('eroded')
        'erode'

        """
        stemmed_word, rule_no = self._stem_with_rule(
            word, max_word_length, max_acro_length, var
        )
        if return_rule_no:
            return stemmed_word, rule_no
        return stemmed_word

    @staticmethod
    def _strip_suffix_dedup(word, del_len):
        """Drop a suffix, then collapse a doubled final character.

        Parameters
        ----------
        word : str
            The (non-empty) word to trim
        del_len : int
            Length of the suffix to delete, not counting a trailing 's'

        Returns
        -------
        str
            The trimmed word

        """
        # A plural 's' after the suffix is removed together with it.
        if word[-1] == 's':
            del_len += 1
        trimmed = word[:-del_len]
        # If the remainder ends in the same character twice, drop one.
        if re_match(r'.*(\w)\1$', trimmed):
            trimmed = trimmed[:-1]
        return trimmed

    def _stem_with_rule(self, word, max_word_length, max_acro_length, var):
        """Return the stem of a word and the number of the rule applied.

        Parameters
        ----------
        word : str
            The word to stem
        max_word_length : int
            The maximum word length allowed
        max_acro_length : int
            The maximum acronym length allowed (``Adams`` variant only)
        var : str
            Variant rules to use (a key of ``_rules``)

        Returns
        -------
        (str, int or float)
            The stem and the rule number (0 if no rule applied)

        """
        # --- Words that are returned without consulting the tables -------
        if not word:
            return word, 0
        if word in self._problem_words or (
            word == 'menses' and var == 'Adams'
        ):
            return word, 90
        if max_word_length and len(word) > max_word_length:
            return word, 95

        if "'" in word:
            # Strip a possessive / trailing apostrophe, then expand
            # contractions.
            expanded = word
            if word[-2:] in {"'s", "'S"}:
                expanded = word[:-2]
            if word[-1:] == "'":
                expanded = word[:-1]
            expanded = expanded.replace("n't", 'not')
            expanded = expanded.replace("'ve", 'have')
            expanded = expanded.replace("'re", 'are')
            expanded = expanded.replace("'m", 'am')
            return expanded, 94

        if word.isdigit():
            return word, 90.3

        # str.find returns -1 when absent, so this is true only for a
        # hyphen that is not the first character.
        hyphen = word.find('-')
        if hyphen > 0:
            if word[:hyphen].isalpha() and word[hyphen + 1 :].isalpha():
                return word, 90.2
            return word, 90.1
        if '_' in word:
            return word, 90
        if word[-1] == 's' and word[:-1].isupper():
            # Upper-case word followed by 's': drop the 's'.
            if var == 'Adams' and len(word) - 1 > max_acro_length:
                return word, 96
            return word[:-1], 91.1
        if word.isupper():
            if var == 'Adams' and len(word) > max_acro_length:
                return word, 96
            return word, 91
        if re_match(r'^.*[A-Z].*[A-Z].*$', word):
            return word, 92
        if word[0].isupper():
            return word, 93
        if var == 'Adams' and re_match(r'^[a-z](|[rl])(ing|ed)$', word):
            return word, 97

        # --- Suffix tables, longest suffix first -------------------------
        stemmed_word = word
        rule_no = 0
        rule_table = self._rules[var]
        for suffix_len in range(7, 1, -1):
            suffix = word[-suffix_len:]
            if suffix in rule_table[suffix_len]:
                rule_no, del_len, add_str = rule_table[suffix_len][suffix]
                # word[:-0] would be empty, so only slice for a non-zero
                # deletion length.
                if del_len:
                    stemmed_word = word[:-del_len]
                if add_str:
                    stemmed_word += add_str
                break

        # --- Generic fallbacks when no table entry matched ---------------
        if not rule_no:
            if re_match(r'.*\w\wings?$', word):  # rule 58
                stemmed_word = self._strip_suffix_dedup(word, 3)
                rule_no = 58
            elif re_match(r'.*\w\weds?$', word):  # rule 62
                stemmed_word = self._strip_suffix_dedup(word, 2)
                rule_no = 62
            elif word[-1] == 's':  # rule 68
                stemmed_word = word[:-1]
                rule_no = 68

        return stemmed_word, rule_no
761
762
763 1
def uealite(
    word,
    max_word_length=20,
    max_acro_length=8,
    return_rule_no=False,
    var='standard',
):
    """Return UEA-Lite stem.

    This is a wrapper for :py:meth:`UEALite.stem`: it creates a
    :py:class:`UEALite` instance and forwards every argument to its
    ``stem`` method.

    Parameters
    ----------
    word : str
        The word to stem
    max_word_length : int
        The maximum word length allowed
    max_acro_length : int
        The maximum acronym length allowed
    return_rule_no : bool
        If True, returns the stem along with rule number
    var : str
        Variant rules to use:

            - ``standard`` to use the default rules
            - ``Adams`` to use Jason Adams' rules
            - ``Perl`` to use the original Perl rules

    Returns
    -------
    str or (str, int or float)
        Word stem, or a tuple of the stem and the rule number when
        ``return_rule_no`` is True

    Examples
    --------
    >>> uealite('readings')
    'read'
    >>> uealite('insulted')
    'insult'
    >>> uealite('cussed')
    'cuss'
    >>> uealite('fancies')
    'fancy'
    >>> uealite('eroded')
    'erode'

    """
    return UEALite().stem(
        word,
        max_word_length=max_word_length,
        max_acro_length=max_acro_length,
        return_rule_no=return_rule_no,
        var=var,
    )
812
813
814
if __name__ == '__main__':
    # When executed as a script, run the doctest examples in this module.
    import doctest

    doctest.testmod()
818