Completed
Pull Request — master (#141)
by Chris
13:03
created

abydos.stemmer._uealite   A

Complexity

Total Complexity 38

Size/Duplication

Total Lines 794
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
wmc 38
eloc 628
dl 0
loc 794
ccs 88
cts 88
cp 1
rs 9.3318
c 0
b 0
f 0

1 Function

Rating   Name   Duplication   Size   Complexity  
A uealite() 0 38 1

1 Method

Rating   Name   Duplication   Size   Complexity  
F UEALite.stem() 0 132 37
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.stemmer._uealite.
20
21
The stemmer._uealite module defines the UEA-Lite Stemmer.
22
"""
23
24 1
from __future__ import unicode_literals
25
26 1
from re import match as re_match
27
28 1
from six.moves import range
29
30 1
from ._stemmer import Stemmer
31
32 1
__all__ = ['UEALite', 'uealite']
33
34
35 1
class UEALite(Stemmer):
    """UEA-Lite stemmer.

    The UEA-Lite stemmer is discussed in :cite:`Jenkins:2005`.

    This is chiefly based on the Java implementation of the algorithm, with
    variants based on the Perl implementation and Jason Adams' Ruby port.

    Java version: :cite:`Churchill:2005`
    Perl version: :cite:`Jenkins:2005`
    Ruby version: :cite:`Adams:2017`
    """

    # Words that are always returned unchanged (reported as rule 90).
    _problem_words = {'is', 'as', 'this', 'has', 'was', 'during'}

    # rule table format:
    # top-level dictionary: length-of-suffix: dict-of-rules
    # dict-of-rules: suffix: (rule_no, suffix_length_to_delete,
    #                         suffix_to_append)
    #
    # Longer suffixes are tried first (7 down to 2), so an entry in a
    # longer-suffix bucket masks any shorter suffix it ends with.
    _standard_rule_table = {
        7: {
            'titudes': (30, 1, None),
            'fulness': (34, 4, None),
            'ousness': (35, 4, None),
            'eadings': (40.7, 4, None),
            'oadings': (40.6, 4, None),
            'ealings': (42.4, 4, None),
            'ailings': (42.2, 4, None),
        },
        6: {
            'aceous': (1, 6, None),
            'aining': (24, 3, None),
            'acting': (25, 3, None),
            'ttings': (26, 5, None),
            'viding': (27, 3, 'e'),
            'ssings': (37, 4, None),
            'ulting': (38, 3, None),
            'eading': (40.7, 3, None),
            'oading': (40.6, 3, None),
            'edings': (40.5, 4, None),
            'ddings': (40.4, 5, None),
            'ldings': (40.3, 4, None),
            'rdings': (40.2, 4, None),
            'ndings': (40.1, 4, None),
            'llings': (41, 5, None),
            'ealing': (42.4, 3, None),
            'olings': (42.3, 4, None),
            'ailing': (42.2, 3, None),
            'elings': (42.1, 4, None),
            'mmings': (44.3, 5, None),
            'ngings': (45.2, 4, None),
            'ggings': (45.1, 5, None),
            'stings': (47, 4, None),
            'etings': (48.4, 4, None),
            'ntings': (48.2, 4, None),
            'irings': (54.4, 4, 'e'),
            'urings': (54.3, 4, 'e'),
            'ncings': (54.2, 4, 'e'),
            'things': (58.1, 1, None),
        },
        5: {
            'iases': (11.4, 2, None),
            'ained': (13.6, 2, None),
            'erned': (13.5, 2, None),
            'ifted': (14, 2, None),
            'ected': (15, 2, None),
            'vided': (16, 1, None),
            'erred': (19, 3, None),
            'urred': (20.5, 3, None),
            'lored': (20.4, 2, None),
            'eared': (20.3, 2, None),
            'tored': (20.2, 1, None),
            'noted': (22.4, 1, None),
            'leted': (22.3, 1, None),
            'anges': (23, 1, None),
            'tting': (26, 4, None),
            'ulted': (32, 2, None),
            'uming': (33, 3, 'e'),
            'rabed': (36.1, 1, None),
            'rebed': (36.1, 1, None),
            'ribed': (36.1, 1, None),
            'robed': (36.1, 1, None),
            'rubed': (36.1, 1, None),
            'ssing': (37, 3, None),
            'vings': (39, 4, 'e'),
            'eding': (40.5, 3, None),
            'dding': (40.4, 4, None),
            'lding': (40.3, 3, None),
            'rding': (40.2, 3, None),
            'nding': (40.1, 3, None),
            'dings': (40, 4, 'e'),
            'lling': (41, 4, None),
            'oling': (42.3, 3, None),
            'eling': (42.1, 3, None),
            'lings': (42, 4, 'e'),
            'mming': (44.3, 4, None),
            'rming': (44.2, 3, None),
            'lming': (44.1, 3, None),
            'mings': (44, 4, 'e'),
            'nging': (45.2, 3, None),
            'gging': (45.1, 4, None),
            'gings': (45, 4, 'e'),
            'aning': (46.6, 3, None),
            'ening': (46.5, 3, None),
            'gning': (46.4, 3, None),
            'nning': (46.3, 4, None),
            'oning': (46.2, 3, None),
            'rning': (46.1, 3, None),
            'sting': (47, 3, None),
            'eting': (48.4, 3, None),
            'pting': (48.3, 3, None),
            'nting': (48.2, 3, None),
            'cting': (48.1, 3, None),
            'tings': (48, 4, 'e'),
            'iring': (54.4, 3, 'e'),
            'uring': (54.3, 3, 'e'),
            'ncing': (54.2, 3, 'e'),
            'sings': (54, 4, 'e'),
            # 'lling': (55, 3, None),  # masked by 41
            'ating': (57, 3, 'e'),
            'thing': (58.1, 0, None),
        },
        4: {
            'eeds': (7, 1, None),
            'uses': (11.3, 1, None),
            'sses': (11.2, 2, None),
            'eses': (11.1, 2, 'is'),
            'tled': (12.5, 1, None),
            'pled': (12.4, 1, None),
            'bled': (12.3, 1, None),
            'eled': (12.2, 2, None),
            'lled': (12.1, 2, None),
            'ened': (13.7, 2, None),
            'rned': (13.4, 2, None),
            'nned': (13.3, 3, None),
            'oned': (13.2, 2, None),
            'gned': (13.1, 2, None),
            'ered': (20.1, 2, None),
            'reds': (20, 2, None),
            'tted': (21, 3, None),
            'uted': (22.2, 1, None),
            'ated': (22.1, 1, None),
            'ssed': (28, 2, None),
            'umed': (31, 1, None),
            'beds': (36, 3, None),
            'ving': (39, 3, 'e'),
            'ding': (40, 3, 'e'),
            'ling': (42, 3, 'e'),
            'nged': (43.2, 1, None),
            'gged': (43.1, 3, None),
            'ming': (44, 3, 'e'),
            'ging': (45, 3, 'e'),
            'ning': (46, 3, 'e'),
            'ting': (48, 3, 'e'),
            # 'ssed': (49, 2, None),  # masked by 28
            # 'lled': (53, 2, None),  # masked by 12.1
            'zing': (54.1, 3, 'e'),
            'sing': (54, 3, 'e'),
            'lves': (60.1, 3, 'f'),
            'aped': (61.3, 1, None),
            'uded': (61.2, 1, None),
            'oded': (61.1, 1, None),
            # 'ated': (61, 1, None),  # masked by 22.1
            'ones': (63.6, 1, None),
            'izes': (63.5, 1, None),
            'ures': (63.4, 1, None),
            'ines': (63.3, 1, None),
            'ides': (63.2, 1, None),
        },
        3: {
            'ces': (2, 1, None),
            'sis': (4, 0, None),
            'tis': (5, 0, None),
            'eed': (7, 0, None),
            'ued': (8, 1, None),
            'ues': (9, 1, None),
            'ees': (10, 1, None),
            'ses': (11, 1, None),
            'led': (12, 2, None),
            'ned': (13, 1, None),
            'ved': (17, 1, None),
            'ced': (18, 1, None),
            'red': (20, 1, None),
            'ted': (22, 2, None),
            'sed': (29, 1, None),
            'bed': (36, 2, None),
            'ged': (43, 1, None),
            'les': (50, 1, None),
            'tes': (51, 1, None),
            'zed': (52, 1, None),
            'ied': (56, 3, 'y'),
            'ies': (59, 3, 'y'),
            'ves': (60, 1, None),
            'pes': (63.8, 1, None),
            'mes': (63.7, 1, None),
            'ges': (63.1, 1, None),
            'ous': (65, 0, None),
            'ums': (66, 0, None),
        },
        2: {
            'cs': (3, 0, None),
            'ss': (6, 0, None),
            'es': (63, 2, None),
            'is': (64, 2, 'e'),
            'us': (67, 0, None),
        },
    }

    _perl_rule_table = {
        7: {
            'titudes': (30, 1, None),
            'fulness': (34, 4, None),
            'ousness': (35, 4, None),
        },
        6: {
            'aceous': (1, 6, None),
            'aining': (24, 3, None),
            'acting': (25, 3, None),
            'viding': (27, 3, 'e'),
            'ulting': (38, 3, None),
            'eading': (40.7, 3, None),
            'oading': (40.6, 3, None),
            'ealing': (42.4, 3, None),
            'ailing': (42.2, 3, None),
        },
        5: {
            'iases': (11.4, 2, None),
            'ained': (13.6, 2, None),
            'erned': (13.5, 2, None),
            'ifted': (14, 2, None),
            'ected': (15, 2, None),
            'vided': (16, 1, None),
            'erred': (19, 3, None),
            'urred': (20.5, 3, None),
            'lored': (20.4, 2, None),
            'eared': (20.3, 2, None),
            'tored': (20.2, 1, None),
            'noted': (22.4, 1, None),
            'leted': (22.3, 1, None),
            'anges': (23, 1, None),
            'tting': (26, 4, None),
            'ulted': (32, 2, None),
            'uming': (33, 3, 'e'),
            'rabed': (36.1, 1, None),
            'rebed': (36.1, 1, None),
            'ribed': (36.1, 1, None),
            'robed': (36.1, 1, None),
            'rubed': (36.1, 1, None),
            'ssing': (37, 3, None),
            'eding': (40.5, 3, None),
            'dding': (40.4, 4, None),
            'lding': (40.3, 3, None),
            'rding': (40.2, 3, None),
            'nding': (40.1, 3, None),
            'lling': (41, 4, None),
            'oling': (42.3, 3, None),
            'eling': (42.1, 3, None),
            'mming': (44.3, 4, None),
            'rming': (44.2, 3, None),
            'lming': (44.1, 3, None),
            'nging': (45.2, 3, None),
            'gging': (45.1, 4, None),
            'aning': (46.6, 3, None),
            'ening': (46.5, 3, None),
            'gning': (46.4, 3, None),
            'nning': (46.3, 4, None),
            'oning': (46.2, 3, None),
            'rning': (46.1, 3, None),
            'sting': (47, 3, None),
            'eting': (48.4, 3, None),
            'pting': (48.3, 3, None),
            'nting': (48.2, 3, None),
            'cting': (48.1, 3, None),
            'iring': (54.4, 3, 'e'),
            'uring': (54.3, 3, 'e'),
            'ncing': (54.2, 3, 'e'),
            # 'lling': (55, 3, None),  # masked by 41
            'ating': (57, 3, 'e'),
            'thing': (58.1, 0, None),
        },
        4: {
            'uses': (11.3, 1, None),
            'sses': (11.2, 2, None),
            'eses': (11.1, 2, 'is'),
            'tled': (12.5, 1, None),
            'pled': (12.4, 1, None),
            'bled': (12.3, 1, None),
            'eled': (12.2, 2, None),
            'lled': (12.1, 2, None),
            'ened': (13.7, 2, None),
            'rned': (13.4, 2, None),
            'nned': (13.3, 3, None),
            'oned': (13.2, 2, None),
            'gned': (13.1, 2, None),
            'ered': (20.1, 2, None),
            'tted': (21, 3, None),
            'uted': (22.2, 1, None),
            'ated': (22.1, 1, None),
            'ssed': (28, 2, None),
            'umed': (31, 1, None),
            'ving': (39, 3, 'e'),
            'ding': (40, 3, 'e'),
            'ling': (42, 3, 'e'),
            'nged': (43.2, 1, None),
            'gged': (43.1, 3, None),
            'ming': (44, 3, 'e'),
            'ging': (45, 3, 'e'),
            'ning': (46, 3, 'e'),
            'ting': (48, 3, 'e'),
            # 'ssed': (49, 2, None),  # masked by 28
            # 'lled': (53, 2, None),  # masked by 12.1
            'zing': (54.1, 3, 'e'),
            'sing': (54, 3, 'e'),
            'lves': (60.1, 3, 'f'),
            'aped': (61.3, 1, None),
            'uded': (61.2, 1, None),
            'oded': (61.1, 1, None),
            # 'ated': (61, 1, None),  # masked by 22.1
            'ones': (63.6, 1, None),
            'izes': (63.5, 1, None),
            'ures': (63.4, 1, None),
            'ines': (63.3, 1, None),
            'ides': (63.2, 1, None),
        },
        3: {
            'ces': (2, 1, None),
            'sis': (4, 0, None),
            'tis': (5, 0, None),
            'eed': (7, 0, None),
            'ued': (8, 1, None),
            'ues': (9, 1, None),
            'ees': (10, 1, None),
            'ses': (11, 1, None),
            'led': (12, 2, None),
            'ned': (13, 1, None),
            'ved': (17, 1, None),
            'ced': (18, 1, None),
            'red': (20, 1, None),
            'ted': (22, 2, None),
            'sed': (29, 1, None),
            'bed': (36, 2, None),
            'ged': (43, 1, None),
            'les': (50, 1, None),
            'tes': (51, 1, None),
            'zed': (52, 1, None),
            'ied': (56, 3, 'y'),
            'ies': (59, 3, 'y'),
            'ves': (60, 1, None),
            'pes': (63.8, 1, None),
            'mes': (63.7, 1, None),
            'ges': (63.1, 1, None),
            'ous': (65, 0, None),
            'ums': (66, 0, None),
        },
        2: {
            'cs': (3, 0, None),
            'ss': (6, 0, None),
            'es': (63, 2, None),
            'is': (64, 2, 'e'),
            'us': (67, 0, None),
        },
    }

    _adams_rule_table = {
        7: {
            'titudes': (30, 1, None),
            'fulness': (34, 4, None),
            'ousness': (35, 4, None),
            'eadings': (40.7, 4, None),
            'oadings': (40.6, 4, None),
            'ealings': (42.4, 4, None),
            'ailings': (42.2, 4, None),
        },
        6: {
            'aceous': (1, 6, None),
            'aining': (24, 3, None),
            'acting': (25, 3, None),
            'ttings': (26, 5, None),
            'viding': (27, 3, 'e'),
            'ssings': (37, 4, None),
            'ulting': (38, 3, None),
            'eading': (40.7, 3, None),
            'oading': (40.6, 3, None),
            'edings': (40.5, 4, None),
            'ddings': (40.4, 5, None),
            'ldings': (40.3, 4, None),
            'rdings': (40.2, 4, None),
            'ndings': (40.1, 4, None),
            'llings': (41, 5, None),
            'ealing': (42.4, 3, None),
            'olings': (42.3, 4, None),
            'ailing': (42.2, 3, None),
            'elings': (42.1, 4, None),
            'mmings': (44.3, 5, None),
            'ngings': (45.2, 4, None),
            'ggings': (45.1, 5, None),
            'stings': (47, 4, None),
            'etings': (48.4, 4, None),
            'ntings': (48.2, 4, None),
            'irings': (54.4, 4, 'e'),
            'urings': (54.3, 4, 'e'),
            'ncings': (54.2, 4, 'e'),
            'things': (58.1, 1, None),
            'chited': (22.8, 1, None),
        },
        5: {
            'iases': (11.4, 2, None),
            'ained': (13.6, 2, None),
            'erned': (13.5, 2, None),
            'ifted': (14, 2, None),
            'ected': (15, 2, None),
            # 'vided': (16, 1, None),
            'erred': (19, 3, None),
            'urred': (20.5, 3, None),
            'lored': (20.4, 2, None),
            'eared': (20.3, 2, None),
            'tored': (20.2, 1, None),
            'noted': (22.4, 1, None),
            'leted': (22.3, 1, None),
            'anges': (23, 1, None),
            'tting': (26, 4, None),
            'ulted': (32, 2, None),
            'uming': (33, 3, 'e'),
            'rabed': (36.1, 1, None),
            'rebed': (36.1, 1, None),
            'ribed': (36.1, 1, None),
            'robed': (36.1, 1, None),
            'rubed': (36.1, 1, None),
            'ssing': (37, 3, None),
            'vings': (39, 4, 'e'),
            'eding': (40.5, 3, None),
            'dding': (40.4, 4, None),
            'lding': (40.3, 3, None),
            'rding': (40.2, 3, None),
            'nding': (40.1, 3, None),
            'dings': (40, 4, 'e'),
            'lling': (41, 4, None),
            'oling': (42.3, 3, None),
            'eling': (42.1, 3, None),
            'lings': (42, 4, 'e'),
            'mming': (44.3, 4, None),
            'rming': (44.2, 3, None),
            'lming': (44.1, 3, None),
            'mings': (44, 4, 'e'),
            'nging': (45.2, 3, None),
            'gging': (45.1, 4, None),
            'gings': (45, 4, 'e'),
            'aning': (46.6, 3, None),
            'ening': (46.5, 3, None),
            'gning': (46.4, 3, None),
            'nning': (46.3, 4, None),
            'oning': (46.2, 3, None),
            'rning': (46.1, 3, None),
            'sting': (47, 3, None),
            'eting': (48.4, 3, None),
            'pting': (48.3, 3, None),
            'nting': (48.2, 3, None),
            'cting': (48.1, 3, None),
            'tings': (48, 4, 'e'),
            'iring': (54.4, 3, 'e'),
            'uring': (54.3, 3, 'e'),
            'ncing': (54.2, 3, 'e'),
            'sings': (54, 4, 'e'),
            # 'lling': (55, 3, None),  # masked by 41
            'ating': (57, 3, 'e'),
            'thing': (58.1, 0, None),
            'dying': (58.2, 4, 'ie'),
            'tying': (58.2, 4, 'ie'),
            'vited': (22.6, 1, None),
            'mited': (22.5, 1, None),
            'vided': (22.9, 1, None),
            # NOTE: the literal 22.10 is the same float as 22.1, so this rule
            # is reported with the same number as 'ated' (22.1).
            'mided': (22.10, 1, None),
            'lying': (58.2, 4, 'ie'),
            'arred': (19.1, 3, None),
        },
        4: {
            'eeds': (7, 1, None),
            'uses': (11.3, 1, None),
            'sses': (11.2, 2, None),
            'eses': (11.1, 2, 'is'),
            'tled': (12.5, 1, None),
            'pled': (12.4, 1, None),
            'bled': (12.3, 1, None),
            'eled': (12.2, 2, None),
            'lled': (12.1, 2, None),
            'ened': (13.7, 2, None),
            'rned': (13.4, 2, None),
            'nned': (13.3, 3, None),
            'oned': (13.2, 2, None),
            'gned': (13.1, 2, None),
            'ered': (20.1, 2, None),
            'reds': (20, 2, None),
            'tted': (21, 3, None),
            'uted': (22.2, 1, None),
            'ated': (22.1, 1, None),
            'ssed': (28, 2, None),
            'umed': (31, 1, None),
            'beds': (36, 3, None),
            'ving': (39, 3, 'e'),
            'ding': (40, 3, 'e'),
            'ling': (42, 3, 'e'),
            'nged': (43.2, 1, None),
            'gged': (43.1, 3, None),
            'ming': (44, 3, 'e'),
            'ging': (45, 3, 'e'),
            'ning': (46, 3, 'e'),
            'ting': (48, 3, 'e'),
            # 'ssed': (49, 2, None),  # masked by 28
            # 'lled': (53, 2, None),  # masked by 12.1
            'zing': (54.1, 3, 'e'),
            'sing': (54, 3, 'e'),
            'lves': (60.1, 3, 'f'),
            'aped': (61.3, 1, None),
            'uded': (61.2, 1, None),
            'oded': (61.1, 1, None),
            # 'ated': (61, 1, None),  # masked by 22.1
            'ones': (63.6, 1, None),
            'izes': (63.5, 1, None),
            'ures': (63.4, 1, None),
            'ines': (63.3, 1, None),
            'ides': (63.2, 1, None),
            'ited': (22.7, 2, None),
            'oked': (31.1, 1, None),
            'aked': (31.1, 1, None),
            'iked': (31.1, 1, None),
            'uked': (31.1, 1, None),
            'amed': (31, 1, None),
            'imed': (31, 1, None),
            'does': (31.2, 2, None),
        },
        3: {
            'ces': (2, 1, None),
            'sis': (4, 0, None),
            'tis': (5, 0, None),
            'eed': (7, 0, None),
            'ued': (8, 1, None),
            'ues': (9, 1, None),
            'ees': (10, 1, None),
            'ses': (11, 1, None),
            'led': (12, 2, None),
            'ned': (13, 1, None),
            'ved': (17, 1, None),
            'ced': (18, 1, None),
            'red': (20, 1, None),
            'ted': (22, 2, None),
            'sed': (29, 1, None),
            'bed': (36, 2, None),
            'ged': (43, 1, None),
            'les': (50, 1, None),
            'tes': (51, 1, None),
            'zed': (52, 1, None),
            'ied': (56, 3, 'y'),
            'ies': (59, 3, 'y'),
            'ves': (60, 1, None),
            'pes': (63.8, 1, None),
            'mes': (63.7, 1, None),
            'ges': (63.1, 1, None),
            'ous': (65, 0, None),
            'ums': (66, 0, None),
            'oed': (31.3, 1, None),
            'oes': (31.2, 1, None),
            'kes': (63.1, 1, None),
            # NOTE: the literal 63.10 is the same float as 63.1, so this rule
            # is reported with the same number as 'ges' and 'kes'.
            'des': (63.10, 1, None),
            'res': (63.9, 1, None),
        },
        2: {
            'cs': (3, 0, None),
            'ss': (6, 0, None),
            'es': (63, 2, None),
            'is': (64, 2, 'e'),
            'us': (67, 0, None),
        },
    }

    # Maps the ``var`` argument of :py:meth:`stem` to its rule table.
    _rules = {
        'standard': _standard_rule_table,
        'Adams': _adams_rule_table,
        'Perl': _perl_rule_table,
    }

    @staticmethod
    def _strip_suffix_and_doubling(word, del_len):
        """Drop a suffix, then collapse a doubled final character.

        Args:
            word (str): The word being stemmed (must be non-empty)
            del_len (int): Length of the suffix to delete, not counting a
                plural ``s``; one more character is removed if the word ends
                in ``s``

        Returns:
            str: The word without the suffix and, if what remains ends in
            the same word character twice, without the last of the pair

        """
        if word[-1] == 's':
            del_len += 1
        stemmed_word = word[:-del_len]
        if re_match(r'.*(\w)\1$', stemmed_word):
            stemmed_word = stemmed_word[:-1]
        return stemmed_word

    def _stem_special_case(self, word, max_word_length, max_acro_length, var):
        """Handle words that are decided before the rule tables are used.

        Args:
            word (str): The word to stem
            max_word_length (int): The maximum word length allowed
            max_acro_length (int): The maximum acronym length allowed
            var (str): The rule variant in use

        Returns:
            tuple or None: ``(stem, rule_no)`` if one of the special cases
            applies; None if the word should go on to the rule tables

        """
        if not word:
            return word, 0
        if word in self._problem_words or (
            word == 'menses' and var == 'Adams'
        ):
            return word, 90
        if max_word_length and len(word) > max_word_length:
            return word, 95

        # Possessives and contractions: any word with an apostrophe ends
        # here, whether or not one of the substitutions applies.
        if "'" in word:
            stemmed_word = word
            if word[-2:] in {"'s", "'S"}:
                stemmed_word = word[:-2]
            if word[-1:] == "'":
                stemmed_word = word[:-1]
            for contraction, expansion in (
                ("n't", 'not'),
                ("'ve", 'have'),
                ("'re", 'are'),
                ("'m", 'am'),
            ):
                stemmed_word = stemmed_word.replace(contraction, expansion)
            return stemmed_word, 94

        if word.isdigit():
            return word, 90.3

        # A hyphen anywhere but the first position leaves the word unstemmed;
        # the rule number only records whether both sides of the first
        # hyphen are alphabetic.
        hyphen = word.find('-')
        if hyphen > 0:
            if word[:hyphen].isalpha() and word[hyphen + 1 :].isalpha():
                return word, 90.2
            return word, 90.1
        if '_' in word:
            return word, 90

        # Pluralized acronym (all caps plus 's'): only the 's' is removed.
        if word[-1] == 's' and word[:-1].isupper():
            if var == 'Adams' and len(word) - 1 > max_acro_length:
                return word, 96
            return word[:-1], 91.1
        if word.isupper():
            if var == 'Adams' and len(word) > max_acro_length:
                return word, 96
            return word, 91
        if re_match(r'^.*[A-Z].*[A-Z].*$', word):
            return word, 92
        if word[0].isupper():
            return word, 93
        if var == 'Adams' and re_match(r'^[a-z](|[rl])(ing|ed)$', word):
            return word, 97
        return None

    def _stem_by_rule_table(self, word, var):
        """Apply the longest matching suffix rule of the chosen variant.

        Args:
            word (str): The word to stem
            var (str): The rule variant in use (a key of ``_rules``)

        Returns:
            tuple or None: ``(stem, rule_no)`` for the first matching suffix,
            trying suffix lengths 7 down to 2; None if no suffix matches

        Raises:
            KeyError: If ``var`` is not a key of ``_rules``

        """
        rule_table = self._rules[var]
        for n in range(7, 1, -1):
            rule = rule_table[n].get(word[-n:])
            if rule is None:
                continue
            rule_no, del_len, add_str = rule
            # A del_len of 0 must not be used as a slice bound: word[:-0]
            # would be the empty string.
            stemmed_word = word[:-del_len] if del_len else word
            if add_str:
                stemmed_word += add_str
            return stemmed_word, rule_no
        return None

    def _stem_by_fallback_rule(self, word):
        """Apply the generic -ing/-ed/-s rules used when no table rule hit.

        Args:
            word (str): The word to stem (must be non-empty)

        Returns:
            tuple: ``(stem, rule_no)``; the rule number is 0 and the word is
            unchanged if none of the generic rules applies

        """
        if re_match(r'.*\w\wings?$', word):  # rule 58
            return self._strip_suffix_and_doubling(word, 3), 58
        if re_match(r'.*\w\weds?$', word):  # rule 62
            return self._strip_suffix_and_doubling(word, 2), 62
        if word[-1] == 's':  # rule 68
            return word[:-1], 68
        return word, 0

    def stem(
        self,
        word,
        max_word_length=20,
        max_acro_length=8,
        return_rule_no=False,
        var='standard',
    ):
        """Return UEA-Lite stem.

        The word is checked against a series of special cases (stop-list
        words, over-long words, contractions, numbers, hyphenated or
        underscored words, acronyms and capitalized words), then against the
        suffix rule table of the selected variant, and finally against the
        generic -ing/-ed/-s rules.

        Args:
            word (str): The word to stem
            max_word_length (int): The maximum word length allowed; longer
                words are returned unchanged (a false value disables the
                check)
            max_acro_length (int): The maximum acronym length allowed (only
                consulted by the ``Adams`` variant)
            return_rule_no (bool): If True, returns the stem along with rule
                number
            var (str): variant rules to use:
                - ``standard`` to use the default rules
                - ``Adams`` to use Jason Adams' rules
                - ``Perl`` to use the original Perl rules

        Returns:
            str or (str, int or float): Word stem, or the stem and the number
            of the rule that produced it if ``return_rule_no`` is True

        Raises:
            KeyError: If ``var`` is not a known variant and the word reaches
                the rule-table lookup

        Examples:
            >>> UEALite().stem('readings')
            'read'
            >>> UEALite().stem('insulted')
            'insult'
            >>> UEALite().stem('cussed')
            'cuss'
            >>> UEALite().stem('fancies')
            'fancy'
            >>> UEALite().stem('eroded')
            'erode'

        """
        result = self._stem_special_case(
            word, max_word_length, max_acro_length, var
        )
        if result is None:
            result = self._stem_by_rule_table(word, var)
        if result is None:
            result = self._stem_by_fallback_rule(word)

        stemmed_word, rule_no = result
        if return_rule_no:
            return stemmed_word, rule_no
        return stemmed_word
747
748
749 1
def uealite(
    word,
    max_word_length=20,
    max_acro_length=8,
    return_rule_no=False,
    var='standard',
):
    """Return UEA-Lite stem.

    This is a wrapper for :py:meth:`UEALite.stem`: it creates a fresh
    :py:class:`UEALite` instance and forwards every argument to it.

    Args:
        word (str): The word to stem
        max_word_length (int): The maximum word length allowed
        max_acro_length (int): The maximum acronym length allowed
        return_rule_no (bool): If True, returns the stem along with rule number
        var (str): variant rules to use:
            - ``Adams`` to use Jason Adams' rules
            - ``Perl`` to use the original Perl rules

    Returns:
        str or (str, int): Word stem

    Examples:
        >>> uealite('readings')
        'read'
        >>> uealite('insulted')
        'insult'
        >>> uealite('cussed')
        'cuss'
        >>> uealite('fancies')
        'fancy'
        >>> uealite('eroded')
        'erode'

    """
    stemmer = UEALite()
    return stemmer.stem(
        word,
        max_word_length=max_word_length,
        max_acro_length=max_acro_length,
        return_rule_no=return_rule_no,
        var=var,
    )
788
789
790
# When run as a script, execute the doctests embedded in this module.
if __name__ == '__main__':
    from doctest import testmod

    testmod()
794