# -*- coding: utf-8 -*-

# Copyright 2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
|
|
|
|
19
|
1 |
|
"""abydos.stemmer._uealite. |
20
|
|
|
|
21
|
|
|
The stemmer._uealite module defines the UEA-Lite Stemmer. |
22
|
|
|
""" |
23
|
|
|
|
24
|
1 |
|
from __future__ import unicode_literals |
25
|
|
|
|
26
|
1 |
|
from re import match as re_match |
27
|
|
|
|
28
|
1 |
|
from six.moves import range |
29
|
|
|
|
30
|
1 |
|
__all__ = ['uealite'] |
31
|
|
|
|
32
|
|
|
|
33
|
1 |
|
# Words that every variant returns unchanged (reported as rule 90).
_UEALITE_PROBLEM_WORDS = frozenset(
    ('is', 'as', 'this', 'has', 'was', 'during')
)
# The Adams variant protects one additional word.
_UEALITE_PROBLEM_WORDS_ADAMS = _UEALITE_PROBLEM_WORDS | {'menses'}

# Base (Java) rule table, built once at import time.
#
# rule table format:
# top-level dictionary: length-of-suffix: dict-of-rules
# dict-of-rules: suffix: (rule_no, suffix_length_to_delete,
#                         suffix_to_append)
_UEALITE_RULES = {
    7: {
        'titudes': (30, 1, None),
        'fulness': (34, 4, None),
        'ousness': (35, 4, None),
        'eadings': (40.7, 4, None),
        'oadings': (40.6, 4, None),
        'ealings': (42.4, 4, None),
        'ailings': (42.2, 4, None),
    },
    6: {
        'aceous': (1, 6, None),
        'aining': (24, 3, None),
        'acting': (25, 3, None),
        'ttings': (26, 5, None),
        'viding': (27, 3, 'e'),
        'ssings': (37, 4, None),
        'ulting': (38, 3, None),
        'eading': (40.7, 3, None),
        'oading': (40.6, 3, None),
        'edings': (40.5, 4, None),
        'ddings': (40.4, 5, None),
        'ldings': (40.3, 4, None),
        'rdings': (40.2, 4, None),
        'ndings': (40.1, 4, None),
        'llings': (41, 5, None),
        'ealing': (42.4, 3, None),
        'olings': (42.3, 4, None),
        'ailing': (42.2, 3, None),
        'elings': (42.1, 4, None),
        'mmings': (44.3, 5, None),
        'ngings': (45.2, 4, None),
        'ggings': (45.1, 5, None),
        'stings': (47, 4, None),
        'etings': (48.4, 4, None),
        'ntings': (48.2, 4, None),
        'irings': (54.4, 4, 'e'),
        'urings': (54.3, 4, 'e'),
        'ncings': (54.2, 4, 'e'),
        'things': (58.1, 1, None),
    },
    5: {
        'iases': (11.4, 2, None),
        'ained': (13.6, 2, None),
        'erned': (13.5, 2, None),
        'ifted': (14, 2, None),
        'ected': (15, 2, None),
        'vided': (16, 1, None),
        'erred': (19, 3, None),
        'urred': (20.5, 3, None),
        'lored': (20.4, 2, None),
        'eared': (20.3, 2, None),
        'tored': (20.2, 1, None),
        'noted': (22.4, 1, None),
        'leted': (22.3, 1, None),
        'anges': (23, 1, None),
        'tting': (26, 4, None),
        'ulted': (32, 2, None),
        'uming': (33, 3, 'e'),
        'rabed': (36.1, 1, None),
        'rebed': (36.1, 1, None),
        'ribed': (36.1, 1, None),
        'robed': (36.1, 1, None),
        'rubed': (36.1, 1, None),
        'ssing': (37, 3, None),
        'vings': (39, 4, 'e'),
        'eding': (40.5, 3, None),
        'dding': (40.4, 4, None),
        'lding': (40.3, 3, None),
        'rding': (40.2, 3, None),
        'nding': (40.1, 3, None),
        'dings': (40, 4, 'e'),
        'lling': (41, 4, None),
        'oling': (42.3, 3, None),
        'eling': (42.1, 3, None),
        'lings': (42, 4, 'e'),
        'mming': (44.3, 4, None),
        'rming': (44.2, 3, None),
        'lming': (44.1, 3, None),
        'mings': (44, 4, 'e'),
        'nging': (45.2, 3, None),
        'gging': (45.1, 4, None),
        'gings': (45, 4, 'e'),
        'aning': (46.6, 3, None),
        'ening': (46.5, 3, None),
        'gning': (46.4, 3, None),
        'nning': (46.3, 4, None),
        'oning': (46.2, 3, None),
        'rning': (46.1, 3, None),
        'sting': (47, 3, None),
        'eting': (48.4, 3, None),
        'pting': (48.3, 3, None),
        'nting': (48.2, 3, None),
        'cting': (48.1, 3, None),
        'tings': (48, 4, 'e'),
        'iring': (54.4, 3, 'e'),
        'uring': (54.3, 3, 'e'),
        'ncing': (54.2, 3, 'e'),
        'sings': (54, 4, 'e'),
        # 'lling': (55, 3, None),  # masked by 41
        'ating': (57, 3, 'e'),
        'thing': (58.1, 0, None),
    },
    4: {
        'eeds': (7, 1, None),
        'uses': (11.3, 1, None),
        'sses': (11.2, 2, None),
        'eses': (11.1, 2, 'is'),
        'tled': (12.5, 1, None),
        'pled': (12.4, 1, None),
        'bled': (12.3, 1, None),
        'eled': (12.2, 2, None),
        'lled': (12.1, 2, None),
        'ened': (13.7, 2, None),
        'rned': (13.4, 2, None),
        'nned': (13.3, 3, None),
        'oned': (13.2, 2, None),
        'gned': (13.1, 2, None),
        'ered': (20.1, 2, None),
        'reds': (20, 2, None),
        'tted': (21, 3, None),
        'uted': (22.2, 1, None),
        'ated': (22.1, 1, None),
        'ssed': (28, 2, None),
        'umed': (31, 1, None),
        'beds': (36, 3, None),
        'ving': (39, 3, 'e'),
        'ding': (40, 3, 'e'),
        'ling': (42, 3, 'e'),
        'nged': (43.2, 1, None),
        'gged': (43.1, 3, None),
        'ming': (44, 3, 'e'),
        'ging': (45, 3, 'e'),
        'ning': (46, 3, 'e'),
        'ting': (48, 3, 'e'),
        # 'ssed': (49, 2, None),  # masked by 28
        # 'lled': (53, 2, None),  # masked by 12.1
        'zing': (54.1, 3, 'e'),
        'sing': (54, 3, 'e'),
        'lves': (60.1, 3, 'f'),
        'aped': (61.3, 1, None),
        'uded': (61.2, 1, None),
        'oded': (61.1, 1, None),
        # 'ated': (61, 1, None),  # masked by 22.1
        'ones': (63.6, 1, None),
        'izes': (63.5, 1, None),
        'ures': (63.4, 1, None),
        'ines': (63.3, 1, None),
        'ides': (63.2, 1, None),
    },
    3: {
        'ces': (2, 1, None),
        'sis': (4, 0, None),
        'tis': (5, 0, None),
        'eed': (7, 0, None),
        'ued': (8, 1, None),
        'ues': (9, 1, None),
        'ees': (10, 1, None),
        'ses': (11, 1, None),
        'led': (12, 2, None),
        'ned': (13, 1, None),
        'ved': (17, 1, None),
        'ced': (18, 1, None),
        'red': (20, 1, None),
        'ted': (22, 2, None),
        'sed': (29, 1, None),
        'bed': (36, 2, None),
        'ged': (43, 1, None),
        'les': (50, 1, None),
        'tes': (51, 1, None),
        'zed': (52, 1, None),
        'ied': (56, 3, 'y'),
        'ies': (59, 3, 'y'),
        'ves': (60, 1, None),
        'pes': (63.8, 1, None),
        'mes': (63.7, 1, None),
        'ges': (63.1, 1, None),
        'ous': (65, 0, None),
        'ums': (66, 0, None),
    },
    2: {
        'cs': (3, 0, None),
        'ss': (6, 0, None),
        'es': (63, 2, None),
        'is': (64, 2, 'e'),
        'us': (67, 0, None),
    },
}


def _uealite_copy_rules():
    """Return a copy of the base rule table that is safe to modify."""
    return {length: dict(group) for length, group in _UEALITE_RULES.items()}


def _uealite_perl_rules():
    """Return the base rule table without the rules the Perl set lacks."""
    perl_deletions = {
        7: ['eadings', 'oadings', 'ealings', 'ailings'],
        6: [
            'ttings',
            'ssings',
            'edings',
            'ddings',
            'ldings',
            'rdings',
            'ndings',
            'llings',
            'olings',
            'elings',
            'mmings',
            'ngings',
            'ggings',
            'stings',
            'etings',
            'ntings',
            'irings',
            'urings',
            'ncings',
            'things',
        ],
        5: ['vings', 'dings', 'lings', 'mings', 'gings', 'tings', 'sings'],
        4: ['eeds', 'reds', 'beds'],
    }

    rules = _uealite_copy_rules()
    for length, suffixes in perl_deletions.items():
        for suffix in suffixes:
            del rules[length][suffix]
    return rules


def _uealite_adams_rules():
    """Return the base rule table extended with Jason Adams' rules."""
    # NOTE: as float literals 22.10 == 22.1 and 63.10 == 63.1, so the rule
    # numbers reported for 'mided' and 'des' coincide with those of other
    # rules. The values are kept as they were to preserve returned numbers.
    adams_additions = {
        6: {'chited': (22.8, 1, None)},
        5: {
            'dying': (58.2, 4, 'ie'),
            'tying': (58.2, 4, 'ie'),
            'vited': (22.6, 1, None),
            'mited': (22.5, 1, None),
            'vided': (22.9, 1, None),
            'mided': (22.10, 1, None),
            'lying': (58.2, 4, 'ie'),
            'arred': (19.1, 3, None),
        },
        4: {
            'ited': (22.7, 2, None),
            'oked': (31.1, 1, None),
            'aked': (31.1, 1, None),
            'iked': (31.1, 1, None),
            'uked': (31.1, 1, None),
            'amed': (31, 1, None),
            'imed': (31, 1, None),
            'does': (31.2, 2, None),
        },
        3: {
            'oed': (31.3, 1, None),
            'oes': (31.2, 1, None),
            'kes': (63.1, 1, None),
            'des': (63.10, 1, None),
            'res': (63.9, 1, None),
        },
    }

    rules = _uealite_copy_rules()
    for length, group in adams_additions.items():
        # Additions override base rules with the same suffix (e.g. 'vided').
        rules[length].update(group)
    return rules


# Variant tables are derived once so that calls never mutate shared state.
_UEALITE_RULES_PERL = _uealite_perl_rules()
_UEALITE_RULES_ADAMS = _uealite_adams_rules()


def _uealite_drop_ending(word, del_len):
    """Remove an ending (plus a plural s) and undouble the final letter."""
    if word[-1] == 's':
        del_len += 1
    stem = word[:-del_len]
    # A doubled final character (e.g. 'stopp') loses its last letter.
    if re_match(r'.*(\w)\1$', stem):
        stem = stem[:-1]
    return stem


def _uealite_stem(
    word, rules, problem_words, max_word_length, max_acro_length, adams
):
    """Return the (stem, rule number) pair for a single word."""
    if not word:
        return word, 0
    if word in problem_words:
        return word, 90
    if max_word_length and len(word) > max_word_length:
        return word, 95

    # Possessives and contractions
    if "'" in word:
        stem = word
        if word[-2:] in {"'s", "'S"}:
            stem = word[:-2]
        if word[-1:] == "'":
            stem = word[:-1]
        # The order of these replacements is significant.
        for contraction, expansion in (
            ("n't", 'not'),
            ("'ve", 'have'),
            ("'re", 'are'),
            ("'m", 'am'),
        ):
            stem = stem.replace(contraction, expansion)
        return stem, 94

    if word.isdigit():
        return word, 90.3

    # A hyphen in first position (index 0) is deliberately not treated as a
    # hyphenated word; such words fall through to the checks below.
    hyphen = word.find('-')
    if hyphen > 0:
        if word[:hyphen].isalpha() and word[hyphen + 1 :].isalpha():
            return word, 90.2
        return word, 90.1
    if '_' in word:
        return word, 90

    # Acronyms, mixed-case words and capitalized words are not stemmed
    # (apart from removing the plural s of an acronym).
    if word[-1] == 's' and word[:-1].isupper():
        if adams and len(word) - 1 > max_acro_length:
            return word, 96
        return word[:-1], 91.1
    if word.isupper():
        if adams and len(word) > max_acro_length:
            return word, 96
        return word, 91
    if re_match(r'^.*[A-Z].*[A-Z].*$', word):
        return word, 92
    if word[0].isupper():
        return word, 93
    if adams and re_match(r'^[a-z](|[rl])(ing|ed)$', word):
        return word, 97

    # Longest matching suffix wins.
    for n in range(7, 1, -1):
        rule = rules[n].get(word[-n:])
        if rule is not None:
            rule_no, del_len, add_str = rule
            stem = word[:-del_len] if del_len else word
            if add_str:
                stem += add_str
            return stem, rule_no

    # Generic fallbacks when no table rule applied
    if re_match(r'.*\w\wings?$', word):  # rule 58
        return _uealite_drop_ending(word, 3), 58
    if re_match(r'.*\w\weds?$', word):  # rule 62
        return _uealite_drop_ending(word, 2), 62
    if word[-1] == 's':  # rule 68
        return word[:-1], 68
    return word, 0


def uealite(
    word, max_word_length=20, max_acro_length=8, return_rule_no=False, var=None
):
    """Return UEA-Lite stem.

    The UEA-Lite stemmer is discussed in :cite:`Jenkins:2005`.

    This is chiefly based on the Java implementation of the algorithm, with
    variants based on the Perl implementation and Jason Adams' Ruby port.

    Java version: :cite:`Churchill:2005`
    Perl version: :cite:`Jenkins:2005`
    Ruby version: :cite:`Adams:2017`

    :param str word: the word to calculate the stem of
    :param int max_word_length: the maximum word length allowed; longer words
        are returned unchanged (a false value disables the check)
    :param int max_acro_length: the maximum acronym length allowed (only
        applied when var is 'Adams')
    :param bool return_rule_no: if True, returns the stem along with rule
        number, as a (stem, rule number) tuple
    :param str var: variant to use (set to 'Adams' to use Jason Adams' rules,
        or 'Perl' to use the original Perl set of rules); any other value
        selects the default rule set
    :returns: word stem
    :rtype: str or (str, int)

    >>> uealite('readings')
    'read'
    >>> uealite('insulted')
    'insult'
    >>> uealite('cussed')
    'cuss'
    >>> uealite('fancies')
    'fancy'
    >>> uealite('eroded')
    'erode'
    """
    adams = var == 'Adams'
    if var == 'Perl':
        rules = _UEALITE_RULES_PERL
    elif adams:
        rules = _UEALITE_RULES_ADAMS
    else:
        rules = _UEALITE_RULES

    if adams:
        problem_words = _UEALITE_PROBLEM_WORDS_ADAMS
    else:
        problem_words = _UEALITE_PROBLEM_WORDS

    stem, rule_no = _uealite_stem(
        word, rules, problem_words, max_word_length, max_acro_length, adams
    )
    if return_rule_no:
        return stem, rule_no
    return stem
421
|
|
|
|
422
|
|
|
|
423
|
|
|
# Run this module's doctests when it is executed as a script.
if __name__ == '__main__':
    from doctest import testmod

    testmod()
427
|
|
|
|