|
1
|
|
|
# -*- coding: utf-8 -*- |
|
2
|
|
|
|
|
3
|
|
|
# Copyright 2018 by Christopher C. Little. |
|
4
|
|
|
# This file is part of Abydos. |
|
5
|
|
|
# |
|
6
|
|
|
# Abydos is free software: you can redistribute it and/or modify |
|
7
|
|
|
# it under the terms of the GNU General Public License as published by |
|
8
|
|
|
# the Free Software Foundation, either version 3 of the License, or |
|
9
|
|
|
# (at your option) any later version. |
|
10
|
|
|
# |
|
11
|
|
|
# Abydos is distributed in the hope that it will be useful, |
|
12
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
13
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
14
|
|
|
# GNU General Public License for more details. |
|
15
|
|
|
# |
|
16
|
|
|
# You should have received a copy of the GNU General Public License |
|
17
|
|
|
# along with Abydos. If not, see <http://www.gnu.org/licenses/>. |
|
18
|
|
|
|
|
19
|
1 |
|
"""abydos.stemmer._paice_husk. |
|
20
|
|
|
|
|
21
|
|
|
The stemmer._paice_husk module defines the Paice-Husk Stemmer.
|
22
|
|
|
""" |
|
23
|
|
|
|
|
24
|
1 |
|
from __future__ import unicode_literals |
|
25
|
|
|
|
|
26
|
1 |
|
from six.moves import range |
|
27
|
|
|
|
|
28
|
1 |
|
__all__ = ['paice_husk'] |
|
29
|
|
|
|
|
30
|
|
|
|
|
31
|
1 |
|
# Paice-Husk rewrite rules, indexed first by suffix length and then by the
# suffix itself.  Every rule is a 4-tuple
# ``(only_intact, del_len, add_str, set_terminate)``:
#
# * ``only_intact``   -- the rule may fire only while the word is still
#   unmodified (no earlier rule has been accepted);
# * ``del_len``       -- number of trailing characters to remove;
# * ``add_str``       -- string appended after the removal (``None`` for none);
# * ``set_terminate`` -- stemming stops once this rule has been accepted.
#
# A suffix normally maps to a single rule; 'i' and 's' map to a tuple of rules
# that are tried in order.
_PAICE_HUSK_RULES = {
    6: {'ifiabl': (False, 6, None, True), 'plicat': (False, 4, 'y', True)},
    5: {
        'guish': (False, 5, 'ct', True),
        'sumpt': (False, 2, None, True),
        'istry': (False, 5, None, True),
    },
    4: {
        'ytic': (False, 3, 's', True),
        'ceed': (False, 2, 'ss', True),
        'hood': (False, 4, None, False),
        'lief': (False, 1, 'v', True),
        'verj': (False, 1, 't', True),
        'misj': (False, 2, 't', True),
        'iabl': (False, 4, 'y', True),
        'iful': (False, 4, 'y', True),
        'sion': (False, 4, 'j', False),
        'xion': (False, 4, 'ct', True),
        'ship': (False, 4, None, False),
        'ness': (False, 4, None, False),
        'ment': (False, 4, None, False),
        'ript': (False, 2, 'b', True),
        'orpt': (False, 2, 'b', True),
        'duct': (False, 1, None, True),
        'cept': (False, 2, 'iv', True),
        'olut': (False, 2, 'v', True),
        'sist': (False, 0, None, True),
    },
    3: {
        'ied': (False, 3, 'y', False),
        'eed': (False, 1, None, True),
        'ing': (False, 3, None, False),
        'iag': (False, 3, 'y', True),
        'ish': (False, 3, None, False),
        'fuj': (False, 1, 's', True),
        'hej': (False, 1, 'r', True),
        'abl': (False, 3, None, False),
        'ibl': (False, 3, None, True),
        'bil': (False, 2, 'l', False),
        'ful': (False, 3, None, False),
        'ial': (False, 3, None, False),
        'ual': (False, 3, None, False),
        'ium': (False, 3, None, True),
        'ism': (False, 3, None, False),
        'ion': (False, 3, None, False),
        'ian': (False, 3, None, False),
        'een': (False, 0, None, True),
        'ear': (False, 0, None, True),
        'ier': (False, 3, 'y', False),
        'ies': (False, 3, 'y', False),
        'sis': (False, 2, None, True),
        'ous': (False, 3, None, False),
        'ent': (False, 3, None, False),
        'ant': (False, 3, None, False),
        'ist': (False, 3, None, False),
        'iqu': (False, 3, None, True),
        'ogu': (False, 1, None, True),
        'siv': (False, 3, 'j', False),
        'eiv': (False, 0, None, True),
        'bly': (False, 1, None, False),
        'ily': (False, 3, 'y', False),
        'ply': (False, 0, None, True),
        'ogy': (False, 1, None, True),
        'phy': (False, 1, None, True),
        'omy': (False, 1, None, True),
        'opy': (False, 1, None, True),
        'ity': (False, 3, None, False),
        'ety': (False, 3, None, False),
        'lty': (False, 2, None, True),
        'ary': (False, 3, None, False),
        'ory': (False, 3, None, False),
        'ify': (False, 3, None, True),
        'ncy': (False, 2, 't', False),
        'acy': (False, 3, None, False),
    },
    2: {
        'ia': (True, 2, None, True),
        'bb': (False, 1, None, True),
        'ic': (False, 2, None, False),
        'nc': (False, 1, 't', False),
        'dd': (False, 1, None, True),
        'ed': (False, 2, None, False),
        'if': (False, 2, None, False),
        'ag': (False, 2, None, False),
        'gg': (False, 1, None, True),
        'th': (True, 2, None, True),
        'ij': (False, 1, 'd', True),
        'uj': (False, 1, 'd', True),
        'oj': (False, 1, 'd', True),
        'nj': (False, 1, 'd', True),
        'cl': (False, 1, None, True),
        'ul': (False, 2, None, True),
        'al': (False, 2, None, False),
        'll': (False, 1, None, True),
        'um': (True, 2, None, True),
        'mm': (False, 1, None, True),
        'an': (False, 2, None, False),
        'en': (False, 2, None, False),
        'nn': (False, 1, None, True),
        'pp': (False, 1, None, True),
        'er': (False, 2, None, False),
        'ar': (False, 2, None, True),
        'or': (False, 2, None, False),
        'ur': (False, 2, None, False),
        'rr': (False, 1, None, True),
        'tr': (False, 1, None, False),
        'is': (False, 2, None, False),
        'ss': (False, 0, None, True),
        'us': (True, 2, None, True),
        'at': (False, 2, None, False),
        'tt': (False, 1, None, True),
        'iv': (False, 2, None, False),
        'ly': (False, 2, None, False),
        'iz': (False, 2, None, False),
        'yz': (False, 1, 's', True),
    },
    1: {
        'a': (True, 1, None, True),
        'e': (False, 1, None, False),
        'i': ((True, 1, None, True), (False, 1, 'y', False)),
        'j': (False, 1, 's', True),
        's': ((True, 1, None, False), (False, 0, None, True)),
    },
}

# Letters that make a stem vowel-initial.
_PAICE_HUSK_VOWELS = frozenset('aeiou')
# Letters counted as vowels after the first position ('y' included).
_PAICE_HUSK_VOWELS_Y = frozenset('aeiouy')


def _has_vowel(text):
    """Return True if any character of text is one of a, e, i, o, u or y."""
    return any(char in _PAICE_HUSK_VOWELS_Y for char in text)


def _acceptable(stem):
    """Return True if stem is long enough to be kept as a stem.

    A stem starting with a, e, i, o or u needs at least two letters; any
    other stem needs at least three letters, one of which (after the first)
    is a vowel or 'y'.
    """
    if stem and stem[0] in _PAICE_HUSK_VOWELS:
        return len(stem) > 1
    return len(stem) > 2 and _has_vowel(stem[1:])


def _apply_rule(word, rule, intact):
    """Return word rewritten by rule, or None if the rule is rejected.

    The rule is rejected when it is restricted to intact words and the word
    has already been modified, or when the rewritten form is not acceptable.
    """
    only_intact, del_len, add_str, _ = rule
    if only_intact and not intact:
        return None

    # del_len may be 0 ("protect" rules); word[:-0] would wrongly yield ''.
    stem = word[:-del_len] if del_len else word
    if add_str:
        stem += add_str

    return stem if _acceptable(stem) else None


def paice_husk(word):
    """Return Paice-Husk stem.

    Implementation of the Paice-Husk Stemmer, also known as the Lancaster
    Stemmer, developed by Chris Paice, with the assistance of Gareth Husk.

    This is based on the algorithm's description in :cite:`Paice:1990`.

    The word's ending is matched against the rule table, longest suffix
    (six letters) first. The first rule that is accepted rewrites the word,
    after which matching restarts on the new form. Stemming stops when an
    accepted rule is marked as terminating or when no rule can be accepted.

    :param str word: the word to stem
    :returns: the stemmed word
    :rtype: str

    >>> paice_husk('assumption')
    'assum'
    >>> paice_husk('verifiable')
    'ver'
    >>> paice_husk('fancies')
    'fant'
    >>> paice_husk('fanciful')
    'fancy'
    >>> paice_husk('torment')
    'tor'
    """
    intact = True  # becomes False as soon as any rule has been accepted
    terminate = False
    while not terminate:
        for n in range(6, 0, -1):
            entry = _PAICE_HUSK_RULES[n].get(word[-n:])
            if entry is None:
                continue

            # Normalise to a sequence of rules: most suffixes carry a single
            # rule tuple, a few carry a tuple of alternative rules.
            rules = entry if isinstance(entry[0], tuple) else (entry,)

            accepted = False
            for rule in rules:
                stem = _apply_rule(word, rule, intact)
                if stem is not None:
                    word = stem
                    intact = False
                    terminate = rule[3]
                    accepted = True
                    break

            if accepted:
                # Restart matching (or stop, if the rule was terminating).
                break
        else:
            # No suffix of any length produced an accepted rewrite.
            break

    return word
|
233
|
|
|
|
|
234
|
|
|
|
|
235
|
|
|
if __name__ == '__main__':
    # When executed as a script, run the docstring examples of this module
    # as a self-test.
    from doctest import testmod

    testmod()
|
239
|
|
|
|