1
|
|
|
# -*- coding: utf-8 -*- |
2
|
|
|
|
3
|
|
|
# Copyright 2018 by Christopher C. Little. |
4
|
|
|
# This file is part of Abydos. |
5
|
|
|
# |
6
|
|
|
# Abydos is free software: you can redistribute it and/or modify |
7
|
|
|
# it under the terms of the GNU General Public License as published by |
8
|
|
|
# the Free Software Foundation, either version 3 of the License, or |
9
|
|
|
# (at your option) any later version. |
10
|
|
|
# |
11
|
|
|
# Abydos is distributed in the hope that it will be useful, |
12
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
13
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14
|
|
|
# GNU General Public License for more details. |
15
|
|
|
# |
16
|
|
|
# You should have received a copy of the GNU General Public License |
17
|
|
|
# along with Abydos. If not, see <http://www.gnu.org/licenses/>. |
18
|
|
|
|
19
|
1 |
|
"""abydos.stemmer._paice_husk. |
20
|
|
|
|
21
|
|
|
The stemmer._paice_husk module defines the Paice-Husk Stemmer.
22
|
|
|
""" |
23
|
|
|
|
24
|
1 |
|
from __future__ import unicode_literals |
25
|
|
|
|
26
|
1 |
|
from six.moves import range |
27
|
|
|
|
28
|
1 |
|
__all__ = ['paice_husk'] |
29
|
|
|
|
30
|
|
|
|
31
|
1 |
|
def paice_husk(word):
    """Return Paice-Husk stem.

    Implementation of the Paice-Husk Stemmer, also known as the Lancaster
    Stemmer, developed by Chris Paice, with the assistance of Gareth Husk.

    This is based on the algorithm's description in :cite:`Paice:1990`.

    The stemmer repeatedly looks at the last 6, 5, ..., 1 characters of the
    word and applies the first matching rule whose result is an acceptable
    stem. It stops when an applied rule requests termination or when no rule
    can be applied any more.

    :param str word: the word to stem
    :returns: the stemmed word
    :rtype: str

    >>> paice_husk('assumption')
    'assum'
    >>> paice_husk('verifiable')
    'ver'
    >>> paice_husk('fancies')
    'fant'
    >>> paice_husk('fanciful')
    'fancy'
    >>> paice_husk('torment')
    'tor'
    """
    # Rules are keyed first by suffix length, then by the suffix itself.
    # Each rule is a 4-tuple (only_intact, del_len, add_str, set_terminate):
    #   only_intact   -- apply only while the word is still unmodified
    #   del_len       -- number of trailing characters to delete
    #   add_str       -- string to append after deletion (None for nothing)
    #   set_terminate -- stop stemming once this rule has been applied
    # A suffix may instead map to a tuple of such rules, tried in order.
    rule_table = {
        6: {
            'ifiabl': (False, 6, None, True),
            'plicat': (False, 4, 'y', True),
        },
        5: {
            'guish': (False, 5, 'ct', True),
            'sumpt': (False, 2, None, True),
            'istry': (False, 5, None, True),
        },
        4: {
            'ytic': (False, 3, 's', True),
            'ceed': (False, 2, 'ss', True),
            'hood': (False, 4, None, False),
            'lief': (False, 1, 'v', True),
            'verj': (False, 1, 't', True),
            'misj': (False, 2, 't', True),
            'iabl': (False, 4, 'y', True),
            'iful': (False, 4, 'y', True),
            'sion': (False, 4, 'j', False),
            'xion': (False, 4, 'ct', True),
            'ship': (False, 4, None, False),
            'ness': (False, 4, None, False),
            'ment': (False, 4, None, False),
            'ript': (False, 2, 'b', True),
            'orpt': (False, 2, 'b', True),
            'duct': (False, 1, None, True),
            'cept': (False, 2, 'iv', True),
            'olut': (False, 2, 'v', True),
            'sist': (False, 0, None, True),
        },
        3: {
            'ied': (False, 3, 'y', False),
            'eed': (False, 1, None, True),
            'ing': (False, 3, None, False),
            'iag': (False, 3, 'y', True),
            'ish': (False, 3, None, False),
            'fuj': (False, 1, 's', True),
            'hej': (False, 1, 'r', True),
            'abl': (False, 3, None, False),
            'ibl': (False, 3, None, True),
            'bil': (False, 2, 'l', False),
            'ful': (False, 3, None, False),
            'ial': (False, 3, None, False),
            'ual': (False, 3, None, False),
            'ium': (False, 3, None, True),
            'ism': (False, 3, None, False),
            'ion': (False, 3, None, False),
            'ian': (False, 3, None, False),
            'een': (False, 0, None, True),
            'ear': (False, 0, None, True),
            'ier': (False, 3, 'y', False),
            'ies': (False, 3, 'y', False),
            'sis': (False, 2, None, True),
            'ous': (False, 3, None, False),
            'ent': (False, 3, None, False),
            'ant': (False, 3, None, False),
            'ist': (False, 3, None, False),
            'iqu': (False, 3, None, True),
            'ogu': (False, 1, None, True),
            'siv': (False, 3, 'j', False),
            'eiv': (False, 0, None, True),
            'bly': (False, 1, None, False),
            'ily': (False, 3, 'y', False),
            'ply': (False, 0, None, True),
            'ogy': (False, 1, None, True),
            'phy': (False, 1, None, True),
            'omy': (False, 1, None, True),
            'opy': (False, 1, None, True),
            'ity': (False, 3, None, False),
            'ety': (False, 3, None, False),
            'lty': (False, 2, None, True),
            'ary': (False, 3, None, False),
            'ory': (False, 3, None, False),
            'ify': (False, 3, None, True),
            'ncy': (False, 2, 't', False),
            'acy': (False, 3, None, False),
        },
        2: {
            'ia': (True, 2, None, True),
            'bb': (False, 1, None, True),
            'ic': (False, 2, None, False),
            'nc': (False, 1, 't', False),
            'dd': (False, 1, None, True),
            'ed': (False, 2, None, False),
            'if': (False, 2, None, False),
            'ag': (False, 2, None, False),
            'gg': (False, 1, None, True),
            'th': (True, 2, None, True),
            'ij': (False, 1, 'd', True),
            'uj': (False, 1, 'd', True),
            'oj': (False, 1, 'd', True),
            'nj': (False, 1, 'd', True),
            'cl': (False, 1, None, True),
            'ul': (False, 2, None, True),
            'al': (False, 2, None, False),
            'll': (False, 1, None, True),
            'um': (True, 2, None, True),
            'mm': (False, 1, None, True),
            'an': (False, 2, None, False),
            'en': (False, 2, None, False),
            'nn': (False, 1, None, True),
            'pp': (False, 1, None, True),
            'er': (False, 2, None, False),
            'ar': (False, 2, None, True),
            'or': (False, 2, None, False),
            'ur': (False, 2, None, False),
            'rr': (False, 1, None, True),
            'tr': (False, 1, None, False),
            'is': (False, 2, None, False),
            'ss': (False, 0, None, True),
            'us': (True, 2, None, True),
            'at': (False, 2, None, False),
            'tt': (False, 1, None, True),
            'iv': (False, 2, None, False),
            'ly': (False, 2, None, False),
            'iz': (False, 2, None, False),
            'yz': (False, 1, 's', True),
        },
        1: {
            'a': (True, 1, None, True),
            'e': (False, 1, None, False),
            'i': ((True, 1, None, True), (False, 1, 'y', False)),
            'j': (False, 1, 's', True),
            's': ((True, 1, None, False), (False, 0, None, True)),
        },
    }

    strict_vowels = {'a', 'e', 'i', 'o', 'u'}
    # 'y' counts as a vowel only when checking the remainder of a stem.
    loose_vowels = strict_vowels | {'y'}

    def _has_vowel(term):
        """Return True if any character of term is a vowel (or 'y')."""
        return any(char in loose_vowels for char in term)

    def _acceptable(term):
        """Return True if term is a valid stem.

        A stem starting with a vowel needs at least two letters; any other
        stem needs at least three letters and a vowel (or 'y') after the
        first letter.
        """
        if term and term[0] in strict_vowels:
            return len(term) > 1
        return len(term) > 2 and _has_vowel(term[1:])

    def _apply_rule(term, rule, intact):
        """Try one rule; return (word, accepted, intact, terminate).

        On rejection the word and the intact flag are passed back unchanged
        and terminate is False, so the caller keeps trying other rules.
        """
        only_intact, del_len, add_str, set_terminate = rule

        # Intact-only rules never fire once the word has been modified.
        if only_intact and not intact:
            return term, False, intact, False

        candidate = term[:-del_len] if del_len else term
        if add_str:
            candidate += add_str

        if _acceptable(candidate):
            return candidate, True, False, set_terminate
        return term, False, intact, False

    terminate = False
    intact = True
    while not terminate:
        # Longest suffixes are tried first.
        for n in range(6, 0, -1):
            entry = rule_table[n].get(word[-n:])
            if entry is None:
                continue

            # Normalize a single rule to a one-element sequence of rules.
            rules = entry if isinstance(entry[0], tuple) else (entry,)

            accept = False
            for rule in rules:
                word, accept, intact, terminate = _apply_rule(
                    word, rule, intact
                )
                if accept:
                    break

            if accept:
                # Restart suffix matching on the shortened word.
                break
        else:
            # No rule could be applied at any suffix length: done.
            break

    return word
233
|
|
|
|
234
|
|
|
|
235
|
|
|
if __name__ == '__main__':
    # Running this module as a script executes the doctests it contains.
    from doctest import testmod

    testmod()
239
|
|
|
|