# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.stemmer._paice_husk.

Paice-Husk Stemmer
"""

from typing import Dict, Optional, Tuple

from ._stemmer import _Stemmer

# Public API of this module.
__all__ = ['PaiceHusk']


class PaiceHusk(_Stemmer):
    """Paice-Husk stemmer.

    Implementation of the Paice-Husk Stemmer, also known as the Lancaster
    Stemmer, developed by Chris Paice, with the assistance of Gareth Husk.

    This is based on the algorithm's description in :cite:`Paice:1990`.

    .. versionadded:: 0.3.6
    """

    # Rules are keyed first by suffix length and then by the suffix itself.
    # Each suffix maps to a tuple of candidate rules, tried in order; every
    # rule is a 4-tuple ``(only_intact, del_len, add_str, set_terminate)``:
    #   only_intact   -- apply only while the word has not yet been changed
    #   del_len       -- number of trailing characters to remove
    #   add_str       -- string appended after the removal (or None)
    #   set_terminate -- whether stemming stops once this rule is accepted
    _rule_table = {
        6: {
            'ifiabl': ((False, 6, None, True),),
            'plicat': ((False, 4, 'y', True),),
        },
        5: {
            'guish': ((False, 5, 'ct', True),),
            'sumpt': ((False, 2, None, True),),
            'istry': ((False, 5, None, True),),
        },
        4: {
            'ytic': ((False, 3, 's', True),),
            'ceed': ((False, 2, 'ss', True),),
            'hood': ((False, 4, None, False),),
            'lief': ((False, 1, 'v', True),),
            'verj': ((False, 1, 't', True),),
            'misj': ((False, 2, 't', True),),
            'iabl': ((False, 4, 'y', True),),
            'iful': ((False, 4, 'y', True),),
            'sion': ((False, 4, 'j', False),),
            'xion': ((False, 4, 'ct', True),),
            'ship': ((False, 4, None, False),),
            'ness': ((False, 4, None, False),),
            'ment': ((False, 4, None, False),),
            'ript': ((False, 2, 'b', True),),
            'orpt': ((False, 2, 'b', True),),
            'duct': ((False, 1, None, True),),
            'cept': ((False, 2, 'iv', True),),
            'olut': ((False, 2, 'v', True),),
            'sist': ((False, 0, None, True),),
        },
        3: {
            'ied': ((False, 3, 'y', False),),
            'eed': ((False, 1, None, True),),
            'ing': ((False, 3, None, False),),
            'iag': ((False, 3, 'y', True),),
            'ish': ((False, 3, None, False),),
            'fuj': ((False, 1, 's', True),),
            'hej': ((False, 1, 'r', True),),
            'abl': ((False, 3, None, False),),
            'ibl': ((False, 3, None, True),),
            'bil': ((False, 2, 'l', False),),
            'ful': ((False, 3, None, False),),
            'ial': ((False, 3, None, False),),
            'ual': ((False, 3, None, False),),
            'ium': ((False, 3, None, True),),
            'ism': ((False, 3, None, False),),
            'ion': ((False, 3, None, False),),
            'ian': ((False, 3, None, False),),
            'een': ((False, 0, None, True),),
            'ear': ((False, 0, None, True),),
            'ier': ((False, 3, 'y', False),),
            'ies': ((False, 3, 'y', False),),
            'sis': ((False, 2, None, True),),
            'ous': ((False, 3, None, False),),
            'ent': ((False, 3, None, False),),
            'ant': ((False, 3, None, False),),
            'ist': ((False, 3, None, False),),
            'iqu': ((False, 3, None, True),),
            'ogu': ((False, 1, None, True),),
            'siv': ((False, 3, 'j', False),),
            'eiv': ((False, 0, None, True),),
            'bly': ((False, 1, None, False),),
            'ily': ((False, 3, 'y', False),),
            'ply': ((False, 0, None, True),),
            'ogy': ((False, 1, None, True),),
            'phy': ((False, 1, None, True),),
            'omy': ((False, 1, None, True),),
            'opy': ((False, 1, None, True),),
            'ity': ((False, 3, None, False),),
            'ety': ((False, 3, None, False),),
            'lty': ((False, 2, None, True),),
            'ary': ((False, 3, None, False),),
            'ory': ((False, 3, None, False),),
            'ify': ((False, 3, None, True),),
            'ncy': ((False, 2, 't', False),),
            'acy': ((False, 3, None, False),),
        },
        2: {
            'ia': ((True, 2, None, True),),
            'bb': ((False, 1, None, True),),
            'ic': ((False, 2, None, False),),
            'nc': ((False, 1, 't', False),),
            'dd': ((False, 1, None, True),),
            'ed': ((False, 2, None, False),),
            'if': ((False, 2, None, False),),
            'ag': ((False, 2, None, False),),
            'gg': ((False, 1, None, True),),
            'th': ((True, 2, None, True),),
            'ij': ((False, 1, 'd', True),),
            'uj': ((False, 1, 'd', True),),
            'oj': ((False, 1, 'd', True),),
            'nj': ((False, 1, 'd', True),),
            'cl': ((False, 1, None, True),),
            'ul': ((False, 2, None, True),),
            'al': ((False, 2, None, False),),
            'll': ((False, 1, None, True),),
            'um': ((True, 2, None, True),),
            'mm': ((False, 1, None, True),),
            'an': ((False, 2, None, False),),
            'en': ((False, 2, None, False),),
            'nn': ((False, 1, None, True),),
            'pp': ((False, 1, None, True),),
            'er': ((False, 2, None, False),),
            'ar': ((False, 2, None, True),),
            'or': ((False, 2, None, False),),
            'ur': ((False, 2, None, False),),
            'rr': ((False, 1, None, True),),
            'tr': ((False, 1, None, False),),
            'is': ((False, 2, None, False),),
            'ss': ((False, 0, None, True),),
            'us': ((True, 2, None, True),),
            'at': ((False, 2, None, False),),
            'tt': ((False, 1, None, True),),
            'iv': ((False, 2, None, False),),
            'ly': ((False, 2, None, False),),
            'iz': ((False, 2, None, False),),
            'yz': ((False, 1, 's', True),),
        },
        1: {
            'a': ((True, 1, None, True),),
            'e': ((False, 1, None, False),),
            'i': ((True, 1, None, True), (False, 1, 'y', False)),
            'j': ((False, 1, 's', True),),
            's': ((True, 1, None, False), (False, 0, None, True)),
        },
    }  # type: Dict[int, Dict[str, Tuple[Tuple[bool, int, Optional[str], bool], ...]]]  # noqa: E501

    def _has_vowel(self, word: str) -> bool:
        """Return True if word contains a vowel, counting 'y' as a vowel.

        Parameters
        ----------
        word : str
            The string to inspect

        Returns
        -------
        bool
            True if any character of word is one of a, e, i, o, u, or y

        """
        return any(char in {'a', 'e', 'i', 'o', 'u', 'y'} for char in word)

    def _acceptable(self, word: str) -> bool:
        """Return True if word is an acceptable stem.

        A candidate that starts with a, e, i, o, or u must be longer than one
        character. Any other candidate must be longer than two characters
        and must contain a vowel (or 'y') after its first character.

        Parameters
        ----------
        word : str
            The candidate stem

        Returns
        -------
        bool
            True if the candidate stem is acceptable

        """
        if word and word[0] in {'a', 'e', 'i', 'o', 'u'}:
            return len(word) > 1
        return len(word) > 2 and self._has_vowel(word[1:])

    def _apply_rule(
        self,
        word: str,
        rule: Tuple[bool, int, Optional[str], bool],
        intact: bool,
        terminate: bool,
    ) -> Tuple[str, bool, bool, bool]:
        """Try a single rule on word and report the outcome.

        Parameters
        ----------
        word : str
            The word in its current state
        rule : tuple
            A rule of the form
            ``(only_intact, del_len, add_str, set_terminate)``
        intact : bool
            Whether the word is still unmodified by any earlier rule
        terminate : bool
            The current value of the termination flag

        Returns
        -------
        tuple
            ``(word, accepted, intact, terminate)``. If the rule is accepted,
            word is the rewritten word, intact is False, and terminate is
            the rule's own terminate flag. Otherwise word, intact, and
            terminate are returned as they were passed in.

        """
        only_intact, del_len, add_str, set_terminate = rule

        # Intact-only rules are skipped once the word has been modified.
        if only_intact and not intact:
            return word, False, intact, terminate

        candidate = word
        if del_len:
            candidate = candidate[:-del_len]
        if add_str:
            candidate += add_str

        if self._acceptable(candidate):
            return candidate, True, False, set_terminate
        # The rewritten form is not a valid stem, so the rule is rejected
        # and the word is left as it was.
        return word, False, intact, terminate

    def stem(self, word: str) -> str:
        """Return Paice-Husk stem.

        Parameters
        ----------
        word : str
            The word to stem

        Returns
        -------
        str
            Word stem

        Examples
        --------
        >>> stmr = PaiceHusk()
        >>> stmr.stem('assumption')
        'assum'
        >>> stmr.stem('verifiable')
        'ver'
        >>> stmr.stem('fancies')
        'fant'
        >>> stmr.stem('fanciful')
        'fancy'
        >>> stmr.stem('torment')
        'tor'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        terminate = False
        intact = True
        while not terminate:
            # Longest suffixes are tried first.
            for n in range(6, 0, -1):
                rules = self._rule_table[n].get(word[-n:])
                if rules is None:
                    continue

                accept = False
                for rule in rules:
                    word, accept, intact, terminate = self._apply_rule(
                        word, rule, intact, terminate
                    )
                    if accept:
                        break

                if accept:
                    # Restart the suffix search on the rewritten word
                    # (unless the accepted rule set the terminate flag).
                    break
            else:
                # No rule of any length was accepted, so stemming is done.
                break

        return word


# Run the module's doctests when it is executed as a script.
if __name__ == '__main__':
    import doctest

    doctest.testmod()