Passed
Push — master ( 416c2f...9ec382 )
by Chris
01:03 queued 13s
created

abydos.stemmer._paice_husk.PaiceHusk._apply_rule()   B

Complexity

Conditions 7

Size

Total Lines 23
Code Lines 17

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 17
CRAP Score 7

Importance

Changes 0
Metric Value
eloc 17
dl 0
loc 23
ccs 17
cts 17
cp 1
rs 8
c 0
b 0
f 0
cc 7
nop 5
crap 7
# Copyright 2018-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
16
17
"""abydos.stemmer._paice_husk.

Paice-Husk Stemmer
"""
21
22
from typing import Dict, Optional, Tuple
23
24 1
from ._stemmer import _Stemmer
25
26
__all__ = ['PaiceHusk']
27
28
29
class PaiceHusk(_Stemmer):
    """Paice-Husk stemmer.

    Implementation of the Paice-Husk Stemmer, also known as the Lancaster
    Stemmer, developed by Chris Paice, with the assistance of Gareth Husk.

    This is based on the algorithm's description in :cite:`Paice:1990`.

    .. versionadded:: 0.3.6
    """

    # Rules are keyed first by suffix length and then by the suffix itself.
    # Each suffix maps to a tuple of rules that are tried in order; a rule is
    # the 4-tuple ``(only_intact, del_len, add_str, set_terminate)``:
    #
    #   only_intact   -- apply the rule only if the word is still intact,
    #                    i.e. no earlier rule has been accepted for it
    #   del_len       -- number of trailing characters to remove (0 = none)
    #   add_str       -- string to append after removal (None = nothing)
    #   set_terminate -- whether stemming stops once this rule is accepted
    _rule_table = {
        6: {
            'ifiabl': ((False, 6, None, True),),
            'plicat': ((False, 4, 'y', True),),
        },
        5: {
            'guish': ((False, 5, 'ct', True),),
            'sumpt': ((False, 2, None, True),),
            'istry': ((False, 5, None, True),),
        },
        4: {
            'ytic': ((False, 3, 's', True),),
            'ceed': ((False, 2, 'ss', True),),
            'hood': ((False, 4, None, False),),
            'lief': ((False, 1, 'v', True),),
            'verj': ((False, 1, 't', True),),
            'misj': ((False, 2, 't', True),),
            'iabl': ((False, 4, 'y', True),),
            'iful': ((False, 4, 'y', True),),
            'sion': ((False, 4, 'j', False),),
            'xion': ((False, 4, 'ct', True),),
            'ship': ((False, 4, None, False),),
            'ness': ((False, 4, None, False),),
            'ment': ((False, 4, None, False),),
            'ript': ((False, 2, 'b', True),),
            'orpt': ((False, 2, 'b', True),),
            'duct': ((False, 1, None, True),),
            'cept': ((False, 2, 'iv', True),),
            'olut': ((False, 2, 'v', True),),
            'sist': ((False, 0, None, True),),
        },
        3: {
            'ied': ((False, 3, 'y', False),),
            'eed': ((False, 1, None, True),),
            'ing': ((False, 3, None, False),),
            'iag': ((False, 3, 'y', True),),
            'ish': ((False, 3, None, False),),
            'fuj': ((False, 1, 's', True),),
            'hej': ((False, 1, 'r', True),),
            'abl': ((False, 3, None, False),),
            'ibl': ((False, 3, None, True),),
            'bil': ((False, 2, 'l', False),),
            'ful': ((False, 3, None, False),),
            'ial': ((False, 3, None, False),),
            'ual': ((False, 3, None, False),),
            'ium': ((False, 3, None, True),),
            'ism': ((False, 3, None, False),),
            'ion': ((False, 3, None, False),),
            'ian': ((False, 3, None, False),),
            'een': ((False, 0, None, True),),
            'ear': ((False, 0, None, True),),
            'ier': ((False, 3, 'y', False),),
            'ies': ((False, 3, 'y', False),),
            'sis': ((False, 2, None, True),),
            'ous': ((False, 3, None, False),),
            'ent': ((False, 3, None, False),),
            'ant': ((False, 3, None, False),),
            'ist': ((False, 3, None, False),),
            'iqu': ((False, 3, None, True),),
            'ogu': ((False, 1, None, True),),
            'siv': ((False, 3, 'j', False),),
            'eiv': ((False, 0, None, True),),
            'bly': ((False, 1, None, False),),
            'ily': ((False, 3, 'y', False),),
            'ply': ((False, 0, None, True),),
            'ogy': ((False, 1, None, True),),
            'phy': ((False, 1, None, True),),
            'omy': ((False, 1, None, True),),
            'opy': ((False, 1, None, True),),
            'ity': ((False, 3, None, False),),
            'ety': ((False, 3, None, False),),
            'lty': ((False, 2, None, True),),
            'ary': ((False, 3, None, False),),
            'ory': ((False, 3, None, False),),
            'ify': ((False, 3, None, True),),
            'ncy': ((False, 2, 't', False),),
            'acy': ((False, 3, None, False),),
        },
        2: {
            'ia': ((True, 2, None, True),),
            'bb': ((False, 1, None, True),),
            'ic': ((False, 2, None, False),),
            'nc': ((False, 1, 't', False),),
            'dd': ((False, 1, None, True),),
            'ed': ((False, 2, None, False),),
            'if': ((False, 2, None, False),),
            'ag': ((False, 2, None, False),),
            'gg': ((False, 1, None, True),),
            'th': ((True, 2, None, True),),
            'ij': ((False, 1, 'd', True),),
            'uj': ((False, 1, 'd', True),),
            'oj': ((False, 1, 'd', True),),
            'nj': ((False, 1, 'd', True),),
            'cl': ((False, 1, None, True),),
            'ul': ((False, 2, None, True),),
            'al': ((False, 2, None, False),),
            'll': ((False, 1, None, True),),
            'um': ((True, 2, None, True),),
            'mm': ((False, 1, None, True),),
            'an': ((False, 2, None, False),),
            'en': ((False, 2, None, False),),
            'nn': ((False, 1, None, True),),
            'pp': ((False, 1, None, True),),
            'er': ((False, 2, None, False),),
            'ar': ((False, 2, None, True),),
            'or': ((False, 2, None, False),),
            'ur': ((False, 2, None, False),),
            'rr': ((False, 1, None, True),),
            'tr': ((False, 1, None, False),),
            'is': ((False, 2, None, False),),
            'ss': ((False, 0, None, True),),
            'us': ((True, 2, None, True),),
            'at': ((False, 2, None, False),),
            'tt': ((False, 1, None, True),),
            'iv': ((False, 2, None, False),),
            'ly': ((False, 2, None, False),),
            'iz': ((False, 2, None, False),),
            'yz': ((False, 1, 's', True),),
        },
        1: {
            'a': ((True, 1, None, True),),
            'e': ((False, 1, None, False),),
            'i': ((True, 1, None, True), (False, 1, 'y', False)),
            'j': ((False, 1, 's', True),),
            's': ((True, 1, None, False), (False, 0, None, True)),
        },
    }  # type: Dict[int, Dict[str, Tuple[Tuple[bool, int, Optional[str], bool], ...]]]  # noqa: E501

    def _has_vowel(self, word: str) -> bool:
        """Return True if word contains any of a, e, i, o, u, or y.

        Parameters
        ----------
        word : str
            The string to inspect

        Returns
        -------
        bool
            True if at least one character of word is a, e, i, o, u, or y

        """
        return any(char in {'a', 'e', 'i', 'o', 'u', 'y'} for char in word)

    def _acceptable(self, word: str) -> bool:
        """Return True if word is an acceptable stem.

        A candidate beginning with a, e, i, o, or u must be longer than one
        character. Any other candidate must be longer than two characters
        and contain a, e, i, o, u, or y somewhere after its first character.

        Parameters
        ----------
        word : str
            The candidate stem

        Returns
        -------
        bool
            True if the candidate satisfies the conditions above

        """
        if word and word[0] in {'a', 'e', 'i', 'o', 'u'}:
            return len(word) > 1
        return len(word) > 2 and self._has_vowel(word[1:])

    def _apply_rule(
        self,
        word: str,
        rule: Tuple[bool, int, Optional[str], bool],
        intact: bool,
        terminate: bool,
    ) -> Tuple[str, bool, bool, bool]:
        """Try a single rule against word.

        Parameters
        ----------
        word : str
            The current form of the word being stemmed
        rule : tuple
            A rule of the form
            ``(only_intact, del_len, add_str, set_terminate)``
        intact : bool
            Whether word is still intact (no rule accepted for it so far)
        terminate : bool
            The current termination flag, returned unchanged if the rule is
            not accepted

        Returns
        -------
        tuple
            ``(word, accepted, intact, terminate)``. If the rule is accepted,
            this is the rewritten word, True, False, and the rule's
            termination flag. Otherwise it is the unmodified word, False, and
            the intact and terminate values that were passed in.

        """
        only_intact, del_len, add_str, set_terminate = rule

        # Intact-only rules are skipped once the word has been altered.
        if only_intact and not intact:
            return word, False, intact, terminate

        candidate = word
        if del_len:
            candidate = candidate[:-del_len]
        if add_str:
            candidate += add_str

        if self._acceptable(candidate):
            return candidate, True, False, set_terminate
        # The rewritten form is not a valid stem; leave the word untouched.
        return word, False, intact, terminate

    def stem(self, word: str) -> str:
        """Return Paice-Husk stem.

        Parameters
        ----------
        word : str
            The word to stem

        Returns
        -------
        str
            Word stem

        Examples
        --------
        >>> stmr = PaiceHusk()
        >>> stmr.stem('assumption')
        'assum'
        >>> stmr.stem('verifiable')
        'ver'
        >>> stmr.stem('fancies')
        'fant'
        >>> stmr.stem('fanciful')
        'fancy'
        >>> stmr.stem('torment')
        'tor'


        .. versionadded:: 0.3.0
        .. versionchanged:: 0.3.6
            Encapsulated in class

        """
        terminate = False
        intact = True
        while not terminate:
            # Try the longest suffixes first.
            for n in range(6, 0, -1):
                rules = self._rule_table[n].get(word[-n:])
                if rules is None:
                    continue

                accept = False
                for rule in rules:
                    word, accept, intact, terminate = self._apply_rule(
                        word, rule, intact, terminate
                    )
                    if accept:
                        break

                if accept:
                    # Restart matching on the rewritten word.
                    break
            else:
                # No rule was accepted for any suffix length: nothing more
                # can be stripped.
                break

        return word
255
256 1
257 1
if __name__ == '__main__':
    # When this file is executed directly, run the doctest examples embedded
    # in its docstrings.
    import doctest

    doctest.testmod()
261