Completed
Branch master (78a222)
by Chris
14:36
created

abydos.stemmer._paice_husk   A

Complexity

Total Complexity 19

Size/Duplication

Total Lines 239
Duplicated Lines 0 %

Test Coverage

Coverage 100%

Importance

Changes 0
Metric Value
wmc 19
eloc 167
dl 0
loc 239
ccs 44
cts 44
cp 1
rs 10
c 0
b 0
f 0

1 Function

Rating   Name   Duplication   Size   Complexity  
F paice_husk() 0 202 19
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19 1
"""abydos.stemmer._paice_husk.
20
21
The stemmer._paice_husk module defines the Paice-Husk Stemmer.
22
"""
23
24 1
from __future__ import unicode_literals
25
26 1
from six.moves import range
27
28 1
__all__ = ['paice_husk']
29
30
31 1
# Letters that count as vowels when scanning the tail of a candidate stem
# ('y' is included here).
_PAICE_HUSK_VOWELS = frozenset('aeiouy')

# Letters that count as vowels in stem-initial position ('y' is excluded).
_PAICE_HUSK_INITIAL_VOWELS = frozenset('aeiou')

# Rule table, keyed first by suffix length and then by the suffix itself.
# A rule is a 4-tuple ``(only_intact, del_len, add_str, terminate)``:
#   only_intact -- the rule may fire only while the word is still unmodified
#   del_len     -- number of trailing characters to remove
#   add_str     -- string appended after removal (None for nothing)
#   terminate   -- stop stemming once this rule has been applied
# A suffix maps either to a single rule or to a tuple of rules that are tried
# in order (see the entries for 'i' and 's').
_PAICE_HUSK_RULES = {
    6: {'ifiabl': (False, 6, None, True), 'plicat': (False, 4, 'y', True)},
    5: {
        'guish': (False, 5, 'ct', True),
        'sumpt': (False, 2, None, True),
        'istry': (False, 5, None, True),
    },
    4: {
        'ytic': (False, 3, 's', True),
        'ceed': (False, 2, 'ss', True),
        'hood': (False, 4, None, False),
        'lief': (False, 1, 'v', True),
        'verj': (False, 1, 't', True),
        'misj': (False, 2, 't', True),
        'iabl': (False, 4, 'y', True),
        'iful': (False, 4, 'y', True),
        'sion': (False, 4, 'j', False),
        'xion': (False, 4, 'ct', True),
        'ship': (False, 4, None, False),
        'ness': (False, 4, None, False),
        'ment': (False, 4, None, False),
        'ript': (False, 2, 'b', True),
        'orpt': (False, 2, 'b', True),
        'duct': (False, 1, None, True),
        'cept': (False, 2, 'iv', True),
        'olut': (False, 2, 'v', True),
        'sist': (False, 0, None, True),
    },
    3: {
        'ied': (False, 3, 'y', False),
        'eed': (False, 1, None, True),
        'ing': (False, 3, None, False),
        'iag': (False, 3, 'y', True),
        'ish': (False, 3, None, False),
        'fuj': (False, 1, 's', True),
        'hej': (False, 1, 'r', True),
        'abl': (False, 3, None, False),
        'ibl': (False, 3, None, True),
        'bil': (False, 2, 'l', False),
        'ful': (False, 3, None, False),
        'ial': (False, 3, None, False),
        'ual': (False, 3, None, False),
        'ium': (False, 3, None, True),
        'ism': (False, 3, None, False),
        'ion': (False, 3, None, False),
        'ian': (False, 3, None, False),
        'een': (False, 0, None, True),
        'ear': (False, 0, None, True),
        'ier': (False, 3, 'y', False),
        'ies': (False, 3, 'y', False),
        'sis': (False, 2, None, True),
        'ous': (False, 3, None, False),
        'ent': (False, 3, None, False),
        'ant': (False, 3, None, False),
        'ist': (False, 3, None, False),
        'iqu': (False, 3, None, True),
        'ogu': (False, 1, None, True),
        'siv': (False, 3, 'j', False),
        'eiv': (False, 0, None, True),
        'bly': (False, 1, None, False),
        'ily': (False, 3, 'y', False),
        'ply': (False, 0, None, True),
        'ogy': (False, 1, None, True),
        'phy': (False, 1, None, True),
        'omy': (False, 1, None, True),
        'opy': (False, 1, None, True),
        'ity': (False, 3, None, False),
        'ety': (False, 3, None, False),
        'lty': (False, 2, None, True),
        'ary': (False, 3, None, False),
        'ory': (False, 3, None, False),
        'ify': (False, 3, None, True),
        'ncy': (False, 2, 't', False),
        'acy': (False, 3, None, False),
    },
    2: {
        'ia': (True, 2, None, True),
        'bb': (False, 1, None, True),
        'ic': (False, 2, None, False),
        'nc': (False, 1, 't', False),
        'dd': (False, 1, None, True),
        'ed': (False, 2, None, False),
        'if': (False, 2, None, False),
        'ag': (False, 2, None, False),
        'gg': (False, 1, None, True),
        'th': (True, 2, None, True),
        'ij': (False, 1, 'd', True),
        'uj': (False, 1, 'd', True),
        'oj': (False, 1, 'd', True),
        'nj': (False, 1, 'd', True),
        'cl': (False, 1, None, True),
        'ul': (False, 2, None, True),
        'al': (False, 2, None, False),
        'll': (False, 1, None, True),
        'um': (True, 2, None, True),
        'mm': (False, 1, None, True),
        'an': (False, 2, None, False),
        'en': (False, 2, None, False),
        'nn': (False, 1, None, True),
        'pp': (False, 1, None, True),
        'er': (False, 2, None, False),
        'ar': (False, 2, None, True),
        'or': (False, 2, None, False),
        'ur': (False, 2, None, False),
        'rr': (False, 1, None, True),
        'tr': (False, 1, None, False),
        'is': (False, 2, None, False),
        'ss': (False, 0, None, True),
        'us': (True, 2, None, True),
        'at': (False, 2, None, False),
        'tt': (False, 1, None, True),
        'iv': (False, 2, None, False),
        'ly': (False, 2, None, False),
        'iz': (False, 2, None, False),
        'yz': (False, 1, 's', True),
    },
    1: {
        'a': (True, 1, None, True),
        'e': (False, 1, None, False),
        'i': ((True, 1, None, True), (False, 1, 'y', False)),
        'j': (False, 1, 's', True),
        's': ((True, 1, None, False), (False, 0, None, True)),
    },
}


def _paice_husk_acceptable(stem):
    """Return True if ``stem`` is an acceptable result of applying a rule.

    A stem starting with one of a/e/i/o/u must be at least two characters
    long; any other stem must be at least three characters long and contain
    a vowel (a/e/i/o/u/y) after its first character.
    """
    if stem and stem[0] in _PAICE_HUSK_INITIAL_VOWELS:
        return len(stem) > 1
    return len(stem) > 2 and any(
        char in _PAICE_HUSK_VOWELS for char in stem[1:]
    )


def _paice_husk_step(word, intact):
    """Apply the first rule that yields an acceptable stem.

    Suffixes are tried from longest (6) to shortest (1). Returns a
    ``(stem, terminate)`` pair for the first accepted rule, or None when no
    rule both matches ``word`` and produces an acceptable stem.
    """
    for size in range(6, 0, -1):
        entry = _PAICE_HUSK_RULES[size].get(word[-size:])
        if entry is None:
            continue
        # Most suffixes map to one rule; a few map to a tuple of rules.
        rules = entry if isinstance(entry[0], tuple) else (entry,)
        for only_intact, del_len, add_str, terminate in rules:
            if only_intact and not intact:
                continue
            stem = word[:-del_len] if del_len else word
            if add_str:
                stem += add_str
            if _paice_husk_acceptable(stem):
                return stem, terminate
    return None


def paice_husk(word):
    """Return Paice-Husk stem.

    Implementation of the Paice-Husk Stemmer, also known as the Lancaster
    Stemmer, developed by Chris Paice, with the assistance of Gareth Husk.

    This is based on the algorithm's description in :cite:`Paice:1990`.

    Rules are applied repeatedly: after each accepted rule the scan restarts
    from the longest suffix, until an accepted rule is marked as terminating
    or no rule yields an acceptable stem. The input is matched as given; no
    case normalization is performed.

    :param str word: the word to stem
    :returns: the stemmed word
    :rtype: str

    >>> paice_husk('assumption')
    'assum'
    >>> paice_husk('verifiable')
    'ver'
    >>> paice_husk('fancies')
    'fant'
    >>> paice_husk('fanciful')
    'fancy'
    >>> paice_husk('torment')
    'tor'
    """
    intact = True
    terminate = False
    while not terminate:
        step = _paice_husk_step(word, intact)
        if step is None:
            break
        word, terminate = step
        # Once any rule has fired, "intact-only" rules no longer apply.
        intact = False
    return word
233
234
235
if __name__ == '__main__':
    # Run the module's doctests when executed directly as a script.
    import doctest

    doctest.testmod()
239