# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.tests.test_stemmer.

This module contains unit tests for abydos.stemmer
"""
|
23
|
|
|
|
|
24
|
|
|
from __future__ import unicode_literals |
|
25
|
|
|
|
|
26
|
|
|
import codecs |
|
27
|
|
|
import os |
|
28
|
|
|
import unittest |
|
29
|
|
|
|
|
30
|
|
|
from abydos.stemmer import _ends_in_cvc, _ends_in_doubled_cons, _m_degree, \ |
|
31
|
|
|
_sb_ends_in_short_syllable, _sb_has_vowel, _sb_r1, _sb_r2, \ |
|
32
|
|
|
_sb_short_word, caumanns, clef_german, clef_german_plus, clef_swedish, \ |
|
33
|
|
|
lovins, paice_husk, porter, porter2, sb_danish, sb_dutch, sb_german, \ |
|
34
|
|
|
sb_norwegian, sb_swedish, uealite |
|
35
|
|
|
|
|
36
|
|
|
TESTDIR = os.path.dirname(__file__) |
|
37
|
|
|
|
|
38
|
|
|
|
|
39
|
|
|
class LovinsTestCases(unittest.TestCase):
    """Unit tests for the Lovins stemmer.

    Covers abydos.stemmer.lovins
    """

    def test_lovins(self):
        """Check abydos.stemmer.lovins against hand-picked examples."""
        # the empty string is expected to stem to itself
        self.assertEqual(lovins(''), '')

        # (word, expected stem) pairs from Lovins' "Development of a
        # Stemming Algorithm":
        # http://www.mt-archive.info/MT-1968-Lovins.pdf
        paper_examples = (
            ('magnesia', 'magnes'),
            ('magnesite', 'magnes'),
            ('magnesian', 'magnes'),
            ('magnesium', 'magnes'),
            ('magnet', 'magnet'),
            ('magnetic', 'magnet'),
            ('magneto', 'magnet'),
            ('magnetically', 'magnet'),
            ('magnetism', 'magnet'),
            ('magnetite', 'magnet'),
            ('magnetitic', 'magnet'),
            ('magnetizable', 'magnet'),
            ('magnetization', 'magnet'),
            ('magnetize', 'magnet'),
            ('magnetometer', 'magnetometer'),
            ('magnetometric', 'magnetometer'),
            ('magnetometry', 'magnetometer'),
            ('magnetomotive', 'magnetomot'),
            ('magnetron', 'magnetron'),
            ('metal', 'metal'),
            ('metall', 'metal'),
            ('metallically', 'metal'),
            ('metalliferous', 'metallifer'),
            ('metallize', 'metal'),
            ('metallurgical', 'metallurg'),
            ('metallurgy', 'metallurg'),
            ('induction', 'induc'),
            ('inductance', 'induc'),
            ('induced', 'induc'),
            ('angular', 'angl'),
            ('angle', 'angl'),
        )
        for term, expected in paper_examples:
            self.assertEqual(lovins(term), expected)

        # extra case added to reach a branch the examples above miss
        self.assertEqual(lovins('feminism'), 'fem')

    def test_lovins_snowball(self):
        """Check abydos.stemmer.lovins against the Snowball test set.

        The word/stem pairs come from
        https://github.com/snowballstem/snowball-data/tree/master/lovins
        """
        with codecs.open(TESTDIR+'/corpora/snowball_lovins.csv',
                         encoding='utf-8') as corpus:
            # the first line of the file is consumed without being tested
            next(corpus)
            for row in corpus:
                if row[0] == '#':
                    # rows starting with '#' are not test cases
                    continue
                fields = row.strip().split(',')
                word, stem = fields[0], fields[1]
                self.assertEqual(lovins(word), stem.lower())
|
102
|
|
|
|
|
103
|
|
|
|
|
104
|
|
|
class PorterTestCases(unittest.TestCase):
    """Test Porter functions.

    abydos.stemmer._m_degree, abydos.stemmer.porter,
    abydos.stemmer._sb_has_vowel, abydos.stemmer._ends_in_doubled_cons,
    & abydos.stemmer._ends_in_cvc
    """

    def test_m_degree(self):
        """Test abydos.stemmer._m_degree."""
        _vowels = set('aeiouy')
        # base case
        self.assertEqual(_m_degree('', _vowels), 0)

        # expected m value -> terms that should yield it
        expected_degrees = (
            (0, ('tr', 'ee', 'tree', 'y', 'by')),
            (1, ('trouble', 'oats', 'trees', 'ivy')),
            (2, ('troubles', 'private', 'oaten', 'orrery')),
        )
        for degree, terms in expected_degrees:
            for term in terms:
                self.assertEqual(_m_degree(term, _vowels), degree)

    def test_has_vowel(self):
        """Test abydos.stemmer._sb_has_vowel."""
        _vowels = set('aeiouy')
        # base case
        self.assertFalse(_sb_has_vowel('', _vowels))

        # False cases; upper-case 'Y' is not a member of _vowels
        for term in ('b', 'c', 'bc', 'bcdfghjklmnpqrstvwxYz', 'Y', 'pfY'):
            self.assertFalse(_sb_has_vowel(term, _vowels))

        # True cases
        for term in ('a', 'e', 'ae', 'aeiouy', 'y',
                     'ade', 'cad', 'add', 'phi', 'pfy'):
            self.assertTrue(_sb_has_vowel(term, _vowels))

    def test_ends_in_doubled_cons(self):
        """Test abydos.stemmer._ends_in_doubled_cons."""
        _vowels = set('aeiouy')
        # base case
        self.assertFalse(_ends_in_doubled_cons('', _vowels))

        # False cases
        for term in ('b', 'c', 'bc', 'bcdfghjklmnpqrstvwxYz', 'Y', 'a', 'e',
                     'ae', 'aeiouy', 'y', 'ade', 'cad', 'phi', 'pfy',
                     'faddy', 'aiii', 'ayyy'):
            self.assertFalse(_ends_in_doubled_cons(term, _vowels))

        # True cases
        for term in ('add', 'fadd', 'fadddd', 'raYY', 'doll', 'parr',
                     'parrr', 'bacc'):
            self.assertTrue(_ends_in_doubled_cons(term, _vowels))

    def test_ends_in_cvc(self):
        """Test abydos.stemmer._ends_in_cvc."""
        _vowels = set('aeiouy')
        # base case
        self.assertFalse(_ends_in_cvc('', _vowels))

        # False cases
        for term in ('b', 'c', 'bc', 'bcdfghjklmnpqrstvwxYz', 'YYY', 'ddd',
                     'faaf', 'rare', 'rhy'):
            self.assertFalse(_ends_in_cvc(term, _vowels))

        # True cases
        for term in ('dad', 'phad', 'faded', 'maYor', 'enlil', 'parer',
                     'padres', 'bacyc'):
            self.assertTrue(_ends_in_cvc(term, _vowels))

        # Special case for W, X, & Y
        for term in ('craw', 'max', 'cray'):
            self.assertFalse(_ends_in_cvc(term, _vowels))

    def test_porter(self):
        """Test abydos.stemmer.porter."""
        # base case
        self.assertEqual(porter(''), '')

        # simple cases
        simple_cases = (
            ('c', 'c'),
            ('da', 'da'),
            ('ad', 'ad'),
            ('sing', 'sing'),
            ('singing', 'sing'),
        )
        # missed branch test cases
        branch_cases = (
            ('capitalism', 'capit'),
            ('fatalism', 'fatal'),
            ('stional', 'stional'),
            ('palism', 'palism'),
            ('sization', 'sizat'),
            ('licated', 'licat'),
            ('lical', 'lical'),
        )
        for word, stem in simple_cases + branch_cases:
            self.assertEqual(porter(word), stem)

    def test_porter_early_english(self):
        """Test abydos.stemmer.porter (early English)."""
        # base case
        self.assertEqual(porter('', early_english=True), '')

        cases = (
            # simple cases (no different from regular stemmer)
            ('c', 'c'),
            ('da', 'da'),
            ('ad', 'ad'),
            ('sing', 'sing'),
            ('singing', 'sing'),
            # make
            ('make', 'make'),
            ('makes', 'make'),
            ('maketh', 'make'),
            ('makest', 'make'),
            # say
            ('say', 'sai'),
            ('says', 'sai'),
            ('sayeth', 'sai'),
            ('sayest', 'sai'),
            # missed branch test cases
            ('best', 'best'),
            ('meth', 'meth'),
        )
        for word, stem in cases:
            self.assertEqual(porter(word, early_english=True), stem)

    def test_porter_snowball(self):
        """Test abydos.stemmer.porter (Snowball testset).

        These test cases are from
        http://snowball.tartarus.org/algorithms/porter/diffs.txt
        """
        # Snowball Porter test set
        # Decode the corpus explicitly as UTF-8 instead of relying on the
        # platform's default encoding.
        with codecs.open(TESTDIR+'/corpora/snowball_porter.csv',
                         encoding='utf-8') as snowball_testset:
            next(snowball_testset)  # first line is consumed, not tested
            for line in snowball_testset:
                if line[0] != '#':  # lines starting with '#' are skipped
                    fields = line.strip().split(',')
                    word, stem = fields[0], fields[1]
                    self.assertEqual(porter(word), stem.lower())
|
296
|
|
|
|
|
297
|
|
|
|
|
298
|
|
|
class Porter2TestCases(unittest.TestCase):
    """Test Porter2 functions.

    abydos.stemmer._sb_r1, abydos.stemmer._sb_r2,
    abydos.stemmer._sb_ends_in_short_syllable, abydos.stemmer._sb_short_word,
    & abydos.stemmer.porter2
    """

    def test_sb_r1(self):
        """Test abydos.stemmer._sb_r1."""
        _vowels = set('aeiouy')
        # base case
        self.assertEqual(_sb_r1('', _vowels), 0)

        # examples from http://snowball.tartarus.org/texts/r1r2.html
        examples = (
            ('beautiful', 5),
            ('beauty', 5),
            ('beau', 4),
            ('animadversion', 2),
            ('sprinkled', 5),
            ('eucharist', 3),
        )
        for term, start in examples:
            self.assertEqual(_sb_r1(term, _vowels), start)

    def test_sb_r2(self):
        """Test abydos.stemmer._sb_r2."""
        _vowels = set('aeiouy')
        # base case
        self.assertEqual(_sb_r2('', _vowels), 0)

        # examples from http://snowball.tartarus.org/texts/r1r2.html
        examples = (
            ('beautiful', 7),
            ('beauty', 6),
            ('beau', 4),
            ('animadversion', 4),
            ('sprinkled', 9),
            ('eucharist', 6),
        )
        for term, start in examples:
            self.assertEqual(_sb_r2(term, _vowels), start)

    def test_sb_ends_in_short_syllable(self):
        """Test abydos.stemmer._sb_ends_in_short_syllable."""
        _vowels = set('aeiouy')
        _codanonvowels = set('bcdfghjklmnpqrstvz\'')
        # base case
        self.assertFalse(_sb_ends_in_short_syllable('', _vowels,
                                                    _codanonvowels))

        # examples from
        # http://snowball.tartarus.org/algorithms/english/stemmer.html
        for term in ('rap', 'trap', 'entrap', 'ow', 'on', 'at'):
            self.assertTrue(_sb_ends_in_short_syllable(term, _vowels,
                                                       _codanonvowels))
        for term in ('uproot', 'bestow', 'disturb'):
            self.assertFalse(_sb_ends_in_short_syllable(term, _vowels,
                                                        _codanonvowels))

        # missed branch test cases
        for term in ('d', 'a'):
            self.assertFalse(_sb_ends_in_short_syllable(term, _vowels,
                                                        _codanonvowels))

    def test_sb_short_word(self):
        """Test abydos.stemmer._sb_short_word."""
        _vowels = set('aeiouy')
        _codanonvowels = set('bcdfghjklmnpqrstvz\'')
        # base case
        self.assertFalse(_sb_short_word('', _vowels, _codanonvowels))

        # examples from
        # http://snowball.tartarus.org/algorithms/english/stemmer.html
        for term in ('bed', 'shed', 'shred'):
            self.assertTrue(_sb_short_word(term, _vowels, _codanonvowels))
        for term in ('bead', 'embed', 'beds'):
            self.assertFalse(_sb_short_word(term, _vowels, _codanonvowels))

    def test_porter2(self):
        """Test abydos.stemmer.porter2."""
        # base case
        self.assertEqual(porter2(''), '')

        cases = (
            # simple cases
            ('c', 'c'),
            ('da', 'da'),
            ('ad', 'ad'),
            ('sing', 'sing'),
            ('singing', 'sing'),
            # missed branch test cases
            ('capitalism', 'capit'),
            ('fatalism', 'fatal'),
            ('dog\'s', 'dog'),
            ('A\'s\'', 'a'),
            ('agreedly', 'agre'),
            ('feedly', 'feed'),
            ('stional', 'stional'),
            ('palism', 'palism'),
            ('sization', 'sizat'),
            ('licated', 'licat'),
            ('lical', 'lical'),
            ('clessly', 'clessli'),
            ('tably', 'tabli'),
            ('sizer', 'sizer'),
            ('livity', 'liviti'),
        )
        for word, stem in cases:
            self.assertEqual(porter2(word), stem)

    def test_porter2_early_english(self):
        """Test abydos.stemmer.porter2 (early English)."""
        # base case
        self.assertEqual(porter2('', early_english=True), '')

        cases = (
            # simple cases (no different from regular stemmer)
            ('c', 'c'),
            ('da', 'da'),
            ('ad', 'ad'),
            ('sing', 'sing'),
            ('singing', 'sing'),
            # make
            ('make', 'make'),
            ('makes', 'make'),
            ('maketh', 'make'),
            ('makest', 'make'),
            # say
            ('say', 'say'),
            ('says', 'say'),
            ('sayeth', 'say'),
            ('sayest', 'say'),
            # missed branch test cases
            ('best', 'best'),
            ('meth', 'meth'),
        )
        for word, stem in cases:
            self.assertEqual(porter2(word, early_english=True), stem)

    def test_porter2_snowball(self):
        """Test abydos.stemmer.porter2 (Snowball testset).

        These test cases are from
        http://snowball.tartarus.org/algorithms/english/diffs.txt
        """
        # Snowball Porter2 test set
        # Decode the corpus explicitly as UTF-8 instead of relying on the
        # platform's default encoding.
        with codecs.open(TESTDIR+'/corpora/snowball_porter2.csv',
                         encoding='utf-8') as snowball_testset:
            next(snowball_testset)  # first line is consumed, not tested
            for line in snowball_testset:
                if line[0] != '#':  # lines starting with '#' are skipped
                    fields = line.strip().split(',')
                    word, stem = fields[0], fields[1]
                    self.assertEqual(porter2(word), stem.lower())
|
458
|
|
|
|
|
459
|
|
|
|
|
460
|
|
|
class SnowballTestCases(unittest.TestCase):
    """Test Snowball functions.

    abydos.stemmer.sb_german, abydos.stemmer.sb_dutch,
    abydos.stemmer.sb_norwegian, abydos.stemmer.sb_swedish, &
    abydos.stemmer.sb_danish
    """

    def _check_snowball_corpus(self, stemmer, filename):
        """Assert that stemmer reproduces every stem in a corpus file.

        Not collected as a test itself (no ``test`` prefix); shared by the
        per-language Snowball tests below.

        :param stemmer: callable taking a word and returning its stem
        :param filename: name of a CSV file inside TESTDIR/corpora whose
            rows are ``word,stem``
        """
        with codecs.open(TESTDIR+'/corpora/'+filename,
                         encoding='utf-8') as snowball_testset:
            next(snowball_testset)  # first line is consumed, not tested
            for line in snowball_testset:
                if line[0] != '#':  # lines starting with '#' are skipped
                    fields = line.strip().split(',')
                    word, stem = fields[0], fields[1]
                    self.assertEqual(stemmer(word), stem.lower())

    def test_sb_german_snowball(self):
        """Test abydos.stemmer.sb_german (Snowball testset).

        These test cases are from
        http://snowball.tartarus.org/algorithms/german/diffs.txt
        """
        # base case
        self.assertEqual(sb_german(''), '')

        # Snowball German test set
        self._check_snowball_corpus(sb_german, 'snowball_german.csv')

        # missed branch test cases
        self.assertEqual(sb_german('ikeit'), 'ikeit')

    def test_sb_german_snowball_alt(self):
        """Test abydos.stemmer.sb_german (alternate vowels)."""
        # base case
        self.assertEqual(sb_german('', alternate_vowels=True), '')

        # (word, stem with alternate_vowels=True, stem with defaults)
        cases = (
            # dämmerung,dammer
            ('dämmerung', 'dammer', 'dammer'),
            ('daemmerung', 'dammer', 'daemmer'),
            # brötchen,brotch
            ('brötchen', 'brotch', 'brotch'),
            ('broetchen', 'brotch', 'broetch'),
            # büro,buro
            ('büro', 'buro', 'buro'),
            ('buero', 'buro', 'buero'),
            # häufen,hauf
            ('häufen', 'hauf', 'hauf'),
            ('haeufen', 'hauf', 'haeuf'),
            # quelle,quell -- same expected stem with and without the option
            ('qülle', 'qull', 'qull'),
            ('quelle', 'quell', 'quell'),
            # feuer,feuer -- same expected stem with and without the option
            ('feür', 'feur', 'feur'),
            ('feuer', 'feu', 'feu'),
            # über,uber
            ('über', 'uber', 'uber'),
            ('ueber', 'uber', 'ueb'),
        )
        for word, alt_stem, plain_stem in cases:
            self.assertEqual(sb_german(word, alternate_vowels=True),
                             alt_stem)
            self.assertEqual(sb_german(word), plain_stem)

    def test_sb_dutch_snowball(self):
        """Test abydos.stemmer.sb_dutch (Snowball testset).

        These test cases are from
        http://snowball.tartarus.org/algorithms/dutch/diffs.txt
        """
        # base case
        self.assertEqual(sb_dutch(''), '')

        # Snowball Dutch test set
        self._check_snowball_corpus(sb_dutch, 'snowball_dutch.csv')

        # missed branch test cases
        self.assertEqual(sb_dutch('zondulielijk'), 'zondulie')

    def test_sb_norwegian_snowball(self):
        """Test abydos.stemmer.sb_norwegian (Snowball testset).

        These test cases are from
        http://snowball.tartarus.org/algorithms/norwegian/diffs.txt
        """
        # base case
        self.assertEqual(sb_norwegian(''), '')

        # Snowball Norwegian test set
        self._check_snowball_corpus(sb_norwegian, 'snowball_norwegian.csv')

    def test_sb_swedish_snowball(self):
        """Test abydos.stemmer.sb_swedish (Snowball testset).

        These test cases are from
        http://snowball.tartarus.org/algorithms/swedish/diffs.txt
        """
        # base case
        self.assertEqual(sb_swedish(''), '')

        # Snowball Swedish test set
        self._check_snowball_corpus(sb_swedish, 'snowball_swedish.csv')

    def test_sb_danish_snowball(self):
        """Test abydos.stemmer.sb_danish (Snowball testset).

        These test cases are from
        http://snowball.tartarus.org/algorithms/danish/diffs.txt
        """
        # base case
        self.assertEqual(sb_danish(''), '')

        # Snowball Danish test set
        self._check_snowball_corpus(sb_danish, 'snowball_danish.csv')
|
619
|
|
|
|
|
620
|
|
|
|
|
621
|
|
|
class CLEFTestCases(unittest.TestCase): |
|
622
|
|
|
"""Test CLEF functions. |
|
623
|
|
|
|
|
624
|
|
|
abydos.stemmer.clef_german, abydos.stemmer.clef_german_plus, & |
|
625
|
|
|
abydos.stemmer.clef_swedish |
|
626
|
|
|
""" |
|
627
|
|
|
|
|
628
|
|
|
def test_clef_german(self):
    """Check abydos.stemmer.clef_german against known word/stem pairs."""
    # the empty string is expected to stem to itself
    self.assertEqual(clef_german(''), '')

    expectations = (
        # inputs of length <= 2
        ('ä', 'a'),
        ('er', 'er'),
        ('es', 'es'),
        ('äh', 'ah'),
        # inputs of length > 2
        ('deinen', 'dein'),
        ('können', 'konn'),
        ('Damen', 'dame'),
        ('kleines', 'klein'),
        ('Namen', 'name'),
        ('Äpfel', 'apfel'),
        ('Jahre', 'jahr'),
        ('Mannes', 'mann'),
        ('Häuser', 'haus'),
        ('Motoren', 'motor'),
        ('kleine', 'klein'),
        ('Pfingsten', 'pfingst'),
        ('lautest', 'lautest'),
        ('lauteste', 'lautest'),
        ('lautere', 'lauter'),
        ('lautste', 'lautst'),
        ('kleinen', 'klei'),
    )
    for term, expected in expectations:
        self.assertEqual(clef_german(term), expected)
|
657
|
|
|
|
|
658
|
|
|
def test_clef_german_plus(self): |
|
659
|
|
|
"""Test abydos.stemmer.clef_german_plus.""" |
|
660
|
|
|
# base case |
|
661
|
|
|
self.assertEqual(clef_german_plus(''), '') |
|
662
|
|
|
|
|
663
|
|
|
# len <= 2 |
|
664
|
|
|
self.assertEqual(clef_german_plus('ä'), 'a') |
|
665
|
|
|
self.assertEqual(clef_german_plus('er'), 'er') |
|
666
|
|
|
self.assertEqual(clef_german_plus('es'), 'es') |
|
667
|
|
|
self.assertEqual(clef_german_plus('äh'), 'ah') |
|
668
|
|
|
|
|
669
|
|
|
# len > 2 |
|
670
|
|
|
self.assertEqual(clef_german_plus('deinen'), 'dein') |
|
671
|
|
|
self.assertEqual(clef_german_plus('können'), 'konn') |
|
672
|
|
|
self.assertEqual(clef_german_plus('Damen'), 'dam') |
|
673
|
|
|
self.assertEqual(clef_german_plus('kleines'), 'klein') |
|
674
|
|
|
self.assertEqual(clef_german_plus('Namen'), 'nam') |
|
675
|
|
|
self.assertEqual(clef_german_plus('Äpfel'), 'apfel') |
|
676
|
|
|
self.assertEqual(clef_german_plus('Jahre'), 'jahr') |
|
677
|
|
|
self.assertEqual(clef_german_plus('Mannes'), 'mann') |
|
678
|
|
|
self.assertEqual(clef_german_plus('Häuser'), 'haus') |
|
679
|
|
|
self.assertEqual(clef_german_plus('Motoren'), 'motor') |
|
680
|
|
|
self.assertEqual(clef_german_plus('kleine'), 'klein') |
|
681
|
|
|
self.assertEqual(clef_german_plus('Pfingsten'), 'pfing') |
|
682
|
|
|
self.assertEqual(clef_german_plus('lautest'), 'laut') |
|
683
|
|
|
self.assertEqual(clef_german_plus('lauteste'), 'laut') |
|
684
|
|
|
self.assertEqual(clef_german_plus('lautere'), 'laut') |
|
685
|
|
|
self.assertEqual(clef_german_plus('lautste'), 'laut') |
|
686
|
|
|
self.assertEqual(clef_german_plus('kleinen'), 'klein') |
|
687
|
|
|
self.assertEqual(clef_german_plus('Pfarrern'), 'pfarr') |
|
688
|
|
|
|
|
689
|
|
|
def test_clef_swedish(self): |
|
690
|
|
|
"""Test abydos.stemmer.clef_swedish.""" |
|
691
|
|
|
# base case |
|
692
|
|
|
self.assertEqual(clef_swedish(''), '') |
|
693
|
|
|
|
|
694
|
|
|
# unstemmed |
|
695
|
|
|
self.assertEqual(clef_swedish('konung'), 'konung') |
|
696
|
|
|
|
|
697
|
|
|
# len <= 3 |
|
698
|
|
|
self.assertEqual(clef_swedish('km'), 'km') |
|
699
|
|
|
self.assertEqual(clef_swedish('ja'), 'ja') |
|
700
|
|
|
self.assertEqual(clef_swedish('de'), 'de') |
|
701
|
|
|
self.assertEqual(clef_swedish('in'), 'in') |
|
702
|
|
|
self.assertEqual(clef_swedish('a'), 'a') |
|
703
|
|
|
self.assertEqual(clef_swedish('mer'), 'mer') |
|
704
|
|
|
self.assertEqual(clef_swedish('s'), 's') |
|
705
|
|
|
self.assertEqual(clef_swedish('e'), 'e') |
|
706
|
|
|
self.assertEqual(clef_swedish('oss'), 'oss') |
|
707
|
|
|
self.assertEqual(clef_swedish('hos'), 'hos') |
|
708
|
|
|
|
|
709
|
|
|
# genitive |
|
710
|
|
|
self.assertEqual(clef_swedish('svenskars'), 'svensk') |
|
711
|
|
|
self.assertEqual(clef_swedish('stadens'), 'stad') |
|
712
|
|
|
self.assertEqual(clef_swedish('kommuns'), 'kommu') |
|
713
|
|
|
self.assertEqual(clef_swedish('aftonbladets'), 'aftonblad') |
|
714
|
|
|
|
|
715
|
|
|
# len > 7 |
|
716
|
|
|
self.assertEqual(clef_swedish('fängelser'), 'fäng') |
|
717
|
|
|
self.assertEqual(clef_swedish('möjligheten'), 'möjlig') |
|
718
|
|
|
|
|
719
|
|
|
# len > 6 |
|
720
|
|
|
self.assertEqual(clef_swedish('svenskar'), 'svensk') |
|
721
|
|
|
self.assertEqual(clef_swedish('myndigheterna'), 'myndighet') |
|
722
|
|
|
self.assertEqual(clef_swedish('avgörande'), 'avgör') |
|
723
|
|
|
self.assertEqual(clef_swedish('fängelse'), 'fäng') |
|
724
|
|
|
self.assertEqual(clef_swedish('viktigaste'), 'viktig') |
|
725
|
|
|
self.assertEqual(clef_swedish('kvinnorna'), 'kvinn') |
|
726
|
|
|
self.assertEqual(clef_swedish('åklagaren'), 'åklag') |
|
727
|
|
|
|
|
728
|
|
|
# len > 5 |
|
729
|
|
|
self.assertEqual(clef_swedish('tidigare'), 'tidig') |
|
730
|
|
|
self.assertEqual(clef_swedish('senast'), 'sen') |
|
731
|
|
|
self.assertEqual(clef_swedish('möjlighet'), 'möjlig') |
|
732
|
|
|
|
|
733
|
|
|
# len > 4 |
|
734
|
|
|
self.assertEqual(clef_swedish('svenskar'), 'svensk') |
|
735
|
|
|
self.assertEqual(clef_swedish('skriver'), 'skriv') |
|
736
|
|
|
self.assertEqual(clef_swedish('människor'), 'människ') |
|
737
|
|
|
self.assertEqual(clef_swedish('staden'), 'stad') |
|
738
|
|
|
self.assertEqual(clef_swedish('kunnat'), 'kunn') |
|
739
|
|
|
self.assertEqual(clef_swedish('samarbete'), 'samarbe') |
|
740
|
|
|
self.assertEqual(clef_swedish('aftonbladet'), 'aftonblad') |
|
741
|
|
|
|
|
742
|
|
|
# len > 3 |
|
743
|
|
|
self.assertEqual(clef_swedish('allt'), 'all') |
|
744
|
|
|
self.assertEqual(clef_swedish('vilka'), 'vilk') |
|
745
|
|
|
self.assertEqual(clef_swedish('länge'), 'läng') |
|
746
|
|
|
self.assertEqual(clef_swedish('kommun'), 'kommu') |
|
747
|
|
|
|
|
748
|
|
|
|
|
749
|
|
|
class CaumannsTestCases(unittest.TestCase):
    """Test Caumanns functions.

    abydos.stemmer.caumanns
    """

    def test_caumanns(self):
        """Test abydos.stemmer.caumanns."""
        # Each entry is (input word, expected stem); entries are checked in
        # the order listed.
        expected_stems = (
            # base case
            ('', ''),

            # tests from Caumanns' description of the algorithm
            ('singt', 'sing'),
            ('singen', 'sing'),
            ('beliebt', 'belieb'),
            ('beliebtester', 'belieb'),
            ('stören', 'stor'),
            ('stöhnen', 'stoh'),
            ('Kuß', 'kuss'),
            ('Küsse', 'kuss'),
            ('Verlierer', 'verlier'),
            ('Verlies', 'verlie'),
            ('Maus', 'mau'),
            ('Mauer', 'mau'),
            ('Störsender', 'stor'),

            # additional tests to achieve full coverage
            ('Müllerinnen', 'mullerin'),
            ('Matrix', 'matrix'),
            ('Matrizen', 'matrix'),
        )
        for word, stem in expected_stems:
            self.assertEqual(caumanns(word), stem)

    def test_caumanns_lucene(self):
        """Test abydos.stemmer.caumanns (Lucene tests).

        Based on tests from
        https://svn.apache.org/repos/asf/lucene.net/trunk/test/contrib/Analyzers/De/data.txt
        This is presumably Apache-licensed.
        """
        # Each entry is (input word, expected stem); entries are checked in
        # the order listed.
        expected_stems = (
            # German special characters are replaced:
            ('häufig', 'haufig'),
            ('üor', 'uor'),
            ('björk', 'bjork'),

            # here the stemmer works okay, it maps related words to the
            # same stem:
            ('abschließen', 'abschliess'),
            ('abschließender', 'abschliess'),
            ('abschließendes', 'abschliess'),
            ('abschließenden', 'abschliess'),

            ('Tisch', 'tisch'),
            ('Tische', 'tisch'),
            ('Tischen', 'tisch'),
            ('geheimtür', 'geheimtur'),

            ('Haus', 'hau'),
            ('Hauses', 'hau'),
            ('Häuser', 'hau'),
            ('Häusern', 'hau'),
            # here's a case where overstemming occurs, i.e. a word is
            # mapped to the same stem as unrelated words:
            ('hauen', 'hau'),

            # here's a case where understemming occurs, i.e. two related
            # words are not mapped to the same stem. This is the case with
            # basically all irregular forms:
            ('Drama', 'drama'),
            ('Dramen', 'dram'),

            # replace "ß" with 'ss':
            ('Ausmaß', 'ausmass'),

            # fake words to test if suffixes are cut off:
            ('xxxxxe', 'xxxxx'),
            ('xxxxxs', 'xxxxx'),
            ('xxxxxn', 'xxxxx'),
            ('xxxxxt', 'xxxxx'),
            ('xxxxxem', 'xxxxx'),
            ('xxxxxer', 'xxxxx'),
            ('xxxxxnd', 'xxxxx'),
            # the suffixes are also removed when combined:
            ('xxxxxetende', 'xxxxx'),

            # words that are shorter than four characters are not changed:
            ('xxe', 'xxe'),
            # -em and -er are not removed from words shorter than five
            # characters:
            ('xxem', 'xxem'),
            ('xxer', 'xxer'),
            # -nd is not removed from words shorter than six characters:
            ('xxxnd', 'xxxnd'),
        )
        for word, stem in expected_stems:
            self.assertEqual(caumanns(word), stem)
|
838
|
|
|
|
|
839
|
|
|
|
|
840
|
|
|
class UEALiteTestCases(unittest.TestCase):
    """Test UEA-lite functions.

    abydos.stemmer.uealite
    """

    def test_uealite(self):
        """Test abydos.stemmer.uealite.

        Covers the default variant and the 'Adams' variant (var='Adams')
        with the same word list; only the expected stems differ.
        """
        # base case
        self.assertEqual(uealite(''), '')

        # test cases copied from Ruby port
        # https://github.com/ealdent/uea-stemmer/blob/master/test/uea_stemmer_test.rb
        # These are corrected to match the Java version's output.
        # Each entry is (input word, expected stem), checked in order.
        default_stems = (
            # stem base words to just the base word
            ('man', 'man'),
            ('happiness', 'happiness'),
            # stem theses as thesis ('bases' is checked separately below)
            ('theses', 'thesis'),
            # stem preterite words ending in -ed without the -ed
            ('ordained', 'ordain'),
            ('killed', 'kill'),
            ('liked', 'lik'),
            ('helped', 'help'),
            ('scarred', 'scarre'),
            ('invited', 'invit'),
            ('exited', 'exit'),
            ('debited', 'debit'),
            ('smited', 'smit'),
            # stem progressive verbs and gerunds without the -ing
            ('running', 'run'),
            ('settings', 'set'),
            ('timing', 'time'),
            ('dying', 'dy'),
            ('harping', 'harp'),
            ('charring', 'char'),
            # false progressive verbs such as 'sing': with the expected
            # values used here (see the note above), only 'ring' is left
            # unchanged
            ('ring', 'ring'),
            ('sing', 'se'),
            ('bring', 'br'),
            ('fling', 'fle'),
            # stem various plural nouns and 3rd-pres verbs without the
            # -s/-es
            ('changes', 'change'),
            ('deaths', 'death'),
            ('shadows', 'shadow'),
            ('flies', 'fly'),
            ('things', 'thing'),
            ('nothings', 'nothing'),
            ('witches', 'witch'),
            ('makes', 'mak'),
            ('smokes', 'smok'),
            ('does', 'do'),
            # stem various words with -des suffix
            ('abodes', 'abod'),
            ('escapades', 'escapad'),
            ('crusades', 'crusad'),
            ('grades', 'grad'),
            # stem various words with -res suffix
            ('wires', 'wir'),
            ('acres', 'acr'),
            ('fires', 'fir'),
            ('cares', 'car'),
            # stem acronyms when pluralized otherwise they should be left
            # alone
            ('USA', 'USA'),
            ('FLOSS', 'FLOSS'),
            ('MREs', 'MRE'),
            ('USAED', 'USAED'),
        )
        for word, stem in default_stems:
            self.assertEqual(uealite(word), stem)
        # ... but do not stem bases as basis
        self.assertNotEqual(uealite('bases'), 'basis')

        # test cases copied from Ruby port
        # https://github.com/ealdent/uea-stemmer/blob/master/test/uea_stemmer_test.rb
        # Each entry is (input word, expected stem), checked in order.
        adams_stems = (
            # stem base words to just the base word
            ('man', 'man'),
            ('happiness', 'happiness'),
            # stem theses as thesis ('bases' is checked separately below)
            ('theses', 'thesis'),
            # stem preterite words ending in -ed without the -ed
            ('ordained', 'ordain'),
            ('killed', 'kill'),
            ('liked', 'like'),
            ('helped', 'help'),
            # NOTE(review): the case ('scarred', 'scar') was disabled in the
            # original test; the reason is not recorded -- confirm before
            # re-enabling.
            ('invited', 'invite'),
            ('exited', 'exit'),
            ('debited', 'debit'),
            ('smited', 'smite'),
            # stem progressive verbs and gerunds without the -ing
            ('running', 'run'),
            ('settings', 'set'),
            ('timing', 'time'),
            ('dying', 'die'),
            ('harping', 'harp'),
            ('charring', 'char'),
            # not stem false progressive verbs such as 'sing'
            ('ring', 'ring'),
            ('sing', 'sing'),
            ('bring', 'bring'),
            ('fling', 'fling'),
            # stem various plural nouns and 3rd-pres verbs without the
            # -s/-es
            ('changes', 'change'),
            ('deaths', 'death'),
            ('shadows', 'shadow'),
            ('flies', 'fly'),
            ('things', 'thing'),
            ('nothings', 'nothing'),
            ('witches', 'witch'),
            ('makes', 'make'),
            ('smokes', 'smoke'),
            ('does', 'do'),
            # stem various words with -des suffix
            ('abodes', 'abode'),
            ('escapades', 'escapade'),
            ('crusades', 'crusade'),
            ('grades', 'grade'),
            # stem various words with -res suffix
            ('wires', 'wire'),
            ('acres', 'acre'),
            ('fires', 'fire'),
            ('cares', 'care'),
            # stem acronyms when pluralized otherwise they should be left
            # alone
            ('USA', 'USA'),
            ('FLOSS', 'FLOSS'),
            ('MREs', 'MRE'),
            ('USAED', 'USAED'),
        )
        for word, stem in adams_stems:
            self.assertEqual(uealite(word, var='Adams'), stem)
        # ... but do not stem bases as basis
        self.assertNotEqual(uealite('bases', var='Adams'), 'basis')

    def test_uealite_wsj_set(self):
        """Test abydos.stemmer.uealite (WSJ testset).

        Each line of corpora/uea-lite_wsj.csv is split on commas into
        (word, expected stem, rule number); uealite is called with
        return_rule_no=True and must return (stem, float(rule number)).
        """
        # os.path.join keeps the path relative when TESTDIR is '' (file run
        # from its own directory); plain concatenation would yield the
        # root-anchored '/corpora/...'.
        wsj_path = os.path.join(TESTDIR, 'corpora', 'uea-lite_wsj.csv')
        with open(wsj_path) as wsj_testset:
            for wsj_line in wsj_testset:
                (word, uea, rule) = wsj_line.strip().split(',')
                self.assertEqual(uealite(word, return_rule_no=True),
                                 (uea, float(rule)))
|
974
|
|
|
|
|
975
|
|
|
|
|
976
|
|
|
class PaiceHuskTestCases(unittest.TestCase):
    """Test Paice-Husk functions.

    abydos.stemmer.paice_husk
    """

    def test_paice_husk(self):
        """Test abydos.stemmer.paice_husk."""
        # Each entry is (input word, expected stem), checked in order.
        expected_stems = (
            # base case
            ('', ''),

            # cases copied from
            # https://doi.org/10.1145/101306.101310
            ('maximum', 'maxim'),
            ('presumably', 'presum'),
            ('multiply', 'multiply'),
            ('provision', 'provid'),
            ('owed', 'ow'),
            ('owing', 'ow'),
            ('ear', 'ear'),
            ('saying', 'say'),
            ('crying', 'cry'),
            ('string', 'string'),
            ('meant', 'meant'),
            ('cement', 'cem'),
        )
        for word, stem in expected_stems:
            self.assertEqual(paice_husk(word), stem)

    def test_paice_husk_hopper_set(self):
        """Test abydos.stemmer.paice_husk (Hopper262 testset).

        Source:
        https://raw.githubusercontent.com/Hopper262/paice-husk-stemmer/master/wordlist.txt

        The only corrections made to the stemmed values of the Hopper262
        set/implementations were:
         - ymca : ymc -> ymca
         - yttrium : yttr -> yttri
         - ywca : ywc -> ywca
        The Pascal reference implementation does not consider 'y' in initial
        position to be a vowel.
        """
        # os.path.join keeps the path relative when TESTDIR is '' (file run
        # from its own directory); plain concatenation would yield the
        # root-anchored '/corpora/...'.
        hopper_path = os.path.join(TESTDIR, 'corpora', 'paicehusk.csv')
        with open(hopper_path) as hopper_testset:
            for hopper_line in hopper_testset:
                # each line holds "word,stem"
                (word, stem) = hopper_line.strip().split(',')
                self.assertEqual(paice_husk(word), stem)
|
1020
|
|
|
|
|
1021
|
|
|
|
|
1022
|
|
|
# Run every test case in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()
|
1024
|
|
|
|