1
|
|
|
# -*- coding: utf-8 -*- |
2
|
|
|
|
3
|
|
|
# Copyright 2014-2018 by Christopher C. Little. |
4
|
|
|
# This file is part of Abydos. |
5
|
|
|
# |
6
|
|
|
# Abydos is free software: you can redistribute it and/or modify |
7
|
|
|
# it under the terms of the GNU General Public License as published by |
8
|
|
|
# the Free Software Foundation, either version 3 of the License, or |
9
|
|
|
# (at your option) any later version. |
10
|
|
|
# |
11
|
|
|
# Abydos is distributed in the hope that it will be useful, |
12
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
13
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14
|
|
|
# GNU General Public License for more details. |
15
|
|
|
# |
16
|
|
|
# You should have received a copy of the GNU General Public License |
17
|
|
|
# along with Abydos. If not, see <http://www.gnu.org/licenses/>. |
18
|
|
|
|
19
|
|
|
"""abydos.tests.test_stemmer. |
20
|
|
|
|
21
|
|
|
This module contains unit tests for abydos.stemmer |
22
|
|
|
""" |
23
|
|
|
|
24
|
|
|
from __future__ import unicode_literals |
25
|
|
|
|
26
|
|
|
import codecs |
27
|
|
|
import os |
28
|
|
|
import unittest |
29
|
|
|
|
30
|
|
|
from abydos.stemmer import _ends_in_cvc, _ends_in_doubled_cons, _m_degree, \ |
31
|
|
|
_sb_ends_in_short_syllable, _sb_has_vowel, _sb_r1, _sb_r2, \ |
32
|
|
|
_sb_short_word, caumanns, clef_german, clef_german_plus, clef_swedish, \ |
33
|
|
|
lovins, porter, porter2, sb_danish, sb_dutch, sb_german, sb_norwegian, \ |
34
|
|
|
sb_swedish, uealite |
35
|
|
|
# lancaster, |
36
|
|
|
|
37
|
|
|
# Absolute directory containing this test module. The corpus-driven tests
# below read their CSV files from the 'corpora' subdirectory of this path.
# abspath() is applied first because dirname(__file__) is '' when the module
# is run through a bare relative filename, which would otherwise turn
# TESTDIR + '/corpora/...' into a path rooted at '/'.
TESTDIR = os.path.dirname(os.path.abspath(__file__))
38
|
|
|
|
39
|
|
|
|
40
|
|
|
class LovinsTestCases(unittest.TestCase):
    """Test Lovins functions.

    abydos.stemmer.lovins
    """

    def test_lovins(self):
        """Test abydos.stemmer.lovins."""
        # base case
        self.assertEqual(lovins(''), '')

        # test cases from Lovins' "Development of a Stemming Algorithm":
        # http://www.mt-archive.info/MT-1968-Lovins.pdf
        self.assertEqual(lovins('magnesia'), 'magnes')
        self.assertEqual(lovins('magnesite'), 'magnes')
        self.assertEqual(lovins('magnesian'), 'magnes')
        self.assertEqual(lovins('magnesium'), 'magnes')
        self.assertEqual(lovins('magnet'), 'magnet')
        self.assertEqual(lovins('magnetic'), 'magnet')
        self.assertEqual(lovins('magneto'), 'magnet')
        self.assertEqual(lovins('magnetically'), 'magnet')
        self.assertEqual(lovins('magnetism'), 'magnet')
        self.assertEqual(lovins('magnetite'), 'magnet')
        self.assertEqual(lovins('magnetitic'), 'magnet')
        self.assertEqual(lovins('magnetizable'), 'magnet')
        self.assertEqual(lovins('magnetization'), 'magnet')
        self.assertEqual(lovins('magnetize'), 'magnet')
        self.assertEqual(lovins('magnetometer'), 'magnetometer')
        self.assertEqual(lovins('magnetometric'), 'magnetometer')
        self.assertEqual(lovins('magnetometry'), 'magnetometer')
        self.assertEqual(lovins('magnetomotive'), 'magnetomot')
        self.assertEqual(lovins('magnetron'), 'magnetron')
        self.assertEqual(lovins('metal'), 'metal')
        self.assertEqual(lovins('metall'), 'metal')
        self.assertEqual(lovins('metallically'), 'metal')
        self.assertEqual(lovins('metalliferous'), 'metallifer')
        self.assertEqual(lovins('metallize'), 'metal')
        self.assertEqual(lovins('metallurgical'), 'metallurg')
        self.assertEqual(lovins('metallurgy'), 'metallurg')
        self.assertEqual(lovins('induction'), 'induc')
        self.assertEqual(lovins('inductance'), 'induc')
        self.assertEqual(lovins('induced'), 'induc')
        self.assertEqual(lovins('angular'), 'angl')
        self.assertEqual(lovins('angle'), 'angl')

        # missed branch test cases
        self.assertEqual(lovins('feminism'), 'fem')

    def test_lovins_snowball(self):
        """Test abydos.stemmer.lovins (Snowball testset).

        These test cases are from
        https://github.com/snowballstem/snowball-data/tree/master/lovins

        Each data line of the corpus is read as ``word,stem``; the stem is
        lowercased before comparison.
        """
        # Snowball Lovins test set
        # os.path.join keeps the path relative when TESTDIR is '' instead
        # of silently producing '/corpora/...'.
        corpus = os.path.join(TESTDIR, 'corpora', 'snowball_lovins.csv')
        with codecs.open(corpus, encoding='utf-8') as snowball_testset:
            # the first line is skipped (presumably a header row)
            next(snowball_testset)
            for line in snowball_testset:
                # ignore blank lines and '#' comment lines
                if not line.strip() or line[0] == '#':
                    continue
                fields = line.strip().split(',')
                word, expected = fields[0], fields[1].lower()
                actual = lovins(word)
                # name the offending word so a corpus failure is traceable
                self.assertEqual(
                    actual, expected,
                    msg='lovins({!r}) returned {!r}, expected {!r}'.format(
                        word, actual, expected))
103
|
|
|
|
104
|
|
|
|
105
|
|
|
class PorterTestCases(unittest.TestCase):
    """Test Porter functions.

    abydos.stemmer._m_degree, abydos.stemmer.porter,
    abydos.stemmer._sb_has_vowel, abydos.stemmer._ends_in_doubled_cons,
    & abydos.stemmer._ends_in_cvc
    """

    def test_m_degree(self):
        """Test abydos.stemmer._m_degree."""
        _vowels = set('aeiouy')
        # base case
        self.assertEqual(_m_degree('', _vowels), 0)

        # m==0
        self.assertEqual(_m_degree('tr', _vowels), 0)
        self.assertEqual(_m_degree('ee', _vowels), 0)
        self.assertEqual(_m_degree('tree', _vowels), 0)
        self.assertEqual(_m_degree('y', _vowels), 0)
        self.assertEqual(_m_degree('by', _vowels), 0)

        # m==1
        self.assertEqual(_m_degree('trouble', _vowels), 1)
        self.assertEqual(_m_degree('oats', _vowels), 1)
        self.assertEqual(_m_degree('trees', _vowels), 1)
        self.assertEqual(_m_degree('ivy', _vowels), 1)

        # m==2
        self.assertEqual(_m_degree('troubles', _vowels), 2)
        self.assertEqual(_m_degree('private', _vowels), 2)
        self.assertEqual(_m_degree('oaten', _vowels), 2)
        self.assertEqual(_m_degree('orrery', _vowels), 2)

    def test_has_vowel(self):
        """Test abydos.stemmer._sb_has_vowel."""
        _vowels = set('aeiouy')
        # base case
        self.assertFalse(_sb_has_vowel('', _vowels))

        # False cases (uppercase 'Y' is not a member of _vowels)
        self.assertFalse(_sb_has_vowel('b', _vowels))
        self.assertFalse(_sb_has_vowel('c', _vowels))
        self.assertFalse(_sb_has_vowel('bc', _vowels))
        self.assertFalse(_sb_has_vowel('bcdfghjklmnpqrstvwxYz', _vowels))
        self.assertFalse(_sb_has_vowel('Y', _vowels))

        # True cases
        self.assertTrue(_sb_has_vowel('a', _vowels))
        self.assertTrue(_sb_has_vowel('e', _vowels))
        self.assertTrue(_sb_has_vowel('ae', _vowels))
        self.assertTrue(_sb_has_vowel('aeiouy', _vowels))
        self.assertTrue(_sb_has_vowel('y', _vowels))

        self.assertTrue(_sb_has_vowel('ade', _vowels))
        self.assertTrue(_sb_has_vowel('cad', _vowels))
        self.assertTrue(_sb_has_vowel('add', _vowels))
        self.assertTrue(_sb_has_vowel('phi', _vowels))
        self.assertTrue(_sb_has_vowel('pfy', _vowels))

        self.assertFalse(_sb_has_vowel('pfY', _vowels))

    def test_ends_in_doubled_cons(self):
        """Test abydos.stemmer._ends_in_doubled_cons."""
        _vowels = set('aeiouy')
        # base case
        self.assertFalse(_ends_in_doubled_cons('', _vowels))

        # False cases
        self.assertFalse(_ends_in_doubled_cons('b', _vowels))
        self.assertFalse(_ends_in_doubled_cons('c', _vowels))
        self.assertFalse(_ends_in_doubled_cons('bc', _vowels))
        self.assertFalse(_ends_in_doubled_cons('bcdfghjklmnpqrstvwxYz',
                                               _vowels))
        self.assertFalse(_ends_in_doubled_cons('Y', _vowels))
        self.assertFalse(_ends_in_doubled_cons('a', _vowels))
        self.assertFalse(_ends_in_doubled_cons('e', _vowels))
        self.assertFalse(_ends_in_doubled_cons('ae', _vowels))
        self.assertFalse(_ends_in_doubled_cons('aeiouy', _vowels))
        self.assertFalse(_ends_in_doubled_cons('y', _vowels))
        self.assertFalse(_ends_in_doubled_cons('ade', _vowels))
        self.assertFalse(_ends_in_doubled_cons('cad', _vowels))
        self.assertFalse(_ends_in_doubled_cons('phi', _vowels))
        self.assertFalse(_ends_in_doubled_cons('pfy', _vowels))
        self.assertFalse(_ends_in_doubled_cons('faddy', _vowels))
        self.assertFalse(_ends_in_doubled_cons('aiii', _vowels))
        self.assertFalse(_ends_in_doubled_cons('ayyy', _vowels))

        # True cases
        self.assertTrue(_ends_in_doubled_cons('add', _vowels))
        self.assertTrue(_ends_in_doubled_cons('fadd', _vowels))
        self.assertTrue(_ends_in_doubled_cons('fadddd', _vowels))
        self.assertTrue(_ends_in_doubled_cons('raYY', _vowels))
        self.assertTrue(_ends_in_doubled_cons('doll', _vowels))
        self.assertTrue(_ends_in_doubled_cons('parr', _vowels))
        self.assertTrue(_ends_in_doubled_cons('parrr', _vowels))
        self.assertTrue(_ends_in_doubled_cons('bacc', _vowels))

    def test_ends_in_cvc(self):
        """Test abydos.stemmer._ends_in_cvc."""
        _vowels = set('aeiouy')
        # base case
        self.assertFalse(_ends_in_cvc('', _vowels))

        # False cases
        self.assertFalse(_ends_in_cvc('b', _vowels))
        self.assertFalse(_ends_in_cvc('c', _vowels))
        self.assertFalse(_ends_in_cvc('bc', _vowels))
        self.assertFalse(_ends_in_cvc('bcdfghjklmnpqrstvwxYz', _vowels))
        self.assertFalse(_ends_in_cvc('YYY', _vowels))
        self.assertFalse(_ends_in_cvc('ddd', _vowels))
        self.assertFalse(_ends_in_cvc('faaf', _vowels))
        self.assertFalse(_ends_in_cvc('rare', _vowels))
        self.assertFalse(_ends_in_cvc('rhy', _vowels))

        # True cases
        self.assertTrue(_ends_in_cvc('dad', _vowels))
        self.assertTrue(_ends_in_cvc('phad', _vowels))
        self.assertTrue(_ends_in_cvc('faded', _vowels))
        self.assertTrue(_ends_in_cvc('maYor', _vowels))
        self.assertTrue(_ends_in_cvc('enlil', _vowels))
        self.assertTrue(_ends_in_cvc('parer', _vowels))
        self.assertTrue(_ends_in_cvc('padres', _vowels))
        self.assertTrue(_ends_in_cvc('bacyc', _vowels))

        # Special case for W, X, & Y
        self.assertFalse(_ends_in_cvc('craw', _vowels))
        self.assertFalse(_ends_in_cvc('max', _vowels))
        self.assertFalse(_ends_in_cvc('cray', _vowels))

    def test_porter(self):
        """Test abydos.stemmer.porter."""
        # base case
        self.assertEqual(porter(''), '')

        # simple cases
        self.assertEqual(porter('c'), 'c')
        self.assertEqual(porter('da'), 'da')
        self.assertEqual(porter('ad'), 'ad')
        self.assertEqual(porter('sing'), 'sing')
        self.assertEqual(porter('singing'), 'sing')

        # missed branch test cases
        self.assertEqual(porter('capitalism'), 'capit')
        self.assertEqual(porter('fatalism'), 'fatal')
        self.assertEqual(porter('stional'), 'stional')
        self.assertEqual(porter('palism'), 'palism')
        self.assertEqual(porter('sization'), 'sizat')
        self.assertEqual(porter('licated'), 'licat')
        self.assertEqual(porter('lical'), 'lical')

    def test_porter_early_english(self):
        """Test abydos.stemmer.porter (early English)."""
        # base case
        self.assertEqual(porter('', early_english=True), '')

        # simple cases (no different from regular stemmer)
        self.assertEqual(porter('c', early_english=True), 'c')
        self.assertEqual(porter('da', early_english=True), 'da')
        self.assertEqual(porter('ad', early_english=True), 'ad')
        self.assertEqual(porter('sing', early_english=True), 'sing')
        self.assertEqual(porter('singing', early_english=True), 'sing')

        # make
        self.assertEqual(porter('make', early_english=True), 'make')
        self.assertEqual(porter('makes', early_english=True), 'make')
        self.assertEqual(porter('maketh', early_english=True), 'make')
        self.assertEqual(porter('makest', early_english=True), 'make')

        # say
        self.assertEqual(porter('say', early_english=True), 'sai')
        self.assertEqual(porter('says', early_english=True), 'sai')
        self.assertEqual(porter('sayeth', early_english=True), 'sai')
        self.assertEqual(porter('sayest', early_english=True), 'sai')

        # missed branch test cases
        self.assertEqual(porter('best', early_english=True), 'best')
        self.assertEqual(porter('meth', early_english=True), 'meth')

    def test_porter_snowball(self):
        """Test abydos.stemmer.porter (Snowball testset).

        These test cases are from
        http://snowball.tartarus.org/algorithms/porter/diffs.txt

        Each data line of the corpus is read as ``word,stem``; the stem is
        lowercased before comparison.
        """
        # Snowball Porter test set
        # Opened with codecs.open(..., encoding='utf-8') like the other
        # corpus tests in this module, so lines are decoded text on both
        # Python 2 and Python 3 rather than depending on open()'s default.
        corpus = os.path.join(TESTDIR, 'corpora', 'snowball_porter.csv')
        with codecs.open(corpus, encoding='utf-8') as snowball_testset:
            # the first line is skipped (presumably a header row)
            next(snowball_testset)
            for line in snowball_testset:
                # ignore blank lines and '#' comment lines
                if not line.strip() or line[0] == '#':
                    continue
                fields = line.strip().split(',')
                word, expected = fields[0], fields[1].lower()
                actual = porter(word)
                # name the offending word so a corpus failure is traceable
                self.assertEqual(
                    actual, expected,
                    msg='porter({!r}) returned {!r}, expected {!r}'.format(
                        word, actual, expected))
297
|
|
|
|
298
|
|
|
|
299
|
|
|
class Porter2TestCases(unittest.TestCase):
    """Test Porter2 functions.

    abydos.stemmer._sb_r1, abydos.stemmer._sb_r2,
    abydos.stemmer._sb_ends_in_short_syllable, abydos.stemmer._sb_short_word,
    & abydos.stemmer.porter2
    """

    def test_sb_r1(self):
        """Test abydos.stemmer._sb_r1."""
        _vowels = set('aeiouy')
        # base case
        self.assertEqual(_sb_r1('', _vowels), 0)

        # examples from http://snowball.tartarus.org/texts/r1r2.html
        self.assertEqual(_sb_r1('beautiful', _vowels), 5)
        self.assertEqual(_sb_r1('beauty', _vowels), 5)
        self.assertEqual(_sb_r1('beau', _vowels), 4)
        self.assertEqual(_sb_r1('animadversion', _vowels), 2)
        self.assertEqual(_sb_r1('sprinkled', _vowels), 5)
        self.assertEqual(_sb_r1('eucharist', _vowels), 3)

    def test_sb_r2(self):
        """Test abydos.stemmer._sb_r2."""
        _vowels = set('aeiouy')
        # base case
        self.assertEqual(_sb_r2('', _vowels), 0)

        # examples from http://snowball.tartarus.org/texts/r1r2.html
        self.assertEqual(_sb_r2('beautiful', _vowels), 7)
        self.assertEqual(_sb_r2('beauty', _vowels), 6)
        self.assertEqual(_sb_r2('beau', _vowels), 4)
        self.assertEqual(_sb_r2('animadversion', _vowels), 4)
        self.assertEqual(_sb_r2('sprinkled', _vowels), 9)
        self.assertEqual(_sb_r2('eucharist', _vowels), 6)

    def test_sb_ends_in_short_syllable(self):
        """Test abydos.stemmer._sb_ends_in_short_syllable."""
        _vowels = set('aeiouy')
        _codanonvowels = set('bcdfghjklmnpqrstvz\'')
        # base case
        self.assertFalse(_sb_ends_in_short_syllable('', _vowels,
                                                    _codanonvowels))

        # examples from
        # http://snowball.tartarus.org/algorithms/english/stemmer.html
        self.assertTrue(_sb_ends_in_short_syllable('rap', _vowels,
                                                   _codanonvowels))
        self.assertTrue(_sb_ends_in_short_syllable('trap', _vowels,
                                                   _codanonvowels))
        self.assertTrue(_sb_ends_in_short_syllable('entrap', _vowels,
                                                   _codanonvowels))
        self.assertTrue(_sb_ends_in_short_syllable('ow', _vowels,
                                                   _codanonvowels))
        self.assertTrue(_sb_ends_in_short_syllable('on', _vowels,
                                                   _codanonvowels))
        self.assertTrue(_sb_ends_in_short_syllable('at', _vowels,
                                                   _codanonvowels))
        # ('uproot' was previously asserted twice in a row; one copy kept)
        self.assertFalse(_sb_ends_in_short_syllable('uproot', _vowels,
                                                    _codanonvowels))
        self.assertFalse(_sb_ends_in_short_syllable('bestow', _vowels,
                                                    _codanonvowels))
        self.assertFalse(_sb_ends_in_short_syllable('disturb', _vowels,
                                                    _codanonvowels))

        # missed branch test cases
        self.assertFalse(_sb_ends_in_short_syllable('d', _vowels,
                                                    _codanonvowels))
        self.assertFalse(_sb_ends_in_short_syllable('a', _vowels,
                                                    _codanonvowels))

    def test_sb_short_word(self):
        """Test abydos.stemmer._sb_short_word."""
        _vowels = set('aeiouy')
        _codanonvowels = set('bcdfghjklmnpqrstvz\'')
        # base case
        self.assertFalse(_sb_short_word('', _vowels, _codanonvowels))

        # examples from
        # http://snowball.tartarus.org/algorithms/english/stemmer.html
        self.assertTrue(_sb_short_word('bed', _vowels, _codanonvowels))
        self.assertTrue(_sb_short_word('shed', _vowels, _codanonvowels))
        self.assertTrue(_sb_short_word('shred', _vowels, _codanonvowels))
        self.assertFalse(_sb_short_word('bead', _vowels, _codanonvowels))
        self.assertFalse(_sb_short_word('embed', _vowels, _codanonvowels))
        self.assertFalse(_sb_short_word('beds', _vowels, _codanonvowels))

    def test_porter2(self):
        """Test abydos.stemmer.porter2."""
        # base case
        self.assertEqual(porter2(''), '')

        # simple cases
        self.assertEqual(porter2('c'), 'c')
        self.assertEqual(porter2('da'), 'da')
        self.assertEqual(porter2('ad'), 'ad')
        self.assertEqual(porter2('sing'), 'sing')
        self.assertEqual(porter2('singing'), 'sing')

        # missed branch test cases
        self.assertEqual(porter2('capitalism'), 'capit')
        self.assertEqual(porter2('fatalism'), 'fatal')
        self.assertEqual(porter2('dog\'s'), 'dog')
        self.assertEqual(porter2('A\'s\''), 'a')
        self.assertEqual(porter2('agreedly'), 'agre')
        self.assertEqual(porter2('feedly'), 'feed')
        self.assertEqual(porter2('stional'), 'stional')
        self.assertEqual(porter2('palism'), 'palism')
        self.assertEqual(porter2('sization'), 'sizat')
        self.assertEqual(porter2('licated'), 'licat')
        self.assertEqual(porter2('lical'), 'lical')
        self.assertEqual(porter2('clessly'), 'clessli')
        self.assertEqual(porter2('tably'), 'tabli')
        self.assertEqual(porter2('sizer'), 'sizer')
        self.assertEqual(porter2('livity'), 'liviti')

    def test_porter2_early_english(self):
        """Test abydos.stemmer.porter2 (early English)."""
        # base case
        self.assertEqual(porter2('', early_english=True), '')

        # simple cases (no different from regular stemmer)
        self.assertEqual(porter2('c', early_english=True), 'c')
        self.assertEqual(porter2('da', early_english=True), 'da')
        self.assertEqual(porter2('ad', early_english=True), 'ad')
        self.assertEqual(porter2('sing', early_english=True), 'sing')
        self.assertEqual(porter2('singing', early_english=True), 'sing')

        # make
        self.assertEqual(porter2('make', early_english=True), 'make')
        self.assertEqual(porter2('makes', early_english=True), 'make')
        self.assertEqual(porter2('maketh', early_english=True), 'make')
        self.assertEqual(porter2('makest', early_english=True), 'make')

        # say
        self.assertEqual(porter2('say', early_english=True), 'say')
        self.assertEqual(porter2('says', early_english=True), 'say')
        self.assertEqual(porter2('sayeth', early_english=True), 'say')
        self.assertEqual(porter2('sayest', early_english=True), 'say')

        # missed branch test cases
        self.assertEqual(porter2('best', early_english=True), 'best')
        self.assertEqual(porter2('meth', early_english=True), 'meth')

    def test_porter2_snowball(self):
        """Test abydos.stemmer.porter2 (Snowball testset).

        These test cases are from
        http://snowball.tartarus.org/algorithms/english/diffs.txt

        Each data line of the corpus is read as ``word,stem``; the stem is
        lowercased before comparison.
        """
        # Snowball Porter2 test set
        # Opened with codecs.open(..., encoding='utf-8') like the other
        # corpus tests in this module, so lines are decoded text on both
        # Python 2 and Python 3 rather than depending on open()'s default.
        corpus = os.path.join(TESTDIR, 'corpora', 'snowball_porter2.csv')
        with codecs.open(corpus, encoding='utf-8') as snowball_testset:
            # the first line is skipped (presumably a header row)
            next(snowball_testset)
            for line in snowball_testset:
                # ignore blank lines and '#' comment lines
                if not line.strip() or line[0] == '#':
                    continue
                fields = line.strip().split(',')
                word, expected = fields[0], fields[1].lower()
                actual = porter2(word)
                # name the offending word so a corpus failure is traceable
                self.assertEqual(
                    actual, expected,
                    msg='porter2({!r}) returned {!r}, expected {!r}'.format(
                        word, actual, expected))
459
|
|
|
|
460
|
|
|
|
461
|
|
|
class SnowballTestCases(unittest.TestCase):
    """Test Snowball functions.

    abydos.stemmer.sb_german, abydos.stemmer.sb_dutch,
    abydos.stemmer.sb_norwegian, abydos.stemmer.sb_swedish, &
    abydos.stemmer.sb_danish
    """

    def _check_corpus(self, stemmer, corpus_name):
        """Assert that stemmer reproduces every stem in a corpus CSV.

        :param stemmer: callable taking a word and returning its stem
        :param corpus_name: file name inside the 'corpora' directory

        The first line of the file is skipped, as are blank lines and lines
        starting with '#'. Every other line is read as ``word,stem`` and the
        stem is lowercased before comparison.
        """
        # os.path.join keeps the path relative when TESTDIR is '' instead
        # of silently producing '/corpora/...'.
        corpus = os.path.join(TESTDIR, 'corpora', corpus_name)
        with codecs.open(corpus, encoding='utf-8') as snowball_testset:
            # the first line is skipped (presumably a header row)
            next(snowball_testset)
            for line in snowball_testset:
                if not line.strip() or line[0] == '#':
                    continue
                fields = line.strip().split(',')
                word, expected = fields[0], fields[1].lower()
                actual = stemmer(word)
                # name the offending word so a corpus failure is traceable
                self.assertEqual(
                    actual, expected,
                    msg='{}({!r}) returned {!r}, expected {!r}'.format(
                        getattr(stemmer, '__name__', 'stemmer'),
                        word, actual, expected))

    def test_sb_german_snowball(self):
        """Test abydos.stemmer.sb_german (Snowball testset).

        These test cases are from
        http://snowball.tartarus.org/algorithms/german/diffs.txt
        """
        # base case
        self.assertEqual(sb_german(''), '')

        # Snowball German test set
        self._check_corpus(sb_german, 'snowball_german.csv')

        # missed branch test cases
        self.assertEqual(sb_german('ikeit'), 'ikeit')

    def test_sb_german_snowball_alt(self):
        """Test abydos.stemmer.sb_german (alternate vowels)."""
        # base case
        self.assertEqual(sb_german('', alternate_vowels=True), '')

        # dämmerung,dammer
        self.assertEqual(sb_german('dämmerung', alternate_vowels=True),
                         'dammer')
        self.assertEqual(sb_german('daemmerung', alternate_vowels=True),
                         'dammer')
        self.assertEqual(sb_german('dämmerung'), 'dammer')
        self.assertEqual(sb_german('daemmerung'), 'daemmer')

        # brötchen,brotch
        self.assertEqual(sb_german('brötchen', alternate_vowels=True),
                         'brotch')
        self.assertEqual(sb_german('broetchen', alternate_vowels=True),
                         'brotch')
        self.assertEqual(sb_german('brötchen'), 'brotch')
        self.assertEqual(sb_german('broetchen'), 'broetch')

        # büro,buro
        self.assertEqual(sb_german('büro', alternate_vowels=True), 'buro')
        self.assertEqual(sb_german('buero', alternate_vowels=True), 'buro')
        self.assertEqual(sb_german('büro'), 'buro')
        self.assertEqual(sb_german('buero'), 'buero')

        # häufen,hauf
        self.assertEqual(sb_german('häufen', alternate_vowels=True), 'hauf')
        self.assertEqual(sb_german('haeufen', alternate_vowels=True), 'hauf')
        self.assertEqual(sb_german('häufen'), 'hauf')
        self.assertEqual(sb_german('haeufen'), 'haeuf')

        # quelle,quell
        self.assertEqual(sb_german('qülle', alternate_vowels=True), 'qull')
        self.assertEqual(sb_german('quelle', alternate_vowels=True), 'quell')
        self.assertEqual(sb_german('qülle'), 'qull')
        self.assertEqual(sb_german('quelle'), 'quell')

        # feuer,feuer
        self.assertEqual(sb_german('feür', alternate_vowels=True), 'feur')
        self.assertEqual(sb_german('feuer', alternate_vowels=True), 'feu')
        self.assertEqual(sb_german('feür'), 'feur')
        self.assertEqual(sb_german('feuer'), 'feu')

        # über,uber
        self.assertEqual(sb_german('über', alternate_vowels=True), 'uber')
        self.assertEqual(sb_german('ueber', alternate_vowels=True), 'uber')
        self.assertEqual(sb_german('über'), 'uber')
        self.assertEqual(sb_german('ueber'), 'ueb')

    def test_sb_dutch_snowball(self):
        """Test abydos.stemmer.sb_dutch (Snowball testset).

        These test cases are from
        http://snowball.tartarus.org/algorithms/dutch/diffs.txt
        """
        # base case
        self.assertEqual(sb_dutch(''), '')

        # Snowball Dutch test set
        self._check_corpus(sb_dutch, 'snowball_dutch.csv')

        # missed branch test cases
        self.assertEqual(sb_dutch('zondulielijk'), 'zondulie')

    def test_sb_norwegian_snowball(self):
        """Test abydos.stemmer.sb_norwegian (Snowball testset).

        These test cases are from
        http://snowball.tartarus.org/algorithms/norwegian/diffs.txt
        """
        # base case
        self.assertEqual(sb_norwegian(''), '')

        # Snowball Norwegian test set
        self._check_corpus(sb_norwegian, 'snowball_norwegian.csv')

    def test_sb_swedish_snowball(self):
        """Test abydos.stemmer.sb_swedish (Snowball testset).

        These test cases are from
        http://snowball.tartarus.org/algorithms/swedish/diffs.txt
        """
        # base case
        self.assertEqual(sb_swedish(''), '')

        # Snowball Swedish test set
        self._check_corpus(sb_swedish, 'snowball_swedish.csv')

    def test_sb_danish_snowball(self):
        """Test abydos.stemmer.sb_danish (Snowball testset).

        These test cases are from
        http://snowball.tartarus.org/algorithms/danish/diffs.txt
        """
        # base case
        self.assertEqual(sb_danish(''), '')

        # Snowball Danish test set
        self._check_corpus(sb_danish, 'snowball_danish.csv')
620
|
|
|
|
621
|
|
|
|
622
|
|
|
class CLEFTestCases(unittest.TestCase): |
623
|
|
|
"""Test CLEF functions. |
624
|
|
|
|
625
|
|
|
abydos.stemmer.clef_german, abydos.stemmer.clef_german_plus, & |
626
|
|
|
abydos.stemmer.clef_swedish |
627
|
|
|
""" |
628
|
|
|
|
629
|
|
|
def test_clef_german(self):
    """Test abydos.stemmer.clef_german.

    Each (word, stem) pair below is checked in order with assertEqual.
    """
    expectations = (
        # base case
        ('', ''),

        # len <= 2
        ('ä', 'a'),
        ('er', 'er'),
        ('es', 'es'),
        ('äh', 'ah'),

        # len > 2
        ('deinen', 'dein'),
        ('können', 'konn'),
        ('Damen', 'dame'),
        ('kleines', 'klein'),
        ('Namen', 'name'),
        ('Äpfel', 'apfel'),
        ('Jahre', 'jahr'),
        ('Mannes', 'mann'),
        ('Häuser', 'haus'),
        ('Motoren', 'motor'),
        ('kleine', 'klein'),
        ('Pfingsten', 'pfingst'),
        ('lautest', 'lautest'),
        ('lauteste', 'lautest'),
        ('lautere', 'lauter'),
        ('lautste', 'lautst'),
        ('kleinen', 'klei'),
    )
    for token, wanted in expectations:
        self.assertEqual(clef_german(token), wanted)
658
|
|
|
|
659
|
|
|
def test_clef_german_plus(self):
    """Test abydos.stemmer.clef_german_plus.

    Each (word, stem) pair below is checked in order with assertEqual.
    """
    expectations = (
        # base case
        ('', ''),

        # len <= 2
        ('ä', 'a'),
        ('er', 'er'),
        ('es', 'es'),
        ('äh', 'ah'),

        # len > 2
        ('deinen', 'dein'),
        ('können', 'konn'),
        ('Damen', 'dam'),
        ('kleines', 'klein'),
        ('Namen', 'nam'),
        ('Äpfel', 'apfel'),
        ('Jahre', 'jahr'),
        ('Mannes', 'mann'),
        ('Häuser', 'haus'),
        ('Motoren', 'motor'),
        ('kleine', 'klein'),
        ('Pfingsten', 'pfing'),
        ('lautest', 'laut'),
        ('lauteste', 'laut'),
        ('lautere', 'laut'),
        ('lautste', 'laut'),
        ('kleinen', 'klein'),
        ('Pfarrern', 'pfarr'),
    )
    for token, wanted in expectations:
        self.assertEqual(clef_german_plus(token), wanted)
689
|
|
|
|
690
|
|
|
def test_clef_swedish(self):
    """Check abydos.stemmer.clef_swedish against known word/stem pairs.

    The first group of words is expected to come back unchanged; the
    second group lists each input next to the stem it must produce.
    """
    unchanged_words = (
        # base case
        '',
        # unstemmed
        'konung',
        # len <= 3
        'km', 'ja', 'de', 'in', 'a', 'mer', 's', 'e', 'oss', 'hos',
    )
    for word in unchanged_words:
        self.assertEqual(clef_swedish(word), word)

    expected_stems = (
        # genitive
        ('svenskars', 'svensk'),
        ('stadens', 'stad'),
        ('kommuns', 'kommu'),
        ('aftonbladets', 'aftonblad'),
        # len > 7
        ('fängelser', 'fäng'),
        ('möjligheten', 'möjlig'),
        # len > 6
        ('svenskar', 'svensk'),
        ('myndigheterna', 'myndighet'),
        ('avgörande', 'avgör'),
        ('fängelse', 'fäng'),
        ('viktigaste', 'viktig'),
        ('kvinnorna', 'kvinn'),
        ('åklagaren', 'åklag'),
        # len > 5
        ('tidigare', 'tidig'),
        ('senast', 'sen'),
        ('möjlighet', 'möjlig'),
        # len > 4
        ('svenskar', 'svensk'),
        ('skriver', 'skriv'),
        ('människor', 'människ'),
        ('staden', 'stad'),
        ('kunnat', 'kunn'),
        ('samarbete', 'samarbe'),
        ('aftonbladet', 'aftonblad'),
        # len > 3
        ('allt', 'all'),
        ('vilka', 'vilk'),
        ('länge', 'läng'),
        ('kommun', 'kommu'),
    )
    for word, stem in expected_stems:
        self.assertEqual(clef_swedish(word), stem)
748
|
|
|
|
749
|
|
|
|
750
|
|
|
class CaumannsTestCases(unittest.TestCase):
    """Test Caumanns functions.

    abydos.stemmer.caumanns
    """

    def _assert_stems(self, pairs):
        """Assert that caumanns(word) equals stem for each (word, stem)."""
        for word, stem in pairs:
            self.assertEqual(caumanns(word), stem)

    def test_caumanns(self):
        """Test abydos.stemmer.caumanns."""
        self._assert_stems((
            # base case
            ('', ''),
            # tests from Caumanns' description of the algorithm
            ('singt', 'sing'),
            ('singen', 'sing'),
            ('beliebt', 'belieb'),
            ('beliebtester', 'belieb'),
            ('stören', 'stor'),
            ('stöhnen', 'stoh'),
            ('Kuß', 'kuss'),
            ('Küsse', 'kuss'),
            ('Verlierer', 'verlier'),
            ('Verlies', 'verlie'),
            ('Maus', 'mau'),
            ('Mauer', 'mau'),
            ('Störsender', 'stor'),
            # additional tests to achieve full coverage
            ('Müllerinnen', 'mullerin'),
            ('Matrix', 'matrix'),
            ('Matrizen', 'matrix'),
        ))

    def test_caumanns_lucene(self):
        """Test abydos.stemmer.caumanns (Lucene tests).

        The word/stem pairs are based on the tests from
        https://svn.apache.org/repos/asf/lucene.net/trunk/test/contrib/Analyzers/De/data.txt
        This is presumably Apache-licensed.
        """
        self._assert_stems((
            # German special characters are replaced:
            ('häufig', 'haufig'),
            ('üor', 'uor'),
            ('björk', 'bjork'),

            # here the stemmer works okay, it maps related words to the
            # same stem:
            ('abschließen', 'abschliess'),
            ('abschließender', 'abschliess'),
            ('abschließendes', 'abschliess'),
            ('abschließenden', 'abschliess'),

            ('Tisch', 'tisch'),
            ('Tische', 'tisch'),
            ('Tischen', 'tisch'),
            ('geheimtür', 'geheimtur'),

            ('Haus', 'hau'),
            ('Hauses', 'hau'),
            ('Häuser', 'hau'),
            ('Häusern', 'hau'),
            # here's a case where overstemming occurs, i.e. a word is
            # mapped to the same stem as unrelated words:
            ('hauen', 'hau'),

            # here's a case where understemming occurs, i.e. two related
            # words are not mapped to the same stem. This is the case with
            # basically all irregular forms:
            ('Drama', 'drama'),
            ('Dramen', 'dram'),

            # replace "ß" with 'ss':
            ('Ausmaß', 'ausmass'),

            # fake words to test if suffixes are cut off:
            ('xxxxxe', 'xxxxx'),
            ('xxxxxs', 'xxxxx'),
            ('xxxxxn', 'xxxxx'),
            ('xxxxxt', 'xxxxx'),
            ('xxxxxem', 'xxxxx'),
            ('xxxxxer', 'xxxxx'),
            ('xxxxxnd', 'xxxxx'),
            # the suffixes are also removed when combined:
            ('xxxxxetende', 'xxxxx'),

            # words that are shorter than four characters are not changed:
            ('xxe', 'xxe'),
            # -em and -er are not removed from words shorter than five
            # characters:
            ('xxem', 'xxem'),
            ('xxer', 'xxer'),
            # -nd is not removed from words shorter than six characters:
            ('xxxnd', 'xxxnd'),
        ))
839
|
|
|
|
840
|
|
|
|
841
|
|
|
class UEALiteTestCases(unittest.TestCase):
    """Test UEA-lite functions.

    abydos.stemmer.uealite
    """

    def test_uealite(self):
        """Test abydos.stemmer.uealite.

        The same set of words is stemmed twice: once with the default
        variant and once with var='Adams'. Each table lists the input word
        next to the stem that variant is expected to return.
        """
        # base case
        self.assertEqual(uealite(''), '')

        # test cases copied from Ruby port
        # https://github.com/ealdent/uea-stemmer/blob/master/test/uea_stemmer_test.rb
        # These are corrected to match the Java version's output.
        default_cases = (
            # stem base words to just the base word
            ('man', 'man'),
            ('happiness', 'happiness'),
            # stem theses as thesis
            ('theses', 'thesis'),
            # stem preterite words ending in -ed without the -ed
            ('ordained', 'ordain'),
            ('killed', 'kill'),
            ('liked', 'lik'),
            ('helped', 'help'),
            ('scarred', 'scarre'),
            ('invited', 'invit'),
            ('exited', 'exit'),
            ('debited', 'debit'),
            ('smited', 'smit'),
            # stem progressive verbs and gerunds without the -ing
            ('running', 'run'),
            ('settings', 'set'),
            ('timing', 'time'),
            ('dying', 'dy'),
            ('harping', 'harp'),
            ('charring', 'char'),
            # false progressive verbs such as 'sing' (the expected values
            # follow the Java output, which does alter some of them)
            ('ring', 'ring'),
            ('sing', 'se'),
            ('bring', 'br'),
            ('fling', 'fle'),
            # stem various plural nouns and 3rd-pres verbs without the
            # -s/-es
            ('changes', 'change'),
            ('deaths', 'death'),
            ('shadows', 'shadow'),
            ('flies', 'fly'),
            ('things', 'thing'),
            ('nothings', 'nothing'),
            ('witches', 'witch'),
            ('makes', 'mak'),
            ('smokes', 'smok'),
            ('does', 'do'),
            # stem various words with -des suffix
            ('abodes', 'abod'),
            ('escapades', 'escapad'),
            ('crusades', 'crusad'),
            ('grades', 'grad'),
            # stem various words with -res suffix
            ('wires', 'wir'),
            ('acres', 'acr'),
            ('fires', 'fir'),
            ('cares', 'car'),
            # stem acronyms when pluralized otherwise they should be left
            # alone
            ('USA', 'USA'),
            ('FLOSS', 'FLOSS'),
            ('MREs', 'MRE'),
            ('USAED', 'USAED'),
        )
        for word, stem in default_cases:
            self.assertEqual(uealite(word), stem)
        # ... but do not stem bases as basis
        self.assertNotEqual(uealite('bases'), 'basis')

        # The same Ruby-port test cases, run with var='Adams'
        # https://github.com/ealdent/uea-stemmer/blob/master/test/uea_stemmer_test.rb
        adams_cases = (
            # stem base words to just the base word
            ('man', 'man'),
            ('happiness', 'happiness'),
            # stem theses as thesis
            ('theses', 'thesis'),
            # stem preterite words ending in -ed without the -ed
            ('ordained', 'ordain'),
            ('killed', 'kill'),
            ('liked', 'like'),
            ('helped', 'help'),
            # ('scarred', 'scar'),  # currently disabled
            ('invited', 'invite'),
            ('exited', 'exit'),
            ('debited', 'debit'),
            ('smited', 'smite'),
            # stem progressive verbs and gerunds without the -ing
            ('running', 'run'),
            ('settings', 'set'),
            ('timing', 'time'),
            ('dying', 'die'),
            ('harping', 'harp'),
            ('charring', 'char'),
            # not stem false progressive verbs such as 'sing'
            ('ring', 'ring'),
            ('sing', 'sing'),
            ('bring', 'bring'),
            ('fling', 'fling'),
            # stem various plural nouns and 3rd-pres verbs without the
            # -s/-es
            ('changes', 'change'),
            ('deaths', 'death'),
            ('shadows', 'shadow'),
            ('flies', 'fly'),
            ('things', 'thing'),
            ('nothings', 'nothing'),
            ('witches', 'witch'),
            ('makes', 'make'),
            ('smokes', 'smoke'),
            ('does', 'do'),
            # stem various words with -des suffix
            ('abodes', 'abode'),
            ('escapades', 'escapade'),
            ('crusades', 'crusade'),
            ('grades', 'grade'),
            # stem various words with -res suffix
            ('wires', 'wire'),
            ('acres', 'acre'),
            ('fires', 'fire'),
            ('cares', 'care'),
            # stem acronyms when pluralized otherwise they should be left
            # alone
            ('USA', 'USA'),
            ('FLOSS', 'FLOSS'),
            ('MREs', 'MRE'),
            ('USAED', 'USAED'),
        )
        for word, stem in adams_cases:
            self.assertEqual(uealite(word, var='Adams'), stem)
        # ... but do not stem bases as basis
        self.assertNotEqual(uealite('bases', var='Adams'), 'basis')

    def test_uealite_wsj_set(self):
        """Test abydos.stemmer.uealite using the WSJ test set.

        Each line of corpora/uea-lite_wsj.csv is split on commas into three
        fields: the input word, the expected stem and the expected rule
        number. The stemmer is called with return_rule_no=True and must
        return the pair (stem, float(rule)).
        """
        # os.path.join copes with an empty TESTDIR (module run from its own
        # directory); plain concatenation would yield '/corpora/...' there.
        wsj_path = os.path.join(TESTDIR, 'corpora', 'uea-lite_wsj.csv')
        with open(wsj_path) as wsj_testset:
            for wsj_line in wsj_testset:
                (word, uea, rule) = wsj_line.strip().split(',')
                self.assertEqual(uealite(word, return_rule_no=True),
                                 (uea, float(rule)))
975
|
|
|
|
976
|
|
|
|
977
|
|
|
# class LancasterTestCases(unittest.TestCase): |
978
|
|
|
# """Test Lancaster functions. |
979
|
|
|
# |
980
|
|
|
# abydos.stemmer.lancaster |
981
|
|
|
# """ |
982
|
|
|
# |
983
|
|
|
# def test_lancaster(self): |
984
|
|
|
# """Test abydos.stemmer.lancaster.""" |
985
|
|
|
# # base case |
986
|
|
|
# self.assertEqual(lancaster(''), '') |
987
|
|
|
# |
988
|
|
|
# # cases copied from |
989
|
|
|
# # http://www.nltk.org/_modules/nltk/stem/lancaster.html |
990
|
|
|
# # self.assertEqual(lancaster('maximum'), 'maxim') |
991
|
|
|
# # self.assertEqual(lancaster('presumably'), 'presum') |
992
|
|
|
# # self.assertEqual(lancaster('multiply'), 'multiply') |
993
|
|
|
# # self.assertEqual(lancaster('provision'), 'provid') |
994
|
|
|
# # self.assertEqual(lancaster('owed'), 'ow') |
995
|
|
|
# # self.assertEqual(lancaster('ear'), 'ear') |
996
|
|
|
# # self.assertEqual(lancaster('saying'), 'say') |
997
|
|
|
# # self.assertEqual(lancaster('crying'), 'cry') |
998
|
|
|
# # self.assertEqual(lancaster('string'), 'string') |
999
|
|
|
# # self.assertEqual(lancaster('meant'), 'meant') |
1000
|
|
|
# # self.assertEqual(lancaster('cement'), 'cem') |
1001
|
|
|
|
1002
|
|
|
|
1003
|
|
|
# Run the test cases defined in this module when the file is executed
# directly as a script (rather than imported by a test runner).
if __name__ == '__main__':
    unittest.main()
1005
|
|
|
|