1
|
|
|
# -*- coding: utf-8 -*- |
2
|
|
|
|
3
|
|
|
# Copyright 2014-2018 by Christopher C. Little. |
4
|
|
|
# This file is part of Abydos. |
5
|
|
|
# |
6
|
|
|
# Abydos is free software: you can redistribute it and/or modify |
7
|
|
|
# it under the terms of the GNU General Public License as published by |
8
|
|
|
# the Free Software Foundation, either version 3 of the License, or |
9
|
|
|
# (at your option) any later version. |
10
|
|
|
# |
11
|
|
|
# Abydos is distributed in the hope that it will be useful, |
12
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
13
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14
|
|
|
# GNU General Public License for more details. |
15
|
|
|
# |
16
|
|
|
# You should have received a copy of the GNU General Public License |
17
|
|
|
# along with Abydos. If not, see <http://www.gnu.org/licenses/>. |
18
|
|
|
|
19
|
|
|
"""abydos.tests.test_stemmer. |
20
|
|
|
|
21
|
|
|
This module contains unit tests for abydos.stemmer |
22
|
|
|
""" |
23
|
|
|
|
24
|
|
|
from __future__ import unicode_literals |
25
|
|
|
|
26
|
|
|
import codecs |
27
|
|
|
import os |
28
|
|
|
import unittest |
29
|
|
|
|
30
|
|
|
from abydos.stemmer import _ends_in_cvc, _ends_in_doubled_cons, _m_degree, \ |
31
|
|
|
_sb_ends_in_short_syllable, _sb_has_vowel, _sb_r1, _sb_r2, \ |
32
|
|
|
_sb_short_word, caumanns, clef_german, clef_german_plus, clef_swedish, \ |
33
|
|
|
lovins, paice_husk, porter, porter2, sb_danish, sb_dutch, sb_german, \ |
34
|
|
|
sb_norwegian, sb_swedish, uealite |
35
|
|
|
|
36
|
|
|
TESTDIR = os.path.dirname(__file__) |
37
|
|
|
|
38
|
|
|
|
39
|
|
|
class LovinsTestCases(unittest.TestCase):
    """Test Lovins functions.

    abydos.stemmer.lovins
    """

    def test_lovins(self):
        """Test abydos.stemmer.lovins."""
        # base case
        self.assertEqual(lovins(''), '')

        # test cases from Lovins' "Development of a Stemming Algorithm":
        # http://www.mt-archive.info/MT-1968-Lovins.pdf
        paper_examples = (
            ('magnesia', 'magnes'),
            ('magnesite', 'magnes'),
            ('magnesian', 'magnes'),
            ('magnesium', 'magnes'),
            ('magnet', 'magnet'),
            ('magnetic', 'magnet'),
            ('magneto', 'magnet'),
            ('magnetically', 'magnet'),
            ('magnetism', 'magnet'),
            ('magnetite', 'magnet'),
            ('magnetitic', 'magnet'),
            ('magnetizable', 'magnet'),
            ('magnetization', 'magnet'),
            ('magnetize', 'magnet'),
            ('magnetometer', 'magnetometer'),
            ('magnetometric', 'magnetometer'),
            ('magnetometry', 'magnetometer'),
            ('magnetomotive', 'magnetomot'),
            ('magnetron', 'magnetron'),
            ('metal', 'metal'),
            ('metall', 'metal'),
            ('metallically', 'metal'),
            ('metalliferous', 'metallifer'),
            ('metallize', 'metal'),
            ('metallurgical', 'metallurg'),
            ('metallurgy', 'metallurg'),
            ('induction', 'induc'),
            ('inductance', 'induc'),
            ('induced', 'induc'),
            ('angular', 'angl'),
            ('angle', 'angl'),
        )
        for word, stem in paper_examples:
            self.assertEqual(lovins(word), stem)

        # missed branch test cases
        self.assertEqual(lovins('feminism'), 'fem')

    def test_lovins_snowball(self):
        """Test abydos.stemmer.lovins (Snowball testset).

        These test cases are from
        https://github.com/snowballstem/snowball-data/tree/master/lovins
        """
        # Snowball Lovins test set
        # os.path.join keeps the path relative when TESTDIR is '' (module
        # run from its own directory); TESTDIR+'/corpora/...' would turn
        # into the absolute path '/corpora/...' in that case.
        corpus = os.path.join(TESTDIR, 'corpora', 'snowball_lovins.csv')
        with codecs.open(corpus, encoding='utf-8') as snowball_testset:
            # discard the first line (presumably a header row)
            next(snowball_testset)
            for line in snowball_testset:
                # lines starting with '#' are treated as comments
                if line.startswith('#'):
                    continue
                fields = line.strip().split(',')
                word, stem = fields[0], fields[1]
                self.assertEqual(lovins(word), stem.lower())
102
|
|
|
|
103
|
|
|
|
104
|
|
|
class PorterTestCases(unittest.TestCase):
    """Test Porter functions.

    abydos.stemmer._m_degree, abydos.stemmer.porter,
    abydos.stemmer._sb_has_vowel, abydos.stemmer._ends_in_doubled_cons,
    & abydos.stemmer._ends_in_cvc
    """

    def test_m_degree(self):
        """Test abydos.stemmer._m_degree."""
        _vowels = set('aeiouy')
        # base case
        self.assertEqual(_m_degree('', _vowels), 0)

        # m==0
        for term in ('tr', 'ee', 'tree', 'y', 'by'):
            self.assertEqual(_m_degree(term, _vowels), 0)

        # m==1
        for term in ('trouble', 'oats', 'trees', 'ivy'):
            self.assertEqual(_m_degree(term, _vowels), 1)

        # m==2
        for term in ('troubles', 'private', 'oaten', 'orrery'):
            self.assertEqual(_m_degree(term, _vowels), 2)

    def test_has_vowel(self):
        """Test abydos.stemmer._sb_has_vowel."""
        _vowels = set('aeiouy')
        # base case
        self.assertFalse(_sb_has_vowel('', _vowels))

        # False cases (upper-case 'Y' is not a member of _vowels)
        for term in ('b', 'c', 'bc', 'bcdfghjklmnpqrstvwxYz', 'Y'):
            self.assertFalse(_sb_has_vowel(term, _vowels))

        # True cases
        for term in ('a', 'e', 'ae', 'aeiouy', 'y'):
            self.assertTrue(_sb_has_vowel(term, _vowels))

        for term in ('ade', 'cad', 'add', 'phi', 'pfy'):
            self.assertTrue(_sb_has_vowel(term, _vowels))

        self.assertFalse(_sb_has_vowel('pfY', _vowels))

    def test_ends_in_doubled_cons(self):
        """Test abydos.stemmer._ends_in_doubled_cons."""
        _vowels = set('aeiouy')
        # base case
        self.assertFalse(_ends_in_doubled_cons('', _vowels))

        # False cases
        not_doubled = (
            'b', 'c', 'bc', 'bcdfghjklmnpqrstvwxYz', 'Y', 'a', 'e', 'ae',
            'aeiouy', 'y', 'ade', 'cad', 'phi', 'pfy', 'faddy', 'aiii',
            'ayyy',
        )
        for term in not_doubled:
            self.assertFalse(_ends_in_doubled_cons(term, _vowels))

        # True cases
        doubled = (
            'add', 'fadd', 'fadddd', 'raYY', 'doll', 'parr', 'parrr', 'bacc',
        )
        for term in doubled:
            self.assertTrue(_ends_in_doubled_cons(term, _vowels))

    def test_ends_in_cvc(self):
        """Test abydos.stemmer._ends_in_cvc."""
        _vowels = set('aeiouy')
        # base case
        self.assertFalse(_ends_in_cvc('', _vowels))

        # False cases
        not_cvc = (
            'b', 'c', 'bc', 'bcdfghjklmnpqrstvwxYz', 'YYY', 'ddd', 'faaf',
            'rare', 'rhy',
        )
        for term in not_cvc:
            self.assertFalse(_ends_in_cvc(term, _vowels))

        # True cases
        cvc = (
            'dad', 'phad', 'faded', 'maYor', 'enlil', 'parer', 'padres',
            'bacyc',
        )
        for term in cvc:
            self.assertTrue(_ends_in_cvc(term, _vowels))

        # Special case for W, X, & Y
        for term in ('craw', 'max', 'cray'):
            self.assertFalse(_ends_in_cvc(term, _vowels))

    def test_porter(self):
        """Test abydos.stemmer.porter."""
        # base case
        self.assertEqual(porter(''), '')

        # simple cases
        simple_cases = (
            ('c', 'c'),
            ('da', 'da'),
            ('ad', 'ad'),
            ('sing', 'sing'),
            ('singing', 'sing'),
        )
        for word, stem in simple_cases:
            self.assertEqual(porter(word), stem)

        # missed branch test cases
        branch_cases = (
            ('capitalism', 'capit'),
            ('fatalism', 'fatal'),
            ('stional', 'stional'),
            ('palism', 'palism'),
            ('sization', 'sizat'),
            ('licated', 'licat'),
            ('lical', 'lical'),
        )
        for word, stem in branch_cases:
            self.assertEqual(porter(word), stem)

    def test_porter_early_english(self):
        """Test abydos.stemmer.porter (early English)."""
        # base case
        self.assertEqual(porter('', early_english=True), '')

        early_english_cases = (
            # simple cases (no different from regular stemmer)
            ('c', 'c'),
            ('da', 'da'),
            ('ad', 'ad'),
            ('sing', 'sing'),
            ('singing', 'sing'),
            # make
            ('make', 'make'),
            ('makes', 'make'),
            ('maketh', 'make'),
            ('makest', 'make'),
            # say
            ('say', 'sai'),
            ('says', 'sai'),
            ('sayeth', 'sai'),
            ('sayest', 'sai'),
            # missed branch test cases
            ('best', 'best'),
            ('meth', 'meth'),
        )
        for word, stem in early_english_cases:
            self.assertEqual(porter(word, early_english=True), stem)

    def test_porter_snowball(self):
        """Test abydos.stemmer.porter (Snowball testset).

        These test cases are from
        http://snowball.tartarus.org/algorithms/porter/diffs.txt
        """
        # Snowball Porter test set
        # os.path.join keeps the path relative when TESTDIR is '' (module
        # run from its own directory); TESTDIR+'/corpora/...' would turn
        # into the absolute path '/corpora/...' in that case.
        corpus = os.path.join(TESTDIR, 'corpora', 'snowball_porter.csv')
        # Decode explicitly as UTF-8, as the other corpus-driven tests in
        # this module do, rather than relying on the platform default.
        with codecs.open(corpus, encoding='utf-8') as snowball_testset:
            # discard the first line (presumably a header row)
            next(snowball_testset)
            for line in snowball_testset:
                # lines starting with '#' are treated as comments
                if line.startswith('#'):
                    continue
                fields = line.strip().split(',')
                word, stem = fields[0], fields[1]
                self.assertEqual(porter(word), stem.lower())
296
|
|
|
|
297
|
|
|
|
298
|
|
|
class Porter2TestCases(unittest.TestCase):
    """Test Porter2 functions.

    abydos.stemmer._sb_r1, abydos.stemmer._sb_r2,
    abydos.stemmer._sb_ends_in_short_syllable, abydos.stemmer._sb_short_word,
    & abydos.stemmer.porter2
    """

    def test_sb_r1(self):
        """Test abydos.stemmer._sb_r1."""
        _vowels = set('aeiouy')
        # base case
        self.assertEqual(_sb_r1('', _vowels), 0)

        # examples from http://snowball.tartarus.org/texts/r1r2.html
        r1_examples = (
            ('beautiful', 5),
            ('beauty', 5),
            ('beau', 4),
            ('animadversion', 2),
            ('sprinkled', 5),
            ('eucharist', 3),
        )
        for term, start in r1_examples:
            self.assertEqual(_sb_r1(term, _vowels), start)

    def test_sb_r2(self):
        """Test abydos.stemmer._sb_r2."""
        _vowels = set('aeiouy')
        # base case
        self.assertEqual(_sb_r2('', _vowels), 0)

        # examples from http://snowball.tartarus.org/texts/r1r2.html
        r2_examples = (
            ('beautiful', 7),
            ('beauty', 6),
            ('beau', 4),
            ('animadversion', 4),
            ('sprinkled', 9),
            ('eucharist', 6),
        )
        for term, start in r2_examples:
            self.assertEqual(_sb_r2(term, _vowels), start)

    def test_sb_ends_in_short_syllable(self):
        """Test abydos.stemmer._sb_ends_in_short_syllable."""
        _vowels = set('aeiouy')
        _codanonvowels = set('bcdfghjklmnpqrstvz\'')
        # base case
        self.assertFalse(_sb_ends_in_short_syllable('', _vowels,
                                                    _codanonvowels))

        # examples from
        # http://snowball.tartarus.org/algorithms/english/stemmer.html
        for term in ('rap', 'trap', 'entrap', 'ow', 'on', 'at'):
            self.assertTrue(_sb_ends_in_short_syllable(term, _vowels,
                                                       _codanonvowels))
        # ('uproot' was previously asserted twice; once is sufficient)
        for term in ('uproot', 'bestow', 'disturb'):
            self.assertFalse(_sb_ends_in_short_syllable(term, _vowels,
                                                        _codanonvowels))

        # missed branch test cases
        for term in ('d', 'a'):
            self.assertFalse(_sb_ends_in_short_syllable(term, _vowels,
                                                        _codanonvowels))

    def test_sb_short_word(self):
        """Test abydos.stemmer._sb_short_word."""
        _vowels = set('aeiouy')
        _codanonvowels = set('bcdfghjklmnpqrstvz\'')
        # base case
        self.assertFalse(_sb_short_word('', _vowels, _codanonvowels))

        # examples from
        # http://snowball.tartarus.org/algorithms/english/stemmer.html
        for term in ('bed', 'shed', 'shred'):
            self.assertTrue(_sb_short_word(term, _vowels, _codanonvowels))
        for term in ('bead', 'embed', 'beds'):
            self.assertFalse(_sb_short_word(term, _vowels, _codanonvowels))

    def test_porter2(self):
        """Test abydos.stemmer.porter2."""
        # base case
        self.assertEqual(porter2(''), '')

        # simple cases
        simple_cases = (
            ('c', 'c'),
            ('da', 'da'),
            ('ad', 'ad'),
            ('sing', 'sing'),
            ('singing', 'sing'),
        )
        for word, stem in simple_cases:
            self.assertEqual(porter2(word), stem)

        # missed branch test cases
        branch_cases = (
            ('capitalism', 'capit'),
            ('fatalism', 'fatal'),
            ('dog\'s', 'dog'),
            ('A\'s\'', 'a'),
            ('agreedly', 'agre'),
            ('feedly', 'feed'),
            ('stional', 'stional'),
            ('palism', 'palism'),
            ('sization', 'sizat'),
            ('licated', 'licat'),
            ('lical', 'lical'),
            ('clessly', 'clessli'),
            ('tably', 'tabli'),
            ('sizer', 'sizer'),
            ('livity', 'liviti'),
        )
        for word, stem in branch_cases:
            self.assertEqual(porter2(word), stem)

    def test_porter2_early_english(self):
        """Test abydos.stemmer.porter2 (early English)."""
        # base case
        self.assertEqual(porter2('', early_english=True), '')

        early_english_cases = (
            # simple cases (no different from regular stemmer)
            ('c', 'c'),
            ('da', 'da'),
            ('ad', 'ad'),
            ('sing', 'sing'),
            ('singing', 'sing'),
            # make
            ('make', 'make'),
            ('makes', 'make'),
            ('maketh', 'make'),
            ('makest', 'make'),
            # say
            ('say', 'say'),
            ('says', 'say'),
            ('sayeth', 'say'),
            ('sayest', 'say'),
            # missed branch test cases
            ('best', 'best'),
            ('meth', 'meth'),
        )
        for word, stem in early_english_cases:
            self.assertEqual(porter2(word, early_english=True), stem)

    def test_porter2_snowball(self):
        """Test abydos.stemmer.porter2 (Snowball testset).

        These test cases are from
        http://snowball.tartarus.org/algorithms/english/diffs.txt
        """
        # Snowball Porter2 test set
        # os.path.join keeps the path relative when TESTDIR is '' (module
        # run from its own directory); TESTDIR+'/corpora/...' would turn
        # into the absolute path '/corpora/...' in that case.
        corpus = os.path.join(TESTDIR, 'corpora', 'snowball_porter2.csv')
        # Decode explicitly as UTF-8, as the other corpus-driven tests in
        # this module do, rather than relying on the platform default.
        with codecs.open(corpus, encoding='utf-8') as snowball_testset:
            # discard the first line (presumably a header row)
            next(snowball_testset)
            for line in snowball_testset:
                # lines starting with '#' are treated as comments
                if line.startswith('#'):
                    continue
                fields = line.strip().split(',')
                word, stem = fields[0], fields[1]
                self.assertEqual(porter2(word), stem.lower())
458
|
|
|
|
459
|
|
|
|
460
|
|
|
class SnowballTestCases(unittest.TestCase):
    """Test Snowball functions.

    abydos.stemmer.sb_german, abydos.stemmer.sb_dutch,
    abydos.stemmer.sb_norwegian, abydos.stemmer.sb_swedish, &
    abydos.stemmer.sb_danish
    """

    def _assert_corpus_stems(self, stemmer, corpus_name):
        """Check a stemmer against every word,stem row of a corpus file.

        :param stemmer: callable taking a word and returning its stem
        :param corpus_name: file name inside the ``corpora`` directory that
            sits next to this module
        """
        # os.path.join keeps the path relative when TESTDIR is '' (module
        # run from its own directory); TESTDIR+'/corpora/...' would turn
        # into the absolute path '/corpora/...' in that case.
        corpus = os.path.join(TESTDIR, 'corpora', corpus_name)
        with codecs.open(corpus, encoding='utf-8') as snowball_testset:
            # discard the first line (presumably a header row)
            next(snowball_testset)
            for line in snowball_testset:
                # lines starting with '#' are treated as comments
                if line.startswith('#'):
                    continue
                fields = line.strip().split(',')
                word, stem = fields[0], fields[1]
                self.assertEqual(stemmer(word), stem.lower())

    def test_sb_german_snowball(self):
        """Test abydos.stemmer.sb_german (Snowball testset).

        These test cases are from
        http://snowball.tartarus.org/algorithms/german/diffs.txt
        """
        # base case
        self.assertEqual(sb_german(''), '')

        # Snowball German test set
        self._assert_corpus_stems(sb_german, 'snowball_german.csv')

        # missed branch test cases
        self.assertEqual(sb_german('ikeit'), 'ikeit')

    def test_sb_german_snowball_alt(self):
        """Test abydos.stemmer.sb_german (alternate vowels)."""
        # base case
        self.assertEqual(sb_german('', alternate_vowels=True), '')

        # Each row: the umlaut spelling, the digraph (ae/oe/ue) spelling,
        # then the expected stems with alternate_vowels=True (umlaut,
        # digraph) and with the default setting (umlaut, digraph).
        alt_cases = (
            # dämmerung,dammer
            ('dämmerung', 'daemmerung',
             'dammer', 'dammer', 'dammer', 'daemmer'),
            # brötchen,brotch
            ('brötchen', 'broetchen',
             'brotch', 'brotch', 'brotch', 'broetch'),
            # büro,buro
            ('büro', 'buero', 'buro', 'buro', 'buro', 'buero'),
            # häufen,hauf
            ('häufen', 'haeufen', 'hauf', 'hauf', 'hauf', 'haeuf'),
            # quelle,quell
            ('qülle', 'quelle', 'qull', 'quell', 'qull', 'quell'),
            # feuer,feuer
            ('feür', 'feuer', 'feur', 'feu', 'feur', 'feu'),
            # über,uber
            ('über', 'ueber', 'uber', 'uber', 'uber', 'ueb'),
        )
        for (umlaut, digraph,
             alt_umlaut, alt_digraph,
             plain_umlaut, plain_digraph) in alt_cases:
            self.assertEqual(sb_german(umlaut, alternate_vowels=True),
                             alt_umlaut)
            self.assertEqual(sb_german(digraph, alternate_vowels=True),
                             alt_digraph)
            self.assertEqual(sb_german(umlaut), plain_umlaut)
            self.assertEqual(sb_german(digraph), plain_digraph)

    def test_sb_dutch_snowball(self):
        """Test abydos.stemmer.sb_dutch (Snowball testset).

        These test cases are from
        http://snowball.tartarus.org/algorithms/dutch/diffs.txt
        """
        # base case
        self.assertEqual(sb_dutch(''), '')

        # Snowball Dutch test set
        self._assert_corpus_stems(sb_dutch, 'snowball_dutch.csv')

        # missed branch test cases
        self.assertEqual(sb_dutch('zondulielijk'), 'zondulie')

    def test_sb_norwegian_snowball(self):
        """Test abydos.stemmer.sb_norwegian (Snowball testset).

        These test cases are from
        http://snowball.tartarus.org/algorithms/norwegian/diffs.txt
        """
        # base case
        self.assertEqual(sb_norwegian(''), '')

        # Snowball Norwegian test set
        self._assert_corpus_stems(sb_norwegian, 'snowball_norwegian.csv')

    def test_sb_swedish_snowball(self):
        """Test abydos.stemmer.sb_swedish (Snowball testset).

        These test cases are from
        http://snowball.tartarus.org/algorithms/swedish/diffs.txt
        """
        # base case
        self.assertEqual(sb_swedish(''), '')

        # Snowball Swedish test set
        self._assert_corpus_stems(sb_swedish, 'snowball_swedish.csv')

    def test_sb_danish_snowball(self):
        """Test abydos.stemmer.sb_danish (Snowball testset).

        These test cases are from
        http://snowball.tartarus.org/algorithms/danish/diffs.txt
        """
        # base case
        self.assertEqual(sb_danish(''), '')

        # Snowball Danish test set
        self._assert_corpus_stems(sb_danish, 'snowball_danish.csv')
619
|
|
|
|
620
|
|
|
|
621
|
|
|
class CLEFTestCases(unittest.TestCase): |
622
|
|
|
"""Test CLEF functions. |
623
|
|
|
|
624
|
|
|
abydos.stemmer.clef_german, abydos.stemmer.clef_german_plus, & |
625
|
|
|
abydos.stemmer.clef_swedish |
626
|
|
|
""" |
627
|
|
|
|
628
|
|
|
def test_clef_german(self):
    """Test abydos.stemmer.clef_german."""
    # base case
    self.assertEqual(clef_german(''), '')

    expectations = (
        # len <= 2
        ('ä', 'a'),
        ('er', 'er'),
        ('es', 'es'),
        ('äh', 'ah'),
        # len > 2
        ('deinen', 'dein'),
        ('können', 'konn'),
        ('Damen', 'dame'),
        ('kleines', 'klein'),
        ('Namen', 'name'),
        ('Äpfel', 'apfel'),
        ('Jahre', 'jahr'),
        ('Mannes', 'mann'),
        ('Häuser', 'haus'),
        ('Motoren', 'motor'),
        ('kleine', 'klein'),
        ('Pfingsten', 'pfingst'),
        ('lautest', 'lautest'),
        ('lauteste', 'lautest'),
        ('lautere', 'lauter'),
        ('lautste', 'lautst'),
        ('kleinen', 'klei'),
    )
    for term, expected in expectations:
        self.assertEqual(clef_german(term), expected)
657
|
|
|
|
658
|
|
|
def test_clef_german_plus(self):
    """Test abydos.stemmer.clef_german_plus."""
    # base case
    self.assertEqual(clef_german_plus(''), '')

    expectations = (
        # len <= 2
        ('ä', 'a'),
        ('er', 'er'),
        ('es', 'es'),
        ('äh', 'ah'),
        # len > 2
        ('deinen', 'dein'),
        ('können', 'konn'),
        ('Damen', 'dam'),
        ('kleines', 'klein'),
        ('Namen', 'nam'),
        ('Äpfel', 'apfel'),
        ('Jahre', 'jahr'),
        ('Mannes', 'mann'),
        ('Häuser', 'haus'),
        ('Motoren', 'motor'),
        ('kleine', 'klein'),
        ('Pfingsten', 'pfing'),
        ('lautest', 'laut'),
        ('lauteste', 'laut'),
        ('lautere', 'laut'),
        ('lautste', 'laut'),
        ('kleinen', 'klein'),
        ('Pfarrern', 'pfarr'),
    )
    for term, expected in expectations:
        self.assertEqual(clef_german_plus(term), expected)
688
|
|
|
|
689
|
|
|
def test_clef_swedish(self):
    """Check abydos.stemmer.clef_swedish against known stems."""
    # base case
    self.assertEqual(clef_swedish(''), '')

    # unstemmed
    self.assertEqual(clef_swedish('konung'), 'konung')

    # len <= 3: each of these words is expected back unchanged
    short_words = ('km', 'ja', 'de', 'in', 'a', 'mer', 's', 'e', 'oss',
                   'hos')
    for word in short_words:
        self.assertEqual(clef_swedish(word), word)

    expected_stems = (
        # genitive
        ('svenskars', 'svensk'),
        ('stadens', 'stad'),
        ('kommuns', 'kommu'),
        ('aftonbladets', 'aftonblad'),

        # len > 7
        ('fängelser', 'fäng'),
        ('möjligheten', 'möjlig'),

        # len > 6
        ('svenskar', 'svensk'),
        ('myndigheterna', 'myndighet'),
        ('avgörande', 'avgör'),
        ('fängelse', 'fäng'),
        ('viktigaste', 'viktig'),
        ('kvinnorna', 'kvinn'),
        ('åklagaren', 'åklag'),

        # len > 5
        ('tidigare', 'tidig'),
        ('senast', 'sen'),
        ('möjlighet', 'möjlig'),

        # len > 4
        ('svenskar', 'svensk'),
        ('skriver', 'skriv'),
        ('människor', 'människ'),
        ('staden', 'stad'),
        ('kunnat', 'kunn'),
        ('samarbete', 'samarbe'),
        ('aftonbladet', 'aftonblad'),

        # len > 3
        ('allt', 'all'),
        ('vilka', 'vilk'),
        ('länge', 'läng'),
        ('kommun', 'kommu'),
    )
    for word, stem in expected_stems:
        self.assertEqual(clef_swedish(word), stem)
747
|
|
|
|
748
|
|
|
|
749
|
|
|
class CaumannsTestCases(unittest.TestCase):
    """Test Caumanns functions.

    abydos.stemmer.caumanns
    """

    def test_caumanns(self):
        """Check abydos.stemmer.caumanns against known stems."""
        expected_stems = (
            # base case
            ('', ''),

            # tests from Caumanns' description of the algorithm
            ('singt', 'sing'),
            ('singen', 'sing'),
            ('beliebt', 'belieb'),
            ('beliebtester', 'belieb'),
            ('stören', 'stor'),
            ('stöhnen', 'stoh'),
            ('Kuß', 'kuss'),
            ('Küsse', 'kuss'),
            ('Verlierer', 'verlier'),
            ('Verlies', 'verlie'),
            ('Maus', 'mau'),
            ('Mauer', 'mau'),
            ('Störsender', 'stor'),

            # additional tests to achieve full coverage
            ('Müllerinnen', 'mullerin'),
            ('Matrix', 'matrix'),
            ('Matrizen', 'matrix'),
        )
        for word, stem in expected_stems:
            self.assertEqual(caumanns(word), stem)

    def test_caumanns_lucene(self):
        """Check abydos.stemmer.caumanns against the Lucene test words.

        Based on tests from
        https://svn.apache.org/repos/asf/lucene.net/trunk/test/contrib/Analyzers/De/data.txt
        This is presumably Apache-licensed.
        """
        expected_stems = (
            # German special characters are replaced:
            ('häufig', 'haufig'),
            ('üor', 'uor'),
            ('björk', 'bjork'),

            # here the stemmer works okay, it maps related words to the
            # same stem:
            ('abschließen', 'abschliess'),
            ('abschließender', 'abschliess'),
            ('abschließendes', 'abschliess'),
            ('abschließenden', 'abschliess'),

            ('Tisch', 'tisch'),
            ('Tische', 'tisch'),
            ('Tischen', 'tisch'),
            ('geheimtür', 'geheimtur'),

            ('Haus', 'hau'),
            ('Hauses', 'hau'),
            ('Häuser', 'hau'),
            ('Häusern', 'hau'),
            # a case of overstemming, i.e. a word is mapped to the same
            # stem as unrelated words:
            ('hauen', 'hau'),

            # a case of understemming, i.e. two related words are not
            # mapped to the same stem. This is the case with basically
            # all irregular forms:
            ('Drama', 'drama'),
            ('Dramen', 'dram'),

            # "ß" is replaced with 'ss':
            ('Ausmaß', 'ausmass'),

            # fake words to test if suffixes are cut off:
            ('xxxxxe', 'xxxxx'),
            ('xxxxxs', 'xxxxx'),
            ('xxxxxn', 'xxxxx'),
            ('xxxxxt', 'xxxxx'),
            ('xxxxxem', 'xxxxx'),
            ('xxxxxer', 'xxxxx'),
            ('xxxxxnd', 'xxxxx'),
            # the suffixes are also removed when combined:
            ('xxxxxetende', 'xxxxx'),

            # words that are shorter than four characters are not changed:
            ('xxe', 'xxe'),
            # -em and -er are not removed from words shorter than five
            # characters:
            ('xxem', 'xxem'),
            ('xxer', 'xxer'),
            # -nd is not removed from words shorter than six characters:
            ('xxxnd', 'xxxnd'),
        )
        for word, stem in expected_stems:
            self.assertEqual(caumanns(word), stem)
838
|
|
|
|
839
|
|
|
|
840
|
|
|
class UEALiteTestCases(unittest.TestCase):
    """Test UEA-lite functions.

    abydos.stemmer.uealite
    """

    def test_uealite(self):
        """Test abydos.stemmer.uealite.

        The word list is copied from the Ruby port's test suite:
        https://github.com/ealdent/uea-stemmer/blob/master/test/uea_stemmer_test.rb
        It is checked twice: once with the default variant, where the
        expected stems are corrected to match the Java version's output,
        and once with var='Adams'.
        """
        # base case
        self.assertEqual(uealite(''), '')

        # Default variant; expected values follow the Java version.
        default_stems = (
            # stem base words to just the base word
            ('man', 'man'),
            ('happiness', 'happiness'),
            # stem theses as thesis
            ('theses', 'thesis'),
            # stem preterite words ending in -ed without the -ed
            ('ordained', 'ordain'),
            ('killed', 'kill'),
            ('liked', 'lik'),
            ('helped', 'help'),
            ('scarred', 'scarre'),
            ('invited', 'invit'),
            ('exited', 'exit'),
            ('debited', 'debit'),
            ('smited', 'smit'),
            # stem progressive verbs and gerunds without the -ing
            ('running', 'run'),
            ('settings', 'set'),
            ('timing', 'time'),
            ('dying', 'dy'),
            ('harping', 'harp'),
            ('charring', 'char'),
            # false progressive verbs such as 'sing'
            ('ring', 'ring'),
            ('sing', 'se'),
            ('bring', 'br'),
            ('fling', 'fle'),
            # stem various plural nouns and 3rd-pres verbs without the
            # -s/-es
            ('changes', 'change'),
            ('deaths', 'death'),
            ('shadows', 'shadow'),
            ('flies', 'fly'),
            ('things', 'thing'),
            ('nothings', 'nothing'),
            ('witches', 'witch'),
            ('makes', 'mak'),
            ('smokes', 'smok'),
            ('does', 'do'),
            # stem various words with -des suffix
            ('abodes', 'abod'),
            ('escapades', 'escapad'),
            ('crusades', 'crusad'),
            ('grades', 'grad'),
            # stem various words with -res suffix
            ('wires', 'wir'),
            ('acres', 'acr'),
            ('fires', 'fir'),
            ('cares', 'car'),
            # stem acronyms when pluralized otherwise they should be left
            # alone
            ('USA', 'USA'),
            ('FLOSS', 'FLOSS'),
            ('MREs', 'MRE'),
            ('USAED', 'USAED'),
        )
        for word, stem in default_stems:
            self.assertEqual(uealite(word), stem)
        # ... but do not stem bases as basis
        self.assertNotEqual(uealite('bases'), 'basis')

        # Adams variant of the same word list.
        # NOTE: no expectation is asserted for 'scarred' here; the Ruby
        # port's value ('scar') was disabled in this suite, presumably
        # because the Adams variant does not produce it.
        adams_stems = (
            # stem base words to just the base word
            ('man', 'man'),
            ('happiness', 'happiness'),
            # stem theses as thesis
            ('theses', 'thesis'),
            # stem preterite words ending in -ed without the -ed
            ('ordained', 'ordain'),
            ('killed', 'kill'),
            ('liked', 'like'),
            ('helped', 'help'),
            ('invited', 'invite'),
            ('exited', 'exit'),
            ('debited', 'debit'),
            ('smited', 'smite'),
            # stem progressive verbs and gerunds without the -ing
            ('running', 'run'),
            ('settings', 'set'),
            ('timing', 'time'),
            ('dying', 'die'),
            ('harping', 'harp'),
            ('charring', 'char'),
            # not stem false progressive verbs such as 'sing'
            ('ring', 'ring'),
            ('sing', 'sing'),
            ('bring', 'bring'),
            ('fling', 'fling'),
            # stem various plural nouns and 3rd-pres verbs without the
            # -s/-es
            ('changes', 'change'),
            ('deaths', 'death'),
            ('shadows', 'shadow'),
            ('flies', 'fly'),
            ('things', 'thing'),
            ('nothings', 'nothing'),
            ('witches', 'witch'),
            ('makes', 'make'),
            ('smokes', 'smoke'),
            ('does', 'do'),
            # stem various words with -des suffix
            ('abodes', 'abode'),
            ('escapades', 'escapade'),
            ('crusades', 'crusade'),
            ('grades', 'grade'),
            # stem various words with -res suffix
            ('wires', 'wire'),
            ('acres', 'acre'),
            ('fires', 'fire'),
            ('cares', 'care'),
            # stem acronyms when pluralized otherwise they should be left
            # alone
            ('USA', 'USA'),
            ('FLOSS', 'FLOSS'),
            ('MREs', 'MRE'),
            ('USAED', 'USAED'),
        )
        for word, stem in adams_stems:
            self.assertEqual(uealite(word, var='Adams'), stem)
        # ... but do not stem bases as basis
        self.assertNotEqual(uealite('bases', var='Adams'), 'basis')

    def test_uealite_wsj_set(self):
        """Test abydos.stemmer.uealite using the WSJ test set.

        Each line of the corpus file holds a word, its expected stem and
        the expected rule number, separated by commas.
        """
        # os.path.join (rather than TESTDIR + '/...') keeps the path
        # relative when TESTDIR is the empty string, e.g. when this module
        # is run directly from its own directory.
        wsj_path = os.path.join(TESTDIR, 'corpora', 'uea-lite_wsj.csv')
        with open(wsj_path) as wsj_testset:
            for wsj_line in wsj_testset:
                (word, uea, rule) = wsj_line.strip().split(',')
                self.assertEqual(uealite(word, return_rule_no=True),
                                 (uea, float(rule)))
974
|
|
|
|
975
|
|
|
|
976
|
|
|
class PaiceHuskTestCases(unittest.TestCase):
    """Test Paice-Husk functions.

    abydos.stemmer.paice_husk
    """

    def test_paice_husk(self):
        """Test abydos.stemmer.paice_husk."""
        expected_stems = (
            # base case
            ('', ''),

            # cases copied from
            # https://doi.org/10.1145/101306.101310
            ('maximum', 'maxim'),
            ('presumably', 'presum'),
            ('multiply', 'multiply'),
            ('provision', 'provid'),
            ('owed', 'ow'),
            ('owing', 'ow'),
            ('ear', 'ear'),
            ('saying', 'say'),
            ('crying', 'cry'),
            ('string', 'string'),
            ('meant', 'meant'),
            ('cement', 'cem'),
        )
        for word, stem in expected_stems:
            self.assertEqual(paice_husk(word), stem)

    def test_paice_husk_wsj_set(self):
        """Test abydos.stemmer.paice_husk using the Hopper262 test set.

        Source:
        https://raw.githubusercontent.com/Hopper262/paice-husk-stemmer/master/wordlist.txt

        The only corrections made to the stemmed values of the Hopper262
        set/implementations were:
         - ymca : ymc -> ymca
         - yttrium : yttr -> yttri
         - ywca : ywc -> ywca
        The Pascal reference implementation does not consider 'y' in initial
        position to be a vowel.
        """
        # os.path.join (rather than TESTDIR + '/...') keeps the path
        # relative when TESTDIR is the empty string, e.g. when this module
        # is run directly from its own directory.
        hopper_path = os.path.join(TESTDIR, 'corpora', 'paicehusk.csv')
        with open(hopper_path) as hopper_testset:
            for hopper_line in hopper_testset:
                # each line is "<word>,<expected stem>"
                (word, stem) = hopper_line.strip().split(',')
                self.assertEqual(paice_husk(word), stem)
1020
|
|
|
|
1021
|
|
|
|
1022
|
|
|
# Run the whole test module when it is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
1024
|
|
|
|