Completed
Push — master ( 14a933...449757 )
by Chris
09:19
created

tests.test_stemmer.LovinsTestCases.test_lovins()   A

Complexity

Conditions 1

Size

Total Lines 41
Code Lines 34

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 34
nop 1
dl 0
loc 41
rs 9.064
c 0
b 0
f 0
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19
"""abydos.tests.test_stemmer.
20
21
This module contains unit tests for abydos.stemmer
22
"""
23
24
from __future__ import unicode_literals
25
26
import codecs
27
import os
28
import unittest
29
30
from abydos.stemmer import _ends_in_cvc, _ends_in_doubled_cons, _m_degree, \
31
    _sb_ends_in_short_syllable, _sb_has_vowel, _sb_r1, _sb_r2, \
32
    _sb_short_word, caumanns, clef_german, clef_german_plus, clef_swedish, \
33
    lovins, paice_husk, porter, porter2, sb_danish, sb_dutch, sb_german, \
34
    sb_norwegian, sb_swedish, uealite
35
36
TESTDIR = os.path.dirname(__file__)
37
38
39
class LovinsTestCases(unittest.TestCase):
40
    """Test Lovins functions.
41
42
    abydos.stemmer.lovins
43
    """
44
45
    def test_lovins(self):
46
        """Test abydos.stemmer.lovins."""
47
        # base case
48
        self.assertEqual(lovins(''), '')
49
50
        # test cases from Lovins' "Development of a Stemming Algorithm":
51
        # http://www.mt-archive.info/MT-1968-Lovins.pdf
52
        self.assertEqual(lovins('magnesia'), 'magnes')
53
        self.assertEqual(lovins('magnesite'), 'magnes')
54
        self.assertEqual(lovins('magnesian'), 'magnes')
55
        self.assertEqual(lovins('magnesium'), 'magnes')
56
        self.assertEqual(lovins('magnet'), 'magnet')
57
        self.assertEqual(lovins('magnetic'), 'magnet')
58
        self.assertEqual(lovins('magneto'), 'magnet')
59
        self.assertEqual(lovins('magnetically'), 'magnet')
60
        self.assertEqual(lovins('magnetism'), 'magnet')
61
        self.assertEqual(lovins('magnetite'), 'magnet')
62
        self.assertEqual(lovins('magnetitic'), 'magnet')
63
        self.assertEqual(lovins('magnetizable'), 'magnet')
64
        self.assertEqual(lovins('magnetization'), 'magnet')
65
        self.assertEqual(lovins('magnetize'), 'magnet')
66
        self.assertEqual(lovins('magnetometer'), 'magnetometer')
67
        self.assertEqual(lovins('magnetometric'), 'magnetometer')
68
        self.assertEqual(lovins('magnetometry'), 'magnetometer')
69
        self.assertEqual(lovins('magnetomotive'), 'magnetomot')
70
        self.assertEqual(lovins('magnetron'), 'magnetron')
71
        self.assertEqual(lovins('metal'), 'metal')
72
        self.assertEqual(lovins('metall'), 'metal')
73
        self.assertEqual(lovins('metallically'), 'metal')
74
        self.assertEqual(lovins('metalliferous'), 'metallifer')
75
        self.assertEqual(lovins('metallize'), 'metal')
76
        self.assertEqual(lovins('metallurgical'), 'metallurg')
77
        self.assertEqual(lovins('metallurgy'), 'metallurg')
78
        self.assertEqual(lovins('induction'), 'induc')
79
        self.assertEqual(lovins('inductance'), 'induc')
80
        self.assertEqual(lovins('induced'), 'induc')
81
        self.assertEqual(lovins('angular'), 'angl')
82
        self.assertEqual(lovins('angle'), 'angl')
83
84
        # missed branch test cases
85
        self.assertEqual(lovins('feminism'), 'fem')
86
87
    def test_lovins_snowball(self):
88
        """Test abydos.stemmer.lovins (Snowball testset).
89
90
        These test cases are from
91
        https://github.com/snowballstem/snowball-data/tree/master/lovins
92
        """
93
        #  Snowball Lovins test set
94
        with codecs.open(TESTDIR+'/corpora/snowball_lovins.csv',
95
                         encoding='utf-8') as snowball_testset:
96
            next(snowball_testset)
97
            for line in snowball_testset:
98
                if line[0] != '#':
99
                    line = line.strip().split(',')
100
                    word, stem = line[0], line[1]
101
                    self.assertEqual(lovins(word), stem.lower())
102
103
104
class PorterTestCases(unittest.TestCase):
105
    """Test Porter functions.
106
107
    abydos.stemmer._m_degree, abydos.stemmer.porter,
108
    abydos.stemmer._sb_has_vowel, abydos.stemmer._ends_in_doubled_cons,
109
    & abydos.stemmer._ends_in_cvc
110
    """
111
112
    def test_m_degree(self):
113
        """Test abydos.stemmer._m_degree."""
114
        _vowels = set('aeiouy')
115
        # base case
116
        self.assertEqual(_m_degree('', _vowels), 0)
117
118
        # m==0
119
        self.assertEqual(_m_degree('tr', _vowels), 0)
120
        self.assertEqual(_m_degree('ee', _vowels), 0)
121
        self.assertEqual(_m_degree('tree', _vowels), 0)
122
        self.assertEqual(_m_degree('y', _vowels), 0)
123
        self.assertEqual(_m_degree('by', _vowels), 0)
124
125
        # m==1
126
        self.assertEqual(_m_degree('trouble', _vowels), 1)
127
        self.assertEqual(_m_degree('oats', _vowels), 1)
128
        self.assertEqual(_m_degree('trees', _vowels), 1)
129
        self.assertEqual(_m_degree('ivy', _vowels), 1)
130
131
        # m==2
132
        self.assertEqual(_m_degree('troubles', _vowels), 2)
133
        self.assertEqual(_m_degree('private', _vowels), 2)
134
        self.assertEqual(_m_degree('oaten', _vowels), 2)
135
        self.assertEqual(_m_degree('orrery', _vowels), 2)
136
137
    def test_has_vowel(self):
138
        """Test abydos.stemmer._has_vowel."""
139
        _vowels = set('aeiouy')
140
        # base case
141
        self.assertFalse(_sb_has_vowel('', _vowels))
142
143
        # False cases
144
        self.assertFalse(_sb_has_vowel('b', _vowels))
145
        self.assertFalse(_sb_has_vowel('c', _vowels))
146
        self.assertFalse(_sb_has_vowel('bc', _vowels))
147
        self.assertFalse(_sb_has_vowel('bcdfghjklmnpqrstvwxYz', _vowels))
148
        self.assertFalse(_sb_has_vowel('Y', _vowels))
149
150
        # True cases
151
        self.assertTrue(_sb_has_vowel('a', _vowels))
152
        self.assertTrue(_sb_has_vowel('e', _vowels))
153
        self.assertTrue(_sb_has_vowel('ae', _vowels))
154
        self.assertTrue(_sb_has_vowel('aeiouy', _vowels))
155
        self.assertTrue(_sb_has_vowel('y', _vowels))
156
157
        self.assertTrue(_sb_has_vowel('ade', _vowels))
158
        self.assertTrue(_sb_has_vowel('cad', _vowels))
159
        self.assertTrue(_sb_has_vowel('add', _vowels))
160
        self.assertTrue(_sb_has_vowel('phi', _vowels))
161
        self.assertTrue(_sb_has_vowel('pfy', _vowels))
162
163
        self.assertFalse(_sb_has_vowel('pfY', _vowels))
164
165
    def test_ends_in_doubled_cons(self):
166
        """Test abydos.stemmer._ends_in_doubled_cons."""
167
        _vowels = set('aeiouy')
168
        # base case
169
        self.assertFalse(_ends_in_doubled_cons('', _vowels))
170
171
        # False cases
172
        self.assertFalse(_ends_in_doubled_cons('b', _vowels))
173
        self.assertFalse(_ends_in_doubled_cons('c', _vowels))
174
        self.assertFalse(_ends_in_doubled_cons('bc', _vowels))
175
        self.assertFalse(_ends_in_doubled_cons('bcdfghjklmnpqrstvwxYz',
176
                                               _vowels))
177
        self.assertFalse(_ends_in_doubled_cons('Y', _vowels))
178
        self.assertFalse(_ends_in_doubled_cons('a', _vowels))
179
        self.assertFalse(_ends_in_doubled_cons('e', _vowels))
180
        self.assertFalse(_ends_in_doubled_cons('ae', _vowels))
181
        self.assertFalse(_ends_in_doubled_cons('aeiouy', _vowels))
182
        self.assertFalse(_ends_in_doubled_cons('y', _vowels))
183
        self.assertFalse(_ends_in_doubled_cons('ade', _vowels))
184
        self.assertFalse(_ends_in_doubled_cons('cad', _vowels))
185
        self.assertFalse(_ends_in_doubled_cons('phi', _vowels))
186
        self.assertFalse(_ends_in_doubled_cons('pfy', _vowels))
187
        self.assertFalse(_ends_in_doubled_cons('faddy', _vowels))
188
        self.assertFalse(_ends_in_doubled_cons('aiii', _vowels))
189
        self.assertFalse(_ends_in_doubled_cons('ayyy', _vowels))
190
191
        # True cases
192
        self.assertTrue(_ends_in_doubled_cons('add', _vowels))
193
        self.assertTrue(_ends_in_doubled_cons('fadd', _vowels))
194
        self.assertTrue(_ends_in_doubled_cons('fadddd', _vowels))
195
        self.assertTrue(_ends_in_doubled_cons('raYY', _vowels))
196
        self.assertTrue(_ends_in_doubled_cons('doll', _vowels))
197
        self.assertTrue(_ends_in_doubled_cons('parr', _vowels))
198
        self.assertTrue(_ends_in_doubled_cons('parrr', _vowels))
199
        self.assertTrue(_ends_in_doubled_cons('bacc', _vowels))
200
201
    def test_ends_in_cvc(self):
202
        """Test abydos.stemmer._ends_in_cvc."""
203
        _vowels = set('aeiouy')
204
        # base case
205
        self.assertFalse(_ends_in_cvc('', _vowels))
206
207
        # False cases
208
        self.assertFalse(_ends_in_cvc('b', _vowels))
209
        self.assertFalse(_ends_in_cvc('c', _vowels))
210
        self.assertFalse(_ends_in_cvc('bc', _vowels))
211
        self.assertFalse(_ends_in_cvc('bcdfghjklmnpqrstvwxYz', _vowels))
212
        self.assertFalse(_ends_in_cvc('YYY', _vowels))
213
        self.assertFalse(_ends_in_cvc('ddd', _vowels))
214
        self.assertFalse(_ends_in_cvc('faaf', _vowels))
215
        self.assertFalse(_ends_in_cvc('rare', _vowels))
216
        self.assertFalse(_ends_in_cvc('rhy', _vowels))
217
218
        # True cases
219
        self.assertTrue(_ends_in_cvc('dad', _vowels))
220
        self.assertTrue(_ends_in_cvc('phad', _vowels))
221
        self.assertTrue(_ends_in_cvc('faded', _vowels))
222
        self.assertTrue(_ends_in_cvc('maYor', _vowels))
223
        self.assertTrue(_ends_in_cvc('enlil', _vowels))
224
        self.assertTrue(_ends_in_cvc('parer', _vowels))
225
        self.assertTrue(_ends_in_cvc('padres', _vowels))
226
        self.assertTrue(_ends_in_cvc('bacyc', _vowels))
227
228
        # Special case for W, X, & Y
229
        self.assertFalse(_ends_in_cvc('craw', _vowels))
230
        self.assertFalse(_ends_in_cvc('max', _vowels))
231
        self.assertFalse(_ends_in_cvc('cray', _vowels))
232
233
    def test_porter(self):
234
        """Test abydos.stemmer.porter."""
235
        # base case
236
        self.assertEqual(porter(''), '')
237
238
        # simple cases
239
        self.assertEqual(porter('c'), 'c')
240
        self.assertEqual(porter('da'), 'da')
241
        self.assertEqual(porter('ad'), 'ad')
242
        self.assertEqual(porter('sing'), 'sing')
243
        self.assertEqual(porter('singing'), 'sing')
244
245
        # missed branch test cases
246
        self.assertEqual(porter('capitalism'), 'capit')
247
        self.assertEqual(porter('fatalism'), 'fatal')
248
        self.assertEqual(porter('stional'), 'stional')
249
        self.assertEqual(porter('palism'), 'palism')
250
        self.assertEqual(porter('sization'), 'sizat')
251
        self.assertEqual(porter('licated'), 'licat')
252
        self.assertEqual(porter('lical'), 'lical')
253
254
    def test_porter_early_english(self):
255
        """Test abydos.stemmer.porter (early English)."""
256
        # base case
257
        self.assertEqual(porter('', early_english=True), '')
258
259
        # simple cases (no different from regular stemmer)
260
        self.assertEqual(porter('c', early_english=True), 'c')
261
        self.assertEqual(porter('da', early_english=True), 'da')
262
        self.assertEqual(porter('ad', early_english=True), 'ad')
263
        self.assertEqual(porter('sing', early_english=True), 'sing')
264
        self.assertEqual(porter('singing', early_english=True), 'sing')
265
266
        # make
267
        self.assertEqual(porter('make', early_english=True), 'make')
268
        self.assertEqual(porter('makes', early_english=True), 'make')
269
        self.assertEqual(porter('maketh', early_english=True), 'make')
270
        self.assertEqual(porter('makest', early_english=True), 'make')
271
272
        # say
273
        self.assertEqual(porter('say', early_english=True), 'sai')
274
        self.assertEqual(porter('says', early_english=True), 'sai')
275
        self.assertEqual(porter('sayeth', early_english=True), 'sai')
276
        self.assertEqual(porter('sayest', early_english=True), 'sai')
277
278
        # missed branch test cases
279
        self.assertEqual(porter('best', early_english=True), 'best')
280
        self.assertEqual(porter('meth', early_english=True), 'meth')
281
282
    def test_porter_snowball(self):
283
        """Test abydos.stemmer.porter (Snowball testset).
284
285
        These test cases are from
286
        http://snowball.tartarus.org/algorithms/porter/diffs.txt
287
        """
288
        #  Snowball Porter test set
289
        with open(TESTDIR+'/corpora/snowball_porter.csv') as snowball_testset:
290
            next(snowball_testset)
291
            for line in snowball_testset:
292
                if line[0] != '#':
293
                    line = line.strip().split(',')
294
                    word, stem = line[0], line[1]
295
                    self.assertEqual(porter(word), stem.lower())
296
297
298
class Porter2TestCases(unittest.TestCase):
299
    """Test Porter2 functions.
300
301
    abydos.stemmer._sb_r1, abydos.stemmer._sb_r2,
302
    abydos.stemmer._sb_ends_in_short_syllable, abydos.stemmer._sb_short_word,
303
    & abydos.stemmer.porter2
304
    """
305
306
    def test_sb_r1(self):
307
        """Test abydos.stemmer._sb_r1."""
308
        _vowels = set('aeiouy')
309
        # base case
310
        self.assertEqual(_sb_r1('', _vowels), 0)
311
312
        # examples from http://snowball.tartarus.org/texts/r1r2.html
313
        self.assertEqual(_sb_r1('beautiful', _vowels), 5)
314
        self.assertEqual(_sb_r1('beauty', _vowels), 5)
315
        self.assertEqual(_sb_r1('beau', _vowels), 4)
316
        self.assertEqual(_sb_r1('animadversion', _vowels), 2)
317
        self.assertEqual(_sb_r1('sprinkled', _vowels), 5)
318
        self.assertEqual(_sb_r1('eucharist', _vowels), 3)
319
320
    def test_sb_r2(self):
321
        """Test abydos.stemmer._sb_r2."""
322
        _vowels = set('aeiouy')
323
        # base case
324
        self.assertEqual(_sb_r2('', _vowels), 0)
325
326
        # examples from http://snowball.tartarus.org/texts/r1r2.html
327
        self.assertEqual(_sb_r2('beautiful', _vowels), 7)
328
        self.assertEqual(_sb_r2('beauty', _vowels), 6)
329
        self.assertEqual(_sb_r2('beau', _vowels), 4)
330
        self.assertEqual(_sb_r2('animadversion', _vowels), 4)
331
        self.assertEqual(_sb_r2('sprinkled', _vowels), 9)
332
        self.assertEqual(_sb_r2('eucharist', _vowels), 6)
333
334
    def test_sb_ends_in_short_syllable(self):
335
        """Test abydos.stemmer._sb_ends_in_short_syllable."""
336
        _vowels = set('aeiouy')
337
        _codanonvowels = set('bcdfghjklmnpqrstvz\'')
338
        # base case
339
        self.assertFalse(_sb_ends_in_short_syllable('', _vowels,
340
                                                    _codanonvowels))
341
342
        # examples from
343
        # http://snowball.tartarus.org/algorithms/english/stemmer.html
344
        self.assertTrue(_sb_ends_in_short_syllable('rap', _vowels,
345
                                                   _codanonvowels))
346
        self.assertTrue(_sb_ends_in_short_syllable('trap', _vowels,
347
                                                   _codanonvowels))
348
        self.assertTrue(_sb_ends_in_short_syllable('entrap', _vowels,
349
                                                   _codanonvowels))
350
        self.assertTrue(_sb_ends_in_short_syllable('ow', _vowels,
351
                                                   _codanonvowels))
352
        self.assertTrue(_sb_ends_in_short_syllable('on', _vowels,
353
                                                   _codanonvowels))
354
        self.assertTrue(_sb_ends_in_short_syllable('at', _vowels,
355
                                                   _codanonvowels))
356
        self.assertFalse(_sb_ends_in_short_syllable('uproot', _vowels,
357
                                                    _codanonvowels))
358
        self.assertFalse(_sb_ends_in_short_syllable('uproot', _vowels,
359
                                                    _codanonvowels))
360
        self.assertFalse(_sb_ends_in_short_syllable('bestow', _vowels,
361
                                                    _codanonvowels))
362
        self.assertFalse(_sb_ends_in_short_syllable('disturb', _vowels,
363
                                                    _codanonvowels))
364
365
        # missed branch test cases
366
        self.assertFalse(_sb_ends_in_short_syllable('d', _vowels,
367
                                                    _codanonvowels))
368
        self.assertFalse(_sb_ends_in_short_syllable('a', _vowels,
369
                                                    _codanonvowels))
370
371
    def test_sb_short_word(self):
372
        """Test abydos.stemmer._sb_short_word."""
373
        _vowels = set('aeiouy')
374
        _codanonvowels = set('bcdfghjklmnpqrstvz\'')
375
        # base case
376
        self.assertFalse(_sb_short_word('', _vowels, _codanonvowels))
377
378
        # examples from
379
        # http://snowball.tartarus.org/algorithms/english/stemmer.html
380
        self.assertTrue(_sb_short_word('bed', _vowels, _codanonvowels))
381
        self.assertTrue(_sb_short_word('shed', _vowels, _codanonvowels))
382
        self.assertTrue(_sb_short_word('shred', _vowels, _codanonvowels))
383
        self.assertFalse(_sb_short_word('bead', _vowels, _codanonvowels))
384
        self.assertFalse(_sb_short_word('embed', _vowels, _codanonvowels))
385
        self.assertFalse(_sb_short_word('beds', _vowels, _codanonvowels))
386
387
    def test_porter2(self):
388
        """Test abydos.stemmer.porter2."""
389
        # base case
390
        self.assertEqual(porter2(''), '')
391
392
        # simple cases
393
        self.assertEqual(porter2('c'), 'c')
394
        self.assertEqual(porter2('da'), 'da')
395
        self.assertEqual(porter2('ad'), 'ad')
396
        self.assertEqual(porter2('sing'), 'sing')
397
        self.assertEqual(porter2('singing'), 'sing')
398
399
        # missed branch test cases
400
        self.assertEqual(porter2('capitalism'), 'capit')
401
        self.assertEqual(porter2('fatalism'), 'fatal')
402
        self.assertEqual(porter2('dog\'s'), 'dog')
403
        self.assertEqual(porter2('A\'s\''), 'a')
404
        self.assertEqual(porter2('agreedly'), 'agre')
405
        self.assertEqual(porter2('feedly'), 'feed')
406
        self.assertEqual(porter2('stional'), 'stional')
407
        self.assertEqual(porter2('palism'), 'palism')
408
        self.assertEqual(porter2('sization'), 'sizat')
409
        self.assertEqual(porter2('licated'), 'licat')
410
        self.assertEqual(porter2('lical'), 'lical')
411
        self.assertEqual(porter2('clessly'), 'clessli')
412
        self.assertEqual(porter2('tably'), 'tabli')
413
        self.assertEqual(porter2('sizer'), 'sizer')
414
        self.assertEqual(porter2('livity'), 'liviti')
415
416
    def test_porter2_early_english(self):
417
        """Test abydos.stemmer.porter2 (early English)."""
418
        # base case
419
        self.assertEqual(porter2('', early_english=True), '')
420
421
        # simple cases (no different from regular stemmer)
422
        self.assertEqual(porter2('c', early_english=True), 'c')
423
        self.assertEqual(porter2('da', early_english=True), 'da')
424
        self.assertEqual(porter2('ad', early_english=True), 'ad')
425
        self.assertEqual(porter2('sing', early_english=True), 'sing')
426
        self.assertEqual(porter2('singing', early_english=True), 'sing')
427
428
        # make
429
        self.assertEqual(porter2('make', early_english=True), 'make')
430
        self.assertEqual(porter2('makes', early_english=True), 'make')
431
        self.assertEqual(porter2('maketh', early_english=True), 'make')
432
        self.assertEqual(porter2('makest', early_english=True), 'make')
433
434
        # say
435
        self.assertEqual(porter2('say', early_english=True), 'say')
436
        self.assertEqual(porter2('says', early_english=True), 'say')
437
        self.assertEqual(porter2('sayeth', early_english=True), 'say')
438
        self.assertEqual(porter2('sayest', early_english=True), 'say')
439
440
        # missed branch test cases
441
        self.assertEqual(porter2('best', early_english=True), 'best')
442
        self.assertEqual(porter2('meth', early_english=True), 'meth')
443
444
    def test_porter2_snowball(self):
445
        """Test abydos.stemmer.porter2 (Snowball testset).
446
447
        These test cases are from
448
        http://snowball.tartarus.org/algorithms/english/diffs.txt
449
        """
450
        #  Snowball Porter test set
451
        with open(TESTDIR+'/corpora/snowball_porter2.csv') as snowball_testset:
452
            next(snowball_testset)
453
            for line in snowball_testset:
454
                if line[0] != '#':
455
                    line = line.strip().split(',')
456
                    word, stem = line[0], line[1]
457
                    self.assertEqual(porter2(word), stem.lower())
458
459
460
class SnowballTestCases(unittest.TestCase):
461
    """Test Snowball functions.
462
463
    abydos.stemmer.sb_german, abydos.stemmer.sb_dutch,
464
    abydos.stemmer.sb_norwegian, abydos.stemmer.sb_swedish, &
465
    abydos.stemmer.sb_danish
466
    """
467
468 View Code Duplication
    def test_sb_german_snowball(self):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
469
        """Test abydos.stemmer.sb_german (Snowball testset).
470
471
        These test cases are from
472
        http://snowball.tartarus.org/algorithms/german/diffs.txt
473
        """
474
        # base case
475
        self.assertEqual(sb_german(''), '')
476
477
        #  Snowball German test set
478
        with codecs.open(TESTDIR+'/corpora/snowball_german.csv',
479
                         encoding='utf-8') as snowball_testset:
480
            next(snowball_testset)
481
            for line in snowball_testset:
482
                if line[0] != '#':
483
                    line = line.strip().split(',')
484
                    word, stem = line[0], line[1]
485
                    self.assertEqual(sb_german(word), stem.lower())
486
487
        # missed branch test cases
488
        self.assertEqual(sb_german('ikeit'), 'ikeit')
489
490
    def test_sb_german_snowball_alt(self):
491
        """Test abydos.stemmer.sb_german (alternate vowels)."""
492
        # base case
493
        self.assertEqual(sb_german('', alternate_vowels=True), '')
494
495
        # dämmerung,dammer
496
        self.assertEqual(sb_german('dämmerung', alternate_vowels=True),
497
                         'dammer')
498
        self.assertEqual(sb_german('daemmerung', alternate_vowels=True),
499
                         'dammer')
500
        self.assertEqual(sb_german('dämmerung'), 'dammer')
501
        self.assertEqual(sb_german('daemmerung'), 'daemmer')
502
503
        # brötchen,brotch
504
        self.assertEqual(sb_german('brötchen', alternate_vowels=True),
505
                         'brotch')
506
        self.assertEqual(sb_german('broetchen', alternate_vowels=True),
507
                         'brotch')
508
        self.assertEqual(sb_german('brötchen'), 'brotch')
509
        self.assertEqual(sb_german('broetchen'), 'broetch')
510
511
        # büro,buro
512
        self.assertEqual(sb_german('büro', alternate_vowels=True), 'buro')
513
        self.assertEqual(sb_german('buero', alternate_vowels=True), 'buro')
514
        self.assertEqual(sb_german('büro'), 'buro')
515
        self.assertEqual(sb_german('buero'), 'buero')
516
517
        # häufen,hauf
518
        self.assertEqual(sb_german('häufen', alternate_vowels=True), 'hauf')
519
        self.assertEqual(sb_german('haeufen', alternate_vowels=True), 'hauf')
520
        self.assertEqual(sb_german('häufen'), 'hauf')
521
        self.assertEqual(sb_german('haeufen'), 'haeuf')
522
523
        # quelle,quell
524
        self.assertEqual(sb_german('qülle', alternate_vowels=True), 'qull')
525
        self.assertEqual(sb_german('quelle', alternate_vowels=True), 'quell')
526
        self.assertEqual(sb_german('qülle'), 'qull')
527
        self.assertEqual(sb_german('quelle'), 'quell')
528
529
        # feuer,feuer
530
        self.assertEqual(sb_german('feür', alternate_vowels=True), 'feur')
531
        self.assertEqual(sb_german('feuer', alternate_vowels=True), 'feu')
532
        self.assertEqual(sb_german('feür'), 'feur')
533
        self.assertEqual(sb_german('feuer'), 'feu')
534
535
        # über,uber
536
        self.assertEqual(sb_german('über', alternate_vowels=True), 'uber')
537
        self.assertEqual(sb_german('ueber', alternate_vowels=True), 'uber')
538
        self.assertEqual(sb_german('über'), 'uber')
539
        self.assertEqual(sb_german('ueber'), 'ueb')
540
541 View Code Duplication
    def test_sb_dutch_snowball(self):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
542
        """Test abydos.stemmer.sb_dutch (Snowball testset).
543
544
        These test cases are from
545
        http://snowball.tartarus.org/algorithms/dutch/diffs.txt
546
        """
547
        # base case
548
        self.assertEqual(sb_dutch(''), '')
549
550
        #  Snowball Dutch test set
551
        with codecs.open(TESTDIR+'/corpora/snowball_dutch.csv',
552
                         encoding='utf-8') as snowball_testset:
553
            next(snowball_testset)
554
            for line in snowball_testset:
555
                if line[0] != '#':
556
                    line = line.strip().split(',')
557
                    word, stem = line[0], line[1]
558
                    self.assertEqual(sb_dutch(word), stem.lower())
559
560
        # missed branch test cases
561
        self.assertEqual(sb_dutch('zondulielijk'), 'zondulie')
562
563 View Code Duplication
    def test_sb_norwegian_snowball(self):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
564
        """Test abydos.stemmer.sb_norwegian (Snowball testset).
565
566
        These test cases are from
567
        http://snowball.tartarus.org/algorithms/norwegian/diffs.txt
568
        """
569
        # base case
570
        self.assertEqual(sb_norwegian(''), '')
571
572
        #  Snowball Norwegian test set
573
        with codecs.open(TESTDIR+'/corpora/snowball_norwegian.csv',
574
                         encoding='utf-8') as snowball_testset:
575
            next(snowball_testset)
576
            for line in snowball_testset:
577
                if line[0] != '#':
578
                    line = line.strip().split(',')
579
                    word, stem = line[0], line[1]
580
                    self.assertEqual(sb_norwegian(word), stem.lower())
581
582 View Code Duplication
    def test_sb_swedish_snowball(self):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
583
        """Test abydos.stemmer.sb_swedish (Snowball testset).
584
585
        These test cases are from
586
        http://snowball.tartarus.org/algorithms/swedish/diffs.txt
587
        """
588
        # base case
589
        self.assertEqual(sb_swedish(''), '')
590
591
        #  Snowball Swedish test set
592
        with codecs.open(TESTDIR+'/corpora/snowball_swedish.csv',
593
                         encoding='utf-8') as snowball_testset:
594
            next(snowball_testset)
595
            for line in snowball_testset:
596
                if line[0] != '#':
597
                    line = line.strip().split(',')
598
                    word, stem = line[0], line[1]
599
                    self.assertEqual(sb_swedish(word), stem.lower())
600
601 View Code Duplication
    def test_sb_danish_snowball(self):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
602
        """Test abydos.stemmer.sb_danish (Snowball testset).
603
604
        These test cases are from
605
        http://snowball.tartarus.org/algorithms/danish/diffs.txt
606
        """
607
        # base case
608
        self.assertEqual(sb_danish(''), '')
609
610
        #  Snowball Danish test set
611
        with codecs.open(TESTDIR+'/corpora/snowball_danish.csv',
612
                         encoding='utf-8') as snowball_testset:
613
            next(snowball_testset)
614
            for line in snowball_testset:
615
                if line[0] != '#':
616
                    line = line.strip().split(',')
617
                    word, stem = line[0], line[1]
618
                    self.assertEqual(sb_danish(word), stem.lower())
619
620
621
class CLEFTestCases(unittest.TestCase):
622
    """Test CLEF functions.
623
624
    abydos.stemmer.clef_german, abydos.stemmer.clef_german_plus, &
625
    abydos.stemmer.clef_swedish
626
    """
627
628
    def test_clef_german(self):
629
        """Test abydos.stemmer.clef_german."""
630
        # base case
631
        self.assertEqual(clef_german(''), '')
632
633
        # len <= 2
634
        self.assertEqual(clef_german('ä'), 'a')
635
        self.assertEqual(clef_german('er'), 'er')
636
        self.assertEqual(clef_german('es'), 'es')
637
        self.assertEqual(clef_german('äh'), 'ah')
638
639
        # len > 2
640
        self.assertEqual(clef_german('deinen'), 'dein')
641
        self.assertEqual(clef_german('können'), 'konn')
642
        self.assertEqual(clef_german('Damen'), 'dame')
643
        self.assertEqual(clef_german('kleines'), 'klein')
644
        self.assertEqual(clef_german('Namen'), 'name')
645
        self.assertEqual(clef_german('Äpfel'), 'apfel')
646
        self.assertEqual(clef_german('Jahre'), 'jahr')
647
        self.assertEqual(clef_german('Mannes'), 'mann')
648
        self.assertEqual(clef_german('Häuser'), 'haus')
649
        self.assertEqual(clef_german('Motoren'), 'motor')
650
        self.assertEqual(clef_german('kleine'), 'klein')
651
        self.assertEqual(clef_german('Pfingsten'), 'pfingst')
652
        self.assertEqual(clef_german('lautest'), 'lautest')
653
        self.assertEqual(clef_german('lauteste'), 'lautest')
654
        self.assertEqual(clef_german('lautere'), 'lauter')
655
        self.assertEqual(clef_german('lautste'), 'lautst')
656
        self.assertEqual(clef_german('kleinen'), 'klei')
657
658
    def test_clef_german_plus(self):
659
        """Test abydos.stemmer.clef_german_plus."""
660
        # base case
661
        self.assertEqual(clef_german_plus(''), '')
662
663
        # len <= 2
664
        self.assertEqual(clef_german_plus('ä'), 'a')
665
        self.assertEqual(clef_german_plus('er'), 'er')
666
        self.assertEqual(clef_german_plus('es'), 'es')
667
        self.assertEqual(clef_german_plus('äh'), 'ah')
668
669
        # len > 2
670
        self.assertEqual(clef_german_plus('deinen'), 'dein')
671
        self.assertEqual(clef_german_plus('können'), 'konn')
672
        self.assertEqual(clef_german_plus('Damen'), 'dam')
673
        self.assertEqual(clef_german_plus('kleines'), 'klein')
674
        self.assertEqual(clef_german_plus('Namen'), 'nam')
675
        self.assertEqual(clef_german_plus('Äpfel'), 'apfel')
676
        self.assertEqual(clef_german_plus('Jahre'), 'jahr')
677
        self.assertEqual(clef_german_plus('Mannes'), 'mann')
678
        self.assertEqual(clef_german_plus('Häuser'), 'haus')
679
        self.assertEqual(clef_german_plus('Motoren'), 'motor')
680
        self.assertEqual(clef_german_plus('kleine'), 'klein')
681
        self.assertEqual(clef_german_plus('Pfingsten'), 'pfing')
682
        self.assertEqual(clef_german_plus('lautest'), 'laut')
683
        self.assertEqual(clef_german_plus('lauteste'), 'laut')
684
        self.assertEqual(clef_german_plus('lautere'), 'laut')
685
        self.assertEqual(clef_german_plus('lautste'), 'laut')
686
        self.assertEqual(clef_german_plus('kleinen'), 'klein')
687
        self.assertEqual(clef_german_plus('Pfarrern'), 'pfarr')
688
689
    def test_clef_swedish(self):
690
        """Test abydos.stemmer.clef_swedish."""
691
        # base case
692
        self.assertEqual(clef_swedish(''), '')
693
694
        # unstemmed
695
        self.assertEqual(clef_swedish('konung'), 'konung')
696
697
        # len <= 3
698
        self.assertEqual(clef_swedish('km'), 'km')
699
        self.assertEqual(clef_swedish('ja'), 'ja')
700
        self.assertEqual(clef_swedish('de'), 'de')
701
        self.assertEqual(clef_swedish('in'), 'in')
702
        self.assertEqual(clef_swedish('a'), 'a')
703
        self.assertEqual(clef_swedish('mer'), 'mer')
704
        self.assertEqual(clef_swedish('s'), 's')
705
        self.assertEqual(clef_swedish('e'), 'e')
706
        self.assertEqual(clef_swedish('oss'), 'oss')
707
        self.assertEqual(clef_swedish('hos'), 'hos')
708
709
        # genitive
710
        self.assertEqual(clef_swedish('svenskars'), 'svensk')
711
        self.assertEqual(clef_swedish('stadens'), 'stad')
712
        self.assertEqual(clef_swedish('kommuns'), 'kommu')
713
        self.assertEqual(clef_swedish('aftonbladets'), 'aftonblad')
714
715
        # len > 7
716
        self.assertEqual(clef_swedish('fängelser'), 'fäng')
717
        self.assertEqual(clef_swedish('möjligheten'), 'möjlig')
718
719
        # len > 6
720
        self.assertEqual(clef_swedish('svenskar'), 'svensk')
721
        self.assertEqual(clef_swedish('myndigheterna'), 'myndighet')
722
        self.assertEqual(clef_swedish('avgörande'), 'avgör')
723
        self.assertEqual(clef_swedish('fängelse'), 'fäng')
724
        self.assertEqual(clef_swedish('viktigaste'), 'viktig')
725
        self.assertEqual(clef_swedish('kvinnorna'), 'kvinn')
726
        self.assertEqual(clef_swedish('åklagaren'), 'åklag')
727
728
        # len > 5
729
        self.assertEqual(clef_swedish('tidigare'), 'tidig')
730
        self.assertEqual(clef_swedish('senast'), 'sen')
731
        self.assertEqual(clef_swedish('möjlighet'), 'möjlig')
732
733
        # len > 4
734
        self.assertEqual(clef_swedish('svenskar'), 'svensk')
735
        self.assertEqual(clef_swedish('skriver'), 'skriv')
736
        self.assertEqual(clef_swedish('människor'), 'människ')
737
        self.assertEqual(clef_swedish('staden'), 'stad')
738
        self.assertEqual(clef_swedish('kunnat'), 'kunn')
739
        self.assertEqual(clef_swedish('samarbete'), 'samarbe')
740
        self.assertEqual(clef_swedish('aftonbladet'), 'aftonblad')
741
742
        # len > 3
743
        self.assertEqual(clef_swedish('allt'), 'all')
744
        self.assertEqual(clef_swedish('vilka'), 'vilk')
745
        self.assertEqual(clef_swedish('länge'), 'läng')
746
        self.assertEqual(clef_swedish('kommun'), 'kommu')
747
748
749
class CaumannsTestCases(unittest.TestCase):
750
    """Test Caumanns functions.
751
752
    abydos.stemmer.caumanns
753
    """
754
755
    def test_caumanns(self):
756
        """Test abydos.stemmer.caumanns."""
757
        # base case
758
        self.assertEqual(caumanns(''), '')
759
760
        # tests from Caumanns' description of the algorithm
761
        self.assertEqual(caumanns('singt'), 'sing')
762
        self.assertEqual(caumanns('singen'), 'sing')
763
        self.assertEqual(caumanns('beliebt'), 'belieb')
764
        self.assertEqual(caumanns('beliebtester'), 'belieb')
765
        self.assertEqual(caumanns('stören'), 'stor')
766
        self.assertEqual(caumanns('stöhnen'), 'stoh')
767
        self.assertEqual(caumanns('Kuß'), 'kuss')
768
        self.assertEqual(caumanns('Küsse'), 'kuss')
769
        self.assertEqual(caumanns('Verlierer'), 'verlier')
770
        self.assertEqual(caumanns('Verlies'), 'verlie')
771
        self.assertEqual(caumanns('Maus'), 'mau')
772
        self.assertEqual(caumanns('Mauer'), 'mau')
773
        self.assertEqual(caumanns('Störsender'), 'stor')
774
775
        # additional tests to achieve full coverage
776
        self.assertEqual(caumanns('Müllerinnen'), 'mullerin')
777
        self.assertEqual(caumanns('Matrix'), 'matrix')
778
        self.assertEqual(caumanns('Matrizen'), 'matrix')
779
780
    def test_caumanns_lucene(self):
781
        """Test abydos.stemmer.caumanns (Lucene tests).
782
783
        Based on tests from
784
        https://svn.apache.org/repos/asf/lucene.net/trunk/test/contrib/Analyzers/De/data.txt
785
        This is presumably Apache-licensed.
786
        """
787
        # German special characters are replaced:
788
        self.assertEqual(caumanns('häufig'), 'haufig')
789
        self.assertEqual(caumanns('üor'), 'uor')
790
        self.assertEqual(caumanns('björk'), 'bjork')
791
792
        # here the stemmer works okay, it maps related words to the same stem:
793
        self.assertEqual(caumanns('abschließen'), 'abschliess')
794
        self.assertEqual(caumanns('abschließender'), 'abschliess')
795
        self.assertEqual(caumanns('abschließendes'), 'abschliess')
796
        self.assertEqual(caumanns('abschließenden'), 'abschliess')
797
798
        self.assertEqual(caumanns('Tisch'), 'tisch')
799
        self.assertEqual(caumanns('Tische'), 'tisch')
800
        self.assertEqual(caumanns('Tischen'), 'tisch')
801
        self.assertEqual(caumanns('geheimtür'), 'geheimtur')
802
803
        self.assertEqual(caumanns('Haus'), 'hau')
804
        self.assertEqual(caumanns('Hauses'), 'hau')
805
        self.assertEqual(caumanns('Häuser'), 'hau')
806
        self.assertEqual(caumanns('Häusern'), 'hau')
807
        # here's a case where overstemming occurs, i.e. a word is
808
        # mapped to the same stem as unrelated words:
809
        self.assertEqual(caumanns('hauen'), 'hau')
810
811
        # here's a case where understemming occurs, i.e. two related words
812
        # are not mapped to the same stem. This is the case with basically
813
        # all irregular forms:
814
        self.assertEqual(caumanns('Drama'), 'drama')
815
        self.assertEqual(caumanns('Dramen'), 'dram')
816
817
        # replace "ß" with 'ss':
818
        self.assertEqual(caumanns('Ausmaß'), 'ausmass')
819
820
        # fake words to test if suffixes are cut off:
821
        self.assertEqual(caumanns('xxxxxe'), 'xxxxx')
822
        self.assertEqual(caumanns('xxxxxs'), 'xxxxx')
823
        self.assertEqual(caumanns('xxxxxn'), 'xxxxx')
824
        self.assertEqual(caumanns('xxxxxt'), 'xxxxx')
825
        self.assertEqual(caumanns('xxxxxem'), 'xxxxx')
826
        self.assertEqual(caumanns('xxxxxer'), 'xxxxx')
827
        self.assertEqual(caumanns('xxxxxnd'), 'xxxxx')
828
        # the suffixes are also removed when combined:
829
        self.assertEqual(caumanns('xxxxxetende'), 'xxxxx')
830
831
        # words that are shorter than four charcters are not changed:
832
        self.assertEqual(caumanns('xxe'), 'xxe')
833
        # -em and -er are not removed from words shorter than five characters:
834
        self.assertEqual(caumanns('xxem'), 'xxem')
835
        self.assertEqual(caumanns('xxer'), 'xxer')
836
        # -nd is not removed from words shorter than six characters:
837
        self.assertEqual(caumanns('xxxnd'), 'xxxnd')
838
839
840
class UEALiteTestCases(unittest.TestCase):
841
    """Test UEA-lite functions.
842
843
    abydos.stemmer.uealite
844
    """
845
846
    def test_uealite(self):
847
        """Test abydos.stemmer.uealite."""
848
        # base case
849
        self.assertEqual(uealite(''), '')
850
851
        # test cases copied from Ruby port
852
        # https://github.com/ealdent/uea-stemmer/blob/master/test/uea_stemmer_test.rb
853
        # These are corrected to match the Java version's output.
854
        # stem base words to just the base word
855
        self.assertEqual(uealite('man'), 'man')
856
        self.assertEqual(uealite('happiness'), 'happiness')
857
        # stem theses as thesis but not bases as basis
858
        self.assertEqual(uealite('theses'), 'thesis')
859
        self.assertNotEqual(uealite('bases'), 'basis')
860
        # stem preterite words ending in -ed without the -ed
861
        self.assertEqual(uealite('ordained'), 'ordain')
862
        self.assertEqual(uealite('killed'), 'kill')
863
        self.assertEqual(uealite('liked'), 'lik')
864
        self.assertEqual(uealite('helped'), 'help')
865
        self.assertEqual(uealite('scarred'), 'scarre')
866
        self.assertEqual(uealite('invited'), 'invit')
867
        self.assertEqual(uealite('exited'), 'exit')
868
        self.assertEqual(uealite('debited'), 'debit')
869
        self.assertEqual(uealite('smited'), 'smit')
870
        # stem progressive verbs and gerunds without the -ing
871
        self.assertEqual(uealite('running'), 'run')
872
        self.assertEqual(uealite('settings'), 'set')
873
        self.assertEqual(uealite('timing'), 'time')
874
        self.assertEqual(uealite('dying'), 'dy')
875
        self.assertEqual(uealite('harping'), 'harp')
876
        self.assertEqual(uealite('charring'), 'char')
877
        # not stem false progressive verbs such as 'sing'
878
        self.assertEqual(uealite('ring'), 'ring')
879
        self.assertEqual(uealite('sing'), 'se')
880
        self.assertEqual(uealite('bring'), 'br')
881
        self.assertEqual(uealite('fling'), 'fle')
882
        # stem various plural nouns and 3rd-pres verbs without the -s/-es
883
        self.assertEqual(uealite('changes'), 'change')
884
        self.assertEqual(uealite('deaths'), 'death')
885
        self.assertEqual(uealite('shadows'), 'shadow')
886
        self.assertEqual(uealite('flies'), 'fly')
887
        self.assertEqual(uealite('things'), 'thing')
888
        self.assertEqual(uealite('nothings'), 'nothing')
889
        self.assertEqual(uealite('witches'), 'witch')
890
        self.assertEqual(uealite('makes'), 'mak')
891
        self.assertEqual(uealite('smokes'), 'smok')
892
        self.assertEqual(uealite('does'), 'do')
893
        # stem various words with -des suffix
894
        self.assertEqual(uealite('abodes'), 'abod')
895
        self.assertEqual(uealite('escapades'), 'escapad')
896
        self.assertEqual(uealite('crusades'), 'crusad')
897
        self.assertEqual(uealite('grades'), 'grad')
898
        # stem various words with -res suffix
899
        self.assertEqual(uealite('wires'), 'wir')
900
        self.assertEqual(uealite('acres'), 'acr')
901
        self.assertEqual(uealite('fires'), 'fir')
902
        self.assertEqual(uealite('cares'), 'car')
903
        # stem acronyms when pluralized otherwise they should be left alone
904
        self.assertEqual(uealite('USA'), 'USA')
905
        self.assertEqual(uealite('FLOSS'), 'FLOSS')
906
        self.assertEqual(uealite('MREs'), 'MRE')
907
        self.assertEqual(uealite('USAED'), 'USAED')
908
909
        # test cases copied from Ruby port
910
        # https://github.com/ealdent/uea-stemmer/blob/master/test/uea_stemmer_test.rb
911
        # stem base words to just the base word
912
        self.assertEqual(uealite('man', var='Adams'), 'man')
913
        self.assertEqual(uealite('happiness', var='Adams'), 'happiness')
914
        # stem theses as thesis but not bases as basis
915
        self.assertEqual(uealite('theses', var='Adams'), 'thesis')
916
        self.assertNotEqual(uealite('bases', var='Adams'), 'basis')
917
        # stem preterite words ending in -ed without the -ed
918
        self.assertEqual(uealite('ordained', var='Adams'), 'ordain')
919
        self.assertEqual(uealite('killed', var='Adams'), 'kill')
920
        self.assertEqual(uealite('liked', var='Adams'), 'like')
921
        self.assertEqual(uealite('helped', var='Adams'), 'help')
922
        # self.assertEqual(uealite('scarred', var='Adams'), 'scar')
923
        self.assertEqual(uealite('invited', var='Adams'), 'invite')
924
        self.assertEqual(uealite('exited', var='Adams'), 'exit')
925
        self.assertEqual(uealite('debited', var='Adams'), 'debit')
926
        self.assertEqual(uealite('smited', var='Adams'), 'smite')
927
        # stem progressive verbs and gerunds without the -ing
928
        self.assertEqual(uealite('running', var='Adams'), 'run')
929
        self.assertEqual(uealite('settings', var='Adams'), 'set')
930
        self.assertEqual(uealite('timing', var='Adams'), 'time')
931
        self.assertEqual(uealite('dying', var='Adams'), 'die')
932
        self.assertEqual(uealite('harping', var='Adams'), 'harp')
933
        self.assertEqual(uealite('charring', var='Adams'), 'char')
934
        # not stem false progressive verbs such as 'sing'
935
        self.assertEqual(uealite('ring', var='Adams'), 'ring')
936
        self.assertEqual(uealite('sing', var='Adams'), 'sing')
937
        self.assertEqual(uealite('ring', var='Adams'), 'ring')
938
        self.assertEqual(uealite('bring', var='Adams'), 'bring')
939
        self.assertEqual(uealite('fling', var='Adams'), 'fling')
940
        # stem various plural nouns and 3rd-pres verbs without the -s/-es
941
        self.assertEqual(uealite('changes', var='Adams'), 'change')
942
        self.assertEqual(uealite('deaths', var='Adams'), 'death')
943
        self.assertEqual(uealite('shadows', var='Adams'), 'shadow')
944
        self.assertEqual(uealite('flies', var='Adams'), 'fly')
945
        self.assertEqual(uealite('things', var='Adams'), 'thing')
946
        self.assertEqual(uealite('nothings', var='Adams'), 'nothing')
947
        self.assertEqual(uealite('witches', var='Adams'), 'witch')
948
        self.assertEqual(uealite('makes', var='Adams'), 'make')
949
        self.assertEqual(uealite('smokes', var='Adams'), 'smoke')
950
        self.assertEqual(uealite('does', var='Adams'), 'do')
951
        # stem various words with -des suffix
952
        self.assertEqual(uealite('abodes', var='Adams'), 'abode')
953
        self.assertEqual(uealite('escapades', var='Adams'), 'escapade')
954
        self.assertEqual(uealite('crusades', var='Adams'), 'crusade')
955
        self.assertEqual(uealite('grades', var='Adams'), 'grade')
956
        # stem various words with -res suffix
957
        self.assertEqual(uealite('wires', var='Adams'), 'wire')
958
        self.assertEqual(uealite('acres', var='Adams'), 'acre')
959
        self.assertEqual(uealite('fires', var='Adams'), 'fire')
960
        self.assertEqual(uealite('cares', var='Adams'), 'care')
961
        # stem acronyms when pluralized otherwise they should be left alone
962
        self.assertEqual(uealite('USA', var='Adams'), 'USA')
963
        self.assertEqual(uealite('FLOSS', var='Adams'), 'FLOSS')
964
        self.assertEqual(uealite('MREs', var='Adams'), 'MRE')
965
        self.assertEqual(uealite('USAED', var='Adams'), 'USAED')
966
967
    def test_uealite_wsj_set(self):
968
        """Test abydos.stemmer.uealite using the WSJ test set."""
969
        with open(TESTDIR + '/corpora/uea-lite_wsj.csv') as wsj_testset:
970
            for wsj_line in wsj_testset:
971
                (word, uea, rule) = wsj_line.strip().split(',')
972
                self.assertEqual(uealite(word, return_rule_no=True),
973
                                 (uea, float(rule)))
974
975
976
class PaiceHuskTestCases(unittest.TestCase):
977
    """Test Paice-Husk functions.
978
979
    abydos.stemmer.paice_husk
980
    """
981
982
    def test_paice_husk(self):
983
        """Test abydos.stemmer.paice_husk."""
984
        # base case
985
        self.assertEqual(paice_husk(''), '')
986
987
        # cases copied from
988
        # https://doi.org/10.1145/101306.101310
989
        self.assertEqual(paice_husk('maximum'), 'maxim')
990
        self.assertEqual(paice_husk('presumably'), 'presum')
991
        self.assertEqual(paice_husk('multiply'), 'multiply')
992
        self.assertEqual(paice_husk('provision'), 'provid')
993
        self.assertEqual(paice_husk('owed'), 'ow')
994
        self.assertEqual(paice_husk('owing'), 'ow')
995
        self.assertEqual(paice_husk('ear'), 'ear')
996
        self.assertEqual(paice_husk('saying'), 'say')
997
        self.assertEqual(paice_husk('crying'), 'cry')
998
        self.assertEqual(paice_husk('string'), 'string')
999
        self.assertEqual(paice_husk('meant'), 'meant')
1000
        self.assertEqual(paice_husk('cement'), 'cem')
1001
1002
    def test_paice_husk_wsj_set(self):
1003
        """Test abydos.stemmer.uealite using the Hopper262 test set.
1004
1005
        Source:
1006
        https://raw.githubusercontent.com/Hopper262/paice-husk-stemmer/master/wordlist.txt
1007
1008
        The only correction made from stemmed values in the Hopper262 set/
1009
        implementations were:
1010
         - ymca : ymc -> ymca
1011
         - yttrium : yttr -> yttri
1012
         - ywca : ywc -> ywca
1013
        The Pascal reference implementation does not consider 'y' in initial
1014
        position to be a vowel.
1015
        """
1016
        with open(TESTDIR + '/corpora/paicehusk.csv') as hopper_testset:
1017
            for hopper_line in hopper_testset:
1018
                (word, stem) = hopper_line.strip().split(',')
1019
                self.assertEqual(paice_husk(word), stem)
1020
1021
1022
if __name__ == '__main__':
1023
    unittest.main()
1024