Total Complexity | 41 |
Total Lines | 556 |
Duplicated Lines | 7.55 % |
Changes | 0 |
Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
Complex classes like tests.stemmer.test_stemmer_snowball often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
1 | # -*- coding: utf-8 -*- |
||
2 | |||
3 | # Copyright 2014-2018 by Christopher C. Little. |
||
4 | # This file is part of Abydos. |
||
5 | # |
||
6 | # Abydos is free software: you can redistribute it and/or modify |
||
7 | # it under the terms of the GNU General Public License as published by |
||
8 | # the Free Software Foundation, either version 3 of the License, or |
||
9 | # (at your option) any later version. |
||
10 | # |
||
11 | # Abydos is distributed in the hope that it will be useful, |
||
12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
14 | # GNU General Public License for more details. |
||
15 | # |
||
16 | # You should have received a copy of the GNU General Public License |
||
17 | # along with Abydos. If not, see <http://www.gnu.org/licenses/>. |
||
18 | |||
19 | """abydos.tests.test_stemmer_snowball. |
||
20 | |||
21 | This module contains unit tests for abydos.stemmer.snowball |
||
22 | """ |
||
23 | |||
24 | from __future__ import unicode_literals |
||
25 | |||
26 | import codecs |
||
27 | import unittest |
||
28 | |||
29 | from abydos.stemmer.snowball import _ends_in_cvc, _ends_in_doubled_cons, \ |
||
30 | _m_degree, _sb_ends_in_short_syllable, _sb_has_vowel, _sb_r1, _sb_r2, \ |
||
31 | _sb_short_word, porter, porter2, sb_danish, sb_dutch, sb_german, \ |
||
32 | sb_norwegian, sb_swedish |
||
33 | |||
34 | from .. import _corpus_file |
||
35 | |||
36 | |||
class PorterTestCases(unittest.TestCase):
    """Test Porter functions.

    Covers the following names imported from abydos.stemmer.snowball:
    _m_degree, _sb_has_vowel, _ends_in_doubled_cons, _ends_in_cvc, & porter
    """

    def test_m_degree(self):
        """Test abydos.stemmer.snowball._m_degree."""
        _vowels = set('aeiouy')
        expected = (
            # base case
            ('', 0),
            # m==0
            ('tr', 0), ('ee', 0), ('tree', 0), ('y', 0), ('by', 0),
            # m==1
            ('trouble', 1), ('oats', 1), ('trees', 1), ('ivy', 1),
            # m==2
            ('troubles', 2), ('private', 2), ('oaten', 2), ('orrery', 2),
        )
        # The term is passed as msg so a failure names the offending input.
        for term, degree in expected:
            self.assertEqual(_m_degree(term, _vowels), degree, msg=term)

    def test_has_vowel(self):
        """Test abydos.stemmer.snowball._sb_has_vowel."""
        _vowels = set('aeiouy')
        without_vowel = (
            # base case
            '',
            # False cases (upper-case 'Y' is not in the vowel set)
            'b', 'c', 'bc', 'bcdfghjklmnpqrstvwxYz', 'Y', 'pfY',
        )
        with_vowel = (
            'a', 'e', 'ae', 'aeiouy', 'y',
            'ade', 'cad', 'add', 'phi', 'pfy',
        )
        for term in without_vowel:
            self.assertFalse(_sb_has_vowel(term, _vowels), msg=term)
        for term in with_vowel:
            self.assertTrue(_sb_has_vowel(term, _vowels), msg=term)

    def test_ends_in_doubled_cons(self):
        """Test abydos.stemmer.snowball._ends_in_doubled_cons."""
        _vowels = set('aeiouy')
        not_doubled = (
            # base case
            '',
            # False cases
            'b', 'c', 'bc', 'bcdfghjklmnpqrstvwxYz', 'Y', 'a', 'e', 'ae',
            'aeiouy', 'y', 'ade', 'cad', 'phi', 'pfy', 'faddy', 'aiii',
            'ayyy',
        )
        doubled = (
            'add', 'fadd', 'fadddd', 'raYY', 'doll', 'parr', 'parrr', 'bacc',
        )
        for term in not_doubled:
            self.assertFalse(_ends_in_doubled_cons(term, _vowels), msg=term)
        for term in doubled:
            self.assertTrue(_ends_in_doubled_cons(term, _vowels), msg=term)

    def test_ends_in_cvc(self):
        """Test abydos.stemmer.snowball._ends_in_cvc."""
        _vowels = set('aeiouy')
        not_cvc = (
            # base case
            '',
            # False cases
            'b', 'c', 'bc', 'bcdfghjklmnpqrstvwxYz', 'YYY', 'ddd', 'faaf',
            'rare', 'rhy',
            # Special case for W, X, & Y
            'craw', 'max', 'cray',
        )
        cvc = (
            'dad', 'phad', 'faded', 'maYor', 'enlil', 'parer', 'padres',
            'bacyc',
        )
        for term in not_cvc:
            self.assertFalse(_ends_in_cvc(term, _vowels), msg=term)
        for term in cvc:
            self.assertTrue(_ends_in_cvc(term, _vowels), msg=term)

    def test_porter(self):
        """Test abydos.stemmer.snowball.porter."""
        expected = (
            # base case
            ('', ''),
            # simple cases
            ('c', 'c'), ('da', 'da'), ('ad', 'ad'), ('sing', 'sing'),
            ('singing', 'sing'),
            # missed branch test cases
            ('capitalism', 'capit'), ('fatalism', 'fatal'),
            ('stional', 'stional'), ('palism', 'palism'),
            ('sization', 'sizat'), ('licated', 'licat'), ('lical', 'lical'),
        )
        for word, stem in expected:
            self.assertEqual(porter(word), stem, msg=word)

    def test_porter_early_english(self):
        """Test abydos.stemmer.snowball.porter (early English)."""
        expected = (
            # base case
            ('', ''),
            # simple cases (no different from regular stemmer)
            ('c', 'c'), ('da', 'da'), ('ad', 'ad'), ('sing', 'sing'),
            ('singing', 'sing'),
            # make
            ('make', 'make'), ('makes', 'make'), ('maketh', 'make'),
            ('makest', 'make'),
            # say
            ('say', 'sai'), ('says', 'sai'), ('sayeth', 'sai'),
            ('sayest', 'sai'),
            # missed branch test cases
            ('best', 'best'), ('meth', 'meth'),
        )
        for word, stem in expected:
            self.assertEqual(porter(word, early_english=True), stem, msg=word)

    def test_porter_snowball(self):
        """Test abydos.stemmer.snowball.porter (Snowball testset).

        These test cases are from
        http://snowball.tartarus.org/algorithms/porter/diffs.txt
        """
        # Snowball Porter test set
        # Read via codecs.open with an explicit encoding, like the other
        # corpus readers in this module, so decoding does not depend on the
        # platform default.
        with codecs.open(_corpus_file('snowball_porter.csv'),
                         encoding='utf-8') as snowball_ts:
            # Discard the first line (presumably a header row).
            next(snowball_ts)
            for line in snowball_ts:
                if line.startswith('#'):
                    continue
                line = line.strip()
                if not line:
                    # A blank line carries no word/stem pair.
                    continue
                fields = line.split(',')
                word, stem = fields[0], fields[1]
                self.assertEqual(porter(word), stem.lower(), msg=word)
229 | |||
230 | |||
class Porter2TestCases(unittest.TestCase):
    """Test Porter2 functions.

    Covers the following names imported from abydos.stemmer.snowball:
    _sb_r1, _sb_r2, _sb_ends_in_short_syllable, _sb_short_word, & porter2
    """

    def test_sb_r1(self):
        """Test abydos.stemmer.snowball._sb_r1."""
        _vowels = set('aeiouy')
        expected = (
            # base case
            ('', 0),
            # examples from http://snowball.tartarus.org/texts/r1r2.html
            ('beautiful', 5), ('beauty', 5), ('beau', 4),
            ('animadversion', 2), ('sprinkled', 5), ('eucharist', 3),
        )
        # The term is passed as msg so a failure names the offending input.
        for term, start in expected:
            self.assertEqual(_sb_r1(term, _vowels), start, msg=term)

    def test_sb_r2(self):
        """Test abydos.stemmer.snowball._sb_r2."""
        _vowels = set('aeiouy')
        expected = (
            # base case
            ('', 0),
            # examples from http://snowball.tartarus.org/texts/r1r2.html
            ('beautiful', 7), ('beauty', 6), ('beau', 4),
            ('animadversion', 4), ('sprinkled', 9), ('eucharist', 6),
        )
        for term, start in expected:
            self.assertEqual(_sb_r2(term, _vowels), start, msg=term)

    def test_sb_ends_in_short_syllable(self):
        """Test abydos.stemmer.snowball._sb_ends_in_short_syllable."""
        _vowels = set('aeiouy')
        _codanonvowels = set('bcdfghjklmnpqrstvz\'')
        # examples from
        # http://snowball.tartarus.org/algorithms/english/stemmer.html
        short = ('rap', 'trap', 'entrap', 'ow', 'on', 'at')
        not_short = (
            # base case
            '',
            # examples from the same page
            'uproot', 'bestow', 'disturb',
            # missed branch test cases
            'd', 'a',
        )
        for term in short:
            self.assertTrue(
                _sb_ends_in_short_syllable(term, _vowels, _codanonvowels),
                msg=term)
        for term in not_short:
            self.assertFalse(
                _sb_ends_in_short_syllable(term, _vowels, _codanonvowels),
                msg=term)

    def test_sb_short_word(self):
        """Test abydos.stemmer.snowball._sb_short_word."""
        _vowels = set('aeiouy')
        _codanonvowels = set('bcdfghjklmnpqrstvz\'')
        # examples from
        # http://snowball.tartarus.org/algorithms/english/stemmer.html
        short = ('bed', 'shed', 'shred')
        not_short = (
            # base case
            '',
            # examples from the same page
            'bead', 'embed', 'beds',
        )
        for term in short:
            self.assertTrue(_sb_short_word(term, _vowels, _codanonvowels),
                            msg=term)
        for term in not_short:
            self.assertFalse(_sb_short_word(term, _vowels, _codanonvowels),
                             msg=term)

    def test_porter2(self):
        """Test abydos.stemmer.snowball.porter2."""
        expected = (
            # base case
            ('', ''),
            # simple cases
            ('c', 'c'), ('da', 'da'), ('ad', 'ad'), ('sing', 'sing'),
            ('singing', 'sing'),
            # missed branch test cases
            ('capitalism', 'capit'), ('fatalism', 'fatal'),
            ('dog\'s', 'dog'), ('A\'s\'', 'a'),
            ('agreedly', 'agre'), ('feedly', 'feed'),
            ('stional', 'stional'), ('palism', 'palism'),
            ('sization', 'sizat'), ('licated', 'licat'), ('lical', 'lical'),
            ('clessly', 'clessli'), ('tably', 'tabli'), ('sizer', 'sizer'),
            ('livity', 'liviti'),
        )
        for word, stem in expected:
            self.assertEqual(porter2(word), stem, msg=word)

    def test_porter2_early_english(self):
        """Test abydos.stemmer.snowball.porter2 (early English)."""
        expected = (
            # base case
            ('', ''),
            # simple cases (no different from regular stemmer)
            ('c', 'c'), ('da', 'da'), ('ad', 'ad'), ('sing', 'sing'),
            ('singing', 'sing'),
            # make
            ('make', 'make'), ('makes', 'make'), ('maketh', 'make'),
            ('makest', 'make'),
            # say
            ('say', 'say'), ('says', 'say'), ('sayeth', 'say'),
            ('sayest', 'say'),
            # missed branch test cases
            ('best', 'best'), ('meth', 'meth'),
        )
        for word, stem in expected:
            self.assertEqual(porter2(word, early_english=True), stem,
                             msg=word)

    def test_porter2_snowball(self):
        """Test abydos.stemmer.snowball.porter2 (Snowball testset).

        These test cases are from
        http://snowball.tartarus.org/algorithms/english/diffs.txt
        """
        # Snowball Porter2 test set
        # Read via codecs.open with an explicit encoding, like the other
        # corpus readers in this module, so decoding does not depend on the
        # platform default.
        with codecs.open(_corpus_file('snowball_porter2.csv'),
                         encoding='utf-8') as snowball_ts:
            # Discard the first line (presumably a header row).
            next(snowball_ts)
            for line in snowball_ts:
                if line.startswith('#'):
                    continue
                line = line.strip()
                if not line:
                    # A blank line carries no word/stem pair.
                    continue
                fields = line.split(',')
                word, stem = fields[0], fields[1]
                self.assertEqual(porter2(word), stem.lower(), msg=word)
391 | |||
392 | |||
class SnowballTestCases(unittest.TestCase):
    """Test Snowball functions.

    Covers the following names imported from abydos.stemmer.snowball:
    sb_german, sb_dutch, sb_norwegian, sb_swedish, & sb_danish
    """

    def _check_corpus(self, stemmer, filename):
        """Assert that stemmer maps every corpus word to its listed stem.

        :param stemmer: callable taking a word and returning its stem
        :param filename: name of the corpus CSV, resolved via _corpus_file

        The first line of the file is discarded, lines beginning with '#'
        and blank lines are skipped, and the first two comma-separated
        fields of every other line are taken as the word and its stem.
        The stem is lower-cased before comparison.
        """
        with codecs.open(_corpus_file(filename),
                         encoding='utf-8') as snowball_ts:
            # Discard the first line (presumably a header row).
            next(snowball_ts)
            for line in snowball_ts:
                if line.startswith('#'):
                    continue
                line = line.strip()
                if not line:
                    # A blank line carries no word/stem pair.
                    continue
                fields = line.split(',')
                word, stem = fields[0], fields[1]
                # The word is passed as msg so a failure names the input.
                self.assertEqual(stemmer(word), stem.lower(), msg=word)

    def test_sb_german_snowball(self):
        """Test abydos.stemmer.snowball.sb_german (Snowball testset).

        These test cases are from
        http://snowball.tartarus.org/algorithms/german/diffs.txt
        """
        # base case
        self.assertEqual(sb_german(''), '')

        # Snowball German test set
        self._check_corpus(sb_german, 'snowball_german.csv')

        # missed branch test cases
        self.assertEqual(sb_german('ikeit'), 'ikeit')

    def test_sb_german_snowball_alt(self):
        """Test abydos.stemmer.snowball.sb_german (alternate vowels)."""
        # base case
        self.assertEqual(sb_german('', alternate_vowels=True), '')

        # Each group checks the umlaut and the digraph spelling, first with
        # alternate_vowels=True and then with the default setting.

        # dämmerung,dammer
        self.assertEqual(sb_german('dämmerung', alternate_vowels=True),
                         'dammer')
        self.assertEqual(sb_german('daemmerung', alternate_vowels=True),
                         'dammer')
        self.assertEqual(sb_german('dämmerung'), 'dammer')
        self.assertEqual(sb_german('daemmerung'), 'daemmer')

        # brötchen,brotch
        self.assertEqual(sb_german('brötchen', alternate_vowels=True),
                         'brotch')
        self.assertEqual(sb_german('broetchen', alternate_vowels=True),
                         'brotch')
        self.assertEqual(sb_german('brötchen'), 'brotch')
        self.assertEqual(sb_german('broetchen'), 'broetch')

        # büro,buro
        self.assertEqual(sb_german('büro', alternate_vowels=True), 'buro')
        self.assertEqual(sb_german('buero', alternate_vowels=True), 'buro')
        self.assertEqual(sb_german('büro'), 'buro')
        self.assertEqual(sb_german('buero'), 'buero')

        # häufen,hauf
        self.assertEqual(sb_german('häufen', alternate_vowels=True), 'hauf')
        self.assertEqual(sb_german('haeufen', alternate_vowels=True), 'hauf')
        self.assertEqual(sb_german('häufen'), 'hauf')
        self.assertEqual(sb_german('haeufen'), 'haeuf')

        # quelle,quell
        self.assertEqual(sb_german('qülle', alternate_vowels=True), 'qull')
        self.assertEqual(sb_german('quelle', alternate_vowels=True), 'quell')
        self.assertEqual(sb_german('qülle'), 'qull')
        self.assertEqual(sb_german('quelle'), 'quell')

        # feuer,feuer
        self.assertEqual(sb_german('feür', alternate_vowels=True), 'feur')
        self.assertEqual(sb_german('feuer', alternate_vowels=True), 'feu')
        self.assertEqual(sb_german('feür'), 'feur')
        self.assertEqual(sb_german('feuer'), 'feu')

        # über,uber
        self.assertEqual(sb_german('über', alternate_vowels=True), 'uber')
        self.assertEqual(sb_german('ueber', alternate_vowels=True), 'uber')
        self.assertEqual(sb_german('über'), 'uber')
        self.assertEqual(sb_german('ueber'), 'ueb')

    def test_sb_dutch_snowball(self):
        """Test abydos.stemmer.snowball.sb_dutch (Snowball testset).

        These test cases are from
        http://snowball.tartarus.org/algorithms/dutch/diffs.txt
        """
        # base case
        self.assertEqual(sb_dutch(''), '')

        # Snowball Dutch test set
        self._check_corpus(sb_dutch, 'snowball_dutch.csv')

        # missed branch test cases
        self.assertEqual(sb_dutch('zondulielijk'), 'zondulie')

    def test_sb_norwegian_snowball(self):
        """Test abydos.stemmer.snowball.sb_norwegian (Snowball testset).

        These test cases are from
        http://snowball.tartarus.org/algorithms/norwegian/diffs.txt
        """
        # base case
        self.assertEqual(sb_norwegian(''), '')

        # Snowball Norwegian test set
        self._check_corpus(sb_norwegian, 'snowball_norwegian.csv')

    def test_sb_swedish_snowball(self):
        """Test abydos.stemmer.snowball.sb_swedish (Snowball testset).

        These test cases are from
        http://snowball.tartarus.org/algorithms/swedish/diffs.txt
        """
        # base case
        self.assertEqual(sb_swedish(''), '')

        # Snowball Swedish test set
        self._check_corpus(sb_swedish, 'snowball_swedish.csv')

    def test_sb_danish_snowball(self):
        """Test abydos.stemmer.snowball.sb_danish (Snowball testset).

        These test cases are from
        http://snowball.tartarus.org/algorithms/danish/diffs.txt
        """
        # base case
        self.assertEqual(sb_danish(''), '')

        # Snowball Danish test set
        self._check_corpus(sb_danish, 'snowball_danish.csv')
552 | |||
553 | |||
# Run this module's test cases when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
556 |