| Total Complexity | 41 |
| Total Lines | 556 |
| Duplicated Lines | 7.55 % |
| Changes | 0 | ||
Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
Complex classes like tests.stemmer.test_stemmer_snowball often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
| 1 | # -*- coding: utf-8 -*- |
||
| 2 | |||
| 3 | # Copyright 2014-2018 by Christopher C. Little. |
||
| 4 | # This file is part of Abydos. |
||
| 5 | # |
||
| 6 | # Abydos is free software: you can redistribute it and/or modify |
||
| 7 | # it under the terms of the GNU General Public License as published by |
||
| 8 | # the Free Software Foundation, either version 3 of the License, or |
||
| 9 | # (at your option) any later version. |
||
| 10 | # |
||
| 11 | # Abydos is distributed in the hope that it will be useful, |
||
| 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
| 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
| 14 | # GNU General Public License for more details. |
||
| 15 | # |
||
| 16 | # You should have received a copy of the GNU General Public License |
||
| 17 | # along with Abydos. If not, see <http://www.gnu.org/licenses/>. |
||
| 18 | |||
| 19 | """abydos.tests.test_stemmer_snowball. |
||
| 20 | |||
| 21 | This module contains unit tests for abydos.stemmer.snowball |
||
| 22 | """ |
||
| 23 | |||
| 24 | from __future__ import unicode_literals |
||
| 25 | |||
| 26 | import codecs |
||
| 27 | import unittest |
||
| 28 | |||
| 29 | from abydos.stemmer.snowball import _ends_in_cvc, _ends_in_doubled_cons, \ |
||
| 30 | _m_degree, _sb_ends_in_short_syllable, _sb_has_vowel, _sb_r1, _sb_r2, \ |
||
| 31 | _sb_short_word, porter, porter2, sb_danish, sb_dutch, sb_german, \ |
||
| 32 | sb_norwegian, sb_swedish |
||
| 33 | |||
| 34 | from .. import _corpus_file |
||
| 35 | |||
| 36 | |||
| 37 | class PorterTestCases(unittest.TestCase): |
||
| 38 | """Test Porter functions. |
||
| 39 | |||
| 40 | abydos.stemmer._m_degree, abydos.stemmer.porter, |
||
| 41 | abydos.stemmer._sb_has_vowel, abydos.stemmer._ends_in_doubled_cons, |
||
| 42 | & abydos.stemmer._ends_in_cvc |
||
| 43 | """ |
||
| 44 | |||
| 45 | def test_m_degree(self): |
||
| 46 | """Test abydos.stemmer._m_degree.""" |
||
| 47 | _vowels = set('aeiouy') |
||
| 48 | # base case |
||
| 49 | self.assertEqual(_m_degree('', _vowels), 0) |
||
| 50 | |||
| 51 | # m==0 |
||
| 52 | self.assertEqual(_m_degree('tr', _vowels), 0) |
||
| 53 | self.assertEqual(_m_degree('ee', _vowels), 0) |
||
| 54 | self.assertEqual(_m_degree('tree', _vowels), 0) |
||
| 55 | self.assertEqual(_m_degree('y', _vowels), 0) |
||
| 56 | self.assertEqual(_m_degree('by', _vowels), 0) |
||
| 57 | |||
| 58 | # m==1 |
||
| 59 | self.assertEqual(_m_degree('trouble', _vowels), 1) |
||
| 60 | self.assertEqual(_m_degree('oats', _vowels), 1) |
||
| 61 | self.assertEqual(_m_degree('trees', _vowels), 1) |
||
| 62 | self.assertEqual(_m_degree('ivy', _vowels), 1) |
||
| 63 | |||
| 64 | # m==2 |
||
| 65 | self.assertEqual(_m_degree('troubles', _vowels), 2) |
||
| 66 | self.assertEqual(_m_degree('private', _vowels), 2) |
||
| 67 | self.assertEqual(_m_degree('oaten', _vowels), 2) |
||
| 68 | self.assertEqual(_m_degree('orrery', _vowels), 2) |
||
| 69 | |||
| 70 | def test_has_vowel(self): |
||
| 71 | """Test abydos.stemmer._has_vowel.""" |
||
| 72 | _vowels = set('aeiouy') |
||
| 73 | # base case |
||
| 74 | self.assertFalse(_sb_has_vowel('', _vowels)) |
||
| 75 | |||
| 76 | # False cases |
||
| 77 | self.assertFalse(_sb_has_vowel('b', _vowels)) |
||
| 78 | self.assertFalse(_sb_has_vowel('c', _vowels)) |
||
| 79 | self.assertFalse(_sb_has_vowel('bc', _vowels)) |
||
| 80 | self.assertFalse(_sb_has_vowel('bcdfghjklmnpqrstvwxYz', _vowels)) |
||
| 81 | self.assertFalse(_sb_has_vowel('Y', _vowels)) |
||
| 82 | |||
| 83 | # True cases |
||
| 84 | self.assertTrue(_sb_has_vowel('a', _vowels)) |
||
| 85 | self.assertTrue(_sb_has_vowel('e', _vowels)) |
||
| 86 | self.assertTrue(_sb_has_vowel('ae', _vowels)) |
||
| 87 | self.assertTrue(_sb_has_vowel('aeiouy', _vowels)) |
||
| 88 | self.assertTrue(_sb_has_vowel('y', _vowels)) |
||
| 89 | |||
| 90 | self.assertTrue(_sb_has_vowel('ade', _vowels)) |
||
| 91 | self.assertTrue(_sb_has_vowel('cad', _vowels)) |
||
| 92 | self.assertTrue(_sb_has_vowel('add', _vowels)) |
||
| 93 | self.assertTrue(_sb_has_vowel('phi', _vowels)) |
||
| 94 | self.assertTrue(_sb_has_vowel('pfy', _vowels)) |
||
| 95 | |||
| 96 | self.assertFalse(_sb_has_vowel('pfY', _vowels)) |
||
| 97 | |||
| 98 | def test_ends_in_doubled_cons(self): |
||
| 99 | """Test abydos.stemmer._ends_in_doubled_cons.""" |
||
| 100 | _vowels = set('aeiouy') |
||
| 101 | # base case |
||
| 102 | self.assertFalse(_ends_in_doubled_cons('', _vowels)) |
||
| 103 | |||
| 104 | # False cases |
||
| 105 | self.assertFalse(_ends_in_doubled_cons('b', _vowels)) |
||
| 106 | self.assertFalse(_ends_in_doubled_cons('c', _vowels)) |
||
| 107 | self.assertFalse(_ends_in_doubled_cons('bc', _vowels)) |
||
| 108 | self.assertFalse(_ends_in_doubled_cons('bcdfghjklmnpqrstvwxYz', |
||
| 109 | _vowels)) |
||
| 110 | self.assertFalse(_ends_in_doubled_cons('Y', _vowels)) |
||
| 111 | self.assertFalse(_ends_in_doubled_cons('a', _vowels)) |
||
| 112 | self.assertFalse(_ends_in_doubled_cons('e', _vowels)) |
||
| 113 | self.assertFalse(_ends_in_doubled_cons('ae', _vowels)) |
||
| 114 | self.assertFalse(_ends_in_doubled_cons('aeiouy', _vowels)) |
||
| 115 | self.assertFalse(_ends_in_doubled_cons('y', _vowels)) |
||
| 116 | self.assertFalse(_ends_in_doubled_cons('ade', _vowels)) |
||
| 117 | self.assertFalse(_ends_in_doubled_cons('cad', _vowels)) |
||
| 118 | self.assertFalse(_ends_in_doubled_cons('phi', _vowels)) |
||
| 119 | self.assertFalse(_ends_in_doubled_cons('pfy', _vowels)) |
||
| 120 | self.assertFalse(_ends_in_doubled_cons('faddy', _vowels)) |
||
| 121 | self.assertFalse(_ends_in_doubled_cons('aiii', _vowels)) |
||
| 122 | self.assertFalse(_ends_in_doubled_cons('ayyy', _vowels)) |
||
| 123 | |||
| 124 | # True cases |
||
| 125 | self.assertTrue(_ends_in_doubled_cons('add', _vowels)) |
||
| 126 | self.assertTrue(_ends_in_doubled_cons('fadd', _vowels)) |
||
| 127 | self.assertTrue(_ends_in_doubled_cons('fadddd', _vowels)) |
||
| 128 | self.assertTrue(_ends_in_doubled_cons('raYY', _vowels)) |
||
| 129 | self.assertTrue(_ends_in_doubled_cons('doll', _vowels)) |
||
| 130 | self.assertTrue(_ends_in_doubled_cons('parr', _vowels)) |
||
| 131 | self.assertTrue(_ends_in_doubled_cons('parrr', _vowels)) |
||
| 132 | self.assertTrue(_ends_in_doubled_cons('bacc', _vowels)) |
||
| 133 | |||
| 134 | def test_ends_in_cvc(self): |
||
| 135 | """Test abydos.stemmer._ends_in_cvc.""" |
||
| 136 | _vowels = set('aeiouy') |
||
| 137 | # base case |
||
| 138 | self.assertFalse(_ends_in_cvc('', _vowels)) |
||
| 139 | |||
| 140 | # False cases |
||
| 141 | self.assertFalse(_ends_in_cvc('b', _vowels)) |
||
| 142 | self.assertFalse(_ends_in_cvc('c', _vowels)) |
||
| 143 | self.assertFalse(_ends_in_cvc('bc', _vowels)) |
||
| 144 | self.assertFalse(_ends_in_cvc('bcdfghjklmnpqrstvwxYz', _vowels)) |
||
| 145 | self.assertFalse(_ends_in_cvc('YYY', _vowels)) |
||
| 146 | self.assertFalse(_ends_in_cvc('ddd', _vowels)) |
||
| 147 | self.assertFalse(_ends_in_cvc('faaf', _vowels)) |
||
| 148 | self.assertFalse(_ends_in_cvc('rare', _vowels)) |
||
| 149 | self.assertFalse(_ends_in_cvc('rhy', _vowels)) |
||
| 150 | |||
| 151 | # True cases |
||
| 152 | self.assertTrue(_ends_in_cvc('dad', _vowels)) |
||
| 153 | self.assertTrue(_ends_in_cvc('phad', _vowels)) |
||
| 154 | self.assertTrue(_ends_in_cvc('faded', _vowels)) |
||
| 155 | self.assertTrue(_ends_in_cvc('maYor', _vowels)) |
||
| 156 | self.assertTrue(_ends_in_cvc('enlil', _vowels)) |
||
| 157 | self.assertTrue(_ends_in_cvc('parer', _vowels)) |
||
| 158 | self.assertTrue(_ends_in_cvc('padres', _vowels)) |
||
| 159 | self.assertTrue(_ends_in_cvc('bacyc', _vowels)) |
||
| 160 | |||
| 161 | # Special case for W, X, & Y |
||
| 162 | self.assertFalse(_ends_in_cvc('craw', _vowels)) |
||
| 163 | self.assertFalse(_ends_in_cvc('max', _vowels)) |
||
| 164 | self.assertFalse(_ends_in_cvc('cray', _vowels)) |
||
| 165 | |||
| 166 | def test_porter(self): |
||
| 167 | """Test abydos.stemmer.porter.""" |
||
| 168 | # base case |
||
| 169 | self.assertEqual(porter(''), '') |
||
| 170 | |||
| 171 | # simple cases |
||
| 172 | self.assertEqual(porter('c'), 'c') |
||
| 173 | self.assertEqual(porter('da'), 'da') |
||
| 174 | self.assertEqual(porter('ad'), 'ad') |
||
| 175 | self.assertEqual(porter('sing'), 'sing') |
||
| 176 | self.assertEqual(porter('singing'), 'sing') |
||
| 177 | |||
| 178 | # missed branch test cases |
||
| 179 | self.assertEqual(porter('capitalism'), 'capit') |
||
| 180 | self.assertEqual(porter('fatalism'), 'fatal') |
||
| 181 | self.assertEqual(porter('stional'), 'stional') |
||
| 182 | self.assertEqual(porter('palism'), 'palism') |
||
| 183 | self.assertEqual(porter('sization'), 'sizat') |
||
| 184 | self.assertEqual(porter('licated'), 'licat') |
||
| 185 | self.assertEqual(porter('lical'), 'lical') |
||
| 186 | |||
| 187 | def test_porter_early_english(self): |
||
| 188 | """Test abydos.stemmer.porter (early English).""" |
||
| 189 | # base case |
||
| 190 | self.assertEqual(porter('', early_english=True), '') |
||
| 191 | |||
| 192 | # simple cases (no different from regular stemmer) |
||
| 193 | self.assertEqual(porter('c', early_english=True), 'c') |
||
| 194 | self.assertEqual(porter('da', early_english=True), 'da') |
||
| 195 | self.assertEqual(porter('ad', early_english=True), 'ad') |
||
| 196 | self.assertEqual(porter('sing', early_english=True), 'sing') |
||
| 197 | self.assertEqual(porter('singing', early_english=True), 'sing') |
||
| 198 | |||
| 199 | # make |
||
| 200 | self.assertEqual(porter('make', early_english=True), 'make') |
||
| 201 | self.assertEqual(porter('makes', early_english=True), 'make') |
||
| 202 | self.assertEqual(porter('maketh', early_english=True), 'make') |
||
| 203 | self.assertEqual(porter('makest', early_english=True), 'make') |
||
| 204 | |||
| 205 | # say |
||
| 206 | self.assertEqual(porter('say', early_english=True), 'sai') |
||
| 207 | self.assertEqual(porter('says', early_english=True), 'sai') |
||
| 208 | self.assertEqual(porter('sayeth', early_english=True), 'sai') |
||
| 209 | self.assertEqual(porter('sayest', early_english=True), 'sai') |
||
| 210 | |||
| 211 | # missed branch test cases |
||
| 212 | self.assertEqual(porter('best', early_english=True), 'best') |
||
| 213 | self.assertEqual(porter('meth', early_english=True), 'meth') |
||
| 214 | |||
| 215 | def test_porter_snowball(self): |
||
| 216 | """Test abydos.stemmer.porter (Snowball testset). |
||
| 217 | |||
| 218 | These test cases are from |
||
| 219 | http://snowball.tartarus.org/algorithms/porter/diffs.txt |
||
| 220 | """ |
||
| 221 | # Snowball Porter test set |
||
| 222 | with open(_corpus_file('snowball_porter.csv')) as snowball_ts: |
||
| 223 | next(snowball_ts) |
||
| 224 | for line in snowball_ts: |
||
| 225 | if line[0] != '#': |
||
| 226 | line = line.strip().split(',') |
||
| 227 | word, stem = line[0], line[1] |
||
| 228 | self.assertEqual(porter(word), stem.lower()) |
||
| 229 | |||
| 230 | |||
| 231 | class Porter2TestCases(unittest.TestCase): |
||
| 232 | """Test Porter2 functions. |
||
| 233 | |||
| 234 | abydos.stemmer._sb_r1, abydos.stemmer._sb_r2, |
||
| 235 | abydos.stemmer._sb_ends_in_short_syllable, abydos.stemmer._sb_short_word, |
||
| 236 | & abydos.stemmer.porter2 |
||
| 237 | """ |
||
| 238 | |||
| 239 | def test_sb_r1(self): |
||
| 240 | """Test abydos.stemmer._sb_r1.""" |
||
| 241 | _vowels = set('aeiouy') |
||
| 242 | # base case |
||
| 243 | self.assertEqual(_sb_r1('', _vowels), 0) |
||
| 244 | |||
| 245 | # examples from http://snowball.tartarus.org/texts/r1r2.html |
||
| 246 | self.assertEqual(_sb_r1('beautiful', _vowels), 5) |
||
| 247 | self.assertEqual(_sb_r1('beauty', _vowels), 5) |
||
| 248 | self.assertEqual(_sb_r1('beau', _vowels), 4) |
||
| 249 | self.assertEqual(_sb_r1('animadversion', _vowels), 2) |
||
| 250 | self.assertEqual(_sb_r1('sprinkled', _vowels), 5) |
||
| 251 | self.assertEqual(_sb_r1('eucharist', _vowels), 3) |
||
| 252 | |||
| 253 | def test_sb_r2(self): |
||
| 254 | """Test abydos.stemmer._sb_r2.""" |
||
| 255 | _vowels = set('aeiouy') |
||
| 256 | # base case |
||
| 257 | self.assertEqual(_sb_r2('', _vowels), 0) |
||
| 258 | |||
| 259 | # examples from http://snowball.tartarus.org/texts/r1r2.html |
||
| 260 | self.assertEqual(_sb_r2('beautiful', _vowels), 7) |
||
| 261 | self.assertEqual(_sb_r2('beauty', _vowels), 6) |
||
| 262 | self.assertEqual(_sb_r2('beau', _vowels), 4) |
||
| 263 | self.assertEqual(_sb_r2('animadversion', _vowels), 4) |
||
| 264 | self.assertEqual(_sb_r2('sprinkled', _vowels), 9) |
||
| 265 | self.assertEqual(_sb_r2('eucharist', _vowels), 6) |
||
| 266 | |||
| 267 | def test_sb_ends_in_short_syllable(self): |
||
| 268 | """Test abydos.stemmer._sb_ends_in_short_syllable.""" |
||
| 269 | _vowels = set('aeiouy') |
||
| 270 | _codanonvowels = set('bcdfghjklmnpqrstvz\'') |
||
| 271 | # base case |
||
| 272 | self.assertFalse(_sb_ends_in_short_syllable('', _vowels, |
||
| 273 | _codanonvowels)) |
||
| 274 | |||
| 275 | # examples from |
||
| 276 | # http://snowball.tartarus.org/algorithms/english/stemmer.html |
||
| 277 | self.assertTrue(_sb_ends_in_short_syllable('rap', _vowels, |
||
| 278 | _codanonvowels)) |
||
| 279 | self.assertTrue(_sb_ends_in_short_syllable('trap', _vowels, |
||
| 280 | _codanonvowels)) |
||
| 281 | self.assertTrue(_sb_ends_in_short_syllable('entrap', _vowels, |
||
| 282 | _codanonvowels)) |
||
| 283 | self.assertTrue(_sb_ends_in_short_syllable('ow', _vowels, |
||
| 284 | _codanonvowels)) |
||
| 285 | self.assertTrue(_sb_ends_in_short_syllable('on', _vowels, |
||
| 286 | _codanonvowels)) |
||
| 287 | self.assertTrue(_sb_ends_in_short_syllable('at', _vowels, |
||
| 288 | _codanonvowels)) |
||
| 289 | self.assertFalse(_sb_ends_in_short_syllable('uproot', _vowels, |
||
| 290 | _codanonvowels)) |
||
| 291 | self.assertFalse(_sb_ends_in_short_syllable('uproot', _vowels, |
||
| 292 | _codanonvowels)) |
||
| 293 | self.assertFalse(_sb_ends_in_short_syllable('bestow', _vowels, |
||
| 294 | _codanonvowels)) |
||
| 295 | self.assertFalse(_sb_ends_in_short_syllable('disturb', _vowels, |
||
| 296 | _codanonvowels)) |
||
| 297 | |||
| 298 | # missed branch test cases |
||
| 299 | self.assertFalse(_sb_ends_in_short_syllable('d', _vowels, |
||
| 300 | _codanonvowels)) |
||
| 301 | self.assertFalse(_sb_ends_in_short_syllable('a', _vowels, |
||
| 302 | _codanonvowels)) |
||
| 303 | |||
| 304 | def test_sb_short_word(self): |
||
| 305 | """Test abydos.stemmer._sb_short_word.""" |
||
| 306 | _vowels = set('aeiouy') |
||
| 307 | _codanonvowels = set('bcdfghjklmnpqrstvz\'') |
||
| 308 | # base case |
||
| 309 | self.assertFalse(_sb_short_word('', _vowels, _codanonvowels)) |
||
| 310 | |||
| 311 | # examples from |
||
| 312 | # http://snowball.tartarus.org/algorithms/english/stemmer.html |
||
| 313 | self.assertTrue(_sb_short_word('bed', _vowels, _codanonvowels)) |
||
| 314 | self.assertTrue(_sb_short_word('shed', _vowels, _codanonvowels)) |
||
| 315 | self.assertTrue(_sb_short_word('shred', _vowels, _codanonvowels)) |
||
| 316 | self.assertFalse(_sb_short_word('bead', _vowels, _codanonvowels)) |
||
| 317 | self.assertFalse(_sb_short_word('embed', _vowels, _codanonvowels)) |
||
| 318 | self.assertFalse(_sb_short_word('beds', _vowels, _codanonvowels)) |
||
| 319 | |||
| 320 | def test_porter2(self): |
||
| 321 | """Test abydos.stemmer.porter2.""" |
||
| 322 | # base case |
||
| 323 | self.assertEqual(porter2(''), '') |
||
| 324 | |||
| 325 | # simple cases |
||
| 326 | self.assertEqual(porter2('c'), 'c') |
||
| 327 | self.assertEqual(porter2('da'), 'da') |
||
| 328 | self.assertEqual(porter2('ad'), 'ad') |
||
| 329 | self.assertEqual(porter2('sing'), 'sing') |
||
| 330 | self.assertEqual(porter2('singing'), 'sing') |
||
| 331 | |||
| 332 | # missed branch test cases |
||
| 333 | self.assertEqual(porter2('capitalism'), 'capit') |
||
| 334 | self.assertEqual(porter2('fatalism'), 'fatal') |
||
| 335 | self.assertEqual(porter2('dog\'s'), 'dog') |
||
| 336 | self.assertEqual(porter2('A\'s\''), 'a') |
||
| 337 | self.assertEqual(porter2('agreedly'), 'agre') |
||
| 338 | self.assertEqual(porter2('feedly'), 'feed') |
||
| 339 | self.assertEqual(porter2('stional'), 'stional') |
||
| 340 | self.assertEqual(porter2('palism'), 'palism') |
||
| 341 | self.assertEqual(porter2('sization'), 'sizat') |
||
| 342 | self.assertEqual(porter2('licated'), 'licat') |
||
| 343 | self.assertEqual(porter2('lical'), 'lical') |
||
| 344 | self.assertEqual(porter2('clessly'), 'clessli') |
||
| 345 | self.assertEqual(porter2('tably'), 'tabli') |
||
| 346 | self.assertEqual(porter2('sizer'), 'sizer') |
||
| 347 | self.assertEqual(porter2('livity'), 'liviti') |
||
| 348 | |||
| 349 | def test_porter2_early_english(self): |
||
| 350 | """Test abydos.stemmer.porter2 (early English).""" |
||
| 351 | # base case |
||
| 352 | self.assertEqual(porter2('', early_english=True), '') |
||
| 353 | |||
| 354 | # simple cases (no different from regular stemmer) |
||
| 355 | self.assertEqual(porter2('c', early_english=True), 'c') |
||
| 356 | self.assertEqual(porter2('da', early_english=True), 'da') |
||
| 357 | self.assertEqual(porter2('ad', early_english=True), 'ad') |
||
| 358 | self.assertEqual(porter2('sing', early_english=True), 'sing') |
||
| 359 | self.assertEqual(porter2('singing', early_english=True), 'sing') |
||
| 360 | |||
| 361 | # make |
||
| 362 | self.assertEqual(porter2('make', early_english=True), 'make') |
||
| 363 | self.assertEqual(porter2('makes', early_english=True), 'make') |
||
| 364 | self.assertEqual(porter2('maketh', early_english=True), 'make') |
||
| 365 | self.assertEqual(porter2('makest', early_english=True), 'make') |
||
| 366 | |||
| 367 | # say |
||
| 368 | self.assertEqual(porter2('say', early_english=True), 'say') |
||
| 369 | self.assertEqual(porter2('says', early_english=True), 'say') |
||
| 370 | self.assertEqual(porter2('sayeth', early_english=True), 'say') |
||
| 371 | self.assertEqual(porter2('sayest', early_english=True), 'say') |
||
| 372 | |||
| 373 | # missed branch test cases |
||
| 374 | self.assertEqual(porter2('best', early_english=True), 'best') |
||
| 375 | self.assertEqual(porter2('meth', early_english=True), 'meth') |
||
| 376 | |||
| 377 | def test_porter2_snowball(self): |
||
| 378 | """Test abydos.stemmer.porter2 (Snowball testset). |
||
| 379 | |||
| 380 | These test cases are from |
||
| 381 | http://snowball.tartarus.org/algorithms/english/diffs.txt |
||
| 382 | """ |
||
| 383 | # Snowball Porter test set |
||
| 384 | with open(_corpus_file('snowball_porter2.csv')) as snowball_ts: |
||
| 385 | next(snowball_ts) |
||
| 386 | for line in snowball_ts: |
||
| 387 | if line[0] != '#': |
||
| 388 | line = line.strip().split(',') |
||
| 389 | word, stem = line[0], line[1] |
||
| 390 | self.assertEqual(porter2(word), stem.lower()) |
||
| 391 | |||
| 392 | |||
| 393 | class SnowballTestCases(unittest.TestCase): |
||
| 394 | """Test Snowball functions. |
||
| 395 | |||
| 396 | abydos.stemmer.sb_german, abydos.stemmer.sb_dutch, |
||
| 397 | abydos.stemmer.sb_norwegian, abydos.stemmer.sb_swedish, & |
||
| 398 | abydos.stemmer.sb_danish |
||
| 399 | """ |
||
| 400 | |||
| 401 | def test_sb_german_snowball(self): |
||
| 402 | """Test abydos.stemmer.sb_german (Snowball testset). |
||
| 403 | |||
| 404 | These test cases are from |
||
| 405 | http://snowball.tartarus.org/algorithms/german/diffs.txt |
||
| 406 | """ |
||
| 407 | # base case |
||
| 408 | self.assertEqual(sb_german(''), '') |
||
| 409 | |||
| 410 | # Snowball German test set |
||
| 411 | with codecs.open(_corpus_file('snowball_german.csv'), |
||
| 412 | encoding='utf-8') as snowball_ts: |
||
| 413 | next(snowball_ts) |
||
| 414 | for line in snowball_ts: |
||
| 415 | if line[0] != '#': |
||
| 416 | line = line.strip().split(',') |
||
| 417 | word, stem = line[0], line[1] |
||
| 418 | self.assertEqual(sb_german(word), stem.lower()) |
||
| 419 | |||
| 420 | # missed branch test cases |
||
| 421 | self.assertEqual(sb_german('ikeit'), 'ikeit') |
||
| 422 | |||
| 423 | def test_sb_german_snowball_alt(self): |
||
| 424 | """Test abydos.stemmer.sb_german (alternate vowels).""" |
||
| 425 | # base case |
||
| 426 | self.assertEqual(sb_german('', alternate_vowels=True), '') |
||
| 427 | |||
| 428 | # dämmerung,dammer |
||
| 429 | self.assertEqual(sb_german('dämmerung', alternate_vowels=True), |
||
| 430 | 'dammer') |
||
| 431 | self.assertEqual(sb_german('daemmerung', alternate_vowels=True), |
||
| 432 | 'dammer') |
||
| 433 | self.assertEqual(sb_german('dämmerung'), 'dammer') |
||
| 434 | self.assertEqual(sb_german('daemmerung'), 'daemmer') |
||
| 435 | |||
| 436 | # brötchen,brotch |
||
| 437 | self.assertEqual(sb_german('brötchen', alternate_vowels=True), |
||
| 438 | 'brotch') |
||
| 439 | self.assertEqual(sb_german('broetchen', alternate_vowels=True), |
||
| 440 | 'brotch') |
||
| 441 | self.assertEqual(sb_german('brötchen'), 'brotch') |
||
| 442 | self.assertEqual(sb_german('broetchen'), 'broetch') |
||
| 443 | |||
| 444 | # büro,buro |
||
| 445 | self.assertEqual(sb_german('büro', alternate_vowels=True), 'buro') |
||
| 446 | self.assertEqual(sb_german('buero', alternate_vowels=True), 'buro') |
||
| 447 | self.assertEqual(sb_german('büro'), 'buro') |
||
| 448 | self.assertEqual(sb_german('buero'), 'buero') |
||
| 449 | |||
| 450 | # häufen,hauf |
||
| 451 | self.assertEqual(sb_german('häufen', alternate_vowels=True), 'hauf') |
||
| 452 | self.assertEqual(sb_german('haeufen', alternate_vowels=True), 'hauf') |
||
| 453 | self.assertEqual(sb_german('häufen'), 'hauf') |
||
| 454 | self.assertEqual(sb_german('haeufen'), 'haeuf') |
||
| 455 | |||
| 456 | # quelle,quell |
||
| 457 | self.assertEqual(sb_german('qülle', alternate_vowels=True), 'qull') |
||
| 458 | self.assertEqual(sb_german('quelle', alternate_vowels=True), 'quell') |
||
| 459 | self.assertEqual(sb_german('qülle'), 'qull') |
||
| 460 | self.assertEqual(sb_german('quelle'), 'quell') |
||
| 461 | |||
| 462 | # feuer,feuer |
||
| 463 | self.assertEqual(sb_german('feür', alternate_vowels=True), 'feur') |
||
| 464 | self.assertEqual(sb_german('feuer', alternate_vowels=True), 'feu') |
||
| 465 | self.assertEqual(sb_german('feür'), 'feur') |
||
| 466 | self.assertEqual(sb_german('feuer'), 'feu') |
||
| 467 | |||
| 468 | # über,uber |
||
| 469 | self.assertEqual(sb_german('über', alternate_vowels=True), 'uber') |
||
| 470 | self.assertEqual(sb_german('ueber', alternate_vowels=True), 'uber') |
||
| 471 | self.assertEqual(sb_german('über'), 'uber') |
||
| 472 | self.assertEqual(sb_german('ueber'), 'ueb') |
||
| 473 | |||
| 474 | def test_sb_dutch_snowball(self): |
||
| 475 | """Test abydos.stemmer.sb_dutch (Snowball testset). |
||
| 476 | |||
| 477 | These test cases are from |
||
| 478 | http://snowball.tartarus.org/algorithms/dutch/diffs.txt |
||
| 479 | """ |
||
| 480 | # base case |
||
| 481 | self.assertEqual(sb_dutch(''), '') |
||
| 482 | |||
| 483 | # Snowball Dutch test set |
||
| 484 | with codecs.open(_corpus_file('snowball_dutch.csv'), |
||
| 485 | encoding='utf-8') as snowball_ts: |
||
| 486 | next(snowball_ts) |
||
| 487 | for line in snowball_ts: |
||
| 488 | if line[0] != '#': |
||
| 489 | line = line.strip().split(',') |
||
| 490 | word, stem = line[0], line[1] |
||
| 491 | self.assertEqual(sb_dutch(word), stem.lower()) |
||
| 492 | |||
| 493 | # missed branch test cases |
||
| 494 | self.assertEqual(sb_dutch('zondulielijk'), 'zondulie') |
||
| 495 | |||
| 496 | def test_sb_norwegian_snowball(self): |
||
| 497 | """Test abydos.stemmer.sb_norwegian (Snowball testset). |
||
| 498 | |||
| 499 | These test cases are from |
||
| 500 | http://snowball.tartarus.org/algorithms/norwegian/diffs.txt |
||
| 501 | """ |
||
| 502 | # base case |
||
| 503 | self.assertEqual(sb_norwegian(''), '') |
||
| 504 | |||
| 505 | # Snowball Norwegian test set |
||
| 506 | with codecs.open(_corpus_file('snowball_norwegian.csv'), |
||
| 507 | encoding='utf-8') as snowball_ts: |
||
| 508 | next(snowball_ts) |
||
| 509 | for line in snowball_ts: |
||
| 510 | if line[0] != '#': |
||
| 511 | line = line.strip().split(',') |
||
| 512 | word, stem = line[0], line[1] |
||
| 513 | self.assertEqual(sb_norwegian(word), stem.lower()) |
||
| 514 | |||
| 515 | def test_sb_swedish_snowball(self): |
||
| 516 | """Test abydos.stemmer.sb_swedish (Snowball testset). |
||
| 517 | |||
| 518 | These test cases are from |
||
| 519 | http://snowball.tartarus.org/algorithms/swedish/diffs.txt |
||
| 520 | """ |
||
| 521 | # base case |
||
| 522 | self.assertEqual(sb_swedish(''), '') |
||
| 523 | |||
| 524 | # Snowball Swedish test set |
||
| 525 | with codecs.open(_corpus_file('snowball_swedish.csv'), |
||
| 526 | encoding='utf-8') as snowball_ts: |
||
| 527 | next(snowball_ts) |
||
| 528 | for line in snowball_ts: |
||
| 529 | if line[0] != '#': |
||
| 530 | line = line.strip().split(',') |
||
| 531 | word, stem = line[0], line[1] |
||
| 532 | self.assertEqual(sb_swedish(word), stem.lower()) |
||
| 533 | |||
| 534 | def test_sb_danish_snowball(self): |
||
| 535 | """Test abydos.stemmer.sb_danish (Snowball testset). |
||
| 536 | |||
| 537 | These test cases are from |
||
| 538 | http://snowball.tartarus.org/algorithms/danish/diffs.txt |
||
| 539 | """ |
||
| 540 | # base case |
||
| 541 | self.assertEqual(sb_danish(''), '') |
||
| 542 | |||
| 543 | # Snowball Danish test set |
||
| 544 | with codecs.open(_corpus_file('snowball_danish.csv'), |
||
| 545 | encoding='utf-8') as snowball_ts: |
||
| 546 | next(snowball_ts) |
||
| 547 | for line in snowball_ts: |
||
| 548 | if line[0] != '#': |
||
| 549 | line = line.strip().split(',') |
||
| 550 | word, stem = line[0], line[1] |
||
| 551 | self.assertEqual(sb_danish(word), stem.lower()) |
||
| 552 | |||
| 553 | |||
# Run the tests in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()
||
| 556 |