Completed
Branch master (87ccc1)
by Chris
08:42
created

tests.stemmer.test_stemmer_caumanns   A

Complexity

Total Complexity 2

Size/Duplication

Total Lines 124
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 57
dl 0
loc 124
rs 10
c 0
b 0
f 0
wmc 2
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
"""abydos.tests.stemmer.test_stemmer_caumanns.

This module contains unit tests for abydos.stemmer.caumanns.
"""
23
24
from __future__ import unicode_literals
25
26
import unittest
27
28
from abydos.stemmer.caumanns import caumanns
29
30
31
class CaumannsTestCases(unittest.TestCase):
32
    """Test Caumanns functions.
33
34
    abydos.stemmer.caumanns
35
    """
36
37
    def test_caumanns(self):
38
        """Test abydos.stemmer.caumanns."""
39
        # base case
40
        self.assertEqual(caumanns(''), '')
41
42
        # tests from Caumanns' description of the algorithm
43
        self.assertEqual(caumanns('singt'), 'sing')
44
        self.assertEqual(caumanns('singen'), 'sing')
45
        self.assertEqual(caumanns('beliebt'), 'belieb')
46
        self.assertEqual(caumanns('beliebtester'), 'belieb')
47
        self.assertEqual(caumanns('stören'), 'stor')
48
        self.assertEqual(caumanns('stöhnen'), 'stoh')
49
        self.assertEqual(caumanns('Kuß'), 'kuss')
50
        self.assertEqual(caumanns('Küsse'), 'kuss')
51
        self.assertEqual(caumanns('Verlierer'), 'verlier')
52
        self.assertEqual(caumanns('Verlies'), 'verlie')
53
        self.assertEqual(caumanns('Maus'), 'mau')
54
        self.assertEqual(caumanns('Mauer'), 'mau')
55
        self.assertEqual(caumanns('Störsender'), 'stor')
56
57
        # additional tests to achieve full coverage
58
        self.assertEqual(caumanns('Müllerinnen'), 'mullerin')
59
        self.assertEqual(caumanns('Matrix'), 'matrix')
60
        self.assertEqual(caumanns('Matrizen'), 'matrix')
61
62
    def test_caumanns_lucene(self):
63
        """Test abydos.stemmer.caumanns (Lucene tests).
64
65
        Based on tests from
66
        https://svn.apache.org/repos/asf/lucene.net/trunk/test/contrib/Analyzers/De/data.txt
67
        This is presumably Apache-licensed.
68
        """
69
        # German special characters are replaced:
70
        self.assertEqual(caumanns('häufig'), 'haufig')
71
        self.assertEqual(caumanns('üor'), 'uor')
72
        self.assertEqual(caumanns('björk'), 'bjork')
73
74
        # here the stemmer works okay, it maps related words to the same stem:
75
        self.assertEqual(caumanns('abschließen'), 'abschliess')
76
        self.assertEqual(caumanns('abschließender'), 'abschliess')
77
        self.assertEqual(caumanns('abschließendes'), 'abschliess')
78
        self.assertEqual(caumanns('abschließenden'), 'abschliess')
79
80
        self.assertEqual(caumanns('Tisch'), 'tisch')
81
        self.assertEqual(caumanns('Tische'), 'tisch')
82
        self.assertEqual(caumanns('Tischen'), 'tisch')
83
        self.assertEqual(caumanns('geheimtür'), 'geheimtur')
84
85
        self.assertEqual(caumanns('Haus'), 'hau')
86
        self.assertEqual(caumanns('Hauses'), 'hau')
87
        self.assertEqual(caumanns('Häuser'), 'hau')
88
        self.assertEqual(caumanns('Häusern'), 'hau')
89
        # here's a case where overstemming occurs, i.e. a word is
90
        # mapped to the same stem as unrelated words:
91
        self.assertEqual(caumanns('hauen'), 'hau')
92
93
        # here's a case where understemming occurs, i.e. two related words
94
        # are not mapped to the same stem. This is the case with basically
95
        # all irregular forms:
96
        self.assertEqual(caumanns('Drama'), 'drama')
97
        self.assertEqual(caumanns('Dramen'), 'dram')
98
99
        # replace "ß" with 'ss':
100
        self.assertEqual(caumanns('Ausmaß'), 'ausmass')
101
102
        # fake words to test if suffixes are cut off:
103
        self.assertEqual(caumanns('xxxxxe'), 'xxxxx')
104
        self.assertEqual(caumanns('xxxxxs'), 'xxxxx')
105
        self.assertEqual(caumanns('xxxxxn'), 'xxxxx')
106
        self.assertEqual(caumanns('xxxxxt'), 'xxxxx')
107
        self.assertEqual(caumanns('xxxxxem'), 'xxxxx')
108
        self.assertEqual(caumanns('xxxxxer'), 'xxxxx')
109
        self.assertEqual(caumanns('xxxxxnd'), 'xxxxx')
110
        # the suffixes are also removed when combined:
111
        self.assertEqual(caumanns('xxxxxetende'), 'xxxxx')
112
113
        # words that are shorter than four charcters are not changed:
114
        self.assertEqual(caumanns('xxe'), 'xxe')
115
        # -em and -er are not removed from words shorter than five characters:
116
        self.assertEqual(caumanns('xxem'), 'xxem')
117
        self.assertEqual(caumanns('xxer'), 'xxer')
118
        # -nd is not removed from words shorter than six characters:
119
        self.assertEqual(caumanns('xxxnd'), 'xxxnd')
120
121
122
# Run this module's test cases when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
124