tests.stemmer.test_stemmer_caumanns - Code Metrics - Inspection of "Merge pull request #127 from chrislit/fix_builds" - chrislit/abydos - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Branch — master (87ccc1)

by Chris

created 2018-10-23 03:59 UTC

tests.stemmer.test_stemmer_caumanns A

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	124
Duplicated Lines	0 %

Importance

Changes

Metric	Value
eloc	57
dl	0
loc	124
rs	10
c	0
b	0
f	0
wmc	2

# -*- coding: utf-8 -*-

# Copyright 2014-2018 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.tests.test_stemmer_caumanns.

This module contains unit tests for abydos.stemmer.caumanns
"""

from __future__ import unicode_literals

import unittest

from abydos.stemmer.caumanns import caumanns


class CaumannsTestCases(unittest.TestCase):
    """Test cases for the Caumanns stemmer.

    Covers abydos.stemmer.caumanns.caumanns via tables of
    (input word, expected stem) pairs.
    """

    def test_caumanns(self):
        """Test abydos.stemmer.caumanns."""
        expectations = (
            # base case: the empty string stems to itself
            ('', ''),
            # examples taken from Caumanns' description of the algorithm
            ('singt', 'sing'),
            ('singen', 'sing'),
            ('beliebt', 'belieb'),
            ('beliebtester', 'belieb'),
            ('stören', 'stor'),
            ('stöhnen', 'stoh'),
            ('Kuß', 'kuss'),
            ('Küsse', 'kuss'),
            ('Verlierer', 'verlier'),
            ('Verlies', 'verlie'),
            ('Maus', 'mau'),
            ('Mauer', 'mau'),
            ('Störsender', 'stor'),
            # extra inputs added to achieve full coverage
            ('Müllerinnen', 'mullerin'),
            ('Matrix', 'matrix'),
            ('Matrizen', 'matrix'),
        )
        # Checked in table order; the first mismatch fails the test.
        for word, stem in expectations:
            self.assertEqual(caumanns(word), stem)

    def test_caumanns_lucene(self):
        """Test abydos.stemmer.caumanns (Lucene tests).

        Based on tests from
        https://svn.apache.org/repos/asf/lucene.net/trunk/test/contrib/Analyzers/De/data.txt
        This is presumably Apache-licensed.
        """
        expectations = (
            # German special characters are replaced:
            ('häufig', 'haufig'),
            ('üor', 'uor'),
            ('björk', 'bjork'),
            # the stemmer works okay here: related words share one stem
            ('abschließen', 'abschliess'),
            ('abschließender', 'abschliess'),
            ('abschließendes', 'abschliess'),
            ('abschließenden', 'abschliess'),
            ('Tisch', 'tisch'),
            ('Tische', 'tisch'),
            ('Tischen', 'tisch'),
            ('geheimtür', 'geheimtur'),
            ('Haus', 'hau'),
            ('Hauses', 'hau'),
            ('Häuser', 'hau'),
            ('Häusern', 'hau'),
            # overstemming: a word is mapped to the same stem as
            # unrelated words
            ('hauen', 'hau'),
            # understemming: two related words are not mapped to the
            # same stem, as happens with basically all irregular forms
            ('Drama', 'drama'),
            ('Dramen', 'dram'),
            # "ß" is replaced with 'ss':
            ('Ausmaß', 'ausmass'),
            # fake words to test if suffixes are cut off:
            ('xxxxxe', 'xxxxx'),
            ('xxxxxs', 'xxxxx'),
            ('xxxxxn', 'xxxxx'),
            ('xxxxxt', 'xxxxx'),
            ('xxxxxem', 'xxxxx'),
            ('xxxxxer', 'xxxxx'),
            ('xxxxxnd', 'xxxxx'),
            # the suffixes are also removed when combined:
            ('xxxxxetende', 'xxxxx'),
            # words that are shorter than four characters are not changed:
            ('xxe', 'xxe'),
            # -em and -er are not removed from words shorter than five
            # characters:
            ('xxem', 'xxem'),
            ('xxer', 'xxer'),
            # -nd is not removed from words shorter than six characters:
            ('xxxnd', 'xxxnd'),
        )
        # Checked in table order; the first mismatch fails the test.
        for word, stem in expectations:
            self.assertEqual(caumanns(word), stem)


# Run this module's tests via unittest when executed directly as a script.
if __name__ == '__main__':
    unittest.main()


1			# -*- coding: utf-8 -*-
2
3			# Copyright 2014-2018 by Christopher C. Little.
4			# This file is part of Abydos.
5			#
6			# Abydos is free software: you can redistribute it and/or modify
7			# it under the terms of the GNU General Public License as published by
8			# the Free Software Foundation, either version 3 of the License, or
9			# (at your option) any later version.
10			#
11			# Abydos is distributed in the hope that it will be useful,
12			# but WITHOUT ANY WARRANTY; without even the implied warranty of
13			# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14			# GNU General Public License for more details.
15			#
16			# You should have received a copy of the GNU General Public License
17			# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19			"""abydos.tests.test_stemmer_caumanns.
20
21			This module contains unit tests for abydos.stemmer.caumanns
22			"""
23
24			from __future__ import unicode_literals
25
26			import unittest
27
28			from abydos.stemmer.caumanns import caumanns
29
30
31			class CaumannsTestCases(unittest.TestCase):
32			"""Test Caumanns functions.
33
34			abydos.stemmer.caumanns
35			"""
36
37			def test_caumanns(self):
38			"""Test abydos.stemmer.caumanns."""
39			# base case
40			self.assertEqual(caumanns(''), '')
41
42			# tests from Caumanns' description of the algorithm
43			self.assertEqual(caumanns('singt'), 'sing')
44			self.assertEqual(caumanns('singen'), 'sing')
45			self.assertEqual(caumanns('beliebt'), 'belieb')
46			self.assertEqual(caumanns('beliebtester'), 'belieb')
47			self.assertEqual(caumanns('stören'), 'stor')
48			self.assertEqual(caumanns('stöhnen'), 'stoh')
49			self.assertEqual(caumanns('Kuß'), 'kuss')
50			self.assertEqual(caumanns('Küsse'), 'kuss')
51			self.assertEqual(caumanns('Verlierer'), 'verlier')
52			self.assertEqual(caumanns('Verlies'), 'verlie')
53			self.assertEqual(caumanns('Maus'), 'mau')
54			self.assertEqual(caumanns('Mauer'), 'mau')
55			self.assertEqual(caumanns('Störsender'), 'stor')
56
57			# additional tests to achieve full coverage
58			self.assertEqual(caumanns('Müllerinnen'), 'mullerin')
59			self.assertEqual(caumanns('Matrix'), 'matrix')
60			self.assertEqual(caumanns('Matrizen'), 'matrix')
61
62			def test_caumanns_lucene(self):
63			"""Test abydos.stemmer.caumanns (Lucene tests).
64
65			Based on tests from
66			https://svn.apache.org/repos/asf/lucene.net/trunk/test/contrib/Analyzers/De/data.txt
67			This is presumably Apache-licensed.
68			"""
69			# German special characters are replaced:
70			self.assertEqual(caumanns('häufig'), 'haufig')
71			self.assertEqual(caumanns('üor'), 'uor')
72			self.assertEqual(caumanns('björk'), 'bjork')
73
74			# here the stemmer works okay, it maps related words to the same stem:
75			self.assertEqual(caumanns('abschließen'), 'abschliess')
76			self.assertEqual(caumanns('abschließender'), 'abschliess')
77			self.assertEqual(caumanns('abschließendes'), 'abschliess')
78			self.assertEqual(caumanns('abschließenden'), 'abschliess')
79
80			self.assertEqual(caumanns('Tisch'), 'tisch')
81			self.assertEqual(caumanns('Tische'), 'tisch')
82			self.assertEqual(caumanns('Tischen'), 'tisch')
83			self.assertEqual(caumanns('geheimtür'), 'geheimtur')
84
85			self.assertEqual(caumanns('Haus'), 'hau')
86			self.assertEqual(caumanns('Hauses'), 'hau')
87			self.assertEqual(caumanns('Häuser'), 'hau')
88			self.assertEqual(caumanns('Häusern'), 'hau')
89			# here's a case where overstemming occurs, i.e. a word is
90			# mapped to the same stem as unrelated words:
91			self.assertEqual(caumanns('hauen'), 'hau')
92
93			# here's a case where understemming occurs, i.e. two related words
94			# are not mapped to the same stem. This is the case with basically
95			# all irregular forms:
96			self.assertEqual(caumanns('Drama'), 'drama')
97			self.assertEqual(caumanns('Dramen'), 'dram')
98
99			# replace "ß" with 'ss':
100			self.assertEqual(caumanns('Ausmaß'), 'ausmass')
101
102			# fake words to test if suffixes are cut off:
103			self.assertEqual(caumanns('xxxxxe'), 'xxxxx')
104			self.assertEqual(caumanns('xxxxxs'), 'xxxxx')
105			self.assertEqual(caumanns('xxxxxn'), 'xxxxx')
106			self.assertEqual(caumanns('xxxxxt'), 'xxxxx')
107			self.assertEqual(caumanns('xxxxxem'), 'xxxxx')
108			self.assertEqual(caumanns('xxxxxer'), 'xxxxx')
109			self.assertEqual(caumanns('xxxxxnd'), 'xxxxx')
110			# the suffixes are also removed when combined:
111			self.assertEqual(caumanns('xxxxxetende'), 'xxxxx')
112
113			# words that are shorter than four characters are not changed:
114			self.assertEqual(caumanns('xxe'), 'xxe')
115			# -em and -er are not removed from words shorter than five characters:
116			self.assertEqual(caumanns('xxem'), 'xxem')
117			self.assertEqual(caumanns('xxer'), 'xxer')
118			# -nd is not removed from words shorter than six characters:
119			self.assertEqual(caumanns('xxxnd'), 'xxxnd')
120
121
122			if __name__ == '__main__':
123			unittest.main()
124

chrislit / abydos

Branch — master (87ccc1)

tests.stemmer.test_stemmer_caumanns A

Complexity

Size/Duplication

Importance

Duplication Side-by-Side

Filter issues like