Completed
Branch master (87ccc1)
by Chris
10:18
created

tests.corpus.test_corpus_ngram   A

Complexity

Total Complexity 5

Size/Duplication

Total Lines 187
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 87
dl 0
loc 187
rs 10
c 0
b 0
f 0
wmc 5
1
# -*- coding: utf-8 -*-
2
3
# Copyright 2014-2018 by Christopher C. Little.
4
# This file is part of Abydos.
5
#
6
# Abydos is free software: you can redistribute it and/or modify
7
# it under the terms of the GNU General Public License as published by
8
# the Free Software Foundation, either version 3 of the License, or
9
# (at your option) any later version.
10
#
11
# Abydos is distributed in the hope that it will be useful,
12
# but WITHOUT ANY WARRANTY; without even the implied warranty of
13
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14
# GNU General Public License for more details.
15
#
16
# You should have received a copy of the GNU General Public License
17
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.
18
19
"""abydos.tests.test_ngram.
20
21
This module contains unit tests for abydos.corpus.ngram
22
"""
23
24
from __future__ import unicode_literals
25
26
import unittest
27
from collections import Counter
28
29
from abydos.corpus.corpus import Corpus
30
from abydos.corpus.ngram import NGramCorpus
31
32
from .. import _corpus_file
33
34
35
class NGramCorpusTestCases(unittest.TestCase):
    """Unit tests for abydos.corpus.ngram.NGramCorpus.

    All fixtures are built once, as class attributes, when the class body
    is executed; the test methods only read from them.
    """

    # N-gram corpus loaded from a single pass over simple-ngrams.txt.
    simple_corpus = NGramCorpus()
    simple_corpus.gng_importer(_corpus_file('simple-ngrams.txt'))

    # The same file imported twice into one corpus; test_gng_importer
    # expects the count of 'the' to be twice that of simple_corpus.
    double_corpus = NGramCorpus()
    double_corpus.gng_importer(_corpus_file('simple-ngrams.txt'))
    double_corpus.gng_importer(_corpus_file('simple-ngrams.txt'))

    # Sample text (going by the attribute name, an excerpt of the 2015
    # State of the Union address). The backslash continuations keep the
    # leading indentation of each physical line inside the string.
    sotu2015Sample = 'Mr. Speaker, Mr. Vice President, Members of Congress, my\
    fellow Americans:\n\nWe are 15 years into this new century.\n Fifteen\
    years that dawned with terror touching our shores; that unfolded with a\
    new generation fighting two long and costly wars; that saw a vicious\
    recession spread across our nation and the world.\n It has been, and still\
    is, a hard time for many.\n\nBut tonight, we turn the page.\n Tonight,\
    after a breakthrough year for America, our economy is growing and creating\
    jobs at the fastest pace since 1999.\n Our unemployment rate is now lower\
    than it was before the financial crisis.\n More of our kids are graduating\
    than ever before.\n More of our people are insured than ever before.\n And\
    we are as free from the grip of foreign oil as we\'ve been in almost 30\
    years.\n\nTonight, for the first time since 9/11, our combat mission in\
    Afghanistan is over.\n Six years ago, nearly 180,000 American troops\
    served in Iraq and Afghanistan.\n Today, fewer than 15,000 remain.\n And\
    we salute the courage and sacrifice of every man and woman in this 9/11\
    Generation who has served to keep us safe.\n We are humbled and grateful\
    for your service.\n\nAmerica, for all that we have endured; for all the\
    grit and hard work required to come back; for all the tasks that lie\
    ahead, know this: The shadow of crisis has passed, and the State of the\
    Union is strong.\n\nAt this moment -- with a growing economy, shrinking\
    deficits, bustling industry, booming energy production -- we have risen\
    from recession freer to write our own future than any other nation on\
    Earth.\n It\'s now up to us to choose who we want to be over the next 15\
    years and for decades to come.\n\nWill we accept an economy where only a\
    few of us do spectacularly well?\n Or will we commit ourselves to an\
    economy that generates rising incomes and chances for everyone who makes\
    the effort?\n\nWill we approach the world fearful and reactive, dragged\
    into costly conflicts that strain our military and set back our\
    standing?\n Or will we lead wisely, using all elements of our power to\
    defeat new threats and protect our planet?\n\nWill we allow ourselves to\
    be sorted into factions and turned against one another?\n Or will we\
    recapture the sense of common purpose that has always propelled America\
    forward?\n\nIn two weeks, I will send this Congress a budget filled with\
    ideas that are practical, not partisan.\n And in the months ahead, I\'ll\
    crisscross the country making a case for those ideas.\n So tonight, I want\
    to focus less on a checklist of proposals, and focus more on the values at\
    stake in the choices before us.'
    sotu2015Corpus = Corpus(sotu2015Sample, filter_chars='.?-;,:')

    # Built straight from the Corpus via the constructor.
    sotu_ngcorpus_uni = NGramCorpus(sotu2015Corpus)

    # Imported with 3 and explicit '<SOS>'/'<EOS>' marker strings
    # (presumably n-gram order and start/end symbols -- confirm against
    # NGramCorpus.corpus_importer).
    sotu_ngcorpus_tri = NGramCorpus()
    sotu_ngcorpus_tri.corpus_importer(sotu2015Corpus, 3, '<SOS>', '<EOS>')

    # Imported with 5 and empty marker strings.
    sotu_ngcorpus_5 = NGramCorpus()
    sotu_ngcorpus_5.corpus_importer(sotu2015Corpus, 5, '', '')

    # NOTE(review): despite the `_5` suffix this corpus is imported with 15.
    simple_ngcorpus_5 = NGramCorpus()
    simple_ngcorpus_5.corpus_importer(
        Corpus(' '.join(['a']*10)),  # 10 a's
        15,
    )

    def test_init(self):
        """Test abydos.corpus.ngram.NGramCorpus.__init__."""
        empty = NGramCorpus()
        self.assertIsInstance(empty, NGramCorpus)

        # A plain list is expected to be rejected.
        with self.assertRaises(TypeError):
            NGramCorpus(['a', 'b', 'c'])

        from_corpus = NGramCorpus(self.sotu2015Corpus)
        self.assertIsInstance(from_corpus, NGramCorpus)

    def test_corpus_importer(self):
        """Test abydos.corpus.ngram.NGramCorpus.corpus_importer."""
        # A bare string and a missing argument are both expected to raise.
        with self.assertRaises(TypeError):
            self.sotu_ngcorpus_5.corpus_importer('a b c d')
        with self.assertRaises(TypeError):
            self.sotu_ngcorpus_5.corpus_importer()

        uni = self.sotu_ngcorpus_uni
        tri = self.sotu_ngcorpus_tri

        for ngcorpus in (uni, tri):
            self.assertIsInstance(ngcorpus, NGramCorpus)
        for ngcorpus in (uni, tri):
            self.assertIsInstance(ngcorpus.ngcorpus, Counter)

        simple = self.simple_ngcorpus_5

        # A run of k 'a' tokens is expected 11 - k times for k = 1..10
        # and never for k = 11..15.
        for size in range(1, 16):
            ngram = ' '.join('a' * size)
            expected = max(0, 11 - size)
            self.assertEqual(simple.get_count(ngram), expected)

        # Start/end markers are expected only at the matching end of the run.
        boundary_cases = (
            ('_START_ a', 1),
            ('a _END_', 1),
            ('_END_ a', 0),
            ('a _START_', 0),
            ('_START_ a _END_', 0),
            ('_END_ a _START_', 0),
        )
        for ngram, expected in boundary_cases:
            self.assertEqual(simple.get_count(ngram), expected)

        # (n-gram, expected count in uni corpus, expected count in tri corpus)
        sotu_cases = (
            ('Mr', 2, 2),
            ('the', 19, 19),
            ('to come', 0, 2),
        )
        for ngram, uni_expected, tri_expected in sotu_cases:
            self.assertEqual(uni.get_count(ngram), uni_expected)
            self.assertEqual(tri.get_count(ngram), tri_expected)

        self.assertEqual(tri.get_count('<SOS> And'), 3)
        self.assertGreater(
            tri.get_count('<SOS> And'),
            self.sotu_ngcorpus_5.get_count('<SOS> And'),
        )

    def test_gng_importer(self):
        """Test abydos.corpus.ngram.NGramCorpus.gng_importer."""
        single = self.simple_corpus
        self.assertIsInstance(single, NGramCorpus)
        self.assertIsInstance(single.ngcorpus, Counter)

        # Importing the same file twice is expected to double the count.
        self.assertEqual(single.get_count('the'), 20)
        self.assertEqual(self.double_corpus.get_count('the'), 40)

    def test_get_count(self):
        """Test abydos.corpus.ngram.NGramCorpus.get_count."""
        corpus = self.simple_corpus
        expectations = (
            ('the', 20),
            ('the quick', 2),
            ('trolley', 0),
        )

        # string-style queries
        for ngram, expected in expectations:
            self.assertEqual(corpus.get_count(ngram), expected)

        # list-style queries: the same n-grams as lists of tokens
        for ngram, expected in expectations:
            self.assertEqual(corpus.get_count(ngram.split()), expected)

    def test_tf(self):
        """Test abydos.corpus.ngram.NGramCorpus.tf."""
        uni = self.sotu_ngcorpus_uni

        # zero case: a term absent from the sample text
        self.assertEqual(uni.tf('Niall'), 0)

        # simple cases: single terms
        for term, expected in (('the', 2.2787536), ('America', 1.4771213)):
            self.assertAlmostEqual(uni.tf(term), expected)

        # bigrams are expected to be rejected
        for bigram in ('the sense', 'the world'):
            with self.assertRaises(ValueError):
                self.sotu_ngcorpus_tri.tf(bigram)
185
if __name__ == '__main__':
    # Run the test cases in this module when it is executed as a script.
    unittest.main()
187