|
1
|
|
|
# -*- coding: utf-8 -*- |
|
2
|
|
|
|
|
3
|
|
|
# Copyright 2014-2018 by Christopher C. Little. |
|
4
|
|
|
# This file is part of Abydos. |
|
5
|
|
|
# |
|
6
|
|
|
# Abydos is free software: you can redistribute it and/or modify |
|
7
|
|
|
# it under the terms of the GNU General Public License as published by |
|
8
|
|
|
# the Free Software Foundation, either version 3 of the License, or |
|
9
|
|
|
# (at your option) any later version. |
|
10
|
|
|
# |
|
11
|
|
|
# Abydos is distributed in the hope that it will be useful, |
|
12
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
13
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
14
|
|
|
# GNU General Public License for more details. |
|
15
|
|
|
# |
|
16
|
|
|
# You should have received a copy of the GNU General Public License |
|
17
|
|
|
# along with Abydos. If not, see <http://www.gnu.org/licenses/>. |
|
18
|
|
|
|
|
19
|
|
|
"""abydos.tests.test_ngram. |
|
20
|
|
|
|
|
21
|
|
|
This module contains unit tests for abydos.corpus.ngram |
|
22
|
|
|
""" |
|
23
|
|
|
|
|
24
|
|
|
from __future__ import unicode_literals |
|
25
|
|
|
|
|
26
|
|
|
import unittest |
|
27
|
|
|
from collections import Counter |
|
28
|
|
|
|
|
29
|
|
|
from abydos.corpus.corpus import Corpus |
|
30
|
|
|
from abydos.corpus.ngram import NGramCorpus |
|
31
|
|
|
|
|
32
|
|
|
from .. import _corpus_file |
|
33
|
|
|
|
|
34
|
|
|
|
|
35
|
|
|
class NGramCorpusTestCases(unittest.TestCase):
    """Exercise abydos.corpus.ngram.NGramCorpus.

    Every corpus used by the test methods is created once, as a class
    attribute, while the class body is being executed.
    """

    # Corpus filled from the 'simple-ngrams.txt' fixture via gng_importer.
    simple_corpus = NGramCorpus()
    simple_corpus.gng_importer(_corpus_file('simple-ngrams.txt'))

    # The same fixture imported twice into a single corpus;
    # test_gng_importer expects its counts to be twice those of
    # simple_corpus.
    double_corpus = NGramCorpus()
    double_corpus.gng_importer(_corpus_file('simple-ngrams.txt'))
    double_corpus.gng_importer(_corpus_file('simple-ngrams.txt'))

    # Sample text. The literal is continued with trailing backslashes, so
    # the leading indentation of every continuation line is part of the
    # string. That whitespace is what keeps the last word of one line apart
    # from the first word of the next; the counts asserted below depend on
    # it, so it must not be stripped.
    sotu2015Sample = 'Mr. Speaker, Mr. Vice President, Members of Congress, my\
    fellow Americans:\n\nWe are 15 years into this new century.\n Fifteen\
    years that dawned with terror touching our shores; that unfolded with a\
    new generation fighting two long and costly wars; that saw a vicious\
    recession spread across our nation and the world.\n It has been, and still\
    is, a hard time for many.\n\nBut tonight, we turn the page.\n Tonight,\
    after a breakthrough year for America, our economy is growing and creating\
    jobs at the fastest pace since 1999.\n Our unemployment rate is now lower\
    than it was before the financial crisis.\n More of our kids are graduating\
    than ever before.\n More of our people are insured than ever before.\n And\
    we are as free from the grip of foreign oil as we\'ve been in almost 30\
    years.\n\nTonight, for the first time since 9/11, our combat mission in\
    Afghanistan is over.\n Six years ago, nearly 180,000 American troops\
    served in Iraq and Afghanistan.\n Today, fewer than 15,000 remain.\n And\
    we salute the courage and sacrifice of every man and woman in this 9/11\
    Generation who has served to keep us safe.\n We are humbled and grateful\
    for your service.\n\nAmerica, for all that we have endured; for all the\
    grit and hard work required to come back; for all the tasks that lie\
    ahead, know this: The shadow of crisis has passed, and the State of the\
    Union is strong.\n\nAt this moment -- with a growing economy, shrinking\
    deficits, bustling industry, booming energy production -- we have risen\
    from recession freer to write our own future than any other nation on\
    Earth.\n It\'s now up to us to choose who we want to be over the next 15\
    years and for decades to come.\n\nWill we accept an economy where only a\
    few of us do spectacularly well?\n Or will we commit ourselves to an\
    economy that generates rising incomes and chances for everyone who makes\
    the effort?\n\nWill we approach the world fearful and reactive, dragged\
    into costly conflicts that strain our military and set back our\
    standing?\n Or will we lead wisely, using all elements of our power to\
    defeat new threats and protect our planet?\n\nWill we allow ourselves to\
    be sorted into factions and turned against one another?\n Or will we\
    recapture the sense of common purpose that has always propelled America\
    forward?\n\nIn two weeks, I will send this Congress a budget filled with\
    ideas that are practical, not partisan.\n And in the months ahead, I\'ll\
    crisscross the country making a case for those ideas.\n So tonight, I want\
    to focus less on a checklist of proposals, and focus more on the values at\
    stake in the choices before us.'
    sotu2015Corpus = Corpus(sotu2015Sample, filter_chars='.?-;,:')

    # N-gram corpus built straight from the constructor.
    sotu_ngcorpus_uni = NGramCorpus(sotu2015Corpus)

    # N-gram corpus built with n=3 and explicit '<SOS>'/'<EOS>' markers.
    sotu_ngcorpus_tri = NGramCorpus()
    sotu_ngcorpus_tri.corpus_importer(sotu2015Corpus, 3, '<SOS>', '<EOS>')

    # N-gram corpus built with n=5 and empty start/end markers.
    sotu_ngcorpus_5 = NGramCorpus()
    sotu_ngcorpus_5.corpus_importer(sotu2015Corpus, 5, '', '')

    # Ten 'a' tokens, imported with n=15 (the attribute name says 5, but
    # 15 is the value actually passed).
    simple_ngcorpus_5 = NGramCorpus()
    simple_ngcorpus_5.corpus_importer(Corpus(' '.join(['a'] * 10)), 15)

    def test_init(self):
        """Test abydos.corpus.ngram.__init__."""
        # Construction without arguments.
        self.assertIsInstance(NGramCorpus(), NGramCorpus)

        # A plain list is expected to be rejected.
        with self.assertRaises(TypeError):
            NGramCorpus(['a', 'b', 'c'])

        # Construction from a Corpus instance.
        from_corpus = NGramCorpus(self.sotu2015Corpus)
        self.assertIsInstance(from_corpus, NGramCorpus)

    def test_corpus_importer(self):
        """Test abydos.corpus.ngram.corpus_importer."""
        importer = self.sotu_ngcorpus_5.corpus_importer
        # A bare string in place of a Corpus must raise ...
        with self.assertRaises(TypeError):
            importer('a b c d')
        # ... and so must a call that supplies no corpus at all.
        with self.assertRaises(TypeError):
            importer()

        sotu_pair = (self.sotu_ngcorpus_uni, self.sotu_ngcorpus_tri)
        for ngc in sotu_pair:
            self.assertIsInstance(ngc, NGramCorpus)
        for ngc in sotu_pair:
            self.assertIsInstance(ngc.ngcorpus, Counter)

        # The corpus holds ten 'a' tokens, so a run of k of them is
        # expected 11 - k times, and never once k exceeds 10.
        a_count = self.simple_ngcorpus_5.get_count
        for run_length in range(1, 16):
            self.assertEqual(a_count(' '.join('a' * run_length)),
                             max(0, 11 - run_length))

        # Default boundary markers: only the forward-ordered bigrams exist.
        boundary_cases = (
            ('_START_ a', 1),
            ('a _END_', 1),
            ('_END_ a', 0),
            ('a _START_', 0),
            ('_START_ a _END_', 0),
            ('_END_ a _START_', 0),
        )
        for ngram, expected in boundary_cases:
            self.assertEqual(a_count(ngram), expected)

        uni_count = self.sotu_ngcorpus_uni.get_count
        tri_count = self.sotu_ngcorpus_tri.get_count
        # (n-gram, expected count in uni corpus, expected count in tri corpus)
        sotu_cases = (
            ('Mr', 2, 2),
            ('the', 19, 19),
            ('to come', 0, 2),
        )
        for ngram, in_uni, in_tri in sotu_cases:
            self.assertEqual(uni_count(ngram), in_uni)
            self.assertEqual(tri_count(ngram), in_tri)

        # The '<SOS>' marker was only supplied to the trigram corpus.
        self.assertEqual(tri_count('<SOS> And'), 3)
        self.assertGreater(tri_count('<SOS> And'),
                           self.sotu_ngcorpus_5.get_count('<SOS> And'))

    def test_gng_importer(self):
        """Test abydos.corpus.ngram.gng_importer."""
        loaded = self.simple_corpus
        self.assertIsInstance(loaded, NGramCorpus)
        self.assertIsInstance(loaded.ngcorpus, Counter)

        # Importing the same file twice doubles the stored count.
        self.assertEqual(loaded.get_count('the'), 20)
        self.assertEqual(self.double_corpus.get_count('the'), 40)

    def test_get_count(self):
        """Test abydos.corpus.ngram.get_count."""
        count = self.simple_corpus.get_count
        expectations = (
            ('the', 20),
            ('the quick', 2),
            ('trolley', 0),
        )

        # Queries given as space-separated strings.
        for ngram, expected in expectations:
            self.assertEqual(count(ngram), expected)

        # The same queries given as lists of tokens.
        for ngram, expected in expectations:
            self.assertEqual(count(ngram.split()), expected)

    def test_tf(self):
        """Test abydos.corpus.ngram.tf."""
        tf_uni = self.sotu_ngcorpus_uni.tf

        # A term absent from the corpus.
        self.assertEqual(tf_uni('Niall'), 0)

        # Single terms that are present.
        self.assertAlmostEqual(tf_uni('the'), 2.2787536)
        self.assertAlmostEqual(tf_uni('America'), 1.4771213)

        # Two-word arguments are expected to raise ValueError.
        for bigram in ('the sense', 'the world'):
            with self.assertRaises(ValueError):
                self.sotu_ngcorpus_tri.tf(bigram)
|
183
|
|
|
|
|
184
|
|
|
|
|
185
|
|
|
if __name__ == '__main__':
    # Run every test case in this module when the file is executed
    # directly rather than imported.
    unittest.main()
|
187
|
|
|
|