1
|
|
|
import unittest |
2
|
|
|
from namebot import normalization as norm |
3
|
|
|
|
4
|
|
|
|
5
|
|
|
class RemoveOddWordTestCase(unittest.TestCase):
    """Exercise ``norm.remove_odd_sounding_words`` with lists, None and []."""

    def test_remove_odd_sounding_words(self):
        """Expect exactly two of the four sample words to be dropped."""
        # Presumably 'bking' and 'aaaeee' are the odd-sounding entries;
        # only the resulting count is verified here, not which words went.
        candidates = ['bking', 'aaaeee', 'flower', 'rabbit']
        # Capture the expectation before the call so that an in-place
        # mutation of the input cannot influence it.
        expected_count = len(candidates) - 2
        survivors = norm.remove_odd_sounding_words(candidates)
        self.assertEqual(len(survivors), expected_count)

    def test_no_remove_odd_sounding_words(self):
        """Expect ordinary words to pass through with none removed."""
        candidates = ['flower', 'rabbit']
        expected_count = len(candidates)
        survivors = norm.remove_odd_sounding_words(candidates)
        self.assertEqual(len(survivors), expected_count)

    def test_none_remove_odd_sounding_words(self):
        """Expect a ``None`` input to be handed back as ``None``."""
        nothing = None
        result = norm.remove_odd_sounding_words(nothing)
        self.assertEqual(result, nothing)

    def test_empty_remove_odd_sounding_words(self):
        """Expect an empty list to yield a result equal to the empty input."""
        empty = []
        result = norm.remove_odd_sounding_words(empty)
        # The input itself must still be empty after the call ...
        self.assertEqual(len(empty), 0)
        # ... and the returned value must compare equal to it.
        self.assertEqual(empty, result)
35
|
|
|
|
36
|
|
|
|
37
|
|
|
class StemWordsTestCase(unittest.TestCase):
    """Checks for ``norm.stem_words``."""

    def test_stem_words(self):
        """Expect inflected forms to be reduced to their stems."""
        result = norm.stem_words(['running', 'jumping'])
        self.assertEqual(['run', 'jump'], result)

    def test_no_stem_words(self):
        """Expect words that are already stems to come back unchanged."""
        roots = ['run', 'jump']
        result = norm.stem_words(roots)
        self.assertEqual(roots, result)
49
|
|
|
|
50
|
|
|
|
51
|
|
|
class RemoveBadWordsTestCase(unittest.TestCase):
    """Checks for ``norm.remove_bad_words``."""

    # NOTE(review): the method name looks like a copy/paste leftover -- it
    # exercises remove_bad_words, not the stemmer.  It is kept unchanged so
    # that existing test IDs / selectors continue to work.
    def test_stem_words(self):
        """Expect every profane word to be absent from the cleaned output."""
        bad_words = ['fuck', 'pussy', 'cunt']
        words = bad_words + ['cool', 'neat', 'rad']
        cleaned = norm.remove_bad_words(words)
        # Sanity check: the result must not be exactly the profanity list.
        self.assertNotEqual(bad_words, cleaned)
        for bad_word in bad_words:
            # assertNotIn reports both the word and the container on
            # failure, unlike assertFalse(x in y) which only says
            # "True is not false".
            self.assertNotIn(bad_word, cleaned)
60
|
|
|
|
61
|
|
|
|
62
|
|
|
class RemoveStopWordsTestCase(unittest.TestCase):
    """Checks for ``norm.remove_stop_words``."""

    # NOTE(review): despite its name this test covers stop-word removal,
    # not long-word filtering; the name is kept for test-ID compatibility.
    def test_filter_long_words(self):
        """Expect a list made up solely of stop words to be emptied."""
        only_stop_words = ['the', 'is', 'are', 'am', 'but']
        remaining = norm.remove_stop_words(only_stop_words)
        self.assertEqual(len(remaining), 0)
68
|
|
|
|
69
|
|
|
|
70
|
|
|
class FilterWordsTestCase(unittest.TestCase):
    """Checks for ``norm.filter_words``."""

    def test_filter_long_words(self):
        """Expect very long words to be absent from the filtered output."""
        long_words = ['areallyverylongword', 'anextrareallyverylongword']
        words = long_words + ['normal', 'words']
        filtered = norm.filter_words(words)
        for long_word in long_words:
            # assertNotIn names the offending word and the container when
            # it fails, which assertFalse(x in y) does not.
            self.assertNotIn(long_word, filtered)
78
|
|
|
|
79
|
|
|
|
80
|
|
|
class UniquifyTestCase(unittest.TestCase):
    """Checks for ``norm.uniquify``."""

    def test_uniquify(self):
        """Expect five entries holding two distinct values to yield two."""
        with_repeats = ['cool', 'neat', 'cool', 'cool', 'neat']
        distinct = norm.uniquify(with_repeats)
        self.assertEqual(len(distinct), 2)
85
|
|
|
|
86
|
|
|
|
87
|
|
|
class CleanSortTestCase(unittest.TestCase):
    """Checks for ``norm.clean_sort`` with list and plain-string inputs."""

    def test_clean_sort(self):
        """Expect punctuation, digits and underscores stripped per word."""
        noisy = ['!@foobar!#', 'ba3z!@#33_', 'bam!333____#33']
        tidy = norm.clean_sort(noisy)
        self.assertEqual(tidy, ['foobar', 'baz', 'bam'])

    def test_clean_string(self):
        """Expect a bare string (not a list) to be returned untouched."""
        raw = '!@foobar!#'
        result = norm.clean_sort(raw)
        self.assertEqual(result, raw)
97
|
|
|
|
98
|
|
|
|
99
|
|
|
class ChopDuplicateEndsTestCase(unittest.TestCase):
    """Checks for ``norm.chop_duplicate_ends``."""

    def test_basic(self):
        """Expect a doubled letter at both ends to be reduced to one."""
        result = norm.chop_duplicate_ends('aabracadabraa')
        self.assertEqual(result, 'abracadabra')

    def test_one_end(self):
        """Expect a doubled letter at only one end to be reduced as well."""
        doubled_start = norm.chop_duplicate_ends('aabracadabra')
        doubled_finish = norm.chop_duplicate_ends('abracadabraa')
        self.assertEqual(doubled_start, 'abracadabra')
        self.assertEqual(doubled_finish, 'abracadabra')
110
|
|
|
|
111
|
|
|
|
112
|
|
|
class KeyWordsByPosTagTestCase(unittest.TestCase):
    """Checks for ``norm.key_words_by_pos_tag``."""

    def test_basic(self):
        """Expect (word, tag) pairs to be grouped into per-tag word lists."""
        tagged = [
            ('Monkey', 'NNP'),
            ('Fly', 'VBG'),
            ('Fly', 'RB'),
            ('Dog', 'NNP'),
            ('Cat', 'NNP'),
        ]
        grouped = norm.key_words_by_pos_tag(tagged)
        by_tag = {
            'RB': ['Fly'],
            'NNP': ['Monkey', 'Dog', 'Cat'],
            'VBG': ['Fly'],
        }
        # The result is passed through dict() before comparing, so any
        # mapping type accepted by dict() is compared as a plain dict.
        self.assertEqual(dict(grouped), by_tag)
125
|
|
|
|