1
|
1 |
|
import json |
2
|
1 |
|
import itertools |
3
|
1 |
|
from nltk.stem.wordnet import WordNetLemmatizer |
4
|
1 |
|
from ppp_questionparsing_grammatical import Word, DependenciesTree, computeTree, NamedEntityMerging, PrepositionMerging |
5
|
1 |
|
import data |
6
|
|
|
|
7
|
1 |
|
from unittest import TestCase |
8
|
|
|
|
9
|
1 |
|
class PreprocessingMergeTests(TestCase):
    """Tests for NamedEntityMerging and PrepositionMerging on dependency trees."""

    # Named-entity tags exercised by the basic named-entity merging tests.
    _namedEntityTags = ('LOCATION', 'PERSON', 'NUMBER', 'MONEY', 'MISC')

    def _assertNode(self, node, wordList, namedEntityTag, dependency, parent, childCount):
        """Assert the fields of one tree node checked by the sentence-level tests.

        node           -- the DependenciesTree node under inspection
        wordList       -- expected value of node.wordList
        namedEntityTag -- expected value of node.namedEntityTag
        dependency     -- expected value of node.dependency
        parent         -- expected value of node.parent (None for the root)
        childCount     -- expected length of node.child

        subtreeType and dfsTag are always expected to be 'undef' and 0,
        which is what every sentence-level check in this class asserts.
        """
        self.assertEqual(node.wordList, wordList)
        self.assertEqual(node.namedEntityTag, namedEntityTag)
        self.assertEqual(node.dependency, dependency)
        if parent is None:
            self.assertIsNone(node.parent)
        else:
            self.assertEqual(node.parent, parent)
        self.assertEqual(len(node.child), childCount)
        self.assertEqual(node.subtreeType, 'undef')
        self.assertEqual(node.dfsTag, 0)

    def testBasicNamedEntityChildParent(self):
        """Check parent/child merging for equal and for different entity tags."""
        # Parent and child carry the same tag.
        for tag in self._namedEntityTags:
            parent = DependenciesTree('parent', 1, namedEntityTag=tag)
            child = DependenciesTree('child', 2, parent=parent, namedEntityTag=tag)
            parent.child.append(child)
            # With a 'conj_and' edge the two nodes are expected to stay apart.
            child.dependency = 'conj_and'
            NamedEntityMerging(parent).merge()
            self.assertEqual(parent.wordList, [Word('parent', 1)])
            self.assertEqual(parent.child, [child])
            self.assertEqual(child.parent, parent)
            # Once the edge is relabelled 'foo', the child's word is expected
            # to end up in the parent and the child node to disappear.
            child.dependency = 'foo'
            NamedEntityMerging(parent).merge()
            self.assertIn(Word('parent', 1), parent.wordList)
            self.assertIn(Word('child', 2), parent.wordList)
            self.assertEqual(parent.child, [])
        # Parent and child carry different tags: nothing is expected to merge,
        # whatever the edge label.
        for (tag1, tag2) in itertools.permutations(self._namedEntityTags, 2):
            parent = DependenciesTree('parent', 1, namedEntityTag=tag1)
            child = DependenciesTree('child', 2, parent=parent, namedEntityTag=tag2)
            parent.child.append(child)
            for dependency in ('conj_and', 'foo'):
                child.dependency = dependency
                NamedEntityMerging(parent).merge()
                self.assertEqual(parent.wordList, [Word('parent', 1)])
                self.assertEqual(parent.child, [child])
                self.assertEqual(child.parent, parent)

    def testBasicNamedEntitySisterBrother(self):
        """Check merging of two sibling nodes under an untagged parent."""
        # Both siblings carry the same tag.
        for tag in self._namedEntityTags:
            parent = DependenciesTree('parent', 1, namedEntityTag='undef')
            child1 = DependenciesTree('child1', 2, parent=parent, dependency='conj_and', namedEntityTag=tag)
            child2 = DependenciesTree('child2', 3, parent=parent, dependency='conj_and', namedEntityTag=tag)
            parent.child += [child1, child2]
            # With 'conj_and' edges the siblings are expected to stay apart.
            NamedEntityMerging(parent).merge()
            self.assertEqual(parent.wordList, [Word('parent', 1)])
            self.assertEqual(parent.child, [child1, child2])
            self.assertEqual(child1.parent, parent)
            self.assertEqual(child2.parent, parent)
            # With 'foo' edges the siblings are expected to collapse into a
            # single child that holds both words.
            child1.dependency = 'foo'
            child2.dependency = 'foo'
            NamedEntityMerging(parent).merge()
            self.assertEqual(parent.wordList, [Word('parent', 1)])
            self.assertEqual(len(parent.child), 1)
            merged = parent.child[0]
            self.assertIn(Word('child1', 2), merged.wordList)
            self.assertIn(Word('child2', 3), merged.wordList)
            self.assertEqual(merged.parent, parent)
        # Siblings carry different tags: nothing is expected to merge,
        # whatever the edge label.
        for (tag1, tag2) in itertools.permutations(self._namedEntityTags, 2):
            parent = DependenciesTree('parent', 1, namedEntityTag='undef')
            child1 = DependenciesTree('child1', 2, parent=parent, dependency='conj_and', namedEntityTag=tag1)
            child2 = DependenciesTree('child2', 3, parent=parent, dependency='conj_and', namedEntityTag=tag2)
            parent.child += [child1, child2]
            for dependency in ('conj_and', 'foo'):
                child1.dependency = dependency
                child2.dependency = dependency
                NamedEntityMerging(parent).merge()
                self.assertEqual(parent.wordList, [Word('parent', 1)])
                self.assertEqual(parent.child, [child1, child2])
                self.assertEqual(child1.parent, parent)
                self.assertEqual(child2.parent, parent)

    def testBasicPrepositionNode(self):
        """Check merging of a child node whose word is a preposition."""
        # A child whose word is 'child' is expected to be left untouched.
        parent = DependenciesTree('parent', 1)
        child = DependenciesTree('child', 2, parent=parent, dependency='foo')
        parent.child.append(child)
        PrepositionMerging(parent).merge()
        self.assertEqual(parent.wordList, [Word('parent', 1)])
        self.assertEqual(parent.child, [child])
        self.assertEqual(child.parent, parent)
        # A child whose word is in prepositionSet is expected to be absorbed
        # by its parent.
        for prep in PrepositionMerging.prepositionSet:
            parent = DependenciesTree('parent', 1)
            child = DependenciesTree(prep, 2, parent=parent, dependency='foo')
            parent.child.append(child)
            child.dependency = 'conj_and'
            PrepositionMerging(parent).merge()
            self.assertIn(Word('parent', 1), parent.wordList)
            self.assertIn(Word(prep, 2), parent.wordList)
            self.assertEqual(parent.child, [])

    def testBasicPrepositionEdge(self):
        """Check merging of a preposition carried by the edge label."""
        # 'prep_<x>' edges: <x> is expected to be appended to the parent word
        # and the edge relabelled 'prep'.
        for prep in ['in', 'of', 'with', 'by']:
            parent = DependenciesTree('parent', 1)
            parent.wordList[0].pos = 'VB'
            child = DependenciesTree('child', 2, parent=parent, dependency='prep_' + prep)
            parent.child.append(child)
            PrepositionMerging(parent).merge()
            self.assertEqual(parent.wordList, [Word('parent ' + prep, 1, 'VB')])
            self.assertEqual(parent.child, [child])
            self.assertEqual(child.dependency, 'prep')
        # 'agent' edge: 'by' is expected to be appended to the parent word.
        parent = DependenciesTree('parent', 1)
        parent.wordList[0].pos = 'VB'
        child = DependenciesTree('child', 2, parent=parent, dependency='agent')
        parent.child.append(child)
        PrepositionMerging(parent).merge()
        self.assertEqual(parent.wordList, [Word('parent by', 1, 'VB')])
        self.assertEqual(parent.child, [child])

    def testNamedEntity1(self):
        """Check every node of the give_john_smith tree after entity merging."""
        tree = computeTree(data.give_john_smith())
        NamedEntityMerging(tree).merge()
        tree.sort()
        root = tree
        # Root
        self._assertNode(root, [Word("ROOT", 0)], 'undef', 'undef', None, 1)
        # Lives
        lives = root.child[0]
        self._assertNode(lives, [Word("lives", 3, 'VBZ')], 'undef', 'ROOT', tree, 2)
        # John Smith
        smith = lives.child[0]
        self._assertNode(smith, [Word("John", 1, 'NNP'), Word("Smith", 2, 'NNP')],
                         'PERSON', 'nsubj', lives, 0)
        # United Kingdom
        kingdom = lives.child[1]
        self._assertNode(kingdom, [Word("United", 6, 'NNP'), Word("Kingdom", 7, 'NNP')],
                         'LOCATION', 'prep_in', lives, 1)
        # The
        the = kingdom.child[0]
        self._assertNode(the, [Word("the", 5, 'DT')], 'undef', 'det', kingdom, 0)

    def testNamedEntity2(self):
        """Check every node of the give_obama_president_usa tree after entity merging."""
        tree = computeTree(data.give_obama_president_usa())
        NamedEntityMerging(tree).merge()
        tree.sort()
        root = tree
        # Root
        self._assertNode(root, [Word("ROOT", 0)], 'undef', 'undef', None, 1)
        # Is
        is_ = root.child[0]
        self._assertNode(is_, [Word("is", 2, 'VBZ')], 'undef', 'ROOT', tree, 2)
        # Obama
        obama = is_.child[0]
        self._assertNode(obama, [Word("Obama", 1, 'NNP')], 'PERSON', 'nsubj', is_, 0)
        # president
        president = is_.child[1]
        self._assertNode(president, [Word("president", 6, 'NN')], 'undef', 'xcomp', is_, 2)
        # The
        the = president.child[0]
        self._assertNode(the, [Word("the", 3, 'DT')], 'undef', 'det', president, 0)
        # United States
        united = president.child[1]
        self._assertNode(united, [Word("United", 4, 'NNP'), Word("States", 5, 'NNPS')],
                         'LOCATION', 'compound', president, 0)

    def testStr2(self):
        """Compare str(tree) with the stored string after both merging passes."""
        # Show the full diff if the two strings differ.
        self.maxDiff = None
        tree = computeTree(data.give_john_smith())
        NamedEntityMerging(tree).merge()
        PrepositionMerging(tree).merge()
        tree.sort()
        self.assertEqual(str(tree), data.give_john_smith_stringMerge())
232
|
|
|
|