#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Gus Hahn-Powell 2015
# data structures for storing processors-server output
# based on conventions from the CLU lab's processors library (https://github.com/clulab/processors)
from __future__ import unicode_literals
from itertools import chain
from collections import defaultdict
#from six import text_type
import json
import re

class Document(object):

    def __init__(self, sentences, text=None):
        self.size = len(sentences)
        self.sentences = sentences
        # easily access token attributes from all sentences
        self.words = list(chain(*[s.words for s in self.sentences]))
        self.tags = list(chain(*[s.tags for s in self.sentences]))
        self.lemmas = list(chain(*[s.lemmas for s in self.sentences]))
        self._entities = list(chain(*[s._entities for s in self.sentences]))
        self.nes = self._merge_ne_dicts()
        self.bag_of_labeled_deps = list(chain(*[s.dependencies.labeled for s in self.sentences]))
        self.bag_of_unlabeled_deps = list(chain(*[s.dependencies.unlabeled for s in self.sentences]))
        self.text = text if text else " ".join(self.words)

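    # Example (illustrative, not from a real processors-server response):
    # for a two-sentence document, the token attributes above are flattened
    # across sentences, e.g.
    #   ["I", "ran", "."] + ["You", "slept", "."]
    #   -> doc.words == ["I", "ran", ".", "You", "slept", "."]
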
    def __unicode__(self):
        return self.text

    def __str__(self):
        return "Document w/ {} Sentence{}".format(self.size, "" if self.size == 1 else "s")

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.__dict__ == other.__dict__
        else:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)
44
|
|
|
|
45
|
|
|
def bag_of_labeled_dependencies_using(self, form): |
46
|
|
|
return list(chain(*[s.labeled_dependencies_using(form) for s in self.sentences])) |
47
|
|
|
|
48
|
|
|
def bag_of_unlabeled_dependencies_using(self, form): |
49
|
|
|
return list(chain(*[s.unlabeled_dependencies_using(form) for s in self.sentences])) |
50
|
|
|
|
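    # Example (illustrative): for a one-sentence document over
    # "Obama visited Paris", bag_of_labeled_dependencies_using("words")
    # might yield entries like "visited_NSUBJ_obama" and "visited_DOBJ_paris",
    # while the unlabeled variant drops the relation: "visited_obama", etc.
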
    def _merge_ne_dicts(self):
        # Get the set of all NE labels found in the Doc's sentences
        entity_labels = set(chain(*[s.nes.keys() for s in self.sentences]))
        # Do we have any labels?  (the set is empty when no sentence has NEs;
        # it can never be None, so test for emptiness rather than identity)
        if not entity_labels:
            return None
        # If we have labels, consolidate the NEs under the appropriate label
        else:
            nes_dict = dict()
            for e in entity_labels:
                entities = []
                for s in self.sentences:
                    entities += s.nes[e]
                nes_dict[e] = entities
            return nes_dict

    def to_JSON_dict(self):
        doc_dict = dict()
        doc_dict["sentences"] = [s.to_JSON_dict() for s in self.sentences]
        doc_dict["text"] = self.text
        return doc_dict

    def to_JSON(self):
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=4)

    @staticmethod
    def load_from_JSON(json_dict):
        sentences = []
        for s in json_dict["sentences"]:
            kwargs = {
                "words": s["words"],
                "startOffsets": s["startOffsets"],
                "endOffsets": s["endOffsets"],
                "tags": s.get("tags", None),
                "lemmas": s.get("lemmas", None),
                "entities": s.get("entities", None),
                "text": s.get("text", None),
                "dependencies": s.get("dependencies", None)
            }
            sent = Sentence(**kwargs)
            sentences.append(sent)
        return Document(sentences, json_dict.get("text", None))

class Sentence(object):

    UNKNOWN = "UNKNOWN"
    # the O in IOB notation
    NONENTITY = "O"

    def __init__(self, **kwargs):
        self.words = kwargs["words"]
        self.startOffsets = kwargs["startOffsets"]
        self.endOffsets = kwargs["endOffsets"]
        self.length = len(self.words)
        self.tags = self._set_toks(kwargs.get("tags", None))
        self.lemmas = self._set_toks(kwargs.get("lemmas", None))
        self._entities = self._set_toks(kwargs.get("entities", None))
        self.text = kwargs.get("text", None) or " ".join(self.words)
        self.dependencies = self._build_dependencies_from_dict(kwargs.get("dependencies", None))
        self.nes = self._set_nes(self._entities)

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.__dict__ == other.__dict__
        else:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def _set_toks(self, toks):
        return toks if toks else [self.UNKNOWN]*self.length

    def _set_nes(self, entities):
        """
        Consolidates consecutive NEs under the appropriate label
        """
        entity_dict = defaultdict(list)
        # initialize to empty label
        current = Sentence.NONENTITY
        start = None
        end = None
        for i, e in enumerate(entities):
            # we don't have an entity tag
            if e == Sentence.NONENTITY:
                # did we have an entity with the last token?
                if current == Sentence.NONENTITY:
                    continue
                else:
                    # the last sequence has ended
                    end = i
                    # store the entity
                    named_entity = ' '.join(self.words[start:end])
                    entity_dict[current].append(named_entity)
                    # reset our book-keeping vars
                    current = Sentence.NONENTITY
                    start = None
                    end = None
            # we have an entity tag!
            else:
                # our old sequence continues
                if e == current:
                    end = i
                # our old sequence has ended
                else:
                    # do we have a previous NE?
                    if current != Sentence.NONENTITY:
                        end = i
                        named_entity = ' '.join(self.words[start:end])
                        entity_dict[current].append(named_entity)
                    # update our book-keeping vars
                    current = e
                    start = i
                    end = None
        # flush an entity that runs through the final token
        # (otherwise a sentence-final NE would be silently dropped)
        if current != Sentence.NONENTITY:
            named_entity = ' '.join(self.words[start:])
            entity_dict[current].append(named_entity)
        # this might be empty
        return entity_dict

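    # Example (illustrative): given words ["Barack", "Obama", "visited", "Paris"]
    # and entities ["PERSON", "PERSON", "O", "LOCATION"], _set_nes returns
    # {"PERSON": ["Barack Obama"], "LOCATION": ["Paris"]} (as a defaultdict).
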
    def _build_dependencies_from_dict(self, deps):
        # deps may be missing or empty, in which case there is nothing to build
        if deps:
            return Dependencies(deps, self.words)
        return None

    def __unicode__(self):
        return self.text

    def to_string(self):
        return ' '.join("{w}__{p}".format(w=self.words[i], p=self.tags[i]) for i in range(self.length))

    def labeled_dependencies_using(self, form):
        """
        Generates a list of labeled dependencies for a sentence
        using "words", "tags", "lemmas", "entities", or token index ("index")
        """
        f = form.lower()
        if f == "words":
            tokens = self.words
        elif f == "tags":
            tokens = self.tags
        elif f == "lemmas":
            tokens = self.lemmas
        elif f == "entities":
            # use the per-token entity labels; self.nes maps labels to entity
            # strings and cannot be indexed by token position
            tokens = self._entities
        elif f == "index":
            tokens = list(range(self.length))
        else:
            raise ValueError('form must be "words", "tags", "lemmas", "entities", or "index"')
        deps = self.dependencies
        labeled = []
        for out in deps.outgoing:
            for (dest, rel) in deps.outgoing[out]:
                labeled.append("{}_{}_{}".format(tokens[out], rel.upper(), tokens[dest]))
        return labeled

    def unlabeled_dependencies_using(self, form):
        """
        Generate a list of unlabeled dependencies for a sentence
        using "words", "tags", "lemmas", "entities", or token index ("index")
        """
        unlabeled = []
        for sd in self.labeled_dependencies_using(form):
            # the relation itself may contain underscores (e.g. collapsed
            # dependencies like "prep_of"), so keep the first and last fields
            # rather than unpacking exactly three
            parts = sd.split("_")
            unlabeled.append("{}_{}".format(parts[0], parts[-1]))
        return unlabeled

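    # Example (illustrative): if labeled_dependencies_using("words") yields
    # "visited_NSUBJ_obama", the unlabeled form is "visited_obama".
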
    def to_JSON_dict(self):
        sentence_dict = dict()
        sentence_dict["words"] = self.words
        sentence_dict["startOffsets"] = self.startOffsets
        sentence_dict["endOffsets"] = self.endOffsets
        sentence_dict["tags"] = self.tags
        sentence_dict["lemmas"] = self.lemmas
        sentence_dict["entities"] = self._entities
        # dependencies may be None (see _build_dependencies_from_dict)
        if self.dependencies:
            sentence_dict["dependencies"] = self.dependencies.to_JSON_dict()
        return sentence_dict

    def to_JSON(self):
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=4)

    @staticmethod
    def load_from_JSON(json_dict):
        sent = Sentence(
            words=json_dict["words"],
            startOffsets=json_dict["startOffsets"],
            endOffsets=json_dict["endOffsets"],
            lemmas=json_dict.get("lemmas", None),
            tags=json_dict.get("tags", None),
            entities=json_dict.get("entities", None),
            text=json_dict.get("text", None),
            dependencies=json_dict.get("dependencies", None)
        )
        return sent


class Dependencies(object):
    """
    Storage class for Stanford-style dependencies
    """
    def __init__(self, deps, words):
        self._words = [w.lower() for w in words]
        self.deps = self.unpack_deps(deps)
        self.roots = deps.get("roots", [])
        self.edges = deps["edges"]
        self.incoming = self._build_incoming(self.deps)
        self.outgoing = self._build_outgoing(self.deps)
        self.labeled = self._build_labeled()
        self.unlabeled = self._build_unlabeled()

    def __unicode__(self):
        # self.deps is a list of (destination, source, relation) triples;
        # format it so that a text type is returned, as __unicode__ requires
        return "{}".format(self.deps)

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.__dict__ == other.__dict__
        else:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def unpack_deps(self, deps):
        dependencies = []
        for edge in deps["edges"]:
            # NOTE: "outgoing" is the edge's source (head) and "incoming" its
            # destination (dependent); tuples are (destination, source, relation)
            outgoing = edge['source']
            incoming = edge['destination']
            rel = edge['relation']
            dependencies.append((incoming, outgoing, rel))
        return dependencies

    def _build_incoming(self, deps):
        # map each token to the (source, relation) pairs of its incoming edges
        dep_dict = defaultdict(list)
        for (incoming, outgoing, rel) in deps:
            dep_dict[incoming].append((outgoing, rel))
        return dep_dict

    def _build_outgoing(self, deps):
        # map each token to the (destination, relation) pairs of its outgoing edges
        dep_dict = defaultdict(list)
        for (incoming, outgoing, rel) in deps:
            dep_dict[outgoing].append((incoming, rel))
        return dep_dict

    def _build_labeled(self):
        labeled = []
        for out in self.outgoing:
            for (dest, rel) in self.outgoing[out]:
                labeled.append("{}_{}_{}".format(self._words[out], rel.upper(), self._words[dest]))
        return labeled

    def _build_unlabeled(self):
        unlabeled = []
        for out in self.outgoing:
            for (dest, _) in self.outgoing[out]:
                unlabeled.append("{}_{}".format(self._words[out], self._words[dest]))
        return unlabeled

    def to_JSON_dict(self):
        deps_dict = dict()
        deps_dict["edges"] = self.edges
        deps_dict["roots"] = self.roots
        return deps_dict

    def to_JSON(self):
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=4)
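

# ---------------------------------------------------------------------------
# Minimal smoke test sketching intended usage.  The sentence, offsets, and
# dependency edges below are illustrative stand-ins, not output captured from
# a real processors-server instance.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    example = {
        "words": ["Barack", "Obama", "visited", "Paris", "."],
        "startOffsets": [0, 7, 13, 21, 26],
        "endOffsets": [6, 12, 20, 26, 27],
        "tags": ["NNP", "NNP", "VBD", "NNP", "."],
        "lemmas": ["Barack", "Obama", "visit", "Paris", "."],
        "entities": ["PERSON", "PERSON", "O", "LOCATION", "O"],
        "dependencies": {
            "roots": [2],
            "edges": [
                {"source": 1, "destination": 0, "relation": "nn"},
                {"source": 2, "destination": 1, "relation": "nsubj"},
                {"source": 2, "destination": 3, "relation": "dobj"},
                {"source": 2, "destination": 4, "relation": "punct"}
            ]
        }
    }
    sent = Sentence.load_from_JSON(example)
    doc = Document([sent])
    print(doc)                        # Document w/ 1 Sentence
    print(dict(sent.nes))             # {'PERSON': ['Barack Obama'], 'LOCATION': ['Paris']}
    print(sent.dependencies.labeled)  # ['obama_NN_barack', 'visited_NSUBJ_obama', ...]
    # round-trip through JSON and back
    doc2 = Document.load_from_JSON(json.loads(doc.to_JSON()))
    print(doc == doc2)                # expected: True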