#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Gus Hahn-Powell 2015
# data structures for storing processors-server output
# based on conventions from the CLU lab's processors library (https://github.com/clulab/processors)
from __future__ import unicode_literals
from itertools import chain
from collections import defaultdict
import json
import re


class Document(object):

    def __init__(self, sentences, text=None):
        self.size = len(sentences)
        self.sentences = sentences
        # easily access token attributes from all sentences
        self.words = list(chain(*[s.words for s in self.sentences]))
        self.tags = list(chain(*[s.tags for s in self.sentences]))
        self.lemmas = list(chain(*[s.lemmas for s in self.sentences]))
        self._entities = list(chain(*[s._entities for s in self.sentences]))
        self.nes = self._merge_ne_dicts()
        # skip sentences parsed without dependencies
        self.bag_of_labeled_deps = list(chain(*[s.dependencies.labeled for s in self.sentences if s.dependencies]))
        self.bag_of_unlabeled_deps = list(chain(*[s.dependencies.unlabeled for s in self.sentences if s.dependencies]))
        self.text = text if text else " ".join(self.words)

    def bag_of_labeled_dependencies_using(self, form):
        return list(chain(*[s.labeled_dependencies_using(form) for s in self.sentences]))

    def bag_of_unlabeled_dependencies_using(self, form):
        return list(chain(*[s.unlabeled_dependencies_using(form) for s in self.sentences]))

    def _merge_ne_dicts(self):
        # Get the set of all NE labels found in the Doc's sentences
        entity_labels = set(chain(*[s.nes.keys() for s in self.sentences]))
        # Do we have any labels?
        if not entity_labels:
            return None
        # If we have labels, consolidate the NEs under the appropriate label
        else:
            nes_dict = dict()
            for e in entity_labels:
                entities = []
                for s in self.sentences:
                    entities += s.nes[e]
                nes_dict[e] = entities
            return nes_dict

    def __unicode__(self):
        return self.text

    def __str__(self):
        return "Document w/ {} Sentence{}".format(self.size, "" if self.size == 1 else "s")

    def to_JSON_dict(self):
        doc_dict = dict()
        doc_dict["sentences"] = [s.to_JSON_dict() for s in self.sentences]
        doc_dict["text"] = self.text
        return doc_dict

    def to_JSON(self):
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=4)

    @staticmethod
    def load_from_JSON(json_dict):
        sentences = []
        for s in json_dict["sentences"]:
            kwargs = {
                "words": s["words"],
                "startOffsets": s["startOffsets"],
                "endOffsets": s["endOffsets"],
                "tags": s.get("tags", None),
                "lemmas": s.get("lemmas", None),
                "entities": s.get("entities", None),
                "text": s.get("text", None),
                "dependencies": s.get("dependencies", None)
            }
            sent = Sentence(**kwargs)
            sentences.append(sent)
        return Document(sentences, json_dict.get("text", None))
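
    # Example (a sketch; the field values below are hand-made illustrations of
    # the payload shape, not output captured from a running processors-server).
    # load_from_JSON expects a dict shaped like:
    #
    #   {"text": "Water freezes .",
    #    "sentences": [{"words": ["Water", "freezes", "."],
    #                   "startOffsets": [0, 6, 14],
    #                   "endOffsets": [5, 13, 15],
    #                   "tags": ["NN", "VBZ", "."],
    #                   "lemmas": ["water", "freeze", "."],
    #                   "entities": ["O", "O", "O"],
    #                   "dependencies": {"edges": [{"source": 1,
    #                                               "destination": 0,
    #                                               "relation": "nsubj"}],
    #                                    "roots": [1]}}]}
    #
    # Round-tripping through Document.load_from_JSON(json.loads(doc.to_JSON()))
    # rebuilds an equivalent Document.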


class Sentence(object):

    UNKNOWN = "UNKNOWN"
    # the O in IOB notation
    NONENTITY = "O"

    def __init__(self, **kwargs):
        self.words = kwargs["words"]
        self.startOffsets = kwargs["startOffsets"]
        self.endOffsets = kwargs["endOffsets"]
        self.length = len(self.words)
        self.tags = self._set_toks(kwargs.get("tags", None))
        self.lemmas = self._set_toks(kwargs.get("lemmas", None))
        self._entities = self._set_toks(kwargs.get("entities", None))
        self.text = kwargs.get("text", None) or " ".join(self.words)
        self.dependencies = self._build_dependencies_from_dict(kwargs.get("dependencies", None))
        self.nes = self._set_nes(self._entities)

    def _set_toks(self, toks):
        return toks if toks else [self.UNKNOWN]*self.length

    def _set_nes(self, entities):
        """
        Consolidates consecutive NEs under the appropriate label
        """
        entity_dict = defaultdict(list)
        # initialize to empty label
        current = Sentence.NONENTITY
        start = None
        end = None
        for i, e in enumerate(entities):
            # we don't have an entity tag
            if e == Sentence.NONENTITY:
                # did we have an entity with the last token?
                if current == Sentence.NONENTITY:
                    continue
                else:
                    # the last sequence has ended
                    end = i
                    # store the entity
                    named_entity = ' '.join(self.words[start:end])
                    entity_dict[current].append(named_entity)
                    # reset our book-keeping vars
                    current = Sentence.NONENTITY
                    start = None
                    end = None
            # we have an entity tag!
            else:
                # our old sequence continues
                if e == current:
                    end = i
                # our old sequence has ended
                else:
                    # do we have a previous NE?
                    if current != Sentence.NONENTITY:
                        end = i
                        named_entity = ' '.join(self.words[start:end])
                        entity_dict[current].append(named_entity)
                    # update our book-keeping vars
                    current = e
                    start = i
                    end = None
        # flush an entity that runs through the final token
        if current != Sentence.NONENTITY:
            named_entity = ' '.join(self.words[start:])
            entity_dict[current].append(named_entity)
        # this might be empty
        return entity_dict
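
    # Worked example (illustrative values):
    #   words    = ["Barack", "Obama", "visited", "Paris"]
    #   entities = ["PERSON", "PERSON", "O", "LOCATION"]
    # yields defaultdict(list, {"PERSON": ["Barack Obama"],
    #                           "LOCATION": ["Paris"]});
    # the post-loop flush above is what captures "Paris", an entity
    # that runs through the final token.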

    def _build_dependencies_from_dict(self, deps):
        # an empty or missing dependency dict means no parse was provided
        if deps:
            return Dependencies(deps, self.words)
        return None

    def __unicode__(self):
        return self.text

    def to_string(self):
        return ' '.join("{w}__{p}".format(w=self.words[i], p=self.tags[i]) for i in range(self.length))
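
    # Example (illustrative): with words ["Water", "freezes"] and tags
    # ["NN", "VBZ"], to_string() returns "Water__NN freezes__VBZ".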

    def labeled_dependencies_using(self, form):
        """
        Generates a list of labeled dependencies for a sentence
        using "words", "tags", "lemmas", "entities", or token index ("index")
        """
        f = form.lower()
        if f == "words":
            tokens = self.words
        elif f == "tags":
            tokens = self.tags
        elif f == "lemmas":
            tokens = self.lemmas
        elif f == "entities":
            # use the token-aligned entity labels (self.nes is keyed by label,
            # so it cannot be indexed by token position)
            tokens = self._entities
        elif f == "index":
            tokens = list(range(self.length))
        else:
            raise ValueError("form must be \"words\", \"tags\", \"lemmas\", \"entities\", or \"index\"")
        deps = self.dependencies
        labeled = []
        for out in deps.outgoing:
            for (dest, rel) in deps.outgoing[out]:
                labeled.append("{}_{}_{}".format(tokens[out], rel.upper(), tokens[dest]))
        return labeled

    def unlabeled_dependencies_using(self, form):
        """
        Generates a list of unlabeled dependencies for a sentence
        using "words", "tags", "lemmas", "entities", or token index ("index")
        """
        unlabeled = []
        for sd in self.labeled_dependencies_using(form):
            # relations may themselves contain underscores (e.g. "prep_of"),
            # so keep only the first and last fields
            parts = sd.split("_")
            unlabeled.append("{}_{}".format(parts[0], parts[-1]))
        return unlabeled
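
    # Example (illustrative): if labeled_dependencies_using("words") yields
    # ["freezes_NSUBJ_water"], unlabeled_dependencies_using("words") yields
    # ["freezes_water"].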

    def to_JSON_dict(self):
        sentence_dict = dict()
        sentence_dict["words"] = self.words
        sentence_dict["startOffsets"] = self.startOffsets
        sentence_dict["endOffsets"] = self.endOffsets
        sentence_dict["tags"] = self.tags
        sentence_dict["lemmas"] = self.lemmas
        sentence_dict["entities"] = self._entities
        sentence_dict["text"] = self.text
        # dependencies are optional (see _build_dependencies_from_dict)
        if self.dependencies is not None:
            sentence_dict["dependencies"] = self.dependencies.to_JSON_dict()
        return sentence_dict

    def to_JSON(self):
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=4)

    @staticmethod
    def load_from_JSON(json_dict):
        sent = Sentence(
            words=json_dict["words"],
            startOffsets=json_dict["startOffsets"],
            endOffsets=json_dict["endOffsets"],
            lemmas=json_dict.get("lemmas", None),
            tags=json_dict.get("tags", None),
            entities=json_dict.get("entities", None),
            text=json_dict.get("text", None),
            dependencies=json_dict.get("dependencies", None)
        )
        return sent


class Dependencies(object):
    """
    Storage class for Stanford-style dependencies
    """
    def __init__(self, deps, words):
        self._words = [w.lower() for w in words]
        self.deps = self.unpack_deps(deps)
        self.roots = deps.get("roots", [])
        self.edges = deps["edges"]
        self.incoming = self._build_incoming(self.deps)
        self.outgoing = self._build_outgoing(self.deps)
        self.labeled = self._build_labeled()
        self.unlabeled = self._build_unlabeled()

    def __unicode__(self):
        # self.deps is a list of triples; render it as text
        return "{}".format(self.deps)

    def unpack_deps(self, deps):
        """
        Converts the graph's "edges" into (source, destination, relation) triples
        """
        dependencies = []
        for edge in deps["edges"]:
            source = edge['source']
            destination = edge['destination']
            rel = edge['relation']
            dependencies.append((source, destination, rel))
        return dependencies

    def _build_incoming(self, deps):
        # map each token index to the (source, relation) pairs of edges arriving at it
        dep_dict = defaultdict(list)
        for (source, destination, rel) in deps:
            dep_dict[destination].append((source, rel))
        return dep_dict

    def _build_outgoing(self, deps):
        # map each token index to the (destination, relation) pairs of edges leaving it
        dep_dict = defaultdict(list)
        for (source, destination, rel) in deps:
            dep_dict[source].append((destination, rel))
        return dep_dict

    def _build_labeled(self):
        labeled = []
        for out in self.outgoing:
            for (dest, rel) in self.outgoing[out]:
                labeled.append("{}_{}_{}".format(self._words[out], rel.upper(), self._words[dest]))
        return labeled

    def _build_unlabeled(self):
        unlabeled = []
        for out in self.outgoing:
            for (dest, _) in self.outgoing[out]:
                unlabeled.append("{}_{}".format(self._words[out], self._words[dest]))
        return unlabeled
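
    # Example (illustrative): for words ["water", "freezes"] and the single
    # edge {"source": 1, "destination": 0, "relation": "nsubj"}:
    #   outgoing  -> {1: [(0, "nsubj")]}
    #   incoming  -> {0: [(1, "nsubj")]}
    #   labeled   -> ["freezes_NSUBJ_water"]
    #   unlabeled -> ["freezes_water"]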

    def to_JSON_dict(self):
        deps_dict = dict()
        deps_dict["edges"] = self.edges
        deps_dict["roots"] = self.roots
        return deps_dict

    def to_JSON(self):
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=4)
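

# ---------------------------------------------------------------------------
# Minimal usage sketch. NOTE: the sentence, tags, offsets, and parse below are
# hand-made illustrations of the expected payload shape, not output captured
# from a running processors-server.
if __name__ == "__main__":
    example = {
        "words": ["Water", "freezes", "."],
        "startOffsets": [0, 6, 14],
        "endOffsets": [5, 13, 15],
        "tags": ["NN", "VBZ", "."],
        "lemmas": ["water", "freeze", "."],
        "entities": ["O", "O", "O"],
        "dependencies": {
            "edges": [{"source": 1, "destination": 0, "relation": "nsubj"}],
            "roots": [1]
        }
    }
    sentence = Sentence.load_from_JSON(example)
    doc = Document([sentence])
    print(doc)                                           # Document w/ 1 Sentence
    print(sentence.to_string())                          # Water__NN freezes__VBZ .__.
    print(sentence.labeled_dependencies_using("words"))  # ['freezes_NSUBJ_Water']
    print(doc.to_JSON())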