#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Gus Hahn-Powell 2015
# data structures for storing processors-server output
# based on conventions from the CLU lab's processors library (https://github.com/clulab/processors)

from __future__ import unicode_literals
from itertools import chain
from collections import defaultdict
#from six import text_type
import json
import re

class Document(object):
    """
    A processors-server document: an ordered sequence of `Sentence`s plus
    flattened, document-level views of their token attributes.
    """

    def __init__(self, sentences, text=None):
        """
        :param sentences: list of `Sentence` objects
        :param text: original document text; reconstructed by joining the
                     words when not supplied
        """
        self.size = len(sentences)
        self.sentences = sentences
        # easily access token attributes from all sentences
        self.words = list(chain(*[s.words for s in self.sentences]))
        self.tags = list(chain(*[s.tags for s in self.sentences]))
        self.lemmas = list(chain(*[s.lemmas for s in self.sentences]))
        self._entities = list(chain(*[s._entities for s in self.sentences]))
        # FIX: removed the stray chained assignment
        # (`self.nes = merge_entity_dicts = ...`) that leaked an unused name
        self.nes = self._merge_ne_dicts()
        self.bag_of_labeled_deps = list(chain(*[s.dependencies.labeled for s in self.sentences]))
        self.bag_of_unlabeled_deps = list(chain(*[s.dependencies.unlabeled for s in self.sentences]))
        self.text = text if text else " ".join(self.words)

    def bag_of_labeled_dependencies_using(self, form):
        """Labeled dependencies from every sentence, expressed with `form`
        ("words", "tags", "lemmas", "entities", or "index")."""
        return list(chain(*[s.labeled_dependencies_using(form) for s in self.sentences]))

    def bag_of_unlabeled_dependencies_using(self, form):
        """Unlabeled counterpart of `bag_of_labeled_dependencies_using`."""
        return list(chain(*[s.unlabeled_dependencies_using(form) for s in self.sentences]))

    def _merge_ne_dicts(self):
        """
        Merge the per-sentence NE dicts into one label -> [mentions] dict.
        Returns an empty dict when no sentence has entities.
        """
        # Get the set of all NE labels found in the Doc's sentences
        entity_labels = set(chain(*[s.nes.keys() for s in self.sentences]))
        # FIX: dropped the dead `entity_labels == None` check -- `set()`
        # never compares equal to None, so that branch could not fire.
        # Observable behavior is unchanged (empty dict for no labels).
        nes_dict = dict()
        for label in entity_labels:
            # consolidate the NEs under the appropriate label
            mentions = []
            for s in self.sentences:
                mentions += s.nes[label]
            nes_dict[label] = mentions
        return nes_dict

    def __unicode__(self):
        return self.text

    def __str__(self):
        return "Document w/ {} Sentence{}".format(self.size, "" if self.size == 1 else "s")

    def to_JSON_dict(self):
        """JSON-serializable dict mirroring the processors-server schema."""
        doc_dict = dict()
        doc_dict["sentences"] = [s.to_JSON_dict() for s in self.sentences]
        doc_dict["text"] = self.text
        return doc_dict

    def to_JSON(self):
        """Pretty-printed JSON string for this document."""
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=4)

    @staticmethod
    def load_from_JSON(json_dict):
        """Rebuild a `Document` from a dict produced by `to_JSON_dict`."""
        sentences = []
        for s in json_dict["sentences"]:
            kwargs = {
                "words": s["words"],
                "startOffsets": s["startOffsets"],
                "endOffsets": s["endOffsets"],
                "tags": s.get("tags", None),
                "lemmas": s.get("lemmas", None),
                "entities": s.get("entities", None),
                "text": s.get("text", None),
                "dependencies": s.get("dependencies", None)
            }
            sentences.append(Sentence(**kwargs))
        return Document(sentences, json_dict.get("text", None))
class Sentence(object):
    """
    A single processors-server sentence: tokens plus optional per-token
    attributes (tags, lemmas, IOB entity labels) and a dependency parse.
    """

    # placeholder used to pad attributes that were not provided
    UNKNOWN = "UNKNOWN"
    # the O in IOB notation
    NONENTITY = "O"

    def __init__(self, **kwargs):
        """
        Required kwargs: "words", "startOffsets", "endOffsets".
        Optional kwargs: "tags", "lemmas", "entities", "text",
        and "dependencies" (dict in the server's edge format).
        """
        self.words = kwargs["words"]
        self.startOffsets = kwargs["startOffsets"]
        self.endOffsets = kwargs["endOffsets"]
        self.length = len(self.words)
        self.tags = self._set_toks(kwargs.get("tags", None))
        self.lemmas = self._set_toks(kwargs.get("lemmas", None))
        self._entities = self._set_toks(kwargs.get("entities", None))
        self.text = kwargs.get("text", None) or " ".join(self.words)
        self.dependencies = self._build_dependencies_from_dict(kwargs.get("dependencies", None))
        self.nes = self._set_nes(self._entities)

    def _set_toks(self, toks):
        # pad with UNKNOWN when an attribute was not provided
        return toks if toks else [self.UNKNOWN] * self.length

    def _set_nes(self, entities):
        """
        Consolidates consecutive NEs under the appropriate label.

        Returns a defaultdict(list) mapping label -> mention strings (may be
        empty).  FIXES: an entity sequence that reaches the final token is
        now flushed after the loop (the original silently dropped it), and
        UNKNOWN padding (attribute absent) is treated as a non-entity rather
        than a real label.
        """
        entity_dict = defaultdict(list)
        # initialize to empty label
        current = Sentence.NONENTITY
        start = None
        end = None
        for i, e in enumerate(entities):
            # no entity tag here (O, or UNKNOWN padding)
            if e == Sentence.NONENTITY or e == Sentence.UNKNOWN:
                # did we have an entity with the last token?
                if current != Sentence.NONENTITY:
                    # the last sequence has ended; store the entity
                    end = i
                    entity_dict[current].append(' '.join(self.words[start:end]))
                    # reset our book-keeping vars
                    current = Sentence.NONENTITY
                    start = None
                    end = None
            # our old sequence continues
            elif e == current:
                end = i
            # a new label begins
            else:
                # do we have a previous NE to store first?
                if current != Sentence.NONENTITY:
                    end = i
                    entity_dict[current].append(' '.join(self.words[start:end]))
                # update our book-keeping vars
                current = e
                start = i
                end = None
        # FIX: flush a sequence still open at the end of the sentence
        if current != Sentence.NONENTITY:
            entity_dict[current].append(' '.join(self.words[start:]))
        # this might be empty
        return entity_dict

    def _build_dependencies_from_dict(self, deps):
        # a missing or empty dict means "no parse available"
        if deps and len(deps) > 0:
            return Dependencies(deps, self.words)
        return None

    def __unicode__(self):
        return self.text

    def to_string(self):
        """word__TAG pairs joined by spaces."""
        return ' '.join("{w}__{p}".format(w=self.words[i], p=self.tags[i]) for i in range(self.length))

    def labeled_dependencies_using(self, form):
        """
        Generates a list of labeled dependencies for a sentence
        using "words", "tags", "lemmas", "entities", or token index ("index").

        FIXES: "entities" now uses the per-token IOB labels (`self.nes` is a
        dict and cannot be indexed by token position); an unknown `form`
        raises ValueError instead of leaving `tokens` unbound; a sentence
        without a parse yields [] instead of crashing.
        """
        f = form.lower()
        if f == "words":
            tokens = self.words
        elif f == "tags":
            tokens = self.tags
        elif f == "lemmas":
            tokens = self.lemmas
        elif f == "entities":
            tokens = self._entities
        elif f == "index":
            tokens = list(range(self.length))
        else:
            raise ValueError('form must be "words", "tags", "lemmas", "entities", or "index"')
        deps = self.dependencies
        if deps is None:
            return []
        labeled = []
        for out in deps.outgoing:
            for (dest, rel) in deps.outgoing[out]:
                labeled.append("{}_{}_{}".format(tokens[out], rel.upper(), tokens[dest]))
        return labeled

    def unlabeled_dependencies_using(self, form):
        """
        Generate a list of unlabeled dependencies for a sentence
        using "words", "tags", "lemmas", "entities", or token index ("index").

        NOTE(review): splitting on "_" assumes neither token nor relation
        contains an underscore -- confirm for the token sets in use.
        """
        unlabeled = []
        for sd in self.labeled_dependencies_using(form):
            (head, _, dep) = sd.split("_")
            unlabeled.append("{}_{}".format(head, dep))
        return unlabeled

    def to_JSON_dict(self):
        """JSON-serializable dict mirroring the processors-server schema."""
        sentence_dict = dict()
        sentence_dict["words"] = self.words
        sentence_dict["startOffsets"] = self.startOffsets
        sentence_dict["endOffsets"] = self.endOffsets
        sentence_dict["tags"] = self.tags
        sentence_dict["lemmas"] = self.lemmas
        sentence_dict["entities"] = self._entities
        # FIX: guard against sentences without a dependency parse
        sentence_dict["dependencies"] = self.dependencies.to_JSON_dict() if self.dependencies else None
        return sentence_dict

    def to_JSON(self):
        """Pretty-printed JSON string for this sentence."""
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=4)

    @staticmethod
    def load_from_JSON(json_dict):
        """Rebuild a `Sentence` from a dict produced by `to_JSON_dict`."""
        sent = Sentence(
            words=json_dict["words"],
            startOffsets=json_dict["startOffsets"],
            endOffsets=json_dict["endOffsets"],
            lemmas=json_dict.get("lemmas", None),
            tags=json_dict.get("tags", None),
            entities=json_dict.get("entities", None),
            text=json_dict.get("text", None),
            dependencies=json_dict.get("dependencies", None)
        )
        return sent
class Dependencies(object):
    """
    Storage class for Stanford-style dependencies.

    NOTE(review): `unpack_deps` emits tuples as (destination, source, rel),
    so `self.incoming` ends up keyed by edge *source* and `self.outgoing`
    by edge *destination* -- the attribute names read as swapped relative
    to the raw edges.  The `labeled`/`unlabeled` output depends on this
    ordering, so it is preserved as-is; confirm against the server's JSON
    contract before renaming anything.
    """

    def __init__(self, deps, words):
        """
        :param deps: dict with "edges" (list of {"source", "destination",
                     "relation"}) and optionally "roots"
        :param words: sentence tokens; lowercased for dependency strings
        """
        self._words = [w.lower() for w in words]
        self.deps = self.unpack_deps(deps)
        # "roots" may be absent from older server output
        self.roots = deps.get("roots", [])
        self.edges = deps["edges"]
        self.incoming = self._build_incoming(self.deps)
        self.outgoing = self._build_outgoing(self.deps)
        self.labeled = self._build_labeled()
        self.unlabeled = self._build_unlabeled()

    def __unicode__(self):
        # FIX: __unicode__ must return text; the original returned the
        # list of tuples stored in self.deps
        return "; ".join("({}, {}, {})".format(d, s, r) for (d, s, r) in self.deps)

    def unpack_deps(self, deps):
        """Flatten "edges" into (destination, source, relation) tuples."""
        dependencies = []
        for edge in deps["edges"]:
            source = edge['source']
            destination = edge['destination']
            rel = edge['relation']
            dependencies.append((destination, source, rel))
        return dependencies

    def _build_incoming(self, deps):
        """Map edge source -> [(destination, relation)] (see class NOTE)."""
        dep_dict = defaultdict(list)
        for (destination, source, rel) in deps:
            dep_dict[source].append((destination, rel))
        return dep_dict

    def _build_outgoing(self, deps):
        """Map edge destination -> [(source, relation)] (see class NOTE)."""
        dep_dict = defaultdict(list)
        for (destination, source, rel) in deps:
            dep_dict[destination].append((source, rel))
        return dep_dict

    def _build_labeled(self):
        """word_REL_word strings, one per dependency edge."""
        labeled = []
        for src in self.outgoing:
            for (dest, rel) in self.outgoing[src]:
                labeled.append("{}_{}_{}".format(self._words[src], rel.upper(), self._words[dest]))
        return labeled

    def _build_unlabeled(self):
        """word_word strings, one per dependency edge."""
        unlabeled = []
        for src in self.outgoing:
            for (dest, _) in self.outgoing[src]:
                unlabeled.append("{}_{}".format(self._words[src], self._words[dest]))
        return unlabeled

    def to_JSON_dict(self):
        """JSON-serializable dict with the raw "edges" and "roots"."""
        deps_dict = dict()
        deps_dict["edges"] = self.edges
        deps_dict["roots"] = self.roots
        return deps_dict

    def to_JSON(self):
        """Pretty-printed JSON string for these dependencies."""
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=4)