Completed: push to master (3de2d7...fcfe03) by Gus, created 01:06

Document.to_JSON_dict()   A

Complexity:   Conditions 2
Size:         Total Lines 5
Duplication:  Lines 0, Ratio 0%
Importance:   Changes 1, Bugs 0, Features 0

Metric   Value
cc       2
c        1
b        0
f        0
dl       0
loc      5
rs       9.4285
#!/usr/bin/env python
# Gus Hahn-Powell 2015
# data structures for storing processors-server output
# based on conventions from the CLU lab's processors library (https://github.com/clulab/processors)

from itertools import chain
from collections import defaultdict
import json


class Document(object):

    def __init__(self, sentences, text=None):
        self.size = len(sentences)
        self.sentences = sentences
        # easily access token attributes from all sentences
        self.words = list(chain(*[s.words for s in self.sentences]))
        self.tags = list(chain(*[s.tags for s in self.sentences]))
        self.lemmas = list(chain(*[s.lemmas for s in self.sentences]))
        self._entities = list(chain(*[s._entities for s in self.sentences]))
        self.nes = self._merge_ne_dicts()
        self.bag_of_labeled_deps = list(chain(*[s.dependencies.labeled for s in self.sentences]))
        self.bag_of_unlabeled_deps = list(chain(*[s.dependencies.unlabeled for s in self.sentences]))
        self.text = text if text else " ".join(self.words)

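    # Example (hypothetical data): a Document over two sentences of 3 and 2
    # tokens exposes doc.words, doc.tags, and doc.lemmas as flat 5-item lists.
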
    def bag_of_labeled_dependencies_using(self, form):
        return list(chain(*[s.labeled_dependencies_using(form) for s in self.sentences]))

    def bag_of_unlabeled_dependencies_using(self, form):
        return list(chain(*[s.unlabeled_dependencies_using(form) for s in self.sentences]))

    def _merge_ne_dicts(self):
        # Get the set of all NE labels found in the Doc's sentences
        entity_labels = set(chain(*[s.nes.keys() for s in self.sentences]))
        # Do we have any labels?
        if not entity_labels:
            return None
        # If we have labels, consolidate the NEs under the appropriate label
        else:
            nes_dict = dict()
            for e in entity_labels:
                entities = []
                for s in self.sentences:
                    entities += s.nes[e]
                nes_dict[e] = entities
            return nes_dict

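    # Example (hypothetical data): if one sentence has nes == {"PERSON": ["Gonzo"]}
    # and another has {"PERSON": ["Camilla"], "LOCATION": ["Bologna"]}, the merged
    # result is {"PERSON": ["Gonzo", "Camilla"], "LOCATION": ["Bologna"]}.
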
    def __str__(self):
        return self.text

    def to_JSON_dict(self):
        doc_dict = dict()
        doc_dict["sentences"] = [s.to_JSON_dict() for s in self.sentences]
        doc_dict["text"] = self.text
        return doc_dict

    def to_JSON(self):
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=4)

    @staticmethod
    def load_from_JSON(json_dict):
        sentences = []
        for s in json_dict["sentences"]:
            kwargs = {
                "words": s["words"],
                "startOffsets": s["startOffsets"],
                "endOffsets": s["endOffsets"],
                "tags": s.get("tags", None),
                "lemmas": s.get("lemmas", None),
                "entities": s.get("entities", None),
                "text": s.get("text", None),
                "dependencies": s.get("dependencies", None)
            }
            sent = Sentence(**kwargs)
            sentences.append(sent)
        return Document(sentences, json_dict.get("text", None))

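# A minimal usage sketch (illustrative only): Document.load_from_JSON expects
# a dict shaped like
#   {"text": "...",
#    "sentences": [{"words": [...], "startOffsets": [...], "endOffsets": [...],
#                   "tags": [...], "lemmas": [...], "entities": [...],
#                   "dependencies": {"edges": [...]}}]}
# ("dependencies" must be present for Document's bag-of-dependencies attributes).
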
class Sentence(object):

    UNKNOWN = "UNKNOWN"
    # the O in IOB notation
    NONENTITY = "O"

    def __init__(self, **kwargs):
        self.words = kwargs["words"]
        self.startOffsets = kwargs["startOffsets"]
        self.endOffsets = kwargs["endOffsets"]
        self.length = len(self.words)
        self.tags = self._set_toks(kwargs.get("tags", None))
        self.lemmas = self._set_toks(kwargs.get("lemmas", None))
        self._entities = self._set_toks(kwargs.get("entities", None))
        self.text = kwargs.get("text", None) or " ".join(self.words)
        self.dependencies = self._build_dependencies_from_dict(kwargs.get("dependencies", None))
        self.nes = self._set_nes(self._entities)

    def _set_toks(self, toks):
        return toks if toks else [self.UNKNOWN]*self.length

    def _set_nes(self, entities):
        """
        Consolidates consecutive NEs under the appropriate label
        """
        entity_dict = defaultdict(list)
        # initialize to empty label
        current = Sentence.NONENTITY
        start = None
        end = None
        for i, e in enumerate(entities):
            # we don't have an entity tag
            if e == Sentence.NONENTITY:
                # did we have an entity with the last token?
                if current == Sentence.NONENTITY:
                    continue
                else:
                    # the last sequence has ended
                    end = i
                    # store the entity
                    named_entity = ' '.join(self.words[start:end])
                    entity_dict[current].append(named_entity)
                    # reset our book-keeping vars
                    current = Sentence.NONENTITY
                    start = None
                    end = None
            # we have an entity tag!
            else:
                # our old sequence continues
                if e == current:
                    end = i
                # our old sequence has ended
                else:
                    # do we have a previous NE?
                    if current != Sentence.NONENTITY:
                        end = i
                        named_entity = ' '.join(self.words[start:end])
                        entity_dict[current].append(named_entity)
                    # update our book-keeping vars
                    current = e
                    start = i
                    end = None
        # flush an entity that runs through the final token
        if current != Sentence.NONENTITY:
            named_entity = ' '.join(self.words[start:])
            entity_dict[current].append(named_entity)
        # this might be empty
        return entity_dict

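    # Example (hypothetical data): words == ["Barack", "Obama", "visited", "Tucson"]
    # with entities == ["PERSON", "PERSON", "O", "LOCATION"] consolidates to
    # {"PERSON": ["Barack Obama"], "LOCATION": ["Tucson"]}.
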
    def _build_dependencies_from_dict(self, deps):
        if deps:
            return Dependencies(deps, self.words)
        return None

    def __str__(self):
        return self.text

    def to_string(self):
        return ' '.join("{w}__{p}".format(w=self.words[i], p=self.tags[i]) for i in range(self.length))

    def labeled_dependencies_using(self, form):
        """
        Generates a list of labeled dependencies for a sentence
        using "words", "tags", "lemmas", "entities", or token index ("index")
        """

        f = form.lower()
        if f == "words":
            tokens = self.words
        elif f == "tags":
            tokens = self.tags
        elif f == "lemmas":
            tokens = self.lemmas
        elif f == "entities":
            # use the per-token entity labels, not the consolidated nes dict
            tokens = self._entities
        elif f == "index":
            tokens = list(range(self.length))
        else:
            raise ValueError('form must be "words", "tags", "lemmas", "entities", or "index"')
        deps = self.dependencies
        if deps is None:
            # no dependencies were provided for this sentence
            return []
        labeled = []
        for out in deps.outgoing:
            for (dest, rel) in deps.outgoing[out]:
                labeled.append("{}_{}_{}".format(tokens[out], rel.upper(), tokens[dest]))
        return labeled

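    # Example (hypothetical data): with form="words" and an nsubj edge from
    # "married" (head) to "Gonzo" (dependent), the output contains
    # "married_NSUBJ_Gonzo".
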
    def unlabeled_dependencies_using(self, form):
        """
        Generates a list of unlabeled dependencies for a sentence
        using "words", "tags", "lemmas", "entities", or token index ("index")
        """
        unlabeled = []
        for sd in self.labeled_dependencies_using(form):
            # NOTE: assumes neither the tokens nor the relation contain "_"
            (head, _, dep) = sd.split("_")
            unlabeled.append("{}_{}".format(head, dep))
        return unlabeled

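    # Example (hypothetical data): "married_NSUBJ_Gonzo" becomes "married_Gonzo".
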
    def to_JSON_dict(self):
        sentence_dict = dict()
        sentence_dict["words"] = self.words
        sentence_dict["startOffsets"] = self.startOffsets
        sentence_dict["endOffsets"] = self.endOffsets
        sentence_dict["tags"] = self.tags
        sentence_dict["lemmas"] = self.lemmas
        sentence_dict["entities"] = self._entities
        # dependencies may be absent (see _build_dependencies_from_dict)
        if self.dependencies:
            sentence_dict["dependencies"] = self.dependencies.to_JSON_dict()
        return sentence_dict

    def to_JSON(self):
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=4)

    @staticmethod
    def load_from_JSON(json_dict):
        sent = Sentence(
            words=json_dict["words"],
            startOffsets=json_dict["startOffsets"],
            endOffsets=json_dict["endOffsets"],
            lemmas=json_dict.get("lemmas", None),
            tags=json_dict.get("tags", None),
            entities=json_dict.get("entities", None),
            text=json_dict.get("text", None),
            dependencies=json_dict.get("dependencies", None)
        )
        return sent


class Dependencies(object):
    """
    Storage class for Stanford-style dependencies
    """
    def __init__(self, deps, words):
        self._words = [w.lower() for w in words]
        self.deps = self.unpack_deps(deps)
        self.edges = deps["edges"]
        self.incoming = self._build_incoming(self.deps)
        self.outgoing = self._build_outgoing(self.deps)
        self.labeled = self._build_labeled()
        self.unlabeled = self._build_unlabeled()

    def __str__(self):
        return str(self.deps)

    def unpack_deps(self, deps):
        # store each edge as a (source, destination, relation) triple
        dependencies = []
        for edge in deps["edges"]:
            source = edge['source']
            destination = edge['destination']
            rel = edge['relation']
            dependencies.append((source, destination, rel))
        return dependencies

    def _build_incoming(self, deps):
        # map each token to the (source, relation) pairs of its incoming edges
        dep_dict = defaultdict(list)
        for (source, destination, rel) in deps:
            dep_dict[destination].append((source, rel))
        return dep_dict

    def _build_outgoing(self, deps):
        # map each token to the (destination, relation) pairs of its outgoing edges
        dep_dict = defaultdict(list)
        for (source, destination, rel) in deps:
            dep_dict[source].append((destination, rel))
        return dep_dict

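    # Example (hypothetical data): the single edge
    # {"source": 1, "destination": 0, "relation": "nsubj"} gives
    # incoming[0] == [(1, "nsubj")] and outgoing[1] == [(0, "nsubj")].
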
    def _build_labeled(self):
        labeled = []
        for out in self.outgoing:
            for (dest, rel) in self.outgoing[out]:
                labeled.append("{}_{}_{}".format(self._words[out], rel.upper(), self._words[dest]))
        return labeled

    def _build_unlabeled(self):
        unlabeled = []
        for out in self.outgoing:
            for (dest, _) in self.outgoing[out]:
                unlabeled.append("{}_{}".format(self._words[out], self._words[dest]))
        return unlabeled

    def to_JSON_dict(self):
        deps_dict = dict()
        deps_dict["edges"] = self.edges
        return deps_dict

    def to_JSON(self):
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=4)
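
# --- Usage sketch (not part of the original module; example data is hypothetical) ---
if __name__ == "__main__":
    example = {
        "words": ["Gonzo", "married", "Camilla", "."],
        "startOffsets": [0, 6, 14, 21],
        "endOffsets": [5, 13, 21, 22],
        "tags": ["NNP", "VBD", "NNP", "."],
        "lemmas": ["gonzo", "marry", "camilla", "."],
        "entities": ["PERSON", "O", "PERSON", "O"],
        "dependencies": {
            "edges": [
                {"source": 1, "destination": 0, "relation": "nsubj"},
                {"source": 1, "destination": 2, "relation": "dobj"},
                {"source": 1, "destination": 3, "relation": "punct"}
            ]
        }
    }
    sent = Sentence.load_from_JSON(example)
    doc = Document([sent])
    print(doc.nes)                  # {'PERSON': ['Gonzo', 'Camilla']}
    print(doc.bag_of_labeled_deps)  # ['married_NSUBJ_gonzo', 'married_DOBJ_camilla', 'married_PUNCT_.']
    print(doc.to_JSON())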