Completed
Push — master ( d242d8...16b8b8 )
by Gus
01:27
created

Dependencies.__unicode__()   A

Complexity

Conditions 1

Size

Total Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
dl 0
loc 2
rs 10
c 0
b 0
f 0
1
#!/usr/bin/env python
2
# -*- coding: utf-8 -*-
3
4
# Gus Hahn-Powell 2015
5
# data structures for storing processors-server output
6
# based on conventions from the CLU lab's processors library (https://github.com/clulab/processors)
7
from __future__ import unicode_literals
8
from itertools import chain
9
from collections import defaultdict
10
#from six import text_type
11
import json
12
import re
13
14
15
class Document(object):
16
17
    def __init__(self, sentences, text=None):
18
        self.size = len(sentences)
19
        self.sentences = sentences
20
        # easily access token attributes from all sentences
21
        self.words = list(chain(*[s.words for s in self.sentences]))
22
        self.tags = list(chain(*[s.tags for s in self.sentences]))
23
        self.lemmas = list(chain(*[s.lemmas for s in self.sentences]))
24
        self._entities = list(chain(*[s._entities for s in self.sentences]))
25
        self.nes = merge_entity_dicts = self._merge_ne_dicts()
26
        self.bag_of_labeled_deps = list(chain(*[s.dependencies.labeled for s in self.sentences]))
27
        self.bag_of_unlabeled_deps = list(chain(*[s.dependencies.unlabeled for s in self.sentences]))
28
        self.text = text if text else " ".join(self.words)
29
30
    def bag_of_labeled_dependencies_using(self, form):
31
        return list(chain(*[s.labeled_dependencies_using(form) for s in self.sentences]))
32
33
    def bag_of_unlabeled_dependencies_using(self, form):
34
        return list(chain(*[s.unlabeled_dependencies_using(form) for s in self.sentences]))
35
36
    def _merge_ne_dicts(self):
37
        # Get the set of all NE labels found in the Doc's sentences
38
        entity_labels = set(chain(*[s.nes.keys() for s in self.sentences]))
39
        # Do we have any labels?
40
        if entity_labels == None:
41
            return None
42
        # If we have labels, consolidate the NEs under the appropriate label
43
        else:
44
            nes_dict = dict()
45
            for e in entity_labels:
46
                entities = []
47
                for s in self.sentences:
48
                    entities += s.nes[e]
49
                nes_dict[e] = entities
50
            return nes_dict
51
52
    def __unicode__(self):
53
        return self.text
54
55
    def __str__(self):
56
        return "Document w/ {} Sentence{}".format(self.size, "" if self.size == 1 else "s")
57
58
    def to_JSON_dict(self):
59
        doc_dict = dict()
60
        doc_dict["sentences"] = [s.to_JSON_dict() for s in self.sentences]
61
        doc_dict["text"] = self.text
62
        return doc_dict
63
64
    def to_JSON(self):
65
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=4)
66
67
    @staticmethod
68
    def load_from_JSON(json_dict):
69
        sentences = []
70
        for s in json_dict["sentences"]:
71
            kwargs = {
72
                "words": s["words"],
73
                "startOffsets": s["startOffsets"],
74
                "endOffsets": s["endOffsets"],
75
                "tags": s.get("tags", None),
76
                "lemmas": s.get("lemmas", None),
77
                "entities": s.get("entities", None),
78
                "text": s.get("text", None),
79
                "dependencies": s.get("dependencies", None)
80
            }
81
            sent = Sentence(**kwargs)
82
            sentences.append(sent)
83
        return Document(sentences, json_dict.get("text", None))
84
85
class Sentence(object):
86
87
    UNKNOWN = "UNKNOWN"
88
    # the O in IOB notation
89
    NONENTITY = "O"
90
91
    def __init__(self, **kwargs):
92
        self.words = kwargs["words"]
93
        self.startOffsets = kwargs["startOffsets"]
94
        self.endOffsets = kwargs["endOffsets"]
95
        self.length = len(self.words)
96
        self.tags = self._set_toks(kwargs.get("tags", None))
97
        self.lemmas = self._set_toks(kwargs.get("lemmas", None))
98
        self._entities = self._set_toks(kwargs.get("entities", None))
99
        self.text = kwargs.get("text", None) or " ".join(self.words)
100
        self.dependencies = self._build_dependencies_from_dict(kwargs.get("dependencies", None))
101
        self.nes = self._set_nes(self._entities)
102
103
    def _set_toks(self, toks):
104
        return toks if toks else [self.UNKNOWN]*self.length
105
106
    def _set_nes(self, entities):
107
        """
108
        Consolidates consecutive NEs under the appropriate label
109
        """
110
        entity_dict = defaultdict(list)
111
        # initialize to empty label
112
        current = Sentence.NONENTITY
113
        start = None
114
        end = None
115
        for i, e in enumerate(entities):
116
            # we don't have an entity tag
117
            if e == Sentence.NONENTITY:
118
                # did we have an entity with the last token?
119
                if current == Sentence.NONENTITY:
120
                    continue
121
                else:
122
                    # the last sequence has ended
123
                    end = i
124
                    # store the entity
125
                    named_entity = ' '.join(self.words[start:end])
126
                    entity_dict[current].append(named_entity)
127
                    # reset our book-keeping vars
128
                    current = Sentence.NONENTITY
129
                    start = None
130
                    end = None
131
            # we have an entity tag!
132
            else:
133
                # our old sequence continues
134
                if e == current:
135
                    end = i
136
                # our old sequence has ended
137
                else:
138
                    # do we have a previous NE?
139
                    if current != Sentence.NONENTITY:
140
                        end = i
141
                        named_entity = ' '.join(self.words[start:end])
142
                        entity_dict[current].append(named_entity)
143
                    # update our book-keeping vars
144
                    current = e
145
                    start = i
146
                    end = None
147
        # this might be empty
148
        return entity_dict
149
150
    def _build_dependencies_from_dict(self, deps):
151
        if deps and len(deps) > 0:
152
            return Dependencies(deps, self.words)
153
        return None
154
155
    def __unicode__(self):
156
        return self.text
157
158
    def to_string(self):
159
        return ' '.join("{w}__{p}".format(w=self.words[i],p=self.tags[i]) for i in range(self.length))
160
161
    def labeled_dependencies_using(self, form):
162
        """
163
        Generates a list of labeled dependencies for a sentence
164
        using "words", "tags", "lemmas", "entities", or token index ("index")
165
        """
166
167
        f = form.lower()
168
        if f == "words":
169
            tokens = self.words
170
        elif f == "tags":
171
            tokens = self.tags
172
        elif f == "lemmas":
173
            tokens = self.lemmas
174
        elif f == "entities":
175
            tokens = self.nes
176
        elif f == "index":
177
            tokens = list(range(self.length))
178
        #else:
179
        #    raise Exception("""form must be "words", "tags", "lemmas", or "index"""")
180
        deps = self.dependencies
181
        labeled = []
182
        for out in deps.outgoing:
183
            for (dest, rel) in deps.outgoing[out]:
184
                labeled.append("{}_{}_{}".format(tokens[out], rel.upper(), tokens[dest]))
185
        return labeled
186
187
    def unlabeled_dependencies_using(self, form):
188
        """
189
        Generate a list of unlabeled dependencies for a sentence
190
        using "words", "tags", "lemmas", "entities", or token index ("index")
191
        """
192
        unlabeled = []
193
        for sd in self.labeled_dependencies_using(form):
194
            (head, _, dep) = sd.split("_")
195
            unlabeled.append("{}_{}".format(head, dep))
196
        return unlabeled
197
198
    def to_JSON_dict(self):
199
        sentence_dict = dict()
200
        sentence_dict["words"] = self.words
201
        sentence_dict["startOffsets"] = self.startOffsets
202
        sentence_dict["endOffsets"] = self.endOffsets
203
        sentence_dict["tags"] = self.tags
204
        sentence_dict["lemmas"] = self.lemmas
205
        sentence_dict["entities"] = self._entities
206
        sentence_dict["dependencies"] = self.dependencies.to_JSON_dict()
207
        return sentence_dict
208
209
    def to_JSON(self):
210
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=4)
211
212
    @staticmethod
213
    def load_from_JSON(json_dict):
214
        sent = Sentence(
215
                    words=json_dict["words"],
216
                    startOffsets=json_dict["startOffsets"],
217
                    endOffsets=json_dict["endOffsets"],
218
                    lemmas=json_dict.get("lemmas", None),
219
                    tags=json_dict.get("tags", None),
220
                    entities=json_dict.get("entities", None),
221
                    text=json_dict.get("text", None),
222
                    dependencies=json_dict.get("dependencies", None)
223
                    )
224
        return sent
225
226
227
class Dependencies(object):
    """
    Storage class for Stanford-style dependencies.

    :param deps: dict with an "edges" list ({"source", "destination",
                 "relation"} dicts) and an optional "roots" list
    :param words: the sentence's tokens (lowercased internally for the
                  labeled/unlabeled string representations)
    """
    def __init__(self, deps, words):
        self._words = [w.lower() for w in words]
        self.deps = self.unpack_deps(deps)
        self.roots = deps.get("roots", [])
        self.edges = deps["edges"]
        self.incoming = self._build_incoming(self.deps)
        self.outgoing = self._build_outgoing(self.deps)
        self.labeled = self._build_labeled()
        self.unlabeled = self._build_unlabeled()

    def __unicode__(self):
        # FIX: the original returned self.deps (a list of tuples);
        # __unicode__ must return text — render one labeled edge per line
        return "\n".join(self.labeled)

    def unpack_deps(self, deps):
        """
        Flatten the "edges" dicts into (destination, source, relation) tuples.
        NOTE(review): the local names are swapped relative to the keys —
        ``outgoing`` holds "source" and ``incoming`` holds "destination";
        preserved as-is since incoming/outgoing/labeled all rely on this order.
        """
        dependencies = []
        for edge in deps["edges"]:
            outgoing = edge['source']
            incoming = edge['destination']
            rel = edge['relation']
            dependencies.append((incoming, outgoing, rel))
        return dependencies

    def _build_incoming(self, deps):
        """Map token index -> [(other index, relation), ...] for one edge direction."""
        dep_dict = defaultdict(list)
        for (incoming, outgoing, rel) in deps:
            dep_dict[outgoing].append((incoming, rel))
        return dep_dict

    def _build_outgoing(self, deps):
        """Map token index -> [(other index, relation), ...] for the opposite direction."""
        dep_dict = defaultdict(list)
        for (incoming, outgoing, rel) in deps:
            dep_dict[incoming].append((outgoing, rel))
        return dep_dict

    def _build_labeled(self):
        """List of "word_RELATION_word" strings, one per edge."""
        labeled = []
        for out in self.outgoing:
            for (dest, rel) in self.outgoing[out]:
                labeled.append("{}_{}_{}".format(self._words[out], rel.upper(), self._words[dest]))
        return labeled

    def _build_unlabeled(self):
        """List of "word_word" strings, one per edge."""
        unlabeled = []
        for out in self.outgoing:
            for (dest, _) in self.outgoing[out]:
                unlabeled.append("{}_{}".format(self._words[out], self._words[dest]))
        return unlabeled

    def to_JSON_dict(self):
        """JSON-serializable dict holding the original edges and roots."""
        deps_dict = dict()
        deps_dict["edges"] = self.edges
        deps_dict["roots"] = self.roots
        return deps_dict

    def to_JSON(self):
        """Pretty-printed JSON string for these dependencies."""
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=4)