Completed
Push — master ( 6e8b4b...cccc5b )
by Gus
01:08
created

Document.__eq__()   A

Complexity

Conditions 2

Size

Total Lines 5

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
dl 0
loc 5
rs 9.4285
c 0
b 0
f 0
1
#!/usr/bin/env python
2
# -*- coding: utf-8 -*-
3
4
# Gus Hahn-Powell 2015
5
# data structures for storing processors-server output
6
# based on conventions from the CLU lab's processors library (https://github.com/clulab/processors)
7
from __future__ import unicode_literals
8
from itertools import chain
9
from collections import defaultdict
10
#from six import text_type
11
import json
12
import re
13
14
15
class Document(object):
    """
    Container for processors-server output: an ordered collection of
    Sentence objects plus flattened, document-wide views over their
    token attributes (words, tags, lemmas, entities, dependencies).
    """

    def __init__(self, sentences, text=None):
        """
        :param sentences: list of Sentence objects
        :param text: original document text; reconstructed by joining
                     the sentences' words when not provided
        """
        self.size = len(sentences)
        self.sentences = sentences
        # easily access token attributes from all sentences
        self.words = list(chain(*[s.words for s in self.sentences]))
        self.tags = list(chain(*[s.tags for s in self.sentences]))
        self.lemmas = list(chain(*[s.lemmas for s in self.sentences]))
        self._entities = list(chain(*[s._entities for s in self.sentences]))
        # FIX: dropped the stray chained binding
        # (`self.nes = merge_entity_dicts = ...`) which served no purpose.
        self.nes = self._merge_ne_dicts()
        self.bag_of_labeled_deps = list(chain(*[s.dependencies.labeled for s in self.sentences]))
        self.bag_of_unlabeled_deps = list(chain(*[s.dependencies.unlabeled for s in self.sentences]))
        self.text = text if text else " ".join(self.words)

    def __unicode__(self):
        return self.text

    def __str__(self):
        return "Document w/ {} Sentence{}".format(self.size, "" if self.size == 1 else "s")

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.__dict__ == other.__dict__
        return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def bag_of_labeled_dependencies_using(self, form):
        """Labeled dependencies from every sentence, using the given token form."""
        return list(chain(*[s.labeled_dependencies_using(form) for s in self.sentences]))

    def bag_of_unlabeled_dependencies_using(self, form):
        """Unlabeled dependencies from every sentence, using the given token form."""
        return list(chain(*[s.unlabeled_dependencies_using(form) for s in self.sentences]))

    def _merge_ne_dicts(self):
        """
        Consolidate each sentence's named-entity dict into a single
        label -> [entity strings] mapping for the whole document.

        Returns an empty dict when no sentence contains any NEs
        (same as the original behavior).
        """
        # Get the set of all NE labels found in the Doc's sentences
        entity_labels = set(chain(*[s.nes.keys() for s in self.sentences]))
        # FIX: the original guarded on `entity_labels == None`, which is
        # never true for a set; the dead branch has been removed.
        nes_dict = dict()
        for label in entity_labels:
            entities = []
            for s in self.sentences:
                entities += s.nes[label]
            nes_dict[label] = entities
        return nes_dict

    def to_JSON_dict(self):
        """Return a JSON-serializable dict mirroring the processors-server format."""
        doc_dict = dict()
        doc_dict["sentences"] = [s.to_JSON_dict() for s in self.sentences]
        doc_dict["text"] = self.text
        return doc_dict

    def to_JSON(self):
        """Serialize this Document to a pretty-printed JSON string."""
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=4)

    @staticmethod
    def load_from_JSON(json_dict):
        """
        Rebuild a Document from a dict produced by `to_JSON_dict`
        (or equivalent processors-server JSON).
        """
        sentences = []
        for s in json_dict["sentences"]:
            kwargs = {
                "words": s["words"],
                "startOffsets": s["startOffsets"],
                "endOffsets": s["endOffsets"],
                "tags": s.get("tags", None),
                "lemmas": s.get("lemmas", None),
                "entities": s.get("entities", None),
                "text": s.get("text", None),
                "dependencies": s.get("dependencies", None)
            }
            sent = Sentence(**kwargs)
            sentences.append(sent)
        return Document(sentences, json_dict.get("text", None))
93
94
class Sentence(object):
    """
    A single processed sentence: tokens with character offsets and
    (optional) tags, lemmas, IOB-style entity labels and a dependency
    parse (see the CLU processors conventions).
    """

    UNKNOWN = "UNKNOWN"
    # the O in IOB notation
    NONENTITY = "O"

    def __init__(self, **kwargs):
        """
        Required kwargs: "words", "startOffsets", "endOffsets".
        Optional kwargs: "tags", "lemmas", "entities", "text", "dependencies".
        Missing token-level attributes default to UNKNOWN placeholders.
        """
        self.words = kwargs["words"]
        self.startOffsets = kwargs["startOffsets"]
        self.endOffsets = kwargs["endOffsets"]
        self.length = len(self.words)
        self.tags = self._set_toks(kwargs.get("tags", None))
        self.lemmas = self._set_toks(kwargs.get("lemmas", None))
        self._entities = self._set_toks(kwargs.get("entities", None))
        self.text = kwargs.get("text", None) or " ".join(self.words)
        self.dependencies = self._build_dependencies_from_dict(kwargs.get("dependencies", None))
        self.nes = self._set_nes(self._entities)

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.__dict__ == other.__dict__
        return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def _set_toks(self, toks):
        # default to UNKNOWN placeholders when the attribute was not provided
        return toks if toks else [self.UNKNOWN]*self.length

    def _set_nes(self, entities):
        """
        Consolidates consecutive NEs under the appropriate label.

        Returns a defaultdict(list) mapping entity label -> list of
        entity strings (space-joined words); may be empty.
        """
        entity_dict = defaultdict(list)
        # initialize to empty label
        current = Sentence.NONENTITY
        start = None
        for i, e in enumerate(entities):
            if e == Sentence.NONENTITY:
                # a running entity (if any) ends here
                if current != Sentence.NONENTITY:
                    entity_dict[current].append(' '.join(self.words[start:i]))
                    # reset our book-keeping vars
                    current = Sentence.NONENTITY
                    start = None
            elif e != current:
                # a new entity begins; flush any previous one first
                if current != Sentence.NONENTITY:
                    entity_dict[current].append(' '.join(self.words[start:i]))
                current = e
                start = i
            # else: the current entity simply continues
        # BUG FIX: flush an entity that runs through the final token --
        # the original loop silently dropped sentence-final NEs.
        if current != Sentence.NONENTITY:
            entity_dict[current].append(' '.join(self.words[start:]))
        return entity_dict

    def _build_dependencies_from_dict(self, deps):
        # None when no (or empty) dependency info was provided
        if deps and len(deps) > 0:
            return Dependencies(deps, self.words)
        return None

    def __unicode__(self):
        return self.text

    def to_string(self):
        """Return the sentence as word__tag pairs joined by spaces."""
        return ' '.join("{w}__{p}".format(w=self.words[i], p=self.tags[i]) for i in range(self.length))

    def _tokens_using(self, form):
        """
        Map a form name to the corresponding per-token sequence.

        :raises ValueError: when form is not one of
            "words", "tags", "lemmas", "entities", "index"
        """
        f = form.lower()
        if f == "words":
            return self.words
        if f == "tags":
            return self.tags
        if f == "lemmas":
            return self.lemmas
        if f == "entities":
            # BUG FIX: the original used self.nes (a label -> entities dict),
            # which cannot be indexed by token position; use the per-token
            # entity labels instead.
            return self._entities
        if f == "index":
            return list(range(self.length))
        # BUG FIX: the original left the token sequence unbound for unknown
        # forms (raising UnboundLocalError later); fail fast instead.
        raise ValueError('form must be "words", "tags", "lemmas", "entities", or "index"')

    def labeled_dependencies_using(self, form):
        """
        Generates a list of labeled dependencies for a sentence
        using "words", "tags", "lemmas", "entities", or token index ("index")
        """
        tokens = self._tokens_using(form)
        deps = self.dependencies
        labeled = []
        for out in deps.outgoing:
            for (dest, rel) in deps.outgoing[out]:
                labeled.append("{}_{}_{}".format(tokens[out], rel.upper(), tokens[dest]))
        return labeled

    def unlabeled_dependencies_using(self, form):
        """
        Generate a list of unlabeled dependencies for a sentence
        using "words", "tags", "lemmas", "entities", or token index ("index")
        """
        # BUG FIX: the original re-split the labeled strings on "_", which
        # breaks when a relation (e.g. collapsed "prep_of") or a token
        # contains an underscore; build directly from the graph instead.
        tokens = self._tokens_using(form)
        deps = self.dependencies
        unlabeled = []
        for out in deps.outgoing:
            for (dest, _) in deps.outgoing[out]:
                unlabeled.append("{}_{}".format(tokens[out], tokens[dest]))
        return unlabeled

    def to_JSON_dict(self):
        """Return a JSON-serializable dict mirroring the processors-server format."""
        sentence_dict = dict()
        sentence_dict["words"] = self.words
        sentence_dict["startOffsets"] = self.startOffsets
        sentence_dict["endOffsets"] = self.endOffsets
        sentence_dict["tags"] = self.tags
        sentence_dict["lemmas"] = self.lemmas
        sentence_dict["entities"] = self._entities
        # BUG FIX: guard against a missing parse -- the original raised
        # AttributeError when dependencies was None.
        if self.dependencies is not None:
            sentence_dict["dependencies"] = self.dependencies.to_JSON_dict()
        return sentence_dict

    def to_JSON(self):
        """Serialize this Sentence to a pretty-printed JSON string."""
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=4)

    @staticmethod
    def load_from_JSON(json_dict):
        """Rebuild a Sentence from a dict produced by `to_JSON_dict`."""
        sent = Sentence(
                    words=json_dict["words"],
                    startOffsets=json_dict["startOffsets"],
                    endOffsets=json_dict["endOffsets"],
                    lemmas=json_dict.get("lemmas", None),
                    tags=json_dict.get("tags", None),
                    entities=json_dict.get("entities", None),
                    text=json_dict.get("text", None),
                    dependencies=json_dict.get("dependencies", None)
                    )
        return sent
243
244
245
class Dependencies(object):
    """
    Storage class for Stanford-style dependencies.

    Built from a dict with "edges" (each edge a dict of "source",
    "destination", "relation" over token indices) and optional "roots".
    Provides index-based incoming/outgoing adjacency maps plus
    word-based labeled/unlabeled string forms.
    """

    def __init__(self, deps, words):
        """
        :param deps: dict with "edges" and (optionally) "roots"
        :param words: the sentence's tokens; lowercased for string forms
        """
        self._words = [w.lower() for w in words]
        self.deps = self.unpack_deps(deps)
        self.roots = deps.get("roots", [])
        self.edges = deps["edges"]
        self.incoming = self._build_incoming(self.deps)
        self.outgoing = self._build_outgoing(self.deps)
        self.labeled = self._build_labeled()
        self.unlabeled = self._build_unlabeled()

    def __unicode__(self):
        # BUG FIX: __unicode__ must return text; the original returned
        # self.deps (a list of tuples). Render the labeled dependencies.
        return "; ".join(self.labeled)

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.__dict__ == other.__dict__
        return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def unpack_deps(self, deps):
        """Flatten the edge dicts into (destination, source, relation) triples."""
        return [(edge['destination'], edge['source'], edge['relation'])
                for edge in deps["edges"]]

    def _build_incoming(self, deps):
        """Map destination index -> [(source index, relation), ...]."""
        dep_dict = defaultdict(list)
        for (incoming, outgoing, rel) in deps:
            dep_dict[incoming].append((outgoing, rel))
        return dep_dict

    def _build_outgoing(self, deps):
        """Map source index -> [(destination index, relation), ...]."""
        dep_dict = defaultdict(list)
        for (incoming, outgoing, rel) in deps:
            dep_dict[outgoing].append((incoming, rel))
        return dep_dict

    def _build_labeled(self):
        """Build "source_REL_destination" strings over lowercased words."""
        labeled = []
        for out in self.outgoing:
            for (dest, rel) in self.outgoing[out]:
                labeled.append("{}_{}_{}".format(self._words[out], rel.upper(), self._words[dest]))
        return labeled

    def _build_unlabeled(self):
        """Build "source_destination" strings over lowercased words."""
        unlabeled = []
        for out in self.outgoing:
            for (dest, _) in self.outgoing[out]:
                unlabeled.append("{}_{}".format(self._words[out], self._words[dest]))
        return unlabeled

    def to_JSON_dict(self):
        """Return a JSON-serializable dict mirroring the processors-server format."""
        deps_dict = dict()
        deps_dict["edges"] = self.edges
        deps_dict["roots"] = self.roots
        return deps_dict

    def to_JSON(self):
        """Serialize these dependencies to a pretty-printed JSON string."""
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=4)
314