Completed
Push — master ( 4c1319...a3584a )
by Gus
34s queued 23s
created

Mention.words()   A

Complexity

Conditions 1

Size

Total Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
c 1
b 0
f 0
dl 0
loc 2
rs 10
1
#!/usr/bin/env python
2
# -*- coding: utf-8 -*-
3
from __future__ import unicode_literals
4
from .utils import post_json
5
from .ds import Document, Interval, NLPDatum
6
from termcolor import colored
7
import re
8
import json
9
10
11
class OdinHighlighter(object):
    """
    Namespace of static helpers that decorate tokens for terminal display.

    Every helper delegates to `termcolor.colored`; `highlight_mention`
    combines them to render the sentence containing a mention.
    """

    @staticmethod
    def LABEL(token):
        """Style `token` as a mention label (bold, red foreground)."""
        return colored(token, color="red", attrs=["bold"])

    @staticmethod
    def ARG(token):
        """Style `token` as an argument (bold, green background)."""
        return colored(token, on_color="on_green", attrs=["bold"])

    @staticmethod
    def TRIGGER(token):
        """Style `token` as a trigger (bold, blue background)."""
        return colored(token, on_color="on_blue", attrs=["bold"])

    @staticmethod
    def CONCEAL(token):
        """Style `token` as concealed text (grey background)."""
        return colored(token, on_color="on_grey", attrs=["concealed"])

    @staticmethod
    def MENTION(token):
        """Style `token` as part of a mention span (yellow background)."""
        return colored(token, on_color="on_yellow")

    @staticmethod
    def highlight_mention(mention):
        """
        Render the sentence containing `mention` with the mention highlighted.

        Styles are layered in this order: the span of a text-bound mention
        and every argument span are styled with `ARG`, the trigger span
        (if any) with `TRIGGER`, and finally every token inside the mention
        span is additionally wrapped with `MENTION`.

        Parameters
        ----------
        mention : Mention
            Must expose `sentenceObj.words`, `type`, `start`, `end`,
            `arguments` and `trigger`.

        Returns
        -------
        str
            The sentence's words joined by spaces (spaces inside the mention
            span are themselves styled with `MENTION`), stripped of leading
            and trailing whitespace.
        """
        # work on a copy so the sentence's own word list is left untouched
        tokens = mention.sentenceObj.words[:]

        def paint(style, first, last):
            # apply `style` in place to tokens[first:last]
            for idx in range(first, last):
                tokens[idx] = style(tokens[idx])

        first, last = mention.start, mention.end

        # a text-bound mention's span is formatted like an argument
        if mention.type == "TextBoundMention":
            paint(OdinHighlighter.ARG, first, last)

        if mention.arguments:
            for role_args in mention.arguments.values():
                for arg in role_args:
                    paint(OdinHighlighter.ARG, arg.start, arg.end)

        # the trigger is formatted distinctly from the arguments
        if mention.trigger:
            trig = mention.trigger
            paint(OdinHighlighter.TRIGGER, trig.start, trig.end)

        # mark every token contained in the mention span ...
        paint(OdinHighlighter.MENTION, first, last)
        # ... and the spaces separating them
        highlighted = OdinHighlighter.MENTION(" ").join(tokens[first:last])

        before = " ".join(tokens[:first])
        after = " ".join(tokens[last:])
        return " ".join([before, highlighted, after]).strip()
61
62
class Mention(NLPDatum):
    """
    A labeled span of text.  Used to model textual mentions of events, relations, and entities.

    Parameters
    ----------
    token_interval : Interval
        The span of the Mention represented as an Interval.
    sentence : int
        The sentence index that contains the Mention.
    document : Document
        The Document in which the Mention was found.
    foundBy : str
        The Odin IE rule that produced this Mention.
    label : str
        The label most closely associated with this span.  Usually the lowest hyponym of "labels".
    labels: list
        The list of labels associated with this span.  Defaults to `[label]` when empty or None.
    trigger: dict or None
        dict of JSON for Mention's trigger (event predicate or word(s) signaling the Mention).
    arguments: dict or None
        dict of JSON for Mention's arguments (role -> list of mention JSON dicts).
    paths: dict or None
        dict of JSON encoding the syntactic paths linking a Mention's arguments to its trigger (applies to Mentions produced from `type:"dependency"` rules).
    keep: bool
        Stored unchanged and written back out by `to_JSON_dict` (default True).
    doc_id: str or None
        the id of the document.  Falls back to `hash(document)` when falsy.

    Attributes
    ----------
    tokenInterval: processors.ds.Interval
        An `Interval` encoding the `start` and `end` of the `Mention`.
    start : int
        The token index that starts the `Mention`.
    end : int
        The token index that marks the end of the Mention (exclusive).
    sentenceObj : processors.ds.Sentence
        Pointer to the `Sentence` instance containing the `Mention`.
    text : str
        The words of the span joined by single spaces.
    characterStartOffset: int
        The index of the character that starts the `Mention`.
    characterEndOffset: int
        The index of the character that ends the `Mention`.
    trigger : Mention or None
        The trigger, deserialized from the `trigger` JSON.
    arguments : dict or None
        role -> list of `Mention`, deserialized from the `arguments` JSON.
    id : str or None
        None until assigned (`load_from_JSON` sets it from the JSON).
    type: Mention.TBM or Mention.EM or Mention.RM
        The type of the `Mention`.

    See Also
    --------

    [`Odin` manual](https://arxiv.org/abs/1509.07513)

    Methods
    -------
    matches(label_pattern)
        Test if the provided pattern, `label_pattern`, matches any element in `Mention.labels`.

    overlaps(other)
        Test whether other (token index or Mention) overlaps with span of this Mention.

    copy(**kwargs)
        Copy constructor for this Mention.

    words()
        Words for this Mention's span.

    tags()
        Part of speech for this Mention's span.

    lemmas()
        Lemmas for this Mention's span.

    _chunks()
        chunk labels for this Mention's span.

    _entities()
        NE labels for this Mention's span.
    """

    TBM = "TextBoundMention"
    EM = "EventMention"
    RM = "RelationMention"

    def __init__(self,
                token_interval,
                sentence,
                document,
                foundBy,
                label,
                labels=None,
                trigger=None,
                arguments=None,
                paths=None,
                keep=True,
                doc_id=None):

        NLPDatum.__init__(self)
        self.label = label
        self.labels = labels if labels else [self.label]
        self.tokenInterval = token_interval
        self.start = self.tokenInterval.start
        self.end = self.tokenInterval.end
        self.document = document
        self._doc_id = doc_id or hash(self.document)
        self.sentence = sentence
        # trigger and arguments are resolved against this mention's own document
        doc_map = self._to_document_map()
        if trigger:
            # NOTE: doc id is not stored for trigger's json,
            # as it is assumed to be contained in the same document as its parent.
            # Work on a shallow copy so the caller's JSON dict is not mutated.
            trigger = dict(trigger)
            trigger["document"] = self._doc_id
            self.trigger = Mention.load_from_JSON(trigger, doc_map)
        else:
            self.trigger = None
        # unpack args
        if arguments:
            self.arguments = {
                role: [Mention.load_from_JSON(a, doc_map) for a in args]
                for (role, args) in arguments.items()
            }
        else:
            self.arguments = None
        self.paths = paths
        self.keep = keep
        self.foundBy = foundBy
        # other
        self.sentenceObj = self.document.sentences[self.sentence]
        self.text = " ".join(self.sentenceObj.words[self.start:self.end])
        # recover offsets (end is exclusive, so the last token is at end - 1)
        self.characterStartOffset = self.sentenceObj.startOffsets[self.tokenInterval.start]
        self.characterEndOffset = self.sentenceObj.endOffsets[self.tokenInterval.end - 1]
        # for later recovery
        self.id = None
        self.type = self._set_type()

    def __str__(self):
        """Return the highlighted label followed by the highlighted sentence."""
        return "{}: {}".format(OdinHighlighter.LABEL(self.label), OdinHighlighter.highlight_mention(self))

    def __eq__(self, other):
        """Two mentions are equal when they share a class and all instance attributes."""
        if isinstance(other, self.__class__):
            return self.__dict__ == other.__dict__
        else:
            return False

    def __ne__(self, other):
        # explicit for Python 2, where __ne__ is not derived from __eq__
        return not self.__eq__(other)

    def __hash__(self):
        # to_JSON is not defined in this class; presumably inherited from NLPDatum
        return hash(self.to_JSON())

    def to_JSON_dict(self):
        """
        Serialize this Mention to a JSON-compatible dict.

        `trigger`, `arguments` and `paths` are only included when set.
        `paths` is emitted exactly as stored.
        """
        m = dict()
        m["id"] = self.id
        m["type"] = self.type
        m["label"] = self.label
        m["labels"] = self.labels
        m["tokenInterval"] = self.tokenInterval.to_JSON_dict()
        m["characterStartOffset"] = self.characterStartOffset
        m["characterEndOffset"] = self.characterEndOffset
        m["sentence"] = self.sentence
        m["document"] = self._doc_id
        # do we have a trigger?
        if self.trigger:
            m["trigger"] = self.trigger.to_JSON_dict()
        # do we have arguments?
        if self.arguments:
            m["arguments"] = self._arguments_to_JSON_dict()
        # handle paths
        if self.paths:
            m["paths"] = self.paths
        m["keep"] = self.keep
        m["foundBy"] = self.foundBy
        return m

    def startOffset(self):
        """Character offset at which the first token of the span starts."""
        # fixed: previously indexed endOffsets, yielding the END of the first token
        return self.sentenceObj.startOffsets[self.start]

    def endOffset(self):
        """Character offset at which the last token of the span ends."""
        return self.sentenceObj.endOffsets[self.end - 1]

    def words(self):
        """Words for this Mention's span."""
        return self.sentenceObj.words[self.start:self.end]

    def tags(self):
        """Part of speech tags for this Mention's span."""
        return self.sentenceObj.tags[self.start:self.end]

    def lemmas(self):
        """Lemmas for this Mention's span."""
        return self.sentenceObj.lemmas[self.start:self.end]

    def _chunks(self):
        """Chunk labels for this Mention's span."""
        return self.sentenceObj._chunks[self.start:self.end]

    def _entities(self):
        """NE labels for this Mention's span."""
        return self.sentenceObj._entities[self.start:self.end]

    def copy(self, **kwargs):
        """
        Copy constructor for mention.

        Any constructor parameter may be overridden by keyword.  `trigger`
        and `arguments` overrides must be JSON dicts, as expected by
        `__init__`.  The new instance's `id` is None (set by `__init__`).
        """
        # The constructor deserializes trigger/arguments from JSON, so the
        # already-built Mention objects must be serialized again; passing
        # them through directly fails inside __init__.
        if "trigger" in kwargs:
            trigger = kwargs["trigger"]
        else:
            trigger = self.trigger.to_JSON_dict() if self.trigger else None
        if "arguments" in kwargs:
            arguments = kwargs["arguments"]
        else:
            arguments = self._arguments_to_JSON_dict() if self.arguments else None
        # return new instance
        return self.__class__(
            label=kwargs.get("label", self.label),
            # fixed: previously looked up "label", so overriding the label
            # replaced the labels list with a string
            labels=kwargs.get("labels", list(self.labels)),
            token_interval=kwargs.get("token_interval", self.tokenInterval),
            sentence=kwargs.get("sentence", self.sentence), # NOTE: this is the sentence idx
            document=kwargs.get("document", self.document),
            foundBy=kwargs.get("foundBy", self.foundBy),
            trigger=trigger,
            arguments=arguments,
            paths=kwargs.get("paths", self.paths),
            keep=kwargs.get("keep", self.keep),
            doc_id=kwargs.get("doc_id", self._doc_id)
        )

    def overlaps(self, other):
        """
        Checks for overlap.

        Parameters
        ----------
        other : int or Mention
            A token index, or another Mention.

        Returns
        -------
        bool
            For an int: True if the index falls inside [start, end).
            For a Mention: True if both have the same sentence index and
            their token spans intersect (documents are not compared).
            False for any other type.
        """
        if isinstance(other, int):
            return self.start <= other < self.end
        elif isinstance(other, Mention):
            # equiv. sentences + checks on start and end
            return (self.sentence == other.sentence) and \
            ((other.start <= self.start < other.end) or (self.start <= other.start < self.end))
        else:
            return False

    def matches(self, label_pattern):
        """
        Test if the provided pattern, `label_pattern`, matches any element in `Mention.labels`.

        Parameters
        ----------
        label_pattern : str or _sre.SRE_Pattern
            The pattern to match against each element in `Mention.labels`

        Returns
        -------
        bool
            True if `label_pattern` matches any element in `Mention.labels`
        """
        # re.match anchors at the start of each label only
        return any(re.match(label_pattern, label) for label in self.labels)

    def _arguments_to_JSON_dict(self):
        """Serialize `self.arguments` as role -> list of mention JSON dicts."""
        return dict((role, [a.to_JSON_dict() for a in args]) for (role, args) in self.arguments.items())

    def _paths_to_JSON_dict(self):
        """
        Serialize `self.paths` as role -> path JSON.

        Not called by `to_JSON_dict`, which emits `self.paths` as stored.
        NOTE(review): assumes each value exposes `to_JSON_dict()`; confirm
        against how `paths` is populated by callers.
        """
        # fixed: iterate (role, paths) pairs; iterating the dict itself yields keys only
        return {role: paths.to_JSON_dict() for (role, paths) in self.paths.items()}

    @staticmethod
    def load_from_JSON(mjson, docs_dict):
        """
        Build a Mention from its JSON dict.

        Parameters
        ----------
        mjson : dict
            Mention JSON.  Must contain "document", "labels", "tokenInterval",
            "sentence", "foundBy", "id", "characterStartOffset" and
            "characterEndOffset"; "label", "trigger", "arguments", "paths"
            and "keep" are optional.
        docs_dict : dict
            Maps document ids to the documents they identify.

        Returns
        -------
        Mention

        Raises
        ------
        KeyError
            If a required key is missing from `mjson`, or the document id is
            not in `docs_dict`.
        """
        # recover document
        doc_id = mjson["document"]
        doc = docs_dict[doc_id]
        labels = mjson["labels"]
        kwargs = {
            "label": mjson.get("label", labels[0]),
            "labels": labels,
            "token_interval": Interval.load_from_JSON(mjson["tokenInterval"]),
            "sentence": mjson["sentence"],
            "document": doc,
            "doc_id": doc_id,
            "trigger": mjson.get("trigger", None),
            "arguments": mjson.get("arguments", None),
            "paths": mjson.get("paths", None),
            "keep": mjson.get("keep", True),
            "foundBy": mjson["foundBy"]
        }
        m = Mention(**kwargs)
        # set IDs
        m.id = mjson["id"]
        m._doc_id = doc_id
        # set character offsets on the attributes the rest of the class reads
        # (fixed: previously only the snake_case names below were assigned)
        m.characterStartOffset = mjson["characterStartOffset"]
        m.characterEndOffset = mjson["characterEndOffset"]
        # legacy aliases, kept for callers that read the old attribute names
        m.character_start_offset = m.characterStartOffset
        m.character_end_offset = m.characterEndOffset
        return m

    def _to_document_map(self):
        """Return the single-entry doc id -> document map used to load nested mentions."""
        return {self._doc_id: self.document}

    def _set_type(self):
        """Derive the mention type from the presence of a trigger and arguments."""
        # event mention
        if self.trigger is not None:
            return Mention.EM
        # textbound mention: neither trigger nor arguments
        if self.arguments is None:
            return Mention.TBM
        # relation mention: arguments without a trigger
        return Mention.RM
340