Completed
Push — master ( b98938...d37950 )
by Gus
01:14
created

Mention.__hash__()   A

Complexity

Conditions 1

Size

Total Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
dl 0
loc 2
rs 10
c 1
b 0
f 0
1
#!/usr/bin/env python
2
# -*- coding: utf-8 -*-
3
from __future__ import unicode_literals
4
from .utils import post_json
5
from .ds import Document, Interval
6
from .visualization import OdinHighlighter
7
import re
8
import json
9
10
11
class Mention(object):
    """
    A labeled span of text.  Used to model textual mentions of events, relations, and entities.

    Parameters
    ----------
    token_interval : Interval
        The span of the Mention represented as an Interval.
    sentence : int
        The sentence index that contains the Mention.
    document : Document
        The Document in which the Mention was found.
    foundBy : str
        The Odin IE rule that produced this Mention.
    label : str
        The label most closely associated with this span.  Usually the lowest hyponym of "labels".
    labels: list
        The list of labels associated with this span.  Defaults to `[label]` when empty or omitted.
    trigger: dict or None
        dict of JSON for Mention's trigger (event predicate or word(s) signaling the Mention).
    arguments: dict or None
        dict of JSON for Mention's arguments.
    paths: dict or None
        dict of JSON encoding the syntactic paths linking a Mention's arguments to its trigger (applies to Mentions produced from `type:"dependency"` rules).
    keep: bool
        Flag stored on the Mention as-is and serialized under the "keep" key.
    doc_id: str or None
        the id of the document.  When falsy, `hash(document)` is used instead.

    Attributes
    ----------
    tokenInterval: processors.ds.Interval
        An `Interval` encoding the `start` and `end` of the `Mention`.
    start : int
        The token index that starts the `Mention`.
    end : int
        The token index that marks the end of the Mention (exclusive).
    sentenceObj : processors.ds.Sentence
        Pointer to the `Sentence` instance containing the `Mention`.
    characterStartOffset: int
        The index of the character that starts the `Mention`.
    characterEndOffset: int
        The index of the character that ends the `Mention`.
    type: Mention.TBM or Mention.EM or Mention.RM
        The type of the `Mention`.

    See Also
    --------

    [`Odin` manual](https://arxiv.org/abs/1509.07513)

    Methods
    -------
    matches(label_pattern)
        Test if the provided pattern, `label_pattern`, matches any element in `Mention.labels`.

    """

    TBM = "TextBoundMention"
    EM = "EventMention"
    RM = "RelationMention"

    def __init__(self,
                 token_interval,
                 sentence,
                 document,
                 foundBy,
                 label,
                 labels=None,
                 trigger=None,
                 arguments=None,
                 paths=None,
                 keep=True,
                 doc_id=None):

        self.label = label
        self.labels = labels if labels else [self.label]
        self.tokenInterval = token_interval
        self.start = self.tokenInterval.start
        self.end = self.tokenInterval.end
        self.document = document
        self._doc_id = doc_id or hash(self.document)
        self.sentence = sentence
        if trigger:
            # NOTE: doc id is not stored for trigger's json,
            # as it is assumed to be contained in the same document as its parent.
            # Inject it into a shallow copy so the caller's dict is left untouched.
            trigger_json = dict(trigger)
            trigger_json["document"] = self._doc_id
            self.trigger = Mention.load_from_JSON(trigger_json, self._to_document_map())
        else:
            self.trigger = None
        # unpack args: each role maps to a list of Mentions rebuilt from their JSON
        if arguments:
            docs = self._to_document_map()
            self.arguments = {
                role: [Mention.load_from_JSON(a, docs) for a in args]
                for (role, args) in arguments.items()
            }
        else:
            self.arguments = None
        self.paths = paths
        self.keep = keep
        self.foundBy = foundBy
        # other
        self.sentenceObj = self.document.sentences[self.sentence]
        self.text = " ".join(self.sentenceObj.words[self.start:self.end])
        # recover offsets (the end token index is exclusive, hence the -1)
        self.characterStartOffset = self.sentenceObj.startOffsets[self.tokenInterval.start]
        self.characterEndOffset = self.sentenceObj.endOffsets[self.tokenInterval.end - 1]
        # for later recovery
        self.id = None
        self.type = self._set_type()

    def __str__(self):
        """Return the highlighted label followed by the highlighted mention text."""
        return "{}: {}".format(OdinHighlighter.LABEL(self.label), OdinHighlighter.highlight_mention(self))

    def __eq__(self, other):
        """Two Mentions are equal when they share a class and all instance attributes."""
        if isinstance(other, self.__class__):
            return self.__dict__ == other.__dict__
        else:
            return False

    def __ne__(self, other):
        # Python 2 does not derive __ne__ from __eq__, so define it explicitly.
        return not self.__eq__(other)

    def __hash__(self):
        """Hash the Mention through its (key-sorted) JSON string."""
        return hash(self.to_JSON())

    def to_JSON_dict(self):
        """
        Build a JSON-serializable dict describing this Mention.

        Returns
        -------
        dict
            The "trigger", "arguments" and "paths" keys are only present when
            the corresponding attribute is truthy.
        """
        m = dict()
        m["id"] = self.id
        m["type"] = self.type
        m["label"] = self.label
        m["labels"] = self.labels
        m["tokenInterval"] = self.tokenInterval.to_JSON_dict()
        m["characterStartOffset"] = self.characterStartOffset
        m["characterEndOffset"] = self.characterEndOffset
        m["sentence"] = self.sentence
        m["document"] = self._doc_id
        # do we have a trigger?
        if self.trigger:
            m["trigger"] = self.trigger.to_JSON_dict()
        # do we have arguments?
        if self.arguments:
            m["arguments"] = self._arguments_to_JSON_dict()
        # handle paths
        if self.paths:
            m["paths"] = self.paths
        m["keep"] = self.keep
        m["foundBy"] = self.foundBy
        return m

    def matches(self, label_pattern):
        """
        Test if the provided pattern, `label_pattern`, matches any element in `Mention.labels`.

        Parameters
        ----------
        label_pattern : str or _sre.SRE_Pattern
            The pattern to match against each element in `Mention.labels`

        Returns
        -------
        bool
            True if `label_pattern` matches any element in `Mention.labels`
        """
        return any(re.match(label_pattern, label) for label in self.labels)

    def to_JSON(self):
        """Serialize the Mention to a pretty-printed JSON string with sorted keys."""
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=4)

    def _arguments_to_JSON_dict(self):
        """Map each argument role to the list of its Mentions' JSON dicts."""
        return {role: [a.to_JSON_dict() for a in args] for (role, args) in self.arguments.items()}

    def _paths_to_JSON_dict(self):
        """
        Map each role in `self.paths` to the JSON dict of its path object.

        Returns an empty dict when there are no paths.  Each value is expected
        to expose a `to_JSON_dict()` method.
        """
        # Iterate over (role, path) pairs; iterating the dict itself yields only keys.
        return {role: path.to_JSON_dict() for (role, path) in (self.paths or {}).items()}

    @staticmethod
    def load_from_JSON(mjson, docs_dict):
        """
        Rebuild a Mention from its JSON dict.

        Parameters
        ----------
        mjson : dict
            JSON for the Mention.  Must contain "document", "labels",
            "tokenInterval", "sentence", "foundBy", "id",
            "characterStartOffset" and "characterEndOffset".
        docs_dict : dict
            Maps document ids to the documents they identify.

        Returns
        -------
        Mention

        Raises
        ------
        KeyError
            If a required key is missing from `mjson`, or the document id is
            not present in `docs_dict`.
        """
        # recover document
        doc_id = mjson["document"]
        doc = docs_dict[doc_id]
        labels = mjson["labels"]
        kwargs = {
            "label": mjson.get("label", labels[0]),
            "labels": labels,
            "token_interval": Interval.load_from_JSON(mjson["tokenInterval"]),
            "sentence": mjson["sentence"],
            "document": doc,
            "doc_id": doc_id,
            "trigger": mjson.get("trigger", None),
            "arguments": mjson.get("arguments", None),
            "paths": mjson.get("paths", None),
            "keep": mjson.get("keep", True),
            "foundBy": mjson["foundBy"]
        }
        m = Mention(**kwargs)
        # set IDs
        m.id = mjson["id"]
        m._doc_id = doc_id
        # set character offsets on the attributes the rest of the class reads
        start_offset = mjson["characterStartOffset"]
        end_offset = mjson["characterEndOffset"]
        m.characterStartOffset = start_offset
        m.characterEndOffset = end_offset
        # snake_case aliases kept for backward compatibility with existing callers
        m.character_start_offset = start_offset
        m.character_end_offset = end_offset
        return m

    def _to_document_map(self):
        """Return a single-entry dict mapping this Mention's doc id to its document."""
        return {self._doc_id: self.document}

    def _set_type(self):
        """Derive the Mention type from the presence of a trigger and arguments."""
        # event mention
        if self.trigger is not None:
            return Mention.EM
        # textbound mention
        elif self.arguments is None:
            return Mention.TBM
        # relation mention: arguments but no trigger
        else:
            return Mention.RM
218