Mention.words() - Code Metrics - Inspection of "Merge pull request #13 from myedibleenso/openie" - myedibleenso/py-processors - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 4c1319...a3584a )

by Gus

created 2018-02-21 05:57 UTC

Mention.words() A

↳ Parent: Mention

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Importance

Changes	1
Bugs	0	Features	0

Metric	Value
cc	1
c	1
b	0
f	0
dl	0
loc	2
rs	10

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from .utils import post_json
from .ds import Document, Interval, NLPDatum
from termcolor import colored
import re
import json


class OdinHighlighter(object):
    """
    Namespace of static helpers that wrap tokens with `termcolor.colored`
    so that a mention can be pretty-printed.
    """

    @staticmethod
    def LABEL(token):
        """Return `token` colored red and bold."""
        styled = colored(token, color="red", attrs=["bold"])
        return styled

    @staticmethod
    def ARG(token):
        """Return `token` in bold on a green background."""
        styled = colored(token, on_color="on_green", attrs=["bold"])
        return styled

    @staticmethod
    def TRIGGER(token):
        """Return `token` in bold on a blue background."""
        styled = colored(token, on_color="on_blue", attrs=["bold"])
        return styled

    @staticmethod
    def CONCEAL(token):
        """Return `token` with the "concealed" attribute on a grey background."""
        styled = colored(token, on_color="on_grey", attrs=["concealed"])
        return styled

    @staticmethod
    def MENTION(token):
        """Return `token` on a yellow background."""
        styled = colored(token, on_color="on_yellow")
        return styled

    @staticmethod
    def highlight_mention(mention):
        """
        Build a highlighted rendering of the sentence containing `mention`.

        Styles are layered in a fixed order: argument styling (the whole span
        for a TextBoundMention, plus every argument's span), then trigger
        styling, then mention styling over the mention's own span.

        Parameters
        ----------
        mention : Mention
            Read for `sentenceObj.words`, `type`, `start`, `end`,
            `arguments` and `trigger`.

        Returns
        -------
        str
            The sentence's tokens joined by spaces (spaces inside the mention
            span are highlighted as well), stripped of surrounding whitespace.
        """
        # work on a copy so the sentence's own token list is left untouched
        tokens = mention.sentenceObj.words[:]

        def restyle(first, last, style):
            # wrap every token in [first, last) with the given style function
            for idx in range(first, last):
                tokens[idx] = style(tokens[idx])

        span_start = mention.start
        span_end = mention.end

        # a TBM has no arguments of its own, so its span is styled like one
        if mention.type == "TextBoundMention":
            restyle(span_start, span_end, OdinHighlighter.ARG)
        if mention.arguments:
            for role_args in mention.arguments.values():
                for arg in role_args:
                    restyle(arg.start, arg.end, OdinHighlighter.ARG)

        # the trigger gets its own style, layered on top of any arg styling
        trigger = mention.trigger
        if trigger:
            restyle(trigger.start, trigger.end, OdinHighlighter.TRIGGER)

        # finally mark every token that falls inside the mention's span
        restyle(span_start, span_end, OdinHighlighter.MENTION)

        # spaces between tokens of the mention span are highlighted too
        highlighted_space = OdinHighlighter.MENTION(" ")
        pieces = [
            " ".join(tokens[:span_start]),
            highlighted_space.join(tokens[span_start:span_end]),
            " ".join(tokens[span_end:]),
        ]
        return " ".join(pieces).strip()

class Mention(NLPDatum):
    """
    A labeled span of text.  Used to model textual mentions of events, relations, and entities.

    Parameters
    ----------
    token_interval : Interval
        The span of the Mention represented as an Interval.
    sentence : int
        The sentence index that contains the Mention.
    document : Document
        The Document in which the Mention was found.
    foundBy : str
        The Odin IE rule that produced this Mention.
    label : str
        The label most closely associated with this span.  Usually the lowest hyponym of "labels".
    labels: list
        The list of labels associated with this span.
    trigger: dict or None
        dict of JSON for Mention's trigger (event predicate or word(s) signaling the Mention).
    arguments: dict or None
        dict of JSON for Mention's arguments.
    paths: dict or None
        dict of JSON encoding the syntactic paths linking a Mention's arguments to its trigger (applies to Mentions produced from `type:"dependency"` rules).
    keep: bool
        Flag stored on the Mention and round-tripped through its JSON (defaults to True).
    doc_id: str or None
        the id of the document

    Attributes
    ----------
    tokenInterval: processors.ds.Interval
        An `Interval` encoding the `start` and `end` of the `Mention`.
    start : int
        The token index that starts the `Mention`.
    end : int
        The token index that marks the end of the Mention (exclusive).
    sentenceObj : processors.ds.Sentence
        Pointer to the `Sentence` instance containing the `Mention`.
    characterStartOffset: int
        The index of the character that starts the `Mention`.
    characterEndOffset: int
        The index of the character that ends the `Mention`.
    type: Mention.TBM or Mention.EM or Mention.RM
        The type of the `Mention`.

    See Also
    --------

    [`Odin` manual](https://arxiv.org/abs/1509.07513)

    Methods
    -------
    matches(label_pattern)
        Test if the provided pattern, `label_pattern`, matches any element in `Mention.labels`.

    overlaps(other)
        Test whether other (token index or Mention) overlaps with span of this Mention.

    copy(**kwargs)
        Copy constructor for this Mention.

    startOffset()
        Character offset at which this Mention's first token starts.

    endOffset()
        Character offset at which this Mention's last token ends.

    words()
        Words for this Mention's span.

    tags()
        Part of speech for this Mention's span.

    lemmas()
        Lemmas for this Mention's span.

    _chunks()
        chunk labels for this Mention's span.

    _entities()
        NE labels for this Mention's span.
    """

    TBM = "TextBoundMention"
    EM = "EventMention"
    RM = "RelationMention"

    def __init__(self,
                token_interval,
                sentence,
                document,
                foundBy,
                label,
                labels=None,
                trigger=None,
                arguments=None,
                paths=None,
                keep=True,
                doc_id=None):

        NLPDatum.__init__(self)
        self.label = label
        self.labels = labels if labels else [self.label]
        self.tokenInterval = token_interval
        self.start = self.tokenInterval.start
        self.end = self.tokenInterval.end
        self.document = document
        # fall back to the document's hash when no explicit id was supplied
        self._doc_id = doc_id or hash(self.document)
        self.sentence = sentence
        if trigger:
            # NOTE: doc id is not stored for trigger's json,
            # as it is assumed to be contained in the same document as its parent.
            # Work on a shallow copy so the caller's dict is not mutated.
            trigger_json = dict(trigger)
            trigger_json["document"] = self._doc_id
            self.trigger = Mention.load_from_JSON(trigger_json, self._to_document_map())
        else:
            self.trigger = None
        # unpack args: each role maps to a list of Mentions built from JSON
        if arguments:
            doc_map = self._to_document_map()
            self.arguments = {
                role: [Mention.load_from_JSON(a, doc_map) for a in args]
                for (role, args) in arguments.items()
            }
        else:
            self.arguments = None
        self.paths = paths
        self.keep = keep
        self.foundBy = foundBy
        # other
        self.sentenceObj = self.document.sentences[self.sentence]
        self.text = " ".join(self.sentenceObj.words[self.start:self.end])
        # recover offsets
        self.characterStartOffset = self.sentenceObj.startOffsets[self.tokenInterval.start]
        self.characterEndOffset = self.sentenceObj.endOffsets[self.tokenInterval.end - 1]
        # for later recovery
        self.id = None
        self.type = self._set_type()

    def __str__(self):
        return "{}: {}".format(OdinHighlighter.LABEL(self.label), OdinHighlighter.highlight_mention(self))

    def __eq__(self, other):
        # two Mentions are equal when they are of the same class
        # and all of their instance attributes are equal
        if isinstance(other, self.__class__):
            return self.__dict__ == other.__dict__
        else:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        return hash(self.to_JSON())

    def to_JSON_dict(self):
        """
        Build a JSON-serializable dict for this Mention.

        The "trigger", "arguments" and "paths" keys are only present
        when the corresponding attribute is non-empty.
        """
        m = dict()
        m["id"] = self.id
        m["type"] = self.type
        m["label"] = self.label
        m["labels"] = self.labels
        m["tokenInterval"] = self.tokenInterval.to_JSON_dict()
        m["characterStartOffset"] = self.characterStartOffset
        m["characterEndOffset"] = self.characterEndOffset
        m["sentence"] = self.sentence
        m["document"] = self._doc_id
        # do we have a trigger?
        if self.trigger:
            m["trigger"] = self.trigger.to_JSON_dict()
        # do we have arguments?
        if self.arguments:
            m["arguments"] = self._arguments_to_JSON_dict()
        # handle paths
        if self.paths:
            m["paths"] = self.paths
        m["keep"] = self.keep
        m["foundBy"] = self.foundBy
        return m

    def startOffset(self):
        """
        Character offset at which the first token of this Mention starts.
        """
        # NOTE: reads startOffsets (not endOffsets), mirroring how
        # characterStartOffset is recovered in __init__
        return self.sentenceObj.startOffsets[self.start]

    def endOffset(self):
        """
        Character offset at which the last token of this Mention ends.
        """
        # `end` is exclusive, so the last token of the span is at end - 1
        return self.sentenceObj.endOffsets[self.end - 1]

    def words(self):
        """
        Words for this Mention's span.
        """
        return self.sentenceObj.words[self.start:self.end]

    def tags(self):
        """
        Part of speech tags for this Mention's span.
        """
        return self.sentenceObj.tags[self.start:self.end]

    def lemmas(self):
        """
        Lemmas for this Mention's span.
        """
        return self.sentenceObj.lemmas[self.start:self.end]

    def _chunks(self):
        """
        Chunk labels for this Mention's span.
        """
        return self.sentenceObj._chunks[self.start:self.end]

    def _entities(self):
        """
        NE labels for this Mention's span.
        """
        return self.sentenceObj._entities[self.start:self.end]

    def copy(self, **kwargs):
        """
        Copy constructor for mention.

        Any constructor parameter (`label`, `labels`, `token_interval`,
        `sentence`, `document`, `foundBy`, `trigger`, `arguments`, `paths`,
        `keep`, `doc_id`) may be overridden through `kwargs`; everything else
        is taken from this Mention.  Overrides for `trigger` and `arguments`
        must be given as JSON dicts, exactly as for the constructor.

        NOTE: the `id` of this Mention is not carried over to the copy.
        """
        # The constructor expects JSON for trigger and arguments, whereas this
        # instance holds already-built Mentions, so they are serialized again
        # before being handed over.
        if "trigger" in kwargs:
            trigger = kwargs["trigger"]
        elif self.trigger is not None:
            trigger = self.trigger.to_JSON_dict()
        else:
            trigger = None

        if "arguments" in kwargs:
            arguments = kwargs["arguments"]
        elif self.arguments:
            arguments = self._arguments_to_JSON_dict()
        else:
            arguments = None

        # return new instance
        return self.__class__(
            label=kwargs.get("label", self.label),
            labels=kwargs.get("labels", self.labels),
            token_interval=kwargs.get("token_interval", self.tokenInterval),
            sentence=kwargs.get("sentence", self.sentence), # NOTE: this is the sentence idx
            document=kwargs.get("document", self.document),
            foundBy=kwargs.get("foundBy", self.foundBy),
            trigger=trigger,
            arguments=arguments,
            paths=kwargs.get("paths", self.paths),
            keep=kwargs.get("keep", self.keep),
            doc_id=kwargs.get("doc_id", self._doc_id)
        )

    def overlaps(self, other):
        """
        Checks for overlap.

        Parameters
        ----------
        other : int or Mention
            A token index, or another Mention.

        Returns
        -------
        bool
            For an int: True if the index falls within [start, end).
            For a Mention: True if both have the same sentence index and their
            token spans intersect.  Any other type yields False.
        """
        if isinstance(other, int):
            return self.start <= other < self.end
        elif isinstance(other, Mention):
            # equiv. sentences + checks on start and end
            same_sentence = hash(self.sentence) == hash(other.sentence)
            return same_sentence and \
            ((other.start <= self.start < other.end) or (self.start <= other.start < self.end))
        else:
            return False

    def matches(self, label_pattern):
        """
        Test if the provided pattern, `label_pattern`, matches any element in `Mention.labels`.

        Parameters
        ----------
        label_pattern : str or _sre.SRE_Pattern
            The pattern to match against each element in `Mention.labels`

        Returns
        -------
        bool
            True if `label_pattern` matches any element in `Mention.labels`
        """
        return any(re.match(label_pattern, label) for label in self.labels)

    def _arguments_to_JSON_dict(self):
        """
        Serialize `arguments` as a dict of role -> list of Mention JSON dicts.
        """
        return dict((role, [a.to_JSON_dict() for a in args]) for (role, args) in self.arguments.items())

    def _paths_to_JSON_dict(self):
        """
        Serialize `paths` as a dict keyed by role.

        Values exposing `to_JSON_dict()` are serialized through it; values that
        are already plain JSON (which is how `paths` is described in the class
        docstring) are passed through unchanged.  Returns an empty dict when
        there are no paths.
        """
        if not self.paths:
            return {}
        return {
            role: (path.to_JSON_dict() if hasattr(path, "to_JSON_dict") else path)
            for (role, path) in self.paths.items()
        }

    @staticmethod
    def load_from_JSON(mjson, docs_dict):
        """
        Build a Mention from its JSON representation.

        Parameters
        ----------
        mjson : dict
            JSON for the Mention.  Must contain "document", "labels",
            "tokenInterval", "sentence", "foundBy", "id",
            "characterStartOffset" and "characterEndOffset".
        docs_dict : dict
            Maps document ids to Document instances; must contain an entry for
            mjson["document"].

        Returns
        -------
        Mention
        """
        # recover document
        doc_id = mjson["document"]
        doc = docs_dict[doc_id]
        labels = mjson["labels"]
        kwargs = {
            "label": mjson.get("label", labels[0]),
            "labels": labels,
            "token_interval": Interval.load_from_JSON(mjson["tokenInterval"]),
            "sentence": mjson["sentence"],
            "document": doc,
            "doc_id": doc_id,
            "trigger": mjson.get("trigger", None),
            "arguments": mjson.get("arguments", None),
            "paths": mjson.get("paths", None),
            "keep": mjson.get("keep", True),
            "foundBy": mjson["foundBy"]
        }
        m = Mention(**kwargs)
        # set IDs
        m.id = mjson["id"]
        m._doc_id = doc_id
        # set character offsets on the attributes the rest of the class reads
        # (characterStartOffset / characterEndOffset)
        m.characterStartOffset = mjson["characterStartOffset"]
        m.characterEndOffset = mjson["characterEndOffset"]
        # legacy snake_case aliases, kept for backward compatibility
        m.character_start_offset = m.characterStartOffset
        m.character_end_offset = m.characterEndOffset
        return m

    def _to_document_map(self):
        """
        Single-entry map of this Mention's document id to its Document.
        """
        return {self._doc_id: self.document}

    def _set_type(self):
        """
        Determine the type of this Mention from its trigger and arguments.
        """
        # event mention
        if self.trigger is not None:
            return Mention.EM
        # textbound mention (no trigger and no arguments)
        elif self.arguments is None:
            return Mention.TBM
        # relation mention (arguments without a trigger)
        else:
            return Mention.RM


1			#!/usr/bin/env python
2			# -- coding: utf-8 --
3			from __future__ import unicode_literals
4			from .utils import post_json
5			from .ds import Document, Interval, NLPDatum
6			from termcolor import colored
7			import re
8			import json
9
10
11			class OdinHighlighter(object):
12
13			@staticmethod
14			def LABEL(token):
15			return colored(token, color="red", attrs=["bold"])
16
17			@staticmethod
18			def ARG(token):
19			return colored(token, on_color="on_green", attrs=["bold"])
20
21			@staticmethod
22			def TRIGGER(token):
23			return colored(token, on_color="on_blue", attrs=["bold"])
24
25			@staticmethod
26			def CONCEAL(token):
27			return colored(token, on_color="on_grey", attrs=["concealed"])
28
29			@staticmethod
30			def MENTION(token):
31			return colored(token, on_color="on_yellow")
32
33			@staticmethod
34			def highlight_mention(mention):
35			"""
36			Formats text of mention
37			"""
38			text_span = mention.sentenceObj.words[:]
39			# format TBM span like an arg
40			if mention.type == "TextBoundMention":
41			for i in range(mention.start, mention.end):
42			text_span[i] = OdinHighlighter.ARG(text_span[i])
43			if mention.arguments:
44			for (role, args) in mention.arguments.items():
45			for arg in args:
46			for i in range(arg.start, arg.end):
47			text_span[i] = OdinHighlighter.ARG(text_span[i])
48			# format trigger distinctly from args
49			if mention.trigger:
50			trigger = mention.trigger
51			for i in range(trigger.start, trigger.end):
52			text_span[i] = OdinHighlighter.TRIGGER(text_span[i])
53
54			# highlight tokens contained in mention span
55			for i in range(mention.start, mention.end):
56			text_span[i] = OdinHighlighter.MENTION(text_span[i])
57			mention_span = OdinHighlighter.MENTION(" ").join(text_span[mention.start:mention.end])
58			# highlight spaces in mention span
59			formatted_text = " ".join(text_span[:mention.start]) + " " + mention_span + " " + " ".join(text_span[mention.end:])
60			return formatted_text.strip()
61
62			class Mention(NLPDatum):
63			"""
64			A labeled span of text. Used to model textual mentions of events, relations, and entities.
65
66			Parameters
67			----------
68			token_interval : Interval
69			The span of the Mention represented as an Interval.
70			sentence : int
71			The sentence index that contains the Mention.
72			document : Document
73			The Document in which the Mention was found.
74			foundBy : str
75			The Odin IE rule that produced this Mention.
76			label : str
77			The label most closely associated with this span. Usually the lowest hyponym of "labels".
78			labels: list
79			The list of labels associated with this span.
80			trigger: dict or None
81			dict of JSON for Mention's trigger (event predicate or word(s) signaling the Mention).
82			arguments: dict or None
83			dict of JSON for Mention's arguments.
84			paths: dict or None
85			dict of JSON encoding the syntactic paths linking a Mention's arguments to its trigger (applies to Mentions produced from `type:"dependency"` rules).
86			doc_id: str or None
87			the id of the document
88
89			Attributes
90			----------
91			tokenInterval: processors.ds.Interval
92			An `Interval` encoding the `start` and `end` of the `Mention`.
93			start : int
94			The token index that starts the `Mention`.
95			end : int
96			The token index that marks the end of the Mention (exclusive).
97			sentenceObj : processors.ds.Sentence
98			Pointer to the `Sentence` instance containing the `Mention`.
99			characterStartOffset: int
100			The index of the character that starts the `Mention`.
101			characterEndOffset: int
102			The index of the character that ends the `Mention`.
103			type: Mention.TBM or Mention.EM or Mention.RM
104			The type of the `Mention`.
105
106			See Also
107			--------
108
109			[`Odin` manual](https://arxiv.org/abs/1509.07513)
110
111			Methods
112			-------
113			matches(label_pattern)
114			Test if the provided pattern, `label_pattern`, matches any element in `Mention.labels`.
115
116			overlaps(other)
117			Test whether other (token index or Mention) overlaps with span of this Mention.
118
119			copy(**kwargs)
120			Copy constructor for this Mention.
121
122			words()
123			Words for this Mention's span.
124
125			tags()
126			Part of speech for this Mention's span.
127
128			lemmas()
129			Lemmas for this Mention's span.
130
131			_chunks()
132			chunk labels for this Mention's span.
133
134			_entities()
135			NE labels for this Mention's span.
136			"""
137
138			TBM = "TextBoundMention"
139			EM = "EventMention"
140			RM = "RelationMention"
141
142			def __init__(self,
143			token_interval,
144			sentence,
145			document,
146			foundBy,
147			label,
148			labels=None,
149			trigger=None,
150			arguments=None,
151			paths=None,
152			keep=True,
153			doc_id=None):
154
155			NLPDatum.__init__(self)
156			self.label = label
157			self.labels = labels if labels else [self.label]
158			self.tokenInterval = token_interval
159			self.start = self.tokenInterval.start
160			self.end = self.tokenInterval.end
161			self.document = document
162			self._doc_id = doc_id or hash(self.document)
163			self.sentence = sentence
164			if trigger:
165			# NOTE: doc id is not stored for trigger's json,
166			# as it is assumed to be contained in the same document as its parent
167			trigger.update({"document": self._doc_id})
168			self.trigger = Mention.load_from_JSON(trigger, self._to_document_map())
169			else:
170			self.trigger = None
171			# unpack args
172			self.arguments = {role:[Mention.load_from_JSON(a, self._to_document_map()) for a in args] for (role, args) in arguments.items()} if arguments else None
173			self.paths = paths
174			self.keep = keep
175			self.foundBy = foundBy
176			# other
177			self.sentenceObj = self.document.sentences[self.sentence]
178			self.text = " ".join(self.sentenceObj.words[self.start:self.end])
179			# recover offsets
180			self.characterStartOffset = self.sentenceObj.startOffsets[self.tokenInterval.start]
181			self.characterEndOffset = self.sentenceObj.endOffsets[self.tokenInterval.end - 1]
182			# for later recovery
183			self.id = None
184			self.type = self._set_type()
185
186			def __str__(self):
187			return "{}: {}".format(OdinHighlighter.LABEL(self.label), OdinHighlighter.highlight_mention(self))
188
189			def __eq__(self, other):
190			if isinstance(other, self.__class__):
191			return self.__dict__ == other.__dict__
192			else:
193			return False
194
195			def __ne__(self, other):
196			return not self.__eq__(other)
197
198			def __hash__(self):
199			return hash(self.to_JSON())
200
201			def to_JSON_dict(self):
202			m = dict()
203			m["id"] = self.id
204			m["type"] = self.type
205			m["label"] = self.label
206			m["labels"] = self.labels
207			m["tokenInterval"] = self.tokenInterval.to_JSON_dict()
208			m["characterStartOffset"] = self.characterStartOffset
209			m["characterEndOffset"] = self.characterEndOffset
210			m["sentence"] = self.sentence
211			m["document"] = self._doc_id
212			# do we have a trigger?
213			if self.trigger:
214			m["trigger"] = self.trigger.to_JSON_dict()
215			# do we have arguments?
216			if self.arguments:
217			m["arguments"] = self._arguments_to_JSON_dict()
218			# handle paths
219			if self.paths:
220			m["paths"] = self.paths
221			m["keep"] = self.keep
222			m["foundBy"] = self.foundBy
223			return m
224
225			def startOffset(self):
226			return self.sentenceObj.endOffsets[self.start]
227
228			def endOffset(self):
229			return self.sentenceObj.endOffsets[self.end -1]
230
231			def words(self):
232			return self.sentenceObj.words[self.start:self.end]
233
234			def tags(self):
235			return self.sentenceObj.tags[self.start:self.end]
236
237			def lemmas(self):
238			return self.sentenceObj.lemmas[self.start:self.end]
239
240			def _chunks(self):
241			return self.sentenceObj._chunks[self.start:self.end]
242
243			def _entities(self):
244			return self.sentenceObj._entities[self.start:self.end]
245
246			def copy(self, **kwargs):
247			"""
248			Copy constructor for mention
249			"""
250			# return new instance
251			return self.__class__(
252			label=kwargs.get("label", self.label),
253			labels=kwargs.get("label", self.labels),
254			token_interval=kwargs.get("token_interval", self.tokenInterval),
255			sentence=kwargs.get("sentence", self.sentence), # NOTE: this is the sentence idx
256			document=kwargs.get("document", self.document),
257			foundBy=kwargs.get("foundBy", self.foundBy),
258			trigger=kwargs.get("trigger", self.trigger),
259			arguments=kwargs.get("arguments", self.arguments),
260			paths=kwargs.get("paths", self.paths),
261			keep=kwargs.get("keep", self.keep),
262			doc_id=kwargs.get("doc_id", self._doc_id)
263			)
264
265			def overlaps(self, other):
266			"""
267			Checks for overlap.
268			"""
269			if isinstance(other, int):
270			return self.start <= other < self.end
271			elif isinstance(other, Mention):
272			# equiv. sentences + checks on start and end
273			return (self.sentence.__hash__() == other.sentence.__hash__()) and \
274			((other.start <= self.start < other.end) or (self.start <= other.start < self.end))
275			else:
276			return False
277
278			def matches(self, label_pattern):
279			"""
280			Test if the provided pattern, `label_pattern`, matches any element in `Mention.labels`.
281
282			Parameters
283			----------
284			label_pattern : str or _sre.SRE_Pattern
285			The pattern to match against each element in `Mention.labels`
286
287			Returns
288			-------
289			bool
290			True if `label_pattern` matches any element in `Mention.labels`
291			"""
292			return any(re.match(label_pattern, label) for label in self.labels)
293
294			def _arguments_to_JSON_dict(self):
295			return dict((role, [a.to_JSON_dict() for a in args]) for (role, args) in self.arguments.items())
296
297			def _paths_to_JSON_dict(self):
298			return {role: paths.to_JSON_dict() for (role, paths) in self.paths}
299
300			@staticmethod
301			def load_from_JSON(mjson, docs_dict):
302			# recover document
303			doc_id = mjson["document"]
304			doc = docs_dict[doc_id]
305			labels = mjson["labels"]
306			kwargs = {
307			"label": mjson.get("label", labels[0]),
308			"labels": labels,
309			"token_interval": Interval.load_from_JSON(mjson["tokenInterval"]),
310			"sentence": mjson["sentence"],
311			"document": doc,
312			"doc_id": doc_id,
313			"trigger": mjson.get("trigger", None),
314			"arguments": mjson.get("arguments", None),
315			"paths": mjson.get("paths", None),
316			"keep": mjson.get("keep", True),
317			"foundBy": mjson["foundBy"]
318			}
319			m = Mention(**kwargs)
320			# set IDs
321			m.id = mjson["id"]
322			m._doc_id = doc_id
323			# set character offsets
324			m.character_start_offset = mjson["characterStartOffset"]
325			m.character_end_offset = mjson["characterEndOffset"]
326			return m
327
328			def _to_document_map(self):
329			return {self._doc_id: self.document}
330
331			def _set_type(self):
332			# event mention
333			if self.trigger != None:
334			return Mention.EM
335			# textbound mention
336			elif self.trigger == None and self.arguments == None:
337			return Mention.TBM
338			else:
339			return Mention.RM
340

myedibleenso / py-processors

Push — master ( 4c1319...a3584a )

Mention.words() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like