processors.odin   B

Complexity

Total Complexity 48

Size/Duplication

Total Lines 339
Duplicated Lines 0%

Importance

Changes 0
Metric   Value
wmc      48
eloc     183
dl       0
loc      339
rs       8.5599
c        0
b        0
f        0

27 Methods

Rating   Name   Duplication   Size   Complexity  
A OdinHighlighter.CONCEAL() 0 3 1
A Mention.startOffset() 0 2 1
A Mention.matches() 0 15 1
A Mention.tags() 0 2 1
A Mention._to_document_map() 0 2 1
A OdinHighlighter.ARG() 0 3 1
A OdinHighlighter.TRIGGER() 0 3 1
A Mention.__hash__() 0 2 1
A OdinHighlighter.MENTION() 0 3 1
A Mention.__eq__() 0 5 2
A Mention.__ne__() 0 2 1
C OdinHighlighter.highlight_mention() 0 28 10
A Mention._entities() 0 2 1
A Mention._arguments_to_JSON_dict() 0 2 1
A Mention.endOffset() 0 2 1
A Mention.__init__() 0 43 4
A Mention._chunks() 0 2 1
A Mention.load_from_JSON() 0 27 1
A Mention.overlaps() 0 12 3
A Mention.lemmas() 0 2 1
A Mention._paths_to_JSON_dict() 0 2 1
A Mention.to_JSON_dict() 0 23 4
A Mention._set_type() 0 9 4
A Mention.words() 0 2 1
A Mention.copy() 0 17 1
A OdinHighlighter.LABEL() 0 3 1
A Mention.__str__() 0 2 1

How to fix: Complexity

Complex classes like processors.odin often do a lot of different things. To break such a class down, we need to identify a cohesive component within the class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
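As an illustration of Extract Class (the names below are hypothetical and not taken from processors.odin), a cohesive group of methods sharing a `format_` prefix is moved into its own class, which the original class then holds as a field and delegates to:

```python
# Hypothetical sketch of the Extract Class refactoring.
# A "Report" class that mixed data with formatting concerns has had its
# prefix-sharing "format_*" methods extracted into a dedicated class.

class ReportFormatter(object):
    """Extracted class: owns the formatting-related behavior."""
    def format_title(self, title):
        return title.upper()

    def format_row(self, row):
        return " | ".join(str(cell) for cell in row)


class Report(object):
    def __init__(self, title, rows):
        self.title = title
        self.rows = rows
        # the extracted component is held as a field (delegation)
        self.formatter = ReportFormatter()

    def render(self):
        lines = [self.formatter.format_title(self.title)]
        lines.extend(self.formatter.format_row(r) for r in self.rows)
        return "\n".join(lines)
```

Each extracted class carries its own (smaller) complexity, and the original class shrinks to coordination logic.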

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from .utils import post_json
from .ds import Document, Interval, NLPDatum
from termcolor import colored
import re
import json


class OdinHighlighter(object):

    @staticmethod
    def LABEL(token):
        return colored(token, color="red", attrs=["bold"])

    @staticmethod
    def ARG(token):
        return colored(token, on_color="on_green", attrs=["bold"])

    @staticmethod
    def TRIGGER(token):
        return colored(token, on_color="on_blue", attrs=["bold"])

    @staticmethod
    def CONCEAL(token):
        return colored(token, on_color="on_grey", attrs=["concealed"])

    @staticmethod
    def MENTION(token):
        return colored(token, on_color="on_yellow")

    @staticmethod
    def highlight_mention(mention):
        """
        Formats text of mention
        """
        text_span = mention.sentenceObj.words[:]
        # format TBM span like an arg
        if mention.type == "TextBoundMention":
            for i in range(mention.start, mention.end):
                text_span[i] = OdinHighlighter.ARG(text_span[i])
        if mention.arguments:
            for (role, args) in mention.arguments.items():
                for arg in args:
                    for i in range(arg.start, arg.end):
                        text_span[i] = OdinHighlighter.ARG(text_span[i])
        # format trigger distinctly from args
        if mention.trigger:
            trigger = mention.trigger
            for i in range(trigger.start, trigger.end):
                text_span[i] = OdinHighlighter.TRIGGER(text_span[i])

        # highlight tokens contained in mention span
        for i in range(mention.start, mention.end):
            text_span[i] = OdinHighlighter.MENTION(text_span[i])
        # highlight spaces in mention span
        mention_span = OdinHighlighter.MENTION(" ").join(text_span[mention.start:mention.end])
        formatted_text = " ".join(text_span[:mention.start]) + " " + mention_span + " " + " ".join(text_span[mention.end:])
        return formatted_text.strip()


class Mention(NLPDatum):
    """
    A labeled span of text.  Used to model textual mentions of events, relations, and entities.

    Parameters
    ----------
    token_interval : Interval
        The span of the Mention represented as an Interval.
    sentence : int
        The sentence index that contains the Mention.
    document : Document
        The Document in which the Mention was found.
    foundBy : str
        The Odin IE rule that produced this Mention.
    label : str
        The label most closely associated with this span.  Usually the lowest hyponym of "labels".
    labels : list
        The list of labels associated with this span.
    trigger : dict or None
        dict of JSON for the Mention's trigger (event predicate or word(s) signaling the Mention).
    arguments : dict or None
        dict of JSON for the Mention's arguments.
    paths : dict or None
        dict of JSON encoding the syntactic paths linking a Mention's arguments to its trigger (applies to Mentions produced from `type:"dependency"` rules).
    doc_id : str or None
        the id of the document

    Attributes
    ----------
    tokenInterval : processors.ds.Interval
        An `Interval` encoding the `start` and `end` of the `Mention`.
    start : int
        The token index that starts the `Mention`.
    end : int
        The token index that marks the end of the `Mention` (exclusive).
    sentenceObj : processors.ds.Sentence
        Pointer to the `Sentence` instance containing the `Mention`.
    characterStartOffset : int
        The index of the character that starts the `Mention`.
    characterEndOffset : int
        The index of the character that ends the `Mention`.
    type : Mention.TBM or Mention.EM or Mention.RM
        The type of the `Mention`.

    See Also
    --------
    [`Odin` manual](https://arxiv.org/abs/1509.07513)

    Methods
    -------
    matches(label_pattern)
        Test if the provided pattern, `label_pattern`, matches any element in `Mention.labels`.

    overlaps(other)
        Test whether `other` (token index or Mention) overlaps with the span of this Mention.

    copy(**kwargs)
        Copy constructor for this Mention.

    words()
        Words for this Mention's span.

    tags()
        Part of speech tags for this Mention's span.

    lemmas()
        Lemmas for this Mention's span.

    _chunks()
        Chunk labels for this Mention's span.

    _entities()
        NE labels for this Mention's span.
    """

    TBM = "TextBoundMention"
    EM = "EventMention"
    RM = "RelationMention"

    def __init__(self,
                token_interval,
                sentence,
                document,
                foundBy,
                label,
                labels=None,
                trigger=None,
                arguments=None,
                paths=None,
                keep=True,
                doc_id=None):

        NLPDatum.__init__(self)
        self.label = label
        self.labels = labels if labels else [self.label]
        self.tokenInterval = token_interval
        self.start = self.tokenInterval.start
        self.end = self.tokenInterval.end
        self.document = document
        self._doc_id = doc_id or hash(self.document)
        self.sentence = sentence
        if trigger:
            # NOTE: doc id is not stored in the trigger's json,
            # as the trigger is assumed to be contained in the same document as its parent
            trigger.update({"document": self._doc_id})
            self.trigger = Mention.load_from_JSON(trigger, self._to_document_map())
        else:
            self.trigger = None
        # unpack args
        self.arguments = {role: [Mention.load_from_JSON(a, self._to_document_map()) for a in args] for (role, args) in arguments.items()} if arguments else None
        self.paths = paths
        self.keep = keep
        self.foundBy = foundBy
        # other
        self.sentenceObj = self.document.sentences[self.sentence]
        self.text = " ".join(self.sentenceObj.words[self.start:self.end])
        # recover character offsets
        self.characterStartOffset = self.sentenceObj.startOffsets[self.tokenInterval.start]
        self.characterEndOffset = self.sentenceObj.endOffsets[self.tokenInterval.end - 1]
        # for later recovery
        self.id = None
        self.type = self._set_type()

    def __str__(self):
        return "{}: {}".format(OdinHighlighter.LABEL(self.label), OdinHighlighter.highlight_mention(self))

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.__dict__ == other.__dict__
        else:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        return hash(self.to_JSON())

    def to_JSON_dict(self):
        m = dict()
        m["id"] = self.id
        m["type"] = self.type
        m["label"] = self.label
        m["labels"] = self.labels
        m["tokenInterval"] = self.tokenInterval.to_JSON_dict()
        m["characterStartOffset"] = self.characterStartOffset
        m["characterEndOffset"] = self.characterEndOffset
        m["sentence"] = self.sentence
        m["document"] = self._doc_id
        # do we have a trigger?
        if self.trigger:
            m["trigger"] = self.trigger.to_JSON_dict()
        # do we have arguments?
        if self.arguments:
            m["arguments"] = self._arguments_to_JSON_dict()
        # handle paths
        if self.paths:
            m["paths"] = self.paths
        m["keep"] = self.keep
        m["foundBy"] = self.foundBy
        return m

    def startOffset(self):
        return self.sentenceObj.startOffsets[self.start]

    def endOffset(self):
        return self.sentenceObj.endOffsets[self.end - 1]

    def words(self):
        return self.sentenceObj.words[self.start:self.end]

    def tags(self):
        return self.sentenceObj.tags[self.start:self.end]

    def lemmas(self):
        return self.sentenceObj.lemmas[self.start:self.end]

    def _chunks(self):
        return self.sentenceObj._chunks[self.start:self.end]

    def _entities(self):
        return self.sentenceObj._entities[self.start:self.end]

    def copy(self, **kwargs):
        """
        Copy constructor for this Mention.
        """
        # return a new instance
        return self.__class__(
            label=kwargs.get("label", self.label),
            labels=kwargs.get("labels", self.labels),
            token_interval=kwargs.get("token_interval", self.tokenInterval),
            sentence=kwargs.get("sentence", self.sentence), # NOTE: this is the sentence idx
            document=kwargs.get("document", self.document),
            foundBy=kwargs.get("foundBy", self.foundBy),
            trigger=kwargs.get("trigger", self.trigger),
            arguments=kwargs.get("arguments", self.arguments),
            paths=kwargs.get("paths", self.paths),
            keep=kwargs.get("keep", self.keep),
            doc_id=kwargs.get("doc_id", self._doc_id)
        )

    def overlaps(self, other):
        """
        Test whether `other` (a token index or Mention) overlaps with the span of this Mention.
        """
        if isinstance(other, int):
            return self.start <= other < self.end
        elif isinstance(other, Mention):
            # same sentence + overlapping token intervals
            return (self.sentence.__hash__() == other.sentence.__hash__()) and \
                self.tokenInterval.overlaps(other.tokenInterval)
        else:
            return False

    def matches(self, label_pattern):
        """
        Test if the provided pattern, `label_pattern`, matches any element in `Mention.labels`.

        Parameters
        ----------
        label_pattern : str or _sre.SRE_Pattern
            The pattern to match against each element in `Mention.labels`

        Returns
        -------
        bool
            True if `label_pattern` matches any element in `Mention.labels`
        """
        return any(re.match(label_pattern, label) for label in self.labels)

    def _arguments_to_JSON_dict(self):
        return dict((role, [a.to_JSON_dict() for a in args]) for (role, args) in self.arguments.items())

    def _paths_to_JSON_dict(self):
        return {role: paths.to_JSON_dict() for (role, paths) in self.paths.items()}

    @staticmethod
    def load_from_JSON(mjson, docs_dict):
        # recover the document
        doc_id = mjson["document"]
        doc = docs_dict[doc_id]
        labels = mjson["labels"]
        kwargs = {
            "label": mjson.get("label", labels[0]),
            "labels": labels,
            "token_interval": Interval.load_from_JSON(mjson["tokenInterval"]),
            "sentence": mjson["sentence"],
            "document": doc,
            "doc_id": doc_id,
            "trigger": mjson.get("trigger", None),
            "arguments": mjson.get("arguments", None),
            "paths": mjson.get("paths", None),
            "keep": mjson.get("keep", True),
            "foundBy": mjson["foundBy"]
        }
        m = Mention(**kwargs)
        # set IDs
        m.id = mjson["id"]
        m._doc_id = doc_id
        # set character offsets
        m.character_start_offset = mjson["characterStartOffset"]
        m.character_end_offset = mjson["characterEndOffset"]
        return m

    def _to_document_map(self):
        return {self._doc_id: self.document}

    def _set_type(self):
        # event mention
        if self.trigger is not None:
            return Mention.EM
        # textbound mention
        elif self.trigger is None and self.arguments is None:
            return Mention.TBM
        else:
            return Mention.RM
339