Dependencies - Code Metrics - Inspection of "Version 3.0.0" - myedibleenso/py-processors - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 352884...922d0d )

by Gus

created 2017-03-26 10:12 UTC

Dependencies A

↳ Parent: Sentence

Complexity

Total Complexity

Size/Duplication

Total Lines	69
Duplicated Lines	0 %

Importance

Changes	5
Bugs	2	Features	0

Metric	Value
dl	0
loc	69
rs	10
c	5
b	2
f	0
wmc	20

2 Methods

Rating	Name	Duplication	Size	Complexity
B	Sentence._handle_iob()	0	47	6
A	Sentence._set_toks()	0	2	2

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Gus Hahn-Powell 2015-2016
# data structures for storing processors-server output
# based on conventions from the CLU lab's processors library (https://github.com/clulab/processors)
from __future__ import unicode_literals
from itertools import chain
from collections import defaultdict
from processors.paths import DependencyUtils
from processors.utils import LabelManager
#from six import text_type
import json
import re


class Document(object):

    """
    Storage class for annotated text. Based on [`org.clulab.processors.Document`](https://github.com/clulab/processors/blob/master/main/src/main/scala/org/clulab/processors/Document.scala)

    Parameters
    ----------
    sentences : [processors.ds.Sentence]
        The sentences comprising the `Document`.

    Attributes
    ----------
    id : str or None
        A unique ID for the `Document`.
    size : int
        The number of `sentences`.
    sentences : sentences
        The sentences comprising the `Document`.
    words : [str]
        A list of the `Document`'s tokens.
    tags : [str]
        A list of the `Document`'s tokens represented using part of speech (PoS) tags.
    lemmas : [str]
        A list of the `Document`'s tokens represented using lemmas.
    _entities : [str]
        A list of the `Document`'s tokens represented using IOB-style named entity (NE) labels.
    nes : dict
        A dictionary of NE labels represented in the `Document` -> a list of corresponding text spans.
    bag_of_labeled_deps : [str]
        The labeled dependencies from all sentences in the `Document`.
    bag_of_unlabeled_deps : [str]
        The unlabeled dependencies from all sentences in the `Document`.
    text : str or None
        The original text of the `Document`.

    Methods
    -------
    bag_of_labeled_dependencies_using(form)
        Produces a list of syntactic dependencies where each edge is labeled with its grammatical relation.
    bag_of_unlabeled_dependencies_using(form)
        Produces a list of syntactic dependencies where each edge is left unlabeled without its grammatical relation.
    to_JSON_dict()
        Produces a JSON-serializable dictionary describing the `Document`.
    to_JSON(pretty=True)
        Produces the JSON string for the `Document`.
    load_from_JSON(json_dict)
        Builds a `Document` from a dictionary shaped like the output of `to_JSON_dict`.
    """

    def __init__(self, sentences):
        self.id = None
        self.size = len(sentences)
        self.sentences = sentences
        # easily access token attributes from all sentences
        self.words = list(chain.from_iterable(s.words for s in self.sentences))
        self.tags = list(chain.from_iterable(s.tags for s in self.sentences))
        self.lemmas = list(chain.from_iterable(s.lemmas for s in self.sentences))
        self._entities = list(chain.from_iterable(s._entities for s in self.sentences))
        self.nes = self._merge_ne_dicts()
        # a sentence without a usable dependency graph simply contributes nothing
        self.bag_of_labeled_deps = list(chain.from_iterable(
            s.dependencies.labeled for s in self.sentences if s.dependencies is not None
        ))
        self.bag_of_unlabeled_deps = list(chain.from_iterable(
            s.dependencies.unlabeled for s in self.sentences if s.dependencies is not None
        ))
        self.text = None

    def __hash__(self):
        # hashing mirrors __eq__: both are defined over the JSON serialization
        return hash(self.to_JSON())

    def __unicode__(self):
        return self.text

    def __str__(self):
        return "Document w/ {} Sentence{}".format(self.size, "" if self.size == 1 else "s")

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.to_JSON() == other.to_JSON()
        else:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def bag_of_labeled_dependencies_using(self, form):
        """
        Collects the labeled dependencies of every sentence, rendering each token
        with the requested `form` (resolved per sentence by `Sentence._get_tokens`).
        """
        return list(chain.from_iterable(
            s.labeled_dependencies_using(s._get_tokens(form)) for s in self.sentences
        ))

    def bag_of_unlabeled_dependencies_using(self, form):
        """
        Collects the unlabeled dependencies of every sentence, rendering each token
        with the requested `form` (resolved per sentence by `Sentence._get_tokens`).
        """
        return list(chain.from_iterable(
            s.unlabeled_dependencies_using(s._get_tokens(form)) for s in self.sentences
        ))

    def _merge_ne_dicts(self):
        """
        Consolidates the per-sentence `nes` dictionaries into a single
        {label -> [span 1, ..., span n]} dictionary, keeping sentence order.

        Returns an empty dictionary when no sentence has any label.
        """
        nes_dict = dict()
        for s in self.sentences:
            # iterate over what each sentence actually holds rather than indexing
            # by label, so a sentence's defaultdict is never grown as a side effect
            for (label, spans) in s.nes.items():
                nes_dict.setdefault(label, []).extend(spans)
        return nes_dict

    def to_JSON_dict(self):
        """
        Returns a JSON-serializable dictionary with the keys "sentences" and "text",
        plus "id" when an ID has been set.
        """
        doc_dict = dict()
        doc_dict["sentences"] = [s.to_JSON_dict() for s in self.sentences]
        doc_dict["text"] = self.text
        # the ID is optional and only serialized when it has been set
        if self.id is not None:
            doc_dict["id"] = self.id
        return doc_dict

    def to_JSON(self, pretty=True):
        """
        Returns JSON as String.

        Parameters
        ----------
        pretty : bool
            Indent the output by 4 spaces when True; otherwise emit the compact single-line form.
        """
        # indent=None is the compact form; indent=0 would still insert newlines
        num_spaces = 4 if pretty else None
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=num_spaces)

    @staticmethod
    def load_from_JSON(json_dict):
        """
        Builds a `Document` from a dictionary with a required "sentences" list and
        optional "text" and "id" entries.

        Each sentence entry must provide "words", "startOffsets" and "endOffsets";
        "tags", "lemmas", "chunks", "entities" and "graphs" are optional.
        """
        sentences = []
        for s in json_dict["sentences"]:
            kwargs = {
                "words": s["words"],
                "startOffsets": s["startOffsets"],
                "endOffsets": s["endOffsets"],
                "tags": s.get("tags", None),
                "lemmas": s.get("lemmas", None),
                "chunks": s.get("chunks", None),
                "entities": s.get("entities", None),
                "graphs": s.get("graphs", None)
            }
            sentences.append(Sentence(**kwargs))
        doc = Document(sentences)
        # set id and text; both live on the document-level dictionary
        doc.text = json_dict.get("text", None)
        doc.id = json_dict.get("id", None)
        return doc


class Sentence(object):

    """
    Storage class for an annotated sentence. Based on [`org.clulab.processors.Sentence`](https://github.com/clulab/processors/blob/master/main/src/main/scala/org/clulab/processors/Sentence.scala)

    Parameters
    ----------
    text : str or None
        The text of the `Sentence`.
    words : [str]
        A list of the `Sentence`'s tokens.
    startOffsets : [int]
        The character offsets starting each token (inclusive).
    endOffsets : [int]
        The character offsets marking the end of each token (exclusive).
    tags : [str]
        A list of the `Sentence`'s tokens represented using part of speech (PoS) tags.
    lemmas : [str]
        A list of the `Sentence`'s tokens represented using lemmas.
    chunks : [str]
        A list of the `Sentence`'s tokens represented using IOB-style phrase labels (ex. `B-NP`, `I-NP`, `B-VP`, etc.).
    entities : [str]
        A list of the `Sentence`'s tokens represented using IOB-style named entity (NE) labels.
    graphs : dict
        A dictionary of {graph-name -> {edges: [{source, destination, relation}], roots: [int]}}

    Attributes
    ----------
    text : str
        The text of the `Sentence`.  Falls back to the space-joined `words` when no text is given.
    words : [str]
        A list of the `Sentence`'s tokens.
    tags : [str]
        The PoS tag of each token (`Sentence.UNKNOWN` for every token when not provided).
    lemmas : [str]
        The lemma of each token (`Sentence.UNKNOWN` for every token when not provided).
    startOffsets : [int]
        The character offsets starting each token (inclusive).
    endOffsets : [int]
        The character offsets marking the end of each token (exclusive).
    length : int
        The number of tokens in the `Sentence`

    graphs : dict
        A dictionary (str -> `processors.ds.DirectedGraph`) mapping the graph type/name to a `processors.ds.DirectedGraph`.  Empty when no graphs were provided.
    basic_dependencies : processors.ds.DirectedGraph or None
        A `processors.ds.DirectedGraph` using basic Stanford dependencies.
    collapsed_dependencies : processors.ds.DirectedGraph or None
        A `processors.ds.DirectedGraph` using collapsed Stanford dependencies.
    dependencies : processors.ds.DirectedGraph or None
        A pointer to the prefered syntactic dependency graph type for this `Sentence` (collapsed when available, otherwise basic).
    _entities : [str]
        The IOB-style Named Entity (NE) labels corresponding to each token.
    _chunks : [str]
        The IOB-style chunk labels corresponding to each token.
    nes : dict
        A dictionary of NE labels represented in the `Sentence` -> a list of corresponding text spans (ex. {"PERSON": [phrase 1, ..., phrase n]}). Built from `Sentence._entities`
    phrases : dict
        A dictionary of chunk labels represented in the `Sentence` -> a list of corresponding text spans (ex. {"NP": [phrase 1, ..., phrase n]}). Built from `Sentence._chunks`


    Methods
    -------
    labeled_dependencies_using(tokens)
        Produces a list of syntactic dependencies where each edge is labeled with its grammatical relation.
    unlabeled_dependencies_using(tokens)
        Produces a list of syntactic dependencies where each edge is left unlabeled without its grammatical relation.
    to_string()
        Produces a `word__tag` rendering of the `Sentence`.
    to_JSON_dict()
        Produces a JSON-serializable dictionary describing the `Sentence`.
    to_JSON()
        Produces the JSON string for the `Sentence`.
    load_from_JSON(json_dict)
        Builds a `Sentence` from a dictionary.
    """

    UNKNOWN = LabelManager.UNKNOWN
    # the O in IOB notation
    O = LabelManager.O
    # a leading B-/I- marker; anchored so that a label which merely contains
    # "B-" or "I-" somewhere inside it is left intact
    _IOB_PREFIX = re.compile('^(?:B-|I-)')

    def __init__(self, **kwargs):
        self.words = kwargs["words"]
        self.startOffsets = kwargs["startOffsets"]
        self.endOffsets = kwargs["endOffsets"]
        self.length = len(self.words)
        self.tags = self._set_toks(kwargs.get("tags", None))
        self.lemmas = self._set_toks(kwargs.get("lemmas", None))
        self._chunks = self._set_toks(kwargs.get("chunks", None))
        self._entities = self._set_toks(kwargs.get("entities", None))
        self.text = kwargs.get("text", None) or " ".join(self.words)
        self.graphs = self._build_directed_graph_from_dict(kwargs.get("graphs", None))
        self.basic_dependencies = self.graphs.get(DirectedGraph.STANFORD_BASIC_DEPENDENCIES, None)
        self.collapsed_dependencies = self.graphs.get(DirectedGraph.STANFORD_COLLAPSED_DEPENDENCIES, None)
        # prefer the collapsed graph and fall back to the basic one
        self.dependencies = self.collapsed_dependencies if self.collapsed_dependencies is not None else self.basic_dependencies
        # IOB tokens -> {label: [phrase 1, ..., phrase n]}
        self.nes = self._handle_iob(self._entities)
        self.phrases = self._handle_iob(self._chunks)

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.to_JSON() == other.to_JSON()
        else:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def _get_tokens(self, form):
        """
        Returns the per-token representation named by `form` (case-insensitive):
        "words", "tags", "lemmas", "entities", "chunks" or "index".

        Raises
        ------
        ValueError
            If `form` is not one of the supported names.
        """
        f = form.lower()
        if f == "words":
            return self.words
        elif f == "tags":
            return self.tags
        elif f == "lemmas":
            return self.lemmas
        elif f == "entities":
            # one label per token: callers index the result by token position,
            # so the label -> spans dictionary (`nes`) is not usable here
            return self._entities
        elif f == "chunks":
            return self._chunks
        elif f == "index":
            return list(range(self.length))
        raise ValueError('form must be "words", "tags", "lemmas", "entities", "chunks", or "index" (got "{}")'.format(form))

    def _set_toks(self, toks):
        """
        Returns `toks`, or one `Sentence.UNKNOWN` placeholder per token when `toks` is missing or empty.
        """
        return toks if toks else [Sentence.UNKNOWN]*self.length

    def _handle_iob(self, iob):
        """
        Consolidates consecutive tokens in IOB notation under the appropriate label.
        A leading `B-` or `I-` is stripped from each label to control for the
        bionlp annotator, which uses IOB notation.

        Consecutive tokens sharing a (stripped) label form one span, including a
        span that runs up to the last token.  `Sentence.O` and the
        `Sentence.UNKNOWN` placeholder never open a span, so a sentence without
        annotations yields an empty dictionary.

        Returns
        -------
        collections.defaultdict
            {label -> [span text 1, ..., span text n]}; this might be empty.
        """
        entity_dict = defaultdict(list)
        non_entity = (Sentence.O, Sentence.UNKNOWN)
        # label and first token index of the span currently being read (None = no open span)
        current = None
        start = None
        last = -1
        for i, tok in enumerate(iob):
            last = i
            label = Sentence._IOB_PREFIX.sub('', str(tok))
            if tok in non_entity or label in non_entity:
                label = None
            # our old sequence continues (or we are still outside of any span)
            if label == current:
                continue
            # our old sequence has ended: store it before moving on
            if current is not None:
                entity_dict[current].append(' '.join(self.words[start:i]))
            current = label
            start = i
        # a span that reaches the final token is closed by the end of the sentence
        if current is not None:
            entity_dict[current].append(' '.join(self.words[start:last + 1]))
        return entity_dict

    def _build_directed_graph_from_dict(self, graphs):
        """
        Converts {graph-name -> graph description} into {graph-name -> `DirectedGraph`}.

        Always returns a dictionary (empty when `graphs` is None or empty) so the
        result can be queried and iterated without a None check.
        """
        deps_dict = dict()
        # process each stored graph
        for (kind, deps) in (graphs or {}).items():
            deps_dict[kind] = DirectedGraph(kind, deps, self.words)
        return deps_dict

    def __unicode__(self):
        return self.text

    def to_string(self):
        """
        Returns the sentence rendered as space-separated `word__tag` pairs.
        """
        return ' '.join("{w}__{p}".format(w=w, p=p) for (w, p) in zip(self.words, self.tags))

    def labeled_dependencies_using(self, tokens):
        """
        Generates a list of labeled dependencies for a sentence
        using the provided tokens.

        Each edge is rendered as "head_RELATION_dependent", with the relation
        upper-cased and both ends looked up in `tokens` by token index.
        Returns an empty list when the sentence has no dependency graph.
        """
        deps = self.dependencies
        if deps is None:
            return []
        labeled = []
        for out in deps.outgoing:
            for (dest, rel) in deps.outgoing[out]:
                labeled.append("{}_{}_{}".format(tokens[out], rel.upper(), tokens[dest]))
        return labeled

    def unlabeled_dependencies_using(self, tokens):
        """
        Generate a list of unlabeled dependencies for a sentence
        using the provided tokens.

        Each edge is rendered as "head_dependent".  Returns an empty list when
        the sentence has no dependency graph.
        """
        deps = self.dependencies
        if deps is None:
            return []
        unlabeled = []
        # built straight from the graph: splitting the labeled form on "_" breaks
        # as soon as a token or a relation (ex. "prep_of") contains an underscore
        for out in deps.outgoing:
            for (dest, _) in deps.outgoing[out]:
                unlabeled.append("{}_{}".format(tokens[out], tokens[dest]))
        return unlabeled

    def to_JSON_dict(self):
        """
        Returns a JSON-serializable dictionary with the keys "words", "startOffsets",
        "endOffsets", "tags", "lemmas", "entities" and "graphs".

        Chunk labels and the sentence text are not part of this dictionary.
        """
        sentence_dict = dict()
        sentence_dict["words"] = self.words
        sentence_dict["startOffsets"] = self.startOffsets
        sentence_dict["endOffsets"] = self.endOffsets
        sentence_dict["tags"] = self.tags
        sentence_dict["lemmas"] = self.lemmas
        sentence_dict["entities"] = self._entities
        # add graphs
        sentence_dict["graphs"] = dict()
        for (kind, graph) in self.graphs.items():
            sentence_dict["graphs"][kind] = graph._graph_to_JSON_dict()
        return sentence_dict

    def to_JSON(self):
        """
        Returns JSON as String (sorted keys, 4-space indent).
        """
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=4)

    @staticmethod
    def load_from_JSON(json_dict):
        """
        Builds a `Sentence` from a dictionary.  "words", "startOffsets" and
        "endOffsets" are required; every other annotation is optional.
        """
        sent = Sentence(
                    words=json_dict["words"],
                    startOffsets=json_dict["startOffsets"],
                    endOffsets=json_dict["endOffsets"],
                    lemmas=json_dict.get("lemmas", None),
                    tags=json_dict.get("tags", None),
                    chunks=json_dict.get("chunks", None),
                    entities=json_dict.get("entities", None),
                    text=json_dict.get("text", None),
                    graphs=json_dict.get("graphs", None)
                    )
        return sent


class Edge(object):

    """
    Storage class for a single labeled edge of a directed graph.

    Parameters
    ----------
    source : int
        The index of the node the edge leaves from.
    destination : int
        The index of the node the edge points to.
    relation : str
        The label of the edge.

    Attributes
    ----------
    source, destination, relation
        The constructor arguments, stored as given.
    """

    def __init__(self, source, destination, relation):
        self.source = source
        self.destination = destination
        self.relation = relation

    def __unicode__(self):
        return self.to_string()

    def to_string(self):
        return "Edge(source: {}, destination: {}, relation: {})".format(self.source, self.destination, self.relation)

    def __eq__(self, other):
        # two edges are equal when their JSON serializations match
        if isinstance(other, self.__class__):
            return self.to_JSON() == other.to_JSON()
        else:
            return False

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so spell it out
        return not self.__eq__(other)

    def to_JSON_dict(self):
        """
        Returns a dictionary with the keys "source", "destination" and "relation".
        """
        edge_dict = dict()
        edge_dict["source"] = self.source
        edge_dict["destination"] = self.destination
        edge_dict["relation"] = self.relation
        return edge_dict

    def to_JSON(self):
        """
        Returns JSON as String (sorted keys, 4-space indent).
        """
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=4)


class DirectedGraph(object):

    """
    Storage class for directed graphs.


    Parameters
    ----------
    kind : str
        The name of the directed graph.
    deps : dict
        A dictionary of {edges: [{source, destination, relation}], roots: [int]}
    words : [str]
        A list of the word form of the tokens from the originating `Sentence`.

    Attributes
    ----------
    _words : [str]
        A lower-cased list of the word form of the tokens from the originating `Sentence`.
    kind : str
        The name of the directed graph.
    roots : [int]
        A list of indices for the syntactic dependency graph's roots.  Generally this is a single token index.
    edges: list[processors.ds.Edge]
        A list of `processors.ds.Edge`
    incoming : A dictionary of {int -> [(int, str)]} encoding the incoming edges for each node in the graph as (source index, relation) pairs.
    outgoing : A dictionary of {int -> [(int, str)]} encoding the outgoing edges for each node in the graph as (destination index, relation) pairs.
    labeled : [str]
        A list of strings where each element in the list represents an edge encoded as source word, upper-cased relation, and destination word ("source_RELATION_destination").
    unlabeled : [str]
        A list of strings where each element in the list represents an edge encoded as source word and destination word ("source_destination").
    graph : networkx.Graph
        A `networkx.graph` representation of the `DirectedGraph`.  Used by `shortest_path`

    Methods
    -------
    shortest_path(start, end)
        Finds the shortest path between two nodes of the graph.
    pagerank(...)
        Scores the nodes of the graph with the pagerank algorithm.
    to_JSON_dict()
        Produces a {kind -> {edges, roots}} dictionary.
    to_JSON()
        Produces the JSON string for the graph.
    """
    STANFORD_BASIC_DEPENDENCIES = "stanford-basic"
    STANFORD_COLLAPSED_DEPENDENCIES = "stanford-collapsed"

    def __init__(self, kind, deps, words):
        self._words = [w.lower() for w in words]
        self.kind = kind
        self.roots = deps.get("roots", [])
        self.edges = [Edge(e["source"], e["destination"], e["relation"]) for e in deps["edges"]]
        self.incoming = self._build_incoming(self.edges)
        self.outgoing = self._build_outgoing(self.edges)
        # the string forms are derived from `outgoing`, so they come after it
        self.labeled = self._build_labeled()
        self.unlabeled = self._build_unlabeled()
        self.graph = DependencyUtils.build_networkx_graph(roots=self.roots, edges=self.edges, name=self.kind)

    def __unicode__(self):
        # a text conversion has to produce text: render one edge per line
        return "\n".join(e.to_string() for e in self.edges)

    def __eq__(self, other):
        # two graphs are equal when their JSON serializations match
        if isinstance(other, self.__class__):
            return self.to_JSON() == other.to_JSON()
        else:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def shortest_path(self, start, end):
        """
        Find the shortest path in the syntactic depedency graph
        between the provided start and end nodes.

        Returns None when `DependencyUtils.shortest_path` yields nothing.

        See Also
        --------
        `processors.paths.DependencyUtils.shortest_path`
        """
        res = DependencyUtils.shortest_path(self.graph, start, end)
        return DependencyUtils.retrieve_edges(self, res) if res else None

    def pagerank(self,
                 alpha=0.85,
                 personalization=None,
                 max_iter=1000,
                 tol=1e-06,
                 nstart=None,
                 weight='weight',
                 dangling=None):
        """
        Measures node activity in a `networkx.Graph` using a thin wrapper around `networkx` implementation of pagerank algorithm (see `networkx.algorithms.link_analysis.pagerank`).  Use with `processors.ds.DirectedGraph.graph`.

        See Also
        --------
        `processors.paths.DependencyUtils.pagerank`
        Method parameters correspond to those of [`networkx.algorithms.link_analysis.pagerank`](https://networkx.github.io/documentation/development/reference/generated/networkx.algorithms.link_analysis.pagerank_alg.pagerank.html#networkx.algorithms.link_analysis.pagerank_alg.pagerank)
        """
        return DependencyUtils.pagerank(self.graph, alpha=alpha, personalization=personalization, max_iter=max_iter, tol=tol, nstart=nstart, weight=weight, dangling=dangling)

    def _build_incoming(self, edges):
        """
        Groups `edges` by destination: {destination -> [(source, relation), ...]}.
        """
        dep_dict = defaultdict(list)
        for edge in edges:
            dep_dict[edge.destination].append((edge.source, edge.relation))
        return dep_dict

    def _build_outgoing(self, edges):
        """
        Groups `edges` by source: {source -> [(destination, relation), ...]}.
        """
        dep_dict = defaultdict(list)
        for edge in edges:
            dep_dict[edge.source].append((edge.destination, edge.relation))
        return dep_dict

    def _build_labeled(self):
        """
        Renders every outgoing edge as "source_RELATION_destination" using the lower-cased words.
        """
        labeled = []
        for out in self.outgoing:
            for (dest, rel) in self.outgoing[out]:
                labeled.append("{}_{}_{}".format(self._words[out], rel.upper(), self._words[dest]))
        return labeled

    def _build_unlabeled(self):
        """
        Renders every outgoing edge as "source_destination" using the lower-cased words.
        """
        unlabeled = []
        for out in self.outgoing:
            for (dest, _) in self.outgoing[out]:
                unlabeled.append("{}_{}".format(self._words[out], self._words[dest]))
        return unlabeled

    def _graph_to_JSON_dict(self):
        """
        Returns {"edges": [edge dictionaries], "roots": [int]} without the graph name.
        """
        dg_dict = dict()
        dg_dict["edges"] = [e.to_JSON_dict() for e in self.edges]
        dg_dict["roots"] = self.roots
        return dg_dict

    def to_JSON_dict(self):
        """
        Returns {kind -> {"edges": [...], "roots": [...]}}.
        """
        return {self.kind:self._graph_to_JSON_dict()}

    def to_JSON(self):
        """
        Returns JSON as String (sorted keys, 4-space indent).
        """
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=4)


class Interval(object):
    """
    A token or character span described by its two boundaries.

    Parameters
    ----------
    start : int
        The token or character index where the interval begins.
    end : int
        One past the index of the last token/character in the span.

    Both values are stored as given; no validation is performed.
    """

    def __init__(self, start, end):
        self.start, self.end = start, end

    def to_JSON_dict(self):
        """
        Returns the span as a dictionary with the keys "start" and "end".
        """
        span = dict()
        span["start"] = self.start
        span["end"] = self.end
        return span

    def to_JSON(self):
        """
        Returns JSON as String (sorted keys, 4-space indent).
        """
        as_dict = self.to_JSON_dict()
        return json.dumps(as_dict, sort_keys=True, indent=4)

    @staticmethod
    def load_from_JSON(json):
        """
        Builds an `Interval` from a dictionary holding "start" and "end".
        """
        begin = json["start"]
        stop = json["end"]
        return Interval(start=begin, end=stop)


1			#!/usr/bin/env python
2			# -*- coding: utf-8 -*-
3
4			# Gus Hahn-Powell 2015-2016
5			# data structures for storing processors-server output
6			# based on conventions from the CLU lab's processors library (https://github.com/clulab/processors)
7			from __future__ import unicode_literals
8			from itertools import chain
9			from collections import defaultdict
10			from processors.paths import DependencyUtils
11			from processors.utils import LabelManager
12			#from six import text_type
13			import json
14			import re
15
16
17			class Document(object):
18
19			"""
20			Storage class for annotated text. Based on [`org.clulab.processors.Document`](https://github.com/clulab/processors/blob/master/main/src/main/scala/org/clulab/processors/Document.scala)
21
22			Parameters
23			----------
24			sentences : [processors.ds.Sentence]
25			The sentences comprising the `Document`.
26
27			Attributes
28			----------
29			id : str or None
30			A unique ID for the `Document`.
31			size : int
32			The number of `sentences`.
33			sentences : sentences
34			The sentences comprising the `Document`.
35			words : [str]
36			A list of the `Document`'s tokens.
37			tags : [str]
38			A list of the `Document`'s tokens represented using part of speech (PoS) tags.
39			lemmas : [str]
40			A list of the `Document`'s tokens represented using lemmas.
41			_entities : [str]
42			A list of the `Document`'s tokens represented using IOB-style named entity (NE) labels.
43			nes : dict
44			A dictionary of NE labels represented in the `Document` -> a list of corresponding text spans.
45			bag_of_labeled_deps : [str]
46			The labeled dependencies from all sentences in the `Document`.
47			bag_of_unlabeled_deps : [str]
48			The unlabeled dependencies from all sentences in the `Document`.
49			text : str or None
50			The original text of the `Document`.
51
52			Methods
53			-------
54			bag_of_labeled_dependencies_using(form)
55			Produces a list of syntactic dependencies where each edge is labeled with its grammatical relation.
56			bag_of_unlabeled_dependencies_using(form)
57			Produces a list of syntactic dependencies where each edge is left unlabeled without its grammatical relation.
58			"""
59
60			def __init__(self, sentences):
61			self.id = None
62			self.size = len(sentences)
63			self.sentences = sentences
64			# easily access token attributes from all sentences
65			self.words = list(chain(*[s.words for s in self.sentences]))
66			self.tags = list(chain(*[s.tags for s in self.sentences]))
67			self.lemmas = list(chain(*[s.lemmas for s in self.sentences]))
68			self._entities = list(chain(*[s._entities for s in self.sentences]))
69			self.nes = merge_entity_dicts = self._merge_ne_dicts()
70			self.bag_of_labeled_deps = list(chain(*[s.dependencies.labeled for s in self.sentences]))
71			self.bag_of_unlabeled_deps = list(chain(*[s.dependencies.unlabeled for s in self.sentences]))
72			self.text = None
73
74			def __hash__(self):
75			return hash(self.to_JSON())
76
77			def __unicode__(self):
78			return self.text
79
80			def __str__(self):
81			return "Document w/ {} Sentence{}".format(self.size, "" if self.size == 1 else "s")
82
83			def __eq__(self, other):
84			if isinstance(other, self.__class__):
85			return self.to_JSON() == other.to_JSON()
86			else:
87			return False
88
89			def __ne__(self, other):
90			return not self.__eq__(other)
91
92			def bag_of_labeled_dependencies_using(self, form):
93			return list(chain(*[s.labeled_dependencies_using(s._get_tokens(form)) for s in self.sentences]))
94
95			def bag_of_unlabeled_dependencies_using(self, form):
96			return list(chain(*[s.unlabeled_dependencies_using(s._get_tokens(form)) for s in self.sentences]))
97
98			def _merge_ne_dicts(self):
99			# Get the set of all NE labels found in the Doc's sentences
100			entity_labels = set(chain(*[s.nes.keys() for s in self.sentences]))
101			# Do we have any labels?
102			if entity_labels == None:
103			return None
104			# If we have labels, consolidate the NEs under the appropriate label
105			else:
106			nes_dict = dict()
107			for e in entity_labels:
108			entities = []
109			for s in self.sentences:
110			entities += s.nes[e]
111			nes_dict[e] = entities
112			return nes_dict
113
114			def to_JSON_dict(self):
115			doc_dict = dict()
116			doc_dict["sentences"] = [s.to_JSON_dict() for s in self.sentences]
117			doc_dict["text"] = self.text
118			# can the ID be set?
119			if self.id != None:
120			doc_dict["id"] = self.id
121			return doc_dict
122
123			def to_JSON(self, pretty=True):
124			"""
125			Returns JSON as String.
126			"""
127			num_spaces = 4 if pretty else 0
128			return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=num_spaces)
129
130			@staticmethod
131			def load_from_JSON(json_dict):
132			sentences = []
133			for s in json_dict["sentences"]:
134			kwargs = {
135			"words": s["words"],
136			"startOffsets": s["startOffsets"],
137			"endOffsets": s["endOffsets"],
138			"tags": s.get("tags", None),
139			"lemmas": s.get("lemmas", None),
140			"chunks": s.get("chunks", None),
141			"entities": s.get("entities", None),
142			"graphs": s.get("graphs", None)
143			}
144			sent = Sentence(**kwargs)
145			sentences.append(sent)
146			doc = Document(sentences)
147			# set id and text
148			doc.text = json_dict.get("text", None)
149			doc.id = kwargs.get("id", None)
150			return doc
151
152
153			class Sentence(object):
154
155			"""
156			Storage class for an annotated sentence. Based on [`org.clulab.processors.Sentence`](https://github.com/clulab/processors/blob/master/main/src/main/scala/org/clulab/processors/Sentence.scala)
157
158			Parameters
159			----------
160			text : str or None
161			The text of the `Sentence`.
162			words : [str]
163			A list of the `Sentence`'s tokens.
164			startOffsets : [int]
165			The character offsets starting each token (inclusive).
166			endOffsets : [int]
167			The character offsets marking the end of each token (exclusive).
168			tags : [str]
169			A list of the `Sentence`'s tokens represented using part of speech (PoS) tags.
170			lemmas : [str]
171			A list of the `Sentence`'s tokens represented using lemmas.
172			chunks : [str]
173			A list of the `Sentence`'s tokens represented using IOB-style phrase labels (ex. `B-NP`, `I-NP`, `B-VP`, etc.).
174			entities : [str]
175			A list of the `Sentence`'s tokens represented using IOB-style named entity (NE) labels.
176			graphs : dict
177			A dictionary of {graph-name -> {edges: [{source, destination, relation}], roots: [int]}}
178
179			Attributes
180			----------
181			text : str
182			The text of the `Sentence`.
183			startOffsets : [int]
184			The character offsets starting each token (inclusive).
185			endOffsets : [int]
186			The character offsets marking the end of each token (exclusive).
187			length : int
188			The number of tokens in the `Sentence`
189
190			graphs : dict
191			A dictionary (str -> `processors.ds.DirectedGraph`) mapping the graph type/name to a `processors.ds.DirectedGraph`.
192			basic_dependencies : processors.ds.DirectedGraph
193			A `processors.ds.DirectedGraph` using basic Stanford dependencies.
194			collapsed_dependencies : processors.ds.DirectedGraph
195			A `processors.ds.DirectedGraph` using collapsed Stanford dependencies.
196			dependencies : processors.ds.DirectedGraph
197			A pointer to the prefered syntactic dependency graph type for this `Sentence`.
198			_entities : [str]
199			The IOB-style Named Entity (NE) labels corresponding to each token.
200			_chunks : [str]
201			The IOB-style chunk labels corresponding to each token.
202			nes : dict
203			A dictionary of NE labels represented in the `Document` -> a list of corresponding text spans (ex. {"PERSON": [phrase 1, ..., phrase n]}). Built from `Sentence._entities`
204			phrases : dict
205			A dictionary of chunk labels represented in the `Document` -> a list of corresponding text spans (ex. {"NP": [phrase 1, ..., phrase n]}). Built from `Sentence._chunks`
206
207
208			Methods
209			-------
210			bag_of_labeled_dependencies_using(form)
211			Produces a list of syntactic dependencies where each edge is labeled with its grammatical relation.
212			bag_of_unlabeled_dependencies_using(form)
213			Produces a list of syntactic dependencies where each edge is left unlabeled without its grammatical relation.
214			"""
215
216			UNKNOWN = LabelManager.UNKNOWN
217			# the O in IOB notation
218			O = LabelManager.O
219
220			def __init__(self, **kwargs):
221			self.words = kwargs["words"]
222			self.startOffsets = kwargs["startOffsets"]
223			self.endOffsets = kwargs["endOffsets"]
224			self.length = len(self.words)
225			self.tags = self._set_toks(kwargs.get("tags", None))
226			self.lemmas = self._set_toks(kwargs.get("lemmas", None))
227			self._chunks = self._set_toks(kwargs.get("chunks", None))
228			self._entities = self._set_toks(kwargs.get("entities", None))
229			self.text = kwargs.get("text", None) or " ".join(self.words)
230			self.graphs = self._build_directed_graph_from_dict(kwargs.get("graphs", None))
231			self.basic_dependencies = self.graphs.get(DirectedGraph.STANFORD_BASIC_DEPENDENCIES, None)
232			self.collapsed_dependencies = self.graphs.get(DirectedGraph.STANFORD_COLLAPSED_DEPENDENCIES, None)
233			self.dependencies = self.collapsed_dependencies if self.collapsed_dependencies != None else self.basic_dependencies
234			# IOB tokens -> {label: [phrase 1, ..., phrase n]}
235			self.nes = self._handle_iob(self._entities)
236			self.phrases = self._handle_iob(self._chunks)
237
def __eq__(self, other):
    """
    Equality check: `other` must be an instance of this object's class and
    serialize (via `to_JSON`) to exactly the same JSON string.
    """
    if not isinstance(other, self.__class__):
        return False
    mine = self.to_JSON()
    theirs = other.to_JSON()
    return mine == theirs
243
def __ne__(self, other):
    """Inequality is defined as the negation of `__eq__`."""
    are_equal = self.__eq__(other)
    return not are_equal
246
def _get_tokens(self, form):
    """
    Select one representation of the sentence by (case-insensitive) name.

    Parameters
    ----------
    form : str
        One of "words", "tags", "lemmas", "entities" or "index".

    Returns
    -------
    The matching attribute (`words`, `tags`, `lemmas` or `nes`), or a fresh
    list of token positions `[0, ..., length - 1]` for "index".

    Raises
    ------
    ValueError
        If `form` is not one of the recognized names (previously this
        surfaced as an `UnboundLocalError`).
    """
    # NOTE(review): "entities" yields `self.nes` (a label -> phrases dict), not the
    # per-token `_entities` list -- kept as-is for compatibility; confirm intent.
    attribute_for = {
        "words": "words",
        "tags": "tags",
        "lemmas": "lemmas",
        "entities": "nes",
    }
    f = form.lower()
    if f == "index":
        return list(range(self.length))
    if f not in attribute_for:
        raise ValueError(
            'form must be "words", "tags", "lemmas", "entities", or "index" (got {!r})'.format(form)
        )
    return getattr(self, attribute_for[f])
260
def _set_toks(self, toks):
    """
    Return `toks` untouched when it is non-empty; otherwise return a
    placeholder list holding `Sentence.UNKNOWN` once per token.
    """
    if toks:
        return toks
    # annotation layer missing or empty: pad to the sentence length
    placeholder = Sentence.UNKNOWN
    return [placeholder] * self.length
263
def _handle_iob(self, iob):
    """
    Consolidates consecutive tokens sharing a label into phrases.

    Accepts both bare labels ("PERSON PERSON O") and IOB-prefixed labels
    ("B-PER I-PER O", as produced by e.g. the bionlp annotator). A "B-" or
    "I-" prefix is stripped before labels are compared, and a "B-" tag always
    opens a new span, even directly after a span with the same label.

    Parameters
    ----------
    iob : iterable of str
        One label per token, aligned with `self.words`.

    Returns
    -------
    collections.defaultdict(list)
        {label: [phrase 1, ..., phrase n]}; may be empty.
    """
    # Labels that never open a span: the IOB "outside" tag and the placeholder
    # `_set_toks` substitutes for a missing annotation layer.
    outside = (self.O, self.UNKNOWN)
    entity_dict = defaultdict(list)
    labels = list(iob)
    current = None  # label of the span currently open, if any
    start = None    # index of that span's first token
    for i, raw in enumerate(labels):
        tag = str(raw)
        begins = tag.startswith("B-")
        if begins or tag.startswith("I-"):
            tag = tag[2:]
        # close the open span when the label changes or a new one begins
        if current is not None and (begins or tag != current):
            entity_dict[current].append(' '.join(self.words[start:i]))
            current = None
        if current is None and tag not in outside:
            current = tag
            start = i
    # a span that runs to the final token is closed here
    if current is not None:
        entity_dict[current].append(' '.join(self.words[start:len(labels)]))
    # this might be empty
    return entity_dict
311
def _build_directed_graph_from_dict(self, graphs):
    """
    Wrap each stored graph description in a `DirectedGraph`.

    Parameters
    ----------
    graphs : dict or None
        {kind: deps}; each pair is passed to `DirectedGraph` together with
        this sentence's words.

    Returns
    -------
    dict
        {kind: DirectedGraph}. Always a dict -- empty when `graphs` is None
        or empty -- because callers use `.get` and `.items()` on the result.
    """
    deps_dict = dict()
    # process each stored graph (if any)
    for (kind, deps) in (graphs or {}).items():
        deps_dict[kind] = DirectedGraph(kind, deps, self.words)
    return deps_dict
320
def __unicode__(self):
    """The unicode form of a sentence is its `text` attribute."""
    rendered = self.text
    return rendered
323
def to_string(self):
    """
    Render the sentence as space-separated `word__tag` pairs, one pair per
    token position in `range(self.length)`.
    """
    pairs = []
    for position in range(self.length):
        word = self.words[position]
        tag = self.tags[position]
        pairs.append("{w}__{p}".format(w=word, p=tag))
    return ' '.join(pairs)
326
def labeled_dependencies_using(self, tokens):
    """
    Generates a list of labeled dependencies for a sentence
    using the provided tokens.

    Parameters
    ----------
    tokens : sequence
        Indexed by token position; supplies the text used for each edge end.

    Returns
    -------
    [str]
        One "head_RELATION_dependent" string per outgoing edge of
        `self.dependencies`, with the relation upper-cased. Empty when the
        sentence has no dependency graph.
    """
    deps = self.dependencies
    # no parse available for this sentence -> nothing to report
    if deps is None:
        return []
    return [
        "{}_{}_{}".format(tokens[source], relation.upper(), tokens[destination])
        for source in deps.outgoing
        for (destination, relation) in deps.outgoing[source]
    ]
340
def unlabeled_dependencies_using(self, tokens):
    """
    Generate a list of unlabeled dependencies for a sentence
    using the provided tokens.

    Parameters
    ----------
    tokens : sequence
        Indexed by token position; supplies the text used for each edge end.

    Returns
    -------
    [str]
        One "head_dependent" string per outgoing edge of `self.dependencies`.
        Empty when the sentence has no dependency graph.
    """
    deps = self.dependencies
    if deps is None:
        return []
    # Built straight from the edges rather than by splitting labeled strings on
    # "_": relations such as "prep_of" (or tokens containing "_") would
    # otherwise produce more than three fields and break the unpacking.
    return [
        "{}_{}".format(tokens[source], tokens[destination])
        for source in deps.outgoing
        for (destination, _) in deps.outgoing[source]
    ]
351
def to_JSON_dict(self):
    """
    Collect the sentence's serializable fields into a plain dict: words,
    offsets, tags, lemmas, the token-level entity labels, and every graph
    (each converted with its own `_graph_to_JSON_dict`).
    """
    sentence_dict = {
        "words": self.words,
        "startOffsets": self.startOffsets,
        "endOffsets": self.endOffsets,
        "tags": self.tags,
        "lemmas": self.lemmas,
        "entities": self._entities,
    }
    # add graphs
    sentence_dict["graphs"] = {
        kind: graph._graph_to_JSON_dict() for (kind, graph) in self.graphs.items()
    }
    return sentence_dict
365
def to_JSON(self):
    """Serialize `to_JSON_dict()` as a JSON string (sorted keys, 4-space indent)."""
    payload = self.to_JSON_dict()
    return json.dumps(payload, sort_keys=True, indent=4)
368
@staticmethod
def load_from_JSON(json_dict):
    """
    Build a `Sentence` from its JSON dictionary form.

    Parameters
    ----------
    json_dict : dict
        Must contain "words", "startOffsets" and "endOffsets". "lemmas",
        "tags", "entities", "chunks", "text" and "graphs" are optional and
        default to None.

    Raises
    ------
    KeyError
        If one of the required keys is missing.
    """
    return Sentence(
        words=json_dict["words"],
        startOffsets=json_dict["startOffsets"],
        endOffsets=json_dict["endOffsets"],
        lemmas=json_dict.get("lemmas", None),
        tags=json_dict.get("tags", None),
        entities=json_dict.get("entities", None),
        # forwarded so that `phrases` can be built when chunk labels are present
        chunks=json_dict.get("chunks", None),
        text=json_dict.get("text", None),
        graphs=json_dict.get("graphs", None)
    )
382
383
class Edge(object):

    """
    A single labeled, directed edge.

    Parameters
    ----------
    source
        The node the edge leaves from.
    destination
        The node the edge points to.
    relation
        The label attached to the edge.
    """

    def __init__(self, source, destination, relation):
        self.source = source
        self.destination = destination
        self.relation = relation

    def __unicode__(self):
        return self.to_string()

    def to_string(self):
        return "Edge(source: {}, destination: {}, relation: {})".format(self.source, self.destination, self.relation)

    def __eq__(self, other):
        # equal when `other` is an Edge-compatible instance with identical JSON
        if isinstance(other, self.__class__):
            return self.to_JSON() == other.to_JSON()
        else:
            return False

    def __ne__(self, other):
        # Python 2 does not derive `!=` from `__eq__`; define it explicitly,
        # as the other classes in this module do.
        return not self.__eq__(other)

    def to_JSON_dict(self):
        edge_dict = dict()
        edge_dict["source"] = self.source
        edge_dict["destination"] = self.destination
        edge_dict["relation"] = self.relation
        return edge_dict

    def to_JSON(self):
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=4)
412
413
class DirectedGraph(object):

    """
    Storage class for directed graphs.


    Parameters
    ----------
    kind : str
        The name of the directed graph.
    deps : dict
        A dictionary of {edges: [{source, destination, relation}], roots: [int]}
    words : [str]
        A list of the word form of the tokens from the originating `Sentence`.

    Attributes
    ----------
    _words : [str]
        The lower-cased word form of the tokens from the originating `Sentence`.
    roots : [int]
        A list of indices for the syntactic dependency graph's roots. Generally this is a single token index.
    edges: list[processors.ds.Edge]
        A list of `processors.ds.Edge`
    incoming : A dictionary of {int -> [(int, str)]} mapping each node to the (source, relation) pairs of its incoming edges.
    outgoing : A dictionary of {int -> [(int, str)]} mapping each node to the (destination, relation) pairs of its outgoing edges.
    labeled : [str]
        A list of strings where each element represents an edge encoded as source word, upper-cased relation, and destination word ("source_RELATION_destination").
    unlabeled : [str]
        A list of strings where each element represents an edge encoded as source word and destination word ("source_destination").
    graph
        The result of `DependencyUtils.build_networkx_graph` for this graph's roots and edges. Used by `shortest_path` and `pagerank`.

    Methods
    -------
    shortest_path(start, end)
        Finds the shortest path between two nodes of the graph.
    pagerank(...)
        Runs pagerank over the graph.
    """
    STANFORD_BASIC_DEPENDENCIES = "stanford-basic"
    STANFORD_COLLAPSED_DEPENDENCIES = "stanford-collapsed"

    def __init__(self, kind, deps, words):
        self._words = [w.lower() for w in words]
        self.kind = kind
        self.roots = deps.get("roots", [])
        # tolerate a missing "edges" entry the same way a missing "roots" entry is tolerated
        self.edges = [Edge(e["source"], e["destination"], e["relation"]) for e in deps.get("edges", [])]
        self.incoming = self._build_incoming(self.edges)
        self.outgoing = self._build_outgoing(self.edges)
        self.labeled = self._build_labeled()
        self.unlabeled = self._build_unlabeled()
        self.graph = DependencyUtils.build_networkx_graph(roots=self.roots, edges=self.edges, name=self.kind)

    def __unicode__(self):
        # must produce text, not the edge list itself: one edge per line
        return "\n".join(edge.to_string() for edge in self.edges)

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.to_JSON() == other.to_JSON()
        else:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def shortest_path(self, start, end):
        """
        Find the shortest path in the syntactic dependency graph
        between the provided start and end nodes.

        Returns the edges retrieved for that path, or None when
        `DependencyUtils.shortest_path` yields no result.

        See Also
        --------
        `processors.paths.DependencyUtils.shortest_path`
        """
        res = DependencyUtils.shortest_path(self.graph, start, end)
        return DependencyUtils.retrieve_edges(self, res) if res else None

    def pagerank(self,
                 alpha=0.85,
                 personalization=None,
                 max_iter=1000,
                 tol=1e-06,
                 nstart=None,
                 weight='weight',
                 dangling=None):
        """
        Measures node activity in a `networkx.Graph` using a thin wrapper around `networkx` implementation of pagerank algorithm (see `networkx.algorithms.link_analysis.pagerank`).  Use with `processors.ds.DirectedGraph.graph`.

        See Also
        --------
        `processors.paths.DependencyUtils.pagerank`
        Method parameters correspond to those of [`networkx.algorithms.link_analysis.pagerank`](https://networkx.github.io/documentation/development/reference/generated/networkx.algorithms.link_analysis.pagerank_alg.pagerank.html#networkx.algorithms.link_analysis.pagerank_alg.pagerank)
        """
        return DependencyUtils.pagerank(self.graph, alpha=alpha, personalization=personalization, max_iter=max_iter, tol=tol, nstart=nstart, weight=weight, dangling=dangling)

    def _build_incoming(self, edges):
        # destination -> [(source, relation), ...]
        dep_dict = defaultdict(list)
        for edge in edges:
            dep_dict[edge.destination].append((edge.source, edge.relation))
        return dep_dict

    def _build_outgoing(self, edges):
        # source -> [(destination, relation), ...]
        dep_dict = defaultdict(list)
        for edge in edges:
            dep_dict[edge.source].append((edge.destination, edge.relation))
        return dep_dict

    def _build_labeled(self):
        # "source_RELATION_destination" for every outgoing edge, using lower-cased words
        labeled = []
        for out in self.outgoing:
            for (dest, rel) in self.outgoing[out]:
                labeled.append("{}_{}_{}".format(self._words[out], rel.upper(), self._words[dest]))
        return labeled

    def _build_unlabeled(self):
        # "source_destination" for every outgoing edge, using lower-cased words
        unlabeled = []
        for out in self.outgoing:
            for (dest, _) in self.outgoing[out]:
                unlabeled.append("{}_{}".format(self._words[out], self._words[dest]))
        return unlabeled

    def _graph_to_JSON_dict(self):
        dg_dict = dict()
        dg_dict["edges"] = [e.to_JSON_dict() for e in self.edges]
        dg_dict["roots"] = self.roots
        return dg_dict

    def to_JSON_dict(self):
        return {self.kind:self._graph_to_JSON_dict()}

    def to_JSON(self):
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=4)
546
547
class Interval(object):
    """
    A token or character span.

    Parameters
    ----------
    start
        The token or character index where the interval begins.
    end
        One past the index of the last token/character in the span.
    """

    def __init__(self, start, end):
        self.start, self.end = start, end

    def to_JSON_dict(self):
        """Return the span as a {"start": ..., "end": ...} dictionary."""
        span = dict()
        span["start"] = self.start
        span["end"] = self.end
        return span

    def to_JSON(self):
        """Serialize `to_JSON_dict()` as a JSON string (sorted keys, 4-space indent)."""
        payload = self.to_JSON_dict()
        return json.dumps(payload, sort_keys=True, indent=4)

    @staticmethod
    def load_from_JSON(json):
        """Build an `Interval` from a mapping holding "start" and "end"."""
        # note: the parameter shadows the `json` module inside this method only
        begin = json["start"]
        finish = json["end"]
        return Interval(start=begin, end=finish)
573

myedibleenso / py-processors

Push — master ( 352884...922d0d )

Dependencies A

Complexity

Size/Duplication

Importance

2 Methods

Duplication Side-by-Side

Filter issues like