Completed
Push — master ( 352884...922d0d )
by Gus
01:03
created

DependencyUtils   B

Complexity

Total Complexity 41

Size/Duplication

Total Lines 229
Duplicated Lines 0 %

Importance

Changes 2
Bugs 0 Features 0
Metric Value
dl 0
loc 229
rs 8.2769
c 2
b 0
f 0
wmc 41

7 Methods

Rating   Name   Duplication   Size   Complexity  
F lexicalize_path() 0 68 14
F shortest_path() 0 29 10
A pagerank() 0 22 1
B retrieve_edges() 0 31 5
A build_networkx_graph() 0 12 2
A path_to_edges() 0 2 2
F simplify_tag() 0 40 9

How to fix   Complexity   

Complex Class

Complex classes like DependencyUtils often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
#!/usr/bin/env python
2
# -*- coding: utf-8 -*-
3
from __future__ import unicode_literals
4
from processors.utils import LabelManager
5
import networkx as nx
6
import collections
7
8
9
class DependencyUtils(object):
10
    """
11
    A set of utilities for analyzing syntactic dependency graphs.
12
13
    Methods
14
    -------
15
    build_networkx_graph(roots, edges, name)
16
        Constructs a networkx.Graph
17
    shortest_path(g, start, end)
18
        Finds the shortest path in a `networkx.Graph` between any element in a list of start nodes and any element in a list of end nodes.
19
    retrieve_edges(dep_graph, path)
20
        Converts output of `shortest_path` into a list of triples that include the grammatical relation (and direction) for each node-node "hop" in the syntactic dependency graph.
21
    simplify_tag(tag)
22
        Maps part of speech (PoS) tag to a subset of PoS tags to better consolidate categorical labels.
23
    lexicalize_path(sentence, path, words=False, lemmas=False, tags=False, simple_tags=False, entities=False)
24
        Lexicalizes path in syntactic dependency graph using Odin-style token constraints.
25
    pagerank(networkx_graph, alpha=0.85, personalization=None, max_iter=1000, tol=1e-06, nstart=None, weight='weight', dangling=None)
26
        Measures node activity in a `networkx.Graph` using a thin wrapper around `networkx` implementation of pagerank algorithm (see `networkx.algorithms.link_analysis.pagerank`).  Use with `processors.ds.DirectedGraph.graph`.
27
    """
28
29
    UNKNOWN = LabelManager.UNKNOWN
30
31
    @staticmethod
32
    def build_networkx_graph(roots, edges, name):
33
        """
34
        Converts a `processors` dependency graph into a networkx graph
35
        """
36
        G = nx.Graph()
37
        graph_name = name
38
        # store roots
39
        G.graph["roots"] = roots
40
        edges = [(edge.source, edge.destination, {"relation": edge.relation}) for edge in edges]
41
        G.add_edges_from(edges)
42
        return G
43
44
    @staticmethod
45
    def shortest_path(g, start, end):
46
        """
47
        Find the shortest path between two nodes.
48
49
        Parameters
50
        ----------
51
        start : int or [int]
52
            A single token index or list of token indices serving as the start of the graph traversal.
53
        end : int or [int]
54
            A single token index or list of token indices serving as the end of the graph traversal.
55
        """
56
        start = start if isinstance(start, collections.Iterable) else [start]
57
        end = end if isinstance(end, collections.Iterable) else [end]
58
        # node list -> edges (i.e., (source, dest) pairs)
59
        def path_to_edges(g, path):
60
            return [(path[i], path[i+1]) for i in range(len(path) - 1)]
61
62
        shortest_paths = []
63
        for s in start:
64
            for e in end:
65
                try:
66
                    path = nx.algorithms.shortest_path(g, s, e)
67
                    shortest_paths.append(path_to_edges(g, path))
68
                # no path found...
69
                except:
70
                    #print("No path found between '{}' and '{}'".format(s, e))
71
                    continue
72
        return None if len(shortest_paths) == 0 else min(shortest_paths, key=lambda x: len(x))
73
74
    @staticmethod
75
    def retrieve_edges(dep_graph, path):
76
        """
77
        Converts output of Converts output of `DependencyUtils.shortest_path`
78
        into a list of triples that include the grammatical relation (and direction)
79
        for each node-node "hop" in the syntactic dependency graph.
80
81
        Parameters
82
        ----------
83
        dep_graph : processors.ds.DirectedGraph
84
            The `DirectedGraph` used to retrieve the grammatical relations for each edge in the `path`.
85
        path : [(int, int)]
86
            A list of tuples representing the shortest path from A to B in `dep_graph`.
87
88
        Returns
89
        -------
90
        [(int, str, int)]
91
            the shortest path (`path`) enhanced with the directed grammatical relations
92
            (ex. `>nsubj` for `predicate` to `subject` vs. `<nsubj` for `subject` to `predicate`).
93
        """
94
95
        shortest_path = []
96
        for (s, d) in path:
97
            # build dictionaries from incoming/outgoing
98
            outgoing = dep_graph.outgoing[s]
99
            outgoing = dict(outgoing) if len(outgoing) > 0 else dict()
100
            incoming = dep_graph.incoming[s]
101
            incoming = dict(incoming) if len(incoming) > 0 else dict()
102
            relation = ">{}".format(outgoing[d]) if d in outgoing else "<{}".format(incoming[d])
103
            shortest_path.append((s, relation, d))
104
        return shortest_path
105
106
    @staticmethod
107
    def simplify_tag(tag):
108
        """
109
        Maps part of speech (PoS) tag to a subset of PoS tags to better consolidate categorical labels.
110
111
        Parameters
112
        ----------
113
        tag : str
114
            The Penn-style PoS tag to be mapped to a simplified form.
115
116
        Returns
117
        -------
118
        str
119
            A simplified form of `tag`.  In some cases, the returned form may be identical to `tag`.
120
        """
121
        simple_tag = "\"{}\"".format(tag)
122
        # collapse plurals
123
        if tag.startswith("NNP"):
124
            simple_tag = "/^NNP/"
125
        # collapse plurals
126
        elif tag.startswith("NN"):
127
            simple_tag = "/^N/"
128
        elif tag.startswith("VB"):
129
            simple_tag = "/^V/"
130
        # collapse comparative, superlatives, etc.
131
        elif tag.startswith("JJ"):
132
            simple_tag = "/^J/"
133
        # collapse comparative, superlatives, etc.
134
        elif tag.startswith("RB"):
135
            simple_tag = "/^RB/"
136
        # collapse possessive/non-possesive pronouns
137
        elif tag.startswith("PRP"):
138
            simple_tag = "/^PRP/"
139
        # treat WH determiners as DT
140
        elif tag == "WDT":
141
            simple_tag = "/DT$/"
142
        # treat DT the same as WDT
143
        elif tag == "DT":
144
            simple_tag = "/DT$/"
145
        return simple_tag
146
147
    @staticmethod
148
    def lexicalize_path(sentence,
149
                        path,
150
                        words=False,
151
                        lemmas=False,
152
                        tags=False,
153
                        simple_tags=False,
154
                        entities=False):
155
        """
156
        Lexicalizes path in syntactic dependency graph using Odin-style token constraints.  Operates on output of `DependencyUtils.retrieve_edges`
157
158
        Parameters
159
        ----------
160
        sentence : processors.ds.Sentence
161
            The `Sentence` from which the `path` was found.  Used to lexicalize the `path`.
162
        words : bool
163
            Whether or not to encode nodes in the `path` with a token constraint constructed from `Sentence.words`
164
        lemmas : bool
165
            Whether or not to encode nodes in the `path` with a token constraint constructed from `Sentence.lemmas`
166
        tags : bool
167
            Whether or not to encode nodes in the `path` with a token constraint constructed from `Sentence.tags`
168
        simple_tags : bool
169
            Whether or not to encode nodes in the `path` with a token constraint constructed from `DependencyUtils.simplify_tag` applied to `Sentence.tags`
170
        entities : bool
171
            Whether or not to encode nodes in the `path` with a token constraint constructed from `Sentence._entities`
172
173
        Returns
174
        -------
175
        [str]
176
            The lexicalized form of `path`, encoded according to the specified parameters.
177
        """
178
        UNKNOWN = LabelManager.UNKNOWN
179
        lexicalized_path = []
180
        relations = []
181
        nodes = []
182
        # gather edges and nodes
183
        for edge in path:
184
            relations.append(edge[1])
185
            nodes.append(edge[0])
186
        nodes.append(path[-1][-1])
187
188
        for (i, node) in enumerate(nodes):
189
            # build token constraints
190
            token_constraints = []
191
            # words
192
            if words:
193
                token_constraints.append("word=\"{}\"".format(sentence.words[node]))
194
            # PoS tags
195
            if tags and sentence.tags[node] != UNKNOWN:
196
                token_constraints.append("tag=\"{}\"".format(sentence.tags[node]))
197
            # lemmas
198
            if lemmas and sentence.lemmas[node] != UNKNOWN:
199
                token_constraints.append("lemma=\"{}\"".format(sentence.lemmas[node]))
200
            # NE labels
201
            if entities and sentence._entities[node] != UNKNOWN:
202
                token_constraints.append("entity=\"{}\"".format(sentence.entity[node]))
203
            # simple tags
204
            if simple_tags and sentence.tags[node] != UNKNOWN:
205
                token_constraints.append("tag={}".format(DependencyUtils.simplify_tag(sentence.tags[node])))
206
            # build node pattern
207
            if len(token_constraints) > 0:
208
                node_pattern = "[{}]".format(" & ".join(token_constraints))
209
                # store lexicalized representation of node
210
                lexicalized_path.append(node_pattern)
211
            # append next edge
212
            if i < len(relations):
213
                lexicalized_path.append(relations[i])
214
        return lexicalized_path
215
216
    @staticmethod
217
    def pagerank(networkx_graph,
218
                 alpha=0.85,
219
                 personalization=None,
220
                 max_iter=1000,
221
                 tol=1e-06,
222
                 nstart=None,
223
                 weight='weight',
224
                 dangling=None):
225
        """
226
        Measures node activity in a `networkx.Graph` using a thin wrapper around `networkx` implementation of pagerank algorithm (see `networkx.algorithms.link_analysis.pagerank`).  Use with `processors.ds.DirectedGraph.graph`.
227
228
        Parameters
229
        ----------
230
        networkx_graph : networkx.Graph
231
            Corresponds to `G` parameter of `networkx.algorithms.link_analysis.pagerank`.
232
233
        See Also
234
        --------
235
        Method parameters correspond to those of [`networkx.algorithms.link_analysis.pagerank`](https://networkx.github.io/documentation/development/reference/generated/networkx.algorithms.link_analysis.pagerank_alg.pagerank.html#networkx.algorithms.link_analysis.pagerank_alg.pagerank)
236
        """
237
        return nx.algorithms.link_analysis.pagerank(G=networkx_graph, alpha=alpha, personalization=personalization, max_iter=max_iter, tol=tol, nstart=nstart, weight=weight, dangling=dangling)
238