Completed
Push — master (8c21e2...b98938) by Gus, created 01:05

DependencyUtils.shortest_paths()   (Rating: F)

Complexity
    Conditions: 10

Size
    Total Lines: 42

Duplication
    Lines: 0
    Ratio: 0 %

Importance
    Changes: 2
    Bugs: 0    Features: 0

Metric                    Value
cc   (conditions)         10
dl   (duplicated lines)   0
loc  (total lines)        42
rs                        3.1304
c    (changes)            2
b    (bugs)               0
f    (features)           0

1 Method

Rating   Name                              Duplication   Size   Complexity
A        DependencyUtils.path_to_edges()   0             2      2

How to fix: Complexity

Complex classes like DependencyUtils.shortest_paths() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster.
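
As a rough illustration of that advice applied here (a hypothetical sketch under assumed names, not code from the library), the pairwise pathfinding loop inside `shortest_paths()` could be pulled out into a small helper such as `_paths_between`, moving most of the branching out of the flagged method:

# Hypothetical refactoring sketch (names like _paths_between are illustrative and
# do not exist in py-processors): extract the per-pair pathfinding from
# DependencyUtils.shortest_paths() into a helper so the flagged method keeps
# only input normalization and the final aggregation.
import networkx as nx


def _paths_between(g, s, e):
    """All shortest paths between s and e as lists of (source, dest) edges; [] if none."""
    try:
        return [list(zip(path, path[1:]))
                for path in nx.algorithms.all_shortest_paths(g, s, e)]
    except nx.NetworkXException:
        # covers "no path" (and, in recent networkx versions, missing nodes)
        return []


def shortest_paths(g, start, end):
    """Same contract as DependencyUtils.shortest_paths, with the inner loop extracted."""
    start = [start] if isinstance(start, int) else start
    end = [end] if isinstance(end, int) else end
    results = [edges for s in start for e in end for edges in _paths_between(g, s, e)]
    return results or None

For example, with `G = nx.Graph([(0, 1), (1, 2)])`, `shortest_paths(G, 0, 2)` would return `[[(0, 1), (1, 2)]]`, matching the original contract. The flagged source for `DependencyUtils` follows.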

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from processors.utils import LabelManager
from collections import Counter
import networkx as nx
import collections


class DependencyUtils(object):
    """
    A set of utilities for analyzing syntactic dependency graphs.

    Methods
    -------
    build_networkx_graph(roots, edges, name)
        Constructs a networkx.Graph

    shortest_path(g, start, end)
        Finds the shortest path in a `networkx.Graph` between any element in a list of start nodes and any element in a list of end nodes.

    retrieve_edges(dep_graph, path)
        Converts output of `shortest_path` into a list of triples that include the grammatical relation (and direction) for each node-node "hop" in the syntactic dependency graph.

    simplify_tag(tag)
        Maps part of speech (PoS) tag to a subset of PoS tags to better consolidate categorical labels.

    lexicalize_path(sentence, path, words=False, lemmas=False, tags=False, simple_tags=False, entities=False)
        Lexicalizes path in syntactic dependency graph using Odin-style token constraints.

    pagerank(networkx_graph, alpha=0.85, personalization=None, max_iter=1000, tol=1e-06, nstart=None, weight='weight', dangling=None)
        Measures node activity in a `networkx.Graph` using a thin wrapper around `networkx` implementation of pagerank algorithm (see `networkx.algorithms.link_analysis.pagerank`).  Use with `processors.ds.DirectedGraph.graph`.
    """

    UNKNOWN = LabelManager.UNKNOWN

    @staticmethod
    def build_networkx_graph(roots, edges, name, reverse=False):
        """
        Converts a `processors` dependency graph into a networkx graph
        """
        G = nx.DiGraph()
        graph_name = name
        # store roots
        G.graph["roots"] = roots
        # reversing the graph is useful if you want to run pagerank to highlight predicate and argument nodes
        if reverse:
            edges = [(edge.destination, edge.source, {"relation": edge.relation}) for edge in edges]
        else:
            edges = [(edge.source, edge.destination, {"relation": edge.relation}) for edge in edges]
        G.add_edges_from(edges)
        return G

    @staticmethod
    def shortest_paths(g, start, end):
        """
        Find the shortest paths between two nodes.
        Note that pathfinding is sensitive to direction: if `g` is a directed graph, a path that runs against edge direction will not be found.

        Parameters
        ----------
        g : a networkx graph
            The networkx graph to explore.

        start : int or [int]
            A single token index or list of token indices serving as the start of the graph traversal.

        end : int or [int]
            A single token index or list of token indices serving as the end of the graph traversal.

        Returns
        -------
        None or [[(int, int)]]
            None if no paths are found.  Otherwise, a list of lists of (source index, target index) tuples representing path segments.
        """
        # converts single int to [int]
        start = start if isinstance(start, collections.Iterable) else [start]
        end = end if isinstance(end, collections.Iterable) else [end]
        # node list -> edges (i.e., (source, dest) pairs)
        def path_to_edges(g, path):
            return [(path[i], path[i+1]) for i in range(len(path) - 1)]

        shortest_paths = []
        # pathfinding b/w pairs of nodes
        for s in start:
            for e in end:
                try:
                    paths = nx.algorithms.all_shortest_paths(g, s, e)
                    for path in paths:
                        shortest_paths.append(path_to_edges(g, path))
                # no path found...
                except:
                    #print("No path found between '{}' and '{}'".format(s, e))
                    continue
        return None if len(shortest_paths) == 0 else shortest_paths

    @staticmethod
    def shortest_path(g, start, end, scoring_func=lambda path: -len(path)):
        """
        Find the shortest path between two nodes.
        Note that pathfinding is sensitive to direction.  If you want to ignore direction, convert your networkx.DiGraph to a networkx.Graph.

        Parameters
        ----------
        g : a networkx graph
            The networkx graph to explore.

        start : int or [int]
            A single token index or list of token indices serving as the start of the graph traversal.

        end : int or [int]
            A single token index or list of token indices serving as the end of the graph traversal.

        scoring_func : function
            A function that scores each path in a list of paths.  Each path has the form [(source index, relation, destination index)].
            The path with the maximum score will be returned.

        Returns
        -------
        None or [(int, int)]
            None if no paths are found.  Otherwise, a list of (source index, target index) tuples representing path segments.
        """
        paths = DependencyUtils.shortest_paths(g, start, end)
        # `shortest_paths` returns None when no path exists
        return None if paths is None else max(paths, key=scoring_func)

    @staticmethod
    def directed_relation(source_idx, destination_idx, relation, deps):
        """
        Converts relation to a directed relation (incoming v. outgoing)
        if such a relation links `source_idx` and `destination_idx` in `deps`.

        Parameters
        ----------
        source_idx : int
            The token index for the source node

        destination_idx : int
            The token index for the destination node

        relation : str
            The undirected relation (i.e., the grammatical/semantic relation that connects the two nodes)

        deps : processors.ds.DirectedGraph
            The directed graph to be referenced

        Returns
        -------
        str or None
            The directed relation that connects the `source_idx` to the `destination_idx` in `deps`.
        """
        matches = [">{}".format(rel) for d, rel in deps.outgoing[source_idx] if d == destination_idx and rel == relation] + \
        ["<{}".format(rel) for d, rel in deps.incoming[source_idx] if d == destination_idx and rel == relation]
        return None if len(matches) == 0 else matches[0]

    @staticmethod
    def retrieve_edges(dep_graph, path):
        """
        Converts output of `DependencyUtils.shortest_path`
        into a list of triples that include the grammatical relation (and direction)
        for each node-node "hop" in the syntactic dependency graph.

        Parameters
        ----------
        dep_graph : processors.ds.DirectedGraph
            The `DirectedGraph` used to retrieve the grammatical relations for each edge in the `path`.

        path : [(int, int)]
            A list of tuples representing the shortest path from A to B in `dep_graph`.

        Returns
        -------
        [(int, str, int)]
            the shortest path (`path`) enhanced with the directed grammatical relations
            (ex. `>nsubj` for `predicate` to `subject` vs. `<nsubj` for `subject` to `predicate`).
        """

        shortest_path = []
        for (s, d) in path:
            # build dictionaries from incoming/outgoing
            outgoing = {dest_idx:">{}".format(rel) for (dest_idx, rel) in dep_graph.outgoing[s]}
            incoming = {source_idx:"<{}".format(rel) for (source_idx, rel) in dep_graph.incoming[s]}
            relation = outgoing[d] if d in outgoing else incoming[d]
            shortest_path.append((s, relation, d))
        return shortest_path

    @staticmethod
    def simplify_tag(tag):
        """
        Maps part of speech (PoS) tag to a subset of PoS tags to better consolidate categorical labels.

        Parameters
        ----------
        tag : str
            The Penn-style PoS tag to be mapped to a simplified form.

        Returns
        -------
        str
            A simplified form of `tag`.  In some cases, the returned form may be identical to `tag`.
        """
        simple_tag = "\"{}\"".format(tag)
        # collapse plurals
        if tag.startswith("NNP"):
            simple_tag = "/^NNP/"
        # collapse plurals
        elif tag.startswith("NN"):
            simple_tag = "/^N/"
        elif tag.startswith("VB"):
            simple_tag = "/^V/"
        # collapse comparative, superlatives, etc.
        elif tag.startswith("JJ"):
            simple_tag = "/^J/"
        # collapse comparative, superlatives, etc.
        elif tag.startswith("RB"):
            simple_tag = "/^RB/"
        # collapse possessive/non-possessive pronouns
        elif tag.startswith("PRP"):
            simple_tag = "/^PRP/"
        # treat WH determiners as DT
        elif tag == "WDT":
            simple_tag = "/DT$/"
        # treat DT the same as WDT
        elif tag == "DT":
            simple_tag = "/DT$/"
        return simple_tag

    @staticmethod
    def lexicalize_path(sentence,
                        path,
                        words=False,
                        lemmas=False,
                        tags=False,
                        simple_tags=False,
                        entities=False,
                        limit_to=None,
                        ):
        """
        Lexicalizes path in syntactic dependency graph using Odin-style token constraints.  Operates on output of `DependencyUtils.retrieve_edges`

        Parameters
        ----------
        sentence : processors.ds.Sentence
            The `Sentence` from which the `path` was found.  Used to lexicalize the `path`.

        path : list
            A list of (source index, relation, target index) triples.

        words : bool
            Whether or not to encode nodes in the `path` with a token constraint constructed from `Sentence.words`

        lemmas : bool
            Whether or not to encode nodes in the `path` with a token constraint constructed from `Sentence.lemmas`

        tags : bool
            Whether or not to encode nodes in the `path` with a token constraint constructed from `Sentence.tags`

        simple_tags : bool
            Whether or not to encode nodes in the `path` with a token constraint constructed from `DependencyUtils.simplify_tag` applied to `Sentence.tags`

        entities : bool
            Whether or not to encode nodes in the `path` with a token constraint constructed from `Sentence._entities`

        limit_to : [int] or None
            Selectively apply lexicalization only to this list of token indices.  None means apply the specified lexicalization to all token indices in the path.
        Returns
        -------
        [str]
            The lexicalized form of `path`, encoded according to the specified parameters.
        """
        UNKNOWN = LabelManager.UNKNOWN
        lexicalized_path = []
        relations = []
        nodes = []
        # gather edges and nodes
        for edge in path:
            relations.append(edge[1])
            nodes.append(edge[0])
        nodes.append(path[-1][-1])

        for (i, node) in enumerate(nodes):
            if not limit_to or node in limit_to:
                # build token constraints
                token_constraints = []
                # words
                if words:
                    token_constraints.append("word=\"{}\"".format(sentence.words[node]))
                # PoS tags
                if tags and sentence.tags[node] != UNKNOWN:
                    token_constraints.append("tag=\"{}\"".format(sentence.tags[node]))
                # lemmas
                if lemmas and sentence.lemmas[node] != UNKNOWN:
                    token_constraints.append("lemma=\"{}\"".format(sentence.lemmas[node]))
                # NE labels
                if entities and sentence._entities[node] != UNKNOWN:
                    token_constraints.append("entity=\"{}\"".format(sentence._entities[node]))
                # simple tags
                if simple_tags and sentence.tags[node] != UNKNOWN:
                    token_constraints.append("tag={}".format(DependencyUtils.simplify_tag(sentence.tags[node])))
                # build node pattern
                if len(token_constraints) > 0:
                    node_pattern = "[{}]".format(" & ".join(token_constraints))
                    # store lexicalized representation of node
                    lexicalized_path.append(node_pattern)
                # append next edge
                if i < len(relations):
                    lexicalized_path.append(relations[i])
        return lexicalized_path

    @staticmethod
    def pagerank(networkx_graph,
                 alpha=0.85,
                 personalization=None,
                 max_iter=1000,
                 tol=1e-06,
                 nstart=None,
                 weight='weight',
                 dangling=None):
        """
        Measures node activity in a `networkx.Graph` using a thin wrapper around `networkx` implementation of pagerank algorithm (see `networkx.algorithms.link_analysis.pagerank`).  Use with `processors.ds.DirectedGraph.graph`.

        Parameters
        ----------
        networkx_graph : networkx.Graph
            Corresponds to `G` parameter of `networkx.algorithms.link_analysis.pagerank`.

        See Also
        --------
        Method parameters correspond to those of [`networkx.algorithms.link_analysis.pagerank`](https://networkx.github.io/documentation/development/reference/generated/networkx.algorithms.link_analysis.pagerank_alg.pagerank.html#networkx.algorithms.link_analysis.pagerank_alg.pagerank)

        Returns
        -------
        collections.Counter
            A collections.Counter of node -> pagerank weights
        """
        pg_res = nx.algorithms.link_analysis.pagerank(G=networkx_graph, alpha=alpha, personalization=personalization, max_iter=max_iter, tol=tol, nstart=nstart, weight=weight, dangling=dangling)
        return Counter(pg_res)
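
For orientation, here is a hypothetical end-to-end usage sketch (the toy `Edge` namedtuple below is only a stand-in exposing the `.source`/`.destination`/`.relation` attributes that `build_networkx_graph` reads; it is not the `processors` data structure):

from collections import namedtuple

# stand-in for a processors dependency edge; only the attribute names matter here
Edge = namedtuple("Edge", ["source", "destination", "relation"])

# toy dependency structure: 0 <-nsubj- 1 -dobj-> 2
edges = [Edge(1, 0, "nsubj"), Edge(1, 2, "dobj")]
g = DependencyUtils.build_networkx_graph(roots=[1], edges=edges, name="toy")

# pathfinding is direction-sensitive, so search the undirected view
print(DependencyUtils.shortest_paths(g.to_undirected(), 0, 2))   # [[(0, 1), (1, 2)]]

# node activity over the directed graph, returned as a collections.Counter
print(DependencyUtils.pagerank(g).most_common(1))

Per the comment in `build_networkx_graph`, passing `reverse=True` is the variant suggested when running `pagerank` to highlight predicate and argument nodes.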