DependencyUtils.simplify_tag()   F
last analyzed

Complexity
    Conditions: 9

Size
    Total Lines: 40

Duplication
    Lines: 0
    Ratio: 0 %

Importance
    Changes: 0

Metric  Value
cc      9
dl      0
loc     40
rs      3
c       0
b       0
f       0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from processors.utils import LabelManager
from collections import Counter
import networkx as nx
import re

# `Iterable` moved to `collections.abc` in Python 3.3 and was removed
# from `collections` in Python 3.10; fall back for older interpreters
try:
    from collections.abc import Iterable
except ImportError:
    from collections import Iterable


class DependencyUtils(object):
    """
    A set of utilities for analyzing syntactic dependency graphs.

    Methods
    -------
    build_networkx_graph(roots, edges, name)
        Constructs a networkx.Graph.

    shortest_path(g, start, end)
        Finds the shortest path in a `networkx.Graph` between any element in a list of start nodes and any element in a list of end nodes.

    retrieve_edges(dep_graph, path)
        Converts the output of `shortest_path` into a list of triples that include the grammatical relation (and direction) for each node-node "hop" in the syntactic dependency graph.

    simplify_tag(tag)
        Maps a part of speech (PoS) tag to a smaller set of PoS tags to better consolidate categorical labels.

    lexicalize_path(sentence, path, words=False, lemmas=False, tags=False, simple_tags=False, entities=False, limit_to=None)
        Lexicalizes a path in a syntactic dependency graph using Odin-style token constraints.

    pagerank(networkx_graph, alpha=0.85, personalization=None, max_iter=1000, tol=1e-06, nstart=None, weight='weight', dangling=None)
        Measures node activity in a `networkx.Graph` using a thin wrapper around the `networkx` implementation of the PageRank algorithm (see `networkx.algorithms.link_analysis.pagerank`).  Use with `processors.ds.DirectedGraph.graph`.
    """

    UNKNOWN = LabelManager.UNKNOWN

    @staticmethod
    def build_networkx_graph(roots, edges, name, reverse=False):
        """
        Converts a `processors` dependency graph into a networkx graph.
        """
        G = nx.DiGraph()
        graph_name = name
        # store roots
        G.graph["roots"] = roots
        # reversing the graph is useful if you want to run pagerank to highlight predicate and argument nodes
        if reverse:
            edges = [(edge.destination, edge.source, {"relation": edge.relation}) for edge in edges]
        else:
            edges = [(edge.source, edge.destination, {"relation": edge.relation}) for edge in edges]
        G.add_edges_from(edges)
        return G

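    # Usage sketch for `build_networkx_graph` (illustrative only; the `Edge`
    # namedtuple below is a hypothetical stand-in for the `processors` edge
    # objects, which expose .source, .destination, and .relation):
    #
    #   from collections import namedtuple
    #   Edge = namedtuple("Edge", ["source", "destination", "relation"])
    #   edges = [Edge(1, 0, "nsubj"), Edge(1, 2, "dobj")]
    #   g = DependencyUtils.build_networkx_graph(roots=[1], edges=edges, name="example")
    #   # list(g.edges(data=True)) -> [(1, 0, {'relation': 'nsubj'}), (1, 2, {'relation': 'dobj'})]
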
    @staticmethod
    def shortest_paths(g, start, end):
        """
        Find the shortest paths between two nodes.
        Note that pathfinding is sensitive to direction.  If `g` is a directed graph, a path is only found when its edges point from `start` toward `end`.

        Parameters
        ----------
        g : a networkx graph
            The networkx graph to explore.

        start : int or [int]
            A single token index or list of token indices serving as the start of the graph traversal.

        end : int or [int]
            A single token index or list of token indices serving as the end of the graph traversal.

        Returns
        -------
        None or [[(int, int)]]
            None if no paths are found.  Otherwise, a list of lists of (source index, target index) tuples representing path segments.
        """
        # convert a single int to [int]
        start = start if isinstance(start, Iterable) else [start]
        end = end if isinstance(end, Iterable) else [end]

        # node list -> edges (i.e., (source, dest) pairs)
        def path_to_edges(g, path):
            return [(path[i], path[i+1]) for i in range(len(path) - 1)]

        shortest_paths = []
        # pathfinding b/w pairs of nodes
        for s in start:
            for e in end:
                try:
                    paths = nx.algorithms.all_shortest_paths(g, s, e)
                    for path in paths:
                        shortest_paths.append(path_to_edges(g, path))
                # no path found (or node missing from the graph)
                except nx.NetworkXException:
                    continue
        return None if len(shortest_paths) == 0 else shortest_paths

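    # Minimal sketch of `shortest_paths` on a toy undirected graph
    # (only `networkx` is needed; node indices play the role of token indices):
    #
    #   g = nx.Graph([(0, 1), (1, 2), (2, 3)])
    #   DependencyUtils.shortest_paths(g, 0, 3)
    #   # -> [[(0, 1), (1, 2), (2, 3)]]
    #   # if start and end are disconnected, the result is None
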
    @staticmethod
    def shortest_path(g, start, end, scoring_func=lambda path: -len(path)):
        """
        Find the shortest path between two nodes.
        Note that pathfinding is sensitive to direction.  If you want to ignore direction, convert your networkx.DiGraph to a networkx.Graph.

        Parameters
        ----------
        g : a networkx graph
            The networkx graph to explore.

        start : int or [int]
            A single token index or list of token indices serving as the start of the graph traversal.

        end : int or [int]
            A single token index or list of token indices serving as the end of the graph traversal.

        scoring_func : function
            A function that scores each path in a list of paths.  Each path has the form [(source index, destination index)].
            The path with the maximum score will be returned.

        Returns
        -------
        None or [(int, int)]
            None if no paths are found.  Otherwise, a list of (source index, target index) tuples representing path segments.
        """
        paths = DependencyUtils.shortest_paths(g, start, end)
        return None if paths is None else max(paths, key=scoring_func)

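    # Sketch of `shortest_path` with a custom `scoring_func` (illustrative):
    # the default scorer prefers the fewest hops; the hypothetical scorer
    # below instead prefers paths that pass through token index 2.
    #
    #   prefer_2 = lambda path: sum(1 for (s, d) in path if 2 in (s, d))
    #   DependencyUtils.shortest_path(g, 0, 3, scoring_func=prefer_2)
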
    @staticmethod
    def directed_relation(source_idx, destination_idx, relation, deps):
        """
        Converts a relation to a directed relation (incoming vs. outgoing)
        if such a relation links `source_idx` and `destination_idx` in `deps`.

        Parameters
        ----------
        source_idx : int
            The token index for the source node.

        destination_idx : int
            The token index for the destination node.

        relation : str
            The undirected relation (i.e., the grammatical/semantic relation that connects the two nodes).

        deps : processors.ds.DirectedGraph
            The directed graph to be referenced.

        Returns
        -------
        str or None
            The directed relation that connects the `source_idx` to the `destination_idx` in `deps`.
        """
        matches = [">{}".format(rel) for d, rel in deps.outgoing[source_idx] if d == destination_idx and rel == relation] + \
                  ["<{}".format(rel) for d, rel in deps.incoming[source_idx] if d == destination_idx and rel == relation]
        return None if len(matches) == 0 else matches[0]

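    # Illustrative sketch of `directed_relation` (the `deps` argument is
    # normally a processors.ds.DirectedGraph; `FakeDeps` below is a
    # hypothetical stand-in exposing the same .outgoing/.incoming maps of
    # token index -> [(linked index, relation)]):
    #
    #   class FakeDeps(object):
    #       outgoing = {1: [(0, "nsubj"), (2, "dobj")]}
    #       incoming = {1: []}
    #   DependencyUtils.directed_relation(1, 0, "nsubj", FakeDeps())
    #   # -> ">nsubj"
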
    @staticmethod
    def retrieve_edges(dep_graph, path):
        """
        Converts the output of `DependencyUtils.shortest_path`
        into a list of triples that include the grammatical relation (and direction)
        for each node-node "hop" in the syntactic dependency graph.

        Parameters
        ----------
        dep_graph : processors.ds.DirectedGraph
            The `DirectedGraph` used to retrieve the grammatical relations for each edge in the `path`.

        path : [(int, int)]
            A list of tuples representing the shortest path from A to B in `dep_graph`.

        Returns
        -------
        [(int, str, int)]
            The shortest path (`path`) enhanced with the directed grammatical relations
            (ex. `>nsubj` for `predicate` to `subject` vs. `<nsubj` for `subject` to `predicate`).
        """
        shortest_path = []
        for (s, d) in path:
            # build dictionaries from incoming/outgoing
            outgoing = {dest_idx: ">{}".format(rel) for (dest_idx, rel) in dep_graph.outgoing[s]}
            incoming = {source_idx: "<{}".format(rel) for (source_idx, rel) in dep_graph.incoming[s]}
            relation = outgoing[d] if d in outgoing else incoming[d]
            shortest_path.append((s, relation, d))
        return shortest_path

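    # Sketch of `retrieve_edges` output (illustrative values): given the
    # path [(1, 0)] and a dependency graph in which token 1 governs token 0
    # via nsubj, each hop is paired with its directed relation:
    #
    #   DependencyUtils.retrieve_edges(dep_graph, [(1, 0)])
    #   # -> [(1, ">nsubj", 0)]
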
    @staticmethod
    def simplify_tag(tag):
        """
        Maps a part of speech (PoS) tag to a smaller set of PoS tags to better consolidate categorical labels.

        Parameters
        ----------
        tag : str
            The Penn-style PoS tag to be mapped to a simplified form.

        Returns
        -------
        str
            A simplified form of `tag`.  If no simplification applies, `tag` is returned as a quoted literal.
        """
        simple_tag = "\"{}\"".format(tag)
        # collapse proper nouns (NNP, NNPS)
        if tag.startswith("NNP"):
            simple_tag = "/^NNP/"
        # collapse common nouns (singular and plural)
        elif tag.startswith("NN"):
            simple_tag = "/^N/"
        # collapse verb forms
        elif tag.startswith("VB"):
            simple_tag = "/^V/"
        # collapse comparatives, superlatives, etc.
        elif tag.startswith("JJ"):
            simple_tag = "/^J/"
        # collapse comparatives, superlatives, etc.
        elif tag.startswith("RB"):
            simple_tag = "/^RB/"
        # collapse possessive/non-possessive pronouns
        elif tag.startswith("PRP"):
            simple_tag = "/^PRP/"
        # treat WH determiners the same as DT
        elif tag in ("WDT", "DT"):
            simple_tag = "/DT$/"
        return simple_tag

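    # Example mappings for `simplify_tag`, read directly off the branches above:
    #
    #   DependencyUtils.simplify_tag("NNS")   # -> '/^N/'
    #   DependencyUtils.simplify_tag("VBD")   # -> '/^V/'
    #   DependencyUtils.simplify_tag("JJR")   # -> '/^J/'
    #   DependencyUtils.simplify_tag("WDT")   # -> '/DT$/'
    #   DependencyUtils.simplify_tag("CC")    # -> '"CC"' (quoted literal; no simplification)
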
    @staticmethod
    def lexicalize_path(sentence,
                        path,
                        words=False,
                        lemmas=False,
                        tags=False,
                        simple_tags=False,
                        entities=False,
                        limit_to=None,
                        ):
        """
        Lexicalizes a path in a syntactic dependency graph using Odin-style token constraints.  Operates on the output of `DependencyUtils.retrieve_edges`.

        Parameters
        ----------
        sentence : processors.ds.Sentence
            The `Sentence` from which the `path` was found.  Used to lexicalize the `path`.

        path : list
            A list of (source index, relation, target index) triples.

        words : bool
            Whether or not to encode nodes in the `path` with a token constraint constructed from `Sentence.words`.

        lemmas : bool
            Whether or not to encode nodes in the `path` with a token constraint constructed from `Sentence.lemmas`.

        tags : bool
            Whether or not to encode nodes in the `path` with a token constraint constructed from `Sentence.tags`.

        simple_tags : bool
            Whether or not to encode nodes in the `path` with a token constraint constructed from `DependencyUtils.simplify_tag` applied to `Sentence.tags`.

        entities : bool
            Whether or not to encode nodes in the `path` with a token constraint constructed from `Sentence._entities`.

        limit_to : [int] or None
            Selectively apply lexicalization only to this list of token indices.  None means apply the specified lexicalization to all token indices in the path.

        Returns
        -------
        [str]
            The lexicalized form of `path`, encoded according to the specified parameters.
        """
        UNKNOWN = LabelManager.UNKNOWN
        lexicalized_path = []
        relations = []
        nodes = []
        # gather edges and nodes
        for edge in path:
            relations.append(edge[1])
            nodes.append(edge[0])
        nodes.append(path[-1][-1])

        for (i, node) in enumerate(nodes):
            if not limit_to or node in limit_to:
                # build token constraints
                token_constraints = []
                # words
                if words:
                    token_constraints.append("word=\"{}\"".format(sentence.words[node]))
                # PoS tags
                if tags and sentence.tags[node] != UNKNOWN:
                    token_constraints.append("tag=\"{}\"".format(sentence.tags[node]))
                # lemmas
                if lemmas and sentence.lemmas[node] != UNKNOWN:
                    token_constraints.append("lemma=\"{}\"".format(sentence.lemmas[node]))
                # NE labels
                if entities and sentence._entities[node] != UNKNOWN:
                    token_constraints.append("entity=\"{}\"".format(sentence._entities[node]))
                # simple tags
                if simple_tags and sentence.tags[node] != UNKNOWN:
                    token_constraints.append("tag={}".format(DependencyUtils.simplify_tag(sentence.tags[node])))
                # build node pattern
                if len(token_constraints) > 0:
                    node_pattern = "[{}]".format(" & ".join(token_constraints))
                    # store lexicalized representation of node
                    lexicalized_path.append(node_pattern)
            # append next edge
            if i < len(relations):
                lexicalized_path.append(relations[i])
        return lexicalized_path

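    # Sketch of `lexicalize_path` output (illustrative values): for the path
    # [(1, ">nsubj", 0)] over a sentence whose token 1 is "eats" and token 0
    # is "Kermit", requesting words=True yields Odin-style token constraints
    # interleaved with the directed relations:
    #
    #   DependencyUtils.lexicalize_path(sentence, [(1, ">nsubj", 0)], words=True)
    #   # -> ['[word="eats"]', '>nsubj', '[word="Kermit"]']
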
    @staticmethod
    def pagerank(networkx_graph,
                 alpha=0.85,
                 personalization=None,
                 max_iter=1000,
                 tol=1e-06,
                 nstart=None,
                 weight='weight',
                 dangling=None):
        """
        Measures node activity in a `networkx.Graph` using a thin wrapper around the `networkx` implementation of the PageRank algorithm (see `networkx.algorithms.link_analysis.pagerank`).  Use with `processors.ds.DirectedGraph.graph`.

        Parameters
        ----------
        networkx_graph : networkx.Graph
            Corresponds to the `G` parameter of `networkx.algorithms.link_analysis.pagerank`.

        See Also
        --------
        Method parameters correspond to those of [`networkx.algorithms.link_analysis.pagerank`](https://networkx.github.io/documentation/development/reference/generated/networkx.algorithms.link_analysis.pagerank_alg.pagerank.html#networkx.algorithms.link_analysis.pagerank_alg.pagerank)

        Returns
        -------
        collections.Counter
            A collections.Counter of node -> pagerank weights.
        """
        pg_res = nx.algorithms.link_analysis.pagerank(G=networkx_graph, alpha=alpha, personalization=personalization, max_iter=max_iter, tol=tol, nstart=nstart, weight=weight, dangling=dangling)
        return Counter(pg_res)

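    # Minimal sketch of `pagerank` on a toy digraph (only `networkx` needed);
    # `most_common` comes for free because the result is a collections.Counter:
    #
    #   g = nx.DiGraph([(0, 1), (2, 1)])
    #   ranks = DependencyUtils.pagerank(g)
    #   ranks.most_common(1)  # -> [(1, <highest pagerank score>)]

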
class HeadFinder(object):

    @staticmethod
    def semantic_head(sentence, graph_name="stanford-collapsed", valid_tags={r"^N", "VBG"}, valid_indices=None):
        """
        Finds the token with the highest pagerank score that meets the filtering criteria.

        Parameters
        ----------
        sentence : processors.ds.Sentence
            The Sentence to be analyzed.

        graph_name : str
            The name of the graph upon which to run the algorithm.  Default is "stanford-collapsed".

        valid_tags : set or None
            An optional set of str or regexes representing valid PoS tags.

        valid_indices : list or None
            An optional list of int representing the token indices that should be considered.

        Returns
        -------
        int or None
            The index of the highest-scoring token meeting the criteria.
        """
        from processors.ds import Sentence as Sent

        def is_valid_tag(tag):
            return not valid_tags or any(re.match(tag_pattern, tag) for tag_pattern in valid_tags)

        # ensure we're dealing with a Sentence
        if not isinstance(sentence, Sent): return None

        valid_indices = valid_indices if valid_indices else list(range(sentence.length))

        # corner case: if the sentence is a single token, pagerank doesn't apply.
        # check tag and index
        if sentence.length == 1:
            return 0 if is_valid_tag(sentence.tags[0]) and 0 in valid_indices else None

        dependencies = sentence.graphs.get(graph_name, None)

        if not dependencies: return None

        scored_toks = dependencies.pagerank().most_common()

        remaining = [i for (i, score) in scored_toks
                     if i in valid_indices and is_valid_tag(sentence.tags[i])]
        # take the token with the highest pagerank score
        return remaining[0] if len(remaining) > 0 else None
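
# Usage sketch for `HeadFinder.semantic_head` (illustrative; assumes an
# annotated `processors` Sentence obtained elsewhere, e.g. from an API
# client, which is not shown here):
#
#   # sentence = api.annotate("Bananas grow in bunches.").sentences[0]
#   # head_idx = HeadFinder.semantic_head(sentence, valid_tags={r"^N"})
#   # head_idx is the index of the noun with the highest pagerank score,
#   # or None if no token passes the tag/index filters.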