# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from processors.utils import LabelManager
from collections import Counter
import networkx as nx
try:
    # Python 3.3+
    from collections.abc import Iterable
except ImportError:
    # Python 2.7
    from collections import Iterable
import re


class DependencyUtils(object):
11 | """ |
||
12 | A set of utilities for analyzing syntactic dependency graphs. |
||
13 | |||
14 | Methods |
||
15 | ------- |
||
16 | build_networkx_graph(roots, edges, name) |
||
17 | Constructs a networkx.Graph |
||
18 | |||
19 | shortest_path(g, start, end) |
||
20 | Finds the shortest path in a `networkx.Graph` between any element in a list of start nodes and any element in a list of end nodes. |
||
21 | |||
22 | retrieve_edges(dep_graph, path) |
||
23 | Converts output of `shortest_path` into a list of triples that include the grammatical relation (and direction) for each node-node "hop" in the syntactic dependency graph. |
||
24 | |||
25 | simplify_tag(tag) |
||
26 | Maps part of speech (PoS) tag to a subset of PoS tags to better consolidate categorical labels. |
||
27 | |||
28 | lexicalize_path(sentence, path, words=False, lemmas=False, tags=False, simple_tags=False, entities=False, limit_to=None) |
||
29 | Lexicalizes path in syntactic dependency graph using Odin-style token constraints. |
||
30 | |||
31 | pagerank(networkx_graph, alpha=0.85, personalization=None, max_iter=1000, tol=1e-06, nstart=None, weight='weight', dangling=None) |
||
32 | Measures node activity in a `networkx.Graph` using a thin wrapper around `networkx` implementation of pagerank algorithm (see `networkx.algorithms.link_analysis.pagerank`). Use with `processors.ds.DirectedGraph.graph`. |
||
33 | """ |
||

    UNKNOWN = LabelManager.UNKNOWN

    @staticmethod
    def build_networkx_graph(roots, edges, name, reverse=False):
39 | """ |
||
40 | Converts a `processors` dependency graph into a networkx graph |
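
        Examples
        --------
        A minimal sketch (the `Edge` below is a hypothetical stand-in for a
        `processors` edge, which exposes `source`, `destination`, and
        `relation` attributes):

        >>> from collections import namedtuple
        >>> Edge = namedtuple("Edge", ["source", "destination", "relation"])
        >>> g = DependencyUtils.build_networkx_graph(
        ...     roots=[1],
        ...     edges=[Edge(1, 0, "nsubj"), Edge(1, 2, "dobj")],
        ...     name="toy")
        >>> sorted(g.edges())
        [(1, 0), (1, 2)]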
        """
        G = nx.DiGraph()
        # store the graph's name
        G.graph["name"] = name
        # store roots
        G.graph["roots"] = roots
        # reversing the graph is useful if you want to run pagerank to highlight predicate and argument nodes
        if reverse:
            edges = [(edge.destination, edge.source, {"relation": edge.relation}) for edge in edges]
        else:
            edges = [(edge.source, edge.destination, {"relation": edge.relation}) for edge in edges]
        G.add_edges_from(edges)
        return G

    @staticmethod
    def shortest_paths(g, start, end):
56 | """ |
||
57 | Find the shortest paths between two nodes. |
||
58 | Note that if `g` is a directed graph, a path will not be found. |
||
59 | |||
60 | Parameters |
||
61 | ---------- |
||
62 | g : a networkx graph |
||
63 | The networkx graph to explore. |
||
64 | |||
65 | start : int or [int] |
||
66 | A single token index or list of token indices serving as the start of the graph traversal. |
||
67 | |||
68 | end : int or [int] |
||
69 | A single token index or list of token indices serving as the end of the graph traversal. |
||
70 | |||
71 | Returns |
||
72 | ------- |
||
73 | None or [[(int, int)]] |
||
74 | None if no paths are found. Otherwise, a list of lists of (source index, target index) tuples representing path segments. |
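
        Examples
        --------
        A minimal sketch over a toy undirected graph:

        >>> g = nx.Graph([(0, 1), (1, 2), (2, 3)])
        >>> DependencyUtils.shortest_paths(g, 0, [2, 3])
        [[(0, 1), (1, 2)], [(0, 1), (1, 2), (2, 3)]]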
        """
        # converts single int to [int]
        start = start if isinstance(start, Iterable) else [start]
        end = end if isinstance(end, Iterable) else [end]
        # node list -> edges (i.e., (source, dest) pairs)
        def path_to_edges(g, path):
            return [(path[i], path[i+1]) for i in range(len(path) - 1)]

        shortest_paths = []
        # pathfinding b/w pairs of nodes
        for s in start:
            for e in end:
                try:
                    paths = nx.algorithms.all_shortest_paths(g, s, e)
                    for path in paths:
                        shortest_paths.append(path_to_edges(g, path))
                # no path found (or a node is missing from the graph)
                except (nx.NetworkXException, KeyError):
                    #print("No path found between '{}' and '{}'".format(s, e))
                    continue
        return None if len(shortest_paths) == 0 else shortest_paths

    @staticmethod
    def shortest_path(g, start, end, scoring_func=lambda path: -len(path)):
99 | """ |
||
100 | Find the shortest path between two nodes. |
||
101 | Note that pathfinding is sensitive to direction. If you want to ignore direction, convert your networkx.Digraph to a networkx.Graph. |
||
102 | |||
103 | Parameters |
||
104 | ---------- |
||
105 | g : a networkx graph |
||
106 | The networkx graph to explore. |
||
107 | |||
108 | start : int or [int] |
||
109 | A single token index or list of token indices serving as the start of the graph traversal. |
||
110 | |||
111 | end : int or [int] |
||
112 | A single token index or list of token indices serving as the end of the graph traversal. |
||
113 | |||
114 | scoring_func : function |
||
115 | A function that scores each path in a list of paths. Each path has the form [(source index, relation, destination index)]. |
||
116 | The path with the maximum score will be returned. |
||
117 | |||
118 | Returns |
||
119 | ------- |
||
120 | None or [(int, int)] |
||
121 | None if no paths are found. Otherwise, a list of (source index, target index) tuples representing path segments. |
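
        Examples
        --------
        A minimal sketch over a toy undirected graph (the default `scoring_func`
        prefers the shortest of the candidate paths):

        >>> g = nx.Graph([(0, 1), (1, 2), (2, 3)])
        >>> DependencyUtils.shortest_path(g, 0, [2, 3])
        [(0, 1), (1, 2)]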
        """
        paths = DependencyUtils.shortest_paths(g, start, end)
        return None if paths is None else max(paths, key=scoring_func)

    @staticmethod
    def directed_relation(source_idx, destination_idx, relation, deps):
128 | """ |
||
129 | Converts relation to a directed relation (incoming v. outgoing) |
||
130 | if such a relation links `source_idx` and `destination_idx` in `deps`. |
||
131 | |||
132 | Parameters |
||
133 | ---------- |
||
134 | source_idx : int |
||
135 | The token index for the source node |
||
136 | |||
137 | destination_idx : int |
||
138 | The token index for the destination node |
||
139 | |||
140 | relation : str |
||
141 | The undirected relation (i.e., the grammatical/semantic relation that connects the two nodes) |
||
142 | |||
143 | deps : processors.ds.DirectedGraph |
||
144 | The directed graph to be referenced |
||
145 | |||
146 | Returns |
||
147 | ------- |
||
148 | str or None |
||
149 | The directed relation that connects the `source_idx` to the `destination_idx` in `deps`. |
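
        Examples
        --------
        A minimal sketch with a stub standing in for `processors.ds.DirectedGraph`
        (only the `incoming` and `outgoing` maps of index -> [(index, relation)]
        are consulted):

        >>> class StubDeps(object):
        ...     outgoing = {0: [(1, "nsubj")]}
        ...     incoming = {0: []}
        >>> DependencyUtils.directed_relation(0, 1, "nsubj", StubDeps())
        '>nsubj'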
        """
        matches = [">{}".format(rel) for d, rel in deps.outgoing[source_idx] if d == destination_idx and rel == relation] + \
                  ["<{}".format(rel) for d, rel in deps.incoming[source_idx] if d == destination_idx and rel == relation]
        return None if len(matches) == 0 else matches[0]

    @staticmethod
    def retrieve_edges(dep_graph, path):
157 | """ |
||
158 | Converts output of `DependencyUtils.shortest_path` |
||
159 | into a list of triples that include the grammatical relation (and direction) |
||
160 | for each node-node "hop" in the syntactic dependency graph. |
||
161 | |||
162 | Parameters |
||
163 | ---------- |
||
164 | dep_graph : processors.ds.DirectedGraph |
||
165 | The `DirectedGraph` used to retrieve the grammatical relations for each edge in the `path`. |
||
166 | |||
167 | path : [(int, int)] |
||
168 | A list of tuples representing the shortest path from A to B in `dep_graph`. |
||
169 | |||
170 | Returns |
||
171 | ------- |
||
172 | [(int, str, int)] |
||
173 | the shortest path (`path`) enhanced with the directed grammatical relations |
||
174 | (ex. `>nsubj` for `predicate` to `subject` vs. `<nsubj` for `subject` to `predicate`). |
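
        Examples
        --------
        A minimal sketch with a stub standing in for `processors.ds.DirectedGraph`:

        >>> class StubDeps(object):
        ...     outgoing = {0: [], 1: [(0, "nsubj"), (2, "dobj")], 2: []}
        ...     incoming = {0: [(1, "nsubj")], 1: [], 2: [(1, "dobj")]}
        >>> DependencyUtils.retrieve_edges(StubDeps(), [(0, 1), (1, 2)])
        [(0, '<nsubj', 1), (1, '>dobj', 2)]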
        """

        shortest_path = []
        for (s, d) in path:
            # build dictionaries from incoming/outgoing
            outgoing = {dest_idx: ">{}".format(rel) for (dest_idx, rel) in dep_graph.outgoing[s]}
            incoming = {source_idx: "<{}".format(rel) for (source_idx, rel) in dep_graph.incoming[s]}
            relation = outgoing[d] if d in outgoing else incoming[d]
            shortest_path.append((s, relation, d))
        return shortest_path

    @staticmethod
    def simplify_tag(tag):
188 | """ |
||
189 | Maps part of speech (PoS) tag to a subset of PoS tags to better consolidate categorical labels. |
||
190 | |||
191 | Parameters |
||
192 | ---------- |
||
193 | tag : str |
||
194 | The Penn-style PoS tag to be mapped to a simplified form. |
||
195 | |||
196 | Returns |
||
197 | ------- |
||
198 | str |
||
199 | A simplified form of `tag`. In some cases, the returned form may be identical to `tag`. |
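
        Examples
        --------
        A few illustrative inputs:

        >>> DependencyUtils.simplify_tag("NNS")
        '/^N/'
        >>> DependencyUtils.simplify_tag("VBD")
        '/^V/'
        >>> DependencyUtils.simplify_tag("CC")
        '"CC"'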
        """
        simple_tag = "\"{}\"".format(tag)
        # collapse proper nouns
        if tag.startswith("NNP"):
            simple_tag = "/^NNP/"
        # collapse plurals
        elif tag.startswith("NN"):
            simple_tag = "/^N/"
        # collapse verb forms
        elif tag.startswith("VB"):
            simple_tag = "/^V/"
        # collapse comparatives, superlatives, etc.
        elif tag.startswith("JJ"):
            simple_tag = "/^J/"
        # collapse comparatives, superlatives, etc.
        elif tag.startswith("RB"):
            simple_tag = "/^RB/"
        # collapse possessive/non-possessive pronouns
        elif tag.startswith("PRP"):
            simple_tag = "/^PRP/"
        # treat WH determiners the same as determiners
        elif tag in ("WDT", "DT"):
            simple_tag = "/DT$/"
        return simple_tag

    @staticmethod
    def lexicalize_path(sentence,
                        path,
                        words=False,
                        lemmas=False,
                        tags=False,
                        simple_tags=False,
                        entities=False,
                        limit_to=None):
237 | """ |
||
238 | Lexicalizes path in syntactic dependency graph using Odin-style token constraints. Operates on output of `DependencyUtils.retrieve_edges` |
||
239 | |||
240 | Parameters |
||
241 | ---------- |
||
242 | sentence : processors.ds.Sentence |
||
243 | The `Sentence` from which the `path` was found. Used to lexicalize the `path`. |
||
244 | |||
245 | path : list |
||
246 | A list of (source index, relation, target index) triples. |
||
247 | |||
248 | words : bool |
||
249 | Whether or not to encode nodes in the `path` with a token constraint constructed from `Sentence.words` |
||
250 | |||
251 | lemmas : bool |
||
252 | Whether or not to encode nodes in the `path` with a token constraint constructed from `Sentence.lemmas` |
||
253 | |||
254 | tags : bool |
||
255 | Whether or not to encode nodes in the `path` with a token constraint constructed from `Sentence.tags` |
||
256 | |||
257 | simple_tags : bool |
||
258 | Whether or not to encode nodes in the `path` with a token constraint constructed from `DependencyUtils.simplify_tag` applied to `Sentence.tags` |
||
259 | |||
260 | entities : bool |
||
261 | Whether or not to encode nodes in the `path` with a token constraint constructed from `Sentence._entities` |
||
262 | |||
263 | limit_to : [int] or None |
||
264 | Selectively apply lexicalization only to the this list of token indices. None means apply the specified lexicalization to all token indices in the path. |
||
265 | Returns |
||
266 | ------- |
||
267 | [str] |
||
268 | The lexicalized form of `path`, encoded according to the specified parameters. |
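
        Examples
        --------
        A minimal sketch with a stub standing in for `processors.ds.Sentence`
        (only `words` is consulted here):

        >>> class StubSentence(object):
        ...     words = ["Gonzo", "married", "Camilla"]
        >>> path = [(0, '<nsubj', 1), (1, '>dobj', 2)]
        >>> DependencyUtils.lexicalize_path(StubSentence(), path, words=True)
        ['[word="Gonzo"]', '<nsubj', '[word="married"]', '>dobj', '[word="Camilla"]']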
        """
        UNKNOWN = LabelManager.UNKNOWN
        lexicalized_path = []
        relations = []
        nodes = []
        # gather edges and nodes
        for edge in path:
            relations.append(edge[1])
            nodes.append(edge[0])
        nodes.append(path[-1][-1])

        for (i, node) in enumerate(nodes):
            if not limit_to or node in limit_to:
                # build token constraints
                token_constraints = []
                # words
                if words:
                    token_constraints.append("word=\"{}\"".format(sentence.words[node]))
                # PoS tags
                if tags and sentence.tags[node] != UNKNOWN:
                    token_constraints.append("tag=\"{}\"".format(sentence.tags[node]))
                # lemmas
                if lemmas and sentence.lemmas[node] != UNKNOWN:
                    token_constraints.append("lemma=\"{}\"".format(sentence.lemmas[node]))
                # NE labels
                if entities and sentence._entities[node] != UNKNOWN:
                    token_constraints.append("entity=\"{}\"".format(sentence._entities[node]))
                # simple tags
                if simple_tags and sentence.tags[node] != UNKNOWN:
                    token_constraints.append("tag={}".format(DependencyUtils.simplify_tag(sentence.tags[node])))
                # build node pattern
                if len(token_constraints) > 0:
                    node_pattern = "[{}]".format(" & ".join(token_constraints))
                    # store lexicalized representation of node
                    lexicalized_path.append(node_pattern)
            # append next edge
            if i < len(relations):
                lexicalized_path.append(relations[i])
        return lexicalized_path

    @staticmethod
    def pagerank(networkx_graph,
                 alpha=0.85,
                 personalization=None,
                 max_iter=1000,
                 tol=1e-06,
                 nstart=None,
                 weight='weight',
                 dangling=None):
318 | """ |
||
319 | Measures node activity in a `networkx.Graph` using a thin wrapper around `networkx` implementation of pagerank algorithm (see `networkx.algorithms.link_analysis.pagerank`). Use with `processors.ds.DirectedGraph.graph`. |
||
320 | |||
321 | Parameters |
||
322 | ---------- |
||
323 | networkx_graph : networkx.Graph |
||
324 | Corresponds to `G` parameter of `networkx.algorithms.link_analysis.pagerank`. |
||
325 | |||
326 | See Also |
||
327 | -------- |
||
328 | Method parameters correspond to those of [`networkx.algorithms.link_analysis.pagerank`](https://networkx.github.io/documentation/development/reference/generated/networkx.algorithms.link_analysis.pagerank_alg.pagerank.html#networkx.algorithms.link_analysis.pagerank_alg.pagerank) |
||
329 | |||
330 | Returns |
||
331 | ------- |
||
332 | collections.Counter |
||
333 | A collections.Counter of node -> pagerank weights |
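
        Examples
        --------
        A minimal sketch on a toy directed graph (node 1 receives all edges,
        so it gets the highest score):

        >>> g = nx.DiGraph([(0, 1), (2, 1)])
        >>> ranks = DependencyUtils.pagerank(g)
        >>> ranks.most_common(1)[0][0]
        1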
        """
        pg_res = nx.algorithms.link_analysis.pagerank(G=networkx_graph, alpha=alpha, personalization=personalization, max_iter=max_iter, tol=tol, nstart=nstart, weight=weight, dangling=dangling)
        return Counter(pg_res)


class HeadFinder(object):

    @staticmethod
    def semantic_head(sentence, graph_name="stanford-collapsed", valid_tags={r"^N", "VBG"}, valid_indices=None):
346 | """ |
||
347 | Finds the token with the highest pagerank score that meets the filtering criteria. |
||
348 | |||
349 | Parameters |
||
350 | ---------- |
||
351 | sentence : processors.ds.Sentence |
||
352 | The Sentence to be analyzed. |
||
353 | |||
354 | graph_name : str |
||
355 | The name of the graph upon which to run the algorithm. Default is "stanford-collapsed". |
||
356 | |||
357 | valid_tags : set or None |
||
358 | An optional set of str or regexes representing valid tokens. |
||
359 | |||
360 | valid_indices : list or None |
||
361 | A optional list of int representing the indices that should be considered. |
||
362 | |||
363 | Returns |
||
364 | ------- |
||
365 | int or None |
||
366 | The index of the highest scoring token meeting the criteria. |
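
        Examples
        --------
        A usage sketch (assumes `sentence` is a `processors.ds.Sentence`
        produced by a running `processors` server; the names here are
        illustrative, not executable in isolation):

        >>> # head_idx = HeadFinder.semantic_head(sentence)
        >>> # if head_idx is not None: sentence.words[head_idx]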
        """

        from processors.ds import Sentence as Sent

        def is_valid_tag(tag):
            return True if not valid_tags else any(re.match(tag_pattern, tag) for tag_pattern in valid_tags)

        # ensure we're dealing with a Sentence
        if not isinstance(sentence, Sent): return None

        valid_indices = valid_indices if valid_indices else list(range(sentence.length))

        # corner case: if the sentence is a single token, pagerank doesn't apply.
        # check tag and index
        if sentence.length == 1:
            return 0 if is_valid_tag(sentence.tags[0]) and 0 in valid_indices else None

        dependencies = sentence.graphs.get(graph_name, None)

        if not dependencies: return None

        scored_toks = dependencies.pagerank().most_common()

        remaining = [i for (i, score) in scored_toks
                     if i in valid_indices and is_valid_tag(sentence.tags[i])]
        # take token with the highest pagerank score
        return remaining[0] if len(remaining) > 0 else None