Total Complexity | 102
Total Lines      | 742
Duplicated Lines | 3.37 %
Changes          | 0
Duplicate code is one of the most pungent code smells. A commonly used rule of thumb is to restructure code once it is duplicated in three or more places.
Common duplication problems, and their corresponding solutions, generally involve extracting the repeated code into a shared method or class.
Complex classes like processors.ds often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields and methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring (a small sketch follows below). If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
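As a rough illustration of Extract Class (a hedged sketch only; the class and field names below are hypothetical and do not come from processors.ds), parallel fields that share an obvious grouping can be pulled into their own component that the original class then delegates to:

    # Hypothetical "before": one class carries several parallel token-level fields
    class AnnotatedSentence(object):
        def __init__(self, words, tags, lemmas):
            self.words = words
            self.tags = tags
            self.lemmas = lemmas

    # Hypothetical "after" applying Extract Class: the token-level fields move
    # into a cohesive component, and the original class delegates to it
    class TokenAttributes(object):
        def __init__(self, words, tags, lemmas):
            self.words = words
            self.tags = tags
            self.lemmas = lemmas

    class AnnotatedSentence(object):
        def __init__(self, token_attributes):
            self.tokens = token_attributes

        @property
        def words(self):
            return self.tokens.words

The annotated source listing of processors/ds.py follows.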
# -*- coding: utf-8 -*-

# Gus Hahn-Powell 2015
# data structures for storing processors-server output
# based on conventions from the CLU lab's processors library (https://github.com/clulab/processors)
from __future__ import unicode_literals
from itertools import chain
from collections import defaultdict, Counter
from processors.paths import DependencyUtils, HeadFinder
from processors.utils import LabelManager
import networkx as nx
import hashlib
import json
import re


class NLPDatum(object):

    def to_JSON_dict(self):
        return dict()

    def to_JSON(self, pretty=False):
        """
        Returns JSON as String.
        """
        num_spaces = 4 if pretty else None
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=num_spaces)


class Document(NLPDatum):

    """
    Storage class for annotated text. Based on [`org.clulab.processors.Document`](https://github.com/clulab/processors/blob/master/main/src/main/scala/org/clulab/processors/Document.scala)

    Parameters
    ----------
    sentences : [processors.ds.Sentence]
        The sentences comprising the `Document`.

    Attributes
    ----------
    id : str or None
        A unique ID for the `Document`.

    size : int
        The number of `sentences`.

    sentences : [processors.ds.Sentence]
        The sentences comprising the `Document`.

    words : [str]
        A list of the `Document`'s tokens.

    tags : [str]
        A list of the `Document`'s tokens represented using part of speech (PoS) tags.

    lemmas : [str]
        A list of the `Document`'s tokens represented using lemmas.

    _entities : [str]
        A list of the `Document`'s tokens represented using IOB-style named entity (NE) labels.

    nes : dict
        A dictionary mapping each NE label found in the `Document` to a list of the corresponding text spans.

    bag_of_labeled_deps : [str]
        The labeled dependencies from all sentences in the `Document`.

    bag_of_unlabeled_deps : [str]
        The unlabeled dependencies from all sentences in the `Document`.

    text : str or None
        The original text of the `Document`.

    Methods
    -------
    bag_of_labeled_dependencies_using(form)
        Produces a list of syntactic dependencies where each edge is labeled with its grammatical relation.

    bag_of_unlabeled_dependencies_using(form)
        Produces a list of syntactic dependencies where each edge is unlabeled (its grammatical relation is omitted).
    """

    def __init__(self, sentences):
        NLPDatum.__init__(self)
        self.id = None
        self.size = len(sentences)
        self.sentences = sentences
        # easily access token attributes from all sentences
        self.words = list(chain(*[s.words for s in self.sentences]))
        self.tags = list(chain(*[s.tags for s in self.sentences]))
        self.lemmas = list(chain(*[s.lemmas for s in self.sentences]))
        self._entities = list(chain(*[s._entities for s in self.sentences]))
        self.nes = self._merge_ne_dicts()
        self.bag_of_labeled_deps = list(chain(*[s.dependencies.labeled for s in self.sentences]))
        self.bag_of_unlabeled_deps = list(chain(*[s.dependencies.unlabeled for s in self.sentences]))
        self.text = None

    def __hash__(self):
        return hash(self.to_JSON())

    def __unicode__(self):
        return self.text

    def __str__(self):
        return "Document w/ {} Sentence{}".format(self.size, "" if self.size == 1 else "s")

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.to_JSON() == other.to_JSON()
        else:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def bag_of_labeled_dependencies_using(self, form):
        return list(chain(*[s.labeled_dependencies_from_tokens(s._get_tokens(form)) for s in self.sentences]))

    def bag_of_unlabeled_dependencies_using(self, form):
        return list(chain(*[s.unlabeled_dependencies_from_tokens(s._get_tokens(form)) for s in self.sentences]))

    def _merge_ne_dicts(self):
        # Get the set of all NE labels found in the Doc's sentences
        entity_labels = set(chain(*[s.nes.keys() for s in self.sentences]))
        # Do we have any labels?
        if not entity_labels:
            return None
        # If we have labels, consolidate the NEs under the appropriate label
        else:
            nes_dict = dict()
            for e in entity_labels:
                entities = []
                for s in self.sentences:
                    entities += s.nes[e]
                nes_dict[e] = entities
            return nes_dict

    def to_JSON_dict(self):
        doc_dict = dict()
        doc_dict["sentences"] = [s.to_JSON_dict() for s in self.sentences]
        doc_dict["text"] = self.text
        # can the ID be set?
        if self.id is not None:
            doc_dict["id"] = self.id
        return doc_dict

    @staticmethod
    def load_from_JSON(json_dict):
        sentences = []
        for s in json_dict["sentences"]:
            kwargs = {
                "words": s["words"],
                "startOffsets": s["startOffsets"],
                "endOffsets": s["endOffsets"],
                "tags": s.get("tags", None),
                "lemmas": s.get("lemmas", None),
                "chunks": s.get("chunks", None),
                "entities": s.get("entities", None),
                "graphs": s.get("graphs", None)
            }
            sent = Sentence(**kwargs)
            sentences.append(sent)
        doc = Document(sentences)
        # set id and text
        doc.text = json_dict.get("text", None)
        doc.id = json_dict.get("id", None)
        return doc


class Sentence(NLPDatum):

    """
    Storage class for an annotated sentence. Based on [`org.clulab.processors.Sentence`](https://github.com/clulab/processors/blob/master/main/src/main/scala/org/clulab/processors/Sentence.scala)

    Parameters
    ----------
    text : str or None
        The text of the `Sentence`.

    words : [str]
        A list of the `Sentence`'s tokens.

    startOffsets : [int]
        The character offsets starting each token (inclusive).

    endOffsets : [int]
        The character offsets marking the end of each token (exclusive).

    tags : [str]
        A list of the `Sentence`'s tokens represented using part of speech (PoS) tags.

    lemmas : [str]
        A list of the `Sentence`'s tokens represented using lemmas.

    chunks : [str]
        A list of the `Sentence`'s tokens represented using IOB-style phrase labels (ex. `B-NP`, `I-NP`, `B-VP`, etc.).

    entities : [str]
        A list of the `Sentence`'s tokens represented using IOB-style named entity (NE) labels.

    graphs : dict
        A dictionary of {graph-name -> {edges: [{source, destination, relation}], roots: [int]}}

    Attributes
    ----------
    text : str
        The text of the `Sentence`.

    startOffsets : [int]
        The character offsets starting each token (inclusive).

    endOffsets : [int]
        The character offsets marking the end of each token (exclusive).

    length : int
        The number of tokens in the `Sentence`.

    graphs : dict
        A dictionary (str -> `processors.ds.DirectedGraph`) mapping the graph type/name to a `processors.ds.DirectedGraph`.

    basic_dependencies : processors.ds.DirectedGraph
        A `processors.ds.DirectedGraph` using basic Stanford dependencies.

    collapsed_dependencies : processors.ds.DirectedGraph
        A `processors.ds.DirectedGraph` using collapsed Stanford dependencies.

    dependencies : processors.ds.DirectedGraph
        A pointer to the preferred syntactic dependency graph type for this `Sentence`.

    _entities : [str]
        The IOB-style named entity (NE) labels corresponding to each token.

    _chunks : [str]
        The IOB-style chunk labels corresponding to each token.

    nes : dict
        A dictionary mapping each NE label found in the `Sentence` to a list of the corresponding text spans (ex. {"PERSON": [phrase 1, ..., phrase n]}). Built from `Sentence._entities`.

    phrases : dict
        A dictionary mapping each chunk label found in the `Sentence` to a list of the corresponding text spans (ex. {"NP": [phrase 1, ..., phrase n]}). Built from `Sentence._chunks`.


    Methods
    -------
    bag_of_labeled_dependencies_using(form)
        Produces a list of syntactic dependencies where each edge is labeled with its grammatical relation.

    bag_of_unlabeled_dependencies_using(form)
        Produces a list of syntactic dependencies where each edge is unlabeled (its grammatical relation is omitted).
    """

    UNKNOWN = LabelManager.UNKNOWN
    # the O in IOB notation
    O = LabelManager.O

    def __init__(self, **kwargs):
        NLPDatum.__init__(self)
        self.words = kwargs["words"]
        self.startOffsets = kwargs["startOffsets"]
        self.endOffsets = kwargs["endOffsets"]
        self.length = len(self.words)
        self.tags = self._set_toks(kwargs.get("tags", None))
        self.lemmas = self._set_toks(kwargs.get("lemmas", None))
        self._chunks = self._set_toks(kwargs.get("chunks", None))
        self._entities = self._set_toks(kwargs.get("entities", None))
        self.text = kwargs.get("text", None) or " ".join(self.words)
        self.graphs = self._build_directed_graph_from_dict(kwargs.get("graphs", None))
        self.basic_dependencies = self.graphs.get(DirectedGraph.STANFORD_BASIC_DEPENDENCIES, None)
        self.collapsed_dependencies = self.graphs.get(DirectedGraph.STANFORD_COLLAPSED_DEPENDENCIES, None)
        self.dependencies = self.collapsed_dependencies if self.collapsed_dependencies is not None else self.basic_dependencies
        # IOB tokens -> {label: [phrase 1, ..., phrase n]}
        self.nes = self._handle_iob(self._entities)
        self.phrases = self._handle_iob(self._chunks)

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.to_JSON() == other.to_JSON()
        else:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        return hash(self.to_JSON(pretty=False))

    def deduplication_hash(self):
        """
        Generates a deduplication hash for the sentence.
        """
        return hashlib.sha256(self.to_JSON(pretty=False).encode()).hexdigest()

    def _get_tokens(self, form):
        f = form.lower()
        if f == "words":
            tokens = self.words
        elif f == "tags":
            tokens = self.tags
        elif f == "lemmas":
            tokens = self.lemmas
        elif f == "entities":
            # per-token IOB entity labels (parallel to words/tags/lemmas)
            tokens = self._entities
        elif f == "index":
            tokens = list(range(self.length))
        # unrecognized form
        else:
            raise Exception("""form must be 'words', 'tags', 'lemmas', 'entities', or 'index'""")
        return tokens

    def _set_toks(self, toks):
        return toks if toks else [Sentence.UNKNOWN]*self.length

    def _handle_iob(self, iob):
        """
        Consolidates consecutive tokens in IOB notation under the appropriate label.
        The regexes account for the bionlp annotator, which uses IOB notation.
        """
        entity_dict = defaultdict(list)
        # initialize to empty label
        current = Sentence.O
        start = None
        end = None
        for i, tok in enumerate(iob):
            # the token carries no NE label (O)
            if tok == Sentence.O:
                # did we have an entity with the last token?
                current = re.sub('(B-|I-)', '', str(current))
                if current == Sentence.O:
                    continue
                else:
                    # the last sequence has ended
                    end = i
                    # store the entity
                    named_entity = ' '.join(self.words[start:end])
                    entity_dict[current].append(named_entity)
                    # reset our book-keeping vars
                    current = Sentence.O
                    start = None
                    end = None
            # we have a tag!
            else:
                # our old sequence continues
                current = re.sub('(B-|I-)', '', str(current))
                tok = re.sub('(B-|I-)', '', str(tok))
                if tok == current:
                    end = i
                # our old sequence has ended
                else:
                    # do we have a previous NE?
                    if current != Sentence.O:
                        end = i
                        named_entity = ' '.join(self.words[start:end])
                        entity_dict[current].append(named_entity)
                    # update our book-keeping vars
                    current = tok
                    start = i
                    end = None
        # this might be empty
        return entity_dict

    def _build_directed_graph_from_dict(self, graphs):
        deps_dict = dict()
        if graphs and len(graphs) > 0:
            # process each stored graph
            for (kind, deps) in graphs.items():
                deps_dict[kind] = DirectedGraph(kind, deps, self.words)
        # return the (possibly empty) dict so callers can safely call .get() on it
        return deps_dict

    def __unicode__(self):
        return self.text

    def to_string(self):
        return ' '.join("{w}__{p}".format(w=self.words[i], p=self.tags[i]) for i in range(self.length))

    def bag_of_labeled_dependencies_using(self, form):
        """
        Produces a list of syntactic dependencies
        where each edge is labeled with its grammatical relation.
        """
        tokens = self._get_tokens(form)
        return self.labeled_dependencies_from_tokens(tokens) if tokens else None

    def bag_of_unlabeled_dependencies_using(self, form):
        """
        Produces a list of syntactic dependencies
        where each edge is unlabeled (its grammatical relation is omitted).
        """
        tokens = self._get_tokens(form)
        return self.unlabeled_dependencies_from_tokens(tokens) if tokens else None

    def labeled_dependencies_from_tokens(self, tokens):
        """
        Generates a list of labeled dependencies for a sentence
        using the provided tokens.
        """
        deps = self.dependencies
        return [(tokens[out], rel, tokens[dest]) \
                for out in deps.outgoing \
                for (dest, rel) in deps.outgoing[out]]

    def unlabeled_dependencies_from_tokens(self, tokens):
        """
        Generates a list of unlabeled dependencies for a sentence
        using the provided tokens.
        """
        return [(head, dep) for (head, rel, dep) in self.labeled_dependencies_from_tokens(tokens)]

    def semantic_head(self, graph_name="stanford-collapsed", valid_tags={r"^N", "VBG"}, valid_indices=None):
        return HeadFinder.semantic_head(self, graph_name, valid_tags, valid_indices)

    def to_JSON_dict(self):
        sentence_dict = dict()
        sentence_dict["words"] = self.words
        sentence_dict["startOffsets"] = self.startOffsets
        sentence_dict["endOffsets"] = self.endOffsets
        sentence_dict["tags"] = self.tags
        sentence_dict["lemmas"] = self.lemmas
        sentence_dict["entities"] = self._entities
        sentence_dict["chunks"] = self._chunks
        # add graphs
        sentence_dict["graphs"] = dict()
        for (kind, graph) in self.graphs.items():
            sentence_dict["graphs"][kind] = graph._graph_to_JSON_dict()
        return sentence_dict

    @staticmethod
    def load_from_JSON(json_dict):
        sent = Sentence(
            words=json_dict["words"],
            startOffsets=json_dict["startOffsets"],
            endOffsets=json_dict["endOffsets"],
            lemmas=json_dict.get("lemmas", None),
            tags=json_dict.get("tags", None),
            entities=json_dict.get("entities", None),
            text=json_dict.get("text", None),
            graphs=json_dict.get("graphs", None),
            chunks=json_dict.get("chunks", None)
        )
        return sent


class Edge(NLPDatum):

    def __init__(self, source, destination, relation):
        NLPDatum.__init__(self)
        self.source = source
        self.destination = destination
        self.relation = relation

    def __unicode__(self):
        return self.to_string()

    def to_string(self):
        return "Edge(source: {}, destination: {}, relation: {})".format(self.source, self.destination, self.relation)

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.to_JSON() == other.to_JSON()
        else:
            return False

    def to_JSON_dict(self):
        edge_dict = dict()
        edge_dict["source"] = self.source
        edge_dict["destination"] = self.destination
        edge_dict["relation"] = self.relation
        return edge_dict

class DirectedGraph(NLPDatum):

    """
    Storage class for directed graphs.


    Parameters
    ----------
    kind : str
        The name of the directed graph.

    deps : dict
        A dictionary of {edges: [{source, destination, relation}], roots: [int]}

    words : [str]
        A list of the word form of the tokens from the originating `Sentence`.

    Attributes
    ----------
    _words : [str]
        A list of the word form of the tokens from the originating `Sentence`.

    roots : [int]
        A list of indices for the syntactic dependency graph's roots. Generally this is a single token index.

    edges : [processors.ds.Edge]
        A list of `processors.ds.Edge`

    incoming : dict
        A dictionary of {int -> [(int, str)]} mapping each node to the (source, relation) pairs of its incoming edges.

    outgoing : dict
        A dictionary of {int -> [(int, str)]} mapping each node to the (destination, relation) pairs of its outgoing edges.

    labeled : [str]
        A list of strings where each element represents an edge encoded as source word, relation, and destination word ("source_RELATION_destination").

    unlabeled : [str]
        A list of strings where each element represents an edge encoded as source word and destination word ("source_destination").

    directed_graph : networkx.Graph
        A `networkx` representation of the `DirectedGraph`. Used by `shortest_path` (via its undirected counterpart, `undirected_graph`).

    Methods
    -------
    bag_of_labeled_dependencies_from_tokens(form)
        Produces a list of syntactic dependencies where each edge is labeled with its grammatical relation.
    bag_of_unlabeled_dependencies_from_tokens(form)
        Produces a list of syntactic dependencies where each edge is unlabeled (its grammatical relation is omitted).
    """
    STANFORD_BASIC_DEPENDENCIES = "stanford-basic"
    STANFORD_COLLAPSED_DEPENDENCIES = "stanford-collapsed"

    def __init__(self, kind, deps, words):
        NLPDatum.__init__(self)
        self._words = [w.lower() for w in words]
        self.kind = kind
        self.roots = deps.get("roots", [])
        self.edges = [Edge(e["source"], e["destination"], e["relation"]) for e in deps["edges"]]
        self.incoming = self._build_incoming(self.edges)
        self.outgoing = self._build_outgoing(self.edges)
        self.labeled = self._build_labeled()
        self.unlabeled = self._build_unlabeled()
        self.directed_graph = DependencyUtils.build_networkx_graph(roots=self.roots, edges=self.edges, name=self.kind, reverse=False)
        self.undirected_graph = self.directed_graph.to_undirected()

    def __unicode__(self):
        return self.edges

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.to_JSON() == other.to_JSON()
        else:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        return hash(self.to_JSON())

    def shortest_paths(self, start, end):
        """
        Find the shortest paths in the syntactic dependency graph
        between the provided start and end nodes.

        Parameters
        ----------
        start : int or [int]
            A single token index or list of token indices serving as the start of the graph traversal.

        end : int or [int]
            A single token index or list of token indices serving as the end of the graph traversal.

        See Also
        --------
        `processors.paths.DependencyUtils.shortest_path`
        """
        paths = DependencyUtils.shortest_paths(self.undirected_graph, start, end)
        return None if not paths else [DependencyUtils.retrieve_edges(self, path) for path in paths]

    def shortest_path(self, start, end, scoring_func=lambda path: -len(path)):
        """
        Find the shortest path in the syntactic dependency graph
        between the provided start and end nodes.

        Parameters
        ----------
        start : int or [int]
            A single token index or list of token indices serving as the start of the graph traversal.

        end : int or [int]
            A single token index or list of token indices serving as the end of the graph traversal.

        scoring_func : function
            A function that scores each path, where a path is a list of (source index, relation, destination index) triples.
            The path with the maximum score will be returned.

        See Also
        --------
        `processors.paths.DependencyUtils.shortest_path`
        """
        paths = self.shortest_paths(start, end)
        return None if not paths else max(paths, key=scoring_func)

    def degree_centrality(self):
        """
        Compute the degree centrality for nodes.

        See Also
        --------
        https://networkx.github.io/documentation/development/reference/algorithms.centrality.html
        """
        return Counter(nx.degree_centrality(self.directed_graph))

    def in_degree_centrality(self):
        """
        Compute the in-degree centrality for nodes.

        See Also
        --------
        https://networkx.github.io/documentation/development/reference/algorithms.centrality.html
        """
        return Counter(nx.in_degree_centrality(self.directed_graph))

    def out_degree_centrality(self):
        """
        Compute the out-degree centrality for nodes.

        See Also
        --------
        https://networkx.github.io/documentation/development/reference/algorithms.centrality.html
        """
        return Counter(nx.out_degree_centrality(self.directed_graph))

    def pagerank(self,
                 alpha=0.85,
                 personalization=None,
                 max_iter=1000,
                 tol=1e-06,
                 nstart=None,
                 weight='weight',
                 dangling=None,
                 use_directed=True,
                 reverse=True):
        """
        Measures node activity in a `networkx.Graph` using a thin wrapper around the `networkx` implementation of the PageRank algorithm (see `networkx.algorithms.link_analysis.pagerank`). Use with `processors.ds.DirectedGraph.directed_graph`.
        Note that by default, the directed graph is reversed in order to highlight predicate-argument nodes (refer to the PageRank algorithm to understand why).

        See Also
        --------
        `processors.paths.DependencyUtils.pagerank`
        Method parameters correspond to those of [`networkx.algorithms.link_analysis.pagerank`](https://networkx.github.io/documentation/development/reference/generated/networkx.algorithms.link_analysis.pagerank_alg.pagerank.html#networkx.algorithms.link_analysis.pagerank_alg.pagerank)
        """
        # check whether or not to reverse the directed graph
        dg = self.directed_graph if not reverse else DependencyUtils.build_networkx_graph(roots=self.roots, edges=self.edges, name=self.kind, reverse=True)
        # determine which graph to use
        graph = dg if use_directed else self.undirected_graph
        return DependencyUtils.pagerank(graph, alpha=alpha, personalization=personalization, max_iter=max_iter, tol=tol, nstart=nstart, weight=weight, dangling=dangling)

    def _build_incoming(self, edges):
        dep_dict = defaultdict(list)
        for edge in edges:
            dep_dict[edge.destination].append((edge.source, edge.relation))
        return dep_dict

    def _build_outgoing(self, edges):
        dep_dict = defaultdict(list)
        for edge in edges:
            dep_dict[edge.source].append((edge.destination, edge.relation))
        return dep_dict

    def _build_labeled(self):
        labeled = []
        for out in self.outgoing:
            for (dest, rel) in self.outgoing[out]:
                labeled.append("{}_{}_{}".format(self._words[out], rel.upper(), self._words[dest]))
        return labeled

    def _build_unlabeled(self):
        unlabeled = []
        for out in self.outgoing:
            for (dest, _) in self.outgoing[out]:
                unlabeled.append("{}_{}".format(self._words[out], self._words[dest]))
        return unlabeled

    def _graph_to_JSON_dict(self):
        dg_dict = dict()
        dg_dict["edges"] = [e.to_JSON_dict() for e in self.edges]
        dg_dict["roots"] = self.roots
        return dg_dict

    def to_JSON_dict(self):
        return {self.kind: self._graph_to_JSON_dict()}


class Interval(NLPDatum):
    """
    Defines a token or character span.

    Parameters
    ----------
    start : int
        The token or character index where the interval begins.

    end : int
        One plus the index of the last token/character in the span.

    Methods
    -------
    contains(that)
        Test whether this Interval contains `that` (another Interval).

    overlaps(that)
        Test whether `that` (an int or Interval) overlaps with the span of this Interval. Equivalent Intervals overlap.
    """

    def __init__(self, start, end):
        NLPDatum.__init__(self)
        assert (start < end), "Interval start must precede end."
        self.start = start
        self.end = end

    def to_JSON_dict(self):
        return {"start": self.start, "end": self.end}

    def size(self):
        return self.end - self.start

    def contains(self, that):
        """
        Checks whether this interval contains another (that).
        """
        if isinstance(that, self.__class__):
            return self.start <= that.start and self.end >= that.end
        else:
            return False

    def overlaps(self, that):
        """
        Checks for overlap.
        """
        if isinstance(that, int):
            return self.start <= that < self.end
        elif isinstance(that, self.__class__):
            return ((that.start <= self.start < that.end) or (self.start <= that.start < self.end))
        else:
            return False

    @staticmethod
    def load_from_JSON(json):
        return Interval(start=json["start"], end=json["end"])
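
# Illustrative usage of Interval (added sketch; not part of the original module):
#
#   a = Interval(0, 3)
#   b = Interval(2, 5)
#   a.contains(b)   # False: b extends past the end of a
#   a.overlaps(b)   # True: the spans share token 2
#   a.overlaps(2)   # True: index 2 falls within [0, 3)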