Completed — push to master (352884...922d0d) by Gus

DirectedGraph — Rating: A

Complexity
    Total Complexity: 24

Size/Duplication
    Total Lines: 132
    Duplicated Lines: 0%

Importance
    Changes: 1
    Bugs: 0
    Features: 0

Metric  Value
------  -----
dl      0
loc     132
rs      10
c       1
b       0
f       0
wmc     24

13 Methods

Rating  Name                   Duplication  Size  Complexity
------  ---------------------  -----------  ----  ----------
A       shortest_path()        0            11    2
A       __unicode__()          0            2     1
A       _build_incoming()      0            5     2
A       __ne__()               0            2     1
A       _graph_to_JSON_dict()  0            5     2
A       _build_outgoing()      0            5     2
A       _build_unlabeled()     0            6     3
A       pagerank()             0            17    1
A       to_JSON()              0            2     1
A       to_JSON_dict()         0            2     1
A       _build_labeled()       0            6     3
A       __eq__()               0            5     2
A       __init__()             0            10    3
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Gus Hahn-Powell 2015-2016
# data structures for storing processors-server output
# based on conventions from the CLU lab's processors library (https://github.com/clulab/processors)
from __future__ import unicode_literals
from itertools import chain
from collections import defaultdict
from processors.paths import DependencyUtils
from processors.utils import LabelManager
#from six import text_type
import json
import re


class Document(object):

    """
    Storage class for annotated text. Based on [`org.clulab.processors.Document`](https://github.com/clulab/processors/blob/master/main/src/main/scala/org/clulab/processors/Document.scala)

    Parameters
    ----------
    sentences : [processors.ds.Sentence]
        The sentences comprising the `Document`.

    Attributes
    ----------
    id : str or None
        A unique ID for the `Document`.
    size : int
        The number of `sentences`.
    sentences : [processors.ds.Sentence]
        The sentences comprising the `Document`.
    words : [str]
        A list of the `Document`'s tokens.
    tags : [str]
        A list of the `Document`'s tokens represented using part of speech (PoS) tags.
    lemmas : [str]
        A list of the `Document`'s tokens represented using lemmas.
    _entities : [str]
        A list of the `Document`'s tokens represented using IOB-style named entity (NE) labels.
    nes : dict
        A dictionary mapping each NE label found in the `Document` to a list of its corresponding text spans.
    bag_of_labeled_deps : [str]
        The labeled dependencies from all sentences in the `Document`.
    bag_of_unlabeled_deps : [str]
        The unlabeled dependencies from all sentences in the `Document`.
    text : str or None
        The original text of the `Document`.

    Methods
    -------
    bag_of_labeled_dependencies_using(form)
        Produces a list of syntactic dependencies where each edge is labeled with its grammatical relation.
    bag_of_unlabeled_dependencies_using(form)
        Produces a list of syntactic dependencies where each edge is stripped of its grammatical relation.
    """

    def __init__(self, sentences):
        self.id = None
        self.size = len(sentences)
        self.sentences = sentences
        # easily access token attributes from all sentences
        self.words = list(chain(*[s.words for s in self.sentences]))
        self.tags = list(chain(*[s.tags for s in self.sentences]))
        self.lemmas = list(chain(*[s.lemmas for s in self.sentences]))
        self._entities = list(chain(*[s._entities for s in self.sentences]))
        self.nes = self._merge_ne_dicts()
        self.bag_of_labeled_deps = list(chain(*[s.dependencies.labeled for s in self.sentences]))
        self.bag_of_unlabeled_deps = list(chain(*[s.dependencies.unlabeled for s in self.sentences]))
        self.text = None

    def __hash__(self):
        return hash(self.to_JSON())

    def __unicode__(self):
        return self.text

    def __str__(self):
        return "Document w/ {} Sentence{}".format(self.size, "" if self.size == 1 else "s")

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.to_JSON() == other.to_JSON()
        else:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def bag_of_labeled_dependencies_using(self, form):
        return list(chain(*[s.labeled_dependencies_using(s._get_tokens(form)) for s in self.sentences]))

    def bag_of_unlabeled_dependencies_using(self, form):
        return list(chain(*[s.unlabeled_dependencies_using(s._get_tokens(form)) for s in self.sentences]))

    def _merge_ne_dicts(self):
        # Get the set of all NE labels found in the Doc's sentences
        entity_labels = set(chain(*[s.nes.keys() for s in self.sentences]))
        # Do we have any labels?  (a set is never == None; check for emptiness)
        if not entity_labels:
            return None
        # If we have labels, consolidate the NEs under the appropriate label
        else:
            nes_dict = dict()
            for e in entity_labels:
                entities = []
                for s in self.sentences:
                    entities += s.nes[e]
                nes_dict[e] = entities
            return nes_dict

    def to_JSON_dict(self):
        doc_dict = dict()
        doc_dict["sentences"] = [s.to_JSON_dict() for s in self.sentences]
        doc_dict["text"] = self.text
        # can the ID be set?
        if self.id is not None:
            doc_dict["id"] = self.id
        return doc_dict

    def to_JSON(self, pretty=True):
        """
        Returns the `Document` as a JSON string.
        """
        num_spaces = 4 if pretty else 0
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=num_spaces)

    @staticmethod
    def load_from_JSON(json_dict):
        sentences = []
        for s in json_dict["sentences"]:
            kwargs = {
                "words": s["words"],
                "startOffsets": s["startOffsets"],
                "endOffsets": s["endOffsets"],
                "tags": s.get("tags", None),
                "lemmas": s.get("lemmas", None),
                "chunks": s.get("chunks", None),
                "entities": s.get("entities", None),
                "graphs": s.get("graphs", None)
            }
            sent = Sentence(**kwargs)
            sentences.append(sent)
        doc = Document(sentences)
        # set id and text (the document-level id lives in json_dict,
        # not in the last sentence's kwargs)
        doc.text = json_dict.get("text", None)
        doc.id = json_dict.get("id", None)
        return doc


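# A usage sketch for Document.load_from_JSON (hypothetical payload in the
# processors-server format above; a graph is included because __init__
# reads each sentence's dependencies):
#
#     >>> jdict = {"sentences": [{"words": ["I", "ran"],
#     ...                         "startOffsets": [0, 2],
#     ...                         "endOffsets": [1, 5],
#     ...                         "graphs": {"stanford-collapsed": {
#     ...                             "edges": [{"source": 1, "destination": 0,
#     ...                                        "relation": "nsubj"}],
#     ...                             "roots": [1]}}}],
#     ...          "text": "I ran"}
#     >>> doc = Document.load_from_JSON(jdict)
#     >>> doc.size
#     1
#     >>> doc.bag_of_labeled_deps
#     ['ran_NSUBJ_i']
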
class Sentence(object):

    """
    Storage class for an annotated sentence. Based on [`org.clulab.processors.Sentence`](https://github.com/clulab/processors/blob/master/main/src/main/scala/org/clulab/processors/Sentence.scala)

    Parameters
    ----------
    text : str or None
        The text of the `Sentence`.
    words : [str]
        A list of the `Sentence`'s tokens.
    startOffsets : [int]
        The character offsets starting each token (inclusive).
    endOffsets : [int]
        The character offsets marking the end of each token (exclusive).
    tags : [str]
        A list of the `Sentence`'s tokens represented using part of speech (PoS) tags.
    lemmas : [str]
        A list of the `Sentence`'s tokens represented using lemmas.
    chunks : [str]
        A list of the `Sentence`'s tokens represented using IOB-style phrase labels (ex. `B-NP`, `I-NP`, `B-VP`, etc.).
    entities : [str]
        A list of the `Sentence`'s tokens represented using IOB-style named entity (NE) labels.
    graphs : dict
        A dictionary of {graph-name -> {edges: [{source, destination, relation}], roots: [int]}}

    Attributes
    ----------
    text : str
        The text of the `Sentence`.
    startOffsets : [int]
        The character offsets starting each token (inclusive).
    endOffsets : [int]
        The character offsets marking the end of each token (exclusive).
    length : int
        The number of tokens in the `Sentence`.
    graphs : dict
        A dictionary (str -> `processors.ds.DirectedGraph`) mapping the graph type/name to a `processors.ds.DirectedGraph`.
    basic_dependencies : processors.ds.DirectedGraph
        A `processors.ds.DirectedGraph` using basic Stanford dependencies.
    collapsed_dependencies : processors.ds.DirectedGraph
        A `processors.ds.DirectedGraph` using collapsed Stanford dependencies.
    dependencies : processors.ds.DirectedGraph
        A pointer to the preferred syntactic dependency graph type for this `Sentence`.
    _entities : [str]
        The IOB-style Named Entity (NE) labels corresponding to each token.
    _chunks : [str]
        The IOB-style chunk labels corresponding to each token.
    nes : dict
        A dictionary mapping each NE label found in the `Sentence` to a list of its corresponding text spans (ex. {"PERSON": [phrase 1, ..., phrase n]}). Built from `Sentence._entities`.
    phrases : dict
        A dictionary mapping each chunk label found in the `Sentence` to a list of its corresponding text spans (ex. {"NP": [phrase 1, ..., phrase n]}). Built from `Sentence._chunks`.

    Methods
    -------
    labeled_dependencies_using(tokens)
        Produces a list of syntactic dependencies where each edge is labeled with its grammatical relation.
    unlabeled_dependencies_using(tokens)
        Produces a list of syntactic dependencies where each edge is stripped of its grammatical relation.
    """

    UNKNOWN = LabelManager.UNKNOWN
    # the O in IOB notation
    O = LabelManager.O

    def __init__(self, **kwargs):
        self.words = kwargs["words"]
        self.startOffsets = kwargs["startOffsets"]
        self.endOffsets = kwargs["endOffsets"]
        self.length = len(self.words)
        self.tags = self._set_toks(kwargs.get("tags", None))
        self.lemmas = self._set_toks(kwargs.get("lemmas", None))
        self._chunks = self._set_toks(kwargs.get("chunks", None))
        self._entities = self._set_toks(kwargs.get("entities", None))
        self.text = kwargs.get("text", None) or " ".join(self.words)
        self.graphs = self._build_directed_graph_from_dict(kwargs.get("graphs", None))
        self.basic_dependencies = self.graphs.get(DirectedGraph.STANFORD_BASIC_DEPENDENCIES, None)
        self.collapsed_dependencies = self.graphs.get(DirectedGraph.STANFORD_COLLAPSED_DEPENDENCIES, None)
        self.dependencies = self.collapsed_dependencies if self.collapsed_dependencies is not None else self.basic_dependencies
        # IOB tokens -> {label: [phrase 1, ..., phrase n]}
        self.nes = self._handle_iob(self._entities)
        self.phrases = self._handle_iob(self._chunks)

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.to_JSON() == other.to_JSON()
        else:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def _get_tokens(self, form):
        f = form.lower()
        if f == "words":
            tokens = self.words
        elif f == "tags":
            tokens = self.tags
        elif f == "lemmas":
            tokens = self.lemmas
        elif f == "entities":
            # per-token IOB labels (self.nes is a dict of label -> spans)
            tokens = self._entities
        elif f == "index":
            tokens = list(range(self.length))
        else:
            raise ValueError("form must be 'words', 'tags', 'lemmas', 'entities', or 'index'")
        return tokens

    def _set_toks(self, toks):
        return toks if toks else [Sentence.UNKNOWN]*self.length

    def _handle_iob(self, iob):
        """
        Consolidates consecutive tokens in IOB notation under the appropriate label.
        Regexes control for the bionlp annotator, which uses IOB notation.
        """
        entity_dict = defaultdict(list)
        # initialize to empty label
        current = Sentence.O
        start = None
        end = None
        for i, tok in enumerate(iob):
            # we have an O (no entity tag)
            if tok == Sentence.O:
                # did we have an entity with the last token?
                current = re.sub('(B-|I-)','', str(current))
                if current == Sentence.O:
                    continue
                else:
                    # the last sequence has ended
                    end = i
                    # store the entity
                    named_entity = ' '.join(self.words[start:end])
                    entity_dict[current].append(named_entity)
                    # reset our book-keeping vars
                    current = Sentence.O
                    start = None
                    end = None
            # we have a tag!
            else:
                # our old sequence continues
                current = re.sub('(B-|I-)','', str(current))
                tok = re.sub('(B-|I-)','', str(tok))
                if tok == current:
                    end = i
                # our old sequence has ended
                else:
                    # do we have a previous NE?
                    if current != Sentence.O:
                        end = i
                        named_entity = ' '.join(self.words[start:end])
                        entity_dict[current].append(named_entity)
                    # update our book-keeping vars
                    current = tok
                    start = i
                    end = None
        # store an entity that extends through the final token
        # (otherwise a sequence ending the sentence would be dropped)
        if current != Sentence.O and start is not None:
            entity_dict[current].append(' '.join(self.words[start:]))
        # this might be empty
        return entity_dict

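    # Consolidation sketch (hypothetical input/output): for
    # words = ["Barack", "Obama", "visited", "Paris"] and
    # iob   = ["B-PERSON", "I-PERSON", "O", "B-LOCATION"],
    # _handle_iob(iob) yields {"PERSON": ["Barack Obama"], "LOCATION": ["Paris"]}.
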
    def _build_directed_graph_from_dict(self, graphs):
        deps_dict = dict()
        if graphs and len(graphs) > 0:
            # process each stored graph
            for (kind, deps) in graphs.items():
                deps_dict[kind] = DirectedGraph(kind, deps, self.words)
        # return an empty dict when no graphs were provided
        # (callers use .get(), which would fail on None)
        return deps_dict

    def __unicode__(self):
        return self.text

    def to_string(self):
        return ' '.join("{w}__{p}".format(w=self.words[i], p=self.tags[i]) for i in range(self.length))

    def labeled_dependencies_using(self, tokens):
        """
        Generates a list of labeled dependencies for a sentence
        using the provided tokens
        """
        deps = self.dependencies
        labeled = []
        for out in deps.outgoing:
            for (dest, rel) in deps.outgoing[out]:
                labeled.append("{}_{}_{}".format(tokens[out], rel.upper(), tokens[dest]))
        return labeled

    def unlabeled_dependencies_using(self, tokens):
        """
        Generates a list of unlabeled dependencies for a sentence
        using the provided tokens
        """
        unlabeled = []
        for sd in self.labeled_dependencies_using(tokens):
            (head, _, dep) = sd.split("_")
            unlabeled.append("{}_{}".format(head, dep))
        return unlabeled

    def to_JSON_dict(self):
        sentence_dict = dict()
        sentence_dict["words"] = self.words
        sentence_dict["startOffsets"] = self.startOffsets
        sentence_dict["endOffsets"] = self.endOffsets
        sentence_dict["tags"] = self.tags
        sentence_dict["lemmas"] = self.lemmas
        sentence_dict["entities"] = self._entities
        # include chunks so load_from_JSON can round-trip them
        sentence_dict["chunks"] = self._chunks
        # add graphs
        sentence_dict["graphs"] = dict()
        for (kind, graph) in self.graphs.items():
            sentence_dict["graphs"][kind] = graph._graph_to_JSON_dict()
        return sentence_dict

    def to_JSON(self):
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=4)

    @staticmethod
    def load_from_JSON(json_dict):
        sent = Sentence(
                    words=json_dict["words"],
                    startOffsets=json_dict["startOffsets"],
                    endOffsets=json_dict["endOffsets"],
                    lemmas=json_dict.get("lemmas", None),
                    tags=json_dict.get("tags", None),
                    chunks=json_dict.get("chunks", None),
                    entities=json_dict.get("entities", None),
                    text=json_dict.get("text", None),
                    graphs=json_dict.get("graphs", None)
                    )
        return sent


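# A minimal Sentence sketch (hypothetical values; omitted layers such as
# tags and lemmas default to LabelManager.UNKNOWN, and no dependency
# graphs are attached here):
#
#     >>> s = Sentence(words=["Obama", "spoke"],
#     ...              startOffsets=[0, 6],
#     ...              endOffsets=[5, 11],
#     ...              entities=["B-PERSON", "O"])
#     >>> s.nes["PERSON"]
#     ['Obama']
#     >>> s.text
#     'Obama spoke'
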
class Edge(object):

    def __init__(self, source, destination, relation):
        self.source = source
        self.destination = destination
        self.relation = relation

    def __unicode__(self):
        return self.to_string()

    def to_string(self):
        return "Edge(source: {}, destination: {}, relation: {})".format(self.source, self.destination, self.relation)

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.to_JSON() == other.to_JSON()
        else:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def to_JSON_dict(self):
        edge_dict = dict()
        edge_dict["source"] = self.source
        edge_dict["destination"] = self.destination
        edge_dict["relation"] = self.relation
        return edge_dict

    def to_JSON(self):
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=4)


class DirectedGraph(object):

    """
    Storage class for directed graphs.

    Parameters
    ----------
    kind : str
        The name of the directed graph.
    deps : dict
        A dictionary of {edges: [{source, destination, relation}], roots: [int]}
    words : [str]
        A list of the word form of the tokens from the originating `Sentence`.

    Attributes
    ----------
    _words : [str]
        A lowercased list of the word form of the tokens from the originating `Sentence`.
    roots : [int]
        A list of indices for the syntactic dependency graph's roots.  Generally this is a single token index.
    edges : [processors.ds.Edge]
        A list of `processors.ds.Edge`
    incoming : dict
        A dictionary of {int -> [(int, str)]} mapping each node to the (source, relation) pairs of its incoming edges.
    outgoing : dict
        A dictionary of {int -> [(int, str)]} mapping each node to the (destination, relation) pairs of its outgoing edges.
    labeled : [str]
        A list of strings where each element in the list represents an edge encoded as source word, relation, and destination word ("source_relation_destination").
    unlabeled : [str]
        A list of strings where each element in the list represents an edge encoded as source word and destination word ("source_destination").
    graph : networkx.Graph
        A `networkx.Graph` representation of the `DirectedGraph`.  Used by `shortest_path`

    Methods
    -------
    shortest_path(start, end)
        Finds the shortest path in the syntactic dependency graph between the provided start and end nodes.
    pagerank(...)
        Measures node activity using the `networkx` implementation of the PageRank algorithm.
    """
    STANFORD_BASIC_DEPENDENCIES = "stanford-basic"
    STANFORD_COLLAPSED_DEPENDENCIES = "stanford-collapsed"

    def __init__(self, kind, deps, words):
        self._words = [w.lower() for w in words]
        self.kind = kind
        self.roots = deps.get("roots", [])
        self.edges = [Edge(e["source"], e["destination"], e["relation"]) for e in deps["edges"]]
        self.incoming = self._build_incoming(self.edges)
        self.outgoing = self._build_outgoing(self.edges)
        self.labeled = self._build_labeled()
        self.unlabeled = self._build_unlabeled()
        self.graph = DependencyUtils.build_networkx_graph(roots=self.roots, edges=self.edges, name=self.kind)

    def __unicode__(self):
        # return a string rather than the raw edge list
        return "\n".join(e.to_string() for e in self.edges)

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.to_JSON() == other.to_JSON()
        else:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def shortest_path(self, start, end):
        """
        Find the shortest path in the syntactic dependency graph
        between the provided start and end nodes.

        See Also
        --------
        `processors.paths.DependencyUtils.shortest_path`
        """
        res = DependencyUtils.shortest_path(self.graph, start, end)
        return DependencyUtils.retrieve_edges(self, res) if res else None

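    # Path sketch (hypothetical graph): with edges 1 -> 0 ("nsubj") and
    # 1 -> 2 ("dobj"), shortest_path(0, 2) follows 0 -> 1 -> 2 and returns
    # the edges along that path via DependencyUtils.retrieve_edges;
    # it returns None when no path connects the two nodes.
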
    def pagerank(self,
                 alpha=0.85,
                 personalization=None,
                 max_iter=1000,
                 tol=1e-06,
                 nstart=None,
                 weight='weight',
                 dangling=None):
        """
        Measures node activity in a `networkx.Graph` using a thin wrapper around the `networkx` implementation of the PageRank algorithm (see `networkx.algorithms.link_analysis.pagerank`).  Use with `processors.ds.DirectedGraph.graph`.

        See Also
        --------
        `processors.paths.DependencyUtils.pagerank`
        Method parameters correspond to those of [`networkx.algorithms.link_analysis.pagerank`](https://networkx.github.io/documentation/development/reference/generated/networkx.algorithms.link_analysis.pagerank_alg.pagerank.html#networkx.algorithms.link_analysis.pagerank_alg.pagerank)
        """
        return DependencyUtils.pagerank(self.graph, alpha=alpha, personalization=personalization, max_iter=max_iter, tol=tol, nstart=nstart, weight=weight, dangling=dangling)

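    # Usage sketch: dg.pagerank(alpha=0.9) delegates to networkx's pagerank,
    # which returns a dict mapping each node to its PageRank score.
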
    def _build_incoming(self, edges):
        dep_dict = defaultdict(list)
        for edge in edges:
            dep_dict[edge.destination].append((edge.source, edge.relation))
        return dep_dict

    def _build_outgoing(self, edges):
        dep_dict = defaultdict(list)
        for edge in edges:
            dep_dict[edge.source].append((edge.destination, edge.relation))
        return dep_dict

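    # Shape sketch: for the single edge
    # Edge(source=1, destination=0, relation="nsubj"),
    # incoming == {0: [(1, "nsubj")]} and outgoing == {1: [(0, "nsubj")]}.
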
    def _build_labeled(self):
        labeled = []
        for out in self.outgoing:
            for (dest, rel) in self.outgoing[out]:
                labeled.append("{}_{}_{}".format(self._words[out], rel.upper(), self._words[dest]))
        return labeled

    def _build_unlabeled(self):
        unlabeled = []
        for out in self.outgoing:
            for (dest, _) in self.outgoing[out]:
                unlabeled.append("{}_{}".format(self._words[out], self._words[dest]))
        return unlabeled

    def _graph_to_JSON_dict(self):
        dg_dict = dict()
        dg_dict["edges"] = [e.to_JSON_dict() for e in self.edges]
        dg_dict["roots"] = self.roots
        return dg_dict

    def to_JSON_dict(self):
        return {self.kind: self._graph_to_JSON_dict()}

    def to_JSON(self):
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=4)


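# A minimal DirectedGraph sketch (hypothetical deps dict; note that the
# labeled/unlabeled encodings use the lowercased word forms):
#
#     >>> deps = {"edges": [{"source": 1, "destination": 0,
#     ...                    "relation": "nsubj"}],
#     ...         "roots": [1]}
#     >>> dg = DirectedGraph("stanford-collapsed", deps, words=["Obama", "spoke"])
#     >>> dg.labeled
#     ['spoke_NSUBJ_obama']
#     >>> dg.unlabeled
#     ['spoke_obama']
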
class Interval(object):
    """
    Defines a token or character span

    Parameters
    ----------
    start : int
        The token or character index where the interval begins.
    end : int
        One plus the index of the last token/character in the span.
    """

    def __init__(self, start, end):
        self.start = start
        self.end = end

    def to_JSON_dict(self):
        return {"start": self.start, "end": self.end}

    def to_JSON(self):
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=4)

    @staticmethod
    def load_from_JSON(json_dict):
        # named json_dict (as in the other loaders) to avoid shadowing the json module
        return Interval(start=json_dict["start"], end=json_dict["end"])
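

# A short, runnable sketch of the Interval API (hypothetical span; running
# this file still requires the processors package for the imports above):
if __name__ == "__main__":
    span = Interval(start=0, end=2)
    # serialization round-trips through plain dicts
    same = Interval.load_from_JSON({"start": 0, "end": 2})
    assert span.to_JSON() == same.to_JSON()
    print(span.to_JSON())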