Completed: push to master (82a8a2...e9219d) by Gus, 10s

Interval.size() (grade: A)

Complexity:   Conditions 1
Size:         Total Lines 2
Duplication:  Lines 0, Ratio 0 %
Importance:   Changes 0

Metric   Value
cc       1
dl       0
loc      2
rs       10
c        0
b        0
f        0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Gus Hahn-Powell 2015
# data structures for storing processors-server output
# based on conventions from the CLU lab's processors library (https://github.com/clulab/processors)
from __future__ import unicode_literals
from itertools import chain
from collections import defaultdict, Counter
from processors.paths import DependencyUtils, HeadFinder
from processors.utils import LabelManager
import networkx as nx
import hashlib
import json
import re


class NLPDatum(object):
    """
    Base class for the data structures in this module; provides JSON serialization.
    """

    def to_JSON_dict(self):
        return dict()

    def to_JSON(self, pretty=False):
        """
        Returns JSON as a string.
        """
        num_spaces = 4 if pretty else None
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=num_spaces)
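
# A quick serialization sketch (assumed usage): subclasses override
# to_JSON_dict(), and to_JSON() renders it with sorted keys, e.g.
#
#   NLPDatum().to_JSON()          # -> '{}'
#   datum.to_JSON(pretty=True)    # sorted keys, 4-space indent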


class Document(NLPDatum):

    """
    Storage class for annotated text. Based on [`org.clulab.processors.Document`](https://github.com/clulab/processors/blob/master/main/src/main/scala/org/clulab/processors/Document.scala)

    Parameters
    ----------
    sentences : [processors.ds.Sentence]
        The sentences comprising the `Document`.

    Attributes
    ----------
    id : str or None
        A unique ID for the `Document`.

    size : int
        The number of `sentences`.

    sentences : [processors.ds.Sentence]
        The sentences comprising the `Document`.

    words : [str]
        A list of the `Document`'s tokens.

    tags : [str]
        A list of the `Document`'s tokens represented using part of speech (PoS) tags.

    lemmas : [str]
        A list of the `Document`'s tokens represented using lemmas.

    _entities : [str]
        A list of the `Document`'s tokens represented using IOB-style named entity (NE) labels.

    nes : dict
        A dictionary mapping each NE label found in the `Document` to a list of corresponding text spans.

    bag_of_labeled_deps : [str]
        The labeled dependencies from all sentences in the `Document`.

    bag_of_unlabeled_deps : [str]
        The unlabeled dependencies from all sentences in the `Document`.

    text : str or None
        The original text of the `Document`.

    Methods
    -------
    bag_of_labeled_dependencies_using(form)
        Produces a list of syntactic dependencies where each edge is labeled with its grammatical relation.

    bag_of_unlabeled_dependencies_using(form)
        Produces a list of syntactic dependencies where each edge is unlabeled (its grammatical relation is omitted).
    """

    def __init__(self, sentences):
        NLPDatum.__init__(self)
        self.id = None
        self.size = len(sentences)
        self.sentences = sentences
        # easily access token attributes from all sentences
        self.words = list(chain(*[s.words for s in self.sentences]))
        self.tags = list(chain(*[s.tags for s in self.sentences]))
        self.lemmas = list(chain(*[s.lemmas for s in self.sentences]))
        self._entities = list(chain(*[s._entities for s in self.sentences]))
        self.nes = self._merge_ne_dicts()
        self.bag_of_labeled_deps = list(chain(*[s.dependencies.labeled for s in self.sentences]))
        self.bag_of_unlabeled_deps = list(chain(*[s.dependencies.unlabeled for s in self.sentences]))
        self.text = None

    def __hash__(self):
        return hash(self.to_JSON())

    def __unicode__(self):
        return self.text

    def __str__(self):
        return "Document w/ {} Sentence{}".format(self.size, "" if self.size == 1 else "s")

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.to_JSON() == other.to_JSON()
        else:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def bag_of_labeled_dependencies_using(self, form):
        return list(chain(*[s.labeled_dependencies_from_tokens(s._get_tokens(form)) for s in self.sentences]))

    def bag_of_unlabeled_dependencies_using(self, form):
        return list(chain(*[s.unlabeled_dependencies_from_tokens(s._get_tokens(form)) for s in self.sentences]))

    def _merge_ne_dicts(self):
        # Get the set of all NE labels found in the Doc's sentences
        entity_labels = set(chain(*[s.nes.keys() for s in self.sentences]))
        # Do we have any labels?  (the set is never None, but it may be empty)
        if not entity_labels:
            return None
        # If we have labels, consolidate the NEs under the appropriate label
        else:
            nes_dict = dict()
            for e in entity_labels:
                entities = []
                for s in self.sentences:
                    entities += s.nes[e]
                nes_dict[e] = entities
            return nes_dict

    def to_JSON_dict(self):
        doc_dict = dict()
        doc_dict["sentences"] = [s.to_JSON_dict() for s in self.sentences]
        doc_dict["text"] = self.text
        # has the ID been set?
        if self.id is not None:
            doc_dict["id"] = self.id
        return doc_dict

    @staticmethod
    def load_from_JSON(json_dict):
        sentences = []
        for s in json_dict["sentences"]:
            kwargs = {
                "words": s["words"],
                "startOffsets": s["startOffsets"],
                "endOffsets": s["endOffsets"],
                "tags": s.get("tags", None),
                "lemmas": s.get("lemmas", None),
                "chunks": s.get("chunks", None),
                "entities": s.get("entities", None),
                "graphs": s.get("graphs", None)
            }
            sent = Sentence(**kwargs)
            sentences.append(sent)
        doc = Document(sentences)
        # set id and text
        doc.text = json_dict.get("text", None)
        doc.id = json_dict.get("id", None)
        return doc
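
# A minimal round-trip sketch (assumes `sentences` is a list of Sentence objects
# produced elsewhere, e.g. by a processors server):
#
#   doc = Document(sentences)
#   restored = Document.load_from_JSON(json.loads(doc.to_JSON()))
#   assert restored == doc  # equality compares the serialized JSON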


class Sentence(NLPDatum):

    """
    Storage class for an annotated sentence. Based on [`org.clulab.processors.Sentence`](https://github.com/clulab/processors/blob/master/main/src/main/scala/org/clulab/processors/Sentence.scala)

    Parameters
    ----------
    text : str or None
        The text of the `Sentence`.

    words : [str]
        A list of the `Sentence`'s tokens.

    startOffsets : [int]
        The character offsets starting each token (inclusive).

    endOffsets : [int]
        The character offsets marking the end of each token (exclusive).

    tags : [str]
        A list of the `Sentence`'s tokens represented using part of speech (PoS) tags.

    lemmas : [str]
        A list of the `Sentence`'s tokens represented using lemmas.

    chunks : [str]
        A list of the `Sentence`'s tokens represented using IOB-style phrase labels (ex. `B-NP`, `I-NP`, `B-VP`, etc.).

    entities : [str]
        A list of the `Sentence`'s tokens represented using IOB-style named entity (NE) labels.

    graphs : dict
        A dictionary of {graph-name -> {edges: [{source, destination, relation}], roots: [int]}}

    Attributes
    ----------
    text : str
        The text of the `Sentence`.

    startOffsets : [int]
        The character offsets starting each token (inclusive).

    endOffsets : [int]
        The character offsets marking the end of each token (exclusive).

    length : int
        The number of tokens in the `Sentence`.

    graphs : dict
        A dictionary (str -> `processors.ds.DirectedGraph`) mapping the graph type/name to a `processors.ds.DirectedGraph`.

    basic_dependencies : processors.ds.DirectedGraph
        A `processors.ds.DirectedGraph` using basic Stanford dependencies.

    collapsed_dependencies : processors.ds.DirectedGraph
        A `processors.ds.DirectedGraph` using collapsed Stanford dependencies.

    dependencies : processors.ds.DirectedGraph
        A pointer to the preferred syntactic dependency graph type for this `Sentence` (collapsed, if available; otherwise basic).

    _entities : [str]
        The IOB-style Named Entity (NE) labels corresponding to each token.

    _chunks : [str]
        The IOB-style chunk labels corresponding to each token.

    nes : dict
        A dictionary mapping each NE label found in the `Sentence` to a list of corresponding text spans (ex. {"PERSON": [phrase 1, ..., phrase n]}). Built from `Sentence._entities`.

    phrases : dict
        A dictionary mapping each chunk label found in the `Sentence` to a list of corresponding text spans (ex. {"NP": [phrase 1, ..., phrase n]}). Built from `Sentence._chunks`.

    Methods
    -------
    bag_of_labeled_dependencies_using(form)
        Produces a list of syntactic dependencies where each edge is labeled with its grammatical relation.

    bag_of_unlabeled_dependencies_using(form)
        Produces a list of syntactic dependencies where each edge is unlabeled (its grammatical relation is omitted).
    """

    UNKNOWN = LabelManager.UNKNOWN
    # the O in IOB notation
    O = LabelManager.O

    def __init__(self, **kwargs):
        NLPDatum.__init__(self)
        self.words = kwargs["words"]
        self.startOffsets = kwargs["startOffsets"]
        self.endOffsets = kwargs["endOffsets"]
        self.length = len(self.words)
        self.tags = self._set_toks(kwargs.get("tags", None))
        self.lemmas = self._set_toks(kwargs.get("lemmas", None))
        self._chunks = self._set_toks(kwargs.get("chunks", None))
        self._entities = self._set_toks(kwargs.get("entities", None))
        self.text = kwargs.get("text", None) or " ".join(self.words)
        self.graphs = self._build_directed_graph_from_dict(kwargs.get("graphs", None))
        self.basic_dependencies = self.graphs.get(DirectedGraph.STANFORD_BASIC_DEPENDENCIES, None)
        self.collapsed_dependencies = self.graphs.get(DirectedGraph.STANFORD_COLLAPSED_DEPENDENCIES, None)
        self.dependencies = self.collapsed_dependencies if self.collapsed_dependencies is not None else self.basic_dependencies
        # IOB tokens -> {label: [phrase 1, ..., phrase n]}
        self.nes = self._handle_iob(self._entities)
        self.phrases = self._handle_iob(self._chunks)

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.to_JSON() == other.to_JSON()
        else:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        return hash(self.to_JSON(pretty=False))

    def deduplication_hash(self):
        """
        Generates a stable deduplication hash (SHA-256 of the serialized JSON) for the sentence.
        """
        return hashlib.sha256(self.to_JSON(pretty=False).encode()).hexdigest()

    def _get_tokens(self, form):
        f = form.lower()
        if f == "words":
            tokens = self.words
        elif f == "tags":
            tokens = self.tags
        elif f == "lemmas":
            tokens = self.lemmas
        elif f == "entities":
            tokens = self.nes
        elif f == "index":
            tokens = list(range(self.length))
        # unrecognized form
        else:
            raise ValueError("form must be 'words', 'tags', 'lemmas', 'entities', or 'index'")
        return tokens

    def _set_toks(self, toks):
        return toks if toks else [Sentence.UNKNOWN]*self.length

    def _handle_iob(self, iob):
        """
        Consolidates consecutive tokens in IOB notation under the appropriate label.
        Regexes strip the B-/I- prefixes used by the bionlp annotator's IOB notation.
        """
        entity_dict = defaultdict(list)
        # initialize to empty label
        current = Sentence.O
        start = None
        end = None
        for i, tok in enumerate(iob):
            # we have an O (no B- or I- tag)
            if tok == Sentence.O:
                # did we have an entity with the last token?
                current = re.sub('(B-|I-)', '', str(current))
                if current == Sentence.O:
                    continue
                else:
                    # the last sequence has ended
                    end = i
                    # store the entity
                    named_entity = ' '.join(self.words[start:end])
                    entity_dict[current].append(named_entity)
                    # reset our book-keeping vars
                    current = Sentence.O
                    start = None
                    end = None
            # we have a tag!
            else:
                # our old sequence continues
                current = re.sub('(B-|I-)', '', str(current))
                tok = re.sub('(B-|I-)', '', str(tok))
                if tok == current:
                    end = i
                # our old sequence has ended
                else:
                    # do we have a previous NE?
                    if current != Sentence.O:
                        end = i
                        named_entity = ' '.join(self.words[start:end])
                        entity_dict[current].append(named_entity)
                    # update our book-keeping vars
                    current = tok
                    start = i
                    end = None
        # flush an entity that runs through the final token of the sentence
        if current != Sentence.O and start is not None:
            entity_dict[current].append(' '.join(self.words[start:]))
        # this might be empty
        return entity_dict
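
    # Sketch of the consolidation above (hypothetical labels): for
    # words = ["Barack", "Obama", "visited", "Paris"] and
    # iob   = ["B-PERSON", "I-PERSON", "O", "B-LOCATION"],
    # _handle_iob returns {"PERSON": ["Barack Obama"], "LOCATION": ["Paris"]}.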

    def _build_directed_graph_from_dict(self, graphs):
        deps_dict = dict()
        if graphs and len(graphs) > 0:
            # process each stored graph
            for (kind, deps) in graphs.items():
                deps_dict[kind] = DirectedGraph(kind, deps, self.words)
        # return an empty dict (rather than None) when no graphs were provided,
        # so the .get() lookups in __init__ and the .items() call in
        # to_JSON_dict() remain safe
        return deps_dict

    def __unicode__(self):
        return self.text

    def to_string(self):
        return ' '.join("{w}__{p}".format(w=self.words[i], p=self.tags[i]) for i in range(self.length))

    def bag_of_labeled_dependencies_using(self, form):
        """
        Produces a list of syntactic dependencies
        where each edge is labeled with its grammatical relation.
        """
        tokens = self._get_tokens(form)
        return self.labeled_dependencies_from_tokens(tokens) if tokens else None

    def bag_of_unlabeled_dependencies_using(self, form):
        """
        Produces a list of syntactic dependencies
        where each edge is unlabeled (its grammatical relation is omitted).
        """
        tokens = self._get_tokens(form)
        return self.unlabeled_dependencies_from_tokens(tokens) if tokens else None

    def labeled_dependencies_from_tokens(self, tokens):
        """
        Generates a list of labeled dependencies for a sentence
        using the provided tokens.
        """
        deps = self.dependencies
        return [(tokens[out], rel, tokens[dest])
                for out in deps.outgoing
                for (dest, rel) in deps.outgoing[out]]

    def unlabeled_dependencies_from_tokens(self, tokens):
        """
        Generates a list of unlabeled dependencies for a sentence
        using the provided tokens.
        """
        return [(head, dep) for (head, rel, dep) in self.labeled_dependencies_from_tokens(tokens)]

    def semantic_head(self, graph_name="stanford-collapsed", valid_tags={r"^N", "VBG"}, valid_indices=None):
        return HeadFinder.semantic_head(self, graph_name, valid_tags, valid_indices)

    def to_JSON_dict(self):
        sentence_dict = dict()
        sentence_dict["words"] = self.words
        sentence_dict["startOffsets"] = self.startOffsets
        sentence_dict["endOffsets"] = self.endOffsets
        sentence_dict["tags"] = self.tags
        sentence_dict["lemmas"] = self.lemmas
        sentence_dict["entities"] = self._entities
        sentence_dict["chunks"] = self._chunks
        # add graphs
        sentence_dict["graphs"] = dict()
        for (kind, graph) in self.graphs.items():
            sentence_dict["graphs"][kind] = graph._graph_to_JSON_dict()
        return sentence_dict

    @staticmethod
    def load_from_JSON(json_dict):
        sent = Sentence(
                    words=json_dict["words"],
                    startOffsets=json_dict["startOffsets"],
                    endOffsets=json_dict["endOffsets"],
                    lemmas=json_dict.get("lemmas", None),
                    tags=json_dict.get("tags", None),
                    entities=json_dict.get("entities", None),
                    text=json_dict.get("text", None),
                    graphs=json_dict.get("graphs", None),
                    chunks=json_dict.get("chunks", None)
                    )
        return sent
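
# A minimal usage sketch (hypothetical token values; the "graphs" payload follows
# the {graph-name -> {edges, roots}} convention documented above):
#
#   s = Sentence(
#       words=["Gonzo", "married", "Camilla", "."],
#       startOffsets=[0, 6, 14, 21],
#       endOffsets=[5, 13, 21, 22],
#       tags=["NNP", "VBD", "NNP", "."],
#       graphs={"stanford-collapsed": {
#           "edges": [{"source": 1, "destination": 0, "relation": "nsubj"},
#                     {"source": 1, "destination": 2, "relation": "dobj"}],
#           "roots": [1]}}
#   )
#   s.bag_of_labeled_dependencies_using("words")
#   # -> [("married", "nsubj", "Gonzo"), ("married", "dobj", "Camilla")]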


class Edge(NLPDatum):

    def __init__(self, source, destination, relation):
        NLPDatum.__init__(self)
        self.source = source
        self.destination = destination
        self.relation = relation

    def __unicode__(self):
        return self.to_string()

    def to_string(self):
        return "Edge(source: {}, destination: {}, relation: {})".format(self.source, self.destination, self.relation)

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.to_JSON() == other.to_JSON()
        else:
            return False

    def to_JSON_dict(self):
        edge_dict = dict()
        edge_dict["source"] = self.source
        edge_dict["destination"] = self.destination
        edge_dict["relation"] = self.relation
        return edge_dict


class DirectedGraph(NLPDatum):

    """
    Storage class for directed graphs.

    Parameters
    ----------
    kind : str
        The name of the directed graph.

    deps : dict
        A dictionary of {edges: [{source, destination, relation}], roots: [int]}

    words : [str]
        A list of the word form of the tokens from the originating `Sentence`.

    Attributes
    ----------
    _words : [str]
        A lowercased list of the word form of the tokens from the originating `Sentence`.

    roots : [int]
        A list of indices for the syntactic dependency graph's roots.  Generally this is a single token index.

    edges : list[processors.ds.Edge]
        A list of `processors.ds.Edge`

    incoming : dict
        A dictionary of {int -> [(int, str)]} mapping each node to its incoming edges as (source index, relation) pairs.

    outgoing : dict
        A dictionary of {int -> [(int, str)]} mapping each node to its outgoing edges as (destination index, relation) pairs.

    labeled : [str]
        A list of strings where each element represents an edge encoded as source word, uppercased relation, and destination word ("source_RELATION_destination").

    unlabeled : [str]
        A list of strings where each element represents an edge encoded as source word and destination word ("source_destination").

    directed_graph : networkx.DiGraph
        A `networkx` representation of the `DirectedGraph`.  Used by `shortest_path` and the centrality methods.

    undirected_graph : networkx.Graph
        The undirected counterpart of `directed_graph`.

    Methods
    -------
    bag_of_labeled_dependencies_from_tokens(form)
        Produces a list of syntactic dependencies where each edge is labeled with its grammatical relation.
    bag_of_unlabeled_dependencies_from_tokens(form)
        Produces a list of syntactic dependencies where each edge is unlabeled (its grammatical relation is omitted).
    """

    STANFORD_BASIC_DEPENDENCIES = "stanford-basic"
    STANFORD_COLLAPSED_DEPENDENCIES = "stanford-collapsed"

    def __init__(self, kind, deps, words):
        NLPDatum.__init__(self)
        self._words = [w.lower() for w in words]
        self.kind = kind
        self.roots = deps.get("roots", [])
        self.edges = [Edge(e["source"], e["destination"], e["relation"]) for e in deps["edges"]]
        self.incoming = self._build_incoming(self.edges)
        self.outgoing = self._build_outgoing(self.edges)
        self.labeled = self._build_labeled()
        self.unlabeled = self._build_unlabeled()
        self.directed_graph = DependencyUtils.build_networkx_graph(roots=self.roots, edges=self.edges, name=self.kind, reverse=False)
        self.undirected_graph = self.directed_graph.to_undirected()

    def __unicode__(self):
        # render the edge list as a string (self.edges is a list of Edge objects)
        return "; ".join(e.to_string() for e in self.edges)

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.to_JSON() == other.to_JSON()
        else:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        return hash(self.to_JSON())

    def shortest_paths(self, start, end):
        """
        Find the shortest paths in the syntactic dependency graph
        between the provided start and end nodes.

        Parameters
        ----------
        start : int or [int]
            A single token index or list of token indices serving as the start of the graph traversal.

        end : int or [int]
            A single token index or list of token indices serving as the end of the graph traversal.

        See Also
        --------
        `processors.paths.DependencyUtils.shortest_path`
        """
        paths = DependencyUtils.shortest_paths(self.undirected_graph, start, end)
        return None if not paths else [DependencyUtils.retrieve_edges(self, path) for path in paths]

    def shortest_path(self, start, end, scoring_func=lambda path: -len(path)):
        """
        Find the shortest path in the syntactic dependency graph
        between the provided start and end nodes.

        Parameters
        ----------
        start : int or [int]
            A single token index or list of token indices serving as the start of the graph traversal.

        end : int or [int]
            A single token index or list of token indices serving as the end of the graph traversal.

        scoring_func : function
            A function that scores each path, where a path is a list of (source index, relation, destination index) triples.
            The path with the maximum score will be returned.

        See Also
        --------
        `processors.paths.DependencyUtils.shortest_path`
        """
        paths = self.shortest_paths(start, end)
        return None if not paths else max(paths, key=scoring_func)
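
    # A usage sketch (hypothetical indices; assumes `dg` is this sentence's
    # collapsed-dependency DirectedGraph): find the path between token 0 and
    # token 3.  The default scoring_func negates path length, so max() picks
    # the path with the fewest edges.
    #
    #   path = dg.shortest_path(0, 3)
    #   # e.g. [(0, "nsubj", 1), (1, "prep_in", 3)]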

    def degree_centrality(self):
        """
        Compute the degree centrality for nodes.

        See Also
        --------
        https://networkx.github.io/documentation/development/reference/algorithms.centrality.html
        """
        return Counter(nx.degree_centrality(self.directed_graph))

    def in_degree_centrality(self):
        """
        Compute the in-degree centrality for nodes.

        See Also
        --------
        https://networkx.github.io/documentation/development/reference/algorithms.centrality.html
        """
        return Counter(nx.in_degree_centrality(self.directed_graph))

    def out_degree_centrality(self):
        """
        Compute the out-degree centrality for nodes.

        See Also
        --------
        https://networkx.github.io/documentation/development/reference/algorithms.centrality.html
        """
        return Counter(nx.out_degree_centrality(self.directed_graph))

    def pagerank(self,
                 alpha=0.85,
                 personalization=None,
                 max_iter=1000,
                 tol=1e-06,
                 nstart=None,
                 weight='weight',
                 dangling=None,
                 use_directed=True,
                 reverse=True):
        """
        Measures node activity in a `networkx.Graph` using a thin wrapper around the `networkx` implementation of the PageRank algorithm (see `networkx.algorithms.link_analysis.pagerank`).  Use with `processors.ds.DirectedGraph.directed_graph`.
        Note that by default, the directed graph is reversed in order to highlight predicate-argument nodes (refer to the PageRank algorithm to understand why).

        See Also
        --------
        `processors.paths.DependencyUtils.pagerank`
        Method parameters correspond to those of [`networkx.algorithms.link_analysis.pagerank`](https://networkx.github.io/documentation/development/reference/generated/networkx.algorithms.link_analysis.pagerank_alg.pagerank.html#networkx.algorithms.link_analysis.pagerank_alg.pagerank)
        """
        # check whether or not to reverse the directed graph
        dg = self.directed_graph if not reverse else DependencyUtils.build_networkx_graph(roots=self.roots, edges=self.edges, name=self.kind, reverse=True)
        # determine which graph to use
        graph = dg if use_directed else self.undirected_graph
        return DependencyUtils.pagerank(graph, alpha=alpha, personalization=personalization, max_iter=max_iter, tol=tol, nstart=nstart, weight=weight, dangling=dangling)
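
    # A usage sketch (assumes `dg` is a DirectedGraph for a parsed sentence):
    # rank tokens by activity.  With reverse=True (the default), edge direction
    # is flipped so that predicates/heads tend to accumulate rank.
    #
    #   ranks = dg.pagerank(alpha=0.85)
    #   # e.g. a dict-like mapping of {token index -> score}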

    def _build_incoming(self, edges):
        # destination -> [(source, relation)]
        dep_dict = defaultdict(list)
        for edge in edges:
            dep_dict[edge.destination].append((edge.source, edge.relation))
        return dep_dict

    def _build_outgoing(self, edges):
        # source -> [(destination, relation)]
        dep_dict = defaultdict(list)
        for edge in edges:
            dep_dict[edge.source].append((edge.destination, edge.relation))
        return dep_dict

    def _build_labeled(self):
        labeled = []
        for out in self.outgoing:
            for (dest, rel) in self.outgoing[out]:
                labeled.append("{}_{}_{}".format(self._words[out], rel.upper(), self._words[dest]))
        return labeled

    def _build_unlabeled(self):
        unlabeled = []
        for out in self.outgoing:
            for (dest, _) in self.outgoing[out]:
                unlabeled.append("{}_{}".format(self._words[out], self._words[dest]))
        return unlabeled

    def _graph_to_JSON_dict(self):
        dg_dict = dict()
        dg_dict["edges"] = [e.to_JSON_dict() for e in self.edges]
        dg_dict["roots"] = self.roots
        return dg_dict

    def to_JSON_dict(self):
        return {self.kind: self._graph_to_JSON_dict()}


class Interval(NLPDatum):
    """
    Defines a token or character span.

    Parameters
    ----------
    start : int
        The token or character index where the interval begins.

    end : int
        One plus the index of the last token/character in the span.

    Methods
    -------
    contains(that)
        Test whether this `Interval` contains `that` (another `Interval`).

    overlaps(that)
        Test whether `that` (int or `Interval`) overlaps with the span of this `Interval`.  Equivalent Intervals overlap.
    """

    def __init__(self, start, end):
        NLPDatum.__init__(self)
        assert (start < end), "Interval start must precede end."
        self.start = start
        self.end = end

    def to_JSON_dict(self):
        return {"start": self.start, "end": self.end}

    def size(self):
        return self.end - self.start

    def contains(self, that):
        """
        Checks whether this interval contains another (that).
        """
        if isinstance(that, self.__class__):
            return self.start <= that.start and self.end >= that.end
        else:
            return False

    def overlaps(self, that):
        """
        Checks for overlap with a token/character index or another interval.
        """
        if isinstance(that, int):
            return self.start <= that < self.end
        elif isinstance(that, self.__class__):
            return ((that.start <= self.start < that.end) or (self.start <= that.start < self.end))
        else:
            return False

    @staticmethod
    def load_from_JSON(json):
        return Interval(start=json["start"], end=json["end"])
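
# A usage sketch tying back to the report above (Interval.size() has cc 1, loc 2):
#
#   a = Interval(2, 5)           # tokens 2, 3, and 4
#   a.size()                     # -> 3
#   a.contains(Interval(3, 5))   # -> True
#   a.overlaps(4)                # -> True (index 4 falls inside [2, 5))
#   a.overlaps(Interval(4, 8))   # -> True (the spans share token 4)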