Completed: Push to master ( 352e2e...4c1319 ) by Gus (8s)

NLPDatum — Rating: A

Complexity
    Total Complexity: 3

Size/Duplication
    Total Lines: 11
    Duplicated Lines: 0%

Importance
    Changes: 1
    Bugs: 0    Features: 0

Metric                              Value
c (changes)                         1
b (bugs)                            0
f (features)                        0
dl (duplicated lines)               0
loc (lines of code)                 11
rs (rating score)                   10
wmc (weighted method complexity)    3

2 Methods

Rating    Name              Duplication    Size    Complexity
A         to_JSON()         0              6       2
A         to_JSON_dict()    0              2       1
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Gus Hahn-Powell 2015
# data structures for storing processors-server output
# based on conventions from the CLU lab's processors library (https://github.com/clulab/processors)
from __future__ import unicode_literals
from itertools import chain
from collections import defaultdict, Counter
from processors.paths import DependencyUtils, HeadFinder
from processors.utils import LabelManager
import networkx as nx
import json
import re


class NLPDatum(object):

    def to_JSON_dict(self):
        return dict()

    def to_JSON(self, pretty=True):
        """
        Returns the datum as a JSON string.
        """
        num_spaces = 4 if pretty else None
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=num_spaces)
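
    # Note (illustrative): subclasses only override to_JSON_dict();
    # to_JSON() then serializes any datum uniformly, e.g. (hypothetical usage)
    #     datum.to_JSON()              # pretty-printed with a 4-space indent
    #     datum.to_JSON(pretty=False)  # compact, single-line JSON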


class Document(NLPDatum):

    """
    Storage class for annotated text. Based on [`org.clulab.processors.Document`](https://github.com/clulab/processors/blob/master/main/src/main/scala/org/clulab/processors/Document.scala)

    Parameters
    ----------
    sentences : [processors.ds.Sentence]
        The sentences comprising the `Document`.

    Attributes
    ----------
    id : str or None
        A unique ID for the `Document`.

    size : int
        The number of `sentences`.

    sentences : [processors.ds.Sentence]
        The sentences comprising the `Document`.

    words : [str]
        A list of the `Document`'s tokens.

    tags : [str]
        A list of the `Document`'s tokens represented using part of speech (PoS) tags.

    lemmas : [str]
        A list of the `Document`'s tokens represented using lemmas.

    _entities : [str]
        A list of the `Document`'s tokens represented using IOB-style named entity (NE) labels.

    nes : dict
        A dictionary mapping each NE label found in the `Document` to a list of corresponding text spans.

    bag_of_labeled_deps : [str]
        The labeled dependencies from all sentences in the `Document`.

    bag_of_unlabeled_deps : [str]
        The unlabeled dependencies from all sentences in the `Document`.

    text : str or None
        The original text of the `Document`.

    Methods
    -------
    bag_of_labeled_dependencies_using(form)
        Produces a list of syntactic dependencies where each edge is labeled with its grammatical relation.

    bag_of_unlabeled_dependencies_using(form)
        Produces a list of syntactic dependencies where each edge is stripped of its grammatical relation.
    """

    def __init__(self, sentences):
        NLPDatum.__init__(self)
        self.id = None
        self.size = len(sentences)
        self.sentences = sentences
        # easily access token attributes from all sentences
        self.words = list(chain(*[s.words for s in self.sentences]))
        self.tags = list(chain(*[s.tags for s in self.sentences]))
        self.lemmas = list(chain(*[s.lemmas for s in self.sentences]))
        self._entities = list(chain(*[s._entities for s in self.sentences]))
        self.nes = self._merge_ne_dicts()
        self.bag_of_labeled_deps = list(chain(*[s.dependencies.labeled for s in self.sentences]))
        self.bag_of_unlabeled_deps = list(chain(*[s.dependencies.unlabeled for s in self.sentences]))
        self.text = None

    def __hash__(self):
        return hash(self.to_JSON())

    def __unicode__(self):
        return self.text

    def __str__(self):
        return "Document w/ {} Sentence{}".format(self.size, "" if self.size == 1 else "s")

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.to_JSON() == other.to_JSON()
        else:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def bag_of_labeled_dependencies_using(self, form):
        return list(chain(*[s.labeled_dependencies_from_tokens(s._get_tokens(form)) for s in self.sentences]))

    def bag_of_unlabeled_dependencies_using(self, form):
        return list(chain(*[s.unlabeled_dependencies_from_tokens(s._get_tokens(form)) for s in self.sentences]))

    def _merge_ne_dicts(self):
        # Get the set of all NE labels found in the Doc's sentences
        entity_labels = set(chain(*[s.nes.keys() for s in self.sentences]))
        # Do we have any labels?  (a set is never None, so test for emptiness)
        if not entity_labels:
            return None
        # If we have labels, consolidate the NEs under the appropriate label
        else:
            nes_dict = dict()
            for e in entity_labels:
                entities = []
                for s in self.sentences:
                    entities += s.nes[e]
                nes_dict[e] = entities
            return nes_dict

    def to_JSON_dict(self):
        doc_dict = dict()
        doc_dict["sentences"] = [s.to_JSON_dict() for s in self.sentences]
        doc_dict["text"] = self.text
        # the ID is optional
        if self.id is not None:
            doc_dict["id"] = self.id
        return doc_dict

    @staticmethod
    def load_from_JSON(json_dict):
        sentences = []
        for s in json_dict["sentences"]:
            kwargs = {
                "words": s["words"],
                "startOffsets": s["startOffsets"],
                "endOffsets": s["endOffsets"],
                "tags": s.get("tags", None),
                "lemmas": s.get("lemmas", None),
                "chunks": s.get("chunks", None),
                "entities": s.get("entities", None),
                "graphs": s.get("graphs", None)
            }
            sent = Sentence(**kwargs)
            sentences.append(sent)
        doc = Document(sentences)
        # set id and text
        doc.text = json_dict.get("text", None)
        doc.id = json_dict.get("id", None)
        return doc
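
    # Round-trip sketch (illustrative): serialization is symmetric with
    # load_from_JSON, so the following hypothetical usage holds:
    #     restored = Document.load_from_JSON(doc.to_JSON_dict())
    #     restored == doc   # -> True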


class Sentence(NLPDatum):

    """
    Storage class for an annotated sentence. Based on [`org.clulab.processors.Sentence`](https://github.com/clulab/processors/blob/master/main/src/main/scala/org/clulab/processors/Sentence.scala)

    Parameters
    ----------
    text : str or None
        The text of the `Sentence`.

    words : [str]
        A list of the `Sentence`'s tokens.

    startOffsets : [int]
        The character offsets starting each token (inclusive).

    endOffsets : [int]
        The character offsets marking the end of each token (exclusive).

    tags : [str]
        A list of the `Sentence`'s tokens represented using part of speech (PoS) tags.

    lemmas : [str]
        A list of the `Sentence`'s tokens represented using lemmas.

    chunks : [str]
        A list of the `Sentence`'s tokens represented using IOB-style phrase labels (ex. `B-NP`, `I-NP`, `B-VP`, etc.).

    entities : [str]
        A list of the `Sentence`'s tokens represented using IOB-style named entity (NE) labels.

    graphs : dict
        A dictionary of {graph-name -> {edges: [{source, destination, relation}], roots: [int]}}

    Attributes
    ----------
    text : str
        The text of the `Sentence`.

    startOffsets : [int]
        The character offsets starting each token (inclusive).

    endOffsets : [int]
        The character offsets marking the end of each token (exclusive).

    length : int
        The number of tokens in the `Sentence`.

    graphs : dict
        A dictionary (str -> `processors.ds.DirectedGraph`) mapping the graph type/name to a `processors.ds.DirectedGraph`.

    basic_dependencies : processors.ds.DirectedGraph
        A `processors.ds.DirectedGraph` using basic Stanford dependencies.

    collapsed_dependencies : processors.ds.DirectedGraph
        A `processors.ds.DirectedGraph` using collapsed Stanford dependencies.

    dependencies : processors.ds.DirectedGraph
        A pointer to the preferred syntactic dependency graph type for this `Sentence`.

    _entities : [str]
        The IOB-style named entity (NE) labels corresponding to each token.

    _chunks : [str]
        The IOB-style chunk labels corresponding to each token.

    nes : dict
        A dictionary mapping each NE label found in the `Sentence` to a list of corresponding text spans (ex. {"PERSON": [phrase 1, ..., phrase n]}). Built from `Sentence._entities`.

    phrases : dict
        A dictionary mapping each chunk label found in the `Sentence` to a list of corresponding text spans (ex. {"NP": [phrase 1, ..., phrase n]}). Built from `Sentence._chunks`.

    Methods
    -------
    bag_of_labeled_dependencies_using(form)
        Produces a list of syntactic dependencies where each edge is labeled with its grammatical relation.

    bag_of_unlabeled_dependencies_using(form)
        Produces a list of syntactic dependencies where each edge is stripped of its grammatical relation.
    """

    UNKNOWN = LabelManager.UNKNOWN
    # the O in IOB notation
    O = LabelManager.O

    def __init__(self, **kwargs):
        NLPDatum.__init__(self)
        self.words = kwargs["words"]
        self.startOffsets = kwargs["startOffsets"]
        self.endOffsets = kwargs["endOffsets"]
        self.length = len(self.words)
        self.tags = self._set_toks(kwargs.get("tags", None))
        self.lemmas = self._set_toks(kwargs.get("lemmas", None))
        self._chunks = self._set_toks(kwargs.get("chunks", None))
        self._entities = self._set_toks(kwargs.get("entities", None))
        self.text = kwargs.get("text", None) or " ".join(self.words)
        self.graphs = self._build_directed_graph_from_dict(kwargs.get("graphs", None))
        self.basic_dependencies = self.graphs.get(DirectedGraph.STANFORD_BASIC_DEPENDENCIES, None)
        self.collapsed_dependencies = self.graphs.get(DirectedGraph.STANFORD_COLLAPSED_DEPENDENCIES, None)
        self.dependencies = self.collapsed_dependencies if self.collapsed_dependencies is not None else self.basic_dependencies
        # IOB tokens -> {label: [phrase 1, ..., phrase n]}
        self.nes = self._handle_iob(self._entities)
        self.phrases = self._handle_iob(self._chunks)

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.to_JSON() == other.to_JSON()
        else:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        return hash(self.to_JSON())

    def _get_tokens(self, form):
        f = form.lower()
        if f == "words":
            tokens = self.words
        elif f == "tags":
            tokens = self.tags
        elif f == "lemmas":
            tokens = self.lemmas
        elif f == "entities":
            tokens = self.nes
        elif f == "index":
            tokens = list(range(self.length))
        # unrecognized form
        else:
            raise ValueError("form must be 'words', 'tags', 'lemmas', 'entities', or 'index'")
        return tokens

    def _set_toks(self, toks):
        return toks if toks else [Sentence.UNKNOWN]*self.length

    def _handle_iob(self, iob):
        """
        Consolidates consecutive tokens in IOB notation under the appropriate label.
        Regexes strip the "B-"/"I-" prefixes used in IOB notation (ex. by the bionlp annotator).
        """
        entity_dict = defaultdict(list)
        # initialize to the empty label
        current = Sentence.O
        start = None
        end = None
        for i, tok in enumerate(iob):
            # no tag: we're outside any entity
            if tok == Sentence.O:
                # did we have an entity with the last token?
                current = re.sub('(B-|I-)', '', str(current))
                if current == Sentence.O:
                    continue
                else:
                    # the last sequence has ended
                    end = i
                    # store the entity
                    named_entity = ' '.join(self.words[start:end])
                    entity_dict[current].append(named_entity)
                    # reset our book-keeping vars
                    current = Sentence.O
                    start = None
                    end = None
            # we have a tag!
            else:
                # our old sequence continues
                current = re.sub('(B-|I-)', '', str(current))
                tok = re.sub('(B-|I-)', '', str(tok))
                if tok == current:
                    end = i
                # our old sequence has ended
                else:
                    # do we have a previous NE?
                    if current != Sentence.O:
                        end = i
                        named_entity = ' '.join(self.words[start:end])
                        entity_dict[current].append(named_entity)
                    # update our book-keeping vars
                    current = tok
                    start = i
                    end = None
        # flush an entity that runs through the end of the sequence;
        # without this, a sentence-final entity would be silently dropped
        # (the UNKNOWN check preserves the unannotated case, where every token is UNKNOWN)
        if current != Sentence.O and current != str(Sentence.UNKNOWN) and start is not None:
            entity_dict[current].append(' '.join(self.words[start:]))
        # this might be empty
        return entity_dict
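
    # Illustrative example (assumed inputs): for words
    # ["Barack", "Obama", "visited", "Tucson"] with IOB labels
    # ["B-PERSON", "I-PERSON", "O", "B-LOCATION"], _handle_iob returns
    # {"PERSON": ["Barack Obama"], "LOCATION": ["Tucson"]}
    # (the sentence-final "Tucson" is captured by the flush above).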

    def _build_directed_graph_from_dict(self, graphs):
        deps_dict = dict()
        if graphs:
            # process each stored graph
            for (kind, deps) in graphs.items():
                deps_dict[kind] = DirectedGraph(kind, deps, self.words)
        # always return a dict (possibly empty) so callers can safely use .get()
        return deps_dict

    def __unicode__(self):
        return self.text

    def to_string(self):
        return ' '.join("{w}__{p}".format(w=self.words[i], p=self.tags[i]) for i in range(self.length))

    def bag_of_labeled_dependencies_using(self, form):
        """
        Produces a list of syntactic dependencies
        where each edge is labeled with its grammatical relation.
        """
        tokens = self._get_tokens(form)
        return self.labeled_dependencies_from_tokens(tokens) if tokens else None

    def bag_of_unlabeled_dependencies_using(self, form):
        """
        Produces a list of syntactic dependencies
        where each edge is stripped of its grammatical relation.
        """
        tokens = self._get_tokens(form)
        return self.unlabeled_dependencies_from_tokens(tokens) if tokens else None

    def labeled_dependencies_from_tokens(self, tokens):
        """
        Generates a list of labeled dependencies for a sentence
        using the provided tokens.
        """
        deps = self.dependencies
        return [(tokens[out], rel, tokens[dest])
                for out in deps.outgoing
                for (dest, rel) in deps.outgoing[out]]

    def unlabeled_dependencies_from_tokens(self, tokens):
        """
        Generates a list of unlabeled dependencies for a sentence
        using the provided tokens.
        """
        return [(head, dep) for (head, rel, dep) in self.labeled_dependencies_from_tokens(tokens)]
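
    # Illustrative example (assumed annotations): for "Gus ate pizza" with a
    # basic dependency graph, bag_of_labeled_dependencies_using("lemmas") might
    # yield [("eat", "nsubj", "gus"), ("eat", "dobj", "pizza")], while the
    # unlabeled variant drops the relation: [("eat", "gus"), ("eat", "pizza")].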

    def semantic_head(self, graph_name="stanford-collapsed", valid_tags={r"^N", "VBG"}, valid_indices=None):
        return HeadFinder.semantic_head(self, graph_name, valid_tags, valid_indices)

    def to_JSON_dict(self):
        sentence_dict = dict()
        sentence_dict["words"] = self.words
        sentence_dict["startOffsets"] = self.startOffsets
        sentence_dict["endOffsets"] = self.endOffsets
        sentence_dict["tags"] = self.tags
        sentence_dict["lemmas"] = self.lemmas
        sentence_dict["entities"] = self._entities
        # serialize chunks too, so they survive a round trip through JSON
        sentence_dict["chunks"] = self._chunks
        # add graphs
        sentence_dict["graphs"] = dict()
        for (kind, graph) in self.graphs.items():
            sentence_dict["graphs"][kind] = graph._graph_to_JSON_dict()
        return sentence_dict

    @staticmethod
    def load_from_JSON(json_dict):
        sent = Sentence(
                    words=json_dict["words"],
                    startOffsets=json_dict["startOffsets"],
                    endOffsets=json_dict["endOffsets"],
                    lemmas=json_dict.get("lemmas", None),
                    tags=json_dict.get("tags", None),
                    chunks=json_dict.get("chunks", None),
                    entities=json_dict.get("entities", None),
                    text=json_dict.get("text", None),
                    graphs=json_dict.get("graphs", None)
                    )
        return sent


class Edge(NLPDatum):

    def __init__(self, source, destination, relation):
        NLPDatum.__init__(self)
        self.source = source
        self.destination = destination
        self.relation = relation

    def __unicode__(self):
        return self.to_string()

    def to_string(self):
        return "Edge(source: {}, destination: {}, relation: {})".format(self.source, self.destination, self.relation)

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.to_JSON() == other.to_JSON()
        else:
            return False

    # mirror the equality conventions of the other NLPDatum subclasses
    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        return hash(self.to_JSON())

    def to_JSON_dict(self):
        edge_dict = dict()
        edge_dict["source"] = self.source
        edge_dict["destination"] = self.destination
        edge_dict["relation"] = self.relation
        return edge_dict


class DirectedGraph(NLPDatum):

    """
    Storage class for directed graphs.

    Parameters
    ----------
    kind : str
        The name of the directed graph.

    deps : dict
        A dictionary of {edges: [{source, destination, relation}], roots: [int]}

    words : [str]
        A list of the word form of the tokens from the originating `Sentence`.

    Attributes
    ----------
    _words : [str]
        A lowercased list of the word form of the tokens from the originating `Sentence`.

    roots : [int]
        A list of indices for the syntactic dependency graph's roots.  Generally this is a single token index.

    edges : [processors.ds.Edge]
        A list of `processors.ds.Edge`

    incoming : dict
        A dictionary of {int -> [(int, str)]} mapping each node to its incoming edges as (source index, relation) pairs.

    outgoing : dict
        A dictionary of {int -> [(int, str)]} mapping each node to its outgoing edges as (destination index, relation) pairs.

    labeled : [str]
        A list of strings where each element encodes an edge as the source word, uppercased relation, and destination word ("source_RELATION_destination").

    unlabeled : [str]
        A list of strings where each element encodes an edge as the source word and destination word ("source_destination").

    directed_graph : networkx graph
        A `networkx` representation of the `DirectedGraph`.  Used by `shortest_path`.

    undirected_graph : networkx graph
        An undirected view of `directed_graph`.

    Methods
    -------
    shortest_paths(start, end)
        Finds the shortest paths between the provided start and end nodes.

    shortest_path(start, end, scoring_func)
        Finds the shortest path between the provided start and end nodes, as ranked by `scoring_func`.
    """
    STANFORD_BASIC_DEPENDENCIES = "stanford-basic"
    STANFORD_COLLAPSED_DEPENDENCIES = "stanford-collapsed"

    def __init__(self, kind, deps, words):
        NLPDatum.__init__(self)
        self._words = [w.lower() for w in words]
        self.kind = kind
        self.roots = deps.get("roots", [])
        self.edges = [Edge(e["source"], e["destination"], e["relation"]) for e in deps["edges"]]
        self.incoming = self._build_incoming(self.edges)
        self.outgoing = self._build_outgoing(self.edges)
        self.labeled = self._build_labeled()
        self.unlabeled = self._build_unlabeled()
        self.directed_graph = DependencyUtils.build_networkx_graph(roots=self.roots, edges=self.edges, name=self.kind, reverse=False)
        self.undirected_graph = self.directed_graph.to_undirected()

    def __unicode__(self):
        # return a string (not the raw list of edges)
        return "; ".join(e.to_string() for e in self.edges)

    def __eq__(self, other):
        if isinstance(other, self.__class__):
            return self.to_JSON() == other.to_JSON()
        else:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def __hash__(self):
        return hash(self.to_JSON())

    def shortest_paths(self, start, end):
        """
        Finds the shortest paths in the syntactic dependency graph
        between the provided start and end nodes.

        Parameters
        ----------
        start : int or [int]
            A single token index or list of token indices serving as the start of the graph traversal.

        end : int or [int]
            A single token index or list of token indices serving as the end of the graph traversal.

        See Also
        --------
        `processors.paths.DependencyUtils.shortest_path`
        """
        paths = DependencyUtils.shortest_paths(self.undirected_graph, start, end)
        return None if not paths else [DependencyUtils.retrieve_edges(self, path) for path in paths]

    def shortest_path(self, start, end, scoring_func=lambda path: -len(path)):
        """
        Finds the shortest path in the syntactic dependency graph
        between the provided start and end nodes.

        Parameters
        ----------
        start : int or [int]
            A single token index or list of token indices serving as the start of the graph traversal.

        end : int or [int]
            A single token index or list of token indices serving as the end of the graph traversal.

        scoring_func : function
            A function that scores each path, where each path has the form [(source index, relation, destination index)].
            The path with the maximum score will be returned.

        See Also
        --------
        `processors.paths.DependencyUtils.shortest_path`
        """
        paths = self.shortest_paths(start, end)
        return None if not paths else max(paths, key=scoring_func)
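
    # Illustrative usage (hypothetical graph): with the default scoring_func,
    # shorter paths win, since -len(path) is maximized; e.g.
    #     dg.shortest_path(start=0, end=3)
    # returns a single path of edges between tokens 0 and 3, or None when no
    # path exists.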

    def degree_centrality(self):
        """
        Compute the degree centrality for nodes.

        See Also
        --------
        https://networkx.github.io/documentation/development/reference/algorithms.centrality.html
        """
        return Counter(nx.degree_centrality(self.directed_graph))

    def in_degree_centrality(self):
        """
        Compute the in-degree centrality for nodes.

        See Also
        --------
        https://networkx.github.io/documentation/development/reference/algorithms.centrality.html
        """
        return Counter(nx.in_degree_centrality(self.directed_graph))

    def out_degree_centrality(self):
        """
        Compute the out-degree centrality for nodes.

        See Also
        --------
        https://networkx.github.io/documentation/development/reference/algorithms.centrality.html
        """
        return Counter(nx.out_degree_centrality(self.directed_graph))

    def pagerank(self,
                 alpha=0.85,
                 personalization=None,
                 max_iter=1000,
                 tol=1e-06,
                 nstart=None,
                 weight='weight',
                 dangling=None,
                 use_directed=True,
                 reverse=True):
        """
        Measures node activity in a `networkx` graph using a thin wrapper around the `networkx` implementation of the PageRank algorithm (see `networkx.algorithms.link_analysis.pagerank`).  Use with `processors.ds.DirectedGraph.directed_graph`.
        Note that by default, the directed graph is reversed in order to highlight predicate-argument nodes (refer to the PageRank algorithm to understand why).

        See Also
        --------
        `processors.paths.DependencyUtils.pagerank`
        Method parameters correspond to those of [`networkx.algorithms.link_analysis.pagerank`](https://networkx.github.io/documentation/development/reference/generated/networkx.algorithms.link_analysis.pagerank_alg.pagerank.html#networkx.algorithms.link_analysis.pagerank_alg.pagerank)
        """
        # reverse the directed graph if requested
        dg = self.directed_graph if not reverse else DependencyUtils.build_networkx_graph(roots=self.roots, edges=self.edges, name=self.kind, reverse=True)
        # determine which graph to use
        graph = dg if use_directed else self.undirected_graph
        return DependencyUtils.pagerank(graph, alpha=alpha, personalization=personalization, max_iter=max_iter, tol=tol, nstart=nstart, weight=weight, dangling=dangling)
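
    # Illustrative usage (hypothetical; assumes the result is a dict-like
    # mapping of node -> score): rank token indices by activity in the
    # (reversed) dependency graph:
    #     scores = dg.pagerank()
    #     top = sorted(scores, key=scores.get, reverse=True)[:3]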

    def _build_incoming(self, edges):
        dep_dict = defaultdict(list)
        for edge in edges:
            dep_dict[edge.destination].append((edge.source, edge.relation))
        return dep_dict

    def _build_outgoing(self, edges):
        dep_dict = defaultdict(list)
        for edge in edges:
            dep_dict[edge.source].append((edge.destination, edge.relation))
        return dep_dict

    def _build_labeled(self):
        labeled = []
        for out in self.outgoing:
            for (dest, rel) in self.outgoing[out]:
                labeled.append("{}_{}_{}".format(self._words[out], rel.upper(), self._words[dest]))
        return labeled

    def _build_unlabeled(self):
        unlabeled = []
        for out in self.outgoing:
            for (dest, _) in self.outgoing[out]:
                unlabeled.append("{}_{}".format(self._words[out], self._words[dest]))
        return unlabeled
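
    # Illustrative example (assumed edge): over words ["Gus", "ate"]
    # (lowercased internally), Edge(source=1, destination=0, relation="nsubj")
    # contributes "ate_NSUBJ_gus" to `labeled` and "ate_gus" to `unlabeled`.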

    def _graph_to_JSON_dict(self):
        dg_dict = dict()
        dg_dict["edges"] = [e.to_JSON_dict() for e in self.edges]
        dg_dict["roots"] = self.roots
        return dg_dict

    def to_JSON_dict(self):
        return {self.kind: self._graph_to_JSON_dict()}


class Interval(NLPDatum):
    """
    Defines a token or character span.

    Parameters
    ----------
    start : int
        The token or character index where the interval begins (inclusive).

    end : int
        One past the index of the last token/character in the span (i.e., exclusive).
    """

    def __init__(self, start, end):
        NLPDatum.__init__(self)
        self.start = start
        self.end = end

    def to_JSON_dict(self):
        return {"start": self.start, "end": self.end}

    @staticmethod
    def load_from_JSON(json_dict):
        return Interval(start=json_dict["start"], end=json_dict["end"])
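
# ----------------------------------------------------------------------
# Usage sketch (illustrative only; the payload below is a hypothetical,
# minimal example of the processors-server format consumed by
# Document.load_from_JSON):
#
#     payload = {
#         "sentences": [{
#             "words": ["I", "like", "turtles", "."],
#             "startOffsets": [0, 2, 7, 14],
#             "endOffsets": [1, 6, 14, 15]
#         }]
#     }
#     doc = Document.load_from_JSON(payload)
#     print(doc)                        # -> Document w/ 1 Sentence
#     compact = doc.to_JSON(pretty=False)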
702