Completed
Push — master ( 4c1319...a3584a )
by Gus
34s queued 23s
created

FastNLPProcessor.chunk_sentence()   A

Complexity

Conditions 1

Size

Total Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
c 0
b 0
f 0
dl 0
loc 3
rs 10
1
#!/usr/bin/env python
2
# -*- coding: utf-8 -*-
3
4
# use data structures
5
from __future__ import unicode_literals
6
from processors.ds import Document, Sentence, DirectedGraph
7
from processors.utils import post_json
8
import json
9
10
11
class Processor(object):
    """
    Base Processor for text annotation (tokenization, sentence splitting,
    parsing, lemmatization, PoS tagging, named entity recognition, chunking, etc.).

    Parameters
    ----------
    address : str
        The base address for the API (i.e., everything preceding `/api/..`)

    Attributes
    ----------
    service : str
        The API endpoint for `annotate` requests.

    Methods
    -------
    annotate(text)
        Produces an annotated `Document` from the provided text.
    annotate_from_sentences(sentences)
        Produces an annotated `Document` from a [str] of text already split into sentences.

    """
    def __init__(self, address):
        self.service = "{}/api/annotate".format(address)

    def _message_to_json_dict(self, msg):
        """
        POST the JSON form of `msg` to `self.service`.

        Parameters
        ----------
        msg : object
            Any object exposing a `to_JSON()` method.

        Returns
        -------
        object
            Whatever `post_json` returns for the request.
        """
        return post_json(self.service, msg.to_JSON())

    def _annotate_message(self, msg):
        """
        Send `msg` to the service and build a `Document` from the response.

        Parameters
        ----------
        msg : object
            Any object exposing a `to_JSON()` method.

        Returns
        -------
        processors.ds.Document
            The result of `Document.load_from_JSON` applied to the response.
        """
        # Reuse the request helper instead of duplicating the POST call.
        annotated_text = self._message_to_json_dict(msg)
        return Document.load_from_JSON(annotated_text)

    def _log_failure(self, action):
        """
        Record the exception currently being handled at DEBUG level.

        Must be called from inside an `except` block so that `exc_info=True`
        picks up the active exception.

        Parameters
        ----------
        action : str
            Name of the operation that failed (used in the log message).
        """
        # Imported locally so the class does not rely on a module-level import.
        import logging
        logging.getLogger(__name__).debug(
            "%s failed for service %s", action, self.service, exc_info=True
        )

    def annotate(self, text):
        """
        Annotate text (tokenization, sentence splitting,
        parsing, lemmatization, PoS tagging, named entity recognition, chunking, etc.)

        Parameters
        ----------
        text : str
            `text` to be annotated.

        Returns
        -------
        processors.ds.Document or None
            An annotated Document composed of `sentences`, or None if any
            step (building the message, the request, or parsing the
            response) raised an exception.
        """
        try:
            # load json and build Sentences and Document
            msg = Message(text)
            return self._annotate_message(msg)
        except Exception:
            # Best-effort by contract: callers receive None on failure.
            # The cause is logged rather than silently discarded.
            self._log_failure("annotate")
            return None

    def annotate_from_sentences(self, sentences):
        """
        Annotate text that has already been segmented into `sentences`.

        Parameters
        ----------
        sentences : [str]
            A list of str representing text already split into sentences.

        Returns
        -------
        processors.ds.Document or None
            An annotated `Document` composed of `sentences`, or None if any
            step (building the message, the request, or parsing the
            response) raised an exception.
        """
        try:
            # load json from str iterable and build Sentences and Document
            msg = SegmentedMessage(sentences)
            return self._annotate_message(msg)
        except Exception:
            # Best-effort by contract: callers receive None on failure.
            # The cause is logged rather than silently discarded.
            self._log_failure("annotate_from_sentences")
            return None
91
92
class CluProcessor(Processor):
    """
    Processor for text annotation based on [`org.clulab.processors.clu.CluProcessor`](https://github.com/clulab/processors/blob/master/main/src/main/scala/org/clulab/processors/clu/CluProcessor.scala)

    Uses the Malt parser.

    Parameters
    ----------
    address : str
        The base address for the API; `/api/clu/annotate` is appended to it.

    Attributes
    ----------
    service : str
        The API endpoint for `annotate` requests.
    """
    def __init__(self, address):
        # Only the endpoint is set here; the parent initializer is not invoked.
        endpoint_template = "{}/api/clu/annotate"
        self.service = endpoint_template.format(address)

    def annotate(self, text):
        """
        Annotate `text` by delegating to the parent class implementation.

        Parameters
        ----------
        text : str
            `text` to be annotated.

        Returns
        -------
        object
            Whatever the parent class `annotate` returns.
        """
        parent = super(CluProcessor, self)
        return parent.annotate(text)
104
105
106
class FastNLPProcessor(Processor):
    """
    Processor for text annotation based on [`org.clulab.processors.fastnlp.FastNLPProcessor`](https://github.com/clulab/processors/blob/master/corenlp/src/main/scala/org/clulab/processors/fastnlp/FastNLPProcessor.scala)

    Uses the Stanford CoreNLP neural network parser.

    Parameters
    ----------
    address : str
        The base address for the API.

    Attributes
    ----------
    address : str
        The base address passed to the constructor.
    service : str
        The API endpoint for `annotate` requests (`<address>/api/fastnlp/annotate`).
    chunk_address : str
        The API endpoint for chunking requests (`<address>/api/fastnlp/chunk`).
    """
    def __init__(self, address):
        self.address = address
        base = self.address
        self.service = "{}/api/fastnlp/annotate".format(base)
        self.chunk_address = "{}/api/fastnlp/chunk".format(base)

    def annotate(self, text):
        """
        Annotate `text` by delegating to the parent class implementation.

        Parameters
        ----------
        text : str
            `text` to be annotated.

        Returns
        -------
        object
            Whatever the parent class `annotate` returns.
        """
        parent = super(FastNLPProcessor, self)
        return parent.annotate(text)

    def _chunk(self, obj):
        """
        POST the JSON form of `obj` (anything with `to_JSON()`) to the chunk endpoint.
        """
        payload = obj.to_JSON()
        return post_json(self.chunk_address, payload)

    def chunk_sentence(self, sentence):
        """
        Send `sentence` to the chunk endpoint and rebuild a `Sentence` from the response.

        Parameters
        ----------
        sentence : object
            An object exposing `to_JSON()`; presumably a `processors.ds.Sentence`.

        Returns
        -------
        processors.ds.Sentence
            The result of `Sentence.load_from_JSON` applied to the response.
        """
        return Sentence.load_from_JSON(self._chunk(sentence))

    def chunk_document(self, doc):
        """
        Send `doc` to the chunk endpoint and rebuild a `Document` from the response.

        Parameters
        ----------
        doc : object
            An object exposing `to_JSON()`; presumably a `processors.ds.Document`.

        Returns
        -------
        processors.ds.Document
            The result of `Document.load_from_JSON` applied to the response.
        """
        return Document.load_from_JSON(self._chunk(doc))
132
133
134
class BioNLPProcessor(Processor):
    """
    Processor for biomedical text annotation based on [`org.clulab.processors.fastnlp.FastNLPProcessor`](https://github.com/clulab/processors/blob/master/corenlp/src/main/scala/org/clulab/processors/fastnlp/FastNLPProcessor.scala)

    CoreNLP-derived annotator.

    Parameters
    ----------
    address : str
        The base address for the API; `/api/bionlp/annotate` is appended to it.

    Attributes
    ----------
    service : str
        The API endpoint for `annotate` requests.
    """
    # NOTE(review): the docstring link points at the FastNLP Scala class while
    # the endpoint below is `bionlp` -- confirm which upstream class is meant.

    def __init__(self, address):
        # Only the endpoint is set here; the parent initializer is not invoked.
        endpoint_template = "{}/api/bionlp/annotate"
        self.service = endpoint_template.format(address)

    def annotate(self, text):
        """
        Annotate `text` by delegating to the parent class implementation.

        Parameters
        ----------
        text : str
            `text` to be annotated.

        Returns
        -------
        object
            Whatever the parent class `annotate` returns.
        """
        parent = super(BioNLPProcessor, self)
        return parent.annotate(text)
148
149
150
class Message(object):
    """
    A storage class for passing `text` to API `annotate` endpoint.

    Parameters
    ----------
    text : str
        The `text` to be annotated.

    Attributes
    ----------
    text : str
        The `text` to be annotated.

    Methods
    -------
    to_JSON_dict()
        Produces a dict with the single key `text`.
    to_JSON()
        Produces a json str in the structure expected by the API `annotate` endpoint.

    """
    def __init__(self, text):
        self.text = text

    def to_JSON_dict(self):
        """Return the payload as a dict mapping `"text"` to the stored text."""
        return {"text": self.text}

    def to_JSON(self):
        """Serialize `to_JSON_dict()` as a 4-space-indented json str with sorted keys."""
        payload = self.to_JSON_dict()
        return json.dumps(payload, indent=4, sort_keys=True)
176
177
178
class SegmentedMessage(object):
    """
    A storage class for passing text already split into sentences to API `annotate` endpoint.

    Parameters
    ----------
    segments : [str]
        Text to be annotated that has already been split into sentences.

    Attributes
    ----------
    segments : [str]
        Text to be annotated that has already been split into sentences.  This segmentation is preserved during annotation.

    Methods
    -------
    to_JSON_dict()
        Produces a dict with the single key `segments`.
    to_JSON()
        Produces a json str in the structure expected by the API `annotate` endpoint.

    """
    def __init__(self, segments):
        self.segments = segments

    def to_JSON_dict(self):
        """Return the payload as a dict mapping `"segments"` to the stored object (not a copy)."""
        return {"segments": self.segments}

    def to_JSON(self):
        """Serialize `to_JSON_dict()` as a 4-space-indented json str with sorted keys."""
        payload = self.to_JSON_dict()
        return json.dumps(payload, indent=4, sort_keys=True)
203