processors.annotators   A
last analyzed

Complexity

Total Complexity 22

Size/Duplication

Total Lines 202
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 22
eloc 68
dl 0
loc 202
rs 10
c 0
b 0
f 0

20 Methods

Rating   Name   Duplication   Size   Complexity  
A Processor.annotate_from_sentences() 0 22 2
A FastNLPProcessor.__init__() 0 4 1
A Processor._annotate_message() 0 3 1
A SegmentedMessage.to_JSON_dict() 0 4 1
A SegmentedMessage.__init__() 0 2 1
A CluProcessor.__init__() 0 2 1
A Processor.annotate() 0 23 2
A Message.__init__() 0 2 1
A FastNLPProcessor.chunk_document() 0 3 1
A SegmentedMessage.to_JSON() 0 2 1
A Message.to_JSON() 0 2 1
A FastNLPProcessor._chunk() 0 2 1
A BioNLPProcessor.__init__() 0 2 1
A Message.to_JSON_dict() 0 4 1
A BioNLPProcessor.annotate() 0 2 1
A CluProcessor.annotate() 0 2 1
A Processor._message_to_json_dict() 0 2 1
A FastNLPProcessor.annotate() 0 2 1
A FastNLPProcessor.chunk_sentence() 0 3 1
A Processor.__init__() 0 2 1
1
# -*- coding: utf-8 -*-
2
3
# use data structures
4
from __future__ import unicode_literals
5
from processors.ds import Document, Sentence, DirectedGraph
6
from processors.utils import post_json
7
import json
8
9
10
class Processor(object):
    """
    Base Processor for text annotation (tokenization, sentence splitting,
    parsing, lemmatization, PoS tagging, named entity recognition, chunking, etc.).

    Parameters
    ----------
    address : str
        The base address for the API (i.e., everything preceding `/api/..`)

    Attributes
    ----------
    service : str
        The API endpoint for `annotate` requests.

    Methods
    -------
    annotate(text)
        Produces an annotated `Document` from the provided text.
    annotate_from_sentences(sentences)
        Produces an annotated `Document` from a [str] of text already split into sentences.

    """

    def __init__(self, address):
        self.service = "{}/api/annotate".format(address)

    def _message_to_json_dict(self, msg):
        """
        POST the serialized message to `self.service` and return the response.

        Parameters
        ----------
        msg : Message or SegmentedMessage
            Any object exposing a `to_JSON()` method.

        Returns
        -------
        object
            Whatever `post_json` returns (presumably the decoded json response;
            confirm against `processors.utils.post_json`).
        """
        return post_json(self.service, msg.to_JSON())

    def _annotate_message(self, msg):
        """
        Send `msg` to the annotation endpoint and build a `Document` from the response.

        Exceptions raised by the request or by `Document.load_from_JSON`
        propagate to the caller.
        """
        # Reuse the single request helper rather than repeating the POST here.
        return Document.load_from_JSON(self._message_to_json_dict(msg))

    def _log_failure(self, action):
        """
        Log the exception currently being handled for the given `action`.

        Must be called from inside an `except` block so that `exc_info=True`
        picks up the active exception and its traceback.
        """
        # Imported locally so this class does not depend on a module-level import.
        import logging
        logging.getLogger(__name__).warning(
            "%s request to %s failed",
            action,
            getattr(self, "service", None),
            exc_info=True
        )

    def annotate(self, text):
        """
        Annotate text (tokenization, sentence splitting,
        parsing, lemmatization, PoS tagging, named entity recognition, chunking, etc.)

        Parameters
        ----------
        text : str
            `text` to be annotated.

        Returns
        -------
        processors.ds.Document or None
            An annotated Document composed of `sentences`, or None if the
            request or the parsing of its response failed (the failure is logged).
        """
        try:
            # load json and build Sentences and Document
            return self._annotate_message(Message(text))
        except Exception:
            # Best-effort contract: callers receive None instead of an exception,
            # but the cause is logged rather than silently discarded.
            self._log_failure("annotate")
            return None

    def annotate_from_sentences(self, sentences):
        """
        Annotate text that has already been segmented into `sentences`.

        Parameters
        ----------
        sentences : [str]
            A list of str representing text already split into sentences.

        Returns
        -------
        processors.ds.Document or None
            An annotated `Document` composed of `sentences`, or None if the
            request or the parsing of its response failed (the failure is logged).
        """
        try:
            # load json from str iterable and build Sentences and Document
            return self._annotate_message(SegmentedMessage(sentences))
        except Exception:
            # Same best-effort contract as `annotate`.
            self._log_failure("annotate_from_sentences")
            return None
90
91
class CluProcessor(Processor):
    """
    Processor for text annotation based on [`org.clulab.processors.clu.CluProcessor`](https://github.com/clulab/processors/blob/master/main/src/main/scala/org/clulab/processors/clu/CluProcessor.scala)

    Uses the Malt parser.

    Parameters
    ----------
    address : str
        The base address for the API (i.e., everything preceding `/api/..`)

    Attributes
    ----------
    service : str
        The API endpoint for `annotate` requests (`<address>/api/clu/annotate`).
    """

    def __init__(self, address):
        super(CluProcessor, self).__init__(address)
        # Point at the clu-specific endpoint instead of the base one.
        self.service = "{}/api/clu/annotate".format(address)

    def annotate(self, text):
        """
        Annotate `text` via the clu endpoint.

        Delegates to `Processor.annotate`; see that method for the return contract.
        """
        return super(CluProcessor, self).annotate(text)
103
104
105
class FastNLPProcessor(Processor):
    """
    Processor for text annotation based on [`org.clulab.processors.fastnlp.FastNLPProcessor`](https://github.com/clulab/processors/blob/master/corenlp/src/main/scala/org/clulab/processors/fastnlp/FastNLPProcessor.scala)

    Uses the Stanford CoreNLP neural network parser.

    Parameters
    ----------
    address : str
        The base address for the API (i.e., everything preceding `/api/..`)

    Attributes
    ----------
    address : str
        The base address this processor was constructed with.
    service : str
        The API endpoint for `annotate` requests (`<address>/api/fastnlp/annotate`).
    chunk_address : str
        The API endpoint for chunking requests (`<address>/api/fastnlp/chunk`).

    Methods
    -------
    annotate(text)
        Produces an annotated `Document` from the provided text.
    chunk_sentence(sentence)
        Sends a sentence to the chunk endpoint and rebuilds a `Sentence` from the response.
    chunk_document(doc)
        Sends a document to the chunk endpoint and rebuilds a `Document` from the response.
    """

    def __init__(self, address):
        super(FastNLPProcessor, self).__init__(address)
        self.address = address
        # Override the base endpoint with the fastnlp-specific ones.
        self.service = "{}/api/fastnlp/annotate".format(address)
        self.chunk_address = "{}/api/fastnlp/chunk".format(self.address)

    def annotate(self, text):
        """
        Annotate `text` via the fastnlp endpoint.

        Delegates to `Processor.annotate`; see that method for the return contract.
        """
        return super(FastNLPProcessor, self).annotate(text)

    def _chunk(self, obj):
        """
        POST `obj.to_JSON()` to the chunk endpoint and return the response.

        `obj` may be anything exposing a `to_JSON()` method.
        """
        return post_json(self.chunk_address, obj.to_JSON())

    def chunk_sentence(self, sentence):
        """
        Send `sentence` to the chunk endpoint.

        Parameters
        ----------
        sentence : object with `to_JSON()`
            Presumably a `processors.ds.Sentence`; confirm against callers.

        Returns
        -------
        The result of `Sentence.load_from_JSON` applied to the response.
        Unlike `annotate`, errors are not caught here and propagate to the caller.
        """
        res = self._chunk(sentence)
        return Sentence.load_from_JSON(res)

    def chunk_document(self, doc):
        """
        Send `doc` to the chunk endpoint.

        Parameters
        ----------
        doc : object with `to_JSON()`
            Presumably a `processors.ds.Document`; confirm against callers.

        Returns
        -------
        The result of `Document.load_from_JSON` applied to the response.
        Unlike `annotate`, errors are not caught here and propagate to the caller.
        """
        res = self._chunk(doc)
        return Document.load_from_JSON(res)
131
132
133
# NOTE(review): the docstring link below points at FastNLPProcessor, not at a
# BioNLP-specific class -- confirm whether this is intentional or a copy/paste slip.
class BioNLPProcessor(Processor):
    """
    Processor for biomedical text annotation based on [`org.clulab.processors.fastnlp.FastNLPProcessor`](https://github.com/clulab/processors/blob/master/corenlp/src/main/scala/org/clulab/processors/fastnlp/FastNLPProcessor.scala)

    CoreNLP-derived annotator.

    Parameters
    ----------
    address : str
        The base address for the API (i.e., everything preceding `/api/..`)

    Attributes
    ----------
    service : str
        The API endpoint for `annotate` requests (`<address>/api/bionlp/annotate`).
    """

    def __init__(self, address):
        super(BioNLPProcessor, self).__init__(address)
        # Point at the bionlp-specific endpoint instead of the base one.
        self.service = "{}/api/bionlp/annotate".format(address)

    def annotate(self, text):
        """
        Annotate `text` via the bionlp endpoint.

        Delegates to `Processor.annotate`; see that method for the return contract.
        """
        return super(BioNLPProcessor, self).annotate(text)
147
148
149
class Message(object):
    """
    A storage class for passing `text` to API `annotate` endpoint.

    Attributes
    ----------
    text : str
        The `text` to be annotated.

    Methods
    -------
    to_JSON_dict()
        Produces a dict of the form `{"text": <text>}`.
    to_JSON()
        Produces a json str in the structure expected by the API `annotate` endpoint.

    """

    def __init__(self, text):
        self.text = text

    def to_JSON_dict(self):
        """Return the message as a dict with the single key `"text"`."""
        return {"text": self.text}

    def to_JSON(self):
        """Serialize `to_JSON_dict()` to a json str (sorted keys, 4-space indent)."""
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=4)
175
176
177
class SegmentedMessage(object):
    """
    A storage class for passing text already split into sentences to API `annotate` endpoint.

    Attributes
    ----------
    segments : [str]
        Text to be annotated that has already been split into sentences.  This segmentation is preserved during annotation.

    Methods
    -------
    to_JSON_dict()
        Produces a dict of the form `{"segments": <segments>}`.
    to_JSON()
        Produces a json str in the structure expected by the API `annotate` endpoint.

    """

    def __init__(self, segments):
        self.segments = segments

    def to_JSON_dict(self):
        """Return the message as a dict with the single key `"segments"`."""
        return {"segments": self.segments}

    def to_JSON(self):
        """Serialize `to_JSON_dict()` to a json str (sorted keys, 4-space indent)."""
        return json.dumps(self.to_JSON_dict(), sort_keys=True, indent=4)
202