import itertools
import os

import jsonpickle

from topik import tokenizers, transformers, vectorizers, models, visualizers

from ._registry import registered_outputs
from .reader import read_input


def _get_parameters_string(**kwargs):
    """Used to create identifiers for output"""
    _id = ""
    if kwargs:
        _id = "_" + ''.join('{}={}_'.format(key, val)
                            for key, val in sorted(kwargs.items()))[:-1]
    return _id
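
# For illustration (hypothetical keyword arguments): sorted kwargs are joined
# into a single identifier suffix, so
#     _get_parameters_string(min_length=2, stopwords='english')
# returns "_min_length=2_stopwords=english".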


class TopikProject(object):
    def __init__(self, project_name, output_type=None, output_args=None, **kwargs):
        """Class that abstracts persistence.  Drives different output types, and handles
        storing intermediate results to given output type.

        Parameters
        ----------
        output_type : string
            Internal format for handling user data.  Current options are
            present in topik.fileio.registered_outputs.  Default is "InMemoryOutput".
        output_args : dictionary or None
            Configuration to pass through to output.
        synchronous_wait : integer
            Number of seconds to wait for data to finish uploading to output, when
            using an asynchronous output type.  Only relevant for some output types
            ("ElasticSearchOutput", not "InMemoryOutput").
        **kwargs : passed through to superclass __init__.  Not passed to output.
        """
        if output_args is None:
            output_args = {}
        if os.path.exists(project_name + ".topikproject") and output_type is None:
            # A project metafile already exists on disk: restore the prior session.
            with open(project_name + ".topikproject") as project_meta:
                project_data = jsonpickle.decode(project_meta.read())
            kwargs.update(project_data)
            with open(project_name + ".topikdata") as project_data:
                loaded_data = jsonpickle.decode(project_data.read())
            output_type = loaded_data["class"]
            output_args.update(loaded_data["saved_data"])
        self.project_name = project_name
        if output_type is None:
            output_type = "InMemoryOutput"
        # Loading the output here is sufficient to restore all results: the output is
        # responsible for loading them as necessary, and returning iterators or output
        # objects appropriately.
        self.output = registered_outputs[output_type](**output_args)
        # Not used directly, but stored here for persistence purposes.
        self._output_type = output_type
        self._output_args = output_args
        # None or a string expression in Elasticsearch query format.
        self.corpus_filter = kwargs.get("corpus_filter", "")
        # None or a string name.
        self.content_field = kwargs.get("content_field", "")
        # Initially None, set to string value when tokenize or transform method called.
        self._selected_source_field = kwargs.get("_selected_content_field")
        self._selected_tokenized_corpus_id = kwargs.get("_selected_tokenized_corpus_id")
        # Initially None, set to string value when vectorize method called.
        self._selected_vectorized_corpus_id = kwargs.get("_selected_vectorized_corpus_id")
        # Initially None, set to string value when run_model method called.
        self._selected_modeled_corpus_id = kwargs.get("_selected_modeled_corpus_id")
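
    # A minimal usage sketch (file and field names are hypothetical).  Using the
    # project as a context manager saves its state on exit, and constructing it
    # again with the same name restores that state from disk:
    #
    #     with TopikProject("my_project") as project:
    #         project.read_input("reviews.json", content_field="text")
    #
    #     project = TopikProject("my_project")  # reloads my_project.topikproject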

    def __enter__(self):
        """Support use as a context manager."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Save and close the project when a ``with`` block exits."""
        self.close()

    def close(self):
        """Save the project, then close any open file handles or network connections."""
        self.save()
        self.output.close()

    def save(self):
        """Save project as .topikproject metafile and some number of sidecar data files."""
        with open(self.project_name + ".topikproject", "w") as f:
            # jsonpickle.encode returns a string, which is written out here.
            f.write(jsonpickle.encode({
                "_selected_tokenized_corpus_id": self._selected_tokenized_corpus_id,
                "_selected_vectorized_corpus_id": self._selected_vectorized_corpus_id,
                "_selected_modeled_corpus_id": self._selected_modeled_corpus_id,
                "corpus_filter": self.corpus_filter,
                "project_name": self.project_name,
                "output_type": self._output_type,
                "output_args": self._output_args,
                "content_field": self.content_field}))
        self.output.save(self.project_name + ".topikdata")

    def read_input(self, source, content_field, source_type="auto", **kwargs):
        """Import data from external source into Topik's internal format"""
        self.output.import_from_iterable(read_input(source,
                                                    source_type=source_type,
                                                    **kwargs),
                                         field_to_hash=content_field)
        self.content_field = content_field
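
    # e.g. (hypothetical source and field name):
    #     project.read_input("reviews.json", content_field="text")
    # After this call, project.content_field is "text" and the imported documents
    # are available through the selected_filtered_corpus property.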

    def get_filtered_corpus_iterator(self, field=None, filter_expression=None):
        """Get an iterator over corpus documents, optionally restricted by a filter.

        Defaults to this project's content_field and corpus_filter when the
        arguments are not given.
        """
        if field is None:
            field = self.content_field
        if filter_expression is None:
            filter_expression = self.corpus_filter
        return self.output.get_filtered_data(field, filter_expression)

    def get_date_filtered_corpus_iterator(self, start, end, filter_field,
                                          field_to_get=None):
        """Get an iterator over documents whose filter_field value falls between start and end."""
        if field_to_get is None:
            field_to_get = self.content_field
        return self.output.get_date_filtered_data(field_to_get=field_to_get,
                                                  start=start,
                                                  end=end,
                                                  filter_field=filter_field)

    def tokenize(self, method="simple", **kwargs):
        """Break raw text into constituent terms (or collections of terms)"""
        tokenized_corpus = tokenizers.tokenize(self.selected_filtered_corpus,
                                               method=method, **kwargs)
        tokenize_parameter_string = self.corpus_filter + "_tk_{method}{params}".format(
            method=method,
            params=_get_parameters_string(**kwargs))
        # store this output
        self.output.tokenized_corpora[tokenize_parameter_string] = tokenized_corpus
        # set the internal handle to point to this data
        self._selected_tokenized_corpus_id = tokenize_parameter_string
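
    # For illustration (hypothetical tokenizer arguments):
    #     project.tokenize(method="simple", min_length=2)
    # stores its result under the key "_tk_simple_min_length=2" (prefixed by
    # corpus_filter when one is set).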

    def transform(self, method, **kwargs):
        """Stem or lemmatize input text that has already been tokenized"""
        # Assumes transformers.transform accepts the currently selected tokenized
        # corpus as its input.
        transformed_data = transformers.transform(self.selected_tokenized_corpus,
                                                  method=method, **kwargs)
        tokenize_parameter_string = "_".join([self._selected_tokenized_corpus_id,
                                              "xform", method,
                                              _get_parameters_string(**kwargs)])
        # store this output
        self.output.tokenized_corpora[tokenize_parameter_string] = transformed_data
        # set the internal handle to point to this data
        self._selected_tokenized_corpus_id = tokenize_parameter_string

    def vectorize(self, method="bag_of_words", **kwargs):
        """Convert tokenized text to vector form - mathematical representation used for modeling."""
        # itertools.tee returns two independent iterators; only the first is consumed here.
        tokenizer_iterators = itertools.tee(self.selected_tokenized_corpus)
        vectorized_corpus = vectorizers.vectorize(tokenizer_iterators[0],
                                                  method=method, **kwargs)
        vectorize_parameter_string = (self.corpus_filter +
                                      self._selected_tokenized_corpus_id +
                                      "_".join([method, _get_parameters_string(**kwargs)]))
        # store this output internally
        self.output.vectorized_corpora[vectorize_parameter_string] = vectorized_corpus
        # set the internal handle to point to this data
        self._selected_vectorized_corpus_id = vectorize_parameter_string

    def run_model(self, model_name="lda", ntopics=3, **kwargs):
        """Analyze vectorized text; determine topics and assign document probabilities"""
        if model_name == 'lda' and 'tfidf' in self._selected_vectorized_corpus_id:
            raise ValueError('LDA models are incompatible with TF-IDF vectorization. '
                             'If you wish to use TF-IDF vectorization, please select '
                             'another type of model.')
        modeled_corpus = models.run_model(self.selected_vectorized_corpus,
                                          model_name=model_name,
                                          ntopics=ntopics, **kwargs)
        model_id = "_".join([model_name, _get_parameters_string(**kwargs)])
        # store this output internally
        self.output.modeled_corpora[model_id] = modeled_corpus
        # set the internal handle to point to this data
        self._selected_modeled_corpus_id = model_id
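
    # For illustration (hypothetical model arguments):
    #     project.run_model(model_name="lda", ntopics=10, alpha=0.1)
    # stores its result under "lda__alpha=0.1".  Note that ntopics is a named
    # parameter rather than part of **kwargs, so it does not appear in the id.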

    def visualize(self, vis_name='lda_vis', model_id=None, **kwargs):
        """Plot model output"""
        if not model_id:
            modeled_corpus = self.selected_modeled_corpus
        else:
            modeled_corpus = self.output.modeled_corpora[model_id]
        return visualizers.visualize(modeled_corpus, vis_name, **kwargs)

    def select_tokenized_corpus(self, _id):
        """Assign active tokenized corpus.

        When more than one tokenized corpus is available (tokenization was run more
        than once with different methods), this allows you to switch to a different
        data set.
        """
        if _id in self.output.tokenized_corpora:
            self._selected_tokenized_corpus_id = _id
        else:
            raise ValueError("tokenized data {} not found in storage.".format(_id))

    def select_vectorized_corpus(self, _id):
        """Assign active vectorized corpus.

        When more than one vectorized corpus is available (vectorization was run more
        than once with different methods), this allows you to switch to a different
        data set.
        """
        if _id in self.output.vectorized_corpora:
            self._selected_vectorized_corpus_id = _id
        else:
            raise ValueError("vectorized data {} not found in storage.".format(_id))

    def select_modeled_corpus(self, _id):
        """Assign active model output.

        When more than one model output is available (modeling was run more than
        once with different methods), this allows you to switch to a different
        data set.
        """
        if _id in self.output.modeled_corpora:
            self._selected_modeled_corpus_id = _id
        else:
            raise ValueError("model {} not found in storage.".format(_id))

    @property
    def selected_filtered_corpus(self):
        """Corpus documents, potentially a subset.

        Output from read_input step.
        Input to tokenization step.
        """
        return self.output.get_filtered_data(field_to_get=self.content_field,
                                             filter=self.corpus_filter)

    @property
    def selected_tokenized_corpus(self):
        """Documents broken into component words.  May also be transformed.

        Output from tokenization and/or transformation steps.
        Input to vectorization step.
        """
        return self.output.tokenized_corpora[self._selected_tokenized_corpus_id]

    @property
    def selected_vectorized_corpus(self):
        """Data that has been vectorized into term frequencies, TF-IDF, or
        other vector representation.

        Output from vectorization step.
        Input to modeling step.
        """
        return self.output.vectorized_corpora[self._selected_vectorized_corpus_id]

    @property
    def selected_modeled_corpus(self):
        """Matrices representing the derived model.

        Output from modeling step.
        Input to visualization step.
        """
        return self.output.modeled_corpora[self._selected_modeled_corpus_id]
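

# An end-to-end sketch of the pipeline this class drives (file names, field
# names, and keyword arguments are hypothetical):
#
#     project = TopikProject("my_project")
#     project.read_input("reviews.json", content_field="text")
#     project.tokenize(method="simple")
#     project.vectorize(method="bag_of_words")
#     project.run_model(model_name="lda", ntopics=10)
#     project.visualize(vis_name="lda_vis")
#     project.save()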