"""Tests for TopikProject end-to-end workflows against each output backend."""
import glob
import os
import time
import unittest

import elasticsearch
import nose.tools as nt
from elasticsearch.exceptions import ConnectionError
from nose.plugins.skip import SkipTest

from topik.fileio import TopikProject
from topik.fileio.tests import test_data_path

# make logging quiet during testing, to keep Travis CI logs short.
import logging
logging.basicConfig()
logging.getLogger('elasticsearch').setLevel(logging.ERROR)
logging.getLogger('urllib3').setLevel(logging.ERROR)

SAVE_FILENAME = "test_project"
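
# One known (document id, token list) pair from the test corpus; the tests
# below check that tokenization output includes this document.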
sample_tokenized_doc = (2318580746137828354,
    [u'nano', u'sized', u'tio', u'particles', u'applications', u'including',
     u'use', u'photocatalysts', u'heat', u'transfer', u'fluids', u'nanofluids',
     u'present', u'study', u'tio', u'nanoparticles', u'controllable', u'phase',
     u'particle', u'size', u'obtained', u'homogeneous', u'gas', u'phase',
     u'nucleation', u'chemical', u'vapor', u'condensation', u'cvc', u'phase',
     u'particle', u'size', u'tio', u'nanoparticles', u'processing', u'conditions',
     u'characterized', u'x', u'ray', u'diffraction', u'transmission', u'electron',
     u'microscopy', u'chamber', u'temperature', u'pressure', u'key', u'parameters',
     u'affecting', u'particle', u'phase', u'size', u'pure', u'anatase', u'phase',
     u'observed', u'synthesis', u'temperatures', u'low', u'c', u'chamber',
     u'pressure', u'varying', u'torr', u'furnace', u'temperature', u'increased',
     u'c', u'pressure', u'torr', u'mixture', u'anatase', u'rutile', u'phases',
     u'observed', u'predominant', u'phase', u'anatase', u'average', u'particle',
     u'size', u'experimental', u'conditions', u'observed', u'nm'])

test_data_path = os.path.join(test_data_path, "test_data_json_stream.json")


class ProjectTest(object):
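    """Mixin of backend-agnostic TopikProject workflow tests.

    Concrete subclasses provide ``output_type``, ``output_args``, and a
    ``project`` instance in their ``setUp``.
    """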
    def test_context_manager(self):
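        """Run a complete workflow in a context manager, then reload the saved output and verify it."""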
        # clear output left over from any previous run
        for filename in glob.glob("context_output*"):
            os.remove(filename)
        with TopikProject("context_output", self.output_type, self.output_args) as project:
            project.read_input(source=test_data_path, content_field='abstract')
            project.tokenize()
            project.vectorize(method='bag_of_words')
            project.run_model(model_name='lda', ntopics=2)

        # The block above runs through a whole workflow (minus plotting) and
        # closes its files on exit.  Load the saved output back here.
        with TopikProject("context_output") as project:
            nt.assert_equal(len(list(project.get_filtered_corpus_iterator())), 100)
            nt.assert_true(sample_tokenized_doc in list(iter(project.selected_tokenized_corpus)))
            nt.assert_equal(project.selected_vectorized_corpus.global_term_count, 2434)
            nt.assert_equal(len(project.selected_vectorized_corpus), 100)  # All documents processed
            for doc in project.selected_modeled_corpus.doc_topic_matrix.values():
                nt.assert_almost_equal(sum(doc), 1)
            for topic in project.selected_modeled_corpus.topic_term_matrix.values():
                nt.assert_almost_equal(sum(topic), 1)

        # remove the files this test created
        for filename in glob.glob("context_output*"):
            os.remove(filename)

    def test_read_input(self):
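        """Reading the input source should load all 100 test documents."""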
        nt.assert_equal(len(list(self.project.get_filtered_corpus_iterator())), 100)

    def test_get_filtered_corpus_iterator(self):
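        """The filtered corpus iterator should yield one (id, text) tuple per document."""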
        doc_list = list(self.project.get_filtered_corpus_iterator())
        nt.assert_equal(type(doc_list[0]), type(('123', 'text')))
        nt.assert_equal(len(doc_list), 100)

    def test_get_date_filtered_corpus_iterator(self):
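        """Filtering on the 'year' field should return only documents in the requested range."""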
        results = list(self.project.get_date_filtered_corpus_iterator(
            field_to_get="abstract", start=1975, end=1999, filter_field='year'))
        nt.assert_equal(len(results), 25)

    def test_tokenize(self):
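        """The 'simple' tokenizer should produce the expected tokens for a known document."""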
        self.project.tokenize('simple')
        in_results = False
        # sample_tokenized_doc is an (id, token list) pair; compare against the token list
        for doc_id, doc in self.project.selected_tokenized_corpus:
            if doc == sample_tokenized_doc[1]:
                in_results = True
                break
        nt.assert_true(in_results)

    def test_vectorize(self):
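        """Vectorization should cover all 100 documents and report the expected global term count."""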
        self.project.tokenize()
        self.project.vectorize()
        nt.assert_equal(self.project.selected_vectorized_corpus.global_term_count, 2434)
        nt.assert_equal(len(self.project.selected_vectorized_corpus), 100)  # All documents processed

    def test_model(self):
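        """LDA output rows of the doc-topic and topic-term matrices should each sum to 1."""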
        self.project.tokenize()
        self.project.vectorize()
        self.project.run_model(model_name='lda', ntopics=2)
        # each document's topic weights and each topic's term weights are probability distributions
        for doc in self.project.selected_modeled_corpus.doc_topic_matrix.values():
            nt.assert_almost_equal(sum(doc), 1)
        for topic in self.project.selected_modeled_corpus.topic_term_matrix.values():
            nt.assert_almost_equal(sum(topic), 1)

    def test_visualize(self):
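        """The termite visualization should run end-to-end without raising."""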
        self.project.tokenize()
        self.project.vectorize(method='bag_of_words')
        self.project.run_model(ntopics=2)
        self.project.visualize(vis_name='termite', topn=5)


class TestInMemoryOutput(unittest.TestCase, ProjectTest):
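    """Run the shared workflow tests against the InMemoryOutput backend."""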
    def setUp(self):
        self.output_type = "InMemoryOutput"
        self.output_args = {}
        self.project = TopikProject("test_project",
                                    output_type=self.output_type,
                                    output_args=self.output_args)
        self.project.read_input(test_data_path, content_field="abstract")


class TestElasticSearchOutput(unittest.TestCase, ProjectTest):
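    """Run the shared workflow tests against a local Elasticsearch backend.

    Skipped when no Elasticsearch instance is reachable on localhost.
    """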
    INDEX = "test_index"

    def setUp(self):
        self.output_type = "ElasticSearchOutput"
        self.output_args = {'source': 'localhost',
                            'index': TestElasticSearchOutput.INDEX,
                            'content_field': "abstract"}
        self.project = TopikProject("test_project", output_type=self.output_type,
                                    output_args=self.output_args)
        try:
            self.project.read_input(test_data_path, content_field="abstract", synchronous_wait=30)
        except ConnectionError:
            raise SkipTest("Skipping Elasticsearch test - elasticsearch not running")

    def tearDown(self):
        # drop the indices this test created so later runs start from a clean state
        instance = elasticsearch.Elasticsearch("localhost")
        instance.indices.delete(TestElasticSearchOutput.INDEX)
        if instance.indices.exists("{}_year_alias_date".format(TestElasticSearchOutput.INDEX)):
            instance.indices.delete("{}_year_alias_date".format(TestElasticSearchOutput.INDEX))
        time.sleep(1)