1
|
|
|
import os |
|
|
|
|
2
|
|
|
import unittest |
3
|
|
|
import logging |
4
|
|
|
import elasticsearch |
5
|
|
|
|
6
|
|
|
from topik.fileio.base_output import load_output |
7
|
|
|
from topik.fileio.reader import read_input |
8
|
|
|
from topik.fileio.tests import test_data_path |
9
|
|
|
from topik.fileio.out_elastic import ElasticSearchOutput |
10
|
|
|
from topik.fileio.out_memory import InMemoryOutput |
11
|
|
|
from elasticsearch.exceptions import ConnectionError |
|
|
|
|
12
|
|
|
from nose.plugins.skip import SkipTest |
|
|
|
|
13
|
|
|
|
14
|
|
|
# Shared settings for the output tests in this module.
INDEX = "topik_unittest"               # Elasticsearch index used by the tests
SAVE_FILENAME = "test_save.topikdata"  # scratch file for the save/load tests
CONTENT_FIELD = "abstract"             # document field used as the content

# make logging quiet during testing, to keep Travis CI logs short.
logging.basicConfig()
# Only errors from the Elasticsearch client and its HTTP layer are reported.
logging.getLogger("elasticsearch").setLevel(logging.ERROR)
logging.getLogger("urllib3").setLevel(logging.ERROR)
23
|
|
|
|
24
|
|
|
|
25
|
|
|
class BaseOutputTest(object):
    """Backend-independent checks shared by the output test cases.

    This is a mixin: the concrete test classes in this module combine it with
    ``unittest.TestCase`` (which supplies the ``assert*`` methods used here)
    and assign the output object under test to ``test_raw_data`` in ``setUp``.
    """

    # Output object under test; assigned by each concrete class's setUp.
    test_raw_data = None

    @staticmethod
    def _discard_saved_file():
        """Remove SAVE_FILENAME if it exists, so no test leaves it behind."""
        if os.path.exists(SAVE_FILENAME):
            os.remove(SAVE_FILENAME)

    def test_get_filtered_data(self):
        """The content field yields 100 items and the first two differ."""
        data = list(self.test_raw_data.get_filtered_data(CONTENT_FIELD))
        self.assertEqual(len(data), 100)
        self.assertNotEqual(data[0], data[1])

    def test_save_file(self):
        """Saving the output creates a file at SAVE_FILENAME."""
        try:
            self.test_raw_data.save(SAVE_FILENAME)
            self.assertTrue(os.path.exists(SAVE_FILENAME))
        finally:
            # Clean up even when the save or the assertion fails, so a stale
            # file cannot influence the other tests.
            self._discard_saved_file()

    def test_load_file(self):
        """A saved output can be reloaded and still yields 100 items."""
        try:
            self.test_raw_data.save(SAVE_FILENAME)
            self.test_raw_data = load_output(SAVE_FILENAME)
            data = list(self.test_raw_data.get_filtered_data(CONTENT_FIELD))
            self.assertEqual(len(data), 100)
        finally:
            # Same reasoning as in test_save_file: never leave the file behind.
            self._discard_saved_file()

    def test_get_date_filtered_data(self):
        """Filtering on "year" from 1975 to 1999 returns the expected items."""
        result_list = list(self.test_raw_data.get_date_filtered_data(
            field_to_get=CONTENT_FIELD,
            start=1975,
            end=1999,
            filter_field="year"))
        self.assertEqual(len(result_list), 25)
        # The first element of each result is compared as an integer id.
        self.assertIn(-1611117933394825767,
                      [int(item[0]) for item in result_list])
53
|
|
|
|
54
|
|
|
|
55
|
|
|
class TestInMemoryOutput(unittest.TestCase, BaseOutputTest):
    """Run the shared output tests against ``InMemoryOutput``."""

    def setUp(self):
        """Fill a fresh InMemoryOutput from the JSON-stream test fixture."""
        backend = InMemoryOutput()
        self.test_raw_data = backend
        fixture_path = '{}/test_data_json_stream.json'.format(test_data_path)
        documents = read_input(fixture_path)
        backend.import_from_iterable(documents, field_to_hash=CONTENT_FIELD)
|
|
|
|
61
|
|
|
|
62
|
|
|
|
63
|
|
|
class TestElasticSearchOutput(unittest.TestCase, BaseOutputTest):
    """Run the shared output tests against ``ElasticSearchOutput``.

    A test is skipped when importing the fixture raises ``ConnectionError``.
    """

    def setUp(self):
        """Import the JSON-stream test fixture into the INDEX index."""
        es_output = ElasticSearchOutput(source='localhost',
                                        index=INDEX,
                                        content_field='abstract')
        self.test_raw_data = es_output
        fixture_path = '{}/test_data_json_stream.json'.format(test_data_path)
        try:
            es_output.import_from_iterable(read_input(fixture_path),
                                           field_to_hash=CONTENT_FIELD)
        except ConnectionError:
            raise SkipTest("Skipping Elasticsearch test - elasticsearch not running")

    def tearDown(self):
        """Delete INDEX and, if it exists, the ``<INDEX>_year_alias_date`` index."""
        client = elasticsearch.Elasticsearch("localhost")
        client.indices.delete(INDEX)
        year_alias = "{}_year_alias_date".format(INDEX)
        if client.indices.exists(year_alias):
            client.indices.delete(year_alias)
83
|
|
|
|
84
|
|
|
|
The coding style of this project requires that you add a docstring to this code element. Below you will find an example for methods:
If you would like to know more about docstrings, we recommend reading PEP 257: Docstring Conventions.