1
|
|
|
from six.moves import UserDict |
|
|
|
|
2
|
|
|
import types |
3
|
|
|
|
4
|
|
|
from ._registry import register_output |
5
|
|
|
from .base_output import OutputInterface |
6
|
|
|
|
7
|
|
|
|
8
|
|
|
class GreedyDict(UserDict, object):
    """Mapping that eagerly materializes generator values and iterates over values.

    Two deviations from a plain dict:
      * assigning a generator as a value consumes it into a list, so the
        stored data can be re-read and serialized later;
      * iterating over the mapping yields its *values*, not its keys.
    """

    def __setitem__(self, key, value):
        """Store ``value`` under ``key``, snapshotting generators into lists."""
        if isinstance(value, types.GeneratorType):
            # A generator can only be consumed once; materialize it so the
            # stored value survives repeated reads and serialization.
            value = list(value)
        super(GreedyDict, self).__setitem__(key, value)

    def __iter__(self):
        """Yield stored values (NOT keys, unlike a normal dict)."""
        for val in self.data.values():
            yield val
|
|
|
|
17
|
|
|
|
18
|
|
|
|
19
|
|
|
@register_output
class InMemoryOutput(OutputInterface):
    """Output backend holding all corpora in memory via GreedyDict containers."""

    def __init__(self, iterable=None, hash_field=None,
                 tokenized_corpora=None,
                 vectorized_corpora=None, modeled_corpora=None):
        """Set up empty in-memory stores and optionally import initial data.

        iterable: optional raw documents to load immediately.
        hash_field: field whose hashed value keys each document.
        tokenized_corpora / vectorized_corpora / modeled_corpora: optional
            pre-existing stores; fresh GreedyDicts are created when absent.
        """
        super(InMemoryOutput, self).__init__()
        self.corpus = GreedyDict()
        # Record the hash field up front so save() works even when no
        # iterable was imported (previously raised AttributeError).
        self.hash_field = hash_field
        if iterable:
            self.import_from_iterable(iterable, hash_field)
        self.tokenized_corpora = tokenized_corpora if tokenized_corpora else GreedyDict()
        self.vectorized_corpora = vectorized_corpora if vectorized_corpora else GreedyDict()
        self.modeled_corpora = modeled_corpora if modeled_corpora else GreedyDict()

    def import_from_iterable(self, iterable, field_to_hash):
        """Load documents into ``self.corpus``, keyed by hash of ``field_to_hash``.

        iterable: generally a list of dicts, but possibly a list of strings
            This is your data. Your dictionary structure defines the schema
            of the elasticsearch index.
        field_to_hash: name of the field identifying a document; its hashed
            value becomes the corpus key.
        """
        self.hash_field = field_to_hash
        # Python 2/3 compatible string check: basestring only exists on 2.
        try:
            string_types = basestring  # noqa: F821  (Python 2)
        except NameError:
            string_types = str  # Python 3
        for item in iterable:
            if isinstance(item, string_types):
                # A bare string becomes a single-field document.
                item = {field_to_hash: item}
            elif field_to_hash not in item and field_to_hash in next(iter(item.values())):
                # Document nested one level deep; unwrap its first value.
                # (next(iter(...)) replaces the Python-2-only item.values()[0].)
                item = next(iter(item.values()))
            # Renamed from `id` to avoid shadowing the builtin.
            doc_id = hash(item[field_to_hash])
            self.corpus[doc_id] = item

    # TODO: generalize for datetimes
    # TODO: validate input data to ensure that it has valid year data
    def get_date_filtered_data(self, field_to_get, start, end, filter_field="year"):
        """Yield (doc_id, value) pairs for docs whose ``filter_field`` lies in [start, end].

        Builds an int-comparison expression and delegates to get_filtered_data;
        assumes the field holds integer-parseable year data (see TODOs above).
        """
        return self.get_filtered_data(field_to_get,
                                      "{}<=int({}['{}'])<={}".format(start, "{}",
                                                                     filter_field, end))

    def get_filtered_data(self, field_to_get, filter=""):
        """Yield (doc_id, doc[field_to_get]) pairs, optionally filtered.

        filter: a Python expression template with one ``{}`` placeholder that
            is formatted with each document and evaluated.
        NOTE(security): the expression is passed to eval(); never build it
        from untrusted input.
        """
        if not filter:
            for doc_id, doc in self.corpus.items():
                yield doc_id, doc[field_to_get]
        else:
            for doc_id, doc in self.corpus.items():
                # SECURITY: eval of a formatted expression -- trusted callers only.
                if eval(filter.format(doc)):
                    yield doc_id, doc[field_to_get]

    def save(self, filename):
        """Persist all corpora and import state via the base-class serializer."""
        saved_data = {"iterable": self.corpus,
                      "hash_field": self.hash_field,
                      "modeled_corpora": self.modeled_corpora,
                      "vectorized_corpora": self.vectorized_corpora,
                      "tokenized_corpora": self.tokenized_corpora}
        return super(InMemoryOutput, self).save(filename, saved_data)
|
|
|
|
73
|
|
|
|
The coding style of this project requires that you add a docstring to this code element. Below, you find an example for methods:
If you would like to know more about docstrings, we recommend to read PEP-257: Docstring Conventions.