@@ 61-97 (lines=37) @@ | ||
58 | return es_getitem(key,self.corpus_type,self.instance,self.index, |
|
59 | self.query) |
|
60 | ||
class VectorizedElasticCorpora(BaseElasticCorpora):
    """Mapping-style store for vectorizer output.

    Assigning to a key uploads each component of the value through
    ``es_setitem`` under its own label; reading a key fetches those
    components back through ``es_getitem`` and rebuilds a
    ``VectorizerOutput`` from them.
    """

    def __setitem__(self, key, value):
        """Upload every component of ``value`` under ``key``.

        ``value`` must expose ``id_term_map``, ``document_term_counts``,
        ``doc_lengths``, ``term_frequency`` and ``vectors``, each offering
        ``.items()``.
        """
        # (attribute read from ``value``, label handed to es_setitem), listed
        # in upload order.  The label is presumably the Elasticsearch document
        # type -- confirm against es_setitem.
        uploads = (
            ("id_term_map", "term"),
            ("document_term_counts", "document_term_count"),
            ("doc_lengths", "document_length"),
            ("term_frequency", "term_frequency"),
            ("vectors", "vector"),
        )
        # The vectors are uploaded explicitly here instead of delegating to the
        # parent class's __setitem__.
        for attribute, label in uploads:
            es_setitem(key, getattr(value, attribute).items(), label,
                       self.instance, self.index)

    def __getitem__(self, key):
        """Fetch the components stored under ``key`` as a ``VectorizerOutput``.

        Every id coming back from ``es_getitem`` is converted with ``int``
        before being used as a dictionary key.
        """
        def fetch(label):
            # One query per component; only the label differs between them.
            return es_getitem(key, label, self.instance, self.index,
                              self.query)

        # id -> term map for the full set of unique terms across all documents.
        terms_by_id = dict(
            (int(term_id), term) for term_id, term in fetch("term"))

        # Count of terms associated with each document.
        counts_by_doc = dict(
            (int(doc_id), count)
            for doc_id, count in fetch("document_term_count"))

        lengths_by_doc = dict(
            (int(doc_id), length)
            for doc_id, length in fetch("document_length"))

        frequency_by_term = dict(
            (int(term_id), frequency)
            for term_id, frequency in fetch("term_frequency"))

        # Vectorized representation of each document: term id -> weight (for a
        # bag-of-words model the weight is the word count).
        vectors_by_doc = {}
        for doc_id, weights in fetch("vector"):
            vectors_by_doc[int(doc_id)] = dict(
                (int(term_id), weight)
                for term_id, weight in weights.items())

        return VectorizerOutput(id_term_map=terms_by_id,
                                document_term_counts=counts_by_doc,
                                doc_lengths=lengths_by_doc,
                                term_frequency=frequency_by_term,
                                vectors=vectors_by_doc)
|
98 | ||
99 | class ModeledElasticCorpora(BaseElasticCorpora): |
|
100 | def __setitem__(self, key, value): |
|
@@ 99-124 (lines=26) @@ | ||
96 | term_frequency=term_frequency, |
|
97 | vectors=vectors) |
|
98 | ||
class ModeledElasticCorpora(BaseElasticCorpora):
    """Mapping-style store for model output.

    Assigning to a key uploads each component of the value through
    ``es_setitem``; reading a key fetches the components back through
    ``es_getitem`` and rebuilds a ``ModelOutput`` from them.
    """

    def __setitem__(self, key, value):
        """Upload every component of ``value`` under ``key``.

        ``value`` must expose ``vocab``, ``term_frequency``,
        ``topic_term_matrix``, ``doc_lengths`` and ``doc_topic_matrix``, each
        offering ``.items()``.
        """
        es_setitem(key, value.vocab.items(), "term",
                   self.instance, self.index)
        es_setitem(key, value.term_frequency.items(), "term_frequency",
                   self.instance, self.index)
        es_setitem(key, value.topic_term_matrix.items(), "topic_term_dist",
                   self.instance, self.index)
        es_setitem(key, value.doc_lengths.items(), "doc_length",
                   self.instance, self.index)
        es_setitem(key, value.doc_topic_matrix.items(), "doc_topic_dist",
                   self.instance, self.index)

    def __lt__(self, y):
        """Delegate ordering to the parent class."""
        # NOTE(review): this only forwards to the inherited implementation;
        # presumably kept so the comparison is defined on this class -- confirm.
        return super(ModeledElasticCorpora, self).__lt__(y)

    def __getitem__(self, key):
        """Fetch the components stored under ``key`` as a ``ModelOutput``.

        Term ids and document ids are converted with ``int`` before being used
        as dictionary keys; topic ids are used as returned by ``es_getitem``.
        """
        vocab = {int(term_id): term for term_id, term in
                 es_getitem(key, "term", self.instance, self.index,
                            self.query)}
        term_frequency = {int(term_id): tf for term_id, tf in
                          es_getitem(key, "term_frequency", self.instance,
                                     self.index, self.query)}
        topic_term_matrix = {topic_id: topic_term_dist
                             for topic_id, topic_term_dist in
                             es_getitem(key, "topic_term_dist", self.instance,
                                        self.index, self.query)}
        # Document ids are cast to int so these keys have the same type as the
        # keys of doc_topic_matrix below; they were previously left uncast.
        doc_lengths = {int(doc_id): doc_length for doc_id, doc_length in
                       es_getitem(key, "doc_length", self.instance,
                                  self.index, self.query)}
        doc_topic_matrix = {int(doc_id): doc_topic_dist
                            for doc_id, doc_topic_dist in
                            es_getitem(key, "doc_topic_dist", self.instance,
                                       self.index, self.query)}
        return ModelOutput(vocab=vocab, term_frequency=term_frequency,
                           topic_term_matrix=topic_term_matrix,
                           doc_lengths=doc_lengths,
                           doc_topic_matrix=doc_topic_matrix)
|
125 | ||
126 | @register_output |
|
127 | class ElasticSearchOutput(OutputInterface): |