|
@@ 61-97 (lines=37) @@
|
| 58 |
|
return es_getitem(key,self.corpus_type,self.instance,self.index, |
| 59 |
|
self.query) |
| 60 |
|
|
| 61 |
|
class VectorizedElasticCorpora(BaseElasticCorpora):
    """Corpora store for vectorizer results, backed by es_setitem/es_getitem.

    Assigning a value uploads each of its component mappings under its own
    document type; reading a key queries every component back and rebuilds
    a ``VectorizerOutput`` from them.
    """

    def __setitem__(self, key, value):
        """Upload every component mapping of ``value`` under ``key``.

        ``value`` must expose ``id_term_map``, ``document_term_counts``,
        ``doc_lengths``, ``term_frequency`` and ``vectors``, each providing
        an ``items()`` method.
        """
        # (attribute on ``value``, document type), listed in upload order.
        uploads = (
            ("id_term_map", "term"),
            ("document_term_counts", "document_term_count"),
            ("doc_lengths", "document_length"),
            # global term frequency
            ("term_frequency", "term_frequency"),
            ("vectors", "vector"),
        )
        # NOTE: vectors are uploaded explicitly here; delegating that part to
        # the parent class' __setitem__ was the considered alternative.
        for attribute, doc_type in uploads:
            es_setitem(key, getattr(value, attribute).items(), doc_type,
                       self.instance, self.index)

    def __getitem__(self, key):
        """Query each stored component for ``key``; return a VectorizerOutput.

        All ids coming back from the queries are converted to ``int`` before
        being used as dictionary keys.
        """
        # TODO: these could be lazy iterators instead of dicts;
        # VectorizerOutput should not care.

        def fetch(doc_type):
            # One query per component, always against this corpus' settings.
            return es_getitem(key, doc_type, self.instance, self.index,
                              self.query)

        def int_keyed(pairs):
            # Build a dict from (id, payload) pairs with integer keys.
            return dict((int(raw_id), payload) for raw_id, payload in pairs)

        # id -> term map for the full set of unique terms across all docs
        id_term_map = int_keyed(fetch("term"))
        # count of terms associated with each document
        document_term_counts = int_keyed(fetch("document_term_count"))
        doc_lengths = int_keyed(fetch("document_length"))
        term_frequency = int_keyed(fetch("term_frequency"))

        # Vectorized representation of each document:
        # doc id -> {term id: term weight}
        vectors = {}
        for doc_id, doc_term_weights in fetch("vector"):
            vectors[int(doc_id)] = int_keyed(doc_term_weights.items())

        return VectorizerOutput(
            id_term_map=id_term_map,
            document_term_counts=document_term_counts,
            doc_lengths=doc_lengths,
            term_frequency=term_frequency,
            vectors=vectors,
        )
| 98 |
|
|
| 99 |
|
class ModeledElasticCorpora(BaseElasticCorpora): |
| 100 |
|
def __setitem__(self, key, value): |
|
@@ 99-124 (lines=26) @@
|
| 96 |
|
term_frequency=term_frequency, |
| 97 |
|
vectors=vectors) |
| 98 |
|
|
| 99 |
|
class ModeledElasticCorpora(BaseElasticCorpora):
    """Corpora store for model results, backed by es_setitem/es_getitem.

    Assigning a value uploads each of its component mappings under its own
    document type; reading a key queries every component back and rebuilds
    a ``ModelOutput`` from them.
    """

    def __setitem__(self, key, value):
        """Upload every component mapping of ``value`` under ``key``.

        ``value`` must expose ``vocab``, ``term_frequency``,
        ``topic_term_matrix``, ``doc_lengths`` and ``doc_topic_matrix``,
        each providing an ``items()`` method.
        """
        es_setitem(key, value.vocab.items(), "term",
                   self.instance, self.index)
        es_setitem(key, value.term_frequency.items(), "term_frequency",
                   self.instance, self.index)
        es_setitem(key, value.topic_term_matrix.items(), "topic_term_dist",
                   self.instance, self.index)
        es_setitem(key, value.doc_lengths.items(), "doc_length",
                   self.instance, self.index)
        es_setitem(key, value.doc_topic_matrix.items(), "doc_topic_dist",
                   self.instance, self.index)

    def __lt__(self, y):
        """Defer ordering comparisons to the parent class."""
        return super(ModeledElasticCorpora, self).__lt__(y)

    def __getitem__(self, key):
        """Query each stored component for ``key``; return a ModelOutput.

        Term ids and document ids are converted to ``int`` so that every
        mapping describing the same entities uses the same key type.
        """
        vocab = {
            int(term_id): term
            for term_id, term in es_getitem(
                key, "term", self.instance, self.index, self.query)
        }
        term_frequency = {
            int(term_id): tf
            for term_id, tf in es_getitem(
                key, "term_frequency", self.instance, self.index, self.query)
        }
        # Topic ids are kept exactly as returned by the query.
        # NOTE(review): presumably they are not numeric - confirm.
        topic_term_matrix = {
            topic_id: topic_term_dist
            for topic_id, topic_term_dist in es_getitem(
                key, "topic_term_dist", self.instance, self.index, self.query)
        }
        # Keyed by document id (not topic id): normalise to int so lookups
        # line up with doc_topic_matrix below, which uses int(doc_id) keys.
        doc_lengths = {
            int(doc_id): doc_length
            for doc_id, doc_length in es_getitem(
                key, "doc_length", self.instance, self.index, self.query)
        }
        doc_topic_matrix = {
            int(doc_id): doc_topic_dist
            for doc_id, doc_topic_dist in es_getitem(
                key, "doc_topic_dist", self.instance, self.index, self.query)
        }
        return ModelOutput(vocab=vocab, term_frequency=term_frequency,
                           topic_term_matrix=topic_term_matrix,
                           doc_lengths=doc_lengths,
                           doc_topic_matrix=doc_topic_matrix)
| 125 |
|
|
| 126 |
|
@register_output |
| 127 |
|
class ElasticSearchOutput(OutputInterface): |