Code Duplication    Length = 26-37 lines in 2 locations

topik/fileio/out_elastic.py 2 locations

@@ 61-97 (lines=37) @@
58
        return es_getitem(key,self.corpus_type,self.instance,self.index,
59
                          self.query)
60
61
class VectorizedElasticCorpora(BaseElasticCorpora):
    """Corpora mapping whose values are vectorizer outputs.

    Each component of a stored value is written with ``es_setitem`` under
    its own document type, and read back with ``es_getitem`` to rebuild a
    ``VectorizerOutput``.
    """

    def __setitem__(self, key, value):
        """Upload every component of ``value`` under ``key``.

        ``value`` must expose ``id_term_map``, ``document_term_counts``,
        ``doc_lengths``, ``term_frequency`` and ``vectors``, each providing
        an ``items()`` method.
        """
        # (attribute on ``value``, document type it is stored under), in
        # upload order.
        uploads = (
            ("id_term_map", "term"),
            ("document_term_counts", "document_term_count"),
            ("doc_lengths", "document_length"),
            ("term_frequency", "term_frequency"),
            ("vectors", "vector"),
        )
        # NOTE: the vectors are uploaded explicitly here; delegating to the
        # parent class' __setitem__ was considered as an alternative.
        for attribute, doc_type in uploads:
            pairs = getattr(value, attribute).items()
            es_setitem(key, pairs, doc_type, self.instance, self.index)

    def __getitem__(self, key):
        """Rebuild the ``VectorizerOutput`` stored under ``key``.

        Identifiers coming back from ``es_getitem`` are converted to ``int``
        before being used as dictionary keys.
        """
        def fetch(doc_type):
            # es_getitem is iterated as (identifier, payload) pairs.
            return es_getitem(key, doc_type, self.instance, self.index,
                              self.query)

        # term id -> term
        terms_by_id = dict((int(ident), term)
                           for ident, term in fetch("term"))
        # document id -> count of terms associated with the document
        counts_by_doc = dict((int(ident), count)
                             for ident, count in fetch("document_term_count"))
        # document id -> document length
        lengths_by_doc = dict((int(ident), length)
                              for ident, length in fetch("document_length"))
        # term id -> global frequency
        frequency_by_term = dict((int(ident), frequency)
                                 for ident, frequency in fetch("term_frequency"))

        # document id -> {term id -> weight}; both levels get integer keys.
        vectors_by_doc = {}
        for ident, weights in fetch("vector"):
            doc_key = int(ident)
            vectors_by_doc[doc_key] = dict(
                (int(term_ident), weight)
                for term_ident, weight in weights.items())

        return VectorizerOutput(id_term_map=terms_by_id,
                                document_term_counts=counts_by_doc,
                                doc_lengths=lengths_by_doc,
                                term_frequency=frequency_by_term,
                                vectors=vectors_by_doc)
98
99
class ModeledElasticCorpora(BaseElasticCorpora):
100
    def __setitem__(self, key, value):
@@ 99-124 (lines=26) @@
96
                                term_frequency=term_frequency,
97
                                vectors=vectors)
98
99
class ModeledElasticCorpora(BaseElasticCorpora):
    """Corpora mapping whose values are model outputs.

    Each component of a stored value is written with ``es_setitem`` under
    its own document type, and read back with ``es_getitem`` to rebuild a
    ``ModelOutput``.
    """

    def __setitem__(self, key, value):
        """Upload every component of ``value`` under ``key``.

        ``value`` must expose ``vocab``, ``term_frequency``,
        ``topic_term_matrix``, ``doc_lengths`` and ``doc_topic_matrix``,
        each providing an ``items()`` method.
        """
        es_setitem(key, value.vocab.items(), "term",
                   self.instance, self.index)
        es_setitem(key, value.term_frequency.items(), "term_frequency",
                   self.instance, self.index)
        es_setitem(key, value.topic_term_matrix.items(), "topic_term_dist",
                   self.instance, self.index)
        es_setitem(key, value.doc_lengths.items(), "doc_length",
                   self.instance, self.index)
        es_setitem(key, value.doc_topic_matrix.items(), "doc_topic_dist",
                   self.instance, self.index)

    def __lt__(self, y):
        """Delegate the ordering comparison to the parent class."""
        return super(ModeledElasticCorpora, self).__lt__(y)

    def __getitem__(self, key):
        """Rebuild the ``ModelOutput`` stored under ``key``.

        Term and document identifiers are converted to ``int`` before being
        used as dictionary keys; topic identifiers are kept as returned by
        ``es_getitem``.
        """
        vocab = {
            int(term_id): term
            for term_id, term in es_getitem(
                key, "term", self.instance, self.index, self.query)
        }
        term_frequency = {
            int(term_id): frequency
            for term_id, frequency in es_getitem(
                key, "term_frequency", self.instance, self.index, self.query)
        }
        topic_term_matrix = {
            topic_id: topic_term_dist
            for topic_id, topic_term_dist in es_getitem(
                key, "topic_term_dist", self.instance, self.index, self.query)
        }
        # Keyed by integer document id so the keys line up with
        # doc_topic_matrix below, which is keyed the same way. Previously
        # the identifiers were used unconverted.
        doc_lengths = {
            int(doc_id): doc_length
            for doc_id, doc_length in es_getitem(
                key, "doc_length", self.instance, self.index, self.query)
        }
        doc_topic_matrix = {
            int(doc_id): doc_topic_dist
            for doc_id, doc_topic_dist in es_getitem(
                key, "doc_topic_dist", self.instance, self.index, self.query)
        }
        return ModelOutput(vocab=vocab, term_frequency=term_frequency,
                           topic_term_matrix=topic_term_matrix,
                           doc_lengths=doc_lengths,
                           doc_topic_matrix=doc_topic_matrix)
125
126
@register_output
127
class ElasticSearchOutput(OutputInterface):