| Metric | Value |
| --- | --- |
| Total Complexity | 42 |
| Total Lines | 220 |
| Duplicated Lines | 0 % |
Complex classes like TopikProject often do a lot of different things. To break such a class down, we need to identify a cohesive component within it. A common approach to finding such a component is to look for fields and methods that share the same prefixes or suffixes.

Once you have determined which fields belong together, you can apply the Extract Class refactoring. If the component makes sense as a subclass, Extract Subclass is also a candidate, and is often faster. A sketch of what this could look like for TopikProject follows.
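Here, for instance, the four `_selected_*_id` fields and the matching `select_*_corpus` methods share affixes and change together, marking them as one cohesive component. A minimal, hypothetical sketch of extracting them; the `PipelineSelection` name and its API are illustrative, not part of topik:

```python
class PipelineSelection(object):
    """Track which stored corpus id is active for each pipeline stage."""

    STAGES = ("tokenized", "vectorized", "modeled")

    def __init__(self, **selected_ids):
        # e.g. PipelineSelection(tokenized=None, vectorized=None, modeled=None)
        self._ids = {stage: selected_ids.get(stage) for stage in self.STAGES}

    def select(self, stage, _id, available):
        """Point a stage at a stored id, validating against available keys."""
        if _id not in available:
            raise ValueError("{} data {} not found in storage.".format(stage, _id))
        self._ids[stage] = _id

    def selected(self, stage):
        """Return the active id for a stage (None if nothing selected yet)."""
        return self._ids[stage]
```

TopikProject would then hold a single PipelineSelection instance and delegate to it, collapsing three near-identical `select_*_corpus` methods into one.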
```python
import itertools
import os

import jsonpickle

# The report elides the original lines 2-17, which presumably held the
# remaining imports: the class also relies on read_input, registered_outputs,
# _get_parameters_string, and topik's tokenizers, transformers, vectorizers,
# models, and visualizers modules.


class TopikProject(object):

    def __init__(self, project_name, output_type=None, output_args=None, **kwargs):
        """Class that abstracts persistence. Drives different output types,
        and handles storing intermediate results to the given output type.

        project_name : string
            base name for the project's ".topikproject" metafile and
            ".topikdata" sidecar file.
        output_type : string
            internal format for handling user data. Current options are
            present in topik.fileio.registered_outputs. Default is
            "InMemoryOutput".
        output_args : dictionary or None
            configuration to pass through to the output.
        synchronous_wait : integer
            number of seconds to wait for data to finish uploading to the
            output when using an asynchronous output type. Only relevant for
            some output types ("ElasticSearchOutput", not "InMemoryOutput").
        **kwargs : passed through to superclass __init__. Not passed to output.
        """
        if output_args is None:
            output_args = {}
        # Resuming an existing project: restore its metadata and stored data.
        if os.path.exists(project_name + ".topikproject") and output_type is None:
            with open(project_name + ".topikproject") as project_meta:
                project_data = jsonpickle.decode(project_meta.read())
            kwargs.update(project_data)
            with open(project_name + ".topikdata") as project_data:
                loaded_data = jsonpickle.decode(project_data.read())
            output_type = loaded_data["class"]
            output_args.update(loaded_data["saved_data"])
        self.project_name = project_name
        if output_type is None:
            output_type = "InMemoryOutput"
        # Loading the output here is sufficient to restore all results: the
        # output is responsible for loading them as necessary, and for
        # returning iterators or output objects appropriately.
        self.output = registered_outputs[output_type](**output_args)
        # Not used directly, but stored here for persistence purposes.
        self._output_type = output_type
        self._output_args = output_args
        # Empty string or a string expression in Elasticsearch query format.
        self.corpus_filter = kwargs.get("corpus_filter", "")
        # Empty string or a string field name.
        self.content_field = kwargs.get("content_field", "")
        # Initially None; set to a string value when the tokenize or
        # transform method is called.
        self._selected_source_field = kwargs.get("_selected_content_field")
        self._selected_tokenized_corpus_id = kwargs.get("_selected_tokenized_corpus_id")
        # Initially None; set to a string value when the vectorize method is called.
        self._selected_vectorized_corpus_id = kwargs.get("_selected_vectorized_corpus_id")
        # Initially None; set to a string value when the run_model method is called.
        self._selected_modeled_corpus_id = kwargs.get("_selected_modeled_corpus_id")

    def __enter__(self):
        """Support use as a context manager; see close()."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def close(self):
        """Save the project, then release the output's resources."""
        self.save()
        self.output.close()  # close any open file handles or network connections

    def save(self):
        """Save project as .topikproject metafile and some number of sidecar data files."""
        with open(self.project_name + ".topikproject", "w") as f:
            f.write(jsonpickle.encode({
                "_selected_tokenized_corpus_id": self._selected_tokenized_corpus_id,
                "_selected_vectorized_corpus_id": self._selected_vectorized_corpus_id,
                "_selected_modeled_corpus_id": self._selected_modeled_corpus_id,
                "corpus_filter": self.corpus_filter,
                "project_name": self.project_name,
                "output_type": self._output_type,
                "output_args": self._output_args,
                "content_field": self.content_field}))
        self.output.save(self.project_name + ".topikdata")
```
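Because `__init__` reloads the metafile when it already exists on disk, a project round-trips by name alone. A minimal sketch of that behavior, assuming a working topik installation and write access to the current directory (the project name and import path are illustrative):

```python
from topik.fileio.project import TopikProject  # assumed import path

# First session: create a project; close() persists it on exit.
with TopikProject("my_project") as project:
    pass  # read input, tokenize, vectorize, model ...
# close() has written my_project.topikproject and my_project.topikdata.

# A later session passing only the name restores the output type, its
# arguments, and the selected corpus ids from those files.
project = TopikProject("my_project")
```

The ingestion and pipeline methods continue the class: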
```python
    def read_input(self, source, content_field, source_type="auto", **kwargs):
        """Import data from an external source into Topik's internal format."""
        self.output.import_from_iterable(read_input(source,
                                                    source_type=source_type,
                                                    **kwargs),
                                         field_to_hash=content_field)
        self.content_field = content_field

    def get_filtered_corpus_iterator(self, field=None, filter_expression=None):
        """Iterate over stored documents, restricted to a field and filter."""
        if field is None:
            field = self.content_field
        if filter_expression is None:
            filter_expression = self.corpus_filter
        return self.output.get_filtered_data(field, filter_expression)

    def get_date_filtered_corpus_iterator(self, start, end, filter_field,
                                          field_to_get=None):
        """Iterate over stored documents whose filter_field falls between start and end."""
        if field_to_get is None:
            field_to_get = self.content_field
        return self.output.get_date_filtered_data(field_to_get=field_to_get,
                                                  start=start,
                                                  end=end,
                                                  filter_field=filter_field)

    def tokenize(self, method="simple", **kwargs):
        """Break raw text into constituent terms (or collections of terms)."""
        tokenized_corpus = tokenizers.tokenize(self.selected_filtered_corpus,
                                               method=method, **kwargs)
        tokenize_parameter_string = self.corpus_filter + "_tk_{method}{params}".format(
            method=method,
            params=_get_parameters_string(**kwargs))
        # Store the result and point the internal selection handle at it.
        self.output.tokenized_corpora[tokenize_parameter_string] = tokenized_corpus
        self._selected_tokenized_corpus_id = tokenize_parameter_string

    def transform(self, method, **kwargs):
        """Stem or lemmatize input text that has already been tokenized."""
        transformed_data = transformers.transform(method=method, **kwargs)
        tokenize_parameter_string = "_".join([self._selected_tokenized_corpus_id,
                                              "xform", method,
                                              _get_parameters_string(**kwargs)])
        # Store the result and point the internal selection handle at it.
        self.output.tokenized_corpora[tokenize_parameter_string] = transformed_data
        self._selected_tokenized_corpus_id = tokenize_parameter_string

    def vectorize(self, method="bag_of_words", **kwargs):
        """Convert tokenized text to vector form - the mathematical representation used for modeling."""
        # tee the corpus iterator; only the first copy is consumed here.
        tokenizer_iterators = itertools.tee(self.selected_tokenized_corpus)
        vectorized_corpus = vectorizers.vectorize(tokenizer_iterators[0],
                                                  method=method, **kwargs)
        vectorize_parameter_string = (self.corpus_filter +
                                      self._selected_tokenized_corpus_id +
                                      "_".join([method, _get_parameters_string(**kwargs)]))
        # Store the result and point the internal selection handle at it.
        self.output.vectorized_corpora[vectorize_parameter_string] = vectorized_corpus
        self._selected_vectorized_corpus_id = vectorize_parameter_string

    def run_model(self, model_name="lda", ntopics=3, **kwargs):
        """Analyze vectorized text; determine topics and assign document probabilities."""
        if model_name == "lda" and "tfidf" in self._selected_vectorized_corpus_id:
            raise ValueError("LDA models are incompatible with TF-IDF vectorization. "
                             "If you wish to use TF-IDF vectorization, please select "
                             "another type of model.")
        modeled_corpus = models.run_model(self.selected_vectorized_corpus,
                                          model_name=model_name,
                                          ntopics=ntopics, **kwargs)
        model_id = "_".join([model_name, _get_parameters_string(**kwargs)])
        # Store the result and point the internal selection handle at it.
        self.output.modeled_corpora[model_id] = modeled_corpus
        self._selected_modeled_corpus_id = model_id

    def visualize(self, vis_name="lda_vis", model_id=None, **kwargs):
        """Plot model output."""
        if not model_id:
            modeled_corpus = self.selected_modeled_corpus
        else:
            modeled_corpus = self.output.modeled_corpora[model_id]
        return visualizers.visualize(modeled_corpus, vis_name, **kwargs)
```
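Taken together, these methods form a linear pipeline: each step reads whatever the previous step left selected, generates an id string from its parameters, and stores its result under that id. A hedged end-to-end sketch (the file name, content field, and parameter values are illustrative):

```python
with TopikProject("reviews") as project:
    # Ingest raw documents, hashing on the "text" field.
    project.read_input("./reviews.json", content_field="text")
    project.tokenize(method="simple")
    project.vectorize(method="bag_of_words")
    project.run_model(model_name="lda", ntopics=10)
    project.visualize(vis_name="lda_vis")
# Leaving the with-block saves the project and closes the output.
```

The selection helpers and accessor properties make up the rest of the class: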
```python
    def select_tokenized_corpus(self, _id):
        """Assign the active tokenized corpus.

        When more than one tokenized corpus is available (tokenization was run
        more than once with different methods), this allows you to switch to a
        different data set.
        """
        if _id in self.output.tokenized_corpora:
            self._selected_tokenized_corpus_id = _id
        else:
            raise ValueError("tokenized data {} not found in storage.".format(_id))

    def select_vectorized_corpus(self, _id):
        """Assign the active vectorized corpus.

        When more than one vectorized corpus is available (vectorization was
        run more than once with different methods), this allows you to switch
        to a different data set.
        """
        if _id in self.output.vectorized_corpora:
            self._selected_vectorized_corpus_id = _id
        else:
            raise ValueError("vectorized data {} not found in storage.".format(_id))

    def select_modeled_corpus(self, _id):
        """Assign the active modeled corpus.

        When more than one model output is available (modeling was run more
        than once with different methods), this allows you to switch to a
        different data set.
        """
        if _id in self.output.modeled_corpora:
            self._selected_modeled_corpus_id = _id
        else:
            raise ValueError("model {} not found in storage.".format(_id))

    @property
    def selected_filtered_corpus(self):
        """Corpus documents, potentially a subset.

        Output from the read_input step.
        Input to the tokenization step.
        """
        return self.output.get_filtered_data(field_to_get=self.content_field,
                                             filter=self.corpus_filter)

    @property
    def selected_tokenized_corpus(self):
        """Documents broken into component words. May also be transformed.

        Output from the tokenization and/or transformation steps.
        Input to the vectorization step.
        """
        return self.output.tokenized_corpora[self._selected_tokenized_corpus_id]

    @property
    def selected_vectorized_corpus(self):
        """Data that has been vectorized into term frequencies, TF-IDF, or
        another vector representation.

        Output from the vectorization step.
        Input to the modeling step.
        """
        return self.output.vectorized_corpora[self._selected_vectorized_corpus_id]

    @property
    def selected_modeled_corpus(self):
        """Matrices representing the derived model.

        Output from the modeling step.
        Input to the visualization step.
        """
        return self.output.modeled_corpora[self._selected_modeled_corpus_id]
```
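One use for the selectors is comparing two tokenizations of the same corpus. A minimal sketch; the second tokenizer name is illustrative, and the private `_selected_tokenized_corpus_id` handle is read directly only because the class exposes no public listing of ids (the keys of `project.output.tokenized_corpora` would work as well):

```python
# Each tokenize() run stores its output under a generated id and points
# the internal handle at the newest result.
project.tokenize(method="simple")
first_id = project._selected_tokenized_corpus_id

project.tokenize(method="entities")  # illustrative second method
# vectorize()/run_model() would now consume the "entities" run.

# Switch the active tokenized corpus back to the first run.
project.select_tokenized_corpus(first_id)
```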
The coding style of this project requires that you add a docstring to this code element. An example for methods is shown below. If you would like to know more about docstrings, we recommend reading PEP-257: Docstring Conventions.
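The example itself did not survive extraction; a PEP-257-style sketch for one of the methods above, showing the one-line summary, elaboration, and raised exceptions:

```python
def select_modeled_corpus(self, _id):
    """Assign the active modeled corpus.

    When more than one model output is available, point the project's
    internal handle at the output stored under _id.

    Raises ValueError if _id is not found in storage.
    """
```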