| @@ 209-222 (lines=14) @@ | ||
| 206 | ) |
|
| 207 | atomic_save_folder(self._model, model_path) |
|
| 208 | ||
| 209 | def _train(self, corpus, params, jobs=0): |
|
| 210 | if corpus == 'cached': |
|
| 211 | self.info("Reusing cached training data from previous run.") |
|
| 212 | else: |
|
| 213 | if corpus.is_empty(): |
|
| 214 | raise NotSupportedException( |
|
| 215 | 'Cannot train project with no documents') |
|
| 216 | input = (doc.text for doc in corpus.documents) |
|
| 217 | vecparams = {'min_df': int(params['min_df']), |
|
| 218 | 'tokenizer': self.project.analyzer.tokenize_words, |
|
| 219 | 'ngram_range': (1, int(params['ngram']))} |
|
| 220 | veccorpus = self.create_vectorizer(input, vecparams) |
|
| 221 | self._create_train_files(veccorpus, corpus) |
|
| 222 | self._create_model(params, jobs) |
|
| 223 | ||
| 224 | def _suggest(self, text, params): |
|
| 225 | text = ' '.join(text.split()) |
|
| @@ 102-115 (lines=14) @@ | ||
| 99 | self._model, |
|
| 100 | model_path) |
|
| 101 | ||
| 102 | def _train(self, corpus, params, jobs=0): |
|
| 103 | if corpus != 'cached': |
|
| 104 | if corpus.is_empty(): |
|
| 105 | raise NotSupportedException( |
|
| 106 | 'Cannot train omikuji project with no documents') |
|
| 107 | input = (doc.text for doc in corpus.documents) |
|
| 108 | vecparams = {'min_df': int(params['min_df']), |
|
| 109 | 'tokenizer': self.project.analyzer.tokenize_words, |
|
| 110 | 'ngram_range': (1, int(params['ngram']))} |
|
| 111 | veccorpus = self.create_vectorizer(input, vecparams) |
|
| 112 | self._create_train_file(veccorpus, corpus) |
|
| 113 | else: |
|
| 114 | self.info("Reusing cached training data from previous run.") |
|
| 115 | self._create_model(params, jobs) |
|
| 116 | ||
| 117 | def _suggest(self, text, params): |
|
| 118 | self.debug('Suggesting subjects for text "{}..." (len={})'.format( |
|