@@ 103-116 (lines=14) @@

100              model_path,
101              None)
102
103      def _train(self, corpus, params, jobs=0):
104          if corpus != 'cached':
105              if corpus.is_empty():
106                  raise NotSupportedException(
107                      'Cannot train omikuji project with no documents')
108              input = (doc.text for doc in corpus.documents)
109              vecparams = {'min_df': int(params['min_df']),
110                           'tokenizer': self.project.analyzer.tokenize_words,
111                           'ngram_range': (1, int(params['ngram']))}
112              veccorpus = self.create_vectorizer(input, vecparams)
113              self._create_train_file(veccorpus, corpus)
114          else:
115              self.info("Reusing cached training data from previous run.")
116          self._create_model(params, jobs)
117
118      def _suggest(self, text, params):
119          self.debug('Suggesting subjects for text "{}..." (len={})'.format(
@@ 208-221 (lines=14) @@

205          )
206          atomic_save(self._model, model_path, None)
207
208      def _train(self, corpus, params, jobs=0):
209          if corpus == 'cached':
210              self.info("Reusing cached training data from previous run.")
211          else:
212              if corpus.is_empty():
213                  raise NotSupportedException(
214                      'Cannot train project with no documents')
215              input = (doc.text for doc in corpus.documents)
216              vecparams = {'min_df': int(params['min_df']),
217                           'tokenizer': self.project.analyzer.tokenize_words,
218                           'ngram_range': (1, int(params['ngram']))}
219              veccorpus = self.create_vectorizer(input, vecparams)
220              self._create_train_files(veccorpus, corpus)
221          self._create_model(params, jobs)
222
223      def _suggest(self, text, params):
224          text = ' '.join(text.split())
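Both fragments follow the same `_train` pattern: when the corpus is `'cached'` the preprocessing step is skipped, otherwise the documents are vectorized with the project analyzer and written out as training data before `_create_model` builds the model. A minimal sketch of how that shared logic could be hoisted into one place is shown below; the mixin name and the `_create_train_data` hook are hypothetical, and the sketch simply reuses the helpers visible in the fragments above (`NotSupportedException`, `create_vectorizer`, `_create_model`, `self.info`), so it is an illustration of the duplication, not the project's actual refactoring.

```python
# Hypothetical sketch only: a mixin capturing the _train logic that both
# fragments duplicate. The class and hook names are assumptions; the helper
# methods it calls are the ones already used by the duplicated code.
class CachedVectorizerTrainMixin:

    def _create_train_data(self, veccorpus, corpus):
        # Each backend overrides this to write its own train file(s),
        # e.g. _create_train_file vs. _create_train_files above.
        raise NotImplementedError

    def _train(self, corpus, params, jobs=0):
        if corpus == 'cached':
            self.info("Reusing cached training data from previous run.")
        else:
            if corpus.is_empty():
                raise NotSupportedException(
                    'Cannot train a project with no documents')
            texts = (doc.text for doc in corpus.documents)
            vecparams = {'min_df': int(params['min_df']),
                         'tokenizer': self.project.analyzer.tokenize_words,
                         'ngram_range': (1, int(params['ngram']))}
            veccorpus = self.create_vectorizer(texts, vecparams)
            self._create_train_data(veccorpus, corpus)
        self._create_model(params, jobs)
```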