| @@ 104-117 (lines=14) @@ | ||
| 101 | self._model = omikuji.Model.train_on_data(train_path, hyper_param, jobs or None) |
|
| 102 | annif.util.atomic_save_folder(self._model, model_path) |
|
| 103 | ||
| 104 | def _train(self, corpus, params, jobs=0): |
|
| 105 | if corpus != "cached": |
|
| 106 | if corpus.is_empty(): |
|
| 107 | raise NotSupportedException( |
|
| 108 | "Cannot train omikuji project with no documents" |
|
| 109 | ) |
|
| 110 | input = (doc.text for doc in corpus.documents) |
|
| 111 | vecparams = { |
|
| 112 | "min_df": int(params["min_df"]), |
|
| 113 | "tokenizer": self.project.analyzer.tokenize_words, |
|
| 114 | "ngram_range": (1, int(params["ngram"])), |
|
| 115 | } |
|
| 116 | veccorpus = self.create_vectorizer(input, vecparams) |
|
| 117 | self._create_train_file(veccorpus, corpus) |
|
| 118 | else: |
|
| 119 | self.info("Reusing cached training data from previous run.") |
|
| 120 | self._create_model(params, jobs) |
|
| @@ 204-217 (lines=14) @@ | ||
| 201 | ) |
|
| 202 | atomic_save_folder(self._model, model_path) |
|
| 203 | ||
| 204 | def _train(self, corpus, params, jobs=0): |
|
| 205 | if corpus == "cached": |
|
| 206 | self.info("Reusing cached training data from previous run.") |
|
| 207 | else: |
|
| 208 | if corpus.is_empty(): |
|
| 209 | raise NotSupportedException("Cannot train project with no documents") |
|
| 210 | input = (doc.text for doc in corpus.documents) |
|
| 211 | vecparams = { |
|
| 212 | "min_df": int(params["min_df"]), |
|
| 213 | "tokenizer": self.project.analyzer.tokenize_words, |
|
| 214 | "ngram_range": (1, int(params["ngram"])), |
|
| 215 | } |
|
| 216 | veccorpus = self.create_vectorizer(input, vecparams) |
|
| 217 | self._create_train_files(veccorpus, corpus) |
|
| 218 | self._create_model(params, jobs) |
|
| 219 | ||
| 220 | def _suggest(self, text, params): |
|