Code Duplication    Length = 20-22 lines in 2 locations

annif/backend/xtransformer.py 1 location

@@ 208-227 (lines=20) @@
205
        )
206
        atomic_save_folder(self._model, model_path)
207
208
    def _train(
209
        self,
210
        corpus: DocumentCorpus,
211
        params: dict[str, Any],
212
        jobs: int = 0,
213
    ) -> None:
214
        if corpus == "cached":
215
            self.info("Reusing cached training data from previous run.")
216
        else:
217
            if corpus.is_empty():
218
                raise NotSupportedException("Cannot train project with no documents")
219
            input = (doc.text for doc in corpus.documents)
220
            vecparams = {
221
                "min_df": int(params["min_df"]),
222
                "tokenizer": self.project.analyzer.tokenize_words,
223
                "ngram_range": (1, int(params["ngram"])),
224
            }
225
            veccorpus = self.create_vectorizer(input, vecparams)
226
            self._create_train_files(veccorpus, corpus)
227
        self._create_model(params, jobs)
228
229
    def _suggest_batch(
230
        self, texts: list[str], params: dict[str, Any]

annif/backend/omikuji.py 1 location

@@ 107-128 (lines=22) @@
104
        self._model = omikuji.Model.train_on_data(train_path, hyper_param, jobs or None)
105
        annif.util.atomic_save_folder(self._model, model_path)
106
107
    def _train(
108
        self,
109
        corpus: DocumentCorpus,
110
        params: dict[str, Any],
111
        jobs: int = 0,
112
    ) -> None:
113
        if corpus != "cached":
114
            if corpus.is_empty():
115
                raise NotSupportedException(
116
                    "Cannot train omikuji project with no documents"
117
                )
118
            input = (doc.text for doc in corpus.documents)
119
            vecparams = {
120
                "min_df": int(params["min_df"]),
121
                "tokenizer": self.project.analyzer.tokenize_words,
122
                "ngram_range": (1, int(params["ngram"])),
123
            }
124
            veccorpus = self.create_vectorizer(input, vecparams)
125
            self._create_train_file(veccorpus, corpus)
126
        else:
127
            self.info("Reusing cached training data from previous run.")
128
        self._create_model(params, jobs)
129
130
    def _suggest_batch(
131
        self, texts: list[str], params: dict[str, Any]