Code Duplication    Length = 20-22 lines in 2 locations

annif/backend/omikuji.py 1 location

@@ 108-129 (lines=22) @@
        self._model = omikuji.Model.train_on_data(train_path, hyper_param, jobs or None)
        annif.util.atomic_save_folder(self._model, model_path)

    def _train(
        self,
        corpus: DocumentCorpus,
        params: dict[str, Any],
        jobs: int = 0,
    ) -> None:
        if corpus != "cached":
            if corpus.is_empty():
                raise NotSupportedException(
                    "Cannot train omikuji project with no documents"
                )
            input = (doc.text for doc in corpus.documents)
            vecparams = {
                "min_df": int(params["min_df"]),
                "tokenizer": self.project.analyzer.tokenize_words,
                "ngram_range": (1, int(params["ngram"])),
            }
            veccorpus = self.create_vectorizer(input, vecparams)
            self._create_train_file(veccorpus, corpus)
        else:
            self.info("Reusing cached training data from previous run.")
        self._create_model(params, jobs)

    def _suggest_batch(
        self, texts: list[str], params: dict[str, Any]

annif/backend/xtransformer.py 1 location

@@ 208-227 (lines=20) @@
        )
        atomic_save_folder(self._model, model_path)

    def _train(
        self,
        corpus: DocumentCorpus,
        params: dict[str, Any],
        jobs: int = 0,
    ) -> None:
        if corpus == "cached":
            self.info("Reusing cached training data from previous run.")
        else:
            if corpus.is_empty():
                raise NotSupportedException("Cannot train xtransformer project with no documents")
            input = (doc.text for doc in corpus.documents)
            vecparams = {
                "min_df": int(params["min_df"]),
                "tokenizer": self.project.analyzer.tokenize_words,
                "ngram_range": (1, int(params["ngram"])),
            }
            veccorpus = self.create_vectorizer(input, vecparams)
            self._create_train_files(veccorpus, corpus)
        self._create_model(params, jobs)

    def _suggest_batch(
        self, texts: list[str], params: dict[str, Any]
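
The two _train implementations are identical except for the order of the cached-corpus check, the wording of the error message, and the helper each calls to write training data (_create_train_file vs _create_train_files). One way to remove the duplication is to hoist the shared body into a common mixin and leave a per-backend hook. The sketch below is a minimal illustration of that idea, not Annif's actual API: the mixin name VectorizedTrainMixin and the hook _create_train_data are hypothetical, the import paths should be checked against the real module layout, and the f-string assumes each backend exposes its identifier as a name attribute.

    from typing import Any

    from annif.corpus.document import DocumentCorpus  # import path assumed
    from annif.exception import NotSupportedException  # import path assumed


    class VectorizedTrainMixin:
        """Hypothetical mixin holding the _train body shared by both backends."""

        def _create_train_data(self, veccorpus, corpus) -> None:
            # Per-backend hook: omikuji would delegate to _create_train_file,
            # xtransformer to _create_train_files.
            raise NotImplementedError

        def _train(
            self,
            corpus: DocumentCorpus,
            params: dict[str, Any],
            jobs: int = 0,
        ) -> None:
            if corpus == "cached":
                self.info("Reusing cached training data from previous run.")
            else:
                if corpus.is_empty():
                    raise NotSupportedException(
                        # assumes the backend defines a `name` attribute
                        f"Cannot train {self.name} project with no documents"
                    )
                input = (doc.text for doc in corpus.documents)
                vecparams = {
                    "min_df": int(params["min_df"]),
                    "tokenizer": self.project.analyzer.tokenize_words,
                    "ngram_range": (1, int(params["ngram"])),
                }
                veccorpus = self.create_vectorizer(input, vecparams)
                self._create_train_data(veccorpus, corpus)
            self._create_model(params, jobs)

Each backend would then keep its own _create_model and shrink its training code to a one-line _create_train_data override, eliminating both duplicated blocks reported above.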