Passed
Push — issue703-python-3.11-support (f59527...05d52a)
by Juho
04:06 queued 14s

annif.backend.svc (Rating: A)

Complexity

Total Complexity 17

Size/Duplication

Total Lines 120
Duplicated Lines 0%

Importance

Changes 0
Metric                             Value
eloc (executable lines of code)    86
dl (duplicated lines)              0
loc (lines of code)                120
rs                                 10
c                                  0
b                                  0
f                                  0
wmc (weighted methods per class)   17

7 Methods

Rating  Name                                        Duplication  Size  Complexity
A       SVCBackend._scores_to_suggestions()         0            12    3
A       SVCBackend._initialize_model()              0            9     3
A       SVCBackend._suggest_batch()                 0            13    2
A       SVCBackend._train_classifier()              0            6     1
A       SVCBackend._corpus_to_texts_and_classes()   0            16    4
A       SVCBackend.initialize()                     0            3     1
A       SVCBackend._train()                         0            17    3
"""Annif backend using an SVM classifier"""

from __future__ import annotations

import os.path
from typing import TYPE_CHECKING, Any

import joblib
import numpy as np
import scipy.special
from sklearn.svm import LinearSVC

import annif.util
from annif.exception import NotInitializedException, NotSupportedException
from annif.suggestion import SubjectSuggestion, SuggestionBatch

from . import backend, mixins

if TYPE_CHECKING:
    from scipy.sparse._csr import csr_matrix

    from annif.corpus.document import DocumentCorpus


class SVCBackend(mixins.TfidfVectorizerMixin, backend.AnnifBackend):
    """Support vector classifier backend for Annif"""

    name = "svc"

    # defaults for uninitialized instances
    _model = None

    MODEL_FILE = "svc-model.gz"

    DEFAULT_PARAMETERS = {"min_df": 1, "ngram": 1}

    def _initialize_model(self) -> None:
        if self._model is None:
            path = os.path.join(self.datadir, self.MODEL_FILE)
            self.debug("loading model from {}".format(path))
            if os.path.exists(path):
                self._model = joblib.load(path)
            else:
                raise NotInitializedException(
                    "model {} not found".format(path), backend_id=self.backend_id
                )

    def initialize(self, parallel: bool = False) -> None:
        self.initialize_vectorizer()
        self._initialize_model()

    def _corpus_to_texts_and_classes(
        self, corpus: DocumentCorpus
    ) -> tuple[list[str], list[int]]:
        texts = []
        classes = []
        for doc in corpus.documents:
            if len(doc.subject_set) > 1:
                self.warning(
                    "training on a document with multiple subjects is not "
                    + "supported by SVC; selecting one random subject."
                )
            elif not doc.subject_set:
                continue  # skip documents with no subjects
            texts.append(doc.text)
            classes.append(doc.subject_set[0])
        return texts, classes

    def _train_classifier(self, veccorpus: csr_matrix, classes: list[int]) -> None:
        self.info("creating classifier")
        self._model = LinearSVC()
        self._model.fit(veccorpus, classes)
        annif.util.atomic_save(
            self._model, self.datadir, self.MODEL_FILE, method=joblib.dump
        )

    def _train(
        self, corpus: DocumentCorpus, params: dict[str, Any], jobs: int = 0
    ) -> None:
        if corpus == "cached":
            raise NotSupportedException(
                "SVC backend does not support reuse of cached training data."
            )
        if corpus.is_empty():
            raise NotSupportedException("Cannot train SVC project with no documents")
        texts, classes = self._corpus_to_texts_and_classes(corpus)
        vecparams = {
            "min_df": int(params["min_df"]),
            "tokenizer": self.project.analyzer.tokenize_words,
            "ngram_range": (1, int(params["ngram"])),
        }
        veccorpus = self.create_vectorizer(texts, vecparams)
        self._train_classifier(veccorpus, classes)

    def _scores_to_suggestions(
        self, scores: np.ndarray, params: dict[str, Any]
    ) -> list[SubjectSuggestion]:
        results = []
        limit = int(params["limit"])
        for class_id in np.argsort(scores)[::-1][:limit]:
            subject_id = self._model.classes_[class_id]
            if subject_id is not None:
                results.append(
                    SubjectSuggestion(subject_id=subject_id, score=scores[class_id])
                )
        return results

    def _suggest_batch(
        self, texts: list[str], params: dict[str, Any]
    ) -> SuggestionBatch:
        vector = self.vectorizer.transform(texts)
        confidences = self._model.decision_function(vector)
        # convert to 0..1 score range using logistic function
        scores_list = scipy.special.expit(confidences)
        return SuggestionBatch.from_sequence(
            [
                [] if row.nnz == 0 else self._scores_to_suggestions(scores, params)
                for scores, row in zip(scores_list, vector)
            ],
            self.project.subjects,
        )
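
The scoring path above is compact, so here is a minimal, self-contained sketch of the same train-and-suggest flow: TF-IDF vectorization driven by the min_df and ngram parameters, a LinearSVC fitted on one class per document, and the unbounded decision margins squashed into the 0..1 range with the logistic function before the top "limit" classes are picked, mirroring _train, _suggest_batch and _scores_to_suggestions. This is not part of annif.backend.svc; all texts, class ids and parameter values below are made up for illustration.

# Illustration only, not Annif code: the SVC backend's flow in plain scikit-learn.
import numpy as np
import scipy.special
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

texts = ["cats and dogs", "wolves hunt deer", "stocks and bonds", "bonds and rates"]
classes = [0, 0, 1, 2]  # one subject id per document, as in _corpus_to_texts_and_classes

# mirrors create_vectorizer(): parameter values arrive as strings from the config
params = {"min_df": "1", "ngram": "1", "limit": "2"}
vectorizer = TfidfVectorizer(
    min_df=int(params["min_df"]), ngram_range=(1, int(params["ngram"]))
)
veccorpus = vectorizer.fit_transform(texts)

model = LinearSVC().fit(veccorpus, classes)  # as in _train_classifier

# mirrors _suggest_batch / _scores_to_suggestions for a single input text
vector = vectorizer.transform(["wolves and cats"])
confidences = model.decision_function(vector)  # unbounded SVM margins
scores = scipy.special.expit(confidences)[0]   # logistic maps them into 0..1

for class_id in np.argsort(scores)[::-1][: int(params["limit"])]:
    print(model.classes_[class_id], scores[class_id])

Since scipy.special.expit is monotonic, the logistic squashing changes the score range but never the ranking; it only turns the raw margins into values usable as 0..1 confidence scores.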
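
For completeness, a hypothetical sketch of the persistence round trip: _train_classifier hands the fitted model to annif.util.atomic_save with joblib.dump as the writer, and _initialize_model lazily reloads it with joblib.load, raising NotInitializedException when the file is missing. The plain joblib calls below stand in for that flow; the file name mirrors MODEL_FILE.

# Illustration only: the save/load pair behind MODEL_FILE.
import joblib
from sklearn.svm import LinearSVC

model = LinearSVC()  # assume fitted, as in the sketch above
joblib.dump(model, "svc-model.gz")      # the .gz suffix makes joblib compress the file
restored = joblib.load("svc-model.gz")  # what _initialize_model does on first use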