annif.backend.svc (rating: A)

Complexity: Total Complexity 17
Size/Duplication: Total Lines 121, Duplicated Lines 0 %
Importance: Changes 0

| Metric | Value |
| --- | --- |
| eloc | 86 |
| dl | 0 |
| loc | 121 |
| rs | 10 |
| c | 0 |
| b | 0 |
| f | 0 |
| wmc | 17 |

7 Methods

| Rating | Name | Duplication | Size | Complexity |
| --- | --- | --- | --- | --- |
| A | SVCBackend._scores_to_suggestions() | 0 | 12 | 3 |
| A | SVCBackend._initialize_model() | 0 | 9 | 3 |
| A | SVCBackend._suggest_batch() | 0 | 13 | 2 |
| A | SVCBackend._train_classifier() | 0 | 6 | 1 |
| A | SVCBackend._corpus_to_texts_and_classes() | 0 | 16 | 4 |
| A | SVCBackend.initialize() | 0 | 3 | 1 |
| A | SVCBackend._train() | 0 | 17 | 3 |
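As a consistency check, the per-method complexities sum to the module-level wmc reported above: 3 + 3 + 2 + 1 + 4 + 1 + 3 = 17. The analyzed source follows.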
```python
"""Annif backend using a SVM classifier"""

from __future__ import annotations

import os.path
from typing import TYPE_CHECKING, Any

import joblib
import numpy as np
import scipy.special
from sklearn.svm import LinearSVC

import annif.util
from annif.exception import NotInitializedException, NotSupportedException
from annif.suggestion import SubjectSuggestion, SuggestionBatch

from . import backend, mixins

if TYPE_CHECKING:
    from scipy.sparse._csr import csr_matrix

    from annif.corpus.document import DocumentCorpus


class SVCBackend(mixins.TfidfVectorizerMixin, backend.AnnifBackend):
    """Support vector classifier backend for Annif"""

    name = "svc"

    # defaults for uninitialized instances
    _model = None

    MODEL_FILE = "svc-model.gz"

    DEFAULT_PARAMETERS = {"min_df": 1, "ngram": 1}

    def _initialize_model(self) -> None:
        if self._model is None:
            path = os.path.join(self.datadir, self.MODEL_FILE)
            self.debug("loading model from {}".format(path))
            if os.path.exists(path):
                self._model = joblib.load(path)
            else:
                raise NotInitializedException(
                    "model {} not found".format(path), backend_id=self.backend_id
                )

    def initialize(self, parallel: bool = False) -> None:
        self.initialize_vectorizer()
        self._initialize_model()

    def _corpus_to_texts_and_classes(
        self, corpus: DocumentCorpus
    ) -> tuple[list[str], list[int]]:
        texts = []
        classes = []
        for doc in corpus.documents:
            if len(doc.subject_set) > 1:
                self.warning(
                    "training on a document with multiple subjects is not "
                    + "supported by SVC; selecting one random subject."
                )
            elif not doc.subject_set:
                continue  # skip documents with no subjects
            texts.append(doc.text)
            classes.append(doc.subject_set[0])
        return texts, classes

    def _train_classifier(self, veccorpus: csr_matrix, classes: list[int]) -> None:
        self.info("creating classifier")
        self._model = LinearSVC(dual="auto")
        self._model.fit(veccorpus, classes)
        annif.util.atomic_save(
            self._model, self.datadir, self.MODEL_FILE, method=joblib.dump
        )

    def _train(
        self, corpus: DocumentCorpus, params: dict[str, Any], jobs: int = 0
    ) -> None:
        if corpus == "cached":
            raise NotSupportedException(
                "SVC backend does not support reuse of cached training data."
            )
        if corpus.is_empty():
            raise NotSupportedException("Cannot train SVC project with no documents")
        texts, classes = self._corpus_to_texts_and_classes(corpus)
        vecparams = {
            "min_df": int(params["min_df"]),
            "tokenizer": self.project.analyzer.tokenize_words,
            "ngram_range": (1, int(params["ngram"])),
        }
        veccorpus = self.create_vectorizer(texts, vecparams)
        self._train_classifier(veccorpus, classes)

    def _scores_to_suggestions(
        self, scores: np.ndarray, params: dict[str, Any]
    ) -> list[SubjectSuggestion]:
        results = []
        limit = int(params["limit"])
        for class_id in np.argsort(scores)[::-1][:limit]:
            subject_id = self._model.classes_[class_id]
            if subject_id is not None:
                results.append(
                    SubjectSuggestion(subject_id=subject_id, score=scores[class_id])
                )
        return results

    def _suggest_batch(
        self, texts: list[str], params: dict[str, Any]
    ) -> SuggestionBatch:
        vector = self.vectorizer.transform(texts)
        confidences = self._model.decision_function(vector)
        # convert to 0..1 score range using logistic function
        scores_list = scipy.special.expit(confidences)
        return SuggestionBatch.from_sequence(
            [
                [] if row.nnz == 0 else self._scores_to_suggestions(scores, params)
                for scores, row in zip(scores_list, vector)
            ],
            self.project.subjects,
        )
```
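For readers unfamiliar with the scoring path, here is a minimal, self-contained sketch of the same pipeline the backend assembles: TF-IDF vectorization, a LinearSVC classifier trained with one subject id per document, decision margins squashed into the 0..1 range with the logistic function, and a top-`limit` cut by descending score. The toy texts, class ids, query, and `limit` value are invented for illustration; only `TfidfVectorizer`, `LinearSVC`, `decision_function`, and `scipy.special.expit` correspond to calls the backend actually makes (`dual="auto"` requires scikit-learn 1.3 or newer).

```python
import numpy as np
import scipy.special
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

# Toy corpus: one subject id per training document (invented for illustration).
texts = ["cats and dogs", "dogs and wolves", "stocks and bonds", "rain and snow"]
classes = [0, 0, 1, 2]

# Mirrors _train: build a TF-IDF vectorizer with min_df/ngram_range parameters.
vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1, 1))
veccorpus = vectorizer.fit_transform(texts)

# Mirrors _train_classifier: fit a linear SVM on the vectorized corpus.
model = LinearSVC(dual="auto")
model.fit(veccorpus, classes)

# Mirrors _suggest_batch and _scores_to_suggestions: signed one-vs-rest
# margins are mapped to 0..1 with the logistic function, then the top
# `limit` classes are taken in descending score order.
vector = vectorizer.transform(["wolves and cats"])
scores = scipy.special.expit(model.decision_function(vector))[0]

limit = 2
for class_id in np.argsort(scores)[::-1][:limit]:
    print(model.classes_[class_id], scores[class_id])
```

With three or more classes, `decision_function` returns one margin per class, which is what `_scores_to_suggestions` indexes into; in Annif itself the resulting class ids are subject ids resolved against `self.project.subjects`. Note that the logistic transform is a monotonic rescaling of the SVM margins, not a calibrated probability.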