Passed
Pull Request — master (#486)
by Osma
02:03
created

annif.backend.svc.SVCBackend._train()   A

Complexity

Conditions 4

Size

Total Lines 17
Code Lines 17

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 4
eloc 17
nop 3
dl 0
loc 17
rs 9.55
c 0
b 0
f 0
1
"""Annif backend using a SVM classifier"""
2
3
import os.path
4
import joblib
5
import numpy as np
6
import scipy.special
7
from sklearn.svm import LinearSVC
8
import annif.util
9
from annif.suggestion import SubjectSuggestion, ListSuggestionResult
10
from annif.exception import NotInitializedException, NotSupportedException
11
from . import backend
12
from . import mixins
13
14
15
class SVCBackend(mixins.TfidfVectorizerMixin, backend.AnnifBackend):
    """Support vector classifier backend for Annif

    Trains a scikit-learn ``LinearSVC`` on vectorized document texts, using
    the first subject URI of each training document as its class label, and
    turns the classifier's decision values into 0..1 suggestion scores.
    """

    name = "svc"
    needs_subject_index = True

    # defaults for uninitialized instances
    _model = None

    # file name (inside self.datadir) where the trained classifier is stored
    MODEL_FILE = 'svc-model.gz'

    # backend-specific parameter defaults; merged over the generic backend
    # defaults in default_params()
    DEFAULT_PARAMETERS = {
        'min_df': 1,
        'ngram': 1
    }

    def default_params(self):
        """Return the generic backend defaults overlaid with the SVC ones."""
        params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy()
        params.update(self.DEFAULT_PARAMETERS)
        return params

    def _initialize_model(self):
        """Load the trained classifier from disk unless already loaded.

        Raises:
            NotInitializedException: if the model file does not exist.
        """
        if self._model is not None:
            return
        path = os.path.join(self.datadir, self.MODEL_FILE)
        self.debug('loading model from {}'.format(path))
        if not os.path.exists(path):
            raise NotInitializedException(
                'model {} not found'.format(path),
                backend_id=self.backend_id)
        self._model = joblib.load(path)

    def initialize(self):
        """Load the vectorizer and the classifier needed for suggesting."""
        self.initialize_vectorizer()
        self._initialize_model()

    def _train_classifier(self, veccorpus, classes):
        """Fit a new LinearSVC and save it to MODEL_FILE in the data dir.

        Args:
            veccorpus: vectorized training texts, one row per document.
            classes: class label (subject URI) for each row of veccorpus.
        """
        self.info('creating classifier')
        self._model = LinearSVC()
        self._model.fit(veccorpus, classes)
        annif.util.atomic_save(self._model,
                               self.datadir,
                               self.MODEL_FILE,
                               method=joblib.dump)

    def _corpus_to_texts_and_classes(self, corpus):
        """Collect parallel lists of texts and class labels from a corpus.

        Documents without any subject URI cannot serve as training examples
        for a classifier, so they are skipped instead of raising IndexError.
        For documents with several URIs only the first one is used.
        """
        texts = []
        classes = []
        skipped = 0
        for doc in corpus.documents:
            if not doc.uris:
                skipped += 1
                continue
            texts.append(doc.text)
            classes.append(doc.uris[0])
        if skipped:
            self.info(
                'skipped {} documents with no subjects'.format(skipped))
        return texts, classes

    def _train(self, corpus, params):
        """Train the vectorizer and the classifier on the given corpus.

        Args:
            corpus: document corpus, or the string 'cached' (unsupported).
            params: backend parameters; 'min_df' and 'ngram' are read here.

        Raises:
            NotSupportedException: if cached training data is requested, the
                corpus is empty, or no document has a subject to train on.
        """
        if corpus == 'cached':
            raise NotSupportedException(
                'SVC backend does not support reuse of cached training data.')
        if corpus.is_empty():
            raise NotSupportedException(
                'Cannot train SVC project with no documents')
        texts, classes = self._corpus_to_texts_and_classes(corpus)
        if not texts:
            raise NotSupportedException(
                'Cannot train SVC project with no documents that have '
                'subjects')
        vecparams = {'min_df': int(params['min_df']),
                     'tokenizer': self.project.analyzer.tokenize_words,
                     'ngram_range': (1, int(params['ngram']))}
        veccorpus = self.create_vectorizer(texts, vecparams)
        self._train_classifier(veccorpus, classes)

    def _scores_to_suggestions(self, scores, params):
        """Convert per-class scores into a ListSuggestionResult.

        Args:
            scores: 1-D array where scores[i] belongs to self._model.classes_[i].
            params: backend parameters; 'limit' caps the classes considered.

        The top 'limit' classes by score are examined; classes whose URI is
        not found in the project's subject index are dropped, so the result
        may contain fewer than 'limit' suggestions.
        """
        results = []
        limit = int(params['limit'])
        # argsort is ascending, so reverse it to get the best scores first
        for class_id in np.argsort(scores)[::-1][:limit]:
            class_uri = self._model.classes_[class_id]
            subject_id = self.project.subjects.by_uri(class_uri)
            if subject_id is not None:
                uri, label, notation = self.project.subjects[subject_id]
                results.append(SubjectSuggestion(
                    uri=uri,
                    label=label,
                    notation=notation,
                    score=scores[class_id]))
        return ListSuggestionResult(results)

    def _suggest(self, text, params):
        """Suggest subjects for a text using the trained classifier.

        Returns an empty result when the vectorized text has no non-zero
        features.
        """
        self.debug('Suggesting subjects for text "{}..." (len={})'.format(
            text[:20], len(text)))
        vector = self.vectorizer.transform([text])
        if vector.nnz == 0:  # All zero vector, empty result
            return ListSuggestionResult([])
        decision = np.asarray(self._model.decision_function(vector))
        if decision.ndim == 1:
            # Two-class model: scikit-learn returns a single confidence per
            # sample, for classes_[1]. Expand it to one value per class so
            # that scores line up with classes_ as in the multiclass case.
            confidences = np.array([-decision[0], decision[0]])
        else:
            confidences = decision[0]
        # convert to 0..1 score range using logistic function
        scores = scipy.special.expit(confidences)
        return self._scores_to_suggestions(scores, params)
102