Completed: Push to master (4a55a3...cc6dfc) by Osma, 24s (queued 14s)

OmikujiBackend._suggest_batch()    rating: A

Complexity:   Conditions 4
Size:         Total Lines 15, Code Lines 14
Duplication:  Lines 0, Ratio 0 %
Importance:   Changes 0

Metric   Value
cc       4
eloc     14
nop      3
dl       0
loc      15
rs       9.7
c        0
b        0
f        0
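
The complexity and size figures above can be double-checked locally. The report does not name the analyzer it used, so the sketch below assumes radon as a stand-in; it prints per-function cyclomatic complexity (the "Conditions"/"cc" figure) and raw line counts for the module, whose path in the Annif repository is assumed to be annif/backend/omikuji.py.

# Local sanity check of the metrics above, assuming radon as the analyzer
# (the report's own tooling is not named) and an assumed module path.
from radon.complexity import cc_visit
from radon.raw import analyze

with open("annif/backend/omikuji.py") as src:
    code = src.read()

# Cyclomatic complexity per block; methods of OmikujiBackend are reachable
# through the class block's .methods list, so _suggest_batch should report
# a complexity close to the "cc 4" / "Conditions 4" shown above.
for block in cc_visit(code):
    print(block.name, block.complexity)
    for method in getattr(block, "methods", []):
        print(" ", method.name, method.complexity)

# Raw size metrics for the whole module: loc, lloc, sloc, comments, blank, ...
print(analyze(code))

The analyzed module itself follows.
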
"""Annif backend using the Omikuji classifier"""

import os.path
import shutil

import omikuji

import annif.util
from annif.exception import (
    NotInitializedException,
    NotSupportedException,
    OperationFailedException,
)
from annif.suggestion import ListSuggestionResult, SubjectSuggestion

from . import backend, mixins


class OmikujiBackend(mixins.TfidfVectorizerMixin, backend.AnnifBackend):
    """Omikuji based backend for Annif"""

    name = "omikuji"

    # defaults for uninitialized instances
    _model = None

    TRAIN_FILE = "omikuji-train.txt"
    MODEL_FILE = "omikuji-model"

    DEFAULT_PARAMETERS = {
        "min_df": 1,
        "ngram": 1,
        "cluster_balanced": True,
        "cluster_k": 2,
        "max_depth": 20,
        "collapse_every_n_layers": 0,
    }

    def default_params(self):
        params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy()
        params.update(self.DEFAULT_PARAMETERS)
        return params

    def _initialize_model(self):
        if self._model is None:
            path = os.path.join(self.datadir, self.MODEL_FILE)
            self.debug("loading model from {}".format(path))
            if os.path.exists(path):
                try:
                    self._model = omikuji.Model.load(path)
                except RuntimeError:
                    raise OperationFailedException(
                        "Omikuji models trained on Annif versions older than "
                        "0.56 cannot be loaded. Please retrain your project."
                    )
            else:
                raise NotInitializedException(
                    "model {} not found".format(path), backend_id=self.backend_id
                )

    def initialize(self, parallel=False):
        self.initialize_vectorizer()
        self._initialize_model()

    def _create_train_file(self, veccorpus, corpus):
        self.info("creating train file")
        path = os.path.join(self.datadir, self.TRAIN_FILE)
        with open(path, "w", encoding="utf-8") as trainfile:
            # Extreme Classification Repository format header line
            # We don't yet know the number of samples, as some may be skipped
            print(
                "00000000",
                len(self.vectorizer.vocabulary_),
                len(self.project.subjects),
                file=trainfile,
            )
            n_samples = 0
            for doc, vector in zip(corpus.documents, veccorpus):
                subject_ids = [str(subject_id) for subject_id in doc.subject_set]
                feature_values = [
                    "{}:{}".format(col, vector[row, col])
                    for row, col in zip(*vector.nonzero())
                ]
                if not subject_ids or not feature_values:
                    continue  # noqa
                print(",".join(subject_ids), " ".join(feature_values), file=trainfile)
                n_samples += 1
            # replace the number of samples value at the beginning
            trainfile.seek(0)
            print("{:08d}".format(n_samples), end="", file=trainfile)

    def _create_model(self, params, jobs):
        train_path = os.path.join(self.datadir, self.TRAIN_FILE)
        model_path = os.path.join(self.datadir, self.MODEL_FILE)
        hyper_param = omikuji.Model.default_hyper_param()

        hyper_param.cluster_balanced = annif.util.boolean(params["cluster_balanced"])
        hyper_param.cluster_k = int(params["cluster_k"])
        hyper_param.max_depth = int(params["max_depth"])
        hyper_param.collapse_every_n_layers = int(params["collapse_every_n_layers"])

        self._model = omikuji.Model.train_on_data(train_path, hyper_param, jobs or None)
        if os.path.exists(model_path):
            shutil.rmtree(model_path)
        self._model.save(os.path.join(self.datadir, self.MODEL_FILE))

    def _train(self, corpus, params, jobs=0):
        if corpus != "cached":
            if corpus.is_empty():
                raise NotSupportedException(
                    "Cannot train omikuji project with no documents"
                )
            input = (doc.text for doc in corpus.documents)
            vecparams = {
                "min_df": int(params["min_df"]),
                "tokenizer": self.project.analyzer.tokenize_words,
                "ngram_range": (1, int(params["ngram"])),
            }
            veccorpus = self.create_vectorizer(input, vecparams)
            self._create_train_file(veccorpus, corpus)
        else:
            self.info("Reusing cached training data from previous run.")
        self._create_model(params, jobs)

    def _suggest_batch(self, texts, params):
        vector = self.vectorizer.transform(texts)
        limit = int(params["limit"])

        batch_results = []
        for row in vector:
            if row.nnz == 0:  # All zero vector, empty result
                batch_results.append(ListSuggestionResult([]))
                continue
            feature_values = [(col, row[0, col]) for col in row.nonzero()[1]]
            results = []
            for subj_id, score in self._model.predict(feature_values, top_k=limit):
                results.append(SubjectSuggestion(subject_id=subj_id, score=score))
            batch_results.append(ListSuggestionResult(results))
        return batch_results
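
As a reading aid for _create_train_file above: it writes the Extreme Classification Repository text format, a header with the sample, feature and label counts followed by one line per document. The sketch below rebuilds one header and one sample line the same way the method does; the vocabulary size, subject count, subject ids and tf-idf weights are invented for illustration.

# Illustration only: the layout of omikuji-train.txt as written by
# _create_train_file. All numbers here are invented; only the structure matches.
n_samples = 1
n_features = 5000   # stands in for len(self.vectorizer.vocabulary_)
n_subjects = 1500   # stands in for len(self.project.subjects)

# Header line: fixed-width sample count (written as "00000000" first and
# overwritten in place via trainfile.seek(0)), then the two vocabulary sizes.
header = "{:08d} {} {}".format(n_samples, n_features, n_subjects)

# One sample line: comma-separated subject ids, then "column:tf-idf value" pairs.
subject_ids = ["42", "97"]
feature_values = ["3:0.41", "17:0.08"]
sample = "{} {}".format(",".join(subject_ids), " ".join(feature_values))

print(header)   # -> 00000001 5000 1500
print(sample)   # -> 42,97 3:0.41 17:0.08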