Passed
Pull Request — master (#447)
by
unknown
02:34
created

annif.backend.pav.PAVBackend.default_params()   A

Complexity

Conditions 1

Size

Total Lines 4
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 4
dl 0
loc 4
rs 10
c 0
b 0
f 0
cc 1
nop 1
1
"""PAV ensemble backend that combines results from multiple projects and
learns which concept suggestions from each backend are trustworthy using the
PAV algorithm, a.k.a. isotonic regression, to turn raw scores returned by
individual backends into probabilities."""
5
6
import os.path
7
import joblib
8
from scipy.sparse import coo_matrix, csc_matrix
9
from sklearn.isotonic import IsotonicRegression
10
import numpy as np
11
import annif.corpus
12
import annif.suggestion
13
import annif.util
14
from annif.exception import NotInitializedException, NotSupportedException
15
from . import backend
16
from . import ensemble
17
18
19
class PAVBackend(ensemble.BaseEnsembleBackend):
    """PAV ensemble backend that combines results from multiple projects"""

    name = "pav"

    # each source project's model is stored in the data directory under
    # this prefix followed by the source project id
    MODEL_FILE_PREFIX = "pav-model-"

    # defaults for uninitialized instances
    _models = None

    DEFAULT_PARAMETERS = {'min-docs': 10}

    def default_params(self):
        """Return the generic backend default parameters overlaid with the
        PAV specific defaults (a fresh dict, safe for the caller to
        modify)."""
        params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy()
        params.update(self.DEFAULT_PARAMETERS)
        return params

    def initialize(self):
        """Load the stored PAV models of all configured source projects.

        Does nothing when the models have already been loaded. Raises
        NotInitializedException if the model file of any source project
        is missing.
        """
        super().initialize()
        if self._models is not None:
            return  # already initialized
        # Collect the models in a local dict and publish it only after every
        # source has been loaded. Otherwise a failure part-way would leave
        # self._models as an incomplete dict, later calls would take the
        # "already initialized" shortcut and _get_model() would fail with a
        # bare KeyError instead of NotInitializedException.
        models = {}
        sources = annif.util.parse_sources(self.params['sources'])
        for source_project_id, _ in sources:
            model_filename = self.MODEL_FILE_PREFIX + source_project_id
            path = os.path.join(self.datadir, model_filename)
            if not os.path.exists(path):
                raise NotInitializedException(
                    "PAV model file '{}' not found".format(path),
                    backend_id=self.backend_id)
            self.debug('loading PAV model from {}'.format(path))
            # NOTE: joblib.load unpickles the file, so model files must
            # come from a trusted data directory
            models[source_project_id] = joblib.load(path)
        self._models = models

    def _get_model(self, source_project_id):
        """Return the PAV models loaded for the given source project,
        initializing the backend first if necessary."""
        self.initialize()
        return self._models[source_project_id]

    def _normalize_hits(self, hits, source_project):
        """Turn the raw scores of the hits from a source project into
        PAV-adjusted scores.

        Hits whose URI has a regression model get the model's prediction
        for their raw score; all other hits keep the raw score. Returns a
        ListSuggestionResult ordered by descending score.
        """
        reg_models = self._get_model(source_project.project_id)
        pav_result = []
        for hit in hits.as_list(source_project.subjects):
            if hit.uri in reg_models:
                score = reg_models[hit.uri].predict([hit.score])[0]
            else:  # default to raw score
                score = hit.score
            pav_result.append(
                annif.suggestion.SubjectSuggestion(
                    uri=hit.uri,
                    label=hit.label,
                    notation=hit.notation,
                    score=score))
        pav_result.sort(key=lambda hit: hit.score, reverse=True)
        return annif.suggestion.ListSuggestionResult(pav_result)

    @staticmethod
    def _suggest_train_corpus(source_project, corpus):
        """Run the source project on every document of the corpus.

        Returns a tuple (scores, true) of CSC sparse matrices, both of
        shape (number of documents, number of subjects): a float32 matrix
        of the nonzero suggestion scores and a boolean matrix marking the
        subjects assigned to each document in the corpus.
        """
        # lists for constructing score matrix
        data, row, col = [], [], []
        # lists for constructing true label matrix
        trow, tcol = [], []

        ndocs = 0
        for docid, doc in enumerate(corpus.documents):
            hits = source_project.suggest(doc.text)
            vector = hits.as_vector(source_project.subjects)
            for cid in np.flatnonzero(vector):
                data.append(vector[cid])
                row.append(docid)
                col.append(cid)
            subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
            for cid in np.flatnonzero(
                    subjects.as_vector(source_project.subjects)):
                trow.append(docid)
                tcol.append(cid)
            ndocs += 1
        scores = coo_matrix((data, (row, col)),
                            shape=(ndocs, len(source_project.subjects)),
                            dtype=np.float32)
        # use the builtin bool: the np.bool alias was deprecated in
        # NumPy 1.20 and removed in NumPy 1.24
        true = coo_matrix((np.ones(len(trow), dtype=bool), (trow, tcol)),
                          shape=(ndocs, len(source_project.subjects)),
                          dtype=bool)
        return csc_matrix(scores), csc_matrix(true)

    def _create_pav_model(self, source_project_id, min_docs, corpus):
        """Fit and store the PAV models for one source project.

        An isotonic regression is fitted for every subject that is
        assigned to at least min_docs documents of the corpus; subjects
        with fewer examples get no model. The resulting dict of models is
        saved in the data directory.
        """
        self.info("creating PAV model for source {}, min_docs={}".format(
            source_project_id, min_docs))
        source_project = self.project.registry.get_project(source_project_id)
        # suggest subjects for the training corpus
        scores, true = self._suggest_train_corpus(source_project, corpus)
        # create the concept-specific PAV regression models
        pav_regressions = {}
        for cid in range(len(source_project.subjects)):
            if true[:, cid].sum() < min_docs:
                continue  # don't create model b/c of too few examples
            reg = IsotonicRegression(out_of_bounds='clip')
            cid_scores = scores[:, cid].toarray().flatten().astype(np.float64)
            reg.fit(cid_scores, true[:, cid].toarray().flatten())
            # keyed by the first element of the subject entry; presumably
            # the subject URI, since _normalize_hits looks models up by
            # hit.uri -- TODO confirm against the subject index
            pav_regressions[source_project.subjects[cid][0]] = reg
        self.info("created PAV model for {} concepts".format(
            len(pav_regressions)))
        model_filename = self.MODEL_FILE_PREFIX + source_project_id
        annif.util.atomic_save(
            pav_regressions,
            self.datadir,
            model_filename,
            method=joblib.dump)

    def _train(self, corpus, params):
        """Create the PAV models of all configured source projects.

        Raises NotSupportedException when asked to train from cached data
        or when the corpus has no documents.
        """
        if corpus == 'cached':
            raise NotSupportedException(
                'Training pav project from cached data not supported.')
        if corpus.is_empty():
            raise NotSupportedException('training backend {} with no documents'
                                        .format(self.backend_id))
        self.info("creating PAV models")
        sources = annif.util.parse_sources(self.params['sources'])
        min_docs = int(params['min-docs'])
        for source_project_id, _ in sources:
            self._create_pav_model(source_project_id, min_docs, corpus)
139