Passed
Push — master ( c8c370...dee89b )
by Osma
03:14
created

annif.backend.pav.PAVBackend.initialize()   A

Complexity

Conditions 4

Size

Total Lines 15
Code Lines 14

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 14
dl 0
loc 15
rs 9.7
c 0
b 0
f 0
cc 4
nop 1
1
"""PAV ensemble backend that combines results from multiple projects and
2
learns which concept suggestions from each backend are trustworthy using the
3
PAV algorithm, a.k.a. isotonic regression, to turn raw scores returned by
4
individual backends into probabilities."""
5
6
import os.path
7
from sklearn.externals import joblib
8
from sklearn.isotonic import IsotonicRegression
9
import numpy as np
10
import annif.corpus
11
import annif.suggestion
12
import annif.project
13
import annif.util
14
from annif.exception import NotInitializedException
15
from . import ensemble
16
17
18
class PAVBackend(ensemble.EnsembleBackend):
    """PAV ensemble backend that combines results from multiple projects"""
    name = "pav"

    # model files are stored in the data directory as
    # MODEL_FILE_PREFIX + <source project id>
    MODEL_FILE_PREFIX = "pav-model-"

    # defaults for uninitialized instances
    _models = None

    def initialize(self):
        """Load the PAV models of all configured source projects.

        Does nothing when the models have already been loaded. The models
        are collected into a local dict and only published to
        ``self._models`` once every source has been loaded successfully, so
        a failed attempt leaves the backend uninitialized and a later call
        retries (and reports the missing file again) instead of silently
        treating a partially loaded state as initialized.

        Raises:
            NotInitializedException: if the model file of any source
                project does not exist in the data directory.
        """
        if self._models is not None:
            return  # already initialized
        models = {}
        sources = annif.util.parse_sources(self.params['sources'])
        for source_project_id, _ in sources:
            model_filename = self.MODEL_FILE_PREFIX + source_project_id
            path = os.path.join(self.datadir, model_filename)
            if not os.path.exists(path):
                raise NotInitializedException(
                    "PAV model file '{}' not found".format(path),
                    backend_id=self.backend_id)
            self.debug('loading PAV model from {}'.format(path))
            # NOTE(review): joblib.load unpickles the file, so the data
            # directory must only contain trusted model files.
            models[source_project_id] = joblib.load(path)
        # publish only after all models were loaded successfully
        self._models = models

    def _get_model(self, source_project_id):
        """Return the loaded PAV model for the given source project id,
        initializing the backend first if necessary.

        Raises KeyError if the id is not among the loaded sources."""
        self.initialize()
        return self._models[source_project_id]

    def _normalize_hits(self, hits, source_project):
        """Convert the raw scores of the given hits using the PAV model of
        the source project.

        Hits whose URI has a regression model get the score predicted by
        that model; all other hits keep their raw score. The result is a
        ListSuggestionResult with the suggestions sorted by descending
        score."""
        reg_models = self._get_model(source_project.project_id)
        pav_result = []
        for hit in hits.hits:
            if hit.uri in reg_models:
                # predict() takes a sequence of scores; we pass a single one
                score = reg_models[hit.uri].predict([hit.score])[0]
            else:  # default to raw score
                score = hit.score
            pav_result.append(
                annif.suggestion.SubjectSuggestion(
                    uri=hit.uri,
                    label=hit.label,
                    score=score))
        pav_result.sort(key=lambda hit: hit.score, reverse=True)
        return annif.suggestion.ListSuggestionResult(
            pav_result, source_project.subjects)

    @staticmethod
    def _suggest_train_corpus(source_project, corpus):
        """Run the source project on every document of the corpus.

        Returns a tuple (scores, true) of NumPy arrays with one row per
        document: the score vector suggested by the source project and the
        vector of the document's own subjects, respectively."""
        scores = []
        true = []
        for doc in corpus.documents:
            hits = source_project.suggest(doc.text)
            scores.append(hits.vector)
            subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
            true.append(subjects.as_vector(source_project.subjects))
        return np.array(scores), np.array(true)

    def _create_pav_model(self, source_project_id, min_docs, corpus):
        """Train and save the PAV model for a single source project.

        An isotonic regression is fitted separately for each concept of the
        source project, mapping the scores suggested for the training
        documents to their true subject values. Concepts whose column sum
        in the true subject matrix is below min_docs are skipped. The
        resulting dict of regressions is saved into the data directory
        using annif.util.atomic_save with joblib.dump."""
        self.info("creating PAV model for source {}, min_docs={}".format(
            source_project_id, min_docs))
        source_project = annif.project.get_project(source_project_id)
        # suggest subjects for the training corpus
        scores, true = self._suggest_train_corpus(source_project, corpus)
        # create the concept-specific PAV regression models
        pav_regressions = {}
        for cid in range(len(source_project.subjects)):
            if true[:, cid].sum() < min_docs:
                continue  # don't create model b/c of too few examples
            # out_of_bounds='clip': scores outside the range seen during
            # training are clipped to that range when predicting
            reg = IsotonicRegression(out_of_bounds='clip')
            reg.fit(scores[:, cid], true[:, cid])
            # the first element of the subject entry is the key that
            # _normalize_hits looks up via hit.uri
            pav_regressions[source_project.subjects[cid][0]] = reg
        self.info("created PAV model for {} concepts".format(
            len(pav_regressions)))
        model_filename = self.MODEL_FILE_PREFIX + source_project_id
        annif.util.atomic_save(
            pav_regressions,
            self.datadir,
            model_filename,
            method=joblib.dump)

    def train(self, corpus, project):
        """Create and save a PAV model for each source project listed in
        the 'sources' parameter, using the given training corpus and the
        'min-docs' parameter as the per-concept threshold.

        The project argument is not used by this method."""
        self.info("creating PAV models")
        sources = annif.util.parse_sources(self.params['sources'])
        min_docs = int(self.params['min-docs'])
        for source_project_id, _ in sources:
            self._create_pav_model(source_project_id, min_docs, corpus)
104