annif.backend.pav.PAVBackend.initialize() - Code Metrics - Inspection of "Merge pull request #270 from NatLibFi/issue267-cli..." - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( c8c370...dee89b )

by Osma

created 2019-04-17 09:24 UTC

annif.backend.pav.PAVBackend.initialize() A

↳ Parent: annif.backend.pav

Complexity

Conditions

Size

Total Lines	15
Code Lines	14

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
eloc	14
dl	0
loc	15
rs	9.7
c	0
b	0
f	0
cc	4
nop	1

"""PAV ensemble backend that combines results from multiple projects and
learns which concept suggestions from each backend are trustworthy using the
PAV algorithm, a.k.a. isotonic regression, to turn raw scores returned by
individual backends into probabilities."""

import os.path
from sklearn.externals import joblib
from sklearn.isotonic import IsotonicRegression
import numpy as np
import annif.corpus
import annif.suggestion
import annif.project
import annif.util
from annif.exception import NotInitializedException
from . import ensemble


class PAVBackend(ensemble.EnsembleBackend):
    """PAV ensemble backend that combines results from multiple projects.

    For every source project named in the ``sources`` parameter, a dict of
    per-concept isotonic regression models is trained, stored in
    ``self.datadir`` under the file name ``pav-model-<source project id>``
    and later used to re-score the hits returned by that source project.
    """
    name = "pav"

    MODEL_FILE_PREFIX = "pav-model-"

    # defaults for uninitialized instances
    _models = None

    def initialize(self):
        """Load the PAV models of all configured source projects from disk.

        Does nothing when the models have already been loaded.

        Raises:
            NotInitializedException: if the model file of any source
                project does not exist. In that case no models are kept,
                so a later call retries the whole load.
        """
        if self._models is not None:
            return  # already initialized
        # Collect into a local dict and publish it only once every model
        # has been loaded. Assigning self._models up front would leave a
        # partially filled cache behind when a file is missing, and every
        # later call would then be skipped as "already initialized".
        models = {}
        sources = annif.util.parse_sources(self.params['sources'])
        for source_project_id, _ in sources:
            model_filename = self.MODEL_FILE_PREFIX + source_project_id
            path = os.path.join(self.datadir, model_filename)
            if not os.path.exists(path):
                raise NotInitializedException(
                    "PAV model file '{}' not found".format(path),
                    backend_id=self.backend_id)
            self.debug('loading PAV model from {}'.format(path))
            # NOTE(review): joblib.load unpickles the file, so model files
            # must only come from a trusted data directory.
            models[source_project_id] = joblib.load(path)
        self._models = models

    def _get_model(self, source_project_id):
        """Return the dict of regression models for one source project.

        Triggers initialize() first, so NotInitializedException may
        propagate. Raises KeyError if the project id is not among the
        loaded sources.
        """
        self.initialize()
        return self._models[source_project_id]

    def _normalize_hits(self, hits, source_project):
        """Re-score the hits of a source project using its PAV models.

        Each hit whose URI has a regression model gets the model's
        prediction for its raw score; all other hits keep their raw score.
        The result is sorted by descending score and wrapped in a
        ListSuggestionResult together with the source project's subjects.
        """
        reg_models = self._get_model(source_project.project_id)
        pav_result = []
        for hit in hits.hits:
            if hit.uri in reg_models:
                score = reg_models[hit.uri].predict([hit.score])[0]
            else:  # default to raw score
                score = hit.score
            pav_result.append(
                annif.suggestion.SubjectSuggestion(
                    uri=hit.uri,
                    label=hit.label,
                    score=score))
        pav_result.sort(key=lambda hit: hit.score, reverse=True)
        return annif.suggestion.ListSuggestionResult(
            pav_result, source_project.subjects)

    @staticmethod
    def _suggest_train_corpus(source_project, corpus):
        """Collect suggestion scores and gold-standard subjects for a corpus.

        Calls ``source_project.suggest`` on the text of every document and
        returns two numpy arrays ``(scores, true)`` with one row per
        document: the suggestion result's ``vector`` and the document's
        subjects converted with ``as_vector``.
        """
        # presumably both vectors have one column per entry of
        # source_project.subjects; the caller indexes them that way
        scores = []
        true = []
        for doc in corpus.documents:
            hits = source_project.suggest(doc.text)
            scores.append(hits.vector)
            subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
            true.append(subjects.as_vector(source_project.subjects))
        return np.array(scores), np.array(true)

    def _create_pav_model(self, source_project_id, min_docs, corpus):
        """Train and save the PAV models for a single source project.

        One IsotonicRegression is fitted per concept, mapping the source
        project's scores to the gold-standard values. Concepts whose
        gold-standard column sums to less than ``min_docs`` get no model.
        The resulting dict is written to ``self.datadir`` using
        annif.util.atomic_save with joblib.dump.
        """
        self.info("creating PAV model for source {}, min_docs={}".format(
            source_project_id, min_docs))
        source_project = annif.project.get_project(source_project_id)
        # suggest subjects for the training corpus
        scores, true = self._suggest_train_corpus(source_project, corpus)
        # create the concept-specific PAV regression models
        pav_regressions = {}
        for cid in range(len(source_project.subjects)):
            if true[:, cid].sum() < min_docs:
                continue  # don't create model b/c of too few examples
            reg = IsotonicRegression(out_of_bounds='clip')
            reg.fit(scores[:, cid], true[:, cid])
            # keyed by the first element of the subject entry, which
            # _normalize_hits looks up using hit.uri
            pav_regressions[source_project.subjects[cid][0]] = reg
        self.info("created PAV model for {} concepts".format(
            len(pav_regressions)))
        model_filename = self.MODEL_FILE_PREFIX + source_project_id
        annif.util.atomic_save(
            pav_regressions,
            self.datadir,
            model_filename,
            method=joblib.dump)

    def train(self, corpus, project):
        """Create and save PAV models for every configured source project.

        Reads the ``sources`` and ``min-docs`` parameters. The ``project``
        argument is not used by this method.
        """
        self.info("creating PAV models")
        sources = annif.util.parse_sources(self.params['sources'])
        min_docs = int(self.params['min-docs'])
        for source_project_id, _ in sources:
            self._create_pav_model(source_project_id, min_docs, corpus)
        # Drop any previously loaded models so the next initialize() call
        # reloads the freshly written files instead of serving stale ones.
        self._models = None


1			"""PAV ensemble backend that combines results from multiple projects and
2			learns which concept suggestions from each backend are trustworthy using the
3			PAV algorithm, a.k.a. isotonic regression, to turn raw scores returned by
4			individual backends into probabilities."""
5
6			import os.path
7			from sklearn.externals import joblib
8			from sklearn.isotonic import IsotonicRegression
9			import numpy as np
10			import annif.corpus
11			import annif.suggestion
12			import annif.project
13			import annif.util
14			from annif.exception import NotInitializedException
15			from . import ensemble
16
17
18			class PAVBackend(ensemble.EnsembleBackend):
19			"""PAV ensemble backend that combines results from multiple projects"""
20			name = "pav"
21
22			MODEL_FILE_PREFIX = "pav-model-"
23
24			# defaults for uninitialized instances
25			_models = None
26
27			def initialize(self):
28			if self._models is not None:
29			return # already initialized
30			self._models = {}
31			sources = annif.util.parse_sources(self.params['sources'])
32			for source_project_id, _ in sources:
33			model_filename = self.MODEL_FILE_PREFIX + source_project_id
34			path = os.path.join(self.datadir, model_filename)
35			if os.path.exists(path):
36			self.debug('loading PAV model from {}'.format(path))
37			self._models[source_project_id] = joblib.load(path)
38			else:
39			raise NotInitializedException(
40			"PAV model file '{}' not found".format(path),
41			backend_id=self.backend_id)
42
43			def _get_model(self, source_project_id):
44			self.initialize()
45			return self._models[source_project_id]
46
47			def _normalize_hits(self, hits, source_project):
48			reg_models = self._get_model(source_project.project_id)
49			pav_result = []
50			for hit in hits.hits:
51			if hit.uri in reg_models:
52			score = reg_models[hit.uri].predict([hit.score])[0]
53			else: # default to raw score
54			score = hit.score
55			pav_result.append(
56			annif.suggestion.SubjectSuggestion(
57			uri=hit.uri,
58			label=hit.label,
59			score=score))
60			pav_result.sort(key=lambda hit: hit.score, reverse=True)
61			return annif.suggestion.ListSuggestionResult(
62			pav_result, source_project.subjects)
63
64			@staticmethod
65			def _suggest_train_corpus(source_project, corpus):
66			scores = []
67			true = []
68			for doc in corpus.documents:
69			hits = source_project.suggest(doc.text)
70			scores.append(hits.vector)
71			subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
72			true.append(subjects.as_vector(source_project.subjects))
73			return np.array(scores), np.array(true)
74
75			def _create_pav_model(self, source_project_id, min_docs, corpus):
76			self.info("creating PAV model for source {}, min_docs={}".format(
77			source_project_id, min_docs))
78			source_project = annif.project.get_project(source_project_id)
79			# suggest subjects for the training corpus
80			scores, true = self._suggest_train_corpus(source_project, corpus)
81			# create the concept-specific PAV regression models
82			pav_regressions = {}
83			for cid in range(len(source_project.subjects)):
84			if true[:, cid].sum() < min_docs:
85			continue # don't create model b/c of too few examples
86			reg = IsotonicRegression(out_of_bounds='clip')
87			reg.fit(scores[:, cid], true[:, cid])
88			pav_regressions[source_project.subjects[cid][0]] = reg
89			self.info("created PAV model for {} concepts".format(
90			len(pav_regressions)))
91			model_filename = self.MODEL_FILE_PREFIX + source_project_id
92			annif.util.atomic_save(
93			pav_regressions,
94			self.datadir,
95			model_filename,
96			method=joblib.dump)
97
98			def train(self, corpus, project):
99			self.info("creating PAV models")
100			sources = annif.util.parse_sources(self.params['sources'])
101			min_docs = int(self.params['min-docs'])
102			for source_project_id, _ in sources:
103			self._create_pav_model(source_project_id, min_docs, corpus)
104

NatLibFi / Annif

Push — master ( c8c370...dee89b )

annif.backend.pav.PAVBackend.initialize() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like