annif.backend.pav.PAVBackend.default_params() - Code Metrics - Inspection of "Fix missing default params in pav and nn ensemble" - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Pull Request — master (#447)

unknown

created 2020-10-29 15:11 UTC

annif.backend.pav.PAVBackend.default_params() A

↳ Parent: annif.backend.pav

Complexity

Conditions

Size

Total Lines	4
Code Lines	4

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
eloc	4
dl	0
loc	4
rs	10
c	0
b	0
f	0
cc	1
nop	1

"""PAV ensemble backend that combines results from multiple projects and
learns which concept suggestions from each backend are trustworthy using the
PAV algorithm, a.k.a. isotonic regression, to turn raw scores returned by
individual backends into probabilities."""

import os.path
import joblib
from scipy.sparse import coo_matrix, csc_matrix
from sklearn.isotonic import IsotonicRegression
import numpy as np
import annif.corpus
import annif.suggestion
import annif.util
from annif.exception import NotInitializedException, NotSupportedException
from . import backend
from . import ensemble


class PAVBackend(ensemble.BaseEnsembleBackend):
    """PAV ensemble backend that combines results from multiple projects.

    For every source project a dictionary of per-concept isotonic regression
    models (keyed by concept URI) is trained and stored in the data directory
    under the file name ``MODEL_FILE_PREFIX + source_project_id``. At suggest
    time the raw scores of each source project are passed through the
    matching regression model; concepts without a model keep their raw
    score.
    """
    name = "pav"

    MODEL_FILE_PREFIX = "pav-model-"

    # defaults for uninitialized instances
    _models = None

    # 'min-docs': a concept needs at least this many training documents in
    # which it is a true label before a regression model is fitted for it
    DEFAULT_PARAMETERS = {'min-docs': 10}

    def default_params(self):
        """Return the default parameters of this backend.

        The result is a fresh copy of the generic AnnifBackend defaults
        updated with the PAV-specific DEFAULT_PARAMETERS, so callers may
        modify it freely.
        """
        params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy()
        params.update(self.DEFAULT_PARAMETERS)
        return params

    def initialize(self):
        """Load the stored PAV models of all source projects.

        Does nothing if the models have already been loaded. Raises
        NotInitializedException if the model file of any source project is
        missing; in that case the backend stays uninitialized so that a
        later call (e.g. after training) retries the loading.
        """
        super().initialize()
        if self._models is not None:
            return  # already initialized
        # Collect into a local dict and publish it only when every model has
        # been loaded. Assigning self._models up front would make a failed
        # attempt look "already initialized" on the next call.
        models = {}
        sources = annif.util.parse_sources(self.params['sources'])
        for source_project_id, _ in sources:
            model_filename = self.MODEL_FILE_PREFIX + source_project_id
            path = os.path.join(self.datadir, model_filename)
            if not os.path.exists(path):
                raise NotInitializedException(
                    "PAV model file '{}' not found".format(path),
                    backend_id=self.backend_id)
            self.debug('loading PAV model from {}'.format(path))
            # NOTE: joblib.load unpickles the file; model files must come
            # from a trusted data directory.
            models[source_project_id] = joblib.load(path)
        self._models = models

    def _get_model(self, source_project_id):
        """Return the dict of regression models for the given source project,
        initializing the backend first if necessary."""
        self.initialize()
        return self._models[source_project_id]

    def _normalize_hits(self, hits, source_project):
        """Turn the raw scores of a source project into PAV-adjusted scores.

        Each hit whose URI has a regression model gets the score predicted
        by that model; other hits keep their raw score. Returns a
        ListSuggestionResult sorted by descending score.
        """
        reg_models = self._get_model(source_project.project_id)
        pav_result = []
        for hit in hits.as_list(source_project.subjects):
            if hit.uri in reg_models:
                # predict() takes a sequence of scores; we pass one value
                score = reg_models[hit.uri].predict([hit.score])[0]
            else:  # default to raw score
                score = hit.score
            pav_result.append(
                annif.suggestion.SubjectSuggestion(
                    uri=hit.uri,
                    label=hit.label,
                    notation=hit.notation,
                    score=score))
        pav_result.sort(key=lambda hit: hit.score, reverse=True)
        return annif.suggestion.ListSuggestionResult(pav_result)

    @staticmethod
    def _suggest_train_corpus(source_project, corpus):
        """Run the source project on every document of the corpus.

        Returns a tuple ``(scores, true)`` of CSC sparse matrices with one
        row per document and one column per subject of the source project:
        ``scores`` (float32) holds the nonzero suggestion scores and
        ``true`` (bool) marks the subjects assigned to each document.
        """
        # lists for constructing score matrix
        data, row, col = [], [], []
        # lists for constructing true label matrix
        trow, tcol = [], []

        ndocs = 0
        for docid, doc in enumerate(corpus.documents):
            hits = source_project.suggest(doc.text)
            vector = hits.as_vector(source_project.subjects)
            for cid in np.flatnonzero(vector):
                data.append(vector[cid])
                row.append(docid)
                col.append(cid)
            subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
            for cid in np.flatnonzero(
                    subjects.as_vector(source_project.subjects)):
                trow.append(docid)
                tcol.append(cid)
            ndocs += 1
        nsubjects = len(source_project.subjects)
        scores = coo_matrix((data, (row, col)),
                            shape=(ndocs, nsubjects),
                            dtype=np.float32)
        # Use the builtin bool: the np.bool alias was deprecated in
        # NumPy 1.20 and removed in NumPy 1.24.
        true = coo_matrix((np.ones(len(trow), dtype=bool), (trow, tcol)),
                          shape=(ndocs, nsubjects),
                          dtype=bool)
        # CSC format makes the per-concept column slicing done by the
        # caller efficient
        return csc_matrix(scores), csc_matrix(true)

    def _create_pav_model(self, source_project_id, min_docs, corpus):
        """Train and save the PAV models for one source project.

        An isotonic regression is fitted for every concept that is a true
        label in at least min_docs documents of the corpus. The resulting
        dict (concept URI -> regression model) is saved atomically into the
        data directory.
        """
        self.info("creating PAV model for source {}, min_docs={}".format(
            source_project_id, min_docs))
        source_project = self.project.registry.get_project(source_project_id)
        # suggest subjects for the training corpus
        scores, true = self._suggest_train_corpus(source_project, corpus)
        # create the concept-specific PAV regression models
        pav_regressions = {}
        for cid in range(len(source_project.subjects)):
            if true[:, cid].sum() < min_docs:
                continue  # don't create model b/c of too few examples
            # 'clip' maps scores outside the training range to the nearest
            # fitted value instead of producing NaN
            reg = IsotonicRegression(out_of_bounds='clip')
            cid_scores = scores[:, cid].toarray().flatten().astype(np.float64)
            reg.fit(cid_scores, true[:, cid].toarray().flatten())
            # the first element of a subject entry is used as the key; it is
            # looked up by hit.uri in _normalize_hits
            pav_regressions[source_project.subjects[cid][0]] = reg
        self.info("created PAV model for {} concepts".format(
            len(pav_regressions)))
        model_filename = self.MODEL_FILE_PREFIX + source_project_id
        annif.util.atomic_save(
            pav_regressions,
            self.datadir,
            model_filename,
            method=joblib.dump)

    def _train(self, corpus, params):
        """Train PAV models for all source projects of this ensemble.

        Raises NotSupportedException if corpus is the string 'cached' or if
        the corpus is empty. The 'min-docs' value is read from params and
        converted to int.
        """
        if corpus == 'cached':
            raise NotSupportedException(
                'Training pav project from cached data not supported.')
        if corpus.is_empty():
            raise NotSupportedException('training backend {} with no documents'
                                        .format(self.backend_id))
        self.info("creating PAV models")
        sources = annif.util.parse_sources(self.params['sources'])
        min_docs = int(params['min-docs'])
        for source_project_id, _ in sources:
            self._create_pav_model(source_project_id, min_docs, corpus)


1			"""PAV ensemble backend that combines results from multiple projects and
2			learns which concept suggestions from each backend are trustworthy using the
3			PAV algorithm, a.k.a. isotonic regression, to turn raw scores returned by
4			individual backends into probabilities."""
5
6			import os.path
7			import joblib
8			from scipy.sparse import coo_matrix, csc_matrix
9			from sklearn.isotonic import IsotonicRegression
10			import numpy as np
11			import annif.corpus
12			import annif.suggestion
13			import annif.util
14			from annif.exception import NotInitializedException, NotSupportedException
15			from . import backend
16			from . import ensemble
17
18
19			class PAVBackend(ensemble.BaseEnsembleBackend):
20			"""PAV ensemble backend that combines results from multiple projects"""
21			name = "pav"
22
23			MODEL_FILE_PREFIX = "pav-model-"
24
25			# defaults for uninitialized instances
26			_models = None
27
28			DEFAULT_PARAMETERS = {'min-docs': 10}
29
30			def default_params(self):
31			params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy()
32			params.update(self.DEFAULT_PARAMETERS)
33			return params
34
35			def initialize(self):
36			super().initialize()
37			if self._models is not None:
38			return # already initialized
39			self._models = {}
40			sources = annif.util.parse_sources(self.params['sources'])
41			for source_project_id, _ in sources:
42			model_filename = self.MODEL_FILE_PREFIX + source_project_id
43			path = os.path.join(self.datadir, model_filename)
44			if os.path.exists(path):
45			self.debug('loading PAV model from {}'.format(path))
46			self._models[source_project_id] = joblib.load(path)
47			else:
48			raise NotInitializedException(
49			"PAV model file '{}' not found".format(path),
50			backend_id=self.backend_id)
51
52			def _get_model(self, source_project_id):
53			self.initialize()
54			return self._models[source_project_id]
55
56			def _normalize_hits(self, hits, source_project):
57			reg_models = self._get_model(source_project.project_id)
58			pav_result = []
59			for hit in hits.as_list(source_project.subjects):
60			if hit.uri in reg_models:
61			score = reg_models[hit.uri].predict([hit.score])[0]
62			else: # default to raw score
63			score = hit.score
64			pav_result.append(
65			annif.suggestion.SubjectSuggestion(
66			uri=hit.uri,
67			label=hit.label,
68			notation=hit.notation,
69			score=score))
70			pav_result.sort(key=lambda hit: hit.score, reverse=True)
71			return annif.suggestion.ListSuggestionResult(pav_result)
72
73			@staticmethod
74			def _suggest_train_corpus(source_project, corpus):
75			# lists for constructing score matrix
76			data, row, col = [], [], []
77			# lists for constructing true label matrix
78			trow, tcol = [], []
79
80			ndocs = 0
81			for docid, doc in enumerate(corpus.documents):
82			hits = source_project.suggest(doc.text)
83			vector = hits.as_vector(source_project.subjects)
84			for cid in np.flatnonzero(vector):
85			data.append(vector[cid])
86			row.append(docid)
87			col.append(cid)
88			subjects = annif.corpus.SubjectSet((doc.uris, doc.labels))
89			for cid in np.flatnonzero(
90			subjects.as_vector(source_project.subjects)):
91
92			trow.append(docid)
93			tcol.append(cid)
94			ndocs += 1
95			scores = coo_matrix((data, (row, col)),
96			shape=(ndocs, len(source_project.subjects)),
97			dtype=np.float32)
98			true = coo_matrix((np.ones(len(trow), dtype=np.bool), (trow, tcol)),
99			shape=(ndocs, len(source_project.subjects)),
100			dtype=np.bool)
101			return csc_matrix(scores), csc_matrix(true)
102
103			def _create_pav_model(self, source_project_id, min_docs, corpus):
104			self.info("creating PAV model for source {}, min_docs={}".format(
105			source_project_id, min_docs))
106			source_project = self.project.registry.get_project(source_project_id)
107			# suggest subjects for the training corpus
108			scores, true = self._suggest_train_corpus(source_project, corpus)
109			# create the concept-specific PAV regression models
110			pav_regressions = {}
111			for cid in range(len(source_project.subjects)):
112			if true[:, cid].sum() < min_docs:
113			continue # don't create model b/c of too few examples
114			reg = IsotonicRegression(out_of_bounds='clip')
115			cid_scores = scores[:, cid].toarray().flatten().astype(np.float64)
116			reg.fit(cid_scores, true[:, cid].toarray().flatten())
117			pav_regressions[source_project.subjects[cid][0]] = reg
118			self.info("created PAV model for {} concepts".format(
119			len(pav_regressions)))
120			model_filename = self.MODEL_FILE_PREFIX + source_project_id
121			annif.util.atomic_save(
122			pav_regressions,
123			self.datadir,
124			model_filename,
125			method=joblib.dump)
126
127			def _train(self, corpus, params):
128			if corpus == 'cached':
129			raise NotSupportedException(
130			'Training pav project from cached data not supported.')
131			if corpus.is_empty():
132			raise NotSupportedException('training backend {} with no documents'
133			.format(self.backend_id))
134			self.info("creating PAV models")
135			sources = annif.util.parse_sources(self.params['sources'])
136			min_docs = int(params['min-docs'])
137			for source_project_id, _ in sources:
138			self._create_pav_model(source_project_id, min_docs, corpus)
139

NatLibFi / Annif

Pull Request — master (#447)

annif.backend.pav.PAVBackend.default_params() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like