|
1
|
|
|
"""Annif backend using the Vowpal Wabbit multiclass and multilabel |
|
2
|
|
|
classifiers""" |
|
3
|
|
|
|
|
4
|
|
|
import collections
import json
import os.path
import random

import numpy as np

import annif.corpus
import annif.project
import annif.util
from annif.exception import NotInitializedException
from annif.suggestion import VectorSuggestionResult

from . import ensemble
from . import vw_base
|
15
|
|
|
|
|
16
|
|
|
|
|
17
|
|
|
class VWEnsembleBackend(
        ensemble.EnsembleBackend,
        vw_base.VWBaseBackend):
    """Vowpal Wabbit ensemble backend that combines results from multiple
    projects and learns how well those projects/backends recognize
    particular subjects."""

    # backend identifier used in project configuration
    name = "vw_ensemble"

    # Supported VW hyperparameters: maps parameter name to a
    # (type-or-allowed-values, default) pair; presumably consumed by the
    # VW base backend when building the model — confirm in vw_base.
    VW_PARAMS = {
        'bit_precision': (int, None),
        'learning_rate': (float, None),
        'loss_function': (['squared', 'logistic', 'hinge'], 'squared'),
        'l1': (float, None),
        'l2': (float, None),
        'passes': (int, None)
    }

    # number of training examples per subject, stored as a collections.Counter
    _subject_freq = None

    # filename (under the backend data dir) for the persisted subject
    # frequency Counter, serialized as JSON
    FREQ_FILE = 'subject-freq.json'

    # The discount rate affects how quickly the ensemble starts to trust its
    # own judgement when the amount of training data increases, versus using
    # a simple mean of scores. A higher value will mean that the model
    # adapts quicker (and possibly makes more errors) while a lower value
    # will make it more careful so that it will require more training data.
    DEFAULT_DISCOUNT_RATE = 0.01

    # score threshold for "zero features": scores lower than this will be
    # considered zero and marked with a zero feature given to VW
    ZERO_THRESHOLD = 0.001
|
50
|
|
|
|
|
51
|
|
|
def _load_subject_freq(self): |
|
52
|
|
|
path = os.path.join(self.datadir, self.FREQ_FILE) |
|
53
|
|
|
if not os.path.exists(path): |
|
54
|
|
|
raise NotInitializedException( |
|
55
|
|
|
'frequency file {} not found'.format(path), |
|
56
|
|
|
backend_id=self.backend_id) |
|
57
|
|
|
self.debug('loading concept frequencies from {}'.format(path)) |
|
58
|
|
|
with open(path) as freqf: |
|
59
|
|
|
# The Counter was serialized like a dictionary, need to |
|
60
|
|
|
# convert it back. Keys that became strings need to be turned |
|
61
|
|
|
# back into integers. |
|
62
|
|
|
self._subject_freq = collections.Counter() |
|
63
|
|
|
for cid, freq in json.load(freqf).items(): |
|
64
|
|
|
self._subject_freq[int(cid)] = freq |
|
|
|
|
|
|
65
|
|
|
self.debug('loaded frequencies for {} concepts'.format( |
|
66
|
|
|
len(self._subject_freq))) |
|
67
|
|
|
|
|
68
|
|
|
    def initialize(self):
        """Load the subject frequency data (if not already loaded), then
        let the base class initialize the VW model."""
        if self._subject_freq is None:
            self._load_subject_freq()
        super().initialize()
|
72
|
|
|
|
|
73
|
|
|
def _calculate_scores(self, subj_id, subj_score_vector): |
|
74
|
|
|
ex = self._format_example(subj_id, subj_score_vector) |
|
75
|
|
|
raw_score = subj_score_vector.mean() |
|
76
|
|
|
pred_score = (self._model.predict(ex) + 1.0) / 2.0 |
|
77
|
|
|
return raw_score, pred_score |
|
78
|
|
|
|
|
79
|
|
|
    def _merge_hits_from_sources(self, hits_from_sources, project, params):
        """Combine the suggestion vectors from all source projects into one
        VectorSuggestionResult, blending the plain mean of scores with the
        VW model's prediction for each subject."""
        # rows: one suggestion vector per source project; columns: subjects
        score_vector = np.array([hits.vector
                                 for hits, _ in hits_from_sources])
        # NOTE(review): the ``params`` argument is not used here; the
        # discount rate is read from self.params instead — confirm intended.
        discount_rate = float(self.params.get('discount_rate',
                                              self.DEFAULT_DISCOUNT_RATE))
        result = np.zeros(score_vector.shape[1])
        for subj_id in range(score_vector.shape[1]):
            subj_score_vector = score_vector[:, subj_id]
            # only score subjects that at least one source suggested
            if subj_score_vector.sum() > 0.0:
                raw_score, pred_score = self._calculate_scores(
                    subj_id, subj_score_vector)
                # Weight shifts from the raw mean towards the model
                # prediction as this subject accumulates training examples.
                raw_weight = 1.0 / \
                    ((discount_rate * self._subject_freq[subj_id]) + 1)
                result[subj_id] = (raw_weight * raw_score) + \
                    (1.0 - raw_weight) * pred_score
        return VectorSuggestionResult(result, project.subjects)
|
95
|
|
|
|
|
96
|
|
|
@property |
|
97
|
|
|
def _source_project_ids(self): |
|
98
|
|
|
sources = annif.util.parse_sources(self.params['sources']) |
|
99
|
|
|
return [project_id for project_id, _ in sources] |
|
100
|
|
|
|
|
101
|
|
|
@staticmethod |
|
102
|
|
|
def _format_value(true): |
|
103
|
|
|
if true is None: |
|
104
|
|
|
return '' |
|
105
|
|
|
elif true: |
|
106
|
|
|
return 1 |
|
107
|
|
|
else: |
|
108
|
|
|
return -1 |
|
109
|
|
|
|
|
110
|
|
|
def _format_example(self, subject_id, scores, true=None): |
|
111
|
|
|
features = " ".join(["{}:{:.6f}".format(proj, scores[proj_idx]) |
|
112
|
|
|
for proj_idx, proj |
|
113
|
|
|
in enumerate(self._source_project_ids)]) |
|
114
|
|
|
zero_features = " ".join(["zero^{}".format(proj) |
|
115
|
|
|
for proj_idx, proj |
|
116
|
|
|
in enumerate(self._source_project_ids) |
|
117
|
|
|
if scores[proj_idx] < self.ZERO_THRESHOLD]) |
|
118
|
|
|
return "{} |raw {} {} |{} {} {}".format( |
|
119
|
|
|
self._format_value(true), |
|
120
|
|
|
features, |
|
121
|
|
|
zero_features, |
|
122
|
|
|
subject_id, |
|
123
|
|
|
features, |
|
124
|
|
|
zero_features) |
|
125
|
|
|
|
|
126
|
|
|
def _doc_score_vector(self, doc, source_projects): |
|
127
|
|
|
score_vectors = [] |
|
128
|
|
|
for source_project in source_projects: |
|
129
|
|
|
hits = source_project.suggest(doc.text) |
|
130
|
|
|
score_vectors.append(hits.vector) |
|
131
|
|
|
return np.array(score_vectors) |
|
132
|
|
|
|
|
133
|
|
|
def _doc_to_example(self, doc, project, source_projects): |
|
134
|
|
|
examples = [] |
|
135
|
|
|
subjects = annif.corpus.SubjectSet((doc.uris, doc.labels)) |
|
136
|
|
|
true = subjects.as_vector(project.subjects) |
|
137
|
|
|
score_vector = self._doc_score_vector(doc, source_projects) |
|
138
|
|
|
for subj_id in range(len(true)): |
|
139
|
|
|
if true[subj_id] \ |
|
140
|
|
|
or score_vector[:, subj_id].sum() >= self.ZERO_THRESHOLD: |
|
141
|
|
|
ex = (subj_id, self._format_example( |
|
142
|
|
|
subj_id, |
|
143
|
|
|
score_vector[:, subj_id], |
|
144
|
|
|
true[subj_id])) |
|
145
|
|
|
examples.append(ex) |
|
146
|
|
|
return examples |
|
147
|
|
|
|
|
148
|
|
|
def _create_examples(self, corpus, project): |
|
149
|
|
|
source_projects = [annif.project.get_project(project_id) |
|
150
|
|
|
for project_id in self._source_project_ids] |
|
151
|
|
|
examples = [] |
|
152
|
|
|
for doc in corpus.documents: |
|
153
|
|
|
examples += self._doc_to_example(doc, project, source_projects) |
|
154
|
|
|
random.shuffle(examples) |
|
155
|
|
|
return examples |
|
156
|
|
|
|
|
157
|
|
|
    def _create_model(self, project):
        """Create the VW model via the base class, passing {'q': 'rr'} to
        # add interactions between raw (descriptor-invariant) features to
        the mix."""
        # add interactions between raw (descriptor-invariant) features to
        # the mix
        super()._create_model(project, {'q': 'rr'})
|
161
|
|
|
|
|
162
|
|
|
@staticmethod |
|
163
|
|
|
def _write_freq_file(subject_freq, filename): |
|
164
|
|
|
with open(filename, 'w') as freqfile: |
|
165
|
|
|
json.dump(subject_freq, freqfile) |
|
166
|
|
|
|
|
167
|
|
|
def _create_train_file(self, corpus, project): |
|
168
|
|
|
self.info('creating VW train file') |
|
169
|
|
|
exampledata = self._create_examples(corpus, project) |
|
170
|
|
|
|
|
171
|
|
|
subjects = [subj_id for subj_id, ex in exampledata] |
|
172
|
|
|
self._subject_freq = collections.Counter(subjects) |
|
173
|
|
|
annif.util.atomic_save(self._subject_freq, |
|
174
|
|
|
self.datadir, |
|
175
|
|
|
self.FREQ_FILE, |
|
176
|
|
|
method=self._write_freq_file) |
|
177
|
|
|
|
|
178
|
|
|
examples = [ex for subj_id, ex in exampledata] |
|
179
|
|
|
annif.util.atomic_save(examples, |
|
180
|
|
|
self.datadir, |
|
181
|
|
|
self.TRAIN_FILE, |
|
182
|
|
|
method=self._write_train_file) |
|
183
|
|
|
|
|
184
|
|
|
def learn(self, corpus, project): |
|
185
|
|
|
self.initialize() |
|
186
|
|
|
exampledata = self._create_examples(corpus, project) |
|
187
|
|
|
for subj_id, example in exampledata: |
|
188
|
|
|
self._model.learn(example) |
|
189
|
|
|
self._subject_freq[subj_id] += 1 |
|
190
|
|
|
modelpath = os.path.join(self.datadir, self.MODEL_FILE) |
|
191
|
|
|
self._model.save(modelpath) |
|
192
|
|
|
annif.util.atomic_save(self._subject_freq, |
|
193
|
|
|
self.datadir, |
|
194
|
|
|
self.FREQ_FILE, |
|
195
|
|
|
method=self._write_freq_file) |
|
196
|
|
|
|