annif.backend.vw_multi.VWMultiBackend.initialize() - Code Metrics - Inspection of "Merge pull request #249 from NatLibFi/vw-backend" - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( d8a4d2...b3163f )

by Osma

created 2019-01-29 14:21 UTC

annif.backend.vw_multi.VWMultiBackend.initialize() A

↳ Parent: annif.backend.vw_multi

Complexity

Conditions

Size

Total Lines	15
Code Lines	14

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
eloc	14
dl	0
loc	15
rs	9.7
c	0
b	0
f	0
cc	3
nop	1

"""Annif backend using the Vowpal Wabbit multiclass and multilabel
classifiers"""

import random
import os.path
import annif.util
from vowpalwabbit import pyvw
import numpy as np
from annif.hit import AnalysisHit, VectorAnalysisResult
from annif.exception import ConfigurationException, NotInitializedException
from . import backend
from . import mixins


class VWMultiBackend(mixins.ChunkingBackend, backend.AnnifBackend):
    """Vowpal Wabbit multiclass/multilabel backend for Annif.

    Training writes one VW example per (document, subject) pair to
    TRAIN_FILE, fits a one-against-all model on it and saves the model to
    MODEL_FILE; both files live in the backend's data directory. Analysis
    predicts on each text chunk and averages the per-chunk predictions.
    """

    name = "vw_multi"
    needs_subject_index = True

    VW_PARAMS = {
        # each param specifier is a pair (allowed_values, default_value)
        # where allowed_values is either a type or a list of allowed values
        # and default_value may be None, to let VW decide by itself
        'bit_precision': (int, None),
        'learning_rate': (float, None),
        'loss_function': (['squared', 'logistic', 'hinge'], 'logistic'),
        'l1': (float, None),
        'l2': (float, None),
        'passes': (int, None)
    }

    # file names, relative to the backend's data directory
    MODEL_FILE = 'vw-model'
    TRAIN_FILE = 'vw-train.txt'

    # defaults for uninitialized instances
    _model = None

    def initialize(self):
        """Load the saved VW model from the data directory.

        Does nothing if a model is already held by this instance.

        Raises:
            NotInitializedException: if the model file does not exist.
        """
        if self._model is not None:
            return
        path = os.path.join(self._get_datadir(), self.MODEL_FILE)
        self.debug('loading VW model from {}'.format(path))
        if not os.path.exists(path):
            raise NotInitializedException(
                'model {} not found'.format(path),
                backend_id=self.backend_id)
        self._model = pyvw.vw(
            i=path,
            quiet=True,
            loss_function='logistic',
            probabilities=True)
        self.debug('loaded model {}'.format(str(self._model)))

    @classmethod
    def _normalize_text(cls, project, text):
        """Return text as space-joined tokens from the project's analyzer,
        with VW's special characters removed."""
        ntext = ' '.join(project.analyzer.tokenize_words(text))
        # colon and pipe chars have special meaning in VW and must be avoided
        return ntext.replace(':', '').replace('|', '')

    def _write_train_file(self, examples, filename):
        """Write the example strings to filename, one per line."""
        # Explicit UTF-8: the platform default encoding depends on the
        # locale and may fail on, or mangle, non-ASCII corpus text.
        with open(filename, 'w', encoding='utf-8') as trainfile:
            for ex in examples:
                print(ex, file=trainfile)

    def _create_train_file(self, corpus, project):
        """Build the VW training file from the documents of corpus.

        One example is produced per (document, subject URI) pair; URIs
        that are not found in the project's subject index are skipped.
        The examples are shuffled before being saved as TRAIN_FILE in
        the data directory.
        """
        self.info('creating VW train file')
        examples = []
        for doc in corpus.documents:
            text = self._normalize_text(project, doc.text)
            for uri in doc.uris:
                subject_id = project.subjects.by_uri(uri)
                if subject_id is None:
                    continue
                # VW one-against-all class labels start from 1, hence the
                # offset from the subject index value
                exstr = '{} | {}'.format(subject_id + 1, text)
                examples.append(exstr)
        random.shuffle(examples)
        annif.util.atomic_save(examples,
                               self._get_datadir(),
                               self.TRAIN_FILE,
                               method=self._write_train_file)

    def _convert_param(self, param, val):
        """Validate and convert a configured value for a VW parameter.

        For parameters whose specifier in VW_PARAMS is a list, val must
        be one of the listed values and is returned unchanged; otherwise
        the specifier is called as a converter on val.

        Raises:
            ConfigurationException: if val is not among the allowed
                values, or the conversion raises ValueError.
        """
        pspec, _ = self.VW_PARAMS[param]
        if isinstance(pspec, list):
            if val in pspec:
                return val
            raise ConfigurationException(
                "{} is not a valid value for {} (allowed: {})".format(
                    val, param, ', '.join(pspec)), backend_id=self.backend_id)
        try:
            return pspec(val)
        except ValueError as err:
            # chain explicitly so the conversion error is reported as the
            # direct cause of the configuration error
            raise ConfigurationException(
                "The {} value {} cannot be converted to {}".format(
                    param, val, pspec), backend_id=self.backend_id) from err

    def _create_model(self, project):
        """Train a VW one-against-all model on TRAIN_FILE and save it as
        MODEL_FILE in the data directory.

        Model parameters are the non-None defaults from VW_PARAMS,
        overridden by any matching entries of self.params after
        validation by _convert_param.
        """
        self.info('creating VW model')
        trainpath = os.path.join(self._get_datadir(), self.TRAIN_FILE)
        params = {param: defaultval
                  for param, (_, defaultval) in self.VW_PARAMS.items()
                  if defaultval is not None}
        params.update({param: self._convert_param(param, val)
                       for param, val in self.params.items()
                       if param in self.VW_PARAMS})
        self.debug("model parameters: {}".format(params))
        self._model = pyvw.vw(
            oaa=len(project.subjects),
            probabilities=True,
            data=trainpath,
            **params)
        modelpath = os.path.join(self._get_datadir(), self.MODEL_FILE)
        self._model.save(modelpath)

    def train(self, corpus, project):
        """Create the training file from corpus, then build and save the
        model."""
        self._create_train_file(corpus, project)
        self._create_model(project)

    def _analyze_chunks(self, chunktexts, project):
        """Predict on each chunk and return the element-wise mean of the
        per-chunk predictions as a VectorAnalysisResult.

        NOTE(review): chunktexts is assumed to be non-empty; the mean of
        an empty array is NaN. Confirm the caller guarantees this.
        """
        results = []
        for chunktext in chunktexts:
            # no label before the pipe: this example is for prediction only
            example = ' | {}'.format(chunktext)
            results.append(np.array(self._model.predict(example)))
        return VectorAnalysisResult(
            np.array(results).mean(axis=0), project.subjects)


1			"""Annif backend using the Vowpal Wabbit multiclass and multilabel
2			classifiers"""
3
4			import random
5			import os.path
6			import annif.util
7			from vowpalwabbit import pyvw
8			import numpy as np
9			from annif.hit import AnalysisHit, VectorAnalysisResult
10			from annif.exception import ConfigurationException, NotInitializedException
11			from . import backend
12			from . import mixins
13
14
15			class VWMultiBackend(mixins.ChunkingBackend, backend.AnnifBackend):
16			"""Vowpal Wabbit multiclass/multilabel backend for Annif"""
17
18			name = "vw_multi"
19			needs_subject_index = True
20
21			VW_PARAMS = {
22			# each param specifier is a pair (allowed_values, default_value)
23			# where allowed_values is either a type or a list of allowed values
24			# and default_value may be None, to let VW decide by itself
25			'bit_precision': (int, None),
26			'learning_rate': (float, None),
27			'loss_function': (['squared', 'logistic', 'hinge'], 'logistic'),
28			'l1': (float, None),
29			'l2': (float, None),
30			'passes': (int, None)
31			}
32
33			MODEL_FILE = 'vw-model'
34			TRAIN_FILE = 'vw-train.txt'
35
36			# defaults for uninitialized instances
37			_model = None
38
39			def initialize(self):
40			if self._model is None:
41			path = os.path.join(self._get_datadir(), self.MODEL_FILE)
42			self.debug('loading VW model from {}'.format(path))
43			if os.path.exists(path):
44			self._model = pyvw.vw(
45			i=path,
46			quiet=True,
47			loss_function='logistic',
48			probabilities=True)
49			self.debug('loaded model {}'.format(str(self._model)))
50			else:
51			raise NotInitializedException(
52			'model {} not found'.format(path),
53			backend_id=self.backend_id)
54
55			@classmethod
56			def _normalize_text(cls, project, text):
57			ntext = ' '.join(project.analyzer.tokenize_words(text))
58			# colon and pipe chars have special meaning in VW and must be avoided
59			return ntext.replace(':', '').replace('|', '')
60
61			def _write_train_file(self, examples, filename):
62			with open(filename, 'w') as trainfile:
63			for ex in examples:
64			print(ex, file=trainfile)
65
66			def _create_train_file(self, corpus, project):
67			self.info('creating VW train file')
68			examples = []
69			for doc in corpus.documents:
70			text = self._normalize_text(project, doc.text)
71			for uri in doc.uris:
72			subject_id = project.subjects.by_uri(uri)
73			if subject_id is None:
74			continue
75			exstr = '{} | {}'.format(subject_id + 1, text)
76			examples.append(exstr)
77			random.shuffle(examples)
78			annif.util.atomic_save(examples,
79			self._get_datadir(),
80			self.TRAIN_FILE,
81			method=self._write_train_file)
82
83			def _convert_param(self, param, val):
84			pspec, _ = self.VW_PARAMS[param]
85			if isinstance(pspec, list):
86			if val in pspec:
87			return val
88			raise ConfigurationException(
89			"{} is not a valid value for {} (allowed: {})".format(
90			val, param, ', '.join(pspec)), backend_id=self.backend_id)
91			try:
92			return pspec(val)
93			except ValueError:
94			raise ConfigurationException(
95			"The {} value {} cannot be converted to {}".format(
96			param, val, pspec), backend_id=self.backend_id)
97
98			def _create_model(self, project):
99			self.info('creating VW model')
100			trainpath = os.path.join(self._get_datadir(), self.TRAIN_FILE)
101			params = {param: defaultval
102			for param, (_, defaultval) in self.VW_PARAMS.items()
103			if defaultval is not None}
104			params.update({param: self._convert_param(param, val)
105			for param, val in self.params.items()
106			if param in self.VW_PARAMS})
107			self.debug("model parameters: {}".format(params))
108			self._model = pyvw.vw(
109			oaa=len(project.subjects),
110			probabilities=True,
111			data=trainpath,
112			**params)
113			modelpath = os.path.join(self._get_datadir(), self.MODEL_FILE)
114			self._model.save(modelpath)
115
116			def train(self, corpus, project):
117			self._create_train_file(corpus, project)
118			self._create_model(project)
119
120			def _analyze_chunks(self, chunktexts, project):
121			results = []
122			for chunktext in chunktexts:
123			example = ' | {}'.format(chunktext)
124			results.append(np.array(self._model.predict(example)))
125			return VectorAnalysisResult(
126			np.array(results).mean(axis=0), project.subjects)
127

NatLibFi / Annif

Push — master ( d8a4d2...b3163f )

annif.backend.vw_multi.VWMultiBackend.initialize() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like