"""Annif backend mixins that can be used to implement features"""

import abc
import os.path

import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

import annif.util
from annif.exception import NotInitializedException
from annif.suggestion import ListSuggestionResult


class ChunkingBackend(metaclass=abc.ABCMeta):
    """Annif backend mixin that implements chunking of input"""

    DEFAULT_PARAMS = {'chunksize': 1}

    def default_params(self):
        return self.DEFAULT_PARAMS

    @abc.abstractmethod
    def _suggest_chunks(self, chunktexts):
        """Suggest subjects for the chunked text; should be implemented by
        the subclass inheriting this mixin"""

        pass  # pragma: no cover

    def _suggest(self, text, params):
        self.debug('Suggesting subjects for text "{}..." (len={})'.format(
            text[:20], len(text)))
        sentences = self.project.analyzer.tokenize_sentences(text)
        self.debug('Found {} sentences'.format(len(sentences)))
        chunksize = int(params['chunksize'])
        chunktexts = []
        for i in range(0, len(sentences), chunksize):
            chunktexts.append(' '.join(sentences[i:i + chunksize]))
        self.debug('Split sentences into {} chunks'.format(len(chunktexts)))
        if len(chunktexts) == 0:  # no input, empty result
            return ListSuggestionResult(
                hits=[], subject_index=self.project.subjects)
        return self._suggest_chunks(chunktexts)
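
# Usage sketch (illustrative only, not part of Annif): a concrete backend mixes
# ChunkingBackend into its class hierarchy and implements _suggest_chunks();
# the inherited _suggest() then handles splitting the input into sentence
# chunks. "ExampleBackend", _score_chunk() and _merge_hits() below are invented
# for this sketch; only ChunkingBackend, ListSuggestionResult and the
# AnnifBackend base class are real names.
#
#   class ExampleBackend(ChunkingBackend, AnnifBackend):
#       name = "example"
#
#       def _suggest_chunks(self, chunktexts):
#           # Score each chunk with the backend's own model and merge the
#           # per-chunk scores (e.g. by averaging) into a single result.
#           chunk_hits = [self._score_chunk(chunktext)
#                         for chunktext in chunktexts]
#           return ListSuggestionResult(
#               hits=self._merge_hits(chunk_hits),
#               subject_index=self.project.subjects)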


class TfidfVectorizerMixin:
    """Annif backend mixin that implements TfidfVectorizer functionality"""

    VECTORIZER_FILE = 'vectorizer'

    vectorizer = None

    def initialize_vectorizer(self):
        if self.vectorizer is None:
            path = os.path.join(self.datadir, self.VECTORIZER_FILE)
            if os.path.exists(path):
                self.debug('loading vectorizer from {}'.format(path))
                self.vectorizer = joblib.load(path)
            else:
                raise NotInitializedException(
                    "vectorizer file '{}' not found".format(path),
                    backend_id=self.backend_id)

    def create_vectorizer(self, input, params={}):
        self.info('creating vectorizer')
        self.vectorizer = TfidfVectorizer(**params)
        veccorpus = self.vectorizer.fit_transform(input)
        annif.util.atomic_save(
            self.vectorizer,
            self.datadir,
            self.VECTORIZER_FILE,
            method=joblib.dump)
        return veccorpus
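
# Usage sketch (illustrative only, not part of Annif): a backend that mixes in
# TfidfVectorizerMixin typically builds the vectorizer once at training time
# and reloads it lazily before suggesting. _generate_texts(), _build_index()
# and _search_index() below are hypothetical helpers invented for this sketch;
# only create_vectorizer(), initialize_vectorizer() and self.vectorizer come
# from the mixin above.
#
#   class ExampleTfidfBackend(TfidfVectorizerMixin, AnnifBackend):
#       name = "tfidf-example"
#
#       def _train(self, corpus, params):
#           texts = self._generate_texts(corpus)        # one string per doc
#           veccorpus = self.create_vectorizer(texts)   # fits and saves
#           self._build_index(veccorpus)                # backend-specific
#
#       def _suggest(self, text, params):
#           self.initialize_vectorizer()
#           vector = self.vectorizer.transform([text])
#           return self._search_index(vector)           # backend-specific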