"""Backend that returns most similar subjects based on similarity in sparse
TF-IDF normalized bag-of-words vector space"""

import collections
import glob
import os
import os.path
import tempfile
import gensim.corpora
import gensim.models
import gensim.similarities
import annif.analyzer
import annif.corpus
from annif.hit import AnalysisHit
from . import backend
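
# A minimal sketch of the gensim machinery this backend builds on, for
# orientation only (`texts` is a hypothetical iterable of token lists and
# `query_tokens` a hypothetical token list; neither appears below):
#
#     dictionary = gensim.corpora.Dictionary(texts)
#     bows = [dictionary.doc2bow(tokens) for tokens in texts]
#     tfidf = gensim.models.TfidfModel(bows)
#     index = gensim.similarities.SparseMatrixSimilarity(
#         tfidf[bows], num_features=len(dictionary))
#     scores = index[tfidf[dictionary.doc2bow(query_tokens)]]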


class VectorCorpus:
    """A class that wraps a subject corpus so it can be iterated as lists of
    vectors, by using a dictionary to map words to integers."""

    def __init__(self, corpus, dictionary, analyzer):
        self.corpus = corpus
        self.dictionary = dictionary
        self.analyzer = analyzer

    def __iter__(self):
        """Iterate through the subject corpus, yielding vectors that are
        derived from subjects using the given analyzer and dictionary."""

        for subject in self.corpus:
            yield self.dictionary.doc2bow(
                self.analyzer.tokenize_words(subject.text))
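
# Iterating a VectorCorpus yields one sparse bag-of-words vector per subject,
# as (token_id, count) pairs, e.g. [(0, 2), (5, 1)] for a text containing
# token 0 twice and token 5 once (illustrative values only).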


class SubjectIndex:
    """A class that remembers the associations between integer subject IDs
    and their URIs and labels."""

    def __init__(self, corpus):
        """Initialize the subject index from a subject corpus."""
        self._uris = []
        self._labels = []
        for subject_id, subject in enumerate(corpus):
            self._uris.append(subject.uri)
            self._labels.append(subject.label)

    def __len__(self):
        return len(self._uris)

    def __getitem__(self, subject_id):
        return (self._uris[subject_id], self._labels[subject_id])

    def save(self, path):
        """Save this subject index into a file."""

        with open(path, 'w') as subjfile:
            for subject_id in range(len(self)):
                line = "<{}>\t{}".format(
                    self._uris[subject_id], self._labels[subject_id])
                print(line, file=subjfile)

    @classmethod
    def load(cls, path):
        """Load a subject index from a file and return it."""

        def file_as_corpus(path):
            with open(path) as subjfile:
                for line in subjfile:
                    uri, label = line.strip().split(None, 1)
                    uri = uri[1:-1]
                    yield annif.corpus.Subject(uri, label, None)

        return cls(file_as_corpus(path))
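
# The file written by SubjectIndex.save() (and parsed back by load()) holds
# one subject per line: the URI in angle brackets, a tab, then the label,
# e.g. (illustrative values):
#
#     <http://example.org/subjects/s0001>	Archaeology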


class TFIDFBackend(backend.AnnifBackend):
    """TF-IDF similarity based backend for Annif."""

    name = "tfidf"

    # top K subjects per chunk to consider
    MAX_CHUNK_SUBJECTS = 100

    # defaults for uninitialized instances
    _subjects = None
    _analyzer = None
    _dictionary = None
    _tfidf = None
    _index = None

    def _atomic_save(self, obj, dirname, filename):
        # Save into a temporary file first and then rename into place, so the
        # final file appears atomically. The underlying save() may create
        # extra files sharing the temporary name as a prefix (as gensim does
        # for large models), so rename every matching file.
        tempfd, tempfilename = tempfile.mkstemp(prefix=filename, dir=dirname)
        os.close(tempfd)
        self.debug('saving {} to temporary file {}'.format(obj, tempfilename))
        obj.save(tempfilename)
        for fn in glob.glob(tempfilename + '*'):
            newname = fn.replace(
                tempfilename, os.path.join(dirname, filename))
            self.debug('renaming temporary file {} to {}'.format(fn, newname))
            os.rename(fn, newname)

    def _initialize_subjects(self):
        if self._subjects is None:
            path = os.path.join(self._get_datadir(), 'subjects')
            self.debug('loading subjects from {}'.format(path))
            self._subjects = SubjectIndex.load(path)

    def _initialize_analyzer(self):
        if self._analyzer is None:
            self._analyzer = annif.analyzer.get_analyzer(
                self.params['analyzer'])

    def _initialize_dictionary(self):
        if self._dictionary is None:
            path = os.path.join(self._get_datadir(), 'dictionary')
            self.debug('loading dictionary from {}'.format(path))
            self._dictionary = gensim.corpora.Dictionary.load(path)

    def _initialize_tfidf(self):
        if self._tfidf is None:
            path = os.path.join(self._get_datadir(), 'tfidf')
            self.debug('loading TF-IDF model from {}'.format(path))
            self._tfidf = gensim.models.TfidfModel.load(path)

    def _initialize_index(self):
        if self._index is None:
            path = os.path.join(self._get_datadir(), 'index')
            self.debug('loading similarity index from {}'.format(path))
            self._index = gensim.similarities.SparseMatrixSimilarity.load(
                path)

    def initialize(self):
        """Load all model components from the data directory, unless they are
        already in memory."""
        self._initialize_subjects()
        self._initialize_analyzer()
        self._initialize_dictionary()
        self._initialize_tfidf()
        self._initialize_index()

    def load_subjects(self, subjects):
        """Build the subject index, dictionary, TF-IDF model and similarity
        index from the given subject corpus and save them to the data
        directory."""
        self.info(
            'Backend {}: creating subject index'.format(self.backend_id))
        self._subjects = SubjectIndex(subjects)
        self._atomic_save(self._subjects, self._get_datadir(), 'subjects')
        self._initialize_analyzer()
        self.info('creating dictionary')
        self._dictionary = gensim.corpora.Dictionary(
            (self._analyzer.tokenize_words(subject.text)
             for subject in subjects))
        self._atomic_save(self._dictionary, self._get_datadir(), 'dictionary')
        veccorpus = VectorCorpus(subjects, self._dictionary, self._analyzer)
        self.info('creating TF-IDF model')
        self._tfidf = gensim.models.TfidfModel(veccorpus)
        self._atomic_save(self._tfidf, self._get_datadir(), 'tfidf')
        self.info('creating similarity index')
        self._index = gensim.similarities.SparseMatrixSimilarity(
            self._tfidf[veccorpus], num_features=len(self._dictionary))
        self._atomic_save(self._index, self._get_datadir(), 'index')

    def _analyze_chunks(self, chunks):
        """Query the similarity index with each chunk vector, returning the
        MAX_CHUNK_SUBJECTS best (subject_id, score) pairs per chunk."""
        results = []
        for docsim in self._index[chunks]:
            sims = sorted(
                enumerate(docsim),
                key=lambda item: item[1],
                reverse=True)
            results.append(sims[:self.MAX_CHUNK_SUBJECTS])
        return results
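
    # Querying the gensim index with a list of chunk vectors returns one row
    # of similarity scores per chunk, one score per indexed subject; roughly
    # (illustrative numbers):
    #
    #     self._index[chunks]  ->  [[0.00, 0.12, 0.87, ...],   # chunk 0
    #                               [0.05, 0.00, 0.43, ...]]   # chunk 1
    #
    # Each row is then truncated to the MAX_CHUNK_SUBJECTS best subjects.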

    def _merge_chunk_results(self, chunk_results):
        """Sum the per-chunk scores of each subject and return the best
        subjects as AnalysisHit objects, with scores normalized by the number
        of chunks."""
        subject_scores = collections.defaultdict(float)
        for result in chunk_results:
            for subject_id, score in result:
                subject_scores[subject_id] += score
        best_subjects = sorted(
            [(score, subject_id)
             for subject_id, score in subject_scores.items()],
            reverse=True)
        limit = int(self.params['limit'])
        results = []
        for score, subject_id in best_subjects[:limit]:
            if score <= 0.0:
                continue
            subject = self._subjects[subject_id]
            results.append(AnalysisHit(
                subject[0], subject[1], score / len(chunk_results)))
        return results
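
    # Worked example of the merging rule (illustrative numbers): if subject 7
    # scores 0.6 in one chunk and 0.2 in another out of two chunks total, its
    # reported score is (0.6 + 0.2) / 2 = 0.4: per-chunk scores are summed
    # and then normalized by the number of chunks.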

    def _analyze(self, text, params):
        """Analyze the given text and return a list of AnalysisHit objects,
        ranked by TF-IDF similarity to the indexed subjects."""
        self.initialize()
        self.debug('Analyzing text "{}..." (len={})'.format(
            text[:20], len(text)))
        sentences = self._analyzer.tokenize_sentences(text)
        self.debug('Found {} sentences'.format(len(sentences)))
        chunksize = int(params['chunksize'])
        chunks = []  # chunks represented as TF-IDF normalized vectors
        for i in range(0, len(sentences), chunksize):
            chunktext = ' '.join(sentences[i:i + chunksize])
            chunkbow = self._dictionary.doc2bow(
                self._analyzer.tokenize_words(chunktext))
            chunks.append(self._tfidf[chunkbow])
        self.debug('Split sentences into {} chunks'.format(len(chunks)))
        chunk_results = self._analyze_chunks(chunks)
        return self._merge_chunk_results(chunk_results)
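
# End-to-end sketch of how the pieces above fit together (hypothetical usage;
# constructing and configuring a backend instance, and the public analysis
# entry point, are handled by the AnnifBackend base class and the surrounding
# application, not shown here):
#
#     backend.load_subjects(subject_corpus)  # builds and saves all models
#     hits = backend._analyze(text, params)  # params supply 'chunksize' etc.
#
# Each AnalysisHit returned carries a subject URI, its label and a score.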