Completed
Branch master (db5e7a), by Osma
created; 09:07 (queued 05:09)

TFIDFBackend (rating: A)

Complexity

Total Complexity 26

Size/Duplication

Total Lines 127
Duplicated Lines 0 %

Importance

Changes 6
Bugs 0 Features 0
Metric                               Value
dl   (duplicated lines)              0
loc  (lines of code)                 127
rs   (rating score)                  10
c    (changes)                       6
b    (bugs)                          0
f    (features)                      0
wmc  (weighted method complexity)    26

11 Methods

Rating   Name                       Duplication   Size   Complexity
A        _initialize_analyzer()     0             4      2
A        _initialize_tfidf()        0             5      2
A        _initialize_dictionary()   0             5      2
A        load_subjects()            0             18     2
A        _initialize_subjects()     0             5      2
A        _analyze_chunks()          0             9      3
A        _initialize_index()        0             5      2
A        _atomic_save()             0             9      2
A        initialize()               0             6      1
B        _merge_chunk_results()     0             22     6
A        _analyze()                 0             16     2

(The per-method complexities sum to the class total, wmc: eight methods of complexity 2, plus 3 + 1 + 6, gives 26.)
"""Backend that returns most similar subjects based on similarity in sparse
TF-IDF normalized bag-of-words vector space"""

import collections
import glob
import os
import os.path
import tempfile
import gensim.corpora
import gensim.models
import gensim.similarities
import annif.analyzer
import annif.corpus
from annif.hit import AnalysisHit
from . import backend

class VectorCorpus:
[1 ignored issue, Unused Code: The variable __class__ seems to be unused.]
    """A class that wraps a subject corpus so it can be iterated as lists of
    vectors, by using a dictionary to map words to integers."""

    def __init__(self, corpus, dictionary, analyzer):
        self.corpus = corpus
        self.dictionary = dictionary
        self.analyzer = analyzer

    def __iter__(self):
        """Iterate through the subject corpus, yielding vectors that are
        derived from subjects using the given analyzer and dictionary."""

        for subject in self.corpus:
            yield self.dictionary.doc2bow(
                self.analyzer.tokenize_words(subject.text))

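For orientation, a minimal sketch (editor's illustration, not part of the file under review) of wiring a VectorCorpus up with a gensim dictionary. The Subject tuple and SimpleAnalyzer below are hypothetical stand-ins for annif.corpus.Subject and an Annif analyzer:

    import collections
    import gensim.corpora

    Subject = collections.namedtuple('Subject', 'uri label text')  # hypothetical stand-in

    class SimpleAnalyzer:
        """Hypothetical analyzer exposing the tokenize_words() interface."""
        def tokenize_words(self, text):
            return text.lower().split()

    subjects = [
        Subject('http://example.org/s1', 'cats', 'cats are small felines'),
        Subject('http://example.org/s2', 'dogs', 'dogs are loyal companions'),
    ]
    analyzer = SimpleAnalyzer()
    # Build the word-to-integer mapping first, then wrap the corpus so that
    # iterating it yields sparse bag-of-words vectors.
    dictionary = gensim.corpora.Dictionary(
        analyzer.tokenize_words(subject.text) for subject in subjects)
    for vector in VectorCorpus(subjects, dictionary, analyzer):
        print(vector)  # sparse (token_id, count) pairs, e.g. [(0, 1), (1, 1), ...]
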
class SubjectIndex:
[1 ignored issue, Unused Code: The variable __class__ seems to be unused.]
    """A class that remembers the associations between integer subject IDs
    and their URIs and labels."""

    def __init__(self, corpus):
        """Initialize the subject index from a subject corpus."""
        self._uris = []
        self._labels = []
        for subject_id, subject in enumerate(corpus):
[0 ignored issues, Unused Code: The variable subject_id seems to be unused.]
            self._uris.append(subject.uri)
            self._labels.append(subject.label)

    def __len__(self):
        return len(self._uris)

    def __getitem__(self, subject_id):
        return (self._uris[subject_id], self._labels[subject_id])

    def save(self, path):
        """Save this subject index into a file."""

        with open(path, 'w') as subjfile:
            for subject_id in range(len(self)):
                line = "<{}>\t{}".format(
                    self._uris[subject_id], self._labels[subject_id])
                print(line, file=subjfile)

    @classmethod
    def load(cls, path):
        """Load a subject index from a file and return it."""

        def file_as_corpus(path):
[0 ignored issues, Coding Style: This function should have a docstring; the project's coding style requires docstrings on all code elements (see PEP-257).]
            with open(path) as subjfile:
                for line in subjfile:
                    uri, label = line.strip().split(None, 1)
                    uri = uri[1:-1]
                    yield annif.corpus.Subject(uri, label, None)

        return cls(file_as_corpus(path))

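A quick round-trip sketch (editor's illustration, reusing the hypothetical subjects list above; it assumes annif.corpus is importable, since load() yields annif.corpus.Subject). save() writes one "<uri>" plus tab plus label line per subject, and load() parses them back, with uri[1:-1] stripping the angle brackets:

    index = SubjectIndex(subjects)
    index.save('/tmp/subjects')  # lines like: <http://example.org/s1><TAB>cats
    restored = SubjectIndex.load('/tmp/subjects')
    assert len(restored) == 2
    assert restored[0] == ('http://example.org/s1', 'cats')
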
class TFIDFBackend(backend.AnnifBackend):
[1 ignored issue, Coding Style: This class should have a docstring (see PEP-257). Unused Code: The variable __class__ seems to be unused.]
    name = "tfidf"

    # top K subjects per chunk to consider
    MAX_CHUNK_SUBJECTS = 100

    # defaults for uninitialized instances
    _subjects = None
    _analyzer = None
    _dictionary = None
    _tfidf = None
    _index = None

    def _atomic_save(self, obj, dirname, filename):
        tempfd, tempfilename = tempfile.mkstemp(prefix=filename, dir=dirname)
        os.close(tempfd)
        self.debug('saving {} to temporary file {}'.format(obj, tempfilename))
        obj.save(tempfilename)
        for fn in glob.glob(tempfilename + '*'):
[0 ignored issues, Coding Style Naming: The name fn does not conform to the variable naming conventions ((([a-z][a-z0-9_]{2,30})|(_[a-z0-9_]*))$).]
            newname = fn.replace(tempfilename, os.path.join(dirname, filename))
            self.debug('renaming temporary file {} to {}'.format(fn, newname))
            os.rename(fn, newname)

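Why the glob: gensim's save() can write companion files next to the main file (large arrays are typically stored separately), so every file sharing the temporary prefix must be renamed. The underlying write-to-temp-then-rename idiom, as a minimal standalone sketch (editor's illustration):

    import os
    import tempfile

    def atomic_write_text(path, text):
        """Write text to path so readers never observe a partial file."""
        dirname = os.path.dirname(path) or '.'
        tempfd, temppath = tempfile.mkstemp(dir=dirname)
        with os.fdopen(tempfd, 'w') as tmpfile:
            tmpfile.write(text)
        os.rename(temppath, path)  # rename is atomic within one POSIX filesystem

    atomic_write_text('/tmp/demo.txt', 'hello')
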
    def _initialize_subjects(self):
        if self._subjects is None:
            path = os.path.join(self._get_datadir(), 'subjects')
            self.debug('loading subjects from {}'.format(path))
            self._subjects = SubjectIndex.load(path)

    def _initialize_analyzer(self):
        if self._analyzer is None:
            self._analyzer = annif.analyzer.get_analyzer(
                self.params['analyzer'])

    def _initialize_dictionary(self):
        if self._dictionary is None:
            path = os.path.join(self._get_datadir(), 'dictionary')
            self.debug('loading dictionary from {}'.format(path))
            self._dictionary = gensim.corpora.Dictionary.load(path)

    def _initialize_tfidf(self):
        if self._tfidf is None:
            path = os.path.join(self._get_datadir(), 'tfidf')
            self.debug('loading TF-IDF model from {}'.format(path))
            self._tfidf = gensim.models.TfidfModel.load(path)

    def _initialize_index(self):
        if self._index is None:
            path = os.path.join(self._get_datadir(), 'index')
            self.debug('loading similarity index from {}'.format(path))
            self._index = gensim.similarities.SparseMatrixSimilarity.load(path)

    def initialize(self):
[0 ignored issues, Coding Style: This method should have a docstring (see PEP-257).]
        self._initialize_subjects()
        self._initialize_analyzer()
        self._initialize_dictionary()
        self._initialize_tfidf()
        self._initialize_index()

    def load_subjects(self, subjects):
[0 ignored issues, Bug: Parameters differ from the overridden 'load_subjects' method.]
        self.info('Backend {}: creating subject index'.format(self.backend_id))
        self._subjects = SubjectIndex(subjects)
        self._atomic_save(self._subjects, self._get_datadir(), 'subjects')
        self._initialize_analyzer()
        self.info('creating dictionary')
        self._dictionary = gensim.corpora.Dictionary(
            (self._analyzer.tokenize_words(subject.text)
             for subject in subjects))
        self._atomic_save(self._dictionary, self._get_datadir(), 'dictionary')
        veccorpus = VectorCorpus(subjects, self._dictionary, self._analyzer)
        self.info('creating TF-IDF model')
        self._tfidf = gensim.models.TfidfModel(veccorpus)
        self._atomic_save(self._tfidf, self._get_datadir(), 'tfidf')
        self.info('creating similarity index')
        self._index = gensim.similarities.SparseMatrixSimilarity(
            self._tfidf[veccorpus], num_features=len(self._dictionary))
        self._atomic_save(self._index, self._get_datadir(), 'index')

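load_subjects() is essentially the standard gensim TF-IDF pipeline: build a Dictionary, turn documents into bag-of-words vectors, fit a TfidfModel, and index the weighted vectors in a SparseMatrixSimilarity. The same steps in isolation, on toy documents (editor's sketch):

    import gensim.corpora
    import gensim.models
    import gensim.similarities

    docs = [['cats', 'are', 'small', 'felines'],
            ['dogs', 'are', 'loyal', 'companions']]
    dictionary = gensim.corpora.Dictionary(docs)
    bow_corpus = [dictionary.doc2bow(doc) for doc in docs]
    tfidf = gensim.models.TfidfModel(bow_corpus)
    index = gensim.similarities.SparseMatrixSimilarity(
        tfidf[bow_corpus], num_features=len(dictionary))

    query = dictionary.doc2bow(['small', 'cats'])
    print(list(index[tfidf[query]]))  # one cosine similarity per indexed document
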
    def _analyze_chunks(self, chunks):
        results = []
        for docsim in self._index[chunks]:
            sims = sorted(
                enumerate(docsim),
                key=lambda item: item[1],
                reverse=True)
            results.append(sims[:self.MAX_CHUNK_SUBJECTS])
        return results

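Querying a gensim similarity index with a list of vectors returns one row of similarities per query, which is what the loop above iterates; each row is then cut down to its MAX_CHUNK_SUBJECTS best (subject_id, score) pairs. Continuing the toy index from the previous sketch (editor's illustration):

    queries = [dictionary.doc2bow(['cats']), dictionary.doc2bow(['dogs'])]
    for docsim in index[[tfidf[q] for q in queries]]:
        top = sorted(enumerate(docsim), key=lambda item: item[1], reverse=True)
        print(top[:2])  # best (doc_id, score) pairs for this query
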
    def _merge_chunk_results(self, chunk_results):
        subject_scores = collections.defaultdict(float)
        for result in chunk_results:
            for subject_id, score in result:
                subject_scores[subject_id] += score
        best_subjects = sorted(
            [(score, subject_id)
             for subject_id, score in subject_scores.items()],
            reverse=True)
        limit = int(self.params['limit'])
        results = []
        for score, subject_id in best_subjects[:limit]:
            if score <= 0.0:
                continue
            uri, label = self._subjects[subject_id]
            results.append(
                AnalysisHit(uri, label, score / len(chunk_results)))
        return results

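Merging averages each subject's similarity over all chunks: the scores are summed per subject and later divided by len(chunk_results), so a subject missing from a chunk effectively contributes 0 there. A worked toy case (editor's illustration):

    import collections

    chunk_results = [
        [(0, 0.9), (2, 0.4)],  # chunk 1: (subject_id, similarity) pairs
        [(0, 0.5), (1, 0.3)],  # chunk 2
    ]
    subject_scores = collections.defaultdict(float)
    for result in chunk_results:
        for subject_id, score in result:
            subject_scores[subject_id] += score
    # summed: {0: 1.4, 1: 0.3, 2: 0.4}; averaged over the 2 chunks:
    print({sid: s / len(chunk_results) for sid, s in subject_scores.items()})
    # {0: 0.7, 1: 0.15, 2: 0.2}
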
    def _analyze(self, text, params):
        self.initialize()
        self.debug('Analyzing text "{}..." (len={})'.format(
            text[:20], len(text)))
        sentences = self._analyzer.tokenize_sentences(text)
        self.debug('Found {} sentences'.format(len(sentences)))
        chunksize = int(params['chunksize'])
        chunks = []  # chunks represented as TF-IDF normalized vectors
        for i in range(0, len(sentences), chunksize):
            chunktext = ' '.join(sentences[i:i + chunksize])
            chunkbow = self._dictionary.doc2bow(
                self._analyzer.tokenize_words(chunktext))
            chunks.append(self._tfidf[chunkbow])
        self.debug('Split sentences into {} chunks'.format(len(chunks)))
        chunk_results = self._analyze_chunks(chunks)
        return self._merge_chunk_results(chunk_results)
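The chunking step groups every chunksize consecutive sentences into one query document, so a long text yields several TF-IDF vectors whose per-chunk results are then averaged by _merge_chunk_results(). The slicing pattern on its own (editor's sketch):

    sentences = ['First.', 'Second.', 'Third.', 'Fourth.', 'Fifth.']
    chunksize = 2
    chunks = [' '.join(sentences[i:i + chunksize])
              for i in range(0, len(sentences), chunksize)]
    print(chunks)  # ['First. Second.', 'Third. Fourth.', 'Fifth.']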