for testing and deploying your application
for finding and fixing issues
for empowering human code reviews
"""Backend that returns most similar subjects based on similarity in sparse
TF-IDF normalized bag-of-words vector space"""
import os
import os.path
import tempfile
import gensim.corpora
import gensim.models
from . import backend
class VectorCorpus:
    """Wrap a text corpus so it can be iterated as lists of vectors.

    Every document produced by the wrapped corpus is converted with the
    dictionary's ``doc2bow`` method, which maps words to integers.
    """

    def __init__(self, corpus, dictionary):
        """Remember the corpus and the dictionary used to vectorize it.

        corpus -- iterable of documents; each document is handed unchanged
            to ``dictionary.doc2bow``
        dictionary -- object providing ``doc2bow(doc)``
        """
        self.corpus = corpus
        self.dictionary = dictionary

    def __iter__(self):
        """Lazily yield ``dictionary.doc2bow(doc)`` for each document."""
        # Each call starts a new pass over self.corpus, so this wrapper can
        # be iterated more than once whenever the corpus itself can.
        for doc in self.corpus:
            yield self.dictionary.doc2bow(doc)
class TFIDFBackend(backend.AnnifBackend):
    """Backend that builds and stores a gensim dictionary and TF-IDF model.

    The models are written into the directory returned by
    ``self._get_datadir()`` under the file names ``dictionary`` and
    ``tfidf``.
    """

    name = "tfidf"

    def _atomic_save(self, obj, dirname, filename):
        """Save obj as ``dirname/filename`` via a temporary file.

        The object is first written to a uniquely named temporary file in
        the same directory and then moved over the final name, so the final
        path never holds a partially written file.

        obj -- object providing ``save(path)``
        dirname -- directory that receives both the temporary and final file
        filename -- final file name; also used as the temporary file's prefix

        Any exception raised while saving or renaming propagates to the
        caller after the temporary file has been cleaned up.
        """
        # Deliberately kept as an instance method (it does not use self) so
        # existing call sites and subclass overrides keep working.
        tempfd, tempfilename = tempfile.mkstemp(prefix=filename, dir=dirname)
        # Only the reserved name is needed; obj.save() opens the path itself.
        os.close(tempfd)
        try:
            obj.save(tempfilename)
            # os.replace overwrites an existing destination on every
            # platform, whereas os.rename fails on Windows in that case.
            os.replace(tempfilename, os.path.join(dirname, filename))
        finally:
            # After a successful replace the temporary name no longer
            # exists; if it does, saving or renaming failed and the
            # leftover file must not accumulate in the data directory.
            if os.path.exists(tempfilename):
                os.remove(tempfilename)

    def load_subjects(self, subjects, analyzer):
        """Build the dictionary and TF-IDF model from subjects and save them.

        subjects -- object providing ``tokens(analyzer)``
        analyzer -- passed through to ``subjects.tokens``
        """
        corpus = subjects.tokens(analyzer)
        dictionary = gensim.corpora.Dictionary(corpus)
        self._atomic_save(dictionary, self._get_datadir(), 'dictionary')
        # NOTE(review): corpus is consumed a second time below; this assumes
        # subjects.tokens() returns a re-iterable object rather than a
        # one-shot generator -- TODO confirm.
        veccorpus = VectorCorpus(corpus, dictionary)
        tfidf = gensim.models.TfidfModel(veccorpus)
        self._atomic_save(tfidf, self._get_datadir(), 'tfidf')

    def analyze(self, text):
        """Return results for text; not implemented yet, always ``[]``."""
        return []  # TODO
TODO
FIXME