| 
                    1
                 | 
                                    
                                                     | 
                
                 | 
                """Backend that returns most similar subjects based on similarity in sparse  | 
            
            
                                                        
            
                                    
            
            
                | 
                    2
                 | 
                                    
                                                     | 
                
                 | 
                TF-IDF normalized bag-of-words vector space"""  | 
            
            
                                                        
            
                                    
            
            
                | 
                    3
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    4
                 | 
                                    
                                                     | 
                
                 | 
                import os.path  | 
            
            
                                                        
            
                                    
            
            
                | 
                    5
                 | 
                                    
                                                     | 
                
                 | 
                import tempfile  | 
            
            
                                                        
            
                                    
            
            
                | 
                    6
                 | 
                                    
                                                     | 
                
                 | 
                import joblib  | 
            
            
                                                        
            
                                    
            
            
                | 
                    7
                 | 
                                    
                                                     | 
                
                 | 
                import gensim.similarities  | 
            
            
                                                        
            
                                    
            
            
                | 
                    8
                 | 
                                    
                                                     | 
                
                 | 
                from gensim.matutils import Sparse2Corpus  | 
            
            
                                                        
            
                                    
            
            
                | 
                    9
                 | 
                                    
                                                     | 
                
                 | 
                from sklearn.feature_extraction.text import TfidfVectorizer  | 
            
            
                                                        
            
                                    
            
            
                | 
                    10
                 | 
                                    
                                                     | 
                
                 | 
                import annif.util  | 
            
            
                                                        
            
                                    
            
            
                | 
                    11
                 | 
                                    
                                                     | 
                
                 | 
                from annif.suggestion import VectorSuggestionResult  | 
            
            
                                                        
            
                                    
            
            
                | 
                    12
                 | 
                                    
                                                     | 
                
                 | 
                from annif.exception import NotInitializedException, NotSupportedException  | 
            
            
                                                        
            
                                    
            
            
                | 
                    13
                 | 
                                    
                                                     | 
                
                 | 
                from . import backend  | 
            
            
                                                        
            
                                    
            
            
                | 
                    14
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    15
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    16
                 | 
                                    
                                                     | 
                
                 | 
                class SubjectBuffer:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    17
                 | 
                                    
                                                     | 
                
                 | 
                    """A file-backed buffer to store and retrieve subject text."""  | 
            
            
                                                        
            
                                    
            
            
                | 
                    18
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    19
                 | 
                                    
                                                     | 
                
                 | 
                    BUFFER_SIZE = 100  | 
            
            
                                                        
            
                                    
            
            
                | 
                    20
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    21
                 | 
                                    
                                                     | 
                
                 | 
                    def __init__(self, tempdir, subject_id):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    22
                 | 
                                    
                                                     | 
                
                 | 
                        filename = '{:08d}.txt'.format(subject_id) | 
            
            
                                                        
            
                                    
            
            
                | 
                    23
                 | 
                                    
                                                     | 
                
                 | 
                        self._path = os.path.join(tempdir, filename)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    24
                 | 
                                    
                                                     | 
                
                 | 
                        self._buffer = []  | 
            
            
                                                        
            
                                    
            
            
                | 
                    25
                 | 
                                    
                                                     | 
                
                 | 
                        self._created = False  | 
            
            
                                                        
            
                                    
            
            
                | 
                    26
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    27
                 | 
                                    
                                                     | 
                
                 | 
                    def flush(self):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    28
                 | 
                                    
                                                     | 
                
                 | 
                        if self._created:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    29
                 | 
                                    
                                                     | 
                
                 | 
                            mode = 'a'  | 
            
            
                                                        
            
                                    
            
            
                | 
                    30
                 | 
                                    
                                                     | 
                
                 | 
                        else:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    31
                 | 
                                    
                                                     | 
                
                 | 
                            mode = 'w'  | 
            
            
                                                        
            
                                    
            
            
                | 
                    32
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    33
                 | 
                                    
                                                     | 
                
                 | 
                        with open(self._path, mode, encoding='utf-8') as subjfile:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    34
                 | 
                                    
                                                     | 
                
                 | 
                            for text in self._buffer:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    35
                 | 
                                    
                                                     | 
                
                 | 
                                print(text, file=subjfile)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    36
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    37
                 | 
                                    
                                                     | 
                
                 | 
                        self._buffer = []  | 
            
            
                                                        
            
                                    
            
            
                | 
                    38
                 | 
                                    
                                                     | 
                
                 | 
                        self._created = True  | 
            
            
                                                        
            
                                    
            
            
                | 
                    39
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    40
                 | 
                                    
                                                     | 
                
                 | 
                    def write(self, text):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    41
                 | 
                                    
                                                     | 
                
                 | 
                        self._buffer.append(text)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    42
                 | 
                                    
                                                     | 
                
                 | 
                        if len(self._buffer) >= self.BUFFER_SIZE:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    43
                 | 
                                    
                                                     | 
                
                 | 
                            self.flush()  | 
            
            
                                                        
            
                                    
            
            
                | 
                    44
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    45
                 | 
                                    
                                                     | 
                
                 | 
                    def read(self):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    46
                 | 
                                    
                                                     | 
                
                 | 
                        if not self._created:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    47
                 | 
                                    
                                                     | 
                
                 | 
                            # file was never created - we can simply return the buffer content  | 
            
            
                                                        
            
                                    
            
            
                | 
                    48
                 | 
                                    
                                                     | 
                
                 | 
                            return "\n".join(self._buffer)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    49
                 | 
                                    
                                                     | 
                
                 | 
                        else:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    50
                 | 
                                    
                                                     | 
                
                 | 
                            with open(self._path, 'r', encoding='utf-8') as subjfile:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    51
                 | 
                                    
                                                     | 
                
                 | 
                                return subjfile.read() + "\n" + "\n".join(self._buffer)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    52
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    53
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    54
                 | 
                                    
                                                     | 
                
                 | 
                class TFIDFBackend(backend.AnnifBackend):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    55
                 | 
                                    
                                                     | 
                
                 | 
                    """TF-IDF vector space similarity based backend for Annif"""  | 
            
            
                                                        
            
                                    
            
            
                | 
                    56
                 | 
                                    
                                                     | 
                
                 | 
                    name = "tfidf"  | 
            
            
                                                        
            
                                    
            
            
                | 
                    57
                 | 
                                    
                                                     | 
                
                 | 
                    needs_subject_index = True  | 
            
            
                                                        
            
                                    
            
            
                | 
                    58
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    59
                 | 
                                    
                                                     | 
                
                 | 
                    # defaults for uninitialized instances  | 
            
            
                                                        
            
                                    
            
            
                | 
                    60
                 | 
                                    
                                                     | 
                
                 | 
                    _vectorizer = None  | 
            
            
                                                        
            
                                    
            
            
                | 
                    61
                 | 
                                    
                                                     | 
                
                 | 
                    _index = None  | 
            
            
                                                        
            
                                    
            
            
                | 
                    62
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    63
                 | 
                                    
                                                     | 
                
                 | 
                    VECTORIZER_FILE = 'vectorizer'  | 
            
            
                                                        
            
                                    
            
            
                | 
                    64
                 | 
                                    
                                                     | 
                
                 | 
                    INDEX_FILE = 'tfidf-index'  | 
            
            
                                                        
            
                                    
            
            
                | 
                    65
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    66
                 | 
                                    
                                                     | 
                
                 | 
                    def _generate_subjects_from_documents(self, corpus, project):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    67
                 | 
                                    
                                                     | 
                
                 | 
                        with tempfile.TemporaryDirectory() as tempdir:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    68
                 | 
                                    
                                                     | 
                
                 | 
                            subject_buffer = {} | 
            
            
                                                        
            
                                    
            
            
                | 
                    69
                 | 
                                    
                                                     | 
                
                 | 
                            for subject_id in range(len(project.subjects)):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    70
                 | 
                                    
                                                     | 
                
                 | 
                                subject_buffer[subject_id] = SubjectBuffer(tempdir,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    71
                 | 
                                    
                                                     | 
                
                 | 
                                                                           subject_id)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    72
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    73
                 | 
                                    
                                                     | 
                
                 | 
                            for doc in corpus.documents:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    74
                 | 
                                    
                                                     | 
                
                 | 
                                tokens = project.analyzer.tokenize_words(doc.text)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    75
                 | 
                                    
                                                     | 
                
                 | 
                                for uri in doc.uris:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    76
                 | 
                                    
                                                     | 
                
                 | 
                                    subject_id = project.subjects.by_uri(uri)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    77
                 | 
                                    
                                                     | 
                
                 | 
                                    if subject_id is None:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    78
                 | 
                                    
                                                     | 
                
                 | 
                                        continue  | 
            
            
                                                        
            
                                    
            
            
                | 
                    79
                 | 
                                    
                                                     | 
                
                 | 
                                    subject_buffer[subject_id].write(" ".join(tokens)) | 
            
            
                                                        
            
                                    
            
            
                | 
                    80
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    81
                 | 
                                    
                                                     | 
                
                 | 
                            for sid in range(len(project.subjects)):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    82
                 | 
                                    
                                                     | 
                
                 | 
                                yield subject_buffer[sid].read()  | 
            
            
                                                        
            
                                    
            
            
                | 
                    83
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    84
                 | 
                                    
                                                     | 
                
                 | 
                    def _initialize_vectorizer(self):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    85
                 | 
                                    
                                                     | 
                
                 | 
                        if self._vectorizer is None:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    86
                 | 
                                    
                                                     | 
                
                 | 
                            path = os.path.join(self.datadir, self.VECTORIZER_FILE)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    87
                 | 
                                    
                                                     | 
                
                 | 
                            if os.path.exists(path):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    88
                 | 
                                    
                                                     | 
                
                 | 
                                self.debug('loading vectorizer from {}'.format(path)) | 
            
            
                                                        
            
                                    
            
            
                | 
                    89
                 | 
                                    
                                                     | 
                
                 | 
                                self._vectorizer = joblib.load(path)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    90
                 | 
                                    
                                                     | 
                
                 | 
                            else:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    91
                 | 
                                    
                                                     | 
                
                 | 
                                raise NotInitializedException(  | 
            
            
                                                        
            
                                    
            
            
                | 
                    92
                 | 
                                    
                                                     | 
                
                 | 
                                    "vectorizer file '{}' not found".format(path), | 
            
            
                                                        
            
                                    
            
            
                | 
                    93
                 | 
                                    
                                                     | 
                
                 | 
                                    backend_id=self.backend_id)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    94
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    95
                 | 
                                    
                                                     | 
                
                 | 
                    def _initialize_index(self):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    96
                 | 
                                    
                                                     | 
                
                 | 
                        if self._index is None:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    97
                 | 
                                    
                                                     | 
                
                 | 
                            path = os.path.join(self.datadir, self.INDEX_FILE)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    98
                 | 
                                    
                                                     | 
                
                 | 
                            self.debug('loading similarity index from {}'.format(path)) | 
            
            
                                                        
            
                                    
            
            
                | 
                    99
                 | 
                                    
                                                     | 
                
                 | 
                            if os.path.exists(path):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    100
                 | 
                                    
                                                     | 
                
                 | 
                                self._index = gensim.similarities.SparseMatrixSimilarity.load(  | 
            
            
                                                        
            
                                    
            
            
                | 
                    101
                 | 
                                    
                                                     | 
                
                 | 
                                    path)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    102
                 | 
                                    
                                                     | 
                
                 | 
                            else:  | 
            
            
                                                        
            
                                    
            
            
                | 
                    103
                 | 
                                    
                                                     | 
                
                 | 
                                raise NotInitializedException(  | 
            
            
                                                        
            
                                    
            
            
                | 
                    104
                 | 
                                    
                                                     | 
                
                 | 
                                    'similarity index {} not found'.format(path), | 
            
            
                                                        
            
                                    
            
            
                | 
                    105
                 | 
                                    
                                                     | 
                
                 | 
                                    backend_id=self.backend_id)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    106
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    107
                 | 
                                    
                                                     | 
                
                 | 
                    def initialize(self):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    108
                 | 
                                    
                                                     | 
                
                 | 
                        self._initialize_vectorizer()  | 
            
            
                                                        
            
                                    
            
            
                | 
                    109
                 | 
                                    
                                                     | 
                
                 | 
                        self._initialize_index()  | 
            
            
                                                        
            
                                    
            
            
                | 
                    110
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    111
                 | 
                                    
                                                     | 
                
                 | 
                    def _create_index(self, veccorpus):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    112
                 | 
                                    
                                                     | 
                
                 | 
                        self.info('creating similarity index') | 
            
            
                                                        
            
                                    
            
            
                | 
                    113
                 | 
                                    
                                                     | 
                
                 | 
                        gscorpus = Sparse2Corpus(veccorpus, documents_columns=False)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    114
                 | 
                                    
                                                     | 
                
                 | 
                        self._index = gensim.similarities.SparseMatrixSimilarity(  | 
            
            
                                                        
            
                                    
            
            
                | 
                    115
                 | 
                                    
                                                     | 
                
                 | 
                            gscorpus, num_features=len(self._vectorizer.vocabulary_))  | 
            
            
                                                        
            
                                    
            
            
                | 
                    116
                 | 
                                    
                                                     | 
                
                 | 
                        annif.util.atomic_save(  | 
            
            
                                                        
            
                                    
            
            
                | 
                    117
                 | 
                                    
                                                     | 
                
                 | 
                            self._index,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    118
                 | 
                                    
                                                     | 
                
                 | 
                            self.datadir,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    119
                 | 
                                    
                                                     | 
                
                 | 
                            self.INDEX_FILE)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    120
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    121
                 | 
                                    
                                                     | 
                
                 | 
                    def train(self, corpus, project):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    122
                 | 
                                    
                                                     | 
                
                 | 
                        if corpus.is_empty():  | 
            
            
                                                        
            
                                    
            
            
                | 
                    123
                 | 
                                    
                                                     | 
                
                 | 
                            raise NotSupportedException(  | 
            
            
                                                        
            
                                    
            
            
                | 
                    124
                 | 
                                    
                                                     | 
                
                 | 
                                'Cannot train tfidf project with no documents')  | 
            
            
                                                        
            
                                    
            
            
                | 
                    125
                 | 
                                    
                                                     | 
                
                 | 
                        self.info('transforming subject corpus') | 
            
            
                                                        
            
                                    
            
            
                | 
                    126
                 | 
                                    
                                                     | 
                
                 | 
                        subjects = self._generate_subjects_from_documents(corpus, project)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    127
                 | 
                                    
                                                     | 
                
                 | 
                        self.info('creating vectorizer') | 
            
            
                                                        
            
                                    
            
            
                | 
                    128
                 | 
                                    
                                                     | 
                
                 | 
                        self._vectorizer = TfidfVectorizer()  | 
            
            
                                                        
            
                                    
            
            
                | 
                    129
                 | 
                                    
                                                     | 
                
                 | 
                        veccorpus = self._vectorizer.fit_transform(subjects)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    130
                 | 
                                    
                                                     | 
                
                 | 
                        annif.util.atomic_save(  | 
            
            
                                                        
            
                                    
            
            
                | 
                    131
                 | 
                                    
                                                     | 
                
                 | 
                            self._vectorizer,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    132
                 | 
                                    
                                                     | 
                
                 | 
                            self.datadir,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    133
                 | 
                                    
                                                     | 
                
                 | 
                            self.VECTORIZER_FILE,  | 
            
            
                                                        
            
                                    
            
            
                | 
                    134
                 | 
                                    
                                                     | 
                
                 | 
                            method=joblib.dump)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    135
                 | 
                                    
                                                     | 
                
                 | 
                        self._create_index(veccorpus)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    136
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                        
            
                                    
            
            
                | 
                    137
                 | 
                                    
                                                     | 
                
                 | 
                    def _suggest(self, text, project, params):  | 
            
            
                                                        
            
                                    
            
            
                | 
                    138
                 | 
                                    
                                                     | 
                
                 | 
                        self.debug('Suggesting subjects for text "{}..." (len={})'.format( | 
            
            
                                                        
            
                                    
            
            
                | 
                    139
                 | 
                                    
                                                     | 
                
                 | 
                            text[:20], len(text)))  | 
            
            
                                                        
            
                                    
            
            
                | 
                    140
                 | 
                                    
                                                     | 
                
                 | 
                        tokens = project.analyzer.tokenize_words(text)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    141
                 | 
                                    
                                                     | 
                
                 | 
                        vectors = self._vectorizer.transform([" ".join(tokens)])  | 
            
            
                                                        
            
                                    
            
            
                | 
                    142
                 | 
                                    
                                                     | 
                
                 | 
                        docsim = self._index[vectors[0]]  | 
            
            
                                                        
            
                                    
            
            
                | 
                    143
                 | 
                                    
                                                     | 
                
                 | 
                        fullresult = VectorSuggestionResult(docsim, project.subjects)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    144
                 | 
                                    
                                                     | 
                
                 | 
                        return fullresult.filter(limit=int(self.params['limit']))  | 
            
            
                                                        
            
                                    
            
            
                | 
                    145
                 | 
                                    
                                                     | 
                
                 | 
                 |