| 
                    1
                 | 
                                    
                                                     | 
                
                 | 
                """MLLM (Maui-like Lexical Matchin) model for Annif"""  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    2
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    3
                 | 
                                    
                                                     | 
                
                 | 
                import collections  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    4
                 | 
                                    
                                                     | 
                
                 | 
                import math  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    5
                 | 
                                    
                                                     | 
                
                 | 
                import joblib  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    6
                 | 
                                    
                                                     | 
                
                 | 
                from statistics import mean  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    7
                 | 
                                    
                                                     | 
                
                 | 
                from enum import IntEnum  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    8
                 | 
                                    
                                                     | 
                
                 | 
                import numpy as np  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    9
                 | 
                                    
                                                     | 
                
                 | 
                from rdflib.namespace import SKOS  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    10
                 | 
                                    
                                                     | 
                
                 | 
                from sklearn.feature_extraction.text import CountVectorizer  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    11
                 | 
                                    
                                                     | 
                
                 | 
                from sklearn.ensemble import BaggingClassifier  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    12
                 | 
                                    
                                                     | 
                
                 | 
                from sklearn.tree import DecisionTreeClassifier  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    13
                 | 
                                    
                                                     | 
                
                 | 
                import annif.util  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    14
                 | 
                                    
                                                     | 
                
                 | 
                from annif.lexical.tokenset import TokenSet, TokenSetIndex  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    15
                 | 
                                    
                                                     | 
                
                 | 
                from annif.lexical.util import get_subject_labels  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    16
                 | 
                                    
                                                     | 
                
                 | 
                from annif.lexical.util import make_relation_matrix, make_collection_matrix  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    17
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    18
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    19
                 | 
                                    
                                                     | 
                
                 | 
                Term = collections.namedtuple('Term', 'subject_id label is_pref') | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    20
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    21
                 | 
                                    
                                                     | 
                
                 | 
                Match = collections.namedtuple(  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    22
                 | 
                                    
                                                     | 
                
                 | 
                    'Match', 'subject_id is_pref n_tokens pos ambiguity')  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    23
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    24
                 | 
                                    
                                                     | 
                
                 | 
                Candidate = collections.namedtuple(  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    25
                 | 
                                    
                                                     | 
                
                 | 
                    'Candidate',  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    26
                 | 
                                    
                                                     | 
                
                 | 
                    'doc_length subject_id freq is_pref n_tokens ambiguity ' +  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    27
                 | 
                                    
                                                     | 
                
                 | 
                    'first_occ last_occ spread')  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    28
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    29
                 | 
                                    
                                                     | 
                
                 | 
                Feature = IntEnum(  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    30
                 | 
                                    
                                                     | 
                
                 | 
                    'Feature',  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    31
                 | 
                                    
                                                     | 
                
                 | 
                    'freq doc_freq subj_freq tfidf is_pref n_tokens ambiguity ' +  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    32
                 | 
                                    
                                                     | 
                
                 | 
                    'first_occ last_occ spread doc_length ' +  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    33
                 | 
                                    
                                                     | 
                
                 | 
                    'broader narrower related collection',  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    34
                 | 
                                    
                                                     | 
                
                 | 
                    start=0)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    35
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    36
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    37
                 | 
                                    
                                                     | 
                
                 | 
                class MLLMModel:  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    38
                 | 
                                    
                                                     | 
                
                 | 
                    """Maui-like Lexical Matching model"""  | 
            
            
                                                                                                            
                                                                
            
                                    
            
            
                | 
                    39
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    40
                 | 
                                    
                                                     | 
                
                 | 
                    def _conflate_matches(self, matches, doc_length):  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    41
                 | 
                                    
                                                     | 
                
                 | 
                        subj_matches = collections.defaultdict(list)  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    42
                 | 
                                    
                                                     | 
                
                 | 
                        for match in matches:  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    43
                 | 
                                    
                                                     | 
                
                 | 
                            subj_matches[match.subject_id].append(match)  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    44
                 | 
                                    
                                                     | 
                
                 | 
                        return [  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    45
                 | 
                                    
                                                     | 
                
                 | 
                            Candidate(  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    46
                 | 
                                    
                                                     | 
                
                 | 
                                doc_length=doc_length,  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    47
                 | 
                                    
                                                     | 
                
                 | 
                                subject_id=subject_id,  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    48
                 | 
                                    
                                                     | 
                
                 | 
                                freq=len(matches) / doc_length,  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    49
                 | 
                                    
                                                     | 
                
                 | 
                                is_pref=mean((float(m.is_pref) for m in matches)),  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    50
                 | 
                                    
                                                     | 
                
                 | 
                                n_tokens=mean((m.n_tokens for m in matches)),  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    51
                 | 
                                    
                                                     | 
                
                 | 
                                ambiguity=mean((m.ambiguity for m in matches)),  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    52
                 | 
                                    
                                                     | 
                
                 | 
                                first_occ=matches[0].pos / doc_length,  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    53
                 | 
                                    
                                                     | 
                
                 | 
                                last_occ=matches[-1].pos / doc_length,  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    54
                 | 
                                    
                                                     | 
                
                 | 
                                spread=(matches[-1].pos - matches[0].pos) / doc_length  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    55
                 | 
                                    
                                                     | 
                
                 | 
                            )  | 
            
            
                                                                        
                            
            
                                    
            
            
                | 
                    56
                 | 
                                    
                                                     | 
                
                 | 
                            for subject_id, matches in subj_matches.items()]  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    57
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    58
                 | 
                                    
                                                     | 
                
                 | 
                    def generate_candidates(self, text, analyzer):  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    59
                 | 
                                    
                                                     | 
                
                 | 
                        sentences = analyzer.tokenize_sentences(text)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    60
                 | 
                                    
                                                     | 
                
                 | 
                        sent_tokens = self._vectorizer.transform(sentences)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    61
                 | 
                                    
                                                     | 
                
                 | 
                        matches = []  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    62
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    63
                 | 
                                    
                                                     | 
                
                 | 
                        for sent_idx, token_matrix in enumerate(sent_tokens):  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    64
                 | 
                                    
                                                     | 
                
                 | 
                            tset = TokenSet(token_matrix.nonzero()[1])  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    65
                 | 
                                    
                                                     | 
                
                 | 
                            for ts, ambiguity in self._index.search(tset):  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    66
                 | 
                                    
                                                     | 
                
                 | 
                                matches.append(Match(subject_id=ts.subject_id,  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    67
                 | 
                                    
                                                     | 
                
                 | 
                                                     is_pref=ts.is_pref,  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    68
                 | 
                                    
                                                     | 
                
                 | 
                                                     n_tokens=len(ts),  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    69
                 | 
                                    
                                                     | 
                
                 | 
                                                     pos=sent_idx,  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    70
                 | 
                                    
                                                     | 
                
                 | 
                                                     ambiguity=ambiguity))  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    71
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    72
                 | 
                                    
                                                     | 
                
                 | 
                        return self._conflate_matches(matches, len(sentences))  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    73
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    74
                 | 
                                    
                                                     | 
                
                 | 
                    def _candidates_to_features(self, candidates):  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    75
                 | 
                                    
                                                     | 
                
                 | 
                        """Convert a list of Candidates to a NumPy feature matrix"""  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    76
                 | 
                                    
                                                     | 
                
                 | 
                        matrix = np.zeros((len(candidates), len(Feature)), dtype=np.float32)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    77
                 | 
                                    
                                                     | 
                
                 | 
                        c_ids = [c.subject_id for c in candidates]  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    78
                 | 
                                    
                                                     | 
                
                 | 
                        c_vec = np.zeros(self._related_matrix.shape[0], dtype=np.bool)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    79
                 | 
                                    
                                                     | 
                
                 | 
                        c_vec[c_ids] = True  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    80
                 | 
                                    
                                                     | 
                
                 | 
                        broader = self._broader_matrix.multiply(c_vec).sum(axis=1)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    81
                 | 
                                    
                                                     | 
                
                 | 
                        narrower = self._narrower_matrix.multiply(c_vec).sum(axis=1)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    82
                 | 
                                    
                                                     | 
                
                 | 
                        related = self._related_matrix.multiply(c_vec).sum(axis=1)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    83
                 | 
                                    
                                                     | 
                
                 | 
                        collection = self._collection_matrix.multiply(c_vec).T.dot(  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    84
                 | 
                                    
                                                     | 
                
                 | 
                            self._collection_matrix).sum(axis=0)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    85
                 | 
                                    
                                                     | 
                
                 | 
                        for idx, c in enumerate(candidates):  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    86
                 | 
                                    
                                                     | 
                
                 | 
                            subj = c.subject_id  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    87
                 | 
                                    
                                                     | 
                
                 | 
                            matrix[idx, Feature.freq] = c.freq  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    88
                 | 
                                    
                                                     | 
                
                 | 
                            matrix[idx, Feature.doc_freq] = self._doc_freq[subj]  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    89
                 | 
                                    
                                                     | 
                
                 | 
                            matrix[idx, Feature.subj_freq] = self._subj_freq.get(subj, 1) - 1  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    90
                 | 
                                    
                                                     | 
                
                 | 
                            matrix[idx, Feature.tfidf] = c.freq * self._idf[subj]  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    91
                 | 
                                    
                                                     | 
                
                 | 
                            matrix[idx, Feature.is_pref] = c.is_pref  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    92
                 | 
                                    
                                                     | 
                
                 | 
                            matrix[idx, Feature.n_tokens] = c.n_tokens  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    93
                 | 
                                    
                                                     | 
                
                 | 
                            matrix[idx, Feature.ambiguity] = c.ambiguity  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    94
                 | 
                                    
                                                     | 
                
                 | 
                            matrix[idx, Feature.first_occ] = c.first_occ  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    95
                 | 
                                    
                                                     | 
                
                 | 
                            matrix[idx, Feature.last_occ] = c.last_occ  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    96
                 | 
                                    
                                                     | 
                
                 | 
                            matrix[idx, Feature.spread] = c.spread  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    97
                 | 
                                    
                                                     | 
                
                 | 
                            matrix[idx, Feature.doc_length] = c.doc_length  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    98
                 | 
                                    
                                                     | 
                
                 | 
                            matrix[idx, Feature.broader] = broader[subj, 0] / len(c_ids)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    99
                 | 
                                    
                                                     | 
                
                 | 
                            matrix[idx, Feature.narrower] = narrower[subj, 0] / len(c_ids)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    100
                 | 
                                    
                                                     | 
                
                 | 
                            matrix[idx, Feature.related] = related[subj, 0] / len(c_ids)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    101
                 | 
                                    
                                                     | 
                
                 | 
                            matrix[idx, Feature.collection] = collection[0, subj] / len(c_ids)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    102
                 | 
                                    
                                                     | 
                
                 | 
                        return matrix  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    103
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    104
                 | 
                                    
                                                     | 
                
                 | 
                    def _prepare_terms(self, graph, vocab, params):  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    105
                 | 
                                    
                                                     | 
                
                 | 
                        if annif.util.boolean(params['use_hidden_labels']):  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    106
                 | 
                                    
                                                     | 
                
                 | 
                            label_props = [SKOS.altLabel, SKOS.hiddenLabel]  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    107
                 | 
                                    
                                                     | 
                
                 | 
                        else:  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    108
                 | 
                                    
                                                     | 
                
                 | 
                            label_props = [SKOS.altLabel]  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    109
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    110
                 | 
                                    
                                                     | 
                
                 | 
                        terms = []  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    111
                 | 
                                    
                                                     | 
                
                 | 
                        subject_ids = []  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    112
                 | 
                                    
                                                     | 
                
                 | 
                        for subj_id, uri, pref, _ in vocab.subjects.active:  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    113
                 | 
                                    
                                                     | 
                
                 | 
                            subject_ids.append(subj_id)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    114
                 | 
                                    
                                                     | 
                
                 | 
                            terms.append(Term(subject_id=subj_id, label=pref, is_pref=True))  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    115
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    116
                 | 
                                    
                                                     | 
                
                 | 
                            for label in get_subject_labels(graph, uri, label_props,  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    117
                 | 
                                    
                                                     | 
                
                 | 
                                                            params['language']):  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    118
                 | 
                                    
                                                     | 
                
                 | 
                                terms.append(Term(subject_id=subj_id,  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    119
                 | 
                                    
                                                     | 
                
                 | 
                                                  label=label,  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    120
                 | 
                                    
                                                     | 
                
                 | 
                                                  is_pref=False))  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    121
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    122
                 | 
                                    
                                                     | 
                
                 | 
                        return (terms, subject_ids)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    123
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    124
                 | 
                                    
                                                     | 
                
                 | 
                    def _prepare_relations(self, graph, vocab):  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    125
                 | 
                                    
                                                     | 
                
                 | 
                        self._broader_matrix = make_relation_matrix(  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    126
                 | 
                                    
                                                     | 
                
                 | 
                            graph, vocab, SKOS.broader)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    127
                 | 
                                    
                                                     | 
                
                 | 
                        self._narrower_matrix = make_relation_matrix(  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    128
                 | 
                                    
                                                     | 
                
                 | 
                            graph, vocab, SKOS.narrower)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    129
                 | 
                                    
                                                     | 
                
                 | 
                        self._related_matrix = make_relation_matrix(  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    130
                 | 
                                    
                                                     | 
                
                 | 
                            graph, vocab, SKOS.related)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    131
                 | 
                                    
                                                     | 
                
                 | 
                        self._collection_matrix = make_collection_matrix(graph, vocab)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    132
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    133
                 | 
                                    
                                                     | 
                
                 | 
                    def _prepare_train_index(self, vocab, analyzer, params):  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    134
                 | 
                                    
                                                     | 
                
                 | 
                        graph = vocab.as_graph()  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    135
                 | 
                                    
                                                     | 
                
                 | 
                        terms, subject_ids = self._prepare_terms(graph, vocab, params)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    136
                 | 
                                    
                                                     | 
                
                 | 
                        self._prepare_relations(graph, vocab)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    137
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    138
                 | 
                                    
                                                     | 
                
                 | 
                        self._vectorizer = CountVectorizer(  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    139
                 | 
                                    
                                                     | 
                
                 | 
                            binary=True,  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    140
                 | 
                                    
                                                     | 
                
                 | 
                            tokenizer=analyzer.tokenize_words  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    141
                 | 
                                    
                                                     | 
                
                 | 
                        )  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    142
                 | 
                                    
                                                     | 
                
                 | 
                        label_corpus = self._vectorizer.fit_transform((t.label for t in terms))  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    143
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    144
                 | 
                                    
                                                     | 
                
                 | 
                        self._index = TokenSetIndex()  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    145
                 | 
                                    
                                                     | 
                
                 | 
                        for term, label_matrix in zip(terms, label_corpus):  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    146
                 | 
                                    
                                                     | 
                
                 | 
                            tokens = label_matrix.nonzero()[1]  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    147
                 | 
                                    
                                                     | 
                
                 | 
                            tset = TokenSet(tokens, term.subject_id, term.is_pref)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    148
                 | 
                                    
                                                     | 
                
                 | 
                            self._index.add(tset)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    149
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    150
                 | 
                                    
                                                     | 
                
                 | 
                        return subject_ids  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    151
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    152
                 | 
                                    
                                                     | 
                
                 | 
                    def _calculate_idf(self, subject_ids, doc_count):  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    153
                 | 
                                    
                                                     | 
                
                 | 
                        idf = collections.defaultdict(float)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    154
                 | 
                                    
                                                     | 
                
                 | 
                        for subj_id in subject_ids:  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    155
                 | 
                                    
                                                     | 
                
                 | 
                            idf[subj_id] = math.log((doc_count + 1) /  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    156
                 | 
                                    
                                                     | 
                
                 | 
                                                    (self._doc_freq[subj_id] + 1)) + 1  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    157
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    158
                 | 
                                    
                                                     | 
                
                 | 
                        return idf  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    159
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    160
                 | 
                                    
                                                     | 
                
                 | 
                    def prepare_train(self, corpus, vocab, analyzer, params):  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    161
                 | 
                                    
                                                     | 
                
                 | 
                        subject_ids = self._prepare_train_index(vocab, analyzer, params)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    162
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    163
                 | 
                                    
                                                     | 
                
                 | 
                        # frequency of subjects (by id) in the generated candidates  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    164
                 | 
                                    
                                                     | 
                
                 | 
                        self._doc_freq = collections.Counter()  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    165
                 | 
                                    
                                                     | 
                
                 | 
                        # frequency of manually assigned subjects ("domain keyphraseness") | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    166
                 | 
                                    
                                                     | 
                
                 | 
                        self._subj_freq = collections.Counter()  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    167
                 | 
                                    
                                                     | 
                
                 | 
                        doc_count = 0  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    168
                 | 
                                    
                                                     | 
                
                 | 
                        train_x = []  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    169
                 | 
                                    
                                                     | 
                
                 | 
                        train_y = []  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    170
                 | 
                                    
                                                     | 
                
                 | 
                        for idx, doc in enumerate(corpus.documents):  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    171
                 | 
                                    
                                                     | 
                
                 | 
                            doc_subject_ids = [vocab.subjects.by_uri(uri)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    172
                 | 
                                    
                                                     | 
                
                 | 
                                               for uri in doc.uris]  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    173
                 | 
                                    
                                                     | 
                
                 | 
                            self._subj_freq.update(doc_subject_ids)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    174
                 | 
                                    
                                                     | 
                
                 | 
                            candidates = self.generate_candidates(doc.text, analyzer)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    175
                 | 
                                    
                                                     | 
                
                 | 
                            self._doc_freq.update([c.subject_id for c in candidates])  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    176
                 | 
                                    
                                                     | 
                
                 | 
                            train_x.append(candidates)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    177
                 | 
                                    
                                                     | 
                
                 | 
                            train_y += [(c.subject_id in doc_subject_ids) for c in candidates]  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    178
                 | 
                                    
                                                     | 
                
                 | 
                            doc_count += 1  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    179
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    180
                 | 
                                    
                                                     | 
                
                 | 
                        # precalculate idf values for all candidate subjects  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    181
                 | 
                                    
                                                     | 
                
                 | 
                        self._idf = self._calculate_idf(subject_ids, doc_count)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    182
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    183
                 | 
                                    
                                                     | 
                
                 | 
                        return (np.vstack([self._candidates_to_features(candidates)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    184
                 | 
                                    
                                                     | 
                
                 | 
                                           for candidates in train_x]), np.array(train_y))  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    185
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    186
                 | 
                                    
                                                     | 
                
                 | 
                    def _create_classifier(self, params):  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    187
                 | 
                                    
                                                     | 
                
                 | 
                        return BaggingClassifier(  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    188
                 | 
                                    
                                                     | 
                
                 | 
                            DecisionTreeClassifier(  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    189
                 | 
                                    
                                                     | 
                
                 | 
                                min_samples_leaf=int(params['min_samples_leaf']),  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    190
                 | 
                                    
                                                     | 
                
                 | 
                                max_leaf_nodes=int(params['max_leaf_nodes'])  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    191
                 | 
                                    
                                                     | 
                
                 | 
                            ), max_samples=float(params['max_samples']))  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    192
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    193
                 | 
                                    
                                                     | 
                
                 | 
                    def train(self, train_x, train_y, params):  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    194
                 | 
                                    
                                                     | 
                
                 | 
                        # fit the model on the training corpus  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    195
                 | 
                                    
                                                     | 
                
                 | 
                        self._classifier = self._create_classifier(params)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    196
                 | 
                                    
                                                     | 
                
                 | 
                        self._classifier.fit(train_x, train_y)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    197
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    198
                 | 
                                    
                                                     | 
                
                 | 
                    def _prediction_to_list(self, scores, candidates):  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    199
                 | 
                                    
                                                     | 
                
                 | 
                        subj_scores = [(score[1], c.subject_id)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    200
                 | 
                                    
                                                     | 
                
                 | 
                                       for score, c in zip(scores, candidates)]  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    201
                 | 
                                    
                                                     | 
                
                 | 
                        return sorted(subj_scores, reverse=True)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    202
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    203
                 | 
                                    
                                                     | 
                
                 | 
                    def predict(self, candidates):  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    204
                 | 
                                    
                                                     | 
                
                 | 
                        if not candidates:  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    205
                 | 
                                    
                                                     | 
                
                 | 
                            return []  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    206
                 | 
                                    
                                                     | 
                
                 | 
                        features = self._candidates_to_features(candidates)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    207
                 | 
                                    
                                                     | 
                
                 | 
                        scores = self._classifier.predict_proba(features)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    208
                 | 
                                    
                                                     | 
                
                 | 
                        return self._prediction_to_list(scores, candidates)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    209
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    210
                 | 
                                    
                                                     | 
                
                 | 
                    def save(self, filename):  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    211
                 | 
                                    
                                                     | 
                
                 | 
                        return joblib.dump(self, filename)  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    212
                 | 
                                    
                                                     | 
                
                 | 
                 | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    213
                 | 
                                    
                                                     | 
                
                 | 
                    @staticmethod  | 
            
            
                                                                                                            
                            
            
                                    
            
            
                | 
                    214
                 | 
                                    
                                                     | 
                
                 | 
                    def load(filename):  | 
            
            
                                                                                                            
                                                                
            
                                    
            
            
                | 
                    215
                 | 
                                    
                                                     | 
                
                 | 
                        return joblib.load(filename)  | 
            
            
                                                        
            
                                    
            
            
                | 
                    216
                 | 
                                    
                                                     | 
                
                 | 
                 |