| 1 |  |  | """MLLM (Maui-like Lexical Matching) model for Annif""" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 2 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 3 |  |  | import collections | 
            
                                                                                                            
                            
            
                                    
            
            
                | 4 |  |  | import math | 
            
                                                                                                            
                            
            
                                    
            
            
                | 5 |  |  | import joblib | 
            
                                                                                                            
                            
            
                                    
            
            
                | 6 |  |  | from statistics import mean | 
            
                                                                                                            
                            
            
                                    
            
            
                | 7 |  |  | from enum import IntEnum | 
            
                                                                                                            
                            
            
                                    
            
            
                | 8 |  |  | import numpy as np | 
            
                                                                                                            
                            
            
                                    
            
            
                | 9 |  |  | from rdflib.namespace import SKOS | 
            
                                                                                                            
                            
            
                                    
            
            
                | 10 |  |  | from sklearn.feature_extraction.text import CountVectorizer | 
            
                                                                                                            
                            
            
                                    
            
            
                | 11 |  |  | from sklearn.ensemble import BaggingClassifier | 
            
                                                                                                            
                            
            
                                    
            
            
                | 12 |  |  | from sklearn.tree import DecisionTreeClassifier | 
            
                                                                                                            
                            
            
                                    
            
            
                | 13 |  |  | import annif.util | 
            
                                                                                                            
                            
            
                                    
            
            
                | 14 |  |  | import annif.parallel | 
            
                                                                                                            
                            
            
                                    
            
            
                | 15 |  |  | from annif.exception import OperationFailedException | 
            
                                                                                                            
                            
            
                                    
            
            
                | 16 |  |  | from annif.lexical.tokenset import TokenSet, TokenSetIndex | 
            
                                                                                                            
                            
            
                                    
            
            
                | 17 |  |  | from annif.lexical.util import get_subject_labels | 
            
                                                                                                            
                            
            
                                    
            
            
                | 18 |  |  | from annif.lexical.util import make_relation_matrix, make_collection_matrix | 
            
                                                                                                            
                            
            
                                    
            
            
                | 19 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 20 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                # A vocabulary label attached to a subject; is_pref flags whether it
                # is the preferred label (presumably as opposed to an alternative
                # label -- confirm against the code that builds Terms).
                Term = collections.namedtuple(
                    'Term', ['subject_id', 'label', 'is_pref'])

                # A single hit of a subject label in the text. generate_candidates
                # fills pos with the index of the sentence where the hit occurred.
                Match = collections.namedtuple(
                    'Match',
                    ['subject_id', 'is_pref', 'n_tokens', 'pos', 'ambiguity'])

                # All Matches of one subject within a document, aggregated by
                # conflate_matches into per-document statistics.
                Candidate = collections.namedtuple(
                    'Candidate',
                    ['doc_length', 'subject_id', 'freq', 'is_pref', 'n_tokens',
                     'ambiguity', 'first_occ', 'last_occ', 'spread'])

                # Vocabulary- and corpus-level data consulted when turning
                # Candidates into feature vectors (see candidates_to_features).
                ModelData = collections.namedtuple(
                    'ModelData',
                    ['broader', 'narrower', 'related', 'collection',
                     'doc_freq', 'subj_freq', 'idf'])

                # Column indices of the feature matrix; numbering starts at 0 so the
                # members can be used directly as NumPy column indices.
                Feature = IntEnum(
                    'Feature',
                    ['freq', 'doc_freq', 'subj_freq', 'tfidf', 'is_pref',
                     'n_tokens', 'ambiguity', 'first_occ', 'last_occ', 'spread',
                     'doc_length', 'broader', 'narrower', 'related', 'collection'],
                    start=0)
            
                                                                                                            
                            
            
                                    
            
            
                | 42 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 43 |  |  |  | 
            
                                                                                                            
                            
            
                                                                    
                                                                                                        
            
            
                def conflate_matches(matches, doc_length):
                    """Merge individual matches into one Candidate per subject.

                    Matches are grouped by subject_id, keeping the order in which
                    they appear in ``matches``. For every subject the match count
                    and the first/last positions are divided by ``doc_length``,
                    while is_pref, n_tokens and ambiguity are averaged over the
                    subject's matches.

                    The first and last element of each group are used as the first
                    and last occurrence, so ``matches`` is expected to be ordered
                    by position.

                    Returns a list of Candidate tuples, one per distinct subject.
                    """
                    grouped = collections.defaultdict(list)
                    for hit in matches:
                        grouped[hit.subject_id].append(hit)

                    candidates = []
                    for subject_id, hits in grouped.items():
                        # groups are never empty, so indexing both ends is safe
                        first_pos = hits[0].pos
                        last_pos = hits[-1].pos
                        candidates.append(Candidate(
                            doc_length=doc_length,
                            subject_id=subject_id,
                            freq=len(hits) / doc_length,
                            is_pref=mean(float(hit.is_pref) for hit in hits),
                            n_tokens=mean(hit.n_tokens for hit in hits),
                            ambiguity=mean(hit.ambiguity for hit in hits),
                            first_occ=first_pos / doc_length,
                            last_occ=last_pos / doc_length,
                            spread=(last_pos - first_pos) / doc_length))
                    return candidates
            
                                                                                                            
                            
            
                                    
            
            
                | 61 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 62 |  |  |  | 
            
                                                                                                            
                            
            
                                                                    
                                                                                                        
            
            
                def generate_candidates(text, analyzer, vectorizer, index):
                    """Find subject label matches in a text and return Candidates.

                    The text is split into sentences with
                    ``analyzer.tokenize_sentences`` and the sentences are
                    vectorized with ``vectorizer.transform``. The nonzero column
                    indices of each sentence row form a TokenSet that is looked up
                    with ``index.search``; every hit becomes a Match whose position
                    is the sentence index. The matches are then merged per subject
                    by conflate_matches, with the number of sentences as the
                    document length.
                    """
                    sentences = analyzer.tokenize_sentences(text)
                    token_rows = vectorizer.transform(sentences)

                    found = [
                        Match(subject_id=hit.subject_id,
                              is_pref=hit.is_pref,
                              n_tokens=len(hit),
                              pos=position,
                              ambiguity=ambiguity)
                        for position, row in enumerate(token_rows)
                        for hit, ambiguity in index.search(
                            TokenSet(row.nonzero()[1]))
                    ]

                    return conflate_matches(found, len(sentences))
            
                                                                                                            
                            
            
                                    
            
            
                | 78 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 79 |  |  |  | 
            
                                                                                                            
                            
            
                                                                    
                                                                                                        
            
            
                def candidates_to_features(candidates, mdata):
                    """Convert a list of Candidates to a NumPy feature matrix.

                    Returns a float32 array with one row per candidate and one
                    column per member of Feature. Columns taken straight from the
                    Candidate are copied as-is; doc_freq, subj_freq and tfidf are
                    looked up in ``mdata``; the broader, narrower, related and
                    collection columns are sums over the candidate subjects of the
                    corresponding ``mdata`` matrices, divided by the number of
                    candidates.
                    """
                    n_candidates = len(candidates)
                    features = np.zeros((n_candidates, len(Feature)),
                                        dtype=np.float32)

                    # Boolean mask over all subjects marking those that are candidates
                    subject_ids = [cand.subject_id for cand in candidates]
                    is_candidate = np.zeros(mdata.related.shape[0], dtype=bool)
                    is_candidate[subject_ids] = True

                    # Per-subject sums restricted to candidate subjects. The relation
                    # matrices are presumably SciPy sparse matrices -- TODO confirm.
                    broader_sums = mdata.broader.multiply(is_candidate).sum(axis=1)
                    narrower_sums = mdata.narrower.multiply(is_candidate).sum(axis=1)
                    related_sums = mdata.related.multiply(is_candidate).sum(axis=1)
                    masked_collection = mdata.collection.multiply(is_candidate)
                    collection_sums = masked_collection.T.dot(
                        mdata.collection).sum(axis=0)

                    for row_idx, cand in enumerate(candidates):
                        sid = cand.subject_id
                        row = features[row_idx]  # view: writes go into `features`
                        row[Feature.freq] = cand.freq
                        row[Feature.doc_freq] = mdata.doc_freq[sid]
                        # a subject missing from subj_freq gets 0 (default 1, minus 1)
                        row[Feature.subj_freq] = mdata.subj_freq.get(sid, 1) - 1
                        row[Feature.tfidf] = cand.freq * mdata.idf[sid]
                        row[Feature.is_pref] = cand.is_pref
                        row[Feature.n_tokens] = cand.n_tokens
                        row[Feature.ambiguity] = cand.ambiguity
                        row[Feature.first_occ] = cand.first_occ
                        row[Feature.last_occ] = cand.last_occ
                        row[Feature.spread] = cand.spread
                        row[Feature.doc_length] = cand.doc_length
                        row[Feature.broader] = broader_sums[sid, 0] / n_candidates
                        row[Feature.narrower] = narrower_sums[sid, 0] / n_candidates
                        row[Feature.related] = related_sums[sid, 0] / n_candidates
                        row[Feature.collection] = (
                            collection_sums[0, sid] / n_candidates)
                    return features
            
                                                                                                            
                            
            
                                    
            
            
                | 110 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 111 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 112 |  |  | class MLLMCandidateGenerator(annif.parallel.BaseWorker): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 113 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 114 |  |  |     @classmethod | 
            
                                                                                                            
                            
            
                                    
            
            
                | 115 |  |  |     def generate_candidates(cls, doc_subject_ids, text): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 116 |  |  |         candidates = generate_candidates(text, **cls.args)  # pragma: no cover | 
            
                                                                                                            
                            
            
                                    
            
            
                | 117 |  |  |         return doc_subject_ids, candidates  # pragma: no cover | 
            
                                                                                                            
                            
            
                                    
            
            
                | 118 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 119 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 120 |  |  | class MLLMFeatureConverter(annif.parallel.BaseWorker): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 121 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 122 |  |  |     @classmethod | 
            
                                                                                                            
                            
            
                                    
            
            
                | 123 |  |  |     def candidates_to_features(cls, candidates): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 124 |  |  |         return candidates_to_features(candidates, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 125 |  |  |                                       **cls.args)  # pragma: no cover | 
            
                                                                                                            
                            
            
                                    
            
            
                | 126 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 127 |  |  |  | 
            
                                                                                                            
                            
            
                                                                    
                                                                                                        
            
            
                | 128 |  | View Code Duplication | class MLLMModel: | 
                            
                    |  |  |  | 
                                                                                        
                                                                                     | 
            
                                                                                                            
                            
            
                                    
            
            
                | 129 |  |  |     """Maui-like Lexical Matching model""" | 
            
                                                                                                            
                            
            
                                    
            
            
                | 130 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 131 |  |  |     def generate_candidates(self, text, analyzer): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 132 |  |  |         return generate_candidates(text, analyzer, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 133 |  |  |                                    self._vectorizer, self._index) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 134 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 135 |  |  |     @property | 
            
                                                                                                            
                            
            
                                    
            
            
                | 136 |  |  |     def _model_data(self): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 137 |  |  |         return ModelData(broader=self._broader_matrix, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 138 |  |  |                          narrower=self._narrower_matrix, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 139 |  |  |                          related=self._related_matrix, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 140 |  |  |                          collection=self._collection_matrix, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 141 |  |  |                          doc_freq=self._doc_freq, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 142 |  |  |                          subj_freq=self._subj_freq, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 143 |  |  |                          idf=self._idf) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 144 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 145 |  |  |     def _candidates_to_features(self, candidates): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 146 |  |  |         return candidates_to_features(candidates, self._model_data) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 147 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 148 |  |  |     @staticmethod | 
            
                                                                                                            
                            
            
                                    
            
            
                | 149 |  |  |     def _get_label_props(params): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 150 |  |  |         pref_label_props = [SKOS.prefLabel] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 151 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 152 |  |  |         if annif.util.boolean(params['use_hidden_labels']): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 153 |  |  |             nonpref_label_props = [SKOS.altLabel, SKOS.hiddenLabel] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 154 |  |  |         else: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 155 |  |  |             nonpref_label_props = [SKOS.altLabel] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 156 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 157 |  |  |         return (pref_label_props, nonpref_label_props) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 158 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 159 |  |  |     def _prepare_terms(self, graph, vocab, params): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 160 |  |  |         pref_label_props, nonpref_label_props = self._get_label_props(params) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 161 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 162 |  |  |         terms = [] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 163 |  |  |         subject_ids = [] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 164 |  |  |         for subj_id, uri, _, _ in vocab.subjects.active: | 
            
                                                                                                            
                            
            
                                    
            
            
                | 165 |  |  |             subject_ids.append(subj_id) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 166 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 167 |  |  |             for label in get_subject_labels(graph, uri, pref_label_props, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 168 |  |  |                                             params['language']): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 169 |  |  |                 terms.append(Term(subject_id=subj_id, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 170 |  |  |                                   label=label, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 171 |  |  |                                   is_pref=True)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 172 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 173 |  |  |             for label in get_subject_labels(graph, uri, nonpref_label_props, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 174 |  |  |                                             params['language']): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 175 |  |  |                 terms.append(Term(subject_id=subj_id, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 176 |  |  |                                   label=label, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 177 |  |  |                                   is_pref=False)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 178 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 179 |  |  |         return (terms, subject_ids) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 180 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 181 |  |  |     def _prepare_relations(self, graph, vocab): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 182 |  |  |         self._broader_matrix = make_relation_matrix( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 183 |  |  |             graph, vocab, SKOS.broader) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 184 |  |  |         self._narrower_matrix = make_relation_matrix( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 185 |  |  |             graph, vocab, SKOS.narrower) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 186 |  |  |         self._related_matrix = make_relation_matrix( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 187 |  |  |             graph, vocab, SKOS.related) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 188 |  |  |         self._collection_matrix = make_collection_matrix(graph, vocab) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 189 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 190 |  |  |     def _prepare_train_index(self, vocab, analyzer, params): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 191 |  |  |         graph = vocab.as_graph() | 
            
                                                                                                            
                            
            
                                    
            
            
                | 192 |  |  |         terms, subject_ids = self._prepare_terms(graph, vocab, params) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 193 |  |  |         self._prepare_relations(graph, vocab) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 194 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 195 |  |  |         self._vectorizer = CountVectorizer( | 
            
                                                                                                            
                            
            
                                    
            
            
                | 196 |  |  |             binary=True, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 197 |  |  |             tokenizer=analyzer.tokenize_words | 
            
                                                                                                            
                            
            
                                    
            
            
                | 198 |  |  |         ) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 199 |  |  |         label_corpus = self._vectorizer.fit_transform((t.label for t in terms)) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 200 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 201 |  |  |         # frequency of each token used in labels - how rare each word is | 
            
                                                                                                            
                            
            
                                    
            
            
                | 202 |  |  |         token_freq = np.bincount(label_corpus.indices, | 
            
                                                                                                            
                            
            
                                    
            
            
                | 203 |  |  |                                  minlength=label_corpus.shape[1]) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 204 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 205 |  |  |         self._index = TokenSetIndex() | 
            
                                                                                                            
                            
            
                                    
            
            
                | 206 |  |  |         for term, label_matrix in zip(terms, label_corpus): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 207 |  |  |             tokens = label_matrix.nonzero()[1] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 208 |  |  |             # sort tokens by frequency - use the rarest token as index key | 
            
                                                                                                            
                            
            
                                    
            
            
                | 209 |  |  |             tokens = sorted(tokens, key=token_freq.__getitem__) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 210 |  |  |             tset = TokenSet(tokens, term.subject_id, term.is_pref) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 211 |  |  |             self._index.add(tset) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 212 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 213 |  |  |         return subject_ids | 
            
                                                                                                            
                            
            
                                    
            
            
                | 214 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def _prepare_train_data(self, corpus, vocab, analyzer, n_jobs):
                    """Generate candidates for every corpus document and label them.

                    As a side effect, ``self._doc_freq`` and ``self._subj_freq`` are
                    replaced with fresh counters and filled while iterating over
                    the results.

                    Returns a ``(train_x, train_y)`` tuple. ``train_x`` holds one
                    candidate collection per document. ``train_y`` is a flat list
                    with one boolean per candidate, telling whether the candidate's
                    subject id is among the subject ids returned for its document.
                    """
                    # how often each subject (by id) occurs among generated candidates
                    self._doc_freq = collections.Counter()
                    # how often each subject was manually assigned
                    # ("domain keyphraseness")
                    self._subj_freq = collections.Counter()
                    train_x, train_y = [], []

                    jobs, pool_class = annif.parallel.get_pool(n_jobs)

                    # state handed to each worker through the pool initializer
                    generator_args = dict(analyzer=analyzer,
                                          vectorizer=self._vectorizer,
                                          index=self._index)

                    with pool_class(jobs,
                                    initializer=MLLMCandidateGenerator.init,
                                    initargs=(generator_args,)) as pool:
                        # lazily pair the subjects looked up for each document's
                        # URIs with that document's text
                        doc_params = (
                            ([vocab.subjects.by_uri(uri) for uri in doc.uris],
                             doc.text)
                            for doc in corpus.documents)
                        # NOTE(review): 10 is presumably the chunk size, as in
                        # multiprocessing.Pool.starmap - confirm for pool_class
                        results = pool.starmap(
                            MLLMCandidateGenerator.generate_candidates,
                            doc_params, 10)
                        for doc_subject_ids, candidates in results:
                            self._subj_freq.update(doc_subject_ids)
                            candidate_ids = [c.subject_id for c in candidates]
                            self._doc_freq.update(candidate_ids)
                            train_x.append(candidates)
                            train_y.extend(cid in doc_subject_ids
                                           for cid in candidate_ids)

                    return (train_x, train_y)
            
                                                                                                            
                                                                
            
                                    
            
            
                | 247 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 248 |  |  |     def _calculate_idf(self, subject_ids, doc_count): | 
            
                                                                        
                            
            
                                    
            
            
                | 249 |  |  |         idf = collections.defaultdict(float) | 
            
                                                                        
                            
            
                                    
            
            
                | 250 |  |  |         for subj_id in subject_ids: | 
            
                                                                        
                            
            
                                    
            
            
                | 251 |  |  |             idf[subj_id] = math.log((doc_count + 1) / | 
            
                                                                        
                            
            
                                    
            
            
                | 252 |  |  |                                     (self._doc_freq[subj_id] + 1)) + 1 | 
            
                                                                        
                            
            
                                    
            
            
                | 253 |  |  |  | 
            
                                                                        
                            
            
                                    
            
            
                | 254 |  |  |         return idf | 
            
                                                                                                            
                            
            
                                    
            
            
                | 255 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def _prepare_features(self, train_x, n_jobs):
                    """Convert each entry of ``train_x`` into feature values.

                    The conversion is delegated to
                    ``MLLMFeatureConverter.candidates_to_features``, run through a
                    pool obtained from ``annif.parallel.get_pool(n_jobs)``. The
                    model data from ``self._model_data`` is passed to the workers
                    through the pool initializer.

                    Returns the result of the pool's ``map`` call, one item per
                    entry of ``train_x``.
                    """
                    converter_args = dict(mdata=self._model_data)
                    jobs, pool_class = annif.parallel.get_pool(n_jobs)

                    with pool_class(jobs,
                                    initializer=MLLMFeatureConverter.init,
                                    initargs=(converter_args,)) as workers:
                        # NOTE(review): 10 is presumably the chunk size, as in
                        # multiprocessing.Pool.map - confirm for pool_class
                        return workers.map(
                            MLLMFeatureConverter.candidates_to_features,
                            train_x, 10)
            
                                                                                                            
                            
            
                                    
            
            
                | 267 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def prepare_train(self, corpus, vocab, analyzer, params, n_jobs):
                    """Turn a vocabulary and a document corpus into training arrays.

                    Steps, in order: build the term index from the vocabulary,
                    generate labelled candidates from the corpus, precalculate the
                    idf values (stored in ``self._idf``), and convert the
                    candidates into feature values.

                    Returns a ``(features, labels)`` tuple of numpy arrays: the
                    per-document feature blocks stacked with ``np.vstack``, and the
                    labels wrapped with ``np.array``.
                    """
                    # index of the vocabulary terms; also yields the subject ids
                    subject_ids = self._prepare_train_index(vocab, analyzer, params)

                    # candidates per document plus the flat list of labels
                    candidates_per_doc, labels = self._prepare_train_data(
                        corpus, vocab, analyzer, n_jobs)

                    # idf values for all subjects, using the number of documents
                    doc_count = len(candidates_per_doc)
                    self._idf = self._calculate_idf(subject_ids, doc_count)

                    # feature values for the candidates of every document
                    feature_blocks = self._prepare_features(candidates_per_doc,
                                                            n_jobs)

                    return (np.vstack(feature_blocks), np.array(labels))
            
                                                                                                            
                            
            
                                    
            
            
                | 283 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def _create_classifier(self, params):
                    """Build an unfitted bagging classifier over a decision tree.

                    Reads three entries from ``params``: ``min_samples_leaf`` and
                    ``max_leaf_nodes`` (converted with ``int``) configure the
                    decision tree, and ``max_samples`` (converted with ``float``)
                    configures the bagging classifier.
                    """
                    tree = DecisionTreeClassifier(
                        min_samples_leaf=int(params['min_samples_leaf']),
                        max_leaf_nodes=int(params['max_leaf_nodes']))
                    max_samples = float(params['max_samples'])
                    return BaggingClassifier(tree, max_samples=max_samples)
            
                                                                                                            
                            
            
                                    
            
            
                | 290 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def train(self, train_x, train_y, params):
                    """Create a new classifier and fit it on the training data.

                    The classifier is stored in ``self._classifier`` before it is
                    fitted.

                    Raises OperationFailedException when the fitted classifier does
                    not report exactly two classes.
                    """
                    classifier = self._create_classifier(params)
                    self._classifier = classifier
                    classifier.fit(train_x, train_y)

                    # sanity check: the classifier must have seen both classes
                    if classifier.n_classes_ == 2:
                        return

                    raise OperationFailedException(
                        "Unable to create classifier: "
                        "Not enough positive and negative examples "
                        "in the training data. Please check that your training "
                        "data matches your vocabulary.")
            
                                                                                                            
                            
            
                                    
            
            
                | 302 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                | 303 |  |  |     def _prediction_to_list(self, scores, candidates): | 
            
                                                                                                            
                            
            
                                    
            
            
                | 304 |  |  |         subj_scores = [(score[1], c.subject_id) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 305 |  |  |                        for score, c in zip(scores, candidates)] | 
            
                                                                                                            
                            
            
                                    
            
            
                | 306 |  |  |         return sorted(subj_scores, reverse=True) | 
            
                                                                                                            
                            
            
                                    
            
            
                | 307 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def predict(self, candidates):
                    """Score the given candidates with the trained classifier.

                    Returns an empty list when ``candidates`` is falsy. Otherwise
                    the candidates are converted to features, passed to the
                    classifier's ``predict_proba``, and the result is turned into a
                    list by ``self._prediction_to_list``.
                    """
                    if candidates:
                        features = self._candidates_to_features(candidates)
                        probabilities = self._classifier.predict_proba(features)
                        return self._prediction_to_list(probabilities, candidates)
                    return []
            
                                                                                                            
                            
            
                                    
            
            
                | 314 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                def save(self, filename):
                    """Write this model object to ``filename`` using ``joblib.dump``.

                    Returns whatever ``joblib.dump`` returns.
                    """
                    dump_result = joblib.dump(self, filename)
                    return dump_result
            
                                                                                                            
                            
            
                                    
            
            
                | 317 |  |  |  | 
            
                                                                                                            
                            
            
                                    
            
            
                @staticmethod
                def load(filename):
                    """Read and return the object stored in ``filename``.

                    The file is read with ``joblib.load``; the loaded object is
                    returned unchanged.
                    """
                    # NOTE(review): joblib persistence is pickle-based according to
                    # its documentation, so only files from trusted sources should
                    # be loaded here - confirm callers guarantee that
                    loaded = joblib.load(filename)
                    return loaded
            
                                                        
            
                                    
            
            
                | 321 |  |  |  |