| 1 |  |  | """Subject index functionality for Annif""" | 
            
                                                        
            
                                    
            
            
                | 2 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 3 |  |  | from __future__ import annotations | 
            
                                                        
            
                                    
            
            
                | 4 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 5 |  |  | import csv | 
            
                                                        
            
                                    
            
            
                | 6 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 7 |  |  | import annif | 
            
                                                        
            
                                    
            
            
                | 8 |  |  | import annif.util | 
            
                                                        
            
                                    
            
            
                | 9 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 10 |  |  | from .subject_file import VocabFileCSV | 
            
                                                        
            
                                    
            
            
                | 11 |  |  | from .types import Subject, SubjectIndex, VocabSource | 
            
                                                        
            
                                    
            
            
                | 12 |  |  |  | 
            
                                                        
            
                                    
            
            
                # Reuse the logger object exposed by the annif package so
                # messages from this module go through the same logger.
                logger = annif.logger
                # Attach annif.util.DuplicateFilter to that logger (presumably
                # it drops repeated messages -- confirm in annif.util).
                logger.addFilter(annif.util.DuplicateFilter())
            
                                                        
            
                                    
            
            
                | 15 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 16 |  |  |  | 
            
                                                        
            
                                    
            
            
                class SubjectIndexFile(SubjectIndex):
                    """SubjectIndex implementation backed by a file.

                    Subjects are kept in a list whose positions are the integer
                    subject IDs. Two dictionaries map URIs and (label, language)
                    pairs back to those IDs. A subject whose ``labels`` is None
                    is treated as deprecated: it keeps its ID but is hidden from
                    ``__getitem__``, ``by_uri`` and ``active``.
                    """

                    def __init__(self) -> None:
                        """Create an empty index with no subjects or languages."""
                        self._subjects = []  # Subject objects; position == ID
                        self._uri_idx = {}  # URI -> subject ID
                        self._label_idx = {}  # (label, language) -> subject ID
                        self._languages = None  # set by load_subjects()/append()

                    def load_subjects(self, vocab_source: VocabSource) -> None:
                        """Initialize the subject index from a subject corpus.

                        Takes the language list from ``vocab_source.languages``
                        and appends every subject in ``vocab_source.subjects``.
                        """
                        self._languages = vocab_source.languages
                        for subject in vocab_source.subjects:
                            self.append(subject)

                    def __len__(self) -> int:
                        """Return the number of stored subjects.

                        Deprecated subjects are counted too.
                        """
                        return len(self._subjects)

                    @property
                    def languages(self) -> list[str] | None:
                        """Languages of the index, or None if not yet set."""
                        return self._languages

                    def __getitem__(self, subject_id: int) -> Subject:
                        """Return the subject stored under ``subject_id``.

                        Raises:
                            IndexError: if the list lookup fails or the subject
                                is deprecated (its ``labels`` is None).
                        """
                        subject = self._subjects[subject_id]
                        if subject.labels is None:
                            raise IndexError(
                                f"Subject is deprecated: {subject_id}"
                            )
                        return subject

                    def append(self, subject: Subject) -> None:
                        """Add a subject, giving it the next free subject ID.

                        The URI is always indexed; labels are indexed only when
                        the subject has any. If no languages are known yet, they
                        are taken from the keys of this subject's labels.
                        """
                        if self._languages is None and subject.labels is not None:
                            self._languages = list(subject.labels.keys())

                        subject_id = len(self._subjects)
                        self._uri_idx[subject.uri] = subject_id
                        if subject.labels:
                            for lang, label in subject.labels.items():
                                self._label_idx[(label, lang)] = subject_id
                        self._subjects.append(subject)

                    def contains_uri(self, uri: str) -> bool:
                        """Return True if a subject with this URI was added.

                        Deprecated subjects count as well, since their URIs are
                        indexed on append.
                        """
                        return uri in self._uri_idx

                    def by_uri(self, uri: str, warnings: bool = True) -> int | None:
                        """Return the subject ID for a URI, or None.

                        None is returned for unknown URIs and for deprecated
                        subjects. If ``warnings`` is true, an unknown URI is
                        logged as a warning.
                        """
                        try:
                            subject_id = self._uri_idx[uri]
                            if self._subjects[subject_id].labels is None:
                                # deprecated
                                return None
                            return subject_id
                        except KeyError:
                            if warnings:
                                logger.warning("Unknown subject URI <%s>", uri)
                            return None

                    def by_label(self, label: str | None, language: str) -> int | None:
                        """Return the subject ID for a label in a language.

                        Returns None and logs a warning if the pair is unknown.
                        """
                        try:
                            return self._label_idx[(label, language)]
                        except KeyError:
                            logger.warning(
                                'Unknown subject label "%s"@%s', label, language
                            )
                            return None

                    @property
                    def active(self) -> list[tuple[int, Subject]]:
                        """Return (subject_id, Subject) pairs of all subjects
                        whose ``labels`` is not None."""
                        return [
                            (subj_id, subject)
                            for subj_id, subject in enumerate(self._subjects)
                            if subject.labels is not None
                        ]

                    def save(self, path: str) -> None:
                        """Save this subject index into a file at ``path``.

                        Writes a UTF-8 CSV with the columns ``uri``,
                        ``notation`` and one ``label_<lang>`` column per index
                        language. Every stored subject is written in list
                        order, including deprecated ones (which get no label
                        values).
                        """
                        # An index that never received subjects has no
                        # languages yet; write only the fixed columns instead
                        # of failing on None.
                        languages = self._languages or []
                        fieldnames = ["uri", "notation"] + [
                            f"label_{lang}" for lang in languages
                        ]

                        with open(path, "w", encoding="utf-8", newline="") as csvfile:
                            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                            writer.writeheader()
                            # Walk the backing list rather than ``self``:
                            # __getitem__ raises IndexError for deprecated
                            # subjects, which would end __getitem__-based
                            # iteration early and drop every later subject.
                            for subject in self._subjects:
                                row = {
                                    "uri": subject.uri,
                                    "notation": subject.notation or "",
                                }
                                if subject.labels:
                                    for lang, label in subject.labels.items():
                                        row[f"label_{lang}"] = label
                                writer.writerow(row)

                    @classmethod
                    def load(cls, path: str) -> SubjectIndex:
                        """Load a subject index from a CSV file and return it."""
                        vocab_file = VocabFileCSV(path)
                        subject_index = cls()
                        subject_index.load_subjects(vocab_file)
                        return subject_index
            
                                                        
            
                                    
            
            
                | 109 |  |  |  | 
            
                                                        
            
                                    
            
            
                | 110 |  |  |  | 
            
                                                        
            
                                    
            
            
                class SubjectIndexFilter(SubjectIndex):
                    """SubjectIndex implementation that filters another
                    SubjectIndex based on a set of subject URIs to exclude.

                    Subject IDs are those of the wrapped index; excluded
                    subjects are only hidden from lookups.
                    """

                    def __init__(
                        self, subject_index: SubjectIndex, exclude: set[str]
                    ) -> None:
                        """Wrap ``subject_index``, hiding URIs in ``exclude``."""
                        self._subject_index = subject_index
                        self._exclude = exclude

                    def __len__(self) -> int:
                        """Return the length of the wrapped index.

                        Excluded subjects are not subtracted.
                        """
                        return len(self._subject_index)

                    @property
                    def languages(self) -> list[str] | None:
                        """Languages reported by the wrapped index."""
                        return self._subject_index.languages

                    def __getitem__(self, subject_id: int) -> Subject:
                        """Return the subject stored under ``subject_id``.

                        Raises:
                            IndexError: if the subject's URI is excluded.
                                Errors raised by the wrapped index propagate.
                        """
                        subject = self._subject_index[subject_id]
                        if subject.uri in self._exclude:
                            raise IndexError(
                                f"Subject is excluded: {subject.uri}"
                            )
                        return subject

                    def contains_uri(self, uri: str) -> bool:
                        """Return True if the URI is not excluded and the
                        wrapped index contains it."""
                        if uri in self._exclude:
                            return False
                        return self._subject_index.contains_uri(uri)

                    def by_uri(self, uri: str, warnings: bool = True) -> int | None:
                        """Return the subject ID of a subject by its URI, or
                        None if not found or excluded.

                        ``warnings`` is passed on to the wrapped index; an
                        excluded URI returns None without consulting it.
                        """
                        if uri in self._exclude:
                            return None
                        return self._subject_index.by_uri(uri, warnings)

                    def by_label(self, label: str | None, language: str) -> int | None:
                        """Return the subject ID of a subject by its label in a
                        given language, or None if unknown or excluded."""
                        subject_id = self._subject_index.by_label(label, language)
                        if subject_id is None:
                            # Unknown label: stop here, because indexing the
                            # wrapped index with None would raise TypeError.
                            return None
                        subject = self._subject_index[subject_id]
                        if subject is None or subject.uri in self._exclude:
                            return None
                        return subject_id

                    @property
                    def active(self) -> list[tuple[int, Subject]]:
                        """Return a list of (subject_id, Subject) tuples of all
                        active subjects of the wrapped index that are not
                        excluded."""
                        return [
                            (subject_id, subject)
                            for subject_id, subject in self._subject_index.active
                            if subject.uri not in self._exclude
                        ]
            
                                                        
            
                                    
            
            
                | 165 |  |  |  |