| Total Complexity | 49 |
| Total Lines | 263 |
| Duplicated Lines | 90.11 % |
| Changes | 0 | ||
Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
Complex classes like annif.project often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.
Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
| 1 | """Project management functionality for Annif""" |
||
| 2 | |||
| 3 | import enum |
||
| 4 | import os.path |
||
| 5 | from shutil import rmtree |
||
| 6 | import annif |
||
| 7 | import annif.transform |
||
| 8 | import annif.analyzer |
||
| 9 | import annif.corpus |
||
| 10 | import annif.suggestion |
||
| 11 | import annif.backend |
||
| 12 | from annif.datadir import DatadirMixin |
||
| 13 | from annif.exception import AnnifException, ConfigurationException, \ |
||
| 14 | NotSupportedException, NotInitializedException |
||
| 15 | |||
| 16 | logger = annif.logger  # module-level alias for the logger object exposed by the annif package |
||
| 17 | |||
| 18 | |||
class Access(enum.IntEnum):
    """Access levels that can be assigned to a project.

    The members are integers, so they can be compared with each other;
    the numeric order is private < hidden < public.
    """

    # lowest level
    private = 1
    # intermediate level
    hidden = 2
    # highest level
    public = 3
||
| 24 | |||
| 25 | |||
class AnnifProject(DatadirMixin):
    """Class representing the configuration of a single Annif project.

    The transform, analyzer, backend and vocabulary objects are created
    lazily the first time the corresponding property is read and are then
    cached on the instance.
    """

    # defaults for uninitialized instances
    _transform = None
    _analyzer = None
    _backend = None
    _vocab = None
    _vocab_lang = None
    initialized = False

    # default values for configuration settings
    DEFAULT_ACCESS = 'public'

    def __init__(self, project_id, config, datadir, registry):
        """Store the settings of the project identified by project_id.

        config is read with get() and item access: 'language' is
        required, 'name' defaults to project_id, 'analyzer' and 'vocab'
        default to None and 'transform' defaults to 'pass'. registry is
        kept for later vocabulary lookups. Raises ConfigurationException
        if the 'access' setting is not a valid access level name.
        """
        DatadirMixin.__init__(self, datadir, 'projects', project_id)
        self.project_id = project_id
        self.name = config.get('name', project_id)
        self.language = config['language']
        self.analyzer_spec = config.get('analyzer', None)
        self.transform_spec = config.get('transform', 'pass')
        self.vocab_spec = config.get('vocab', None)
        self.config = config
        self._base_datadir = datadir
        self.registry = registry
        self._init_access()

    def _init_access(self):
        """Set self.access from the 'access' setting (or DEFAULT_ACCESS).

        Raises ConfigurationException if the value is not the name of an
        Access member.
        """
        access = self.config.get('access', self.DEFAULT_ACCESS)
        try:
            # Look the value up by member name. Unlike getattr(), this
            # rejects names of ordinary class attributes such as
            # '__doc__' or '__members__'.
            self.access = Access[access]
        except KeyError:
            raise ConfigurationException(
                "'{}' is not a valid access setting".format(access),
                project_id=self.project_id)

    def _initialize_analyzer(self):
        """Create the analyzer, if one has been configured."""
        if not self.analyzer_spec:
            return  # not configured, so assume it's not needed
        analyzer = self.analyzer
        logger.debug("Project '%s': initialized analyzer: %s",
                     self.project_id,
                     str(analyzer))

    def _initialize_subjects(self):
        """Load the subjects; an AnnifException is logged as a warning."""
        try:
            subjects = self.subjects
            logger.debug("Project '%s': initialized subjects: %s",
                         self.project_id,
                         str(subjects))
        except AnnifException as err:
            logger.warning(err.format_message())

    def _initialize_backend(self, parallel):
        """Create and initialize the backend.

        An AnnifException raised while doing so is logged as a warning
        instead of being propagated.
        """
        logger.debug("Project '%s': initializing backend", self.project_id)
        try:
            if not self.backend:
                logger.debug("Cannot initialize backend: does not exist")
                return
            self.backend.initialize(parallel)
        except AnnifException as err:
            logger.warning(err.format_message())

    def initialize(self, parallel=False):
        """Initialize this project and its backend so that they are ready to
        be used. If parallel is True, expect that the project will be used
        for parallel processing."""

        if self.initialized:
            return

        logger.debug("Initializing project '%s'", self.project_id)

        self._initialize_analyzer()
        self._initialize_subjects()
        self._initialize_backend(parallel)

        self.initialized = True

    def _suggest_with_backend(self, text, backend_params):
        """Pass text to the backend and return the hits it produces.

        backend_params maps backend ids to parameter dicts; only the
        entry for this project's backend is forwarded.
        """
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        hits = self.backend.suggest(text, beparams)
        logger.debug(
            'Got %d hits from backend %s',
            len(hits), self.backend.backend_id)
        return hits

    @property
    def analyzer(self):
        """The analyzer built from the 'analyzer' setting.

        Raises ConfigurationException if the setting is missing.
        """
        if self._analyzer is None:
            if self.analyzer_spec:
                self._analyzer = annif.analyzer.get_analyzer(
                    self.analyzer_spec)
            else:
                raise ConfigurationException(
                    "analyzer setting is missing", project_id=self.project_id)
        return self._analyzer

    @property
    def transform(self):
        """The transform built from the 'transform' setting."""
        if self._transform is None:
            self._transform = annif.transform.get_transform(
                self.transform_spec, project=self)
        return self._transform

    @property
    def backend(self):
        """The backend instance named by the 'backend' setting.

        Raises ConfigurationException if the setting is missing. If
        creating the backend raises ValueError, a warning is logged and
        None is returned.
        """
        if self._backend is None:
            if 'backend' not in self.config:
                raise ConfigurationException(
                    "backend setting is missing", project_id=self.project_id)
            backend_id = self.config['backend']
            try:
                backend_class = annif.backend.get_backend(backend_id)
                self._backend = backend_class(
                    backend_id, config_params=self.config,
                    project=self)
            except ValueError:
                logger.warning(
                    "Could not create backend %s, "
                    "make sure you've installed optional dependencies",
                    backend_id)
        return self._backend

    def _initialize_vocab(self):
        """Fetch the vocabulary and its language from the registry.

        Raises ConfigurationException if the 'vocab' setting is missing.
        """
        if self.vocab_spec is None:
            raise ConfigurationException("vocab setting is missing",
                                         project_id=self.project_id)
        self._vocab, self._vocab_lang = self.registry.get_vocab(
            self.vocab_spec, self.language)

    @property
    def vocab(self):
        """The vocabulary object, loaded on first access."""
        if self._vocab is None:
            self._initialize_vocab()
        return self._vocab

    @property
    def vocab_lang(self):
        """The vocabulary language, loaded on first access."""
        if self._vocab_lang is None:
            self._initialize_vocab()
        return self._vocab_lang

    @property
    def subjects(self):
        """The subjects of the project vocabulary."""
        return self.vocab.subjects

    def _get_info(self, key):
        """Return attribute key of the backend.

        Returns None if there is no backend or if an AnnifException is
        raised; the exception message is logged as a warning.
        """
        try:
            be = self.backend
            if be is not None:
                return getattr(be, key)
        except AnnifException as err:
            logger.warning(err.format_message())
        return None

    @property
    def is_trained(self):
        """The backend's is_trained value, or None if unavailable."""
        return self._get_info('is_trained')

    @property
    def modification_time(self):
        """The backend's modification_time value, or None if unavailable."""
        return self._get_info('modification_time')

    def suggest(self, text, backend_params=None):
        """Suggest subjects for the given text by passing it to the backend.
        Returns a list of SubjectSuggestion objects ordered by decreasing
        score.

        Raises NotInitializedException if the backend reports that it is
        not trained. If the train state cannot be determined, a warning is
        logged and the suggestion is attempted anyway.
        """
        # Read the property once: every access queries the backend and may
        # log a warning, so reading it twice would duplicate that work.
        trained = self.is_trained
        if trained is None:
            logger.warning('Could not get train state information.')
        elif not trained:
            raise NotInitializedException('Project is not trained.')
        logger.debug('Suggesting subjects for text "%s..." (len=%d)',
                     text[:20], len(text))
        text = self.transform.transform_text(text)
        hits = self._suggest_with_backend(text, backend_params)
        logger.debug('%d hits from backend', len(hits))
        return hits

    def train(self, corpus, backend_params=None, jobs=0):
        """train the project using documents from a metadata source

        The special corpus value 'cached' is handed to the backend as is;
        any other corpus is first passed through the project transform.
        """
        if corpus != 'cached':
            corpus = self.transform.transform_corpus(corpus)
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        self.backend.train(corpus, beparams, jobs)

    def learn(self, corpus, backend_params=None):
        """further train the project using documents from a metadata source

        Raises NotSupportedException if the backend is not an
        AnnifLearningBackend.
        """
        if backend_params is None:
            backend_params = {}
        beparams = backend_params.get(self.backend.backend_id, {})
        corpus = self.transform.transform_corpus(corpus)
        if isinstance(
                self.backend,
                annif.backend.backend.AnnifLearningBackend):
            self.backend.learn(corpus, beparams)
        else:
            raise NotSupportedException("Learning not supported by backend",
                                        project_id=self.project_id)

    def hyperopt(self, corpus, trials, jobs, metric, results_file):
        """optimize the hyperparameters of the project using a validation
        corpus against a given metric

        Raises NotSupportedException if the backend is not an
        AnnifHyperoptBackend.
        """
        if isinstance(
                self.backend,
                annif.backend.hyperopt.AnnifHyperoptBackend):
            optimizer = self.backend.get_hp_optimizer(corpus, metric)
            return optimizer.optimize(trials, jobs, results_file)

        raise NotSupportedException(
            "Hyperparameter optimization not supported "
            "by backend", project_id=self.project_id)

    def dump(self):
        """return this project as a dict"""
        return {'project_id': self.project_id,
                'name': self.name,
                'language': self.language,
                'backend': {'backend_id': self.config.get('backend')},
                'is_trained': self.is_trained,
                'modification_time': self.modification_time
                }

    def remove_model_data(self):
        """remove the data of this project

        Deletes the project data directory if it exists; otherwise only a
        warning is logged.
        """
        datadir_path = self._datadir_path
        if os.path.isdir(datadir_path):
            rmtree(datadir_path)
            logger.info('Removed model data for project %s.',
                        self.project_id)
        else:
            logger.warning('No model data to remove for project %s.',
                           self.project_id)
||
| 263 |