| Total Complexity | 5 | 
| Total Lines | 37 | 
| Duplicated Lines | 70.27 % | 
| Changes | 0 | ||
Duplicate code is one of the most pungent code smells. A commonly used rule is to restructure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
| 1 | """spaCy analyzer for Annif which uses spaCy for lemmatization""" | ||
| 2 | |||
| 3 | import spacy | ||
| 4 | from . import analyzer | ||
| 5 | from annif.exception import OperationFailedException | ||
| 6 | import annif.util | ||
| 7 | |||
| 8 | _KEY_LOWERCASE = 'lowercase' | ||
| 9 | |||
| 10 | |||
class SpacyAnalyzer(analyzer.Analyzer):
    """Analyzer that produces lemmas using a spaCy language pipeline.

    The spaCy model is loaded once at construction time, with the ``ner``
    and ``parser`` components excluded. Lemmas can optionally be
    lowercased via the ``lowercase`` keyword option.
    """

    name = "spacy"

    def __init__(self, param, **kwargs):
        """Load the spaCy model and read the analyzer options.

        Args:
            param: name of the spaCy model, passed to ``spacy.load``.
            **kwargs: analyzer options. If the ``lowercase`` key is
                present, its value is converted with
                ``annif.util.boolean`` and stored in ``self.lowercase``;
                otherwise lowercasing is disabled. All keyword arguments
                are forwarded to the parent initializer.

        Raises:
            OperationFailedException: if ``spacy.load`` raises ``IOError``
                for the requested model.
        """
        self.param = param
        try:
            self.nlp = spacy.load(param, exclude=['ner', 'parser'])
        except IOError as err:
            # Chain the original error explicitly so the traceback shows
            # the underlying cause as the direct cause of this failure.
            raise OperationFailedException(
                f"Loading spaCy model '{param}' failed - " +
                f"please download the model.\n{err}") from err
        if _KEY_LOWERCASE in kwargs:
            self.lowercase = annif.util.boolean(kwargs[_KEY_LOWERCASE])
        else:
            self.lowercase = False
        super().__init__(**kwargs)

    def tokenize_words(self, text, filter=True):
        """Return the lemmas of the tokens in *text*.

        Args:
            text: input string; surrounding whitespace is stripped before
                it is passed through the spaCy pipeline.
            filter: when true, keep only lemmas accepted by
                ``self.is_valid_token`` (presumably provided by the
                parent analyzer class - confirm).

        Returns:
            A list of lemma strings, lowercased if ``self.lowercase``
            is set.
        """
        lemmas = [lemma
                  for lemma in (token.lemma_
                                for token in self.nlp(text.strip()))
                  if (not filter or self.is_valid_token(lemma))]
        # Lowercasing happens after filtering, so validity is judged on
        # the lemma exactly as spaCy produced it.
        if self.lowercase:
            return [lemma.lower() for lemma in lemmas]
        else:
            return lemmas
| 37 |