| Total Complexity | 5 |
| Total Lines | 37 |
| Duplicated Lines | 70.27 % |
| Changes | 0 | ||
Duplicate code is one of the most pungent code smells. A commonly used rule of thumb is to restructure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
| 1 | """spaCy analyzer for Annif which uses spaCy for lemmatization""" |
||
| 2 | |||
| 3 | import spacy |
||
| 4 | from . import analyzer |
||
| 5 | from annif.exception import OperationFailedException |
||
| 6 | import annif.util |
||
| 7 | |||
# Name of the optional keyword argument that SpacyAnalyzer.__init__ looks up
# in its **kwargs to decide whether returned lemmas are lowercased.
_KEY_LOWERCASE = 'lowercase'
||
| 9 | |||
| 10 | |||
class SpacyAnalyzer(analyzer.Analyzer):
    """Analyzer that produces lemmas using a spaCy language pipeline.

    The pipeline is loaded once, when the analyzer is constructed, and is
    then reused for every call to :meth:`tokenize_words`.
    """

    name = "spacy"

    def __init__(self, param, **kwargs):
        """Load the spaCy model identified by *param*.

        Args:
            param: Model identifier handed unchanged to ``spacy.load``.
            **kwargs: Analyzer options. If the ``lowercase`` key is present
                its value is converted with ``annif.util.boolean`` and
                stored as ``self.lowercase``; otherwise lowercasing is
                disabled. All keyword arguments are forwarded to the base
                class initializer.

        Raises:
            OperationFailedException: If ``spacy.load`` raises ``IOError``.
                The original error is attached as the exception's cause.
        """
        self.param = param
        try:
            # Only lemmas are consumed by this class, so the 'ner' and
            # 'parser' components are excluded when loading the pipeline.
            self.nlp = spacy.load(param, exclude=['ner', 'parser'])
        except IOError as err:
            # Chain explicitly so the traceback reports the loading error
            # as the direct cause of the failure.
            raise OperationFailedException(
                f"Loading spaCy model '{param}' failed - "
                f"please download the model.\n{err}") from err
        if _KEY_LOWERCASE in kwargs:
            self.lowercase = annif.util.boolean(kwargs[_KEY_LOWERCASE])
        else:
            self.lowercase = False
        super().__init__(**kwargs)

    def tokenize_words(self, text, filter=True):
        """Return the lemmas of the tokens spaCy finds in *text*.

        Args:
            text: Input string; leading and trailing whitespace is stripped
                before it is passed to the spaCy pipeline.
            filter: When true, keep only lemmas accepted by
                ``self.is_valid_token``. (The name shadows the builtin but
                is part of the public signature, so it is kept.)

        Returns:
            A list of lemma strings, lowercased when ``self.lowercase`` is
            true. Validation is applied to the lemma before lowercasing.
        """
        # Generators keep this a single pass over the tokens.
        lemmas = (token.lemma_ for token in self.nlp(text.strip()))
        if filter:
            lemmas = (lemma for lemma in lemmas
                      if self.is_valid_token(lemma))
        if self.lowercase:
            return [lemma.lower() for lemma in lemmas]
        return list(lemmas)
||
| 37 |