| Metric | Value |
| --- | --- |
| Total Complexity | 4 |
| Total Lines | 31 |
| Duplicated Lines | 67.74 % |
| Changes | 0 |
Duplicate code is one of the most pungent code smells. A common rule of thumb is to restructure code once it is duplicated in three or more places. The usual remedy is to extract the shared logic into a common function or base class; a sketch of that pattern is shown below.
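The sketch below is illustrative only and not part of the flagged file: it shows duplicated option handling (here, a `lowercase` flag that every analyzer would otherwise parse itself) hoisted into a shared base class. All names in it are hypothetical.

```python
# Hypothetical sketch: hoisting duplicated option handling into a shared base class.
# The class and function names below are illustrative, not taken from Annif.


def _boolean(value):
    """Interpret common string representations of a boolean."""
    return str(value).lower() in ('1', 'true', 'yes', 'on')


class BaseAnalyzer:
    """Shared behaviour that would otherwise be repeated in every analyzer."""

    def __init__(self, **kwargs):
        # Parse the 'lowercase' option once here instead of in each subclass.
        self.lowercase = _boolean(kwargs.get('lowercase', False))

    def postprocess(self, lemmas):
        return [lemma.lower() for lemma in lemmas] if self.lowercase else lemmas


class WhitespaceAnalyzer(BaseAnalyzer):
    """Trivial analyzer reusing the shared option handling."""

    def tokenize_words(self, text):
        return self.postprocess(text.split())
```

The actual file flagged for duplication follows.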
| 1 | """spaCy analyzer for Annif which uses spaCy for lemmatization""" |
||
| 2 | |||
| 3 | import spacy |
||
| 4 | from . import analyzer |
||
| 5 | import annif.util |
||
| 6 | |||
| 7 | _KEY_LOWERCASE = 'lowercase' |
||
| 8 | |||
| 9 | |||
| 10 | View Code Duplication | class SpacyAnalyzer(analyzer.Analyzer): |
|
|
|
|||
| 11 | name = "spacy" |
||
| 12 | |||
| 13 | def __init__(self, param, **kwargs): |
||
| 14 | self.param = param |
||
| 15 | self.nlp = spacy.load(param, exclude=['ner', 'parser']) |
||
| 16 | if _KEY_LOWERCASE in kwargs: |
||
| 17 | self.lowercase = annif.util.boolean(kwargs[_KEY_LOWERCASE]) |
||
| 18 | else: |
||
| 19 | self.lowercase = False |
||
| 20 | super().__init__(**kwargs) |
||
| 21 | |||
| 22 | def tokenize_words(self, text, filter=True): |
||
| 23 | lemmas = [lemma |
||
| 24 | for lemma in (token.lemma_ |
||
| 25 | for token in self.nlp(text.strip())) |
||
| 26 | if (not filter or self.is_valid_token(lemma))] |
||
| 27 | if self.lowercase: |
||
| 28 | return [lemma.lower() for lemma in lemmas] |
||
| 29 | else: |
||
| 30 | return lemmas |
||
| 31 |
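A minimal usage sketch, under a few assumptions: that the module lives at `annif.analyzer.spacy` (implied by the relative import above), that the spaCy model named `en_core_web_sm` is installed, and that the base `analyzer.Analyzer` class provides `is_valid_token` and accepts the remaining keyword arguments. The model name and sample sentence are illustrative.

```python
# Minimal sketch; assumes the spaCy model 'en_core_web_sm' is installed
# and that the import path below matches the package layout.
from annif.analyzer.spacy import SpacyAnalyzer

spacy_analyzer = SpacyAnalyzer('en_core_web_sm', lowercase='true')

# tokenize_words() lemmatizes the text with spaCy, drops tokens rejected by
# is_valid_token(), and lowercases the result because lowercase was enabled.
print(spacy_analyzer.tokenize_words("The quick brown foxes were jumping over the lazy dogs."))
```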