Total Complexity | 6 |
Total Lines | 39 |
Duplicated Lines | 74.36 % |
Changes | 0 |
Duplicate code is one of the most pungent code smells. A rule that is often used is to re-structure code once it is duplicated in three or more places.
Common duplication problems and their corresponding solutions are:
1 | """spaCy analyzer for Annif. Lemmatizes words with a spaCy pipeline and optionally folds them to lower case.""" |
||
2 | |||
3 | import spacy |
||
4 | from . import analyzer |
||
5 | import annif.util |
||
6 | |||
# Keyword argument that switches on lowercasing of the produced lemmas.
_KEY_LOWERCASE = 'lowercase'


class SpacyAnalyzer(analyzer.Analyzer):
    """Analyzer that lemmatizes text with a spaCy pipeline.

    Lemmas can optionally be folded to lower case; this is controlled by
    the ``lowercase`` keyword argument and is off by default.
    """

    name = "spacy"

    def __init__(self, param, **kwargs):
        """Load the spaCy pipeline and read the lowercase setting.

        param -- name of the spaCy pipeline, passed to ``spacy.load``
        kwargs -- analyzer options; ``lowercase`` is interpreted with
            ``annif.util.boolean``, and all options are forwarded to the
            parent class initializer
        """
        self.param = param
        # The 'ner' and 'parser' components are excluded when loading.
        self.nlp = spacy.load(param, exclude=['ner', 'parser'])
        self.lowercase = (
            annif.util.boolean(kwargs[_KEY_LOWERCASE])
            if _KEY_LOWERCASE in kwargs
            else False
        )
        super().__init__(**kwargs)

    def tokenize_words(self, text, filter=True):
        """Return the lemmas of the tokens in the stripped text as a list.

        When ``filter`` is true, lemmas rejected by ``is_valid_token`` are
        left out. Validity is checked before any lowercasing is applied.
        """
        kept = []
        for token in self.nlp(text.strip()):
            lemma = token.lemma_
            if filter and not self.is_valid_token(lemma):
                continue
            kept.append(lemma)
        if not self.lowercase:
            return kept
        return [lemma.lower() for lemma in kept]

    def normalize_word(self, word):
        """Return the lemma of the given word, lowercased if configured."""
        # Slicing the whole document gives a span covering every token,
        # so the lemma is taken over the full input.
        lemma = self.nlp(word)[:].lemma_
        return lemma.lower() if self.lowercase else lemma
||
39 |