Passed
Pull Request — master (#527)
by Osma
02:36
created

annif.analyzer.spacy.SpacyAnalyzer.__init__()   A

Complexity

Conditions 2

Size

Total Lines 8
Code Lines 7

Duplication

Lines 8
Ratio 100 %

Importance

Changes 0
Metric Value
cc 2
eloc 7
nop 3
dl 8
loc 8
rs 10
c 0
b 0
f 0
1
"""spaCy analyzer for Annif; lemmatizes words, optionally lowercasing."""
2
3
import spacy
4
from . import analyzer
5
import annif.util
6
7
_KEY_LOWERCASE = 'lowercase'
8
9
10 View Code Duplication
class SpacyAnalyzer(analyzer.Analyzer):
0 ignored issues
show
Duplication introduced by
This code seems to be duplicated in your project.
Loading history...
11
    name = "spacy"
12
13
    def __init__(self, param, **kwargs):
        """Load the spaCy pipeline named by ``param`` and read the options.

        ``param`` is handed to ``spacy.load`` with the ``ner`` and ``parser``
        components excluded. If the ``lowercase`` keyword is present, its
        value is converted with ``annif.util.boolean``; otherwise
        lowercasing is off. All keyword arguments are then forwarded to the
        parent initializer.
        """
        self.param = param
        self.nlp = spacy.load(param, exclude=['ner', 'parser'])
        # The option is only converted when it was explicitly supplied.
        self.lowercase = (
            annif.util.boolean(kwargs[_KEY_LOWERCASE])
            if _KEY_LOWERCASE in kwargs
            else False
        )
        super().__init__(**kwargs)
21
22
    def tokenize_words(self, text):
        """Return the lemmas of ``text`` that pass ``is_valid_token``.

        The text is run through the loaded spaCy pipeline; each token's
        lemma is kept only if ``self.is_valid_token`` accepts it. When the
        ``lowercase`` option is on, the kept lemmas are lowercased.
        """
        kept = []
        for token in self.nlp(text):
            candidate = token.lemma_
            if self.is_valid_token(candidate):
                kept.append(candidate)
        if not self.lowercase:
            return kept
        # Validity is checked on the original lemma; case folding comes last.
        return [item.lower() for item in kept]
29
30
    def normalize_word(self, word):
        """Return the lemma of ``word``, lowercased if the option is set.

        ``word`` is processed with the loaded spaCy pipeline and the lemma
        of the span covering the whole result (``doc[:]``) is used.
        """
        whole_span = self.nlp(word)[:]
        lemmatized = whole_span.lemma_
        return lemmatized.lower() if self.lowercase else lemmatized
37