Passed
Pull Request — master (#527)
by Osma
03:11
created

SpacyAnalyzer.tokenize_words()   A

Complexity

Conditions 2

Size

Total Lines 7
Code Lines 6

Duplication

Lines 7
Ratio 100 %

Importance

Changes 0
Metric Value
cc 2
eloc 6
nop 2
dl 7
loc 7
rs 10
c 0
b 0
f 0
1
"""spaCy analyzer for Annif. Lemmatizes text with spaCy; lowercasing is optional."""
2
3
import spacy
4
from . import analyzer
5
import annif.util
6
7
_KEY_LOWERCASE = 'lowercase'
8
9
10 View Code Duplication
class SpacyAnalyzer(analyzer.Analyzer):
    """Analyzer that produces lemmas by running text through a spaCy pipeline.

    The optional ``lowercase`` keyword argument controls whether the
    produced lemmas are additionally folded to lower case.
    """

    name = "spacy"

    def __init__(self, param, **kwargs):
        """Load the spaCy pipeline and read the lowercase option.

        ``param`` is handed to ``spacy.load`` as-is; the ``ner`` and
        ``parser`` pipeline components are excluded when loading.
        All keyword arguments are forwarded to the parent initializer.
        """
        self.param = param
        self.nlp = spacy.load(param, exclude=['ner', 'parser'])
        # The option is only interpreted when explicitly given; otherwise
        # lowercasing stays disabled.
        self.lowercase = (
            annif.util.boolean(kwargs[_KEY_LOWERCASE])
            if _KEY_LOWERCASE in kwargs
            else False
        )
        super().__init__(**kwargs)

    def tokenize_words(self, text):
        """Return the lemmas of ``text`` that pass ``is_valid_token``.

        Validity is checked on the lemma exactly as spaCy produced it;
        lowercasing (when enabled) is applied only to the accepted lemmas.
        """
        accepted = []
        for token in self.nlp(text):
            candidate = token.lemma_
            # is_valid_token is not defined in this class; presumably it is
            # inherited from analyzer.Analyzer.
            if self.is_valid_token(candidate):
                accepted.append(candidate)
        if not self.lowercase:
            return accepted
        return [item.lower() for item in accepted]

    def normalize_word(self, word):
        """Return the lemma of ``word``, lowercased when the option is on."""
        # Slice the whole Doc into a Span so that a single lemma string is
        # obtained even if spaCy splits the input into several tokens.
        whole_span = self.nlp(word)[:]
        normalized = whole_span.lemma_
        return normalized.lower() if self.lowercase else normalized
37