Passed
Pull Request — master (#527)
by Osma
03:23
created

SpacyAnalyzer.tokenize_words()   A

Complexity

Conditions 1

Size

Total Lines 3
Code Lines 3

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 3
nop 2
dl 0
loc 3
rs 10
c 0
b 0
f 0
1
"""spaCy analyzer for Annif. Tokenizes and lemmatizes text using a spaCy pipeline."""
2
3
import spacy
4
from . import analyzer
5
6
7
class SpacyAnalyzer(analyzer.Analyzer):
    """Analyzer that produces lemmas using a spaCy pipeline."""

    name = "spacy"

    def __init__(self, param, **kwargs):
        """Load the spaCy pipeline identified by ``param``.

        ``param`` is stored on the instance and passed to ``spacy.load``;
        all remaining keyword arguments are forwarded to the parent
        class initializer.
        """
        self.param = param
        # The 'ner' and 'parser' components are excluded when loading.
        self.nlp = spacy.load(param, exclude=['ner', 'parser'])
        super().__init__(**kwargs)

    def tokenize_words(self, text):
        """Return the lemmas of the tokens in ``text``, in order.

        Only lemmas accepted by ``self.is_valid_token`` are kept.
        """
        accepted = []
        for token in self.nlp(text):
            candidate = token.lemma_
            if self.is_valid_token(candidate):
                accepted.append(candidate)
        return accepted

    def normalize_word(self, word):
        """Return the lemma of ``word`` as computed by the spaCy pipeline."""
        # Slice the whole processed document and read the lemma of that slice.
        whole = self.nlp(word)[:]
        return whole.lemma_
22