Passed
Pull Request — master (#527)
by Osma · created · 01:59

SpacyAnalyzer.tokenize_words() · grade A

Complexity
  Conditions: 2

Size
  Total Lines: 5
  Code Lines: 5

Duplication
  Lines: 0
  Ratio: 0 %

Importance
  Changes: 0
Metric  Value
cc      2
eloc    5
nop     2
dl      0
loc     5
rs      10
c       0
b       0
f       0
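
A note on the figures above: cyclomatic complexity counts linearly independent paths through a function, i.e. decision points plus one. tokenize_words() contains a single isinstance() branch, so cc = 1 + 1 = 2, which presumably is also what the "Conditions: 2" figure reflects.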
"""spaCy analyzer for Annif which uses spaCy for lemmatization"""

import spacy
from spacy.tokens import Doc, Span
from . import analyzer


class SpacyAnalyzer(analyzer.Analyzer):
    name = "spacy"

    def __init__(self, param, **kwargs):
        self.param = param
        self.nlp = spacy.load(param, exclude=['ner', 'parser'])
        # we need a way to split sentences, now that parser is excluded
        self.nlp.add_pipe('sentencizer')
        super().__init__(**kwargs)

    def tokenize_sentences(self, text):
        doc = self.nlp(text)
        return list(doc.sents)

    def tokenize_words(self, text):
        if not isinstance(text, (Doc, Span)):
            text = self.nlp(text)
        return [lemma for lemma in (token.lemma_ for token in text)
                if self.is_valid_token(lemma)]

    def normalize_word(self, word):
        doc = self.nlp(word)
        return doc[:].lemma_
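
For context, here is a minimal usage sketch (not part of the report above). It assumes the class is importable as annif.analyzer.spacy.SpacyAnalyzer and that a spaCy pipeline such as en_core_web_sm is installed; the model name and sample sentences are illustrative only.

from annif.analyzer.spacy import SpacyAnalyzer

# assumed example model name; any installed spaCy pipeline with a
# lemmatizer would work
spacy_analyzer = SpacyAnalyzer("en_core_web_sm")

# sentence splitting uses the sentencizer added in __init__
sentences = spacy_analyzer.tokenize_sentences("Cats sat on mats. Dogs barked.")

# word tokens are lemmatized, then filtered by is_valid_token()
# from the Analyzer base class
words = spacy_analyzer.tokenize_words("The cats were sitting on the mats")
# e.g. ['cat', 'sit', 'mat'], depending on the model's lemmatizer

# normalize_word() lemmatizes a single word via Span.lemma_
normalized = spacy_analyzer.normalize_word("running")  # e.g. 'run'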