Completed
Push — master ( cf5fd0...1b4762 )
by Osma
15s queued 12s
created

annif.analyzer.spacy.SpacyAnalyzer.__init__()   A

Complexity

Conditions 3

Size

Total Lines 13
Code Lines 12

Duplication

Lines 13
Ratio 100 %

Importance

Changes 0
Metric Value
cc 3
eloc 12
nop 3
dl 13
loc 13
rs 9.8
c 0
b 0
f 0
1
"""spaCy analyzer for Annif which uses spaCy for lemmatization"""
2
3
import spacy
4
from . import analyzer
5
from annif.exception import OperationFailedException
6
import annif.util
7
8
# Name of the optional keyword argument that SpacyAnalyzer reads to decide
# whether the lemmas it returns should be converted to lowercase.
_KEY_LOWERCASE = 'lowercase'
9
10
11 View Code Duplication
class SpacyAnalyzer(analyzer.Analyzer):
    """Analyzer that tokenizes and lemmatizes text with a spaCy pipeline."""

    name = "spacy"

    def __init__(self, param, **kwargs):
        """Load the spaCy model named by `param` and read analyzer options.

        Args:
            param: name of the spaCy model, passed to ``spacy.load``.
            **kwargs: analyzer options. The ``lowercase`` option, if given,
                is interpreted with ``annif.util.boolean``; when absent,
                lowercasing is disabled. All keyword arguments are also
                forwarded to the base class initializer.

        Raises:
            OperationFailedException: if ``spacy.load`` raises ``IOError``
                while loading the model.
        """
        self.param = param
        try:
            # Only lemmas are used by this class, so the 'ner' and 'parser'
            # pipeline components are excluded when loading the model.
            self.nlp = spacy.load(param, exclude=['ner', 'parser'])
        except IOError as err:
            # Chain explicitly so the traceback shows the original error as
            # the direct cause rather than as an error raised while handling.
            raise OperationFailedException(
                f"Loading spaCy model '{param}' failed - " +
                f"please download the model.\n{err}") from err
        self.lowercase = (
            annif.util.boolean(kwargs[_KEY_LOWERCASE])
            if _KEY_LOWERCASE in kwargs
            else False
        )
        super().__init__(**kwargs)

    def tokenize_words(self, text, filter=True):
        """Return the lemmas of the tokens found in `text`.

        Args:
            text: the text to analyze; surrounding whitespace is stripped
                before it is passed to the spaCy pipeline.
            filter: when true, keep only lemmas accepted by
                ``self.is_valid_token`` (not defined in this class;
                presumably inherited from the base analyzer).

        Returns:
            A list of lemma strings, lowercased if the ``lowercase`` option
            was enabled for this analyzer.
        """
        lemmas = [token.lemma_ for token in self.nlp(text.strip())]
        if filter:
            # Validity is checked on the lemma as produced by spaCy, i.e.
            # before any lowercasing is applied.
            lemmas = [lemma for lemma in lemmas
                      if self.is_valid_token(lemma)]
        if self.lowercase:
            lemmas = [lemma.lower() for lemma in lemmas]
        return lemmas
37