annif.analyzer.spacy   Rating: A

Complexity

Total Complexity 6

Size/Duplication

Total Lines 50
Duplicated Lines 0%

Importance

Changes 0
Metric   Value
eloc     33
dl       0
loc      50
rs       10
c        0
b        0
f        0
wmc      6

3 Methods

Rating   Name                             Duplication   Size   Complexity
A        SpacyAnalyzer.is_available()     0             4      1
A        SpacyAnalyzer.__init__()         0             16     3
A        SpacyAnalyzer.tokenize_words()   0             10     2
"""spaCy analyzer for Annif which uses spaCy for lemmatization"""

from __future__ import annotations

import importlib.util

import annif.util
from annif.exception import OperationFailedException

from . import analyzer

_KEY_LOWERCASE = "lowercase"


class SpacyAnalyzer(analyzer.Analyzer):
    name = "spacy"

    @staticmethod
    def is_available() -> bool:
        # return True iff spaCy is installed
        return importlib.util.find_spec("spacy") is not None

    def __init__(self, param: str, **kwargs) -> None:
        # spaCy is imported lazily so that this module can be imported even
        # when the optional spaCy dependency is not installed
        import spacy

        self.param = param
        try:
            # param is the name of the spaCy language model; the "ner" and
            # "parser" pipeline components are excluded because they are not
            # needed for lemmatization
            self.nlp = spacy.load(param, exclude=["ner", "parser"])
        except IOError as err:
            raise OperationFailedException(
                f"Loading spaCy model '{param}' failed - "
                + f"please download the model.\n{err}"
            )
        if _KEY_LOWERCASE in kwargs:
            self.lowercase = annif.util.boolean(kwargs[_KEY_LOWERCASE])
        else:
            self.lowercase = False
        super().__init__(**kwargs)

    def tokenize_words(self, text: str, filter: bool = True) -> list[str]:
        # lemmatize the text with spaCy and optionally drop tokens that the
        # base Analyzer considers invalid
        lemmas = [
            lemma
            for lemma in (token.lemma_ for token in self.nlp(text.strip()))
            if (not filter or self.is_valid_token(lemma))
        ]
        if self.lowercase:
            return [lemma.lower() for lemma in lemmas]
        else:
            return lemmas
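
For context, a minimal usage sketch of the class above (not part of the analyzed module). The model name en_core_web_sm and the sample text are illustrative assumptions; in Annif the analyzer is normally selected through project configuration (e.g. an analyzer setting such as spacy(en_core_web_sm)) rather than instantiated directly.

# Usage sketch, assuming spaCy and the en_core_web_sm model are installed.
from annif.analyzer.spacy import SpacyAnalyzer

if SpacyAnalyzer.is_available():
    # the lowercase option is given as a string, the form in which Annif
    # passes configuration parameters; annif.util.boolean() converts it
    spacy_analyzer = SpacyAnalyzer("en_core_web_sm", lowercase="true")
    lemmas = spacy_analyzer.tokenize_words("The quick brown foxes were jumping")
    print(lemmas)  # lemmatized, lowercased word tokens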