Total Complexity | 6 |
Total Lines | 50 |
Duplicated Lines | 0 % |
Changes | 0 |
1 | """spaCy analyzer for Annif which uses spaCy for lemmatization""" |
||
2 | |||
3 | from __future__ import annotations |
||
4 | |||
5 | import importlib |
||
6 | |||
7 | import annif.util |
||
8 | from annif.exception import OperationFailedException |
||
9 | |||
10 | from . import analyzer |
||
11 | |||
# Name of the optional keyword argument read by SpacyAnalyzer.__init__;
# when present, its value decides whether tokenize_words lowercases lemmas.
_KEY_LOWERCASE = "lowercase"
||
13 | |||
14 | |||
class SpacyAnalyzer(analyzer.Analyzer):
    """Analyzer that produces lemmas using a spaCy language pipeline.

    The pipeline named by ``param`` is loaded once, at construction time,
    with the ``ner`` and ``parser`` components excluded.
    """

    name = "spacy"

    @staticmethod
    def is_available() -> bool:
        """Return True iff an importable ``spacy`` package can be located."""
        # Import the submodule explicitly: a bare ``import importlib`` does
        # not guarantee that ``importlib.util`` is bound as an attribute, so
        # relying on it can fail with AttributeError.
        import importlib.util

        return importlib.util.find_spec("spacy") is not None

    def __init__(self, param: str, **kwargs) -> None:
        """Load the spaCy pipeline and read the analyzer options.

        Args:
            param: Name (or path) passed to ``spacy.load``.
            **kwargs: Extra options. The ``lowercase`` key, if present, is
                converted with ``annif.util.boolean`` and controls whether
                ``tokenize_words`` lowercases its output. All keyword
                arguments are forwarded to the parent initializer.

        Raises:
            OperationFailedException: If ``spacy.load`` raises ``IOError``.
        """
        # Imported lazily so the module can be imported without spaCy.
        import spacy

        self.param = param
        try:
            self.nlp = spacy.load(param, exclude=["ner", "parser"])
        except IOError as err:
            # Chain explicitly so the original loading error stays visible
            # as the direct cause in the traceback.
            raise OperationFailedException(
                f"Loading spaCy model '{param}' failed - "
                f"please download the model.\n{err}"
            ) from err
        if _KEY_LOWERCASE in kwargs:
            self.lowercase = annif.util.boolean(kwargs[_KEY_LOWERCASE])
        else:
            self.lowercase = False
        super().__init__(**kwargs)

    def tokenize_words(self, text: str, filter: bool = True) -> list[str]:
        """Return the lemmas of the tokens spaCy finds in ``text``.

        Args:
            text: Input text; leading and trailing whitespace is stripped
                before it is passed to the pipeline.
            filter: When true, keep only lemmas accepted by
                ``self.is_valid_token`` (not defined in this class). The
                name shadows the builtin but is kept for compatibility with
                existing keyword callers.

        Returns:
            The lemmas in token order, lowercased if ``self.lowercase`` is
            set. Validity is checked on the lemma before lowercasing.
        """
        lemmas = [
            token.lemma_
            for token in self.nlp(text.strip())
            if not filter or self.is_valid_token(token.lemma_)
        ]
        if self.lowercase:
            return [lemma.lower() for lemma in lemmas]
        return lemmas
||
50 |