| Total Complexity | 6 |
| Total Lines | 50 |
| Duplicated Lines | 0 % |
| Changes | 0 | ||
| 1 | """spaCy analyzer for Annif which uses spaCy for lemmatization""" |
||
| 2 | |||
| 3 | from __future__ import annotations |
||
| 4 | |||
| 5 | import importlib |
||
| 6 | |||
| 7 | import annif.util |
||
| 8 | from annif.exception import OperationFailedException |
||
| 9 | |||
| 10 | from . import analyzer |
||
| 11 | |||
# Name of the optional analyzer keyword parameter that controls lowercasing:
# SpacyAnalyzer reads it from its kwargs, converts it with annif.util.boolean,
# and lowercases the returned lemmas when the result is true.
_KEY_LOWERCASE = "lowercase"
||
| 13 | |||
| 14 | |||
class SpacyAnalyzer(analyzer.Analyzer):
    """Analyzer that tokenizes text into lemmas using a spaCy pipeline.

    The spaCy model to load is given by ``param``. The optional ``lowercase``
    keyword parameter (see ``_KEY_LOWERCASE``) makes the analyzer return
    lowercased lemmas.
    """

    name = "spacy"

    @staticmethod
    def is_available() -> bool:
        """Return True iff the ``spacy`` package can be found for import."""
        # A bare "import importlib" does not guarantee that the
        # "importlib.util" submodule has been loaded, so import it explicitly
        # here instead of relying on some other module having done so.
        import importlib.util

        return importlib.util.find_spec("spacy") is not None

    def __init__(self, param: str, **kwargs) -> None:
        """Load the spaCy model named by ``param`` and read analyzer options.

        Args:
            param: Name of the spaCy model, passed to ``spacy.load``.
            **kwargs: Analyzer parameters. The ``lowercase`` key, if present,
                is converted with ``annif.util.boolean``. All keyword
                arguments are forwarded to the base class initializer.

        Raises:
            OperationFailedException: If ``spacy.load`` raises ``OSError``
                for the requested model.
        """
        # Imported lazily so that this module can be imported (and
        # is_available() queried) without spaCy being installed.
        import spacy

        self.param = param
        try:
            # The "ner" and "parser" pipeline components are excluded when
            # loading; tokenize_words only reads token lemmas.
            self.nlp = spacy.load(param, exclude=["ner", "parser"])
        except OSError as err:  # IOError is an alias of OSError
            # Chain the original error so the root cause stays visible in
            # the traceback as the explicit cause.
            raise OperationFailedException(
                f"Loading spaCy model '{param}' failed - "
                f"please download the model.\n{err}"
            ) from err

        self.lowercase = (
            annif.util.boolean(kwargs[_KEY_LOWERCASE])
            if _KEY_LOWERCASE in kwargs
            else False
        )
        super().__init__(**kwargs)

    def tokenize_words(self, text: str, filter: bool = True) -> list[str]:
        """Return the lemmas of the tokens spaCy finds in ``text``.

        Args:
            text: Input text; surrounding whitespace is stripped before it is
                passed to the spaCy pipeline.
            filter: When true, keep only lemmas accepted by
                ``self.is_valid_token``. (The name shadows the builtin but is
                kept because it is part of the method's signature.)

        Returns:
            The list of lemmas, lowercased if the analyzer was created with
            the ``lowercase`` option enabled.
        """
        lemmas = (token.lemma_ for token in self.nlp(text.strip()))
        if filter:
            # Validity is checked on the lemma as produced by spaCy, i.e.
            # before any lowercasing is applied.
            lemmas = (lemma for lemma in lemmas if self.is_valid_token(lemma))
        if self.lowercase:
            return [lemma.lower() for lemma in lemmas]
        return list(lemmas)
||
| 50 |