annif.analyzer.spacy.SpacyAnalyzer.tokenize_words() - Code Metrics - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

SpacyAnalyzer.tokenize_words() A
last analyzed 2025-08-27 13:59 UTC

↳ Parent: annif.analyzer.spacy

Complexity

Conditions

Size

Total Lines	10
Code Lines	8

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	2
eloc	8
nop	3
dl	0
loc	10
rs	10
c	0
b	0
f	0

"""spaCy analyzer for Annif which uses spaCy for lemmatization"""

from __future__ import annotations

import importlib

import annif.util
from annif.exception import OperationFailedException

from . import analyzer

# Name of the optional keyword argument that SpacyAnalyzer.__init__ looks up
# in its **kwargs to decide whether returned lemmas are lowercased.
_KEY_LOWERCASE = "lowercase"


class SpacyAnalyzer(analyzer.Analyzer):
    """Analyzer that tokenizes text into lemmas using a spaCy pipeline."""

    name = "spacy"

    @staticmethod
    def is_available() -> bool:
        """Return True iff the spaCy package can be found for import."""
        # A bare ``import importlib`` does not guarantee that the
        # ``importlib.util`` submodule is bound as an attribute, so import
        # it explicitly here instead of relying on some other module
        # having loaded it first.
        import importlib.util

        return importlib.util.find_spec("spacy") is not None

    def __init__(self, param: str, **kwargs) -> None:
        """Load the spaCy model named by ``param`` and read the options.

        Args:
            param: name of the spaCy model, passed to ``spacy.load``.
            **kwargs: analyzer options. The ``lowercase`` key, if present,
                is converted with ``annif.util.boolean``; all keyword
                arguments are also forwarded to the base class initializer.

        Raises:
            OperationFailedException: if ``spacy.load`` raises ``IOError``.
        """
        # Imported lazily so that this module can be imported (and
        # is_available() called) even when spaCy is not installed.
        import spacy

        self.param = param
        try:
            # Only token lemmas are read by this class, so the "ner" and
            # "parser" components are excluded when loading the pipeline.
            self.nlp = spacy.load(param, exclude=["ner", "parser"])
        except IOError as err:
            # Chain explicitly so the original error is kept as __cause__.
            raise OperationFailedException(
                f"Loading spaCy model '{param}' failed - "
                f"please download the model.\n{err}"
            ) from err
        self.lowercase = (
            annif.util.boolean(kwargs[_KEY_LOWERCASE])
            if _KEY_LOWERCASE in kwargs
            else False
        )
        super().__init__(**kwargs)

    def tokenize_words(self, text: str, filter: bool = True) -> list[str]:
        """Return the lemmas of the tokens spaCy finds in ``text``.

        Args:
            text: input text; surrounding whitespace is stripped before it
                is passed to the spaCy pipeline.
            filter: when true, keep only lemmas for which
                ``self.is_valid_token`` returns a true value (that method is
                not defined in this class; presumably inherited from the
                base analyzer).

        Returns:
            The list of lemmas, lowercased when ``self.lowercase`` is set.
            Filtering is applied to the lemma before lowercasing.
        """
        lemmas = [
            lemma
            for lemma in (token.lemma_ for token in self.nlp(text.strip()))
            if not filter or self.is_valid_token(lemma)
        ]
        if self.lowercase:
            return [lemma.lower() for lemma in lemmas]
        return lemmas


1			"""spaCy analyzer for Annif which uses spaCy for lemmatization"""
2
3			from __future__ import annotations
4
5			import importlib
6
7			import annif.util
8			from annif.exception import OperationFailedException
9
10			from . import analyzer
11
12			_KEY_LOWERCASE = "lowercase"
13
14
15			class SpacyAnalyzer(analyzer.Analyzer):
16			name = "spacy"
17
18			@staticmethod
19			def is_available() -> bool:
20			# return True iff spaCy is installed
21			return importlib.util.find_spec("spacy") is not None
22
23			def __init__(self, param: str, **kwargs) -> None:
24			import spacy
25
26			self.param = param
27			try:
28			self.nlp = spacy.load(param, exclude=["ner", "parser"])
29			except IOError as err:
30			raise OperationFailedException(
31			f"Loading spaCy model '{param}' failed - "
32			+ f"please download the model.\n{err}"
33			)
34			if _KEY_LOWERCASE in kwargs:
35			self.lowercase = annif.util.boolean(kwargs[_KEY_LOWERCASE])
36			else:
37			self.lowercase = False
38			super().__init__(**kwargs)
39
40			def tokenize_words(self, text: str, filter: bool = True) -> list[str]:
41			lemmas = [
42			lemma
43			for lemma in (token.lemma_ for token in self.nlp(text.strip()))
44			if (not filter or self.is_valid_token(lemma))
45			]
46			if self.lowercase:
47			return [lemma.lower() for lemma in lemmas]
48			else:
49			return lemmas
50

NatLibFi / Annif

SpacyAnalyzer.tokenize_words() A last analyzed 2025-08-27 13:59 UTC

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like

SpacyAnalyzer.tokenize_words() A
last analyzed 2025-08-27 13:59 UTC