annif.analyzer.analyzer.Analyzer.tokenize_sentences() - Code Metrics - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Analyzer.tokenize_sentences() A
last analyzed 2025-08-15 13:39 UTC

↳ Parent: annif.analyzer.analyzer

Complexity

Conditions

Size

Total Lines	5
Code Lines	3

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	1
eloc	3
nop	2
dl	0
loc	5
rs	10
c	0
b	0
f	0

"""Common functionality for analyzers."""

from __future__ import annotations

import abc
import functools
import unicodedata

import annif

# Module-level alias for the logger object exposed by the annif package.
logger = annif.logger

# Keyword-argument name that Analyzer.__init__ checks to override the
# minimum token length.
_KEY_TOKEN_MIN_LENGTH = "token_min_length"
# Name of the NLTK data package looked up under "tokenizers/" (and downloaded
# if missing) when an Analyzer is constructed.
_NLTK_TOKENIZER_DATA = "punkt_tab"


class Analyzer(metaclass=abc.ABCMeta):
    """Base class for language-specific analyzers. Either tokenize_words or
    _normalize_word must be overridden in subclasses. Other methods may be
    overridden when necessary."""

    name = None
    token_min_length = 3  # default value, can be overridden in instances

    @staticmethod
    def is_available() -> bool:
        """Return True if the analyzer is available for use, False if not."""
        return True  # can be overridden in implementations if necessary

    def __init__(self, **kwargs) -> None:
        """Initialize the analyzer and make sure the NLTK tokenizer data is
        present.

        If the keyword argument "token_min_length" is given, it is converted
        to int and stored on the instance, overriding the class default.

        Raises LookupError if the NLTK data lookup fails with a message that
        does not mention the expected tokenizer data package.
        """
        if _KEY_TOKEN_MIN_LENGTH in kwargs:
            self.token_min_length = int(kwargs[_KEY_TOKEN_MIN_LENGTH])

        # Imported lazily so that NLTK is only loaded when an analyzer is
        # actually instantiated.
        import nltk.data

        try:
            nltk.data.find("tokenizers/" + _NLTK_TOKENIZER_DATA)
        except LookupError as err:
            logger.debug(str(err))
            if _NLTK_TOKENIZER_DATA in str(err):
                # The missing resource is the tokenizer data package itself:
                # fetch it instead of failing.
                logger.warning(
                    f'NLTK datapackage "{_NLTK_TOKENIZER_DATA}" not found, '
                    "downloading it now."
                )
                nltk.download(_NLTK_TOKENIZER_DATA)
            else:
                # Some other lookup problem; let the caller see it.
                raise

    def tokenize_sentences(self, text: str) -> list[str]:
        """Tokenize a piece of text (e.g. a document) into sentences."""
        import nltk.tokenize

        return nltk.tokenize.sent_tokenize(text)

    @staticmethod
    @functools.lru_cache(maxsize=50000)
    def _is_valid_token_cached(word: str, token_min_length: int) -> bool:
        """Return True if the word has at least token_min_length characters
        and contains at least one Unicode letter.

        The cache is keyed on (word, token_min_length) rather than on the
        analyzer instance, so it neither keeps instances alive nor returns
        stale answers when an instance's token_min_length differs or changes.
        """
        if len(word) < token_min_length:
            return False
        # Unicode general categories starting with "L" are letters.
        return any(unicodedata.category(char)[0] == "L" for char in word)

    def is_valid_token(self, word: str) -> bool:
        """Return True if the word is an acceptable token, i.e. it is at
        least token_min_length characters long and contains a letter."""
        return self._is_valid_token_cached(word, self.token_min_length)

    def tokenize_words(self, text: str, filter: bool = True) -> list[str]:
        """Tokenize a piece of text (e.g. a sentence) into words. If
        filter=True (default), only return valid tokens (e.g. not
        punctuation, numbers or very short words). Every returned word has
        been passed through _normalize_word."""

        import nltk.tokenize

        return [
            self._normalize_word(word)
            for word in nltk.tokenize.word_tokenize(text)
            if (not filter or self.is_valid_token(word))
        ]

    def _normalize_word(self, word):
        """Normalize (stem or lemmatize) a word form into a normal form.

        This base implementation does nothing and returns None; subclasses
        that rely on tokenize_words are expected to override it."""
        pass  # pragma: no cover


1			"""Common functionality for analyzers."""
2
3			from __future__ import annotations
4
5			import abc
6			import functools
7			import unicodedata
8
9			import annif
10
11			logger = annif.logger
12
13			_KEY_TOKEN_MIN_LENGTH = "token_min_length"
14			_NLTK_TOKENIZER_DATA = "punkt_tab"
15
16
17			class Analyzer(metaclass=abc.ABCMeta):
18			"""Base class for language-specific analyzers. Either tokenize_words or
19			_normalize_word must be overridden in subclasses. Other methods may be
20			overridden when necessary."""
21
22			name = None
23			token_min_length = 3 # default value, can be overridden in instances
24
25			@staticmethod
26			def is_available() -> bool:
27			"""Return True if the analyzer is available for use, False if not."""
28			return True # can be overridden in implementations if necessary
29
30			def __init__(self, **kwargs) -> None:
31			if _KEY_TOKEN_MIN_LENGTH in kwargs:
32			self.token_min_length = int(kwargs[_KEY_TOKEN_MIN_LENGTH])
33
34			import nltk.data
35
36			try:
37			nltk.data.find("tokenizers/" + _NLTK_TOKENIZER_DATA)
38			except LookupError as err:
39			logger.debug(str(err))
40			if _NLTK_TOKENIZER_DATA in str(err):
41			logger.warning(
42			f'NLTK datapackage "{_NLTK_TOKENIZER_DATA}" not found, '
43			"downloading it now."
44			)
45			nltk.download(_NLTK_TOKENIZER_DATA)
46			else:
47			raise
48
49			def tokenize_sentences(self, text: str) -> list[str]:
50			"""Tokenize a piece of text (e.g. a document) into sentences."""
51			import nltk.tokenize
52
53			return nltk.tokenize.sent_tokenize(text)
54
55			@functools.lru_cache(maxsize=50000)
56			def is_valid_token(self, word: str) -> bool:
57			"""Return True if the word is an acceptable token."""
58			if len(word) < self.token_min_length:
59			return False
60			for char in word:
61			category = unicodedata.category(char)
62			if category[0] == "L": # letter
63			return True
64			return False
65
66			def tokenize_words(self, text: str, filter: bool = True) -> list[str]:
67			"""Tokenize a piece of text (e.g. a sentence) into words. If
68			filter=True (default), only return valid tokens (e.g. not
69			punctuation, numbers or very short words)"""
70
71			import nltk.tokenize
72
73			return [
74			self._normalize_word(word)
75			for word in nltk.tokenize.word_tokenize(text)
76			if (not filter or self.is_valid_token(word))
77			]
78
79			def _normalize_word(self, word):
80			"""Normalize (stem or lemmatize) a word form into a normal form."""
81			pass # pragma: no cover
82

NatLibFi / Annif

Analyzer.tokenize_sentences() A last analyzed 2025-08-15 13:39 UTC

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like

Analyzer.tokenize_sentences() A
last analyzed 2025-08-15 13:39 UTC