annif.analyzer.analyzer.Analyzer._normalize_word()   A

Complexity
    Conditions: 1

Size
    Total Lines: 3
    Code Lines: 2

Duplication
    Lines: 0
    Ratio: 0 %

Importance
    Changes: 0

Metric   Value
cc       1
eloc     2
nop      2
dl       0
loc      3
rs       10
c        0
b        0
f        0
"""Common functionality for analyzers."""

from __future__ import annotations

import abc
import functools
import unicodedata

import annif

logger = annif.logger

_KEY_TOKEN_MIN_LENGTH = "token_min_length"
_NLTK_TOKENIZER_DATA = "punkt_tab"


class Analyzer(metaclass=abc.ABCMeta):
    """Base class for language-specific analyzers. Either tokenize_words or
    _normalize_word must be overridden in subclasses. Other methods may be
    overridden when necessary."""

    name = None
    token_min_length = 3  # default value, can be overridden in instances

    @staticmethod
    def is_available() -> bool:
        """Return True if the analyzer is available for use, False if not."""
        return True  # can be overridden in implementations if necessary

    def __init__(self, **kwargs) -> None:
        if _KEY_TOKEN_MIN_LENGTH in kwargs:
            self.token_min_length = int(kwargs[_KEY_TOKEN_MIN_LENGTH])

        import nltk.data

        try:
            nltk.data.find("tokenizers/" + _NLTK_TOKENIZER_DATA)
        except LookupError as err:
            logger.debug(str(err))
            if _NLTK_TOKENIZER_DATA in str(err):
                logger.warning(
                    f'NLTK datapackage "{_NLTK_TOKENIZER_DATA}" not found, '
                    "downloading it now."
                )
                nltk.download(_NLTK_TOKENIZER_DATA)
            else:
                raise

    def tokenize_sentences(self, text: str) -> list[str]:
        """Tokenize a piece of text (e.g. a document) into sentences."""
        import nltk.tokenize

        return nltk.tokenize.sent_tokenize(text)

    @functools.lru_cache(maxsize=50000)
    def is_valid_token(self, word: str) -> bool:
        """Return True if the word is an acceptable token."""
        if len(word) < self.token_min_length:
            return False
        for char in word:
            category = unicodedata.category(char)
            if category[0] == "L":  # letter
                return True
        return False

    def tokenize_words(self, text: str, filter: bool = True) -> list[str]:
        """Tokenize a piece of text (e.g. a sentence) into words. If
        filter=True (default), only return valid tokens (e.g. not
        punctuation, numbers or very short words)"""

        import nltk.tokenize

        return [
            self._normalize_word(word)
            for word in nltk.tokenize.word_tokenize(text)
            if (not filter or self.is_valid_token(word))
        ]

    def _normalize_word(self, word):
        """Normalize (stem or lemmatize) a word form into a normal form."""
        pass  # pragma: no cover
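
As the class docstring notes, a concrete analyzer only needs to override _normalize_word (or tokenize_words); everything else is inherited. The sketch below is a hypothetical minimal subclass, not part of Annif itself: it "normalizes" by simply lowercasing, and it shows how token_min_length is passed through __init__ as a keyword argument. Running it assumes NLTK is installed; the base __init__ downloads the punkt_tab tokenizer data if it is missing. The printed token lists are indicative, since they depend on NLTK's tokenizer output.

# Minimal sketch of a concrete analyzer (hypothetical, for illustration only).
from annif.analyzer.analyzer import Analyzer


class LowercaseAnalyzer(Analyzer):
    name = "lowercase"

    def _normalize_word(self, word):
        # Real analyzers stem or lemmatize here; this sketch just lowercases.
        return word.lower()


# token_min_length=4 overrides the class default of 3 via **kwargs.
analyzer = LowercaseAnalyzer(token_min_length=4)

text = "Analyzers split text into sentences and words."

print(analyzer.tokenize_sentences(text))
# ['Analyzers split text into sentences and words.']

print(analyzer.tokenize_words(text))
# Punctuation and tokens shorter than token_min_length are filtered out,
# e.g. ['analyzers', 'split', 'text', 'into', 'sentences', 'words']

print(analyzer.tokenize_words(text, filter=False))
# With filter=False every token is kept (still normalized), including '.' and 'and'.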