Passed
Pull Request — master (#468), created by unknown, 01:47

annif.analyzer.analyzer.Analyzer.__init__() (rating: A)

Complexity:  Conditions 1
Size:        Total Lines 2, Code Lines 2
Duplication: Lines 0, Ratio 0 %
Importance:  Changes 0
Metric  Value
cc      1
eloc    2
nop     2
dl      0
loc     2
rs      10
c       0
b       0
f       0
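
The function under review is the two-line constructor shown in the listing below: it reads an optional token_min_length keyword argument and falls back to 3. A minimal sketch of that behaviour (the DummyAnalyzer subclass is hypothetical, defined here only because Analyzer itself is abstract):

from annif.analyzer.analyzer import Analyzer

class DummyAnalyzer(Analyzer):
    """Hypothetical subclass for illustration; only normalize_word
    must be provided to make the class instantiable."""
    name = 'dummy'

    def normalize_word(self, word):
        return word.lower()

a = DummyAnalyzer()                    # token_min_length defaults to 3
b = DummyAnalyzer(token_min_length=5)  # overridden via **kwargs
print(a.token_min_length, b.token_min_length)  # -> 3 5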
"""Common functionality for analyzers."""

import abc
import functools
import unicodedata
import nltk.tokenize

_KEY_TOKEN_MIN_LENGTH = 'token_min_length'


class Analyzer(metaclass=abc.ABCMeta):
    """Base class for language-specific analyzers. The non-implemented
    methods should be overridden in subclasses. Tokenize functions may
    be overridden when necessary."""

    name = None

    def __init__(self, **kwargs):
        self.token_min_length = int(kwargs.get(_KEY_TOKEN_MIN_LENGTH, 3))

    def tokenize_sentences(self, text):
        """Tokenize a piece of text (e.g. a document) into sentences."""
        return nltk.tokenize.sent_tokenize(text)

    @functools.lru_cache(maxsize=50000)
    def is_valid_token(self, word):
        """Return True if the word is an acceptable token."""
        if len(word) < self.token_min_length:
            return False
        for char in word:
            category = unicodedata.category(char)
            if category[0] == 'L':  # letter
                return True
        return False

    def tokenize_words(self, text):
        """Tokenize a piece of text (e.g. a sentence) into words."""
        return [self.normalize_word(word)
                for word in nltk.tokenize.word_tokenize(text)
                if self.is_valid_token(word)]

    @abc.abstractmethod
    def normalize_word(self, word):
        """Normalize (stem or lemmatize) a word form into a normal form."""
        pass  # pragma: no cover
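
As the class docstring notes, subclasses only need to provide normalize_word; the tokenize helpers then work as-is. Continuing the hypothetical DummyAnalyzer sketch from above (sent_tokenize and word_tokenize require the NLTK 'punkt' tokenizer data, e.g. via nltk.download('punkt')):

text = "Annif is a tool for automated subject indexing. It uses NLTK."
print(a.tokenize_sentences(text))
# ['Annif is a tool for automated subject indexing.', 'It uses NLTK.']

print(a.tokenize_words("Annif uses NLTK, version 3!"))
# ['annif', 'uses', 'nltk', 'version']
# ',', '3' and '!' are dropped by is_valid_token: they are shorter than
# token_min_length, and punctuation contains no letter-category character.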