Passed
Pull Request — master (#468)
by
unknown
03:09
created

annif.analyzer.analyzer   A

Complexity

Total Complexity 8

Size/Duplication

Total Lines 46
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 28
dl 0
loc 46
rs 10
c 0
b 0
f 0
wmc 8

5 Methods

Rating   Name   Duplication   Size   Complexity  
A Analyzer.tokenize_words() 0 5 1
A Analyzer.is_valid_token() 0 10 4
A Analyzer.normalize_word() 0 4 1
A Analyzer.__init__() 0 2 1
A Analyzer.tokenize_sentences() 0 3 1
1
"""Common functionality for analyzers."""
2
3
import abc
4
import functools
5
import unicodedata
6
import nltk.tokenize
7
8
# Name of the keyword argument from which Analyzer.__init__ reads the
# minimum token length.
_KEY_TOKEN_MIN_LENGTH = 'token_min_length'
9
10
11
class Analyzer(metaclass=abc.ABCMeta):
    """Base class for language-specific analyzers. The non-implemented
    methods should be overridden in subclasses. Tokenize functions may
    be overridden when necessary."""

    name = None

    def __init__(self, **kwargs):
        """Set the minimum token length from the keyword arguments,
        using 3 when the setting is not given."""
        self.token_min_length = int(kwargs.get(_KEY_TOKEN_MIN_LENGTH, 3))

    def tokenize_sentences(self, text):
        """Tokenize a piece of text (e.g. a document) into sentences."""
        return nltk.tokenize.sent_tokenize(text)

    @staticmethod
    @functools.lru_cache(maxsize=50000)
    def _contains_letter(word):
        """Return True if at least one character of the word is a letter."""
        # The cache lives on a static helper keyed on the word alone, so it
        # never holds a reference to an analyzer instance and its entries do
        # not depend on any per-instance setting.
        # Unicode general categories starting with 'L' are the letter ones.
        return any(unicodedata.category(char).startswith('L')
                   for char in word)

    def is_valid_token(self, word):
        """Return True if the word is an acceptable token, i.e. it is at
        least token_min_length characters long and contains a letter."""
        # The length check is cheap and depends on instance state, so it is
        # evaluated on every call instead of being cached.
        if len(word) < self.token_min_length:
            return False
        return self._contains_letter(word)

    def tokenize_words(self, text):
        """Tokenize a piece of text (e.g. a sentence) into words. Only
        valid tokens are kept, and each of them is normalized."""
        return [self.normalize_word(word)
                for word in nltk.tokenize.word_tokenize(text)
                if self.is_valid_token(word)]

    @abc.abstractmethod
    def normalize_word(self, word):
        """Normalize (stem or lemmatize) a word form into a normal form."""
        pass  # pragma: no cover
46