Passed: Pull Request against master (#527) by Osma, created 02:34

annif.analyzer.analyzer.Analyzer._normalize_word() — rated A

Complexity:   Conditions 1
Size:         Total Lines 3, Code Lines 2
Duplication:  Lines 3, Ratio 100 % (this code appears to be duplicated elsewhere in the project; 0 ignored issues)
Importance:   Changes 0

Metric   Value
cc       1
eloc     2
nop      2
dl       3
loc      3
rs       10
c        0
b        0
f        0
"""Common functionality for analyzers."""

import abc
import functools
import unicodedata

_KEY_TOKEN_MIN_LENGTH = 'token_min_length'


class Analyzer(metaclass=abc.ABCMeta):
    """Base class for language-specific analyzers. Either tokenize_words or
12
    _normalize_word must be overridden in subclasses. Other methods may be
13
    overridden when necessary."""
14
15
    name = None
16
    token_min_length = 3  # default value, can be overridden in instances
17
18
    def __init__(self, **kwargs):
19
        if _KEY_TOKEN_MIN_LENGTH in kwargs:
20
            self.token_min_length = int(kwargs[_KEY_TOKEN_MIN_LENGTH])
21
22
    def tokenize_sentences(self, text):
23
        """Tokenize a piece of text (e.g. a document) into sentences."""
24
        import nltk.tokenize
25
        return nltk.tokenize.sent_tokenize(text)
26
27
    @functools.lru_cache(maxsize=50000)
28
    def is_valid_token(self, word):
29
        """Return True if the word is an acceptable token."""
30
        if len(word) < self.token_min_length:
31
            return False
32
        for char in word:
33
            category = unicodedata.category(char)
34
            if category[0] == 'L':  # letter
35
                return True
36
        return False
37
38
    def tokenize_words(self, text, filter=True):
39
        """Tokenize a piece of text (e.g. a sentence) into words. If
40
        filter=True (default), only return valid tokens (e.g. not
41
        punctuation, numbers or very short words)"""
42
43
        import nltk.tokenize
44
        return [self._normalize_word(word)
45
                for word in nltk.tokenize.word_tokenize(text)
46
                if (not filter or self.is_valid_token(word))]
47
48
    def _normalize_word(self, word):
49
        """Normalize (stem or lemmatize) a word form into a normal form."""
50
        pass  # pragma: no cover
51
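
The class docstring above requires subclasses to override either tokenize_words or _normalize_word. As a rough sketch of that contract only (the LowercaseAnalyzer below is a hypothetical example, not one of Annif's real analyzers), the following overrides _normalize_word with plain case folding and passes token_min_length through the keyword-argument handling in __init__; it assumes annif and NLTK with the punkt tokenizer data are installed.

import annif.analyzer.analyzer


class LowercaseAnalyzer(annif.analyzer.analyzer.Analyzer):
    """Toy analyzer that normalizes word forms by simple lowercasing."""

    name = "lowercase"  # hypothetical name, for illustration only

    def _normalize_word(self, word):
        # no stemming or lemmatization, just case folding
        return word.lower()


analyzer = LowercaseAnalyzer(token_min_length=4)  # overrides the default of 3
print(analyzer.tokenize_words("Tokenize this sentence, please!"))
# prints something like ['tokenize', 'this', 'sentence', 'please']

Note that is_valid_token is applied to the raw surface form before _normalize_word runs, so punctuation and tokens shorter than token_min_length are filtered out first; the functools.lru_cache on is_valid_token simply memoizes those validity checks per (analyzer, word) pair.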