Passed
Pull Request — master (#468), created by unknown, 01:47

annif.analyzer.analyzer.Analyzer.__init__() (rating: A)

Complexity:  Conditions 1
Size:        Total Lines 2, Code Lines 2
Duplication: Lines 0, Ratio 0 %
Importance:  Changes 0
Metric  Value
cc      1
eloc    2
nop     2
dl      0
loc     2
rs      10
c       0
b       0
f       0
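
The function under review is the two-line constructor shown in the listing below: it reads an optional token_min_length keyword argument and falls back to 3. A minimal sketch of that behaviour (the DummyAnalyzer subclass is hypothetical, defined here only because Analyzer itself is abstract):

from annif.analyzer.analyzer import Analyzer

class DummyAnalyzer(Analyzer):
    """Hypothetical subclass for illustration; only normalize_word
    must be provided to make the class instantiable."""
    name = 'dummy'

    def normalize_word(self, word):
        return word.lower()

a = DummyAnalyzer()                    # token_min_length defaults to 3
b = DummyAnalyzer(token_min_length=5)  # overridden via **kwargs
print(a.token_min_length, b.token_min_length)  # -> 3 5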
"""Common functionality for analyzers."""

import abc
import functools
import unicodedata
import nltk.tokenize

_KEY_TOKEN_MIN_LENGTH = 'token_min_length'


class Analyzer(metaclass=abc.ABCMeta):
    """Base class for language-specific analyzers. The non-implemented
    methods should be overridden in subclasses. Tokenize functions may
    be overridden when necessary."""

    name = None

    def __init__(self, **kwargs):
        self.token_min_length = int(kwargs.get(_KEY_TOKEN_MIN_LENGTH, 3))

    def tokenize_sentences(self, text):
        """Tokenize a piece of text (e.g. a document) into sentences."""
        return nltk.tokenize.sent_tokenize(text)

    @functools.lru_cache(maxsize=50000)
    def is_valid_token(self, word):
        """Return True if the word is an acceptable token."""
        if len(word) < self.token_min_length:
            return False
        for char in word:
            category = unicodedata.category(char)
            if category[0] == 'L':  # letter
                return True
        return False

    def tokenize_words(self, text):
        """Tokenize a piece of text (e.g. a sentence) into words."""
        return [self.normalize_word(word)
                for word in nltk.tokenize.word_tokenize(text)
                if self.is_valid_token(word)]

    @abc.abstractmethod
    def normalize_word(self, word):
        """Normalize (stem or lemmatize) a word form into a normal form."""
        pass  # pragma: no cover
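
As the class docstring notes, subclasses only need to provide normalize_word; the tokenize helpers then work as-is. Continuing the hypothetical DummyAnalyzer sketch from above (sent_tokenize and word_tokenize require the NLTK 'punkt' tokenizer data, e.g. via nltk.download('punkt')):

text = "Annif is a tool for automated subject indexing. It uses NLTK."
print(a.tokenize_sentences(text))
# ['Annif is a tool for automated subject indexing.', 'It uses NLTK.']

print(a.tokenize_words("Annif uses NLTK, version 3!"))
# ['annif', 'uses', 'nltk', 'version']
# ',', '3' and '!' are dropped by is_valid_token: they are shorter than
# token_min_length, and punctuation contains no letter-category character.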