annif.analyzer.analyzer.Analyzer._normalize_word() - Code Metrics - Inspection of "Merge pull request #527 from NatLibFi/issue374-spa..." - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( cf5fd0...1b4762 )

by Osma

created 2022-01-24 08:05 UTC

annif.analyzer.analyzer.Analyzer._normalize_word() A

↳ Parent: annif.analyzer.analyzer

Complexity

Conditions

Size

Total Lines	3
Code Lines	2

Duplication

Lines	3
Ratio	100 %

Importance

Changes

Metric	Value
cc	1
eloc	2
nop	2
dl	3
loc	3
rs	10
c	0
b	0
f	0

"""Common functionality for analyzers."""

import abc
import functools
import unicodedata

_KEY_TOKEN_MIN_LENGTH = 'token_min_length'


class Analyzer(metaclass=abc.ABCMeta):
    """Base class for language-specific analyzers.

    Either tokenize_words or _normalize_word must be overridden in
    subclasses. Other methods may be overridden when necessary.
    """

    name = None
    token_min_length = 3  # default value, can be overridden in instances

    def __init__(self, **kwargs):
        """Set up the analyzer from keyword parameters.

        The only parameter examined here is 'token_min_length'; when it is
        present its value is converted with int() and stored on the
        instance, shadowing the class-level default. Any other keyword
        arguments are ignored by this base implementation.
        """
        if _KEY_TOKEN_MIN_LENGTH in kwargs:
            self.token_min_length = int(kwargs[_KEY_TOKEN_MIN_LENGTH])

    def tokenize_sentences(self, text):
        """Tokenize a piece of text (e.g. a document) into sentences.

        Returns the result of nltk.tokenize.sent_tokenize(text).
        """
        import nltk.tokenize  # imported locally, as in tokenize_words
        return nltk.tokenize.sent_tokenize(text)

    def is_valid_token(self, word):
        """Return True if the word is an acceptable token.

        A word is acceptable when it has at least self.token_min_length
        characters and at least one of them is a letter (a Unicode general
        category starting with 'L').
        """
        # The minimum length is passed explicitly so that it is part of the
        # cache key: changing token_min_length on an instance can never
        # yield a stale cached answer.
        return self._is_valid_token_cached(word, self.token_min_length)

    @staticmethod
    @functools.lru_cache(maxsize=50000)
    def _is_valid_token_cached(word, min_length):
        """Memoized implementation of is_valid_token.

        Kept as a static method keyed on (word, min_length) rather than an
        lru_cache-decorated instance method, so the cache does not hold
        references to analyzer instances.
        """
        if len(word) < min_length:
            return False
        return any(unicodedata.category(char).startswith('L')
                   for char in word)

    def tokenize_words(self, text, filter=True):
        """Tokenize a piece of text (e.g. a sentence) into words.

        If filter=True (default), only return valid tokens (e.g. not
        punctuation, numbers or very short words). Validity is checked on
        the raw word as produced by nltk.tokenize.word_tokenize; the
        returned list contains the words after _normalize_word has been
        applied to them.
        """
        import nltk.tokenize
        return [self._normalize_word(word)
                for word in nltk.tokenize.word_tokenize(text)
                if (not filter or self.is_valid_token(word))]

    def _normalize_word(self, word):
        """Normalize (stem or lemmatize) a word form into a normal form.

        The base implementation does nothing and returns None; subclasses
        that rely on the default tokenize_words are expected to override it.
        """
        pass  # pragma: no cover


1		"""Common functionality for analyzers."""
2
3		import abc
4		import functools
5		import unicodedata
6
7		_KEY_TOKEN_MIN_LENGTH = 'token_min_length'
8
9
10	View Code Duplication	class Analyzer(metaclass=abc.ABCMeta):
		0 ignored issues – show Duplication introduced 2022-01-18 08:57 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
11		"""Base class for language-specific analyzers. Either tokenize_words or
12		_normalize_word must be overridden in subclasses. Other methods may be
13		overridden when necessary."""
14
15		name = None
16		token_min_length = 3 # default value, can be overridden in instances
17
18		def __init__(self, **kwargs):
19		if _KEY_TOKEN_MIN_LENGTH in kwargs:
20		self.token_min_length = int(kwargs[_KEY_TOKEN_MIN_LENGTH])
21
22		def tokenize_sentences(self, text):
23		"""Tokenize a piece of text (e.g. a document) into sentences."""
24		import nltk.tokenize
25		return nltk.tokenize.sent_tokenize(text)
26
27		@functools.lru_cache(maxsize=50000)
28		def is_valid_token(self, word):
29		"""Return True if the word is an acceptable token."""
30		if len(word) < self.token_min_length:
31		return False
32		for char in word:
33		category = unicodedata.category(char)
34		if category[0] == 'L': # letter
35		return True
36		return False
37
38		def tokenize_words(self, text, filter=True):
39		"""Tokenize a piece of text (e.g. a sentence) into words. If
40		filter=True (default), only return valid tokens (e.g. not
41		punctuation, numbers or very short words)"""
42
43		import nltk.tokenize
44		return [self._normalize_word(word)
45		for word in nltk.tokenize.word_tokenize(text)
46		if (not filter or self.is_valid_token(word))]
47
48		def _normalize_word(self, word):
49		"""Normalize (stem or lemmatize) a word form into a normal form."""
50		pass # pragma: no cover
51

NatLibFi / Annif

Push — master ( cf5fd0...1b4762 )

annif.analyzer.analyzer.Analyzer._normalize_word() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like