| 1 | """Common functionality for analyzers.""" |
||
| 2 | |||
| 3 | import abc |
||
| 4 | import functools |
||
| 5 | import unicodedata |
||
| 6 | import nltk.tokenize |
||
| 7 | |||
| 8 | _KEY_TOKEN_MIN_LENGTH = 'token_min_length' |
||
| 9 | |||
| 10 | |||
class Analyzer(metaclass=abc.ABCMeta):
    """Base class for language-specific analyzers. Abstract methods
    must be implemented in subclasses; the tokenize methods may be
    overridden when necessary."""

    name = None

    def __init__(self, **kwargs):
        self.token_min_length = int(kwargs.get(_KEY_TOKEN_MIN_LENGTH, 3))

    def tokenize_sentences(self, text):
        """Tokenize a piece of text (e.g. a document) into sentences."""
        return nltk.tokenize.sent_tokenize(text)

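    # The lru_cache below keys on the (self, word) pair, so each distinct
    # word form is validated at most once per analyzer instance (up to
    # maxsize cached results).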
    @functools.lru_cache(maxsize=50000)
    def is_valid_token(self, word):
        """Return True if the word is an acceptable token."""
        if len(word) < self.token_min_length:
            return False
        for char in word:
            category = unicodedata.category(char)
            if category[0] == 'L':  # letter
                return True
        return False

    def tokenize_words(self, text):
        """Tokenize a piece of text (e.g. a sentence) into words."""
        return [self.normalize_word(word)
                for word in nltk.tokenize.word_tokenize(text)
                if self.is_valid_token(word)]

    @abc.abstractmethod
    def normalize_word(self, word):
        """Normalize (stem or lemmatize) a word form into a normal form."""
        pass  # pragma: no cover
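
# The sketch below is illustrative and not part of the original module: a
# minimal concrete subclass of Analyzer. The class name 'EnglishAnalyzer',
# the analyzer name 'english', and the choice of NLTK's Snowball stemmer
# are assumptions for demonstration; real subclasses may use any stemmer
# or lemmatizer to implement normalize_word.

import nltk.stem.snowball


class EnglishAnalyzer(Analyzer):
    """Example analyzer that stems English words with the Snowball stemmer."""

    name = 'english'

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.stemmer = nltk.stem.snowball.SnowballStemmer('english')

    def normalize_word(self, word):
        """Stem a word form into its Snowball stem."""
        return self.stemmer.stem(word.lower())


# Example usage (requires the NLTK 'punkt' tokenizer data to be installed):
#     analyzer = EnglishAnalyzer(token_min_length=3)
#     analyzer.tokenize_words('The cats were running quickly')
#     # -> ['the', 'cat', 'were', 'run', 'quick']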