Passed
Pull Request — main (#818)
by Osma
05:35 queued 02:32
created

EstNLTKAnalyzer.tokenize_words()   A

Complexity

Conditions 1

Size

Total Lines 9
Code Lines 8

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 1
eloc 8
nop 3
dl 0
loc 9
rs 10
c 0
b 0
f 0
1
"""EstNLTK analyzer for Annif which uses EstNLTK for lemmatization"""
2
3
from __future__ import annotations
4
5
from . import analyzer
6
7
8
class EstNLTKAnalyzer(analyzer.Analyzer):
    """Annif analyzer that lemmatizes text using the EstNLTK library."""

    # Identifier under which this analyzer is registered.
    name = "estnltk"

    def __init__(self, param: str, **kwargs) -> None:
        """Store the analyzer parameter string and initialize the base class."""
        self.param = param
        super().__init__(**kwargs)

    def tokenize_words(self, text: str, filter: bool = True) -> list[str]:
        """Return the lemma of each word in *text*.

        When *filter* is true, lemmas rejected by ``is_valid_token`` are
        omitted from the result.
        """
        # Imported lazily so the estnltk dependency is only required when
        # this analyzer is actually used.
        import estnltk

        analysed = estnltk.Text(text.strip())
        analysed.tag_layer()

        result = []
        for lemmas in analysed.lemma:
            # Each word may have several lemma candidates; keep the first.
            candidate = lemmas[0]
            if filter and not self.is_valid_token(candidate):
                continue
            result.append(candidate)
        return result
25