Total Complexity | 3
Total Lines | 31
Duplicated Lines | 0 %
Changes | 0
1 | """EstNLTK analyzer for Annif which uses EstNLTK for lemmatization""" |
||
2 | |||
3 | from __future__ import annotations |
||
4 | |||
5 | import importlib |
||
6 | |||
7 | from . import analyzer |
||
8 | |||
9 | |||
10 | class EstNLTKAnalyzer(analyzer.Analyzer): |
||
11 | name = "estnltk" |
||
12 | |||
13 | @staticmethod |
||
14 | def is_available() -> bool: |
||
15 | # return True iff EstNLTK is installed |
||
16 | return importlib.util.find_spec("estnltk") is not None |
||
17 | |||
18 | def __init__(self, param: str, **kwargs) -> None: |
||
19 | self.param = param |
||
20 | super().__init__(**kwargs) |
||
21 | |||
22 | def tokenize_words(self, text: str, filter: bool = True) -> list[str]: |
||
23 | import estnltk |
||
24 | |||
25 | txt = estnltk.Text(text.strip()) |
||
26 | txt.tag_layer() |
||
27 | return [ |
||
28 | lemma |
||
29 | for lemma in [lemmas[0] for lemmas in txt.lemma] |
||
30 | if (not filter or self.is_valid_token(lemma)) |
||
31 | ] |
||
32 |
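
For context, a minimal usage sketch follows. It assumes Annif and EstNLTK are both installed and that this module is importable as annif.analyzer.estnltk (an assumption based on the relative `from . import analyzer`); the empty `param` value and the sample sentence are illustrative only.

# Minimal usage sketch (not part of the file above); the import path and the
# empty param value are assumptions for illustration.
from annif.analyzer.estnltk import EstNLTKAnalyzer

if EstNLTKAnalyzer.is_available():
    est_analyzer = EstNLTKAnalyzer(param="")
    # tokenize_words lemmatizes the Estonian input and, with filter=True
    # (the default), drops tokens rejected by the base class's is_valid_token
    lemmas = est_analyzer.tokenize_words("Eesti keele lemmatiseerimine on lihtne.")
    print(lemmas)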