| Total Complexity | 2 |
| Total Lines | 24 |
| Duplicated Lines | 0 % |
| Changes | 0 | ||
| 1 | """EstNLTK analyzer for Annif which uses EstNLTK for lemmatization""" |
||
| 2 | |||
| 3 | from __future__ import annotations |
||
| 4 | |||
| 5 | from . import analyzer |
||
| 6 | |||
| 7 | |||
class EstNLTKAnalyzer(analyzer.Analyzer):
    """Analyzer that uses EstNLTK to turn text into lemmas."""

    name = "estnltk"

    def __init__(self, param: str, **kwargs) -> None:
        """Keep ``param`` on the instance and pass ``kwargs`` to the base class."""
        self.param = param
        super().__init__(**kwargs)

    def tokenize_words(self, text: str, filter: bool = True) -> list[str]:
        """Return lemmas extracted from ``text`` by EstNLTK.

        The text is stripped of surrounding whitespace, wrapped in an
        ``estnltk.Text`` and tagged; the first item of every entry in the
        resulting ``lemma`` attribute is taken as that entry's lemma.

        When ``filter`` is true, only lemmas accepted by
        ``self.is_valid_token`` are returned; otherwise all of them are.
        """
        # Function-scope import: estnltk is loaded when this method is
        # called, not when the module is imported.
        import estnltk

        document = estnltk.Text(text.strip())
        document.tag_layer()

        # Each entry of document.lemma is indexable; only item 0 is used.
        first_lemmas = [candidates[0] for candidates in document.lemma]
        if not filter:
            return first_lemmas
        return [lemma for lemma in first_lemmas if self.is_valid_token(lemma)]
||
| 25 |