Total Complexity | 6 |
Total Lines | 50 |
Duplicated Lines | 0 % |
Changes | 0 |
1 | """Transformation filtering out parts of a text that are in a language |
||
2 | different from the language of the project.""" |
||
3 | |||
4 | from __future__ import annotations |
||
5 | |||
6 | from typing import TYPE_CHECKING |
||
7 | |||
8 | import annif |
||
9 | import annif.simplemma_util |
||
10 | |||
11 | from . import transform |
||
12 | |||
13 | if TYPE_CHECKING: |
||
14 | from annif.project import AnnifProject |
||
15 | |||
16 | logger = annif.logger |
||
17 | |||
18 | |||
class LangFilter(transform.BaseTransform):
    """Transform that removes sentences judged not to be in the project language.

    Texts shorter than ``text_min_length`` characters are returned untouched.
    For longer texts, each sentence shorter than ``sentence_min_length``
    characters is always kept; every other sentence is kept only when the
    proportion reported by the language detector is at least ``min_ratio``.
    """

    name = "filter_lang"

    def __init__(
        self,
        project: AnnifProject,
        text_min_length: int | str = 500,
        sentence_min_length: int | str = 50,
        min_ratio: float = 0.5,
    ) -> None:
        """Store the thresholds and create the language detector.

        Args:
            project: Project this transform belongs to; handed to the base
                class initializer.
            text_min_length: Minimum text length (in characters) for any
                filtering to take place. Coerced with ``int()``.
            sentence_min_length: Minimum sentence length (in characters) for
                a sentence to be checked at all. Coerced with ``int()``.
            min_ratio: Lowest detector proportion at which a checked sentence
                is still retained. Coerced with ``float()``.
        """
        super().__init__(project)

        # Thresholds are coerced explicitly, so string values are accepted too.
        self.text_min_length = int(text_min_length)
        self.sentence_min_length = int(sentence_min_length)
        self.min_ratio = float(min_ratio)

        # The detector is built once, for the language of the project.
        project_language = self.project.language
        self.language_detector = annif.simplemma_util.get_language_detector(
            project_language
        )

    def transform_text(self, text: str) -> str:
        """Return ``text`` with the sentences that fail the language check removed.

        Args:
            text: The text to filter.

        Returns:
            ``text`` itself when it is shorter than ``text_min_length``;
            otherwise the retained sentences joined with single spaces.
        """
        if len(text) < self.text_min_length:
            return text

        def is_retained(sentence: str) -> bool:
            # Short sentences are kept without consulting the detector.
            if len(sentence) < self.sentence_min_length:
                return True
            ratio = self.language_detector.proportion_in_target_languages(sentence)
            return ratio >= self.min_ratio

        sentences = self.project.analyzer.tokenize_sentences(text)
        kept = [sentence for sentence in sentences if is_retained(sentence)]
        return " ".join(kept)
||
50 |