| Total Complexity | 6 |
| Total Lines | 50 |
| Duplicated Lines | 0 % |
| Changes | 0 | ||
| 1 | """Transformation filtering out parts of a text that are in a language |
||
| 2 | different from the language of the project.""" |
||
| 3 | |||
| 4 | from __future__ import annotations |
||
| 5 | |||
| 6 | from typing import TYPE_CHECKING |
||
| 7 | |||
| 8 | import annif |
||
| 9 | import annif.simplemma_util |
||
| 10 | |||
| 11 | from . import transform |
||
| 12 | |||
| 13 | if TYPE_CHECKING: |
||
| 14 | from annif.project import AnnifProject |
||
| 15 | |||
| 16 | logger = annif.logger |
||
| 17 | |||
| 18 | |||
class LangFilter(transform.BaseTransform):
    """Text transform that filters out sentences in a foreign language.

    Sentences are obtained from the project's analyzer and scored with a
    language detector created for the project's language. Only sentences
    scoring at least ``min_ratio`` are kept; short texts and short
    sentences bypass the check entirely.
    """

    name = "filter_lang"

    def __init__(
        self,
        project: AnnifProject,
        text_min_length: int | str = 500,
        sentence_min_length: int | str = 50,
        min_ratio: float = 0.5,
    ) -> None:
        """Store the thresholds and build the language detector.

        Args:
            project: Project whose language and analyzer are used.
            text_min_length: Texts shorter than this many characters are
                returned unchanged. Coerced with ``int()``.
            sentence_min_length: Sentences shorter than this many
                characters are kept without language detection. Coerced
                with ``int()``.
            min_ratio: Lowest detector proportion at which a sentence is
                still kept. Coerced with ``float()``.
        """
        super().__init__(project)
        # Explicit coercion lets string values be passed for the thresholds.
        self.text_min_length = int(text_min_length)
        self.sentence_min_length = int(sentence_min_length)
        self.min_ratio = float(min_ratio)
        detector = annif.simplemma_util.get_language_detector(self.project.language)
        self.language_detector = detector

    def transform_text(self, text: str) -> str:
        """Return ``text`` with the filtered-out sentences removed.

        A text shorter than ``text_min_length`` is returned as-is.
        Otherwise the retained sentences are joined with single spaces.
        """
        if len(text) < self.text_min_length:
            return text

        def is_retained(sentence):
            # Short sentences are always kept; the detector is not consulted.
            if len(sentence) < self.sentence_min_length:
                return True
            ratio = self.language_detector.proportion_in_target_languages(sentence)
            return ratio >= self.min_ratio

        sentences = self.project.analyzer.tokenize_sentences(text)
        return " ".join([sentence for sentence in sentences if is_retained(sentence)])
||
| 50 |