annif.transform.langfilter   A
last analyzed

Complexity

Total Complexity 6

Size/Duplication

Total Lines 50
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 6
eloc 35
dl 0
loc 50
rs 10
c 0
b 0
f 0

2 Methods

Rating   Name   Duplication   Size   Complexity  
A LangFilter.__init__() 0 13 1
A LangFilter.transform_text() 0 13 5
1
"""Transformation filtering out parts of a text that are in a language
2
different from the language of the project."""
3
4
from __future__ import annotations
5
6
from typing import TYPE_CHECKING
7
8
import annif
9
import annif.simplemma_util
10
11
from . import transform
12
13
if TYPE_CHECKING:
14
    from annif.project import AnnifProject
15
16
# Module-level alias for the logger object exposed by the annif package.
logger = annif.logger
17
18
19
class LangFilter(transform.BaseTransform):
    """Text transform that keeps only sentences judged to be in the
    project's language.

    Texts shorter than ``text_min_length`` are passed through untouched, and
    sentences shorter than ``sentence_min_length`` are always kept. Every
    other sentence is kept only when the language detector reports a
    proportion of at least ``min_ratio`` for the target language.
    """

    name = "filter_lang"

    def __init__(
        self,
        project: AnnifProject,
        text_min_length: int | str = 500,
        sentence_min_length: int | str = 50,
        min_ratio: float = 0.5,
    ) -> None:
        """Store the filtering thresholds and build the language detector.

        The thresholds are coerced with ``int()`` / ``float()``, so string
        values are accepted as well as numbers.

        Args:
            project: Project this transform belongs to; handed to the base
                class initializer. Its ``language`` selects the detector.
            text_min_length: Minimum text length (in characters) for any
                filtering to take place.
            sentence_min_length: Minimum sentence length (in characters) for
                a sentence to be language-checked.
            min_ratio: Lowest detector proportion at which a checked
                sentence is still retained.
        """
        super().__init__(project)
        self.text_min_length = int(text_min_length)
        self.sentence_min_length = int(sentence_min_length)
        self.min_ratio = float(min_ratio)
        # NOTE(review): self.project is presumably set by the base class
        # initializer called above - confirm in transform.BaseTransform.
        project_language = self.project.language
        self.language_detector = annif.simplemma_util.get_language_detector(
            project_language
        )

    def transform_text(self, text: str) -> str:
        """Return ``text`` with sentences in other languages removed.

        Args:
            text: The input text.

        Returns:
            ``text`` unchanged when it is shorter than ``text_min_length``;
            otherwise the retained sentences joined with single spaces.
        """
        if len(text) < self.text_min_length:
            return text

        # The `or` short-circuits, so the detector is consulted only for
        # sentences that reach the minimum length; shorter ones are kept
        # without a language check.
        kept = [
            sentence
            for sentence in self.project.analyzer.tokenize_sentences(text)
            if len(sentence) < self.sentence_min_length
            or self.language_detector.proportion_in_target_languages(sentence)
            >= self.min_ratio
        ]
        return " ".join(kept)
50