annif.simplemma_util.detect_language()   A
last analyzed

Complexity

Conditions 2

Size

Total Lines 4
Code Lines 4

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
eloc 4
nop 2
dl 0
loc 4
rs 10
c 0
b 0
f 0
1
"""Wrapper code for using Simplemma functionality in Annif"""
2
3
from typing import Dict, Tuple, Union
4
5
from simplemma import LanguageDetector, Lemmatizer
6
from simplemma.strategies import DefaultStrategy
7
from simplemma.strategies.dictionaries import DefaultDictionaryFactory
8
9
# Upper bound on how many language dictionaries are kept in memory at once;
# passed as cache_max_size to the shared dictionary factory below.
LANG_CACHE_SIZE = 5
10
11
# Module-wide dictionary factory whose cache size is capped at LANG_CACHE_SIZE.
_dictionary_factory = DefaultDictionaryFactory(cache_max_size=LANG_CACHE_SIZE)
12
# Single lemmatization strategy built on the shared dictionary factory. It is
# handed both to the module-level lemmatizer and to every LanguageDetector
# created by get_language_detector(), so they all go through the same factory.
_lemmatization_strategy = DefaultStrategy(dictionary_factory=_dictionary_factory)
13
# Module-level Lemmatizer instance configured with the shared strategy.
# NOTE(review): public name, presumably imported by other Annif modules - confirm.
lemmatizer = Lemmatizer(lemmatization_strategy=_lemmatization_strategy)
14
15
16
def get_language_detector(lang: Union[str, Tuple[str, ...]]) -> LanguageDetector:
    """Build a Simplemma LanguageDetector for the given language(s).

    ``lang`` is either a single language identifier or a tuple of them and is
    forwarded to ``LanguageDetector`` unchanged. The detector is configured
    with the module-level lemmatization strategy rather than a new one.
    A new detector object is constructed on every call.
    """
    detector = LanguageDetector(
        lang,
        lemmatization_strategy=_lemmatization_strategy,
    )
    return detector
18
19
20
def detect_language(text: str, languages: Tuple[str, ...]) -> Dict[str, float]:
    """Return the detector's per-language proportions for ``text``, highest first.

    A detector for ``languages`` is obtained from ``get_language_detector`` and
    asked for ``proportion_in_each_language(text)``. The resulting mapping is
    rebuilt as a dict whose insertion order runs from the largest proportion
    to the smallest; entries with equal proportions keep the relative order
    in which the detector reported them.
    """
    scores = get_language_detector(languages).proportion_in_each_language(text)
    # Sorting is stable, also with reverse=True, so ties retain their order.
    ranked_keys = sorted(scores, key=scores.get, reverse=True)
    return {key: scores[key] for key in ranked_keys}
24