Total Complexity | 3 |
Total Lines | 24 |
Duplicated Lines | 0 % |
Changes | 0 |
1 | """Wrapper code for using Simplemma functionality in Annif""" |
||
2 | |||
3 | from typing import Dict, Tuple, Union |
||
4 | |||
5 | from simplemma import LanguageDetector, Lemmatizer |
||
6 | from simplemma.strategies import DefaultStrategy |
||
7 | from simplemma.strategies.dictionaries import DefaultDictionaryFactory |
||
8 | |||
# Upper bound on how many language dictionaries are kept in memory at once.
LANG_CACHE_SIZE = 5

# One dictionary factory and one strategy are created at import time and
# shared: the same strategy object is handed to the module-level lemmatizer
# and to every detector built by get_language_detector().
_dictionary_factory = DefaultDictionaryFactory(
    cache_max_size=LANG_CACHE_SIZE,
)
_lemmatization_strategy = DefaultStrategy(
    dictionary_factory=_dictionary_factory,
)
lemmatizer = Lemmatizer(
    lemmatization_strategy=_lemmatization_strategy,
)
||
14 | |||
15 | |||
def get_language_detector(lang: Union[str, Tuple[str, ...]]) -> LanguageDetector:
    """Build a Simplemma language detector for the given language(s).

    Args:
        lang: A single language code or a tuple of language codes, passed
            through unchanged to ``LanguageDetector``.

    Returns:
        A new ``LanguageDetector`` configured with this module's shared
        lemmatization strategy.
    """
    detector = LanguageDetector(
        lang,
        lemmatization_strategy=_lemmatization_strategy,
    )
    return detector
||
18 | |||
19 | |||
def detect_language(text: str, languages: Tuple[str, ...]) -> Dict[str, float]:
    """Report the detector's per-language proportions for a text, highest first.

    Args:
        text: The text to analyse.
        languages: The candidate language codes handed to the detector.

    Returns:
        A dict mapping each key reported by the detector to its proportion,
        with entries inserted in descending order of proportion. Entries
        with equal proportions keep the order the detector returned them in.
    """

    def by_proportion(entry):
        # Each entry is a (key, proportion) pair; rank on the proportion.
        return entry[1]

    scores = get_language_detector(languages).proportion_in_each_language(text)
    ranked = sorted(scores.items(), key=by_proportion, reverse=True)
    return {language: share for language, share in ranked}
||
24 |