| Total Complexity | 3 |
| Total Lines | 24 |
| Duplicated Lines | 0 % |
| Changes | 0 | ||
| 1 | """Wrapper code for using Simplemma functionality in Annif""" |
||
| 2 | |||
| 3 | from typing import Dict, Tuple, Union |
||
| 4 | |||
| 5 | from simplemma import LanguageDetector, Lemmatizer |
||
| 6 | from simplemma.strategies import DefaultStrategy |
||
| 7 | from simplemma.strategies.dictionaries import DefaultDictionaryFactory |
||
| 8 | |||
# Maximum number of language dictionaries to keep in memory at once.
LANG_CACHE_SIZE = 5

# A single dictionary factory and lemmatization strategy are built at import
# time. The same strategy object is handed to the module-level lemmatizer
# below and to every detector made by get_language_detector().
_dictionary_factory = DefaultDictionaryFactory(cache_max_size=LANG_CACHE_SIZE)
_lemmatization_strategy = DefaultStrategy(dictionary_factory=_dictionary_factory)
lemmatizer = Lemmatizer(lemmatization_strategy=_lemmatization_strategy)
||
| 14 | |||
| 15 | |||
def get_language_detector(lang: Union[str, Tuple[str, ...]]) -> LanguageDetector:
    """Create a LanguageDetector for the given language or languages.

    The detector is constructed with this module's shared lemmatization
    strategy rather than a fresh one.
    """
    detector = LanguageDetector(
        lang,
        lemmatization_strategy=_lemmatization_strategy,
    )
    return detector
||
| 18 | |||
| 19 | |||
def detect_language(text: str, languages: Tuple[str, ...]) -> Dict[str, float]:
    """Return the proportion of *text* attributed to each candidate language.

    A detector for ``languages`` is obtained from get_language_detector()
    and its per-language proportions are returned as a dict ordered from
    the highest proportion to the lowest.
    """
    shares = get_language_detector(languages).proportion_in_each_language(text)
    # sorted() is stable, so entries with equal proportions keep the order
    # in which the detector reported them.
    ranked_keys = sorted(shares, key=shares.get, reverse=True)
    return {key: shares[key] for key in ranked_keys}
||
| 24 |