1
|
|
|
"""Collection of language-specific analyzers and analyzer registry for Annif""" |
2
|
|
|
|
3
|
|
|
from __future__ import annotations |
4
|
|
|
|
5
|
|
|
import re |
6
|
|
|
from typing import TYPE_CHECKING |
7
|
|
|
|
8
|
|
|
import annif |
9
|
|
|
from annif.util import parse_args |
10
|
|
|
|
11
|
|
|
from . import estnltk, simple, simplemma, snowball, spacy, voikko |
12
|
|
|
|
13
|
|
|
if TYPE_CHECKING: |
14
|
|
|
from annif.analyzer.analyzer import Analyzer |
15
|
|
|
|
16
|
|
|
# Registry of enabled analyzers, keyed by each analyzer's ``name`` attribute.
# Filled by register_analyzer() and looked up by get_analyzer().
_analyzers = {}
17
|
|
|
|
18
|
|
|
|
19
|
|
|
def register_analyzer(analyzer):
    """Add an analyzer to the module-level registry if it reports itself available.

    ``analyzer`` must provide an ``is_available()`` callable and a ``name``
    attribute. When ``is_available()`` is truthy the analyzer is stored in
    ``_analyzers`` under its name; otherwise a debug message is logged and
    the registry is left untouched.
    """
    if not analyzer.is_available():
        annif.logger.debug(f"{analyzer.name} analyzer not available, not enabling it")
        return
    _analyzers[analyzer.name] = analyzer
24
|
|
|
|
25
|
|
|
|
26
|
|
|
def get_analyzer(analyzerspec: str) -> Analyzer:
    """Create an analyzer instance from a specification string.

    The specification is an analyzer name optionally followed by a
    parenthesized argument string, i.e. ``name`` or ``name(args)``. The
    argument string is handed to ``parse_args`` and the resulting positional
    and keyword arguments are passed to the registered analyzer.

    Args:
        analyzerspec: The analyzer specification to parse.

    Returns:
        The object produced by calling the registered analyzer.

    Raises:
        ValueError: If the specification does not start with an analyzer
            name, or if no analyzer is registered under that name.
    """
    match = re.match(r"(\w+)(\((.*)\))?", analyzerspec)
    if match is None:
        raise ValueError(f"Invalid analyzer specification {analyzerspec}")

    analyzer = match.group(1)
    # group(3) is None when the spec has no parenthesized argument part.
    posargs, kwargs = parse_args(match.group(3))
    # With no positional arguments given, a single None is passed instead.
    posargs = posargs if posargs else [None]

    # Only the registry lookup is guarded, so that a KeyError raised while
    # constructing the analyzer is not misreported as an unknown analyzer.
    try:
        analyzer_factory = _analyzers[analyzer]
    except KeyError:
        raise ValueError(f"No such analyzer {analyzer}") from None
    return analyzer_factory(*posargs, **kwargs)
38
|
|
|
|
39
|
|
|
|
40
|
|
|
# Register the bundled analyzers; register_analyzer() decides whether each
# one is actually enabled.
for _analyzer_class in (
    simple.SimpleAnalyzer,
    snowball.SnowballAnalyzer,
    simplemma.SimplemmaAnalyzer,
    voikko.VoikkoAnalyzer,
    spacy.SpacyAnalyzer,
    estnltk.EstNLTKAnalyzer,
):
    register_analyzer(_analyzer_class)
# Drop the loop variable so the module namespace gains no extra name.
del _analyzer_class
46
|
|
|
|