Total Complexity | 3 |
Total Lines | 22 |
Duplicated Lines | 0 % |
Changes | 0 |
1 | """spaCy analyzer for Annif. Lemmatizes words using a spaCy pipeline.""" |
||
2 | |||
3 | import spacy |
||
4 | from . import analyzer |
||
5 | |||
6 | |||
class SpacyAnalyzer(analyzer.Analyzer):
    """Analyzer that produces lemmas using a spaCy language pipeline."""

    name = "spacy"

    def __init__(self, param, **kwargs):
        """Load the spaCy pipeline identified by ``param``.

        The 'ner' and 'parser' components are excluded when loading.
        Any remaining keyword arguments are forwarded unchanged to the
        base class initializer.
        """
        self.param = param
        self.nlp = spacy.load(param, exclude=['ner', 'parser'])
        super().__init__(**kwargs)

    def tokenize_words(self, text):
        """Return the lemmas of ``text`` accepted by ``is_valid_token``.

        The text is run through the loaded pipeline; each token's lemma
        is kept, in order, only if ``self.is_valid_token`` (not defined
        in this class) returns a truthy value for it.
        """
        accepted = []
        for token in self.nlp(text):
            candidate = token.lemma_
            if self.is_valid_token(candidate):
                accepted.append(candidate)
        return accepted

    def normalize_word(self, word):
        """Return the lemma of ``word`` taken over the whole parsed span."""
        parsed = self.nlp(word)
        # Slice the full document so the lemma covers every token that
        # the pipeline produced for this input, not only the first one.
        whole_span = parsed[:]
        return whole_span.lemma_
||
22 |