| Metric | Value |
| --- | --- |
| Total Complexity | 5 |
| Total Lines | 31 |
| Duplicated Lines | 0 % |
| Changes | 0 |
| 1 | """Simple analyzer for Annif. Only folds words to lower case.""" |
||
| 2 | |||
| 3 | import spacy |
||
| 4 | from spacy.tokens import Doc, Span |
||
| 5 | from . import analyzer |
||
| 6 | |||
| 7 | |||
| 8 | class SpacyAnalyzer(analyzer.Analyzer): |
||
| 9 | name = "spacy" |
||
| 10 | |||
| 11 | def __init__(self, param, **kwargs): |
||
| 12 | self.param = param |
||
| 13 | self.nlp = spacy.load(param, exclude=['ner', 'parser']) |
||
| 14 | # we need a way to split sentences, now that parser is excluded |
||
| 15 | self.nlp.add_pipe('sentencizer') |
||
| 16 | super().__init__(**kwargs) |
||
| 17 | |||
| 18 | def tokenize_sentences(self, text): |
||
| 19 | doc = self.nlp(text) |
||
| 20 | return list(doc.sents) |
||
| 21 | |||
| 22 | def tokenize_words(self, text): |
||
| 23 | if not isinstance(text, (Doc, Span)): |
||
| 24 | text = self.nlp(text) |
||
| 25 | return [lemma for lemma in (token.lemma_ for token in text) |
||
| 26 | if self.is_valid_token(lemma)] |
||
| 27 | |||
| 28 | def normalize_word(self, word): |
||
| 29 | doc = self.nlp(word) |
||
| 30 | return doc[:].lemma_ |
||
| 31 |
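A minimal usage sketch of the class above. It assumes the spaCy pipeline `en_core_web_sm` has been installed (`python -m spacy download en_core_web_sm`) and that the module lives at `annif/analyzer/spacy.py`, as the relative `from . import analyzer` import suggests; any other installed spaCy pipeline name would work as the `param` argument.

```python
# Hypothetical usage example; the import path and model name are assumptions.
from annif.analyzer.spacy import SpacyAnalyzer

analyzer = SpacyAnalyzer("en_core_web_sm")

text = "The quick brown foxes were jumping. They ran away."

# Sentence splitting via the sentencizer pipe: returns spaCy Span objects,
# one per detected sentence.
print(analyzer.tokenize_sentences(text))

# Lemmatized word tokens, filtered by the inherited is_valid_token() check.
print(analyzer.tokenize_words(text))

# Single-word normalization; for "foxes" the lemmatizer would typically
# return "fox".
print(analyzer.normalize_word("foxes"))
```

Note that `tokenize_words` also accepts an already parsed `Doc` or `Span`, so sentences returned by `tokenize_sentences` can be passed straight back in without re-running the pipeline.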