| Metric | Value |
| --- | --- |
| Total Complexity | 5 |
| Total Lines | 35 |
| Duplicated Lines | 0 % |
| Changes | 0 |
| 1 | """Annif backend mixins that can be used to implement features""" |
||
| 2 | |||
| 3 | |||
| 4 | import abc |
||
| 5 | from annif.hit import ListAnalysisResult |
||
| 6 | |||
| 7 | |||
| 8 | class ChunkingBackend(metaclass=abc.ABCMeta): |
||
| 9 | """Annif backend mixin that implements chunking of input""" |
||
| 10 | |||
| 11 | @abc.abstractmethod |
||
| 12 | def _analyze_chunks(self, chunktexts, project): |
||
| 13 | """Analyze the chunked text; should be implemented by the subclass |
||
| 14 | inheriting this mixin""" |
||
| 15 | |||
| 16 | pass # pragma: no cover |
||
| 17 | |||
| 18 | def _analyze(self, text, project, params): |
||
| 19 | self.initialize() |
||
| 20 | self.debug('Analyzing text "{}..." (len={})'.format( |
||
| 21 | text[:20], len(text))) |
||
| 22 | sentences = project.analyzer.tokenize_sentences(text) |
||
| 23 | self.debug('Found {} sentences'.format(len(sentences))) |
||
| 24 | chunksize = int(params['chunksize']) |
||
| 25 | chunktexts = [] |
||
| 26 | for i in range(0, len(sentences), chunksize): |
||
| 27 | chunktext = ' '.join(sentences[i:i + chunksize]) |
||
| 28 | normalized = self._normalize_text(project, chunktext) |
||
| 29 | if normalized != '': |
||
| 30 | chunktexts.append(normalized) |
||
| 31 | self.debug('Split sentences into {} chunks'.format(len(chunktexts))) |
||
| 32 | if len(chunktexts) == 0: # nothing to analyze, empty result |
||
| 33 | return ListAnalysisResult(hits=[], subject_index=project.subjects) |
||
| 34 | return self._analyze_chunks(chunktexts, project) |
||
| 35 |
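The `_analyze()` template method does all the generic work here (sentence tokenization, grouping into fixed-size chunks, normalization, the empty-result short circuit), so a concrete backend inheriting this mixin only has to implement `_analyze_chunks()`. To make the chunking arithmetic itself concrete, below is a minimal standalone sketch; it is hypothetical and has no Annif dependencies, with `str.strip()` standing in for the backend's `_normalize_text()` helper:

```python
# Hypothetical standalone sketch of the chunking step in _analyze():
# sentences are grouped into chunks of at most `chunksize` sentences,
# and chunks that are empty after normalization are dropped.

def chunk_sentences(sentences, chunksize):
    """Group sentences into chunk texts of at most `chunksize` sentences."""
    chunktexts = []
    for i in range(0, len(sentences), chunksize):
        # Join the next `chunksize` sentences into one chunk text
        chunktext = ' '.join(sentences[i:i + chunksize])
        # str.strip() stands in for the backend's _normalize_text()
        normalized = chunktext.strip()
        if normalized != '':
            chunktexts.append(normalized)
    return chunktexts


sentences = ['First sentence.', 'Second one.', 'Third.', 'Fourth.', 'Fifth.']
print(chunk_sentences(sentences, chunksize=2))
# ['First sentence. Second one.', 'Third. Fourth.', 'Fifth.']
```

Note that the stride of `range(0, len(sentences), chunksize)` yields slices `[0:2]`, `[2:4]`, `[4:6]` for `chunksize=2`, so the final chunk may contain fewer than `chunksize` sentences.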