Passed — Pull Request on master (#461), created by unknown at 02:45

annif.backend.yake   B

Complexity

Total Complexity 45

Size/Duplication

Total Lines 210
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 174
dl 0
loc 210
rs 8.8
c 0
b 0
f 0
wmc 45

19 Methods

Rating   Name   Duplication   Size   Complexity  
A YakeBackend.default_params() 0 4 1
A YakeBackend.is_trained() 0 3 1
A YakeBackend._get_concept_labels() 0 7 4
A YakeBackend._create_index() 0 12 4
A YakeBackend.initialize() 0 2 1
A YakeBackend._lemmatize_phrase() 0 6 2
A YakeBackend._sort_phrase() 0 3 1
A YakeBackend._combine_scores() 0 6 1
A YakeBackend._combine_suggestions() 0 10 3
A YakeBackend._load_index() 0 7 3
A YakeBackend._save_index() 0 5 3
A YakeBackend.graph() 0 6 2
A YakeBackend._keyphrases2suggestions() 0 16 5
A YakeBackend.label_types() 0 13 4
A YakeBackend._keyphrase2uris() 0 4 1
A YakeBackend._transform_score() 0 5 2
A YakeBackend._initialize_index() 0 12 3
A YakeBackend._suggest() 0 28 2
A YakeBackend._normalize_label() 0 6 2

How to fix: Complexity

Complexity

Complex classes like annif.backend.yake often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
"""Annif backend using Yake keyword extraction"""
2
# TODO Mention GPLv3 license also here?
3
4
import yake
5
import os.path
6
import re
7
from collections import defaultdict
8
from rdflib.namespace import SKOS, RDF, OWL
9
import rdflib
10
import annif.util
11
from . import backend
12
from annif.suggestion import SubjectSuggestion, ListSuggestionResult
13
from annif.exception import ConfigurationException
14
15
16
class YakeBackend(backend.AnnifBackend):
    """Yake based backend for Annif.

    Uses the YAKE keyword extractor to pick keyphrases from the input text
    and matches them against a lemmatized, word-order-insensitive index of
    the vocabulary labels."""

    name = "yake"
    needs_subject_index = False

    # defaults for uninitialized instances
    _index = None
    _graph = None
    INDEX_FILE = 'yake-index'

    # Matches every character that is NOT an ASCII letter or digit; used to
    # detect input text with no extractable content. Compiled once instead of
    # on every _suggest() call.
    _NON_ALPHANUM = re.compile('[^a-zA-Z0-9]')

    DEFAULT_PARAMETERS = {
        'max_ngram_size': 4,
        'deduplication_threshold': 0.9,
        'deduplication_algo': 'levs',
        'window_size': 1,
        'num_keywords': 100,
        'features': None,
        'label_types': ['prefLabel', 'altLabel'],
        'remove_parentheses': False
    }

    def default_params(self):
        """Return default parameters: the generic backend defaults overlaid
        with the YAKE-specific ones."""
        params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy()
        params.update(self.DEFAULT_PARAMETERS)
        return params

    @property
    def is_trained(self):
        # YAKE is a purely lexical method; there is no training step
        return True

    @property
    def label_types(self):
        """Return the SKOS label properties to index.

        A user-supplied configuration value is a comma-separated string and
        is validated; otherwise the list from DEFAULT_PARAMETERS is used.

        Raises ConfigurationException for an unrecognized label type."""
        if isinstance(self.params['label_types'], str):  # set by user
            label_types = [lt.strip() for lt
                           in self.params['label_types'].split(',')]
            valid_types = ('prefLabel', 'altLabel', 'hiddenLabel')
            for lt in label_types:
                if lt not in valid_types:
                    raise ConfigurationException(
                        f'invalid label type {lt}', backend_id=self.backend_id)
        else:
            label_types = self.params['label_types']  # The defaults
        return [getattr(SKOS, lt) for lt in label_types]

    @property
    def graph(self):
        """Return the vocabulary as an rdflib graph, loading it lazily on
        first access."""
        if self._graph is None:
            self.info('Loading graph')
            self._graph = self.project.vocab.as_graph()
        return self._graph

    def initialize(self):
        self._initialize_index()

    def _initialize_index(self):
        """Load the label index from the data directory if present,
        otherwise build it from the vocabulary graph and persist it."""
        if self._index is None:
            path = os.path.join(self.datadir, self.INDEX_FILE)
            if os.path.exists(path):
                self._index = self._load_index(path)
                self.info(
                    f'Loaded index from {path} with {len(self._index)} labels')
            else:
                self.info('Creating index')
                self._index = self._create_index()
                self._save_index(path)
                self.info(f'Created index with {len(self._index)} labels')

    def _save_index(self, path):
        """Write the index to path, one tab-separated
        ``label<TAB>uri uri ...`` line per entry."""
        with open(path, 'w', encoding='utf-8') as indexfile:
            for label, uris in self._index.items():
                line = label + '\t' + ' '.join(uris)
                print(line, file=indexfile)

    def _load_index(self, path):
        """Read back a label-to-URIs index written by _save_index."""
        index = dict()
        with open(path, 'r', encoding='utf-8') as indexfile:
            for line in indexfile:
                # maxsplit=1 keeps the whole URI list intact even if it were
                # ever to contain a stray tab
                label, uris = line.strip().split('\t', maxsplit=1)
                index[label] = uris.split()
        return index

    def _create_index(self):
        """Build the label-to-URIs index from the vocabulary graph,
        skipping concepts marked owl:deprecated."""
        index = defaultdict(set)
        for concept in self.graph.subjects(RDF.type, SKOS.Concept):
            if (concept, OWL.deprecated, rdflib.Literal(True)) in self.graph:
                continue
            uri = str(concept)
            labels = self._get_concept_labels(concept, self.label_types)
            for label in labels:
                label = self._normalize_label(label)
                index[label].add(uri)
        index.pop('', None)  # Remove possible empty string entry
        return dict(index)

    def _get_concept_labels(self, concept, label_types):
        """Return the labels of concept in the project language, considering
        only the given label type properties."""
        labels = []
        for label_type in label_types:
            for label in self.graph.objects(concept, label_type):
                if label.language == self.params['language']:
                    labels.append(label)
        return labels

    def _normalize_label(self, label):
        """Normalize a label into its canonical index form: optionally strip
        a trailing parenthetical qualifier, then lemmatize and sort the
        words so that word order does not matter."""
        label = str(label)
        if annif.util.boolean(self.params['remove_parentheses']):
            label = re.sub(r' \(.*\)', '', label)
        lemmatized_label = self._lemmatize_phrase(label)
        return self._sort_phrase(lemmatized_label)

    def _lemmatize_phrase(self, phrase):
        """Lowercase and lemmatize each word of phrase with the project
        analyzer and rejoin with single spaces."""
        return ' '.join(
            self.project.analyzer.normalize_word(word).lower()
            for word in phrase.split())

    def _sort_phrase(self, phrase):
        """Return phrase with its words sorted alphabetically."""
        words = phrase.split()
        return ' '.join(sorted(words))

    def _suggest(self, text, params):
        """Extract keyphrases from text with YAKE and convert them into
        subject suggestions, capped at params['limit'] results."""
        self.debug(
            f'Suggesting subjects for text "{text[:20]}..." (len={len(text)})')
        limit = int(params['limit'])

        # Bail out early when the text has no alphanumeric content at all
        if not self._NON_ALPHANUM.sub('', text):
            return ListSuggestionResult([])

        self._kw_extractor = yake.KeywordExtractor(
            lan=params['language'],
            n=int(params['max_ngram_size']),
            dedupLim=float(params['deduplication_threshold']),
            dedupFunc=params['deduplication_algo'],
            windowsSize=int(params['window_size']),
            top=int(params['num_keywords']),
            # read from the passed-in params for consistency with every other
            # setting above (previously this alone read self.params, silently
            # ignoring per-call overrides)
            features=params['features'])
        keyphrases = self._kw_extractor.extract_keywords(text)
        suggestions = self._keyphrases2suggestions(keyphrases)

        subject_suggestions = [SubjectSuggestion(
                uri=uri,
                label=None,
                notation=None,
                score=score)
                for uri, score in suggestions[:limit] if score > 0.0]
        return ListSuggestionResult.create_from_index(subject_suggestions,
                                                      self.project.subjects)

    def _keyphrases2suggestions(self, keyphrases):
        """Convert (keyphrase, yake_score) pairs into a deduplicated list of
        (uri, score) suggestions; keyphrases with no index match are only
        logged at debug level."""
        suggestions = []
        not_matched = []
        for kp, score in keyphrases:
            uris = self._keyphrase2uris(kp)
            for uri in uris:
                suggestions.append(
                    (uri, self._transform_score(score)))
            if not uris:
                not_matched.append((kp, self._transform_score(score)))
        # Remove duplicate uris, conflating the scores
        suggestions = self._combine_suggestions(suggestions)
        self.debug('Keyphrases not matched:\n' + '\t'.join(
            [kp[0] + ' ' + str(kp[1]) for kp
             in sorted(not_matched, reverse=True, key=lambda kp: kp[1])]))
        return suggestions

    def _keyphrase2uris(self, keyphrase):
        """Look up the URIs indexed for keyphrase, normalized the same way
        the index labels were (lemmatized + word-sorted)."""
        keyphrase = self._lemmatize_phrase(keyphrase)
        keyphrase = self._sort_phrase(keyphrase)
        return self._index.get(keyphrase, [])

    def _transform_score(self, score):
        """Map a YAKE score (lower is better, normally >= 0) into a
        suggestion score in (0, 1] where higher is better."""
        if score < 0:
            self.debug(f'Replacing negative YAKE score {score} with zero')
            return 1.0  # i.e. 1.0 / (0 + 1)
        return 1.0 / (score + 1)

    def _combine_suggestions(self, suggestions):
        """Merge duplicate URIs in a list of (uri, score) pairs, conflating
        the scores of each duplicate with _combine_scores."""
        combined_suggestions = {}
        for uri, score in suggestions:
            if uri not in combined_suggestions:
                combined_suggestions[uri] = score
            else:
                old_score = combined_suggestions[uri]
                combined_suggestions[uri] = self._combine_scores(
                    score, old_score)
        return list(combined_suggestions.items())

    def _combine_scores(self, score1, score2):
        """Conflate two scores in [0, 1] into one.

        The inputs are shifted into [0.5, 1], combined with the standard
        probabilistic conflation formula, and shifted back."""
        # The result is never smaller than the greater input
        score1 = score1/2 + 0.5
        score2 = score2/2 + 0.5
        confl = score1 * score2 / (score1 * score2 + (1-score1) * (1-score2))
        return (confl-0.5) * 2
210