Passed
Pull Request — master (#418)
by Osma
01:28
created

ListSuggestionResult._hits_to_vector()   A

Complexity

Conditions 3

Size

Total Lines 7
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 7
dl 0
loc 7
rs 10
c 0
b 0
f 0
cc 3
nop 1
1
"""Representing suggested subjects."""
2
3
import abc
4
import collections
5
import numpy as np
6
7
8
# A single suggested subject: its URI, label and notation plus the score
# assigned to it.
SubjectSuggestion = collections.namedtuple(
    'SubjectSuggestion', 'uri label notation score')

# A record with the fields ``hits``, ``weight`` and ``subjects``.
# NOTE(review): presumably ``hits`` is a SuggestionResult and ``weight`` its
# relative weight when combining results -- confirm against the callers.
WeightedSuggestion = collections.namedtuple(
    'WeightedSuggestion', 'hits weight subjects')
12
13
14
class SuggestionFilter:
    """A reusable filter for filtering SubjectSuggestion objects.

    The filter is configured once with a subject index, an optional limit
    and a score threshold, and can then be called on any number of
    suggestion results.
    """

    def __init__(self, subject_index, limit=None, threshold=0.0):
        """Remember the filtering parameters.

        subject_index -- passed as the first argument to ``filter()``
        limit -- passed as the ``limit`` argument (default: None)
        threshold -- passed as the ``threshold`` argument (default: 0.0)
        """
        self._subject_index = subject_index
        self._limit = limit
        self._threshold = threshold

    def __call__(self, orighits):
        """Return a lazily evaluated, filtered view of ``orighits``.

        ``orighits.filter()`` is not called here; the call is wrapped in a
        LazySuggestionResult, which invokes it on first access.
        """
        return LazySuggestionResult(
            lambda: orighits.filter(self._subject_index,
                                    self._limit,
                                    self._threshold))
27
28
29
class SuggestionResult(metaclass=abc.ABCMeta):
    """Abstract base class for a set of hits returned by an analysis
    operation.

    Concrete subclasses must implement as_list(), as_vector(), filter()
    and __len__().
    """

    @abc.abstractmethod
    def as_list(self, subject_index):
        """Return the hits as an ordered sequence of SubjectSuggestion objects,
        highest scores first."""
        pass  # pragma: no cover

    @abc.abstractmethod
    def as_vector(self, subject_index):
        """Return the hits as a one-dimensional score vector
        where the indexes match the given subject index."""
        pass  # pragma: no cover

    @abc.abstractmethod
    def filter(self, subject_index, limit=None, threshold=0.0):
        """Return a subset of the hits, filtered by the given limit and
        score threshold, as another SuggestionResult object."""
        pass  # pragma: no cover

    @abc.abstractmethod
    def __len__(self):
        """Return the number of hits with non-zero scores."""
        pass  # pragma: no cover
55
56
57
class LazySuggestionResult(SuggestionResult):
    """SuggestionResult implementation that wraps another SuggestionResult which
    is initialized lazily only when it is actually accessed. Method calls
    will be proxied to the wrapped SuggestionResult."""

    def __init__(self, construct):
        """Create the proxy object. The given construct function will be
        called to create the actual SuggestionResult when it is needed."""
        self._construct = construct
        self._object = None

    def _initialize(self):
        """Build the wrapped object unless one is already stored."""
        # None doubles as the "not yet constructed" marker, so the
        # construct function is called again if it ever returns None.
        if self._object is None:
            self._object = self._construct()

    def as_list(self, subject_index):
        """Proxy as_list() to the wrapped result, constructing it first."""
        self._initialize()
        return self._object.as_list(subject_index)

    def as_vector(self, subject_index):
        """Proxy as_vector() to the wrapped result, constructing it first."""
        self._initialize()
        return self._object.as_vector(subject_index)

    def filter(self, subject_index, limit=None, threshold=0.0):
        """Proxy filter() to the wrapped result, constructing it first."""
        self._initialize()
        return self._object.filter(subject_index, limit, threshold)

    def __len__(self):
        """Return the length of the wrapped result, constructing it first."""
        self._initialize()
        return len(self._object)
87
88
89
class VectorSuggestionResult(SuggestionResult):
    """SuggestionResult implementation based primarily on NumPy vectors."""

    def __init__(self, vector):
        """Store the given score vector, converted to float32.

        The vector is indexed by subject ID (see as_vector()).
        """
        self._vector = vector.astype(np.float32)
        self._subject_order = None  # computed on demand by subject_order
        self._lsr = None  # ListSuggestionResult cached by as_list()

    def _vector_to_list_suggestion(self, subject_index):
        """Convert the positive-score entries of the vector into a
        ListSuggestionResult, highest scores first.

        subject_index -- indexable by subject ID; each entry is read as
        (uri, label, notation) by position
        """
        hits = []
        for subject_id in self.subject_order:
            score = self._vector[subject_id]
            if score <= 0.0:
                # subject_order is descending, so every remaining score is
                # non-positive as well: we can skip the remaining ones
                break
            subject = subject_index[subject_id]
            hits.append(
                SubjectSuggestion(
                    uri=subject[0],
                    label=subject[1],
                    notation=subject[2],
                    score=float(score)))
        return ListSuggestionResult(hits)

    @property
    def subject_order(self):
        """Subject IDs ordered by descending score (computed once and
        cached)."""
        if self._subject_order is None:
            self._subject_order = np.argsort(self._vector)[::-1]
        return self._subject_order

    def as_list(self, subject_index):
        """Return the hits as a list of SubjectSuggestion objects, highest
        scores first.

        The conversion is done on the first call and cached; later calls
        return the cached list whatever subject_index is passed.
        """
        if self._lsr is None:
            self._lsr = self._vector_to_list_suggestion(subject_index)
        return self._lsr.as_list(subject_index)

    def as_vector(self, subject_index):
        """Return the stored score vector; subject_index is not used."""
        return self._vector

    def filter(self, subject_index, limit=None, threshold=0.0):
        """Return a ListSuggestionResult with the hits that pass the filter.

        subject_index -- must provide deprecated_ids() and indexing by
        subject ID
        limit -- if not None, keep only the ``limit`` highest-ranked
        subjects that are not deprecated
        threshold -- keep only scores strictly greater than this value

        Deprecated subjects are always excluded.
        """
        mask = (self._vector > threshold)
        deprecated_ids = subject_index.deprecated_ids()
        if limit is not None:
            # The builtin bool is used as dtype because the np.bool alias
            # was deprecated in NumPy 1.20 and removed in 1.24.
            limit_mask = np.zeros_like(self._vector, dtype=bool)
            # Build the set once so each membership test is O(1).
            deprecated = set(deprecated_ids)
            top_k_subjects = [subj for subj in self.subject_order
                              if subj not in deprecated][:limit]
            limit_mask[top_k_subjects] = True
            mask = mask & limit_mask
        else:
            deprecated_mask = np.ones_like(self._vector, dtype=bool)
            deprecated_mask[deprecated_ids] = False
            mask = mask & deprecated_mask
        # Multiplying by the boolean mask zeroes out the rejected scores.
        vsr = VectorSuggestionResult(self._vector * mask)
        return ListSuggestionResult(vsr.as_list(subject_index))

    def __len__(self):
        """Return the number of entries with a score above zero."""
        # Convert the NumPy integer to a plain int.
        return int((self._vector > 0.0).sum())
144
145
146
class ListSuggestionResult(SuggestionResult):
    """SuggestionResult implementation based primarily on lists of hits."""

    def __init__(self, hits):
        """Store the given hits, dropping any whose score is not above
        zero."""
        self._list = [hit for hit in hits if hit.score > 0.0]
        self._vector = None  # score vector cached by as_vector()

    @classmethod
    def create_from_index(cls, hits, subject_index):
        """Alternate constructor: build a result from hits, taking the label
        and notation of each hit from the subject index.

        Hits whose URI is not found by ``subject_index.by_uri()`` (i.e. it
        returns None) are skipped. The URI and score come from the hit.
        """
        subject_suggestions = []
        for hit in hits:
            subject_id = subject_index.by_uri(hit.uri)
            if subject_id is None:
                continue
            subject = subject_index[subject_id]
            subject_suggestions.append(
                SubjectSuggestion(uri=hit.uri,
                                  label=subject[1],
                                  notation=subject[2],
                                  score=hit.score))
        # Build through cls so that subclasses get their own type.
        return cls(subject_suggestions)

    def _list_to_vector(self, subject_index):
        """Return a float32 vector of length ``len(subject_index)`` holding
        each hit's score at its subject ID; all other entries are zero.

        Hits whose URI is not found in the subject index are ignored.
        """
        vector = np.zeros(len(subject_index), dtype=np.float32)
        for hit in self._list:
            subject_id = subject_index.by_uri(hit.uri)
            if subject_id is not None:
                vector[subject_id] = hit.score
        return vector

    def as_list(self, subject_index):
        """Return the stored list of hits; subject_index is not used."""
        return self._list

    def as_vector(self, subject_index):
        """Return the hits as a score vector.

        The vector is built on the first call and cached; later calls
        return the cached vector whatever subject_index is passed.
        """
        if self._vector is None:
            self._vector = self._list_to_vector(subject_index)
        return self._vector

    def filter(self, subject_index, limit=None, threshold=0.0):
        """Return a new ListSuggestionResult with the filtered hits, sorted
        by descending score.

        A hit is kept when its score is at least ``threshold`` and above
        zero and its label is not None. If ``limit`` is not None, only the
        first ``limit`` of the remaining hits are kept. subject_index is
        not used.
        """
        hits = sorted(self._list, key=lambda hit: hit.score, reverse=True)
        filtered_hits = [hit for hit in hits
                         if hit.score >= threshold and hit.score > 0.0 and
                         hit.label is not None]
        if limit is not None:
            filtered_hits = filtered_hits[:limit]
        return ListSuggestionResult(filtered_hits)

    def __len__(self):
        """Return the number of stored hits."""
        return len(self._list)
195