Passed
Pull Request — master (#418)
by Osma
01:55
created

VectorSuggestionResult.subject_order()   A

Complexity

Conditions 2

Size

Total Lines 5
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 5
dl 0
loc 5
rs 10
c 0
b 0
f 0
cc 2
nop 1
1
"""Representing suggested subjects."""
2
3
import abc
4
import collections
5
import numpy as np
6
7
8
# One suggested subject: its URI, label, notation and numeric score.
SubjectSuggestion = collections.namedtuple(
    'SubjectSuggestion', 'uri label notation score')
# Record with fields hits, weight and subjects.
# NOTE(review): not used in this module; presumably pairs a suggestion
# result with a weight for combining results -- confirm against callers.
WeightedSuggestion = collections.namedtuple(
    'WeightedSuggestion', 'hits weight subjects')
12
13
14
class SuggestionFilter:
    """A reusable filter for filtering SubjectSuggestion objects.

    The filter remembers a subject index, a limit and a score threshold and
    applies them to any suggestion result it is called with. The filtering
    itself is deferred until the returned result is actually accessed.
    """

    def __init__(self, subject_index, limit=None, threshold=0.0):
        """Store the parameters that are later passed to ``filter()``.

        subject_index -- subject index handed to the hits' ``filter`` method
        limit -- maximum number of hits to keep, or None for no limit
        threshold -- score threshold handed to the hits' ``filter`` method
        """
        self._subject_index = subject_index
        self._limit = limit
        self._threshold = threshold

    def __call__(self, orighits):
        """Return a LazySuggestionResult wrapping the filtered orighits.

        ``orighits.filter(...)`` is not called here; it runs the first time
        the returned object is accessed.
        """
        return LazySuggestionResult(
            lambda: orighits.filter(self._subject_index,
                                    self._limit,
                                    self._threshold))
27
28
29
class SuggestionResult(metaclass=abc.ABCMeta):
    """Abstract base class for a set of hits returned by an analysis
    operation.

    Concrete subclasses must implement as_list, as_vector, filter and
    __len__; the class cannot be instantiated until all four are defined.
    """

    @abc.abstractmethod
    def as_list(self, subject_index):
        """Return the hits as an ordered sequence of SubjectSuggestion objects,
        highest scores first."""
        pass  # pragma: no cover

    @abc.abstractmethod
    def as_vector(self, subject_index):
        """Return the hits as a one-dimensional score vector
        where the indexes match the given subject index."""
        pass  # pragma: no cover

    @abc.abstractmethod
    def filter(self, subject_index, limit=None, threshold=0.0):
        """Return a subset of the hits, filtered by the given limit and
        score threshold, as another SuggestionResult object."""
        pass  # pragma: no cover

    @abc.abstractmethod
    def __len__(self):
        """Return the number of hits with non-zero scores."""
        pass  # pragma: no cover
55
56
57
class LazySuggestionResult(SuggestionResult):
    """SuggestionResult implementation that wraps another SuggestionResult which
    is initialized lazily only when it is actually accessed. Method calls
    will be proxied to the wrapped SuggestionResult."""

    def __init__(self, construct):
        """Create the proxy object. The given construct function will be
        called to create the actual SuggestionResult when it is needed."""
        self._construct = construct
        self._object = None  # wrapped result; stays None until first access

    def _initialize(self):
        """Build the wrapped result by calling the construct function,
        unless a wrapped result has already been stored."""
        if self._object is None:
            self._object = self._construct()

    def as_list(self, subject_index):
        """Return as_list(subject_index) of the wrapped result."""
        self._initialize()
        return self._object.as_list(subject_index)

    def as_vector(self, subject_index):
        """Return as_vector(subject_index) of the wrapped result."""
        self._initialize()
        return self._object.as_vector(subject_index)

    def filter(self, subject_index, limit=None, threshold=0.0):
        """Return the wrapped result filtered with the given arguments."""
        self._initialize()
        return self._object.filter(subject_index, limit, threshold)

    def __len__(self):
        """Return the length of the wrapped result."""
        self._initialize()
        return len(self._object)
87
88
89
class VectorSuggestionResult(SuggestionResult):
    """SuggestionResult implementation based primarily on NumPy vectors."""

    def __init__(self, vector):
        """Store the score vector converted to float32.

        vector -- NumPy array of scores, indexed by subject id
        """
        self._vector = vector.astype(np.float32)
        self._subject_order = None  # cached by the subject_order property
        self._list = None  # cached by as_list

    def _vector_to_list(self, subject_index):
        """Build the list of SubjectSuggestion objects with positive scores,
        highest scores first, looking up each subject in subject_index."""
        hits = []
        for subject_id in self.subject_order:
            score = self._vector[subject_id]
            if score <= 0.0:
                # subject_order is descending, so every remaining score is
                # <= 0 as well and would be skipped anyway
                break
            subject = subject_index[subject_id]
            hits.append(
                SubjectSuggestion(
                    uri=subject[0],
                    label=subject[1],
                    notation=subject[2],
                    score=float(score)))
        return ListSuggestionResult(hits).as_list(subject_index)

    @property
    def subject_order(self):
        """Subject ids sorted by descending score (computed once, then
        cached)."""
        if self._subject_order is None:
            self._subject_order = np.argsort(self._vector)[::-1]
        return self._subject_order

    def as_list(self, subject_index):
        """Return the hits as a list of SubjectSuggestion objects, highest
        scores first. The list is built on the first call and then reused,
        whatever subject_index is passed on later calls."""
        if self._list is None:
            self._list = self._vector_to_list(subject_index)
        return self._list

    def as_vector(self, subject_index):
        """Return the stored score vector; subject_index is not used."""
        return self._vector

    def filter(self, subject_index, limit=None, threshold=0.0):
        """Return a new VectorSuggestionResult in which scores not above
        threshold, scores of deprecated subjects and, if limit is given,
        scores outside the top `limit` non-deprecated subjects are zeroed."""
        mask = (self._vector > threshold)
        deprecated_ids = subject_index.deprecated_ids()
        if limit is not None:
            # Build the set once so each membership test below is a hash
            # lookup (assumes deprecated_ids() yields hashable integer ids).
            deprecated_set = set(deprecated_ids)
            # Builtin bool: the np.bool alias was removed from NumPy.
            limit_mask = np.zeros_like(self._vector, dtype=bool)
            top_k_subjects = [subj for subj in self.subject_order
                              if subj not in deprecated_set][:limit]
            limit_mask[top_k_subjects] = True
            mask = mask & limit_mask
        else:
            deprecated_mask = np.ones_like(self._vector, dtype=bool)
            deprecated_mask[deprecated_ids] = False
            mask = mask & deprecated_mask
        return VectorSuggestionResult(self._vector * mask)

    def __len__(self):
        """Return the number of scores greater than zero as a plain int."""
        return int((self._vector > 0.0).sum())
143
144
145
class ListSuggestionResult(SuggestionResult):
    """SuggestionResult implementation based primarily on lists of hits."""

    def __init__(self, hits):
        """Store the hits whose score is greater than zero, in the order
        given.

        hits -- iterable of objects with a ``score`` attribute
        """
        self._list = [hit for hit in hits if hit.score > 0.0]
        self._vector = None  # cached by as_vector

    @classmethod
    def create_from_index(cls, hits, subject_index):
        """Create a result from hits, taking label and notation from the
        subject index.

        Each hit is looked up by its URI; hits whose URI is not found in
        subject_index (by_uri returns None) are dropped. URI and score are
        kept from the hit itself.
        """
        subject_suggestions = []
        for hit in hits:
            subject_id = subject_index.by_uri(hit.uri)
            if subject_id is None:
                continue
            subject = subject_index[subject_id]
            subject_suggestions.append(
                SubjectSuggestion(uri=hit.uri,
                                  label=subject[1],
                                  notation=subject[2],
                                  score=hit.score))
        # Use cls so that a subclass gets an instance of its own type.
        return cls(subject_suggestions)

    def _list_to_vector(self, subject_index):
        """Return a float32 vector of length len(subject_index) holding each
        hit's score at the position of its subject id; hits whose URI is not
        in the index are ignored."""
        vector = np.zeros(len(subject_index), dtype=np.float32)
        for hit in self._list:
            subject_id = subject_index.by_uri(hit.uri)
            if subject_id is not None:
                vector[subject_id] = hit.score
        return vector

    def as_list(self, subject_index):
        """Return the stored list of hits as is; subject_index is not used
        and the list is not re-sorted here."""
        return self._list

    def as_vector(self, subject_index):
        """Return the hits as a score vector. The vector is built on the
        first call and then reused, whatever subject_index is passed on
        later calls."""
        if self._vector is None:
            self._vector = self._list_to_vector(subject_index)
        return self._vector

    def filter(self, subject_index, limit=None, threshold=0.0):
        """Return a new ListSuggestionResult with the hits sorted by
        descending score, keeping only hits that have a label and a positive
        score of at least threshold, cut to at most `limit` hits if limit is
        given. subject_index is not used."""
        hits = sorted(self._list, key=lambda hit: hit.score, reverse=True)
        # NOTE(review): this keeps scores equal to the threshold (>=), while
        # VectorSuggestionResult.filter uses a strict > -- confirm which is
        # intended.
        filtered_hits = [hit for hit in hits
                         if hit.score >= threshold and hit.score > 0.0 and
                         hit.label is not None]
        if limit is not None:
            filtered_hits = filtered_hits[:limit]
        return ListSuggestionResult(filtered_hits)

    def __len__(self):
        """Return the number of stored hits."""
        return len(self._list)
194