annif.suggestion.vector_to_suggestions() - Code Metrics - Inspection of "faster vector_to_suggestions using np.argpartition" - NatLibFi/Annif - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — issue678-refactor-suggestionre... ( 17f52a...cc0b6b )

by Osma

created 2023-04-04 08:02 UTC

annif.suggestion.vector_to_suggestions() A

↳ Parent: annif.suggestion

Complexity

Conditions

Size

Total Lines	5
Code Lines	5

Duplication

Lines	0
Ratio	0 %

Importance

Changes

Metric	Value
cc	1
eloc	5
nop	2
dl	0
loc	5
rs	10
c	0
b	0
f	0

"""Representing suggested subjects."""

import collections
import itertools

import numpy as np
from scipy.sparse import csr_array, dok_array

# One suggested subject: the id of the subject paired with its score.
SubjectSuggestion = collections.namedtuple("SubjectSuggestion", "subject_id score")
# Record with the fields hit_sets, weight and subjects.
# NOTE(review): not used in the visible part of this module; presumably groups
# a batch of suggestions with the weight to apply to them -- confirm at callers.
WeightedSuggestionsBatch = collections.namedtuple(
    "WeightedSuggestionsBatch", "hit_sets weight subjects"
)


def vector_to_suggestions(vector, limit):
    """Return the highest-scoring entries of a score vector as suggestions.

    vector -- 1D sequence of scores; the position of each score is used as
        the subject id of the resulting suggestion
    limit -- maximum number of suggestions to produce

    Returns an iterator of SubjectSuggestion objects for the (at most)
    ``limit`` largest values in ``vector``. The suggestions are not sorted
    by score: np.argpartition only guarantees that the selected entries are
    the largest ones, not their relative order. A non-positive limit or an
    empty vector yields an empty iterator."""

    limit = min(len(vector), limit)
    if limit <= 0:
        # Without this guard, limit == 0 would slice with [-0:], which selects
        # every entry instead of none, and an empty vector would make
        # np.argpartition fail because kth is out of bounds.
        return iter(())
    topk_idx = np.argpartition(vector, -limit)[-limit:]
    return (
        SubjectSuggestion(subject_id=idx, score=float(vector[idx])) for idx in topk_idx
    )


def filter_suggestion(preds, limit=None, threshold=0.0):
    """Filter a 2D sparse suggestion array, keeping only the best scores.

    preds -- 2D sparse array of scores (normally a csr_array), one row per
        prediction
    limit -- maximum number of stored scores to keep per row, or None for
        no limit
    threshold -- minimum score (inclusive) that a stored value must reach
        to be kept

    Returns a new float32 CSR array of the same shape in which each row
    retains only its top ``limit`` stored values that are greater than or
    equal to ``threshold``; everything else is left as zero."""

    # Rows are read straight from the CSR index arrays instead of through
    # getrow(), which newer SciPy releases no longer provide on sparse
    # arrays. tocsr() makes that access valid for other sparse formats too.
    preds = preds.tocsr()
    filtered = dok_array(preds.shape, dtype=np.float32)
    for row in range(preds.shape[0]):
        start, end = preds.indptr[row], preds.indptr[row + 1]
        row_scores = preds.data[start:end]
        row_cols = preds.indices[start:end]
        # Positions of the stored values of this row, best score first
        ranked = row_scores.argsort()[::-1]
        if limit is not None:
            ranked = ranked[:limit]
        for pos in ranked:
            score = row_scores[pos]
            if score < threshold:
                # Scores are in descending order, so nothing further qualifies
                break
            filtered[row, row_cols[pos]] = score
    return filtered.tocsr()


class SuggestionResult:
    """The suggestions of one document, read from a single row of a sparse
    array that is shared with other documents."""

    def __init__(self, array, idx):
        # Only references are stored: the whole array and the row number
        self._idx = idx
        self._array = array

    def __iter__(self):
        """Iterate over the nonzero entries of the row as SubjectSuggestion
        objects, ordered from the highest score to the lowest."""
        row_no = self._idx
        nonzero_cols = self._array[[row_no], :].nonzero()[1]
        found = []
        for col in nonzero_cols:
            score = float(self._array[row_no, col])
            found.append(SubjectSuggestion(subject_id=col, score=score))
        found.sort(key=lambda item: item.score, reverse=True)
        return iter(found)

    def as_vector(self):
        """Return the row as a dense 1D array of scores."""
        dense_row = self._array[[self._idx], :].toarray()
        return dense_row[0]

    def __len__(self):
        """Return the number of nonzero entries in the row."""
        return len(self._array[[self._idx], :].nonzero()[1])


class SuggestionBatch:
    """The subject suggestions of a batch of documents, one array row each."""

    def __init__(self, array):
        """Wrap a csr_array holding one row of scores per document"""
        assert isinstance(array, csr_array)
        self.array = array

    @classmethod
    def from_sequence(cls, suggestion_results, subject_index, limit=None):
        """Build a SuggestionBatch out of a sequence whose items are
        themselves sequences of SubjectSuggestion objects. At most limit
        suggestions are read per item; deprecated subjects and non-positive
        scores are dropped and scores are capped at 1.0."""

        skip_ids = set(subject_index.deprecated_ids())
        shape = (len(suggestion_results), len(subject_index))
        scores = dok_array(shape, dtype=np.float32)

        for row, suggestions in enumerate(suggestion_results):
            for item in itertools.islice(suggestions, limit):
                if item.subject_id in skip_ids:
                    continue
                if item.score <= 0.0:
                    continue
                scores[row, item.subject_id] = min(item.score, 1.0)
        return cls(scores.tocsr())

    def filter(self, limit=None, threshold=0.0):
        """Produce another SuggestionBatch holding only the hits that pass
        the given limit and score threshold."""

        reduced = filter_suggestion(self.array, limit, threshold)
        return SuggestionBatch(reduced)

    def __getitem__(self, idx):
        # Negative indices are rejected as well, not wrapped around
        if not 0 <= idx < len(self):
            raise IndexError
        return SuggestionResult(self.array, idx)

    def __len__(self):
        n_rows = self.array.shape[0]
        return n_rows


class SuggestionResults:
    """Subject suggestions covering a possibly very large set of documents,
    held as an iterable of batches."""

    def __init__(self, batches):
        """Create a SuggestionResults around an iterable that yields
        SuggestionBatch objects."""

        self.batches = batches

    def filter(self, limit=None, threshold=0.0):
        """Produce another SuggestionResults whose batches are filtered, one
        at a time as they are consumed, by the given limit and/or
        threshold."""

        filtered_batches = map(
            lambda batch: batch.filter(limit, threshold), self.batches
        )
        return SuggestionResults(filtered_batches)

    def __iter__(self):
        # Flatten the batches into one stream of per-document results
        flattened = itertools.chain.from_iterable(self.batches)
        return iter(flattened)


1			"""Representing suggested subjects."""
2
3			import collections
4			import itertools
5
6			import numpy as np
7			from scipy.sparse import csr_array, dok_array
8
9			SubjectSuggestion = collections.namedtuple("SubjectSuggestion", "subject_id score")
10			WeightedSuggestionsBatch = collections.namedtuple(
11			"WeightedSuggestionsBatch", "hit_sets weight subjects"
12			)
13
14
15			def vector_to_suggestions(vector, limit):
16			limit = min(len(vector), limit)
17			topk_idx = np.argpartition(vector, -limit)[-limit:]
18			return (
19			SubjectSuggestion(subject_id=idx, score=float(vector[idx])) for idx in topk_idx
20			)
21
22
23			def filter_suggestion(preds, limit=None, threshold=0.0):
24			"""filter a 2D sparse suggestion array (csr_array), retaining only the
25			top K suggestions with a score above or equal to the threshold for each
26			individual prediction; the rest will be left as zeros"""
27
28			filtered = dok_array(preds.shape, dtype=np.float32)
29			for row in range(preds.shape[0]):
30			arow = preds.getrow(row)
31			top_k = arow.data.argsort()[::-1]
32			if limit is not None:
33			top_k = top_k[:limit]
34			for idx in top_k:
35			val = arow.data[idx]
36			if val < threshold:
37			break
38			filtered[row, arow.indices[idx]] = val
39			return filtered.tocsr()
40
41
42			class SuggestionResult:
43			"""Suggestions for a single document, backed by a row of a sparse array."""
44
45			def __init__(self, array, idx):
46			self._array = array
47			self._idx = idx
48
49			def __iter__(self):
50			_, cols = self._array[[self._idx], :].nonzero()
51			suggestions = [
52			SubjectSuggestion(subject_id=col, score=float(self._array[self._idx, col]))
53			for col in cols
54			]
55			return iter(
56			sorted(suggestions, key=lambda suggestion: suggestion.score, reverse=True)
57			)
58
59			def as_vector(self):
60			return self._array[[self._idx], :].toarray()[0]
61
62			def __len__(self):
63			_, cols = self._array[[self._idx], :].nonzero()
64			return len(cols)
65
66
67			class SuggestionBatch:
68			"""Subject suggestions for a batch of documents."""
69
70			def __init__(self, array):
71			"""Create a new SuggestionBatch from a csr_array"""
72			assert isinstance(array, csr_array)
73			self.array = array
74
75			@classmethod
76			def from_sequence(cls, suggestion_results, subject_index, limit=None):
77			"""Create a new SuggestionBatch from a sequence where each item is
78			a sequence of SubjectSuggestion objects."""
79
80			deprecated = set(subject_index.deprecated_ids())
81
82			ar = dok_array((len(suggestion_results), len(subject_index)), dtype=np.float32)
83			for idx, result in enumerate(suggestion_results):
84			for suggestion in itertools.islice(result, limit):
85			if suggestion.subject_id in deprecated or suggestion.score <= 0.0:
86			continue
87			ar[idx, suggestion.subject_id] = min(suggestion.score, 1.0)
88			return cls(ar.tocsr())
89
90			def filter(self, limit=None, threshold=0.0):
91			"""Return a subset of the hits, filtered by the given limit and
92			score threshold, as another SuggestionBatch object."""
93
94			return SuggestionBatch(filter_suggestion(self.array, limit, threshold))
95
96			def __getitem__(self, idx):
97			if idx < 0 or idx >= len(self):
98			raise IndexError
99			return SuggestionResult(self.array, idx)
100
101			def __len__(self):
102			return self.array.shape[0]
103
104
105			class SuggestionResults:
106			"""Subject suggestions for a potentially very large number of documents."""
107
108			def __init__(self, batches):
109			"""Initialize a new SuggestionResults from an iterable that provides
110			SuggestionBatch objects."""
111
112			self.batches = batches
113
114			def filter(self, limit=None, threshold=0.0):
115			"""Return a view of these suggestions, filtered by the given limit
116			and/or threshold, as another SuggestionResults object."""
117
118			return SuggestionResults(
119			(batch.filter(limit, threshold) for batch in self.batches)
120			)
121
122			def __iter__(self):
123			return iter(itertools.chain.from_iterable(self.batches))
124

NatLibFi / Annif

Push — issue678-refactor-suggestionre... ( 17f52a...cc0b6b )

annif.suggestion.vector_to_suggestions() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like