Passed: Pull Request — main (#681) by Osma (05:47 queued, 03:01 created)

annif.eval.dcg_score() (rated A)

Complexity:  Conditions 2
Size:        Total Lines 12, Code Lines 9
Duplication: Lines 0, Ratio 0 %
Importance:  Changes 0

Metric  Value
eloc    9
dl      0
loc     12
rs      9.95
c       0
b       0
f       0
cc      2
nop     3
"""Evaluation metrics for Annif"""

import statistics
import warnings

import numpy as np
import scipy.sparse
from sklearn.metrics import (
    f1_score,
    label_ranking_average_precision_score,
    precision_score,
    recall_score,
)

from annif.exception import NotSupportedException
from annif.suggestion import SuggestionBatch


def filter_pred_top_k(preds, limit):
    """filter a 2D prediction matrix, retaining only the top K suggestions
    for each individual prediction; the rest will be set to zeros"""

    filtered = scipy.sparse.dok_array(preds.shape, dtype=np.float32)
    for row in range(preds.shape[0]):
        ar = preds.getrow(row).toarray()[0]
        top_k = np.argsort(ar)[::-1][:limit]
        for col in top_k:
            filtered[row, col] = preds[row, col]
    return filtered.tocsr()
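
# Illustrative example (editor's sketch, not part of the original module),
# assuming a csr_matrix input, which supports the getrow() call used above:
#
#     preds = scipy.sparse.csr_matrix([[0.1, 0.9, 0.0, 0.5]])
#     filter_pred_top_k(preds, 2).toarray()
#     # -> [[0. , 0.9, 0. , 0.5]]  (only the two highest scores are kept)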


def true_positives(y_true, y_pred):
    """calculate the number of true positives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return int((y_true.multiply(y_pred)).sum())


def false_positives(y_true, y_pred):
    """calculate the number of false positives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return int((y_true < y_pred).sum())


def false_negatives(y_true, y_pred):
    """calculate the number of false negatives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return int((y_true > y_pred).sum())
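
# Illustrative note (not part of the original module): with boolean arrays,
# y_true < y_pred is True exactly where the gold value is False but the
# prediction is True (a false positive), and y_true > y_pred marks the false
# negatives. For y_true = [1, 0, 1, 0] and y_pred = [1, 1, 0, 0] the three
# helpers above give 1 true positive, 1 false positive and 1 false negative.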


def precision_at_k_score(y_true, y_pred, limit):
    """calculate the precision at K, i.e. the fraction of relevant items
    among the top K predicted ones"""
    scores = []
    for true, pred in zip(y_true, y_pred):
        order = pred.argsort()[::-1]
        orderlimit = min(limit, np.count_nonzero(pred))
        order = order[:orderlimit]
        gain = true[order]
        if orderlimit > 0:
            scores.append(gain.sum() / orderlimit)
        else:
            scores.append(0.0)
    return statistics.mean(scores)
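
# Illustrative example (not part of the original module): with gold labels
# [1, 0, 1, 0] and predicted scores [0.9, 0.8, 0.1, 0.0], the top-2
# predictions are subjects 0 and 1, of which only subject 0 is relevant, so
# precision@2 for this document is 0.5. Note that K is capped at the number
# of nonzero prediction scores.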


def dcg_score(y_true, y_pred, limit=None):
    """return the discounted cumulative gain (DCG) score for the selected
    labels vs. relevant labels"""
    order = y_pred.argsort()[::-1]
    n_pred = np.count_nonzero(y_pred)
    if limit is not None:
        n_pred = min(limit, n_pred)
    order = order[:n_pred]
    gain = y_true[order]
    discount = np.log2(np.arange(order.size) + 2)

    return (gain / discount).sum()
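
# Illustrative example (not part of the original module): if the labels,
# taken in decreasing order of predicted score, have relevance [1, 0, 1],
# the discounts are log2(2), log2(3) and log2(4), so
# DCG = 1/1.0 + 0/1.585 + 1/2.0 = 1.5.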


def ndcg_score(y_true, y_pred, limit=None):
    """return the normalized discounted cumulative gain (nDCG) score for the
    selected labels vs. relevant labels"""
    scores = []
    for true, pred in zip(y_true, y_pred):
        idcg = dcg_score(true, true, limit)
        dcg = dcg_score(true, pred, limit)
        if idcg > 0:
            scores.append(dcg / idcg)
        else:
            scores.append(1.0)  # perfect score for no relevant hits case
    return statistics.mean(scores)
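
# Illustrative example (not part of the original module): continuing the DCG
# example above, the ideal ordering would place both relevant labels first,
# giving IDCG = 1/log2(2) + 1/log2(3) ≈ 1.631 and thus
# nDCG = 1.5 / 1.631 ≈ 0.92 for that document.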


class EvaluationBatch:
    """A class for evaluating batches of results using all available metrics.
    Suggestions and gold-standard subjects are added in batches with the
    evaluate_many() method, and the final results can be queried using the
    results() method."""

    def __init__(self, subject_index):
        self._subject_index = subject_index
        self._suggestion_arrays = []
        self._gold_subject_arrays = []

    def evaluate_many(self, suggestion_batch, gold_subject_batch):
        if not isinstance(suggestion_batch, SuggestionBatch):
            suggestion_batch = SuggestionBatch(
                suggestion_batch, len(self._subject_index)
            )
        self._suggestion_arrays.append(suggestion_batch.array)

        # convert gold_subject_batch to sparse matrix
        ar = scipy.sparse.dok_array(
            (len(gold_subject_batch), len(self._subject_index)), dtype=bool
        )
        for idx, subject_set in enumerate(gold_subject_batch):
            for subject_id in subject_set:
                ar[idx, subject_id] = True
        self._gold_subject_arrays.append(ar.tocsr())
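
        # Illustrative note (not part of the original module): judging from the
        # loop above, gold_subject_batch is expected to be a sequence of
        # subject-id collections, one per document, e.g. [{1, 5}, {2}] for a
        # two-document batch; each listed id becomes True in the corresponding
        # row of the sparse gold-standard matrix.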

    def _evaluate_samples(self, y_true, y_pred, metrics=[]):
        y_pred_binary = y_pred > 0.0
        # dense versions of sparse arrays, for functions that need them
        # FIXME: conversion to dense arrays should be avoided
        y_pred_binary_dense = y_pred_binary.toarray()
        y_pred_dense = y_pred.toarray()
        y_true_dense = y_true.toarray()

        # define the available metrics as lazy lambda functions
        # so we can execute only the ones actually requested
        all_metrics = {
            "Precision (doc avg)": lambda: precision_score(
                y_true, y_pred_binary, average="samples"
            ),
            "Recall (doc avg)": lambda: recall_score(
                y_true, y_pred_binary, average="samples"
            ),
            "F1 score (doc avg)": lambda: f1_score(
                y_true, y_pred_binary, average="samples"
            ),
            "Precision (subj avg)": lambda: precision_score(
                y_true, y_pred_binary, average="macro"
            ),
            "Recall (subj avg)": lambda: recall_score(
                y_true, y_pred_binary, average="macro"
            ),
            "F1 score (subj avg)": lambda: f1_score(
                y_true, y_pred_binary, average="macro"
            ),
            "Precision (weighted subj avg)": lambda: precision_score(
                y_true, y_pred_binary, average="weighted"
            ),
            "Recall (weighted subj avg)": lambda: recall_score(
                y_true, y_pred_binary, average="weighted"
            ),
            "F1 score (weighted subj avg)": lambda: f1_score(
                y_true, y_pred_binary, average="weighted"
            ),
            "Precision (microavg)": lambda: precision_score(
                y_true, y_pred_binary, average="micro"
            ),
            "Recall (microavg)": lambda: recall_score(
                y_true, y_pred_binary, average="micro"
            ),
            "F1 score (microavg)": lambda: f1_score(
                y_true, y_pred_binary, average="micro"
            ),
            "F1@5": lambda: f1_score(
                y_true, filter_pred_top_k(y_pred, 5) > 0.0, average="samples"
            ),
            "NDCG": lambda: ndcg_score(y_true_dense, y_pred_dense),
            "NDCG@5": lambda: ndcg_score(y_true_dense, y_pred_dense, limit=5),
            "NDCG@10": lambda: ndcg_score(y_true_dense, y_pred_dense, limit=10),
            "Precision@1": lambda: precision_at_k_score(
                y_true_dense, y_pred_dense, limit=1
            ),
            "Precision@3": lambda: precision_at_k_score(
                y_true_dense, y_pred_dense, limit=3
            ),
            "Precision@5": lambda: precision_at_k_score(
                y_true_dense, y_pred_dense, limit=5
            ),
            "LRAP": lambda: label_ranking_average_precision_score(y_true, y_pred_dense),
            "True positives": lambda: true_positives(y_true, y_pred_binary),
            "False positives": lambda: false_positives(y_true, y_pred_binary),
            "False negatives": lambda: false_negatives(y_true, y_pred_binary),
        }

        if not metrics:
            metrics = all_metrics.keys()

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            return {metric: all_metrics[metric]() for metric in metrics}
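
        # Illustrative note (not part of the original module): because the
        # metrics are wrapped in lambdas, a call such as
        # self._evaluate_samples(y_true, y_pred, ["NDCG", "F1@5"]) evaluates
        # only those two entries; the remaining metrics are never computed.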

    def _result_per_subject_header(self, results_file):
        print(
            "\t".join(
                [
                    "URI",
                    "Label",
                    "Support",
                    "True_positives",
                    "False_positives",
                    "False_negatives",
                    "Precision",
                    "Recall",
                    "F1_score",
                ]
            ),
            file=results_file,
        )

    def _result_per_subject_body(self, zipped_results, results_file):
        for row in zipped_results:
            print("\t".join((str(e) for e in row)), file=results_file)
        # Static-analysis note on the print() call above: "The variable e does
        # not seem to be defined in case the for loop is not entered." The name
        # e is bound inside the generator expression itself, so this appears to
        # be a false positive.

    def output_result_per_subject(self, y_true, y_pred, results_file, language):
        """Write results per subject (non-aggregated)
        to the output file results_file, using labels in the given language"""

        # FIXME: conversion to dense arrays should be avoided
        y_pred = y_pred.T.toarray() > 0.0
        y_true = y_true.T.toarray() > 0.0

        true_pos = y_true & y_pred
        false_pos = ~y_true & y_pred
        false_neg = y_true & ~y_pred

        r = len(y_true)

        zipped = zip(
            [subj.uri for subj in self._subject_index],  # URI
            [subj.labels[language] for subj in self._subject_index],  # Label
            np.sum((true_pos + false_neg), axis=1),  # Support
            np.sum(true_pos, axis=1),  # True_positives
            np.sum(false_pos, axis=1),  # False_positives
            np.sum(false_neg, axis=1),  # False_negatives
            [
                precision_score(y_true[i], y_pred[i], zero_division=0) for i in range(r)
            ],  # Precision
            [
                recall_score(y_true[i], y_pred[i], zero_division=0) for i in range(r)
            ],  # Recall
            [f1_score(y_true[i], y_pred[i], zero_division=0) for i in range(r)],
        )  # F1
        self._result_per_subject_header(results_file)
        self._result_per_subject_body(zipped, results_file)

    def results(self, metrics=[], results_file=None, language=None):
        """Evaluate a set of selected subjects against a gold standard using
        different metrics. If metrics is empty, use all available metrics.
        If results_file (a file object) is given, write per-subject results to
        it with labels expressed in the given language."""

        if not self._suggestion_arrays:
            raise NotSupportedException("cannot evaluate empty corpus")

        y_pred = scipy.sparse.vstack(self._suggestion_arrays)
        y_true = scipy.sparse.vstack(self._gold_subject_arrays)

        results = self._evaluate_samples(y_true, y_pred, metrics)
        results["Documents evaluated"] = int(y_true.shape[0])

        if results_file:
            self.output_result_per_subject(y_true, y_pred, results_file, language)
        return results
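
The module-level metric helpers can also be called directly on dense NumPy arrays, which is how _evaluate_samples uses them internally. A minimal usage sketch (the array values are invented for illustration only):

import numpy as np

from annif.eval import ndcg_score, precision_at_k_score

# gold-standard labels and predicted scores for two documents over four subjects
y_true = np.array([[1, 0, 1, 0], [0, 1, 0, 0]], dtype=float)
y_pred = np.array([[0.9, 0.8, 0.1, 0.0], [0.2, 0.7, 0.0, 0.3]], dtype=float)

print(ndcg_score(y_true, y_pred))                     # nDCG over all nonzero predictions
print(ndcg_score(y_true, y_pred, limit=5))            # NDCG@5
print(precision_at_k_score(y_true, y_pred, limit=1))  # Precision@1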