Passed
Pull Request — main (#681)
by Osma

annif.eval.EvaluationBatch.evaluate()   A

Complexity
    Conditions    1

Size
    Total Lines   2
    Code Lines    2

Duplication
    Lines         0
    Ratio         0 %

Importance
    Changes       0

Metric   Value
eloc     2
dl       0
loc      2
rs       10
c        0
b        0
f        0
cc       1
nop      3
"""Evaluation metrics for Annif"""

import statistics
import warnings

import numpy as np
import scipy.sparse
from sklearn.metrics import (
    f1_score,
    label_ranking_average_precision_score,
    precision_score,
    recall_score,
)

from annif.exception import NotSupportedException
from annif.suggestion import SuggestionBatch

def filter_pred_top_k(preds, limit):
    """filter a 2D prediction vector, retaining only the top K suggestions
    for each individual prediction; the rest will be set to zeros"""

    filtered = scipy.sparse.dok_array(preds.shape, dtype=np.float32)
    for row in range(preds.shape[0]):
        ar = preds.getrow(row).toarray()[0]
        top_k = np.argsort(ar)[::-1][:limit]
        for col in top_k:
            filtered[row, col] = preds[row, col]
    return filtered.tocsr()

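A minimal usage sketch of filter_pred_top_k (not part of the module; the score matrix below is made up, and csr_matrix is used simply because it supports getrow()):

    import numpy as np
    import scipy.sparse
    from annif.eval import filter_pred_top_k

    preds = scipy.sparse.csr_matrix(
        np.array([[0.1, 0.9, 0.4, 0.0],
                  [0.7, 0.0, 0.2, 0.5]], dtype=np.float32)
    )
    top2 = filter_pred_top_k(preds, 2)
    # each row of top2 now keeps only its two highest scores
    # (0.9 and 0.4; 0.7 and 0.5); every other entry is zero
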
def true_positives(y_true, y_pred):
    """calculate the number of true positives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return int((y_true & y_pred).sum())


def false_positives(y_true, y_pred):
    """calculate the number of false positives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return int((~y_true & y_pred).sum())


def false_negatives(y_true, y_pred):
    """calculate the number of false negatives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return int((y_true & ~y_pred).sum())

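For example, with a single document whose gold labels are {0, 2} and whose predicted labels are {0, 1}, each of the three counts comes out as one (illustrative values only):

    import numpy as np
    from annif.eval import true_positives, false_positives, false_negatives

    y_true = np.array([[True, False, True, False]])
    y_pred = np.array([[True, True, False, False]])
    true_positives(y_true, y_pred)   # 1 (label 0 predicted and relevant)
    false_positives(y_true, y_pred)  # 1 (label 1 predicted but not relevant)
    false_negatives(y_true, y_pred)  # 1 (label 2 relevant but not predicted)
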
def precision_at_k_score(y_true, y_pred, limit):
    """calculate the precision at K, i.e. the number of relevant items
    among the top K predicted ones"""
    scores = []
    for true, pred in zip(y_true, y_pred):
        order = pred.argsort()[::-1]
        orderlimit = min(limit, np.count_nonzero(pred))
        order = order[:orderlimit]
        gain = true[order]
        if orderlimit > 0:
            scores.append(gain.sum() / orderlimit)
        else:
            scores.append(0.0)
    return statistics.mean(scores)

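A small worked example (made-up scores): with relevant labels 0 and 2 and a ranking that puts labels 0 and 1 on top, precision at 2 is 0.5.

    import numpy as np
    from annif.eval import precision_at_k_score

    y_true = np.array([[1, 0, 1, 0]])          # labels 0 and 2 are relevant
    y_pred = np.array([[0.9, 0.8, 0.1, 0.0]])  # ranking: 0, 1, 2 (zero scores are ignored)
    precision_at_k_score(y_true, y_pred, limit=2)  # 1 relevant among the top 2 -> 0.5
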
def dcg_score(y_true, y_pred, limit=None):
    """return the discounted cumulative gain (DCG) score for the selected
    labels vs. relevant labels"""
    order = y_pred.argsort()[::-1]
    n_pred = np.count_nonzero(y_pred)
    if limit is not None:
        n_pred = min(limit, n_pred)
    order = order[:n_pred]
    gain = y_true[order]
    discount = np.log2(np.arange(order.size) + 2)

    return (gain / discount).sum()


def ndcg_score(y_true, y_pred, limit=None):
    """return the normalized discounted cumulative gain (nDCG) score for the
    selected labels vs. relevant labels"""
    scores = []
    for true, pred in zip(y_true, y_pred):
        idcg = dcg_score(true, true, limit)
        dcg = dcg_score(true, pred, limit)
        if idcg > 0:
            scores.append(dcg / idcg)
        else:
            scores.append(1.0)  # perfect score for no relevant hits case
    return statistics.mean(scores)

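A worked example of the same computation (made-up values; each ranked position i is discounted by log2(i + 2)):

    import numpy as np
    from annif.eval import dcg_score, ndcg_score

    y_true = np.array([1, 0, 1, 0])          # labels 0 and 2 are relevant
    y_pred = np.array([0.9, 0.8, 0.1, 0.0])  # only the three nonzero scores are ranked

    dcg_score(y_true, y_pred)       # 1/log2(2) + 0/log2(3) + 1/log2(4) = 1.5
    dcg_score(y_true, y_true)       # ideal ordering: 1/log2(2) + 1/log2(3) ≈ 1.63
    ndcg_score([y_true], [y_pred])  # ≈ 1.5 / 1.63 ≈ 0.92
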
class EvaluationBatch:
    """A class for evaluating batches of results using all available metrics.
    The evaluate() method is called once per document in the batch or evaluate_many()
    for a list of documents of the batch. Final results can be queried using the
    results() method."""

    def __init__(self, subject_index):
        self._subject_index = subject_index
        self._suggestion_arrays = []
        self._gold_subject_arrays = []

    def evaluate_many(self, suggestion_batch, gold_subject_batch):
        if not isinstance(suggestion_batch, SuggestionBatch):
            suggestion_batch = SuggestionBatch(
                suggestion_batch, len(self._subject_index)
            )
        self._suggestion_arrays.append(suggestion_batch.array)

        # convert gold_subject_batch to sparse matrix
        ar = scipy.sparse.dok_array(
            (len(gold_subject_batch), len(self._subject_index)), dtype=bool
        )
        for idx, subject_set in enumerate(gold_subject_batch):
            for subject_id in subject_set:
                ar[idx, subject_id] = True
        self._gold_subject_arrays.append(ar.tocsr())

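For example (illustrative values, four-subject vocabulary): gold_subject_batch = [{1, 3}, {0}] is converted into a 2 x 4 boolean matrix with True at (0, 1), (0, 3) and (1, 0); the suggestion and gold matrices are only stacked and compared later, in results().
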
    def _evaluate_samples(self, y_true, y_pred, metrics=[]):
        y_pred_binary = y_pred > 0.0
        # dense versions of sparse arrays, for functions that need them
        # FIXME: conversion to dense arrays should be avoided
        y_pred_binary_dense = y_pred_binary.toarray()
        y_pred_dense = y_pred.toarray()
        y_true_dense = y_true.toarray()

        # define the available metrics as lazy lambda functions
        # so we can execute only the ones actually requested
        all_metrics = {
            "Precision (doc avg)": lambda: precision_score(
                y_true, y_pred_binary, average="samples"
            ),
            "Recall (doc avg)": lambda: recall_score(
                y_true, y_pred_binary, average="samples"
            ),
            "F1 score (doc avg)": lambda: f1_score(
                y_true, y_pred_binary, average="samples"
            ),
            "Precision (subj avg)": lambda: precision_score(
                y_true, y_pred_binary, average="macro"
            ),
            "Recall (subj avg)": lambda: recall_score(
                y_true, y_pred_binary, average="macro"
            ),
            "F1 score (subj avg)": lambda: f1_score(
                y_true, y_pred_binary, average="macro"
            ),
            "Precision (weighted subj avg)": lambda: precision_score(
                y_true, y_pred_binary, average="weighted"
            ),
            "Recall (weighted subj avg)": lambda: recall_score(
                y_true, y_pred_binary, average="weighted"
            ),
            "F1 score (weighted subj avg)": lambda: f1_score(
                y_true, y_pred_binary, average="weighted"
            ),
            "Precision (microavg)": lambda: precision_score(
                y_true, y_pred_binary, average="micro"
            ),
            "Recall (microavg)": lambda: recall_score(
                y_true, y_pred_binary, average="micro"
            ),
            "F1 score (microavg)": lambda: f1_score(
                y_true, y_pred_binary, average="micro"
            ),
            "F1@5": lambda: f1_score(
                y_true, filter_pred_top_k(y_pred, 5) > 0.0, average="samples"
            ),
            "NDCG": lambda: ndcg_score(y_true_dense, y_pred_dense),
            "NDCG@5": lambda: ndcg_score(y_true_dense, y_pred_dense, limit=5),
            "NDCG@10": lambda: ndcg_score(y_true_dense, y_pred_dense, limit=10),
            "Precision@1": lambda: precision_at_k_score(
                y_true_dense, y_pred_dense, limit=1
            ),
            "Precision@3": lambda: precision_at_k_score(
                y_true_dense, y_pred_dense, limit=3
            ),
            "Precision@5": lambda: precision_at_k_score(
                y_true_dense, y_pred_dense, limit=5
            ),
            "LRAP": lambda: label_ranking_average_precision_score(y_true, y_pred_dense),
            "True positives": lambda: true_positives(y_true_dense, y_pred_binary_dense),
            "False positives": lambda: false_positives(
                y_true_dense, y_pred_binary_dense
            ),
            "False negatives": lambda: false_negatives(
                y_true_dense, y_pred_binary_dense
            ),
        }

        if not metrics:
            metrics = all_metrics.keys()

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            return {metric: all_metrics[metric]() for metric in metrics}

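Because every metric above is wrapped in a lazy lambda, only the entries actually requested are evaluated: a call such as results(metrics=["NDCG", "F1@5"]) computes exactly those two scores (plus the "Documents evaluated" count added in results()) and never touches the other lambdas.
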
    def _result_per_subject_header(self, results_file):
        print(
            "\t".join(
                [
                    "URI",
                    "Label",
                    "Support",
                    "True_positives",
                    "False_positives",
                    "False_negatives",
                    "Precision",
                    "Recall",
                    "F1_score",
                ]
            ),
            file=results_file,
        )

    def _result_per_subject_body(self, zipped_results, results_file):
        for row in zipped_results:
            print("\t".join((str(e) for e in row)), file=results_file)

Ignored issue: The variable e does not seem to be defined in case the for loop in _result_per_subject_body is not entered. Are you sure this can never be the case?

    def output_result_per_subject(self, y_true, y_pred, results_file, language):
        """Write results per subject (non-aggregated)
        to outputfile results_file, using labels in the given language"""

        # FIXME: conversion to dense arrays should be avoided
        y_pred = y_pred.T.toarray() > 0.0
        y_true = y_true.T.toarray() > 0.0

        true_pos = y_true & y_pred
        false_pos = ~y_true & y_pred
        false_neg = y_true & ~y_pred

        r = len(y_true)

        zipped = zip(
            [subj.uri for subj in self._subject_index],  # URI
            [subj.labels[language] for subj in self._subject_index],  # Label
            np.sum((true_pos + false_neg), axis=1),  # Support
            np.sum(true_pos, axis=1),  # True_positives
            np.sum(false_pos, axis=1),  # False_positives
            np.sum(false_neg, axis=1),  # False_negatives
            [
                precision_score(y_true[i], y_pred[i], zero_division=0) for i in range(r)
            ],  # Precision
            [
                recall_score(y_true[i], y_pred[i], zero_division=0) for i in range(r)
            ],  # Recall
            [f1_score(y_true[i], y_pred[i], zero_division=0) for i in range(r)],
        )  # F1
        self._result_per_subject_header(results_file)
        self._result_per_subject_body(zipped, results_file)

    def results(self, metrics=[], results_file=None, language=None):
        """evaluate a set of selected subjects against a gold standard using
        different metrics. If metrics is empty, use all available metrics.
        If results_file (file object) given, write results per subject to it
        with labels expressed in the given language."""

        if not self._suggestion_arrays:
            raise NotSupportedException("cannot evaluate empty corpus")

        y_pred = scipy.sparse.vstack(self._suggestion_arrays)
        y_true = scipy.sparse.vstack(self._gold_subject_arrays)

        results = self._evaluate_samples(y_true, y_pred, metrics)
        results["Documents evaluated"] = int(y_true.shape[0])

        if results_file:
            self.output_result_per_subject(y_true, y_pred, results_file, language)
        return results
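
A minimal end-to-end sketch of how the class is driven (not taken from Annif's own code; subject_index, suggestions and gold_sets stand in for a SubjectIndex, a SuggestionBatch produced by a project, and the matching gold-standard subject sets):

    from annif.eval import EvaluationBatch

    batch = EvaluationBatch(subject_index)
    batch.evaluate_many(suggestions, gold_sets)

    all_results = batch.results()        # every available metric
    some_results = batch.results(
        metrics=["Precision (doc avg)", "NDCG", "F1@5"]
    )                                    # only selected metrics
    print(some_results["NDCG"])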