Passed
Pull Request — main (#681)
by Osma
05:15 (queued 02:40)

annif.eval.EvaluationBatch.evaluate() (rated A)

Complexity
    Conditions: 1

Size
    Total Lines: 2
    Code Lines: 2

Duplication
    Lines: 0
    Ratio: 0 %

Importance
    Changes: 0

Metric  Value
eloc    2
dl      0
loc     2
rs      10
c       0
b       0
f       0
cc      1
nop     3
"""Evaluation metrics for Annif"""

import statistics
import warnings

import numpy as np
import scipy.sparse
from sklearn.metrics import (
    f1_score,
    label_ranking_average_precision_score,
    precision_score,
    recall_score,
)

from annif.exception import NotSupportedException
from annif.suggestion import SuggestionBatch


def filter_pred_top_k(preds, limit):
    """filter a 2D prediction vector, retaining only the top K suggestions
    for each individual prediction; the rest will be set to zeros"""

    filtered = scipy.sparse.dok_array(preds.shape, dtype=np.float32)
    for row in range(preds.shape[0]):
        ar = preds.getrow(row).toarray()[0]
        top_k = np.argsort(ar)[::-1][:limit]
        for col in top_k:
            filtered[row, col] = preds[row, col]
    return filtered.tocsr()

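For illustration only (not part of the PR): a minimal sketch with a hand-built CSR input of two documents and four candidate subjects, filtered to the top two suggestions per document. It assumes the module above is importable as annif.eval.

import numpy as np
import scipy.sparse

from annif.eval import filter_pred_top_k

# two documents, four candidate subjects; the scores are arbitrary
preds = scipy.sparse.csr_matrix(
    np.array([[0.9, 0.1, 0.5, 0.0], [0.2, 0.8, 0.0, 0.4]], dtype=np.float32)
)
print(filter_pred_top_k(preds, 2).toarray())
# only the two highest-scoring cells of each row survive, roughly:
# [[0.9 0.  0.5 0. ]
#  [0.  0.8 0.  0.4]]
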
def true_positives(y_true, y_pred):
    """calculate the number of true positives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return int((y_true.multiply(y_pred)).sum())


def false_positives(y_true, y_pred):
    """calculate the number of false positives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return int((y_true < y_pred).sum())


def false_negatives(y_true, y_pred):
    """calculate the number of false negatives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return int((y_true > y_pred).sum())

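A quick sanity check of the three counters on a single document, added purely as an illustration; the sparse boolean inputs mimic what _evaluate_samples() passes in.

import numpy as np
import scipy.sparse

from annif.eval import false_negatives, false_positives, true_positives

y_true = scipy.sparse.csr_matrix(np.array([[True, False, True, False]]))
y_pred = scipy.sparse.csr_matrix(np.array([[True, True, False, False]]))
print(true_positives(y_true, y_pred))   # 1 (subject 0 predicted and relevant)
print(false_positives(y_true, y_pred))  # 1 (subject 1 predicted but not relevant)
print(false_negatives(y_true, y_pred))  # 1 (subject 2 relevant but not predicted)
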
def precision_at_k_score(y_true, y_pred, limit):
    """calculate the precision at K, i.e. the number of relevant items
    among the top K predicted ones"""
    scores = []
    for true, pred in zip(y_true, y_pred):
        order = pred.argsort()[::-1]
        orderlimit = min(limit, np.count_nonzero(pred))
        order = order[:orderlimit]
        gain = true[order]
        if orderlimit > 0:
            scores.append(gain.sum() / orderlimit)
        else:
            scores.append(0.0)
    return statistics.mean(scores)

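For example, with two documents, four subjects and limit=3 (dense arrays, as _evaluate_samples() passes them), the first document scores 2/3 and the second 0, so the result is their mean. Illustration only; the numbers are made up.

import numpy as np

from annif.eval import precision_at_k_score

y_true = np.array([[1, 0, 1, 0], [0, 1, 0, 0]])
y_pred = np.array([[0.9, 0.8, 0.1, 0.0], [0.3, 0.0, 0.2, 0.1]])
print(precision_at_k_score(y_true, y_pred, limit=3))  # (2/3 + 0/3) / 2 ≈ 0.33
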
def dcg_score(y_true, y_pred, limit=None):
    """return the discounted cumulative gain (DCG) score for the selected
    labels vs. relevant labels"""
    order = y_pred.argsort()[::-1]
    n_pred = np.count_nonzero(y_pred)
    if limit is not None:
        n_pred = min(limit, n_pred)
    order = order[:n_pred]
    gain = y_true[order]
    discount = np.log2(np.arange(order.size) + 2)

    return (gain / discount).sum()

def ndcg_score(y_true, y_pred, limit=None):
    """return the normalized discounted cumulative gain (nDCG) score for the
    selected labels vs. relevant labels"""
    scores = []
    for true, pred in zip(y_true, y_pred):
        idcg = dcg_score(true, true, limit)
        dcg = dcg_score(true, pred, limit)
        if idcg > 0:
            scores.append(dcg / idcg)
        else:
            scores.append(1.0)  # perfect score for no relevant hits case
    return statistics.mean(scores)

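Worked through by hand for the first document of the precision@K example above: the predicted ranking is subjects 0, 1, 2 (only the three non-zero scores count), so DCG = 1/log2(2) + 0/log2(3) + 1/log2(4) = 1.5, the ideal ordering gives IDCG = 1/log2(2) + 1/log2(3) ≈ 1.63, and nDCG ≈ 0.92. The snippet below reproduces that; illustration only.

import numpy as np

from annif.eval import ndcg_score

y_true = np.array([[1, 0, 1, 0]])
y_pred = np.array([[0.9, 0.8, 0.1, 0.0]])
print(ndcg_score(y_true, y_pred))  # ≈ 0.92
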
class EvaluationBatch:
    """A class for evaluating batches of results using all available metrics.
    The evaluate() method is called once per document in the batch or evaluate_many()
    for a list of documents of the batch. Final results can be queried using the
    results() method."""

    def __init__(self, subject_index):
        self._subject_index = subject_index
        self._suggestion_arrays = []
        self._gold_subject_arrays = []

    def evaluate_many(self, suggestion_batch, gold_subject_batch):
        if not isinstance(suggestion_batch, SuggestionBatch):
            suggestion_batch = SuggestionBatch(
                suggestion_batch, len(self._subject_index)
            )
        self._suggestion_arrays.append(suggestion_batch.array)

        # convert gold_subject_batch to sparse matrix
        ar = scipy.sparse.dok_array(
            (len(gold_subject_batch), len(self._subject_index)), dtype=bool
        )
        for idx, subject_set in enumerate(gold_subject_batch):
            for subject_id in subject_set:
                ar[idx, subject_id] = True
        self._gold_subject_arrays.append(ar.tocsr())

    def _evaluate_samples(self, y_true, y_pred, metrics=[]):
        y_pred_binary = y_pred > 0.0
        # dense versions of sparse arrays, for functions that need them
        # FIXME: conversion to dense arrays should be avoided
        y_pred_dense = y_pred.toarray()
        y_true_dense = y_true.toarray()

        # define the available metrics as lazy lambda functions
        # so we can execute only the ones actually requested
        all_metrics = {
            "Precision (doc avg)": lambda: precision_score(
                y_true, y_pred_binary, average="samples"
            ),
            "Recall (doc avg)": lambda: recall_score(
                y_true, y_pred_binary, average="samples"
            ),
            "F1 score (doc avg)": lambda: f1_score(
                y_true, y_pred_binary, average="samples"
            ),
            "Precision (subj avg)": lambda: precision_score(
                y_true, y_pred_binary, average="macro"
            ),
            "Recall (subj avg)": lambda: recall_score(
                y_true, y_pred_binary, average="macro"
            ),
            "F1 score (subj avg)": lambda: f1_score(
                y_true, y_pred_binary, average="macro"
            ),
            "Precision (weighted subj avg)": lambda: precision_score(
                y_true, y_pred_binary, average="weighted"
            ),
            "Recall (weighted subj avg)": lambda: recall_score(
                y_true, y_pred_binary, average="weighted"
            ),
            "F1 score (weighted subj avg)": lambda: f1_score(
                y_true, y_pred_binary, average="weighted"
            ),
            "Precision (microavg)": lambda: precision_score(
                y_true, y_pred_binary, average="micro"
            ),
            "Recall (microavg)": lambda: recall_score(
                y_true, y_pred_binary, average="micro"
            ),
            "F1 score (microavg)": lambda: f1_score(
                y_true, y_pred_binary, average="micro"
            ),
            "F1@5": lambda: f1_score(
                y_true, filter_pred_top_k(y_pred, 5) > 0.0, average="samples"
            ),
            "NDCG": lambda: ndcg_score(y_true_dense, y_pred_dense),
            "NDCG@5": lambda: ndcg_score(y_true_dense, y_pred_dense, limit=5),
            "NDCG@10": lambda: ndcg_score(y_true_dense, y_pred_dense, limit=10),
            "Precision@1": lambda: precision_at_k_score(
                y_true_dense, y_pred_dense, limit=1
            ),
            "Precision@3": lambda: precision_at_k_score(
                y_true_dense, y_pred_dense, limit=3
            ),
            "Precision@5": lambda: precision_at_k_score(
                y_true_dense, y_pred_dense, limit=5
            ),
            "LRAP": lambda: label_ranking_average_precision_score(y_true, y_pred_dense),
            "True positives": lambda: true_positives(y_true, y_pred_binary),
            "False positives": lambda: false_positives(y_true, y_pred_binary),
            "False negatives": lambda: false_negatives(y_true, y_pred_binary),
        }

        if not metrics:
            metrics = all_metrics.keys()

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            return {metric: all_metrics[metric]() for metric in metrics}

    def _result_per_subject_header(self, results_file):
        print(
            "\t".join(
                [
                    "URI",
                    "Label",
                    "Support",
                    "True_positives",
                    "False_positives",
                    "False_negatives",
                    "Precision",
                    "Recall",
                    "F1_score",
                ]
            ),
            file=results_file,
        )

    def _result_per_subject_body(self, zipped_results, results_file):
        for row in zipped_results:
            print("\t".join((str(e) for e in row)), file=results_file)

    def output_result_per_subject(self, y_true, y_pred, results_file, language):
        """Write results per subject (non-aggregated)
        to outputfile results_file, using labels in the given language"""

        # FIXME: conversion to dense arrays should be avoided
        y_pred = y_pred.T.toarray() > 0.0
        y_true = y_true.T.toarray() > 0.0

        true_pos = y_true & y_pred
        false_pos = ~y_true & y_pred
        false_neg = y_true & ~y_pred

        r = len(y_true)

        zipped = zip(
            [subj.uri for subj in self._subject_index],  # URI
            [subj.labels[language] for subj in self._subject_index],  # Label
            np.sum((true_pos + false_neg), axis=1),  # Support
            np.sum(true_pos, axis=1),  # True_positives
            np.sum(false_pos, axis=1),  # False_positives
            np.sum(false_neg, axis=1),  # False_negatives
            [
                precision_score(y_true[i], y_pred[i], zero_division=0) for i in range(r)
            ],  # Precision
            [
                recall_score(y_true[i], y_pred[i], zero_division=0) for i in range(r)
            ],  # Recall
            [f1_score(y_true[i], y_pred[i], zero_division=0) for i in range(r)],  # F1
        )
        self._result_per_subject_header(results_file)
        self._result_per_subject_body(zipped, results_file)

    def results(self, metrics=[], results_file=None, language=None):
        """evaluate a set of selected subjects against a gold standard using
        different metrics. If metrics is empty, use all available metrics.
        If results_file (file object) given, write results per subject to it
        with labels expressed in the given language."""

        if not self._suggestion_arrays:
            raise NotSupportedException("cannot evaluate empty corpus")

        y_pred = scipy.sparse.vstack(self._suggestion_arrays)
        y_true = scipy.sparse.vstack(self._gold_subject_arrays)

        results = self._evaluate_samples(y_true, y_pred, metrics)
        results["Documents evaluated"] = int(y_true.shape[0])

        if results_file:
            self.output_result_per_subject(y_true, y_pred, results_file, language)
        return results
268