Passed
Pull Request — main (#681) by Osma
13:56 (queued 10:59)

annif.eval.EvaluationBatch.results() — rating A

Complexity: Conditions 3
Size: Total Lines 18, Code Lines 10
Duplication: Lines 0, Ratio 0 %
Importance: Changes 0

Metric   Value
cc       3
eloc     10
nop      4
dl       0
loc      18
rs       9.9
c        0
b        0
f        0
"""Evaluation metrics for Annif"""

import statistics
import warnings

import numpy as np
import scipy.sparse
from sklearn.metrics import (
    f1_score,
    label_ranking_average_precision_score,
    precision_score,
    recall_score,
)

from annif.exception import NotSupportedException
from annif.suggestion import SuggestionBatch


def filter_pred_top_k(preds, limit):
    """filter a 2D prediction vector, retaining only the top K suggestions
    for each individual prediction; the rest will be set to zeros"""

    filtered = scipy.sparse.dok_array(preds.shape, dtype=np.float32)
    for row in range(preds.shape[0]):
        ar = preds.getrow(row).toarray()[0]
        top_k = np.argsort(ar)[::-1][:limit]
        for col in top_k:
            filtered[row, col] = preds[row, col]
    return filtered.tocsr()


def true_positives(y_true, y_pred):
    """calculate the number of true positives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return int((y_true.multiply(y_pred)).sum())


def false_positives(y_true, y_pred):
    """calculate the number of false positives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return int((y_true < y_pred).sum())


def false_negatives(y_true, y_pred):
    """calculate the number of false negatives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return int((y_true > y_pred).sum())


def precision_at_k_score(y_true, y_pred, limit):
    """calculate the precision at K, i.e. the number of relevant items
    among the top K predicted ones"""
    scores = []
    for true, pred in zip(y_true, y_pred):
        order = pred.argsort()[::-1]
        orderlimit = min(limit, np.count_nonzero(pred))
        order = order[:orderlimit]
        gain = true[order]
        if orderlimit > 0:
            scores.append(gain.sum() / orderlimit)
        else:
            scores.append(0.0)
    return statistics.mean(scores)


def dcg_score(y_true, y_pred, limit=None):
    """return the discounted cumulative gain (DCG) score for the selected
    labels vs. relevant labels"""
    order = y_pred.argsort()[::-1]
    n_pred = np.count_nonzero(y_pred)
    if limit is not None:
        n_pred = min(limit, n_pred)
    order = order[:n_pred]
    gain = y_true[order]
    discount = np.log2(np.arange(order.size) + 2)

    return (gain / discount).sum()


def ndcg_score(y_true, y_pred, limit=None):
    """return the normalized discounted cumulative gain (nDCG) score for the
    selected labels vs. relevant labels"""
    scores = []
    for true, pred in zip(y_true, y_pred):
        idcg = dcg_score(true, true, limit)
        dcg = dcg_score(true, pred, limit)
        if idcg > 0:
            scores.append(dcg / idcg)
        else:
            scores.append(1.0)  # perfect score for no relevant hits case
    return statistics.mean(scores)


class EvaluationBatch:
    """A class for evaluating batches of results using all available metrics.
    The evaluate() method is called once per document in the batch or evaluate_many()
    for a list of documents of the batch. Final results can be queried using the
    results() method."""

    def __init__(self, subject_index):
        self._subject_index = subject_index
        self._suggestion_arrays = []
        self._gold_subject_arrays = []

    def evaluate_many(self, suggestion_batch, gold_subject_batch):
        if not isinstance(suggestion_batch, SuggestionBatch):
            suggestion_batch = SuggestionBatch(
                suggestion_batch, len(self._subject_index)
            )
        self._suggestion_arrays.append(suggestion_batch.array)

        # convert gold_subject_batch to sparse matrix
        ar = scipy.sparse.dok_array(
            (len(gold_subject_batch), len(self._subject_index)), dtype=bool
        )
        for idx, subject_set in enumerate(gold_subject_batch):
            for subject_id in subject_set:
                ar[idx, subject_id] = True
        self._gold_subject_arrays.append(ar.tocsr())

    def _evaluate_samples(self, y_true, y_pred, metrics=[]):
        y_pred_binary = y_pred > 0.0
        # dense versions of sparse arrays, for functions that need them
        # FIXME: conversion to dense arrays should be avoided
        y_pred_dense = y_pred.toarray()
        y_true_dense = y_true.toarray()

        # define the available metrics as lazy lambda functions
        # so we can execute only the ones actually requested
        all_metrics = {
            "Precision (doc avg)": lambda: precision_score(
                y_true, y_pred_binary, average="samples"
            ),
            "Recall (doc avg)": lambda: recall_score(
                y_true, y_pred_binary, average="samples"
            ),
            "F1 score (doc avg)": lambda: f1_score(
                y_true, y_pred_binary, average="samples"
            ),
            "Precision (subj avg)": lambda: precision_score(
                y_true, y_pred_binary, average="macro"
            ),
            "Recall (subj avg)": lambda: recall_score(
                y_true, y_pred_binary, average="macro"
            ),
            "F1 score (subj avg)": lambda: f1_score(
                y_true, y_pred_binary, average="macro"
            ),
            "Precision (weighted subj avg)": lambda: precision_score(
                y_true, y_pred_binary, average="weighted"
            ),
            "Recall (weighted subj avg)": lambda: recall_score(
                y_true, y_pred_binary, average="weighted"
            ),
            "F1 score (weighted subj avg)": lambda: f1_score(
                y_true, y_pred_binary, average="weighted"
            ),
            "Precision (microavg)": lambda: precision_score(
                y_true, y_pred_binary, average="micro"
            ),
            "Recall (microavg)": lambda: recall_score(
                y_true, y_pred_binary, average="micro"
            ),
            "F1 score (microavg)": lambda: f1_score(
                y_true, y_pred_binary, average="micro"
            ),
            "F1@5": lambda: f1_score(
                y_true, filter_pred_top_k(y_pred, 5) > 0.0, average="samples"
            ),
            "NDCG": lambda: ndcg_score(y_true_dense, y_pred_dense),
            "NDCG@5": lambda: ndcg_score(y_true_dense, y_pred_dense, limit=5),
            "NDCG@10": lambda: ndcg_score(y_true_dense, y_pred_dense, limit=10),
            "Precision@1": lambda: precision_at_k_score(
                y_true_dense, y_pred_dense, limit=1
            ),
            "Precision@3": lambda: precision_at_k_score(
                y_true_dense, y_pred_dense, limit=3
            ),
            "Precision@5": lambda: precision_at_k_score(
                y_true_dense, y_pred_dense, limit=5
            ),
            "LRAP": lambda: label_ranking_average_precision_score(y_true, y_pred_dense),
            "True positives": lambda: true_positives(y_true, y_pred_binary),
            "False positives": lambda: false_positives(y_true, y_pred_binary),
            "False negatives": lambda: false_negatives(y_true, y_pred_binary),
        }

        if not metrics:
            metrics = all_metrics.keys()

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            return {metric: all_metrics[metric]() for metric in metrics}

    def _result_per_subject_header(self, results_file):
        print(
            "\t".join(
                [
                    "URI",
                    "Label",
                    "Support",
                    "True_positives",
                    "False_positives",
                    "False_negatives",
                    "Precision",
                    "Recall",
                    "F1_score",
                ]
            ),
            file=results_file,
        )

    def _result_per_subject_body(self, zipped_results, results_file):
        for row in zipped_results:
            print("\t".join((str(e) for e in row)), file=results_file)
Issue introduced by this change: The variable e does not seem to be defined in case the for loop above is not entered. Are you sure this can never be the case?
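This warning looks like a false positive: e is bound only inside the generator expression passed to str.join(), so it is never referenced when zipped_results is empty. A minimal, self-contained check (the helper name print_rows is made up for illustration):

    import io

    def print_rows(zipped_results, results_file):
        for row in zipped_results:
            print("\t".join(str(e) for e in row), file=results_file)

    print_rows([], io.StringIO())          # empty input: loop body never runs, no NameError
    print_rows([("x", 1)], io.StringIO())  # e exists only within the join() expression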

    def output_result_per_subject(self, y_true, y_pred, results_file, language):
        """Write results per subject (non-aggregated)
        to outputfile results_file, using labels in the given language"""

        y_pred = y_pred.T > 0.0
        y_true = y_true.T

        true_pos = y_true.multiply(y_pred).sum(axis=1)
        false_pos = (y_true < y_pred).sum(axis=1)
        false_neg = (y_true > y_pred).sum(axis=1)
        precision = np.nan_to_num(true_pos / (true_pos + false_pos))
        recall = np.nan_to_num(true_pos / (true_pos + false_neg))
        f1_score = np.nan_to_num(2 * (precision * recall) / (precision + recall))

        zipped = zip(
            [subj.uri for subj in self._subject_index],  # URI
            [subj.labels[language] for subj in self._subject_index],  # Label
            y_true.sum(axis=1),  # Support
            true_pos,  # True positives
            false_pos,  # False positives
            false_neg,  # False negatives
            precision,  # Precision
            recall,  # Recall
            f1_score,  # F1 score
        )
        self._result_per_subject_header(results_file)
        self._result_per_subject_body(zipped, results_file)

    def results(self, metrics=[], results_file=None, language=None):
        """evaluate a set of selected subjects against a gold standard using
        different metrics. If metrics is empty, use all available metrics.
        If results_file (file object) given, write results per subject to it
        with labels expressed in the given language."""

        if not self._suggestion_arrays:
            raise NotSupportedException("cannot evaluate empty corpus")

        y_pred = scipy.sparse.csr_array(scipy.sparse.vstack(self._suggestion_arrays))
        y_true = scipy.sparse.csr_array(scipy.sparse.vstack(self._gold_subject_arrays))

        results = self._evaluate_samples(y_true, y_pred, metrics)
        results["Documents evaluated"] = int(y_true.shape[0])

        if results_file:
            self.output_result_per_subject(y_true, y_pred, results_file, language)
        return results
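For reference, the module-level ranking metrics defined in this file can be exercised on their own. A minimal sketch with made-up toy data (not taken from the PR's test suite):

    import numpy as np
    from annif.eval import ndcg_score, precision_at_k_score

    # two toy documents with four candidate subjects each (illustrative data only)
    y_true = np.array([[1, 0, 1, 0], [0, 1, 0, 0]], dtype=float)
    y_pred = np.array([[0.9, 0.8, 0.1, 0.0], [0.2, 0.7, 0.0, 0.0]], dtype=float)

    # 0.5: in each document, one of the top-2 predicted subjects is relevant
    print(precision_at_k_score(y_true, y_pred, limit=2))
    # per-document DCG normalized by the ideal DCG, then averaged
    print(ndcg_score(y_true, y_pred, limit=2))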