Passed
Push — issue678-refactor-suggestionre... ( 928cbc...1db8f5 )
created by Osma, 02:39

annif.eval.true_positives()   A

Complexity

Conditions 1

Size

Total Lines 4
Code Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 2
dl 0
loc 4
rs 10
c 0
b 0
f 0
cc 1
nop 2
"""Evaluation metrics for Annif"""

import statistics
import warnings

import numpy as np
import scipy.sparse
from sklearn.metrics import (
    f1_score,
    label_ranking_average_precision_score,
    precision_score,
    recall_score,
)

from annif.exception import NotSupportedException
from annif.suggestion import SuggestionBatch


def filter_pred_top_k(preds, limit):
    """filter a 2D prediction vector, retaining only the top K suggestions
    for each individual prediction; the rest will be set to zeros"""

    filtered = scipy.sparse.dok_array(preds.shape, dtype=np.float32)
    for row in range(preds.shape[0]):
        ar = preds.getrow(row).toarray()[0]
        top_k = np.argsort(ar)[::-1][:limit]
        for col in top_k:
            filtered[row, col] = preds[row, col]
    return filtered.tocsr()
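
Example (illustrative, not part of the module): filtering a small hand-built prediction matrix so that each row keeps only its top two scores.

import numpy as np
import scipy.sparse
from annif.eval import filter_pred_top_k

preds = scipy.sparse.csr_array(
    np.array([[0.1, 0.9, 0.3, 0.0],
              [0.7, 0.0, 0.2, 0.5]], dtype=np.float32)
)
print(filter_pred_top_k(preds, 2).toarray())
# each row keeps only its two highest scores, the rest become zero:
# [[0.  0.9 0.3 0. ]
#  [0.7 0.  0.  0.5]]
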
def true_positives(y_true, y_pred):
    """calculate the number of true positives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return int((y_true.multiply(y_pred)).sum())


def false_positives(y_true, y_pred):
    """calculate the number of false positives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return int((y_true < y_pred).sum())


def false_negatives(y_true, y_pred):
    """calculate the number of false negatives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return int((y_true > y_pred).sum())
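
Example (illustrative, not part of the module): the three counting helpers on a tiny boolean case with two documents and three subjects.

import numpy as np
import scipy.sparse
from annif.eval import false_negatives, false_positives, true_positives

y_true = scipy.sparse.csr_array(np.array([[True, False, True],
                                          [False, True, False]]))
y_pred = scipy.sparse.csr_array(np.array([[True, True, False],
                                          [False, True, False]]))
print(true_positives(y_true, y_pred))   # 2: cells (0, 0) and (1, 1) are predicted and relevant
print(false_positives(y_true, y_pred))  # 1: cell (0, 1) is predicted but not relevant
print(false_negatives(y_true, y_pred))  # 1: cell (0, 2) is relevant but not predicted
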
def dcg_score(y_true, y_pred, limit=None):
    """return the discounted cumulative gain (DCG) score for the selected
    labels vs. relevant labels"""
    order = y_pred.argsort()[::-1]
    n_pred = np.count_nonzero(y_pred)
    if limit is not None:
        n_pred = min(limit, n_pred)
    order = order[:n_pred]
    gain = y_true[order]
    discount = np.log2(np.arange(order.size) + 2)

    return (gain / discount).sum()


def ndcg_score(y_true, y_pred, limit=None):
    """return the normalized discounted cumulative gain (nDCG) score for the
    selected labels vs. relevant labels"""
    scores = []
    for true, pred in zip(y_true, y_pred):
        idcg = dcg_score(true, true, limit)
        dcg = dcg_score(true, pred, limit)
        if idcg > 0:
            scores.append(dcg / idcg)
        else:
            scores.append(1.0)  # perfect score for no relevant hits case
    return statistics.mean(scores)
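
Example (illustrative, not part of the module): a worked nDCG calculation for a single document with four candidate labels.

import numpy as np
from annif.eval import dcg_score, ndcg_score

y_true = np.array([1, 0, 1, 0])          # relevant labels
y_pred = np.array([0.8, 0.7, 0.1, 0.0])  # predicted scores (three are nonzero)
# predictions are ranked as labels 0, 1, 2, so
# DCG  = 1/log2(2) + 0/log2(3) + 1/log2(4) = 1.5
# IDCG = 1/log2(2) + 1/log2(3) ≈ 1.63 (the two relevant labels ranked first)
print(dcg_score(y_true, y_pred))                           # 1.5
print(ndcg_score(np.array([y_true]), np.array([y_pred])))  # ≈ 0.92
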
class EvaluationBatch:
    """A class for evaluating batches of results using all available metrics.
    The evaluate_many() method is called once per batch of documents. Final
    results can be queried using the results() method."""

    def __init__(self, subject_index):
        self._subject_index = subject_index
        self._suggestion_arrays = []
        self._gold_subject_arrays = []

    def evaluate_many(self, suggestion_batch, gold_subject_batch):
        if not isinstance(suggestion_batch, SuggestionBatch):
            suggestion_batch = SuggestionBatch(
                suggestion_batch, len(self._subject_index)
            )
        self._suggestion_arrays.append(suggestion_batch.array)

        # convert gold_subject_batch to sparse matrix
        ar = scipy.sparse.dok_array(
            (len(gold_subject_batch), len(self._subject_index)), dtype=bool
        )
        for idx, subject_set in enumerate(gold_subject_batch):
            for subject_id in subject_set:
                ar[idx, subject_id] = True
        self._gold_subject_arrays.append(ar.tocsr())
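
Illustration (not part of the module): evaluate_many() expects gold_subject_batch to be a sequence of subject-ID collections, one per document. Rebuilding the conversion with hypothetical stand-in values shows the resulting boolean matrix.

import scipy.sparse

gold_subject_batch = [{0, 2}, {1}]  # hypothetical subject IDs, one set per document
vocab_size = 4                      # stand-in for len(self._subject_index)
ar = scipy.sparse.dok_array((len(gold_subject_batch), vocab_size), dtype=bool)
for idx, subject_set in enumerate(gold_subject_batch):
    for subject_id in subject_set:
        ar[idx, subject_id] = True
print(ar.tocsr().toarray())
# [[ True False  True False]
#  [False  True False False]]
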
    def _evaluate_samples(self, y_true, y_pred, metrics=[]):
        y_pred_binary = y_pred > 0.0
        # dense versions of sparse arrays, for functions that need them
        # FIXME: conversion to dense arrays should be avoided
        y_pred_dense = y_pred.toarray()
        y_true_dense = y_true.toarray()

        # define the available metrics as lazy lambda functions
        # so we can execute only the ones actually requested
        all_metrics = {
            "Precision (doc avg)": lambda: precision_score(
                y_true, y_pred_binary, average="samples"
            ),
            "Recall (doc avg)": lambda: recall_score(
                y_true, y_pred_binary, average="samples"
            ),
            "F1 score (doc avg)": lambda: f1_score(
                y_true, y_pred_binary, average="samples"
            ),
            "Precision (subj avg)": lambda: precision_score(
                y_true, y_pred_binary, average="macro"
            ),
            "Recall (subj avg)": lambda: recall_score(
                y_true, y_pred_binary, average="macro"
            ),
            "F1 score (subj avg)": lambda: f1_score(
                y_true, y_pred_binary, average="macro"
            ),
            "Precision (weighted subj avg)": lambda: precision_score(
                y_true, y_pred_binary, average="weighted"
            ),
            "Recall (weighted subj avg)": lambda: recall_score(
                y_true, y_pred_binary, average="weighted"
            ),
            "F1 score (weighted subj avg)": lambda: f1_score(
                y_true, y_pred_binary, average="weighted"
            ),
            "Precision (microavg)": lambda: precision_score(
                y_true, y_pred_binary, average="micro"
            ),
            "Recall (microavg)": lambda: recall_score(
                y_true, y_pred_binary, average="micro"
            ),
            "F1 score (microavg)": lambda: f1_score(
                y_true, y_pred_binary, average="micro"
            ),
            "F1@5": lambda: f1_score(
                y_true, filter_pred_top_k(y_pred, 5) > 0.0, average="samples"
            ),
            "NDCG": lambda: ndcg_score(y_true_dense, y_pred_dense),
            "NDCG@5": lambda: ndcg_score(y_true_dense, y_pred_dense, limit=5),
            "NDCG@10": lambda: ndcg_score(y_true_dense, y_pred_dense, limit=10),
            "Precision@1": lambda: precision_score(
                y_true, filter_pred_top_k(y_pred, 1) > 0.0, average="samples"
            ),
            "Precision@3": lambda: precision_score(
                y_true, filter_pred_top_k(y_pred, 3) > 0.0, average="samples"
            ),
            "Precision@5": lambda: precision_score(
                y_true, filter_pred_top_k(y_pred, 5) > 0.0, average="samples"
            ),
            "LRAP": lambda: label_ranking_average_precision_score(y_true, y_pred_dense),
            "True positives": lambda: true_positives(y_true, y_pred_binary),
            "False positives": lambda: false_positives(y_true, y_pred_binary),
            "False negatives": lambda: false_negatives(y_true, y_pred_binary),
        }

        if not metrics:
            metrics = all_metrics.keys()

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            return {metric: all_metrics[metric]() for metric in metrics}

    def _result_per_subject_header(self, results_file):
        print(
            "\t".join(
                [
                    "URI",
                    "Label",
                    "Support",
                    "True_positives",
                    "False_positives",
                    "False_negatives",
                    "Precision",
                    "Recall",
                    "F1_score",
                ]
            ),
            file=results_file,
        )

    def _result_per_subject_body(self, zipped_results, results_file):
        for row in zipped_results:
            print("\t".join((str(e) for e in row)), file=results_file)

Issue: The variable e does not seem to be defined in case the for loop above is not entered. Are you sure this can never be the case?

    def output_result_per_subject(self, y_true, y_pred, results_file, language):
        """Write results per subject (non-aggregated)
        to outputfile results_file, using labels in the given language"""

        y_pred = y_pred.T > 0.0
        y_true = y_true.T

        true_pos = y_true.multiply(y_pred).sum(axis=1)
        false_pos = (y_true < y_pred).sum(axis=1)
        false_neg = (y_true > y_pred).sum(axis=1)
        precision = np.nan_to_num(true_pos / (true_pos + false_pos))
        recall = np.nan_to_num(true_pos / (true_pos + false_neg))
        f1_score = np.nan_to_num(2 * (precision * recall) / (precision + recall))

        zipped = zip(
            [subj.uri for subj in self._subject_index],  # URI
            [subj.labels[language] for subj in self._subject_index],  # Label
            y_true.sum(axis=1),  # Support
            true_pos,  # True positives
            false_pos,  # False positives
            false_neg,  # False negatives
            precision,  # Precision
            recall,  # Recall
            f1_score,  # F1 score
        )
        self._result_per_subject_header(results_file)
        self._result_per_subject_body(zipped, results_file)
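
Illustration (not part of the module): the np.nan_to_num calls above turn the NaN produced by 0/0 (a subject that was never predicted, or has no relevant documents) into a score of 0. A hand-built sketch with hypothetical per-subject counts:

import numpy as np

true_pos = np.array([2, 0])   # hypothetical counts for two subjects
false_pos = np.array([1, 0])
false_neg = np.array([0, 3])
# numpy warns about the 0/0 division but still yields NaN, which nan_to_num maps to 0
precision = np.nan_to_num(true_pos / (true_pos + false_pos))          # ≈ [0.667, 0.0]
recall = np.nan_to_num(true_pos / (true_pos + false_neg))             # [1.0, 0.0]
f1 = np.nan_to_num(2 * (precision * recall) / (precision + recall))   # [0.8, 0.0]
print(precision, recall, f1)
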
    def results(self, metrics=[], results_file=None, language=None):
        """evaluate a set of selected subjects against a gold standard using
        different metrics. If metrics is empty, use all available metrics.
        If results_file (file object) given, write results per subject to it
        with labels expressed in the given language."""

        if not self._suggestion_arrays:
            raise NotSupportedException("cannot evaluate empty corpus")

        y_pred = scipy.sparse.csr_array(scipy.sparse.vstack(self._suggestion_arrays))
        y_true = scipy.sparse.csr_array(scipy.sparse.vstack(self._gold_subject_arrays))

        results = self._evaluate_samples(y_true, y_pred, metrics)
        results["Documents evaluated"] = int(y_true.shape[0])

        if results_file:
            self.output_result_per_subject(y_true, y_pred, results_file, language)
        return results