Passed
Pull Request — master (#663) by Juho — created 05:19

annif.eval.EvaluationBatch.evaluate_many() — rating: A

Complexity: Conditions 2
Size: Total Lines 3, Code Lines 3
Duplication: Lines 0, Ratio 0 %
Importance: Changes 0
Metric  Value
cc      2
eloc    3
nop     3
dl      0
loc     3
rs      10
c       0
b       0
f       0
"""Evaluation metrics for Annif"""

import statistics
import warnings

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics import (
    f1_score,
    label_ranking_average_precision_score,
    precision_score,
    recall_score,
)

from annif.exception import NotSupportedException


def filter_pred_top_k(preds, limit):
    """filter a 2D prediction vector, retaining only the top K suggestions
    for each individual prediction; the rest will be set to zeros"""

    masks = []
    for pred in preds:
        mask = np.zeros_like(pred, dtype=bool)
        top_k = np.argsort(pred)[::-1][:limit]
        mask[top_k] = True
        masks.append(mask)
    return preds * np.array(masks)
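A quick sketch of the effect (illustrative values, not part of the module): with limit=2, only the two highest-scoring suggestions per row keep their scores and everything else is zeroed out.

preds = np.array([[0.1, 0.7, 0.3, 0.5]])
filter_pred_top_k(preds, limit=2)   # -> [[0.0, 0.7, 0.0, 0.5]]; only indices 1 and 3 survive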

def true_positives(y_true, y_pred):
    """calculate the number of true positives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return int((y_true & y_pred).sum())


def false_positives(y_true, y_pred):
    """calculate the number of false positives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return int((~y_true & y_pred).sum())


def false_negatives(y_true, y_pred):
    """calculate the number of false negatives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return int((y_true & ~y_pred).sum())
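As a small sanity check (made-up boolean vectors), each helper counts one cell of the confusion matrix:

y_true = np.array([True, True, False, False])
y_pred = np.array([True, False, True, False])
true_positives(y_true, y_pred)    # -> 1
false_positives(y_true, y_pred)   # -> 1
false_negatives(y_true, y_pred)   # -> 1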

def precision_at_k_score(y_true, y_pred, limit):
    """calculate the precision at K, i.e. the proportion of relevant items
    among the top K predicted ones"""
    scores = []
    for true, pred in zip(y_true, y_pred):
        order = pred.argsort()[::-1]
        orderlimit = min(limit, np.count_nonzero(pred))
        order = order[:orderlimit]
        gain = true[order]
        if orderlimit > 0:
            scores.append(gain.sum() / orderlimit)
        else:
            scores.append(0.0)
    return statistics.mean(scores)
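A worked example with illustrative values: only nonzero scores are ranked, so with limit=3 the three nonzero predictions are checked against the gold standard.

y_true = np.array([[1, 0, 1, 0]])
y_pred = np.array([[0.9, 0.8, 0.0, 0.1]])
precision_at_k_score(y_true, y_pred, limit=3)   # -> 0.333...; of the ranked
# nonzero scores (indices 0, 1, 3), only index 0 is relevant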

def dcg_score(y_true, y_pred, limit=None):
    """return the discounted cumulative gain (DCG) score for the selected
    labels vs. relevant labels"""
    order = y_pred.argsort()[::-1]
    n_pred = np.count_nonzero(y_pred)
    if limit is not None:
        n_pred = min(limit, n_pred)
    order = order[:n_pred]
    gain = y_true[order]
    discount = np.log2(np.arange(order.size) + 2)

    return (gain / discount).sum()
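For a single made-up sample this matches the usual DCG formula, the sum over ranked positions i of rel_i / log2(i + 1), truncated to the nonzero predictions:

y_true = np.array([1, 0, 1])
y_pred = np.array([0.9, 0.8, 0.7])
dcg_score(y_true, y_pred)   # -> 1/log2(2) + 0/log2(3) + 1/log2(4) = 1.5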

def ndcg_score(y_true, y_pred, limit=None):
    """return the normalized discounted cumulative gain (nDCG) score for the
    selected labels vs. relevant labels"""
    scores = []
    for true, pred in zip(y_true, y_pred):
        idcg = dcg_score(true, true, limit)
        dcg = dcg_score(true, pred, limit)
        if idcg > 0:
            scores.append(dcg / idcg)
        else:
            scores.append(1.0)  # perfect score for no relevant hits case
    return statistics.mean(scores)
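Continuing the same made-up sample: the ideal ranking puts both relevant labels first, so the normalized score is below 1.

y_true = np.array([[1, 0, 1]])
y_pred = np.array([[0.9, 0.8, 0.7]])
ndcg_score(y_true, y_pred)   # -> 1.5 / (1/log2(2) + 1/log2(3)) ≈ 0.92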

class EvaluationBatch:
    """A class for evaluating batches of results using all available metrics.
    The evaluate() method is called once per document in the batch, or
    evaluate_many() once for a list of documents in the batch. Final results
    can be queried using the results() method."""

    def __init__(self, subject_index):
        self._subject_index = subject_index
        self._samples = []

    def evaluate(self, hits, gold_subjects):
        self._samples.append((hits, gold_subjects))

    def evaluate_many(self, hit_sets, gold_subject_sets):
        for hits, gold_subjects in zip(hit_sets, gold_subject_sets):
            self._samples.append((hits, gold_subjects))
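A minimal usage sketch; the variable names below are assumptions, since in Annif the subject index, suggestion results and gold-standard subject sets come from the project and corpus being evaluated:

batch = EvaluationBatch(subject_index)
batch.evaluate_many(suggestion_results, gold_subject_sets)   # one entry per document
scores = batch.results()                                     # all available metrics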

    def _evaluate_samples(self, y_true, y_pred, metrics=[]):
        y_pred_binary = y_pred > 0.0
        y_true_sparse = csr_matrix(y_true)

        # define the available metrics as lazy lambda functions
        # so we can execute only the ones actually requested
        all_metrics = {
            "Precision (doc avg)": lambda: precision_score(
                y_true_sparse, y_pred_binary, average="samples"
            ),
            "Recall (doc avg)": lambda: recall_score(
                y_true_sparse, y_pred_binary, average="samples"
            ),
            "F1 score (doc avg)": lambda: f1_score(
                y_true_sparse, y_pred_binary, average="samples"
            ),
            "Precision (subj avg)": lambda: precision_score(
                y_true_sparse, y_pred_binary, average="macro"
            ),
            "Recall (subj avg)": lambda: recall_score(
                y_true_sparse, y_pred_binary, average="macro"
            ),
            "F1 score (subj avg)": lambda: f1_score(
                y_true_sparse, y_pred_binary, average="macro"
            ),
            "Precision (weighted subj avg)": lambda: precision_score(
                y_true_sparse, y_pred_binary, average="weighted"
            ),
            "Recall (weighted subj avg)": lambda: recall_score(
                y_true_sparse, y_pred_binary, average="weighted"
            ),
            "F1 score (weighted subj avg)": lambda: f1_score(
                y_true_sparse, y_pred_binary, average="weighted"
            ),
            "Precision (microavg)": lambda: precision_score(
                y_true_sparse, y_pred_binary, average="micro"
            ),
            "Recall (microavg)": lambda: recall_score(
                y_true_sparse, y_pred_binary, average="micro"
            ),
            "F1 score (microavg)": lambda: f1_score(
                y_true_sparse, y_pred_binary, average="micro"
            ),
            "F1@5": lambda: f1_score(
                y_true_sparse, filter_pred_top_k(y_pred, 5) > 0.0, average="samples"
            ),
            "NDCG": lambda: ndcg_score(y_true, y_pred),
            "NDCG@5": lambda: ndcg_score(y_true, y_pred, limit=5),
            "NDCG@10": lambda: ndcg_score(y_true, y_pred, limit=10),
            "Precision@1": lambda: precision_at_k_score(y_true, y_pred, limit=1),
            "Precision@3": lambda: precision_at_k_score(y_true, y_pred, limit=3),
            "Precision@5": lambda: precision_at_k_score(y_true, y_pred, limit=5),
            "LRAP": lambda: label_ranking_average_precision_score(y_true, y_pred),
            "True positives": lambda: true_positives(y_true, y_pred_binary),
            "False positives": lambda: false_positives(y_true, y_pred_binary),
            "False negatives": lambda: false_negatives(y_true, y_pred_binary),
        }

        if not metrics:
            metrics = all_metrics.keys()

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            return {metric: all_metrics[metric]() for metric in metrics}
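Because the metrics are wrapped in lambdas, only the requested ones are actually computed. A sketch with plain NumPy inputs; subject_index is not used by this method, so None is passed here purely for illustration:

y_true = np.array([[True, False, True]])
y_pred = np.array([[0.9, 0.0, 0.2]], dtype=np.float32)
EvaluationBatch(subject_index=None)._evaluate_samples(
    y_true, y_pred, metrics=["Precision (doc avg)", "NDCG"]
)   # -> both requested metrics come out as 1.0 for this perfect prediction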

    def _result_per_subject_header(self, results_file):
        print(
            "\t".join(
                [
                    "URI",
                    "Label",
                    "Support",
                    "True_positives",
                    "False_positives",
                    "False_negatives",
                    "Precision",
                    "Recall",
                    "F1_score",
                ]
            ),
            file=results_file,
        )

    def _result_per_subject_body(self, zipped_results, results_file):
        for row in zipped_results:
            print("\t".join((str(e) for e in row)), file=results_file)
Issue: The variable e does not seem to be defined in case the for loop in _result_per_subject_body() is not entered. Are you sure this can never be the case?

    def output_result_per_subject(self, y_true, y_pred, results_file, language):
        """Write results per subject (non-aggregated)
        to output file results_file, using labels in the given language"""

        y_pred = y_pred.T > 0.0
        y_true = y_true.T > 0.0

        true_pos = y_true & y_pred
        false_pos = ~y_true & y_pred
        false_neg = y_true & ~y_pred

        r = len(y_true)

        zipped = zip(
            [subj.uri for subj in self._subject_index],  # URI
            [subj.labels[language] for subj in self._subject_index],  # Label
            np.sum((true_pos + false_neg), axis=1),  # Support
            np.sum(true_pos, axis=1),  # True_positives
            np.sum(false_pos, axis=1),  # False_positives
            np.sum(false_neg, axis=1),  # False_negatives
            [
                precision_score(y_true[i], y_pred[i], zero_division=0) for i in range(r)
            ],  # Precision
            [
                recall_score(y_true[i], y_pred[i], zero_division=0) for i in range(r)
            ],  # Recall
            [f1_score(y_true[i], y_pred[i], zero_division=0) for i in range(r)],  # F1
        )
        self._result_per_subject_header(results_file)
        self._result_per_subject_body(zipped, results_file)

    def results(self, metrics=[], results_file=None, language=None):
        """evaluate a set of selected subjects against a gold standard using
        different metrics. If metrics is empty, use all available metrics.
        If results_file (a file object) is given, write results per subject
        to it with labels expressed in the given language."""

        if not self._samples:
            raise NotSupportedException("cannot evaluate empty corpus")

        shape = (len(self._samples), len(self._subject_index))
        y_true = np.zeros(shape, dtype=bool)
        y_pred = np.zeros(shape, dtype=np.float32)

        for idx, (hits, gold_subjects) in enumerate(self._samples):
            gold_subjects.as_vector(destination=y_true[idx])
            hits.as_vector(len(self._subject_index), destination=y_pred[idx])

        results = self._evaluate_samples(y_true, y_pred, metrics)
        results["Documents evaluated"] = int(y_true.shape[0])

        if results_file:
            self.output_result_per_subject(y_true, y_pred, results_file, language)
        return results
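A hypothetical call that also writes the per-subject report; batch is an EvaluationBatch that has already received samples via evaluate() or evaluate_many(), and the file name and language code are placeholders:

with open("results.tsv", "w", encoding="utf-8") as results_file:
    scores = batch.results(
        metrics=["NDCG", "F1@5"], results_file=results_file, language="en"
    )
print(scores["Documents evaluated"])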