Passed — Pull Request on main (#681) by Osma
Duration 05:13 (queued 02:38)

annif.eval.false_negatives()   A

Complexity     Conditions: 1
Size           Total lines: 4, code lines: 2
Duplication    Lines: 0 (0 %)
Importance     Changes: 0

Metric  Value
eloc    2
dl      0
loc     4
rs      10
c       0
b       0
f       0
cc      1
nop     2
"""Evaluation metrics for Annif"""

import warnings

import numpy as np
import scipy.sparse
from sklearn.metrics import f1_score, precision_score, recall_score

from annif.exception import NotSupportedException
from annif.suggestion import SuggestionBatch


def filter_pred_top_k(preds, limit):
    """filter a 2D prediction vector, retaining only the top K suggestions
    for each individual prediction; the rest will be set to zeros"""

    filtered = scipy.sparse.dok_array(preds.shape, dtype=np.float32)
    for row in range(preds.shape[0]):
        ar = preds.getrow(row).toarray()[0]
        top_k = np.argsort(ar)[::-1][:limit]
        for col in top_k:
            filtered[row, col] = preds[row, col]
    return filtered.tocsr()
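
# Illustrative sketch, not part of the module: assuming preds is a CSR array
# of suggestion scores (as built elsewhere in this file), a limit of 2 keeps
# only the two highest-scoring columns on each row:
#
#     preds = scipy.sparse.csr_array(
#         np.array([[0.1, 0.9, 0.5], [0.7, 0.2, 0.4]], dtype=np.float32)
#     )
#     top2 = filter_pred_top_k(preds, 2)
#     # row 0 retains columns 1 and 2; row 1 retains columns 0 and 2
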
def true_positives(y_true, y_pred):
    """calculate the number of true positives using element-wise sparse matrix
    operations, emulating the way sklearn evaluation metric functions work"""
    return int((y_true.multiply(y_pred)).sum())


def false_positives(y_true, y_pred):
    """calculate the number of false positives using element-wise sparse matrix
    operations, emulating the way sklearn evaluation metric functions work"""
    return int((y_true < y_pred).sum())


def false_negatives(y_true, y_pred):
    """calculate the number of false negatives using element-wise sparse matrix
    operations, emulating the way sklearn evaluation metric functions work"""
    return int((y_true > y_pred).sum())
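
# Illustrative sketch, not part of the module: for boolean rows
# y_true = [[True, True, False, False]] and y_pred = [[True, False, True, False]],
# the element-wise operations above give
#     true_positives  -> 1   (both True in column 0)
#     false_positives -> 1   (y_true < y_pred only in column 2)
#     false_negatives -> 1   (y_true > y_pred only in column 1)
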
def dcg_score(y_true, y_pred, limit=None):
    """return the discounted cumulative gain (DCG) score for the selected
    labels vs. relevant labels"""

    n_pred = y_pred.count_nonzero()
    if limit is not None:
        n_pred = min(limit, n_pred)

    top_k = y_pred.data.argsort()[-n_pred:][::-1]
    order = y_pred.indices[top_k]
    gain = y_true[:, order]
    discount = np.log2(np.arange(1, n_pred + 1) + 1)
    return (gain / discount).sum()
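
# Worked example, not part of the module: with the predicted labels ranked by
# score, the gain at rank i is 1 if that label is relevant and 0 otherwise,
# and the discount is log2(i + 1). If the three highest-ranked predictions
# are relevant, irrelevant and relevant, then
#     DCG = 1/log2(2) + 0/log2(3) + 1/log2(4) = 1.0 + 0.0 + 0.5 = 1.5
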
def ndcg_score(y_true, y_pred, limit=None):
    """return the normalized discounted cumulative gain (nDCG) score for the
    selected labels vs. relevant labels"""

    scores = np.ones(y_true.shape[0], dtype=np.float32)
    for i in range(y_true.shape[0]):
        true = y_true.getrow(i)
        idcg = dcg_score(true, true, limit)
        if idcg > 0:
            pred = y_pred.getrow(i)
            dcg = dcg_score(true, pred, limit)
            scores[i] = dcg / idcg

    return float(scores.mean())
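
# Worked example, not part of the module: continuing the DCG example above, a
# document with two gold labels has an ideal DCG of
#     IDCG = 1/log2(2) + 1/log2(3) ~= 1.63
# so its nDCG is 1.5 / 1.63 ~= 0.92. Documents with no gold labels keep their
# initial score of 1.0, since the idcg > 0 branch is never entered.
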
class EvaluationBatch:
    """A class for evaluating batches of results using all available metrics.
    The evaluate_many() method is called with the suggestions and gold subjects
    of a batch of documents and may be called multiple times. Final results can
    be queried using the results() method."""

    def __init__(self, subject_index):
        self._subject_index = subject_index
        self._suggestion_arrays = []
        self._gold_subject_arrays = []

    def evaluate_many(self, suggestion_batch, gold_subject_batch):
        if not isinstance(suggestion_batch, SuggestionBatch):
            suggestion_batch = SuggestionBatch(
                suggestion_batch, len(self._subject_index)
            )
        self._suggestion_arrays.append(suggestion_batch.array)

        # convert gold_subject_batch to sparse matrix
        ar = scipy.sparse.dok_array(
            (len(gold_subject_batch), len(self._subject_index)), dtype=bool
        )
        for idx, subject_set in enumerate(gold_subject_batch):
            for subject_id in subject_set:
                ar[idx, subject_id] = True
        self._gold_subject_arrays.append(ar.tocsr())
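
    # Illustrative sketch, not part of the module: with a subject index of
    # size 4, a gold batch such as [[0, 2], [3]] becomes the boolean matrix
    #     [[True,  False, True,  False],
    #      [False, False, False, True]]
    # which is stored in CSR form alongside the suggestion score array of the
    # same batch.
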
    def _evaluate_samples(self, y_true, y_pred, metrics=[]):
        y_pred_binary = y_pred > 0.0

        # define the available metrics as lazy lambda functions
        # so we can execute only the ones actually requested
        all_metrics = {
            "Precision (doc avg)": lambda: precision_score(
                y_true, y_pred_binary, average="samples"
            ),
            "Recall (doc avg)": lambda: recall_score(
                y_true, y_pred_binary, average="samples"
            ),
            "F1 score (doc avg)": lambda: f1_score(
                y_true, y_pred_binary, average="samples"
            ),
            "Precision (subj avg)": lambda: precision_score(
                y_true, y_pred_binary, average="macro"
            ),
            "Recall (subj avg)": lambda: recall_score(
                y_true, y_pred_binary, average="macro"
            ),
            "F1 score (subj avg)": lambda: f1_score(
                y_true, y_pred_binary, average="macro"
            ),
            "Precision (weighted subj avg)": lambda: precision_score(
                y_true, y_pred_binary, average="weighted"
            ),
            "Recall (weighted subj avg)": lambda: recall_score(
                y_true, y_pred_binary, average="weighted"
            ),
            "F1 score (weighted subj avg)": lambda: f1_score(
                y_true, y_pred_binary, average="weighted"
            ),
            "Precision (microavg)": lambda: precision_score(
                y_true, y_pred_binary, average="micro"
            ),
            "Recall (microavg)": lambda: recall_score(
                y_true, y_pred_binary, average="micro"
            ),
            "F1 score (microavg)": lambda: f1_score(
                y_true, y_pred_binary, average="micro"
            ),
            "F1@5": lambda: f1_score(
                y_true, filter_pred_top_k(y_pred, 5) > 0.0, average="samples"
            ),
            "NDCG": lambda: ndcg_score(y_true, y_pred),
            "NDCG@5": lambda: ndcg_score(y_true, y_pred, limit=5),
            "NDCG@10": lambda: ndcg_score(y_true, y_pred, limit=10),
            "Precision@1": lambda: precision_score(
                y_true, filter_pred_top_k(y_pred, 1) > 0.0, average="samples"
            ),
            "Precision@3": lambda: precision_score(
                y_true, filter_pred_top_k(y_pred, 3) > 0.0, average="samples"
            ),
            "Precision@5": lambda: precision_score(
                y_true, filter_pred_top_k(y_pred, 5) > 0.0, average="samples"
            ),
            "True positives": lambda: true_positives(y_true, y_pred_binary),
            "False positives": lambda: false_positives(y_true, y_pred_binary),
            "False negatives": lambda: false_negatives(y_true, y_pred_binary),
        }

        if not metrics:
            metrics = all_metrics.keys()

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            return {metric: all_metrics[metric]() for metric in metrics}
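
    # Illustrative sketch, not part of the module: requesting only selected
    # metric names evaluates just those lazy lambdas, e.g.
    #     self._evaluate_samples(y_true, y_pred, ["NDCG@5", "F1 score (doc avg)"])
    # returns a dict with exactly those two keys; an empty metrics argument
    # computes every entry of all_metrics.
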
    def _result_per_subject_header(self, results_file):
        print(
            "\t".join(
                [
                    "URI",
                    "Label",
                    "Support",
                    "True_positives",
                    "False_positives",
                    "False_negatives",
                    "Precision",
                    "Recall",
                    "F1_score",
                ]
            ),
            file=results_file,
        )

    def _result_per_subject_body(self, zipped_results, results_file):
        for row in zipped_results:
            print("\t".join((str(e) for e in row)), file=results_file)

    def output_result_per_subject(self, y_true, y_pred, results_file, language):
        """Write results per subject (non-aggregated)
        to the output file results_file, using labels in the given language"""

        y_pred = y_pred.T > 0.0
        y_true = y_true.T

        true_pos = y_true.multiply(y_pred).sum(axis=1)
        false_pos = (y_true < y_pred).sum(axis=1)
        false_neg = (y_true > y_pred).sum(axis=1)
        precision = np.nan_to_num(true_pos / (true_pos + false_pos))
        recall = np.nan_to_num(true_pos / (true_pos + false_neg))
        f1_score = np.nan_to_num(2 * (precision * recall) / (precision + recall))

        zipped = zip(
            [subj.uri for subj in self._subject_index],  # URI
            [subj.labels[language] for subj in self._subject_index],  # Label
            y_true.sum(axis=1),  # Support
            true_pos,  # True positives
            false_pos,  # False positives
            false_neg,  # False negatives
            precision,  # Precision
            recall,  # Recall
            f1_score,  # F1 score
        )
        self._result_per_subject_header(results_file)
        self._result_per_subject_body(zipped, results_file)

    def results(self, metrics=[], results_file=None, language=None):
        """evaluate a set of selected subjects against a gold standard using
        different metrics. If metrics is empty, use all available metrics.
        If results_file (a file object) is given, write results per subject to
        it with labels expressed in the given language."""

        if not self._suggestion_arrays:
            raise NotSupportedException("cannot evaluate empty corpus")

        y_pred = scipy.sparse.csr_array(scipy.sparse.vstack(self._suggestion_arrays))
        y_true = scipy.sparse.csr_array(scipy.sparse.vstack(self._gold_subject_arrays))

        results = self._evaluate_samples(y_true, y_pred, metrics)
        results["Documents evaluated"] = int(y_true.shape[0])

        if results_file:
            self.output_result_per_subject(y_true, y_pred, results_file, language)
        return results
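
# Illustrative end-to-end sketch, not part of the module (variable names are
# hypothetical): suggestions may be a SuggestionBatch or raw suggestion
# results, which evaluate_many() wraps; gold_batch holds one collection of
# gold subject IDs per document.
#
#     batch = EvaluationBatch(subject_index)
#     batch.evaluate_many(suggestions, gold_batch)
#     scores = batch.results(metrics=["Precision (doc avg)", "NDCG"])
#     # scores also includes a "Documents evaluated" count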