Passed
Pull Request — main (#681)
created by Osma, 02:45

annif.eval.precision_at_k_score()   A

Complexity
    Conditions: 3

Size
    Total Lines: 14
    Code Lines: 11

Duplication
    Lines: 0
    Ratio: 0 %

Importance
    Changes: 0
Metric    Value
eloc      11
dl        0
loc       14
rs        9.85
c         0
b         0
f         0
cc        3
nop       3
"""Evaluation metrics for Annif"""

import warnings

import numpy as np
import scipy.sparse
from sklearn.metrics import f1_score, precision_score, recall_score

from annif.exception import NotSupportedException
from annif.suggestion import SuggestionBatch, filter_suggestion

def true_positives(y_true, y_pred):
    """calculate the number of true positives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return int((y_true.multiply(y_pred)).sum())


def false_positives(y_true, y_pred):
    """calculate the number of false positives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return int((y_true < y_pred).sum())


def false_negatives(y_true, y_pred):
    """calculate the number of false negatives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return int((y_true > y_pred).sum())

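These three helpers count hits and misses with element-wise sparse operations instead of sklearn's own counting. A minimal, doctest-style sketch (illustrative only, not part of the module) for one document over five subjects:

>>> import numpy as np, scipy.sparse
>>> y_true = scipy.sparse.csr_array(np.array([[True, True, True, False, False]]))
>>> y_pred = scipy.sparse.csr_array(np.array([[True, True, False, True, False]]))
>>> int(y_true.multiply(y_pred).sum())   # true positives: subjects 0 and 1
2
>>> int((y_true < y_pred).sum())         # false positives: subject 3
1
>>> int((y_true > y_pred).sum())         # false negatives: subject 2
1
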
def dcg_score(y_true, y_pred, limit=None):
    """return the discounted cumulative gain (DCG) score for the selected
    labels vs. relevant labels"""

    n_pred = y_pred.count_nonzero()
    if limit is not None:
        n_pred = min(limit, n_pred)

    top_k = y_pred.data.argsort()[-n_pred:][::-1]
    order = y_pred.indices[top_k]
    gain = y_true[:, order]
    discount = np.log2(np.arange(1, n_pred + 1) + 1)
    return (gain / discount).sum()


def ndcg_score(y_true, y_pred, limit=None):
    """return the normalized discounted cumulative gain (nDCG) score for the
    selected labels vs. relevant labels"""

    scores = np.ones(y_true.shape[0], dtype=np.float32)
    for i in range(y_true.shape[0]):
        true = y_true.getrow(i)
        idcg = dcg_score(true, true, limit)
        if idcg > 0:
            pred = y_pred.getrow(i)
            dcg = dcg_score(true, pred, limit)
            scores[i] = dcg / idcg

    return float(scores.mean())

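As a small sanity check on the two functions above, a doctest-style sketch (illustrative only): subjects 0 and 2 are relevant, but only subject 0 is actually suggested, at rank 1. The DCG of the prediction is therefore 1.0, the ideal DCG is 1 + 1/log2(3) ≈ 1.63, and the nDCG comes out at about 0.61:

>>> import numpy as np, scipy.sparse
>>> y_true = scipy.sparse.csr_array(np.array([[True, False, True, False]]))
>>> y_pred = scipy.sparse.csr_array(np.array([[0.8, 0.6, 0.0, 0.4]]))
>>> round(ndcg_score(y_true, y_pred), 2)
0.61
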
class EvaluationBatch:
    """A class for evaluating batches of results using all available metrics.
    Results are fed in with the evaluate_many() method, once per batch of
    documents. Final results can be queried using the results() method."""

    def __init__(self, subject_index):
        self._subject_index = subject_index
        self._suggestion_arrays = []
        self._gold_subject_arrays = []

    def evaluate_many(self, suggestion_batch, gold_subject_batch):
        if not isinstance(suggestion_batch, SuggestionBatch):
            suggestion_batch = SuggestionBatch.from_sequence(
                suggestion_batch, self._subject_index
            )
        self._suggestion_arrays.append(suggestion_batch.array)

        # convert gold_subject_batch to sparse matrix
        ar = scipy.sparse.dok_array(
            (len(gold_subject_batch), len(self._subject_index)), dtype=bool
        )
        for idx, subject_set in enumerate(gold_subject_batch):
            for subject_id in subject_set:
                ar[idx, subject_id] = True
        self._gold_subject_arrays.append(ar.tocsr())

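The conversion loop above is easiest to see on a toy batch. A minimal sketch (illustrative only) for two documents and four subjects, where document 0 has gold subjects {0, 2} and document 1 has {1}:

>>> import scipy.sparse
>>> ar = scipy.sparse.dok_array((2, 4), dtype=bool)
>>> for idx, subject_set in enumerate([{0, 2}, {1}]):
...     for subject_id in subject_set:
...         ar[idx, subject_id] = True
...
>>> ar.tocsr().toarray().astype(int)
array([[1, 0, 1, 0],
       [0, 1, 0, 0]])
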
    def _evaluate_samples(self, y_true, y_pred, metrics=[]):
        y_pred_binary = y_pred > 0.0

        # define the available metrics as lazy lambda functions
        # so we can execute only the ones actually requested
        all_metrics = {
            "Precision (doc avg)": lambda: precision_score(
                y_true, y_pred_binary, average="samples"
            ),
            "Recall (doc avg)": lambda: recall_score(
                y_true, y_pred_binary, average="samples"
            ),
            "F1 score (doc avg)": lambda: f1_score(
                y_true, y_pred_binary, average="samples"
            ),
            "Precision (subj avg)": lambda: precision_score(
                y_true, y_pred_binary, average="macro"
            ),
            "Recall (subj avg)": lambda: recall_score(
                y_true, y_pred_binary, average="macro"
            ),
            "F1 score (subj avg)": lambda: f1_score(
                y_true, y_pred_binary, average="macro"
            ),
            "Precision (weighted subj avg)": lambda: precision_score(
                y_true, y_pred_binary, average="weighted"
            ),
            "Recall (weighted subj avg)": lambda: recall_score(
                y_true, y_pred_binary, average="weighted"
            ),
            "F1 score (weighted subj avg)": lambda: f1_score(
                y_true, y_pred_binary, average="weighted"
            ),
            "Precision (microavg)": lambda: precision_score(
                y_true, y_pred_binary, average="micro"
            ),
            "Recall (microavg)": lambda: recall_score(
                y_true, y_pred_binary, average="micro"
            ),
            "F1 score (microavg)": lambda: f1_score(
                y_true, y_pred_binary, average="micro"
            ),
            "F1@5": lambda: f1_score(
                y_true, filter_suggestion(y_pred, 5) > 0.0, average="samples"
            ),
            "NDCG": lambda: ndcg_score(y_true, y_pred),
            "NDCG@5": lambda: ndcg_score(y_true, y_pred, limit=5),
            "NDCG@10": lambda: ndcg_score(y_true, y_pred, limit=10),
            "Precision@1": lambda: precision_score(
                y_true, filter_suggestion(y_pred, 1) > 0.0, average="samples"
            ),
            "Precision@3": lambda: precision_score(
                y_true, filter_suggestion(y_pred, 3) > 0.0, average="samples"
            ),
            "Precision@5": lambda: precision_score(
                y_true, filter_suggestion(y_pred, 5) > 0.0, average="samples"
            ),
            "True positives": lambda: true_positives(y_true, y_pred_binary),
            "False positives": lambda: false_positives(y_true, y_pred_binary),
            "False negatives": lambda: false_negatives(y_true, y_pred_binary),
        }

        if not metrics:
            metrics = all_metrics.keys()

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            return {metric: all_metrics[metric]() for metric in metrics}

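The lazy-lambda pattern used for all_metrics means that only the metrics actually requested are ever computed. In miniature (illustrative only):

>>> lazy = {"square": lambda: 3 ** 2, "cube": lambda: 3 ** 3}
>>> {name: lazy[name]() for name in ["cube"]}   # only the "cube" lambda runs
{'cube': 27}
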
    def _result_per_subject_header(self, results_file):
        print(
            "\t".join(
                [
                    "URI",
                    "Label",
                    "Support",
                    "True_positives",
                    "False_positives",
                    "False_negatives",
                    "Precision",
                    "Recall",
                    "F1_score",
                ]
            ),
            file=results_file,
        )

    def _result_per_subject_body(self, zipped_results, results_file):
        for row in zipped_results:
            print("\t".join((str(e) for e in row)), file=results_file)
[Analysis note: "The variable e does not seem to be defined in case the for loop in _result_per_subject_body is not entered. Are you sure this can never be the case?"]

    def output_result_per_subject(self, y_true, y_pred, results_file, language):
        """Write results per subject (non-aggregated)
        to the output file results_file, using labels in the given language"""

        y_pred = y_pred.T > 0.0
        y_true = y_true.T

        true_pos = y_true.multiply(y_pred).sum(axis=1)
        false_pos = (y_true < y_pred).sum(axis=1)
        false_neg = (y_true > y_pred).sum(axis=1)
        precision = np.nan_to_num(true_pos / (true_pos + false_pos))
        recall = np.nan_to_num(true_pos / (true_pos + false_neg))
        f1_score = np.nan_to_num(2 * (precision * recall) / (precision + recall))

        zipped = zip(
            [subj.uri for subj in self._subject_index],  # URI
            [subj.labels[language] for subj in self._subject_index],  # Label
            y_true.sum(axis=1),  # Support
            true_pos,  # True positives
            false_pos,  # False positives
            false_neg,  # False negatives
            precision,  # Precision
            recall,  # Recall
            f1_score,  # F1 score
        )
        self._result_per_subject_header(results_file)
        self._result_per_subject_body(zipped, results_file)

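One detail worth noting in the method above: for subjects with no true or false positives, the precision ratio is 0/0, which numpy evaluates to NaN, and np.nan_to_num then reports it as 0.0. A minimal sketch (illustrative only; errstate is only used here to keep the 0/0 warning out of the transcript, the module itself does not need it):

>>> import numpy as np
>>> true_pos = np.array([2, 0])
>>> false_pos = np.array([1, 0])
>>> with np.errstate(invalid="ignore"):
...     precision = np.nan_to_num(true_pos / (true_pos + false_pos))
...
>>> precision
array([0.66666667, 0.        ])
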
    def results(self, metrics=[], results_file=None, language=None):
        """evaluate a set of selected subjects against a gold standard using
        different metrics. If metrics is empty, use all available metrics.
        If results_file (a file object) is given, write results per subject to it
        with labels expressed in the given language."""

        if not self._suggestion_arrays:
            raise NotSupportedException("cannot evaluate empty corpus")

        y_pred = scipy.sparse.csr_array(scipy.sparse.vstack(self._suggestion_arrays))
        y_true = scipy.sparse.csr_array(scipy.sparse.vstack(self._gold_subject_arrays))

        results = self._evaluate_samples(y_true, y_pred, metrics)
        results["Documents evaluated"] = int(y_true.shape[0])

        if results_file:
            self.output_result_per_subject(y_true, y_pred, results_file, language)
        return results
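Finally, a hedged sketch of how the class is meant to be driven end to end, following the docstrings above. The names subject_index, suggestions and gold_subjects are placeholders for objects normally supplied by an Annif project and its corpus; they are not defined in this file:

# Placeholders (not defined here): subject_index is the project's subject index,
# suggestions holds one suggestion result per document, and gold_subjects the
# corresponding gold-standard subject sets for the same documents.
batch = EvaluationBatch(subject_index)
batch.evaluate_many(suggestions, gold_subjects)
print(batch.results(metrics=["Precision (doc avg)", "NDCG"]))
# e.g. {'Precision (doc avg)': ..., 'NDCG': ..., 'Documents evaluated': ...}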