Passed
Pull Request — main (#681) by Osma, created 03:08

annif.eval.precision_at_k_score()   A

Complexity:   Conditions 3
Size:         Total Lines 14, Code Lines 11
Duplication:  Lines 0, Ratio 0 %
Importance:   Changes 0

Metric  Value
eloc    11
dl      0
loc     14
rs      9.85
c       0
b       0
f       0
cc      3
nop     3

"""Evaluation metrics for Annif"""

import warnings

import numpy as np
import scipy.sparse
from sklearn.metrics import f1_score, precision_score, recall_score

from annif.exception import NotSupportedException
from annif.suggestion import SuggestionBatch
from annif.util import filter_suggestion


def true_positives(y_true, y_pred):
    """calculate the number of true positives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return int((y_true.multiply(y_pred)).sum())


def false_positives(y_true, y_pred):
    """calculate the number of false positives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return int((y_true < y_pred).sum())


def false_negatives(y_true, y_pred):
    """calculate the number of false negatives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return int((y_true > y_pred).sum())
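
# Note on the counting helpers above (explanatory comment, not upstream code):
# for boolean sparse arrays, multiply() acts as an elementwise AND, so it
# counts subjects that are both relevant and suggested; "y_true < y_pred" is
# True exactly where a subject was suggested but not relevant (a false
# positive), and "y_true > y_pred" where it was relevant but not suggested
# (a false negative). For example, with a single document, y_true = [1, 0, 1, 0]
# and y_pred = [1, 1, 0, 0] give 1 true positive, 1 false positive and
# 1 false negative.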


def dcg_score(y_true, y_pred, limit=None):
    """return the discounted cumulative gain (DCG) score for the selected
    labels vs. relevant labels"""

    n_pred = y_pred.count_nonzero()
    if limit is not None:
        n_pred = min(limit, n_pred)

    top_k = y_pred.data.argsort()[-n_pred:][::-1]
    order = y_pred.indices[top_k]
    gain = y_true[:, order]
    discount = np.log2(np.arange(1, n_pred + 1) + 1)
    return (gain / discount).sum()
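
# Explanatory comment (not upstream code): the computation above evaluates
# DCG = sum over ranks i of rel_i / log2(i + 1), where rel_i is 1 if the
# i-th highest-scored suggestion is a relevant subject and 0 otherwise.
# For instance, if the top three suggestions are relevant, not relevant and
# relevant, DCG = 1/log2(2) + 0/log2(3) + 1/log2(4) = 1.5.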


def ndcg_score(y_true, y_pred, limit=None):
    """return the normalized discounted cumulative gain (nDCG) score for the
    selected labels vs. relevant labels"""

    scores = np.ones(y_true.shape[0], dtype=np.float32)
    for i in range(y_true.shape[0]):
        true = y_true.getrow(i)
        idcg = dcg_score(true, true, limit)
        if idcg > 0:
            pred = y_pred.getrow(i)
            dcg = dcg_score(true, pred, limit)
            scores[i] = dcg / idcg

    return float(scores.mean())
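
# Explanatory comment (not upstream code): each document's DCG is divided by
# its ideal DCG (the score of a perfect ranking of its gold subjects), and the
# per-document ratios are averaged. Documents with no gold subjects have an
# ideal DCG of zero and keep the initial score of 1.0.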


class EvaluationBatch:
    """A class for evaluating batches of results using all available metrics.
    Suggestions and gold standard subjects are accumulated by calling
    evaluate_many() once per batch of documents. Final results can be queried
    using the results() method."""

    def __init__(self, subject_index):
        self._subject_index = subject_index
        self._suggestion_arrays = []
        self._gold_subject_arrays = []

    def evaluate_many(self, suggestion_batch, gold_subject_batch):
        if not isinstance(suggestion_batch, SuggestionBatch):
            suggestion_batch = SuggestionBatch(
                suggestion_batch, len(self._subject_index)
            )
        self._suggestion_arrays.append(suggestion_batch.array)

        # convert gold_subject_batch to sparse matrix
        ar = scipy.sparse.dok_array(
            (len(gold_subject_batch), len(self._subject_index)), dtype=bool
        )
        for idx, subject_set in enumerate(gold_subject_batch):
            for subject_id in subject_set:
                ar[idx, subject_id] = True
        self._gold_subject_arrays.append(ar.tocsr())
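
    # Explanatory comment (not upstream code): evaluate_many() accepts either a
    # ready-made SuggestionBatch or raw suggestions that are wrapped into one,
    # plus an iterable of gold subject sets, each yielding the subject ids
    # assigned to the corresponding document. Both sides are stored as sparse
    # document-by-subject arrays so the whole corpus can be scored at once in
    # results().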

    def _evaluate_samples(self, y_true, y_pred, metrics=[]):
        y_pred_binary = y_pred > 0.0

        # define the available metrics as lazy lambda functions
        # so we can execute only the ones actually requested
        all_metrics = {
            "Precision (doc avg)": lambda: precision_score(
                y_true, y_pred_binary, average="samples"
            ),
            "Recall (doc avg)": lambda: recall_score(
                y_true, y_pred_binary, average="samples"
            ),
            "F1 score (doc avg)": lambda: f1_score(
                y_true, y_pred_binary, average="samples"
            ),
            "Precision (subj avg)": lambda: precision_score(
                y_true, y_pred_binary, average="macro"
            ),
            "Recall (subj avg)": lambda: recall_score(
                y_true, y_pred_binary, average="macro"
            ),
            "F1 score (subj avg)": lambda: f1_score(
                y_true, y_pred_binary, average="macro"
            ),
            "Precision (weighted subj avg)": lambda: precision_score(
                y_true, y_pred_binary, average="weighted"
            ),
            "Recall (weighted subj avg)": lambda: recall_score(
                y_true, y_pred_binary, average="weighted"
            ),
            "F1 score (weighted subj avg)": lambda: f1_score(
                y_true, y_pred_binary, average="weighted"
            ),
            "Precision (microavg)": lambda: precision_score(
                y_true, y_pred_binary, average="micro"
            ),
            "Recall (microavg)": lambda: recall_score(
                y_true, y_pred_binary, average="micro"
            ),
            "F1 score (microavg)": lambda: f1_score(
                y_true, y_pred_binary, average="micro"
            ),
            "F1@5": lambda: f1_score(
                y_true, filter_suggestion(y_pred, 5) > 0.0, average="samples"
            ),
            "NDCG": lambda: ndcg_score(y_true, y_pred),
            "NDCG@5": lambda: ndcg_score(y_true, y_pred, limit=5),
            "NDCG@10": lambda: ndcg_score(y_true, y_pred, limit=10),
            "Precision@1": lambda: precision_score(
                y_true, filter_suggestion(y_pred, 1) > 0.0, average="samples"
            ),
            "Precision@3": lambda: precision_score(
                y_true, filter_suggestion(y_pred, 3) > 0.0, average="samples"
            ),
            "Precision@5": lambda: precision_score(
                y_true, filter_suggestion(y_pred, 5) > 0.0, average="samples"
            ),
            "True positives": lambda: true_positives(y_true, y_pred_binary),
            "False positives": lambda: false_positives(y_true, y_pred_binary),
            "False negatives": lambda: false_negatives(y_true, y_pred_binary),
        }

        if not metrics:
            metrics = all_metrics.keys()

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            return {metric: all_metrics[metric]() for metric in metrics}
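
    # Explanatory comment (not upstream code): sklearn's "average" argument
    # controls how the label-wise scores are aggregated: "samples" averages
    # per-document scores (doc avg), "macro" takes an unweighted mean over
    # subjects (subj avg), "weighted" weights each subject by its support, and
    # "micro" computes a single score from the global true/false positive and
    # negative counts. Warnings are silenced here, most likely to hide sklearn's
    # undefined-metric warnings for subjects with no positive examples.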

    def _result_per_subject_header(self, results_file):
        print(
            "\t".join(
                [
                    "URI",
                    "Label",
                    "Support",
                    "True_positives",
                    "False_positives",
                    "False_negatives",
                    "Precision",
                    "Recall",
                    "F1_score",
                ]
            ),
            file=results_file,
        )

    def _result_per_subject_body(self, zipped_results, results_file):
        for row in zipped_results:
            print("\t".join((str(e) for e in row)), file=results_file)

    def output_result_per_subject(self, y_true, y_pred, results_file, language):
        """Write results per subject (non-aggregated)
        to the output file results_file, using labels in the given language"""

        y_pred = y_pred.T > 0.0
        y_true = y_true.T

        true_pos = y_true.multiply(y_pred).sum(axis=1)
        false_pos = (y_true < y_pred).sum(axis=1)
        false_neg = (y_true > y_pred).sum(axis=1)
        precision = np.nan_to_num(true_pos / (true_pos + false_pos))
        recall = np.nan_to_num(true_pos / (true_pos + false_neg))
        f1_score = np.nan_to_num(2 * (precision * recall) / (precision + recall))

        zipped = zip(
            [subj.uri for subj in self._subject_index],  # URI
            [subj.labels[language] for subj in self._subject_index],  # Label
            y_true.sum(axis=1),  # Support
            true_pos,  # True positives
            false_pos,  # False positives
            false_neg,  # False negatives
            precision,  # Precision
            recall,  # Recall
            f1_score,  # F1 score
        )
        self._result_per_subject_header(results_file)
        self._result_per_subject_body(zipped, results_file)
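
    # Explanatory comment (not upstream code): transposing turns the matrices
    # into subject-by-document form, so each row of the per-subject report is
    # computed from that subject's own counts; np.nan_to_num() maps the 0/0
    # divisions of subjects with no predictions or no relevant documents to 0
    # instead of NaN.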

    def results(self, metrics=[], results_file=None, language=None):
        """evaluate a set of selected subjects against a gold standard using
        different metrics. If metrics is empty, use all available metrics.
        If results_file (a file object) is given, write results per subject to
        it with labels expressed in the given language."""

        if not self._suggestion_arrays:
            raise NotSupportedException("cannot evaluate empty corpus")

        y_pred = scipy.sparse.csr_array(scipy.sparse.vstack(self._suggestion_arrays))
        y_true = scipy.sparse.csr_array(scipy.sparse.vstack(self._gold_subject_arrays))

        results = self._evaluate_samples(y_true, y_pred, metrics)
        results["Documents evaluated"] = int(y_true.shape[0])

        if results_file:
            self.output_result_per_subject(y_true, y_pred, results_file, language)
        return results
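
Below is a minimal usage sketch of the module-level metric helpers, not part of the pull request: a single document with four candidate subjects, where subjects 0 and 2 are relevant and the model suggests subjects 0, 1 and 2. The array shapes and scores are illustrative assumptions; the expected outputs follow from the definitions above.

import numpy as np
import scipy.sparse

from annif.eval import (
    dcg_score,
    false_negatives,
    false_positives,
    ndcg_score,
    true_positives,
)

# one document, four candidate subjects; subjects 0 and 2 are relevant
y_true = scipy.sparse.csr_array(np.array([[True, False, True, False]]))
# suggestion scores: subjects 0, 1 and 2 are suggested, subject 3 is not
y_pred = scipy.sparse.csr_array(np.array([[0.8, 0.6, 0.4, 0.0]]))
y_pred_binary = y_pred > 0.0

print(true_positives(y_true, y_pred_binary))   # 2
print(false_positives(y_true, y_pred_binary))  # 1 (subject 1 suggested but not relevant)
print(false_negatives(y_true, y_pred_binary))  # 0
print(round(dcg_score(y_true, y_pred), 2))     # 1.5  = 1/log2(2) + 0/log2(3) + 1/log2(4)
print(round(ndcg_score(y_true, y_pred), 2))    # 0.92 ≈ 1.5 / (1/log2(2) + 1/log2(3))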