Passed: Pull Request on main (#681) by Osma (10:12, queued 07:10)

annif.eval.precision_at_k_score()    Rating: A

Complexity
    Conditions: 3

Size
    Total lines: 14
    Code lines: 11

Duplication
    Duplicated lines: 0
    Duplication ratio: 0 %

Importance
    Changes: 0

Metric    Value
eloc      11
dl        0
loc       14
rs        9.85
c         0
b         0
f         0
cc        3
nop       3

"""Evaluation metrics for Annif"""

import warnings

import numpy as np
import scipy.sparse
from sklearn.metrics import f1_score, precision_score, recall_score

from annif.exception import NotSupportedException
from annif.suggestion import SuggestionBatch, filter_suggestion


def true_positives(y_true, y_pred):
    """calculate the number of true positives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return int((y_true.multiply(y_pred)).sum())


def false_positives(y_true, y_pred):
    """calculate the number of false positives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return int((y_true < y_pred).sum())


def false_negatives(y_true, y_pred):
    """calculate the number of false negatives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return int((y_true > y_pred).sum())
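
The three helpers above count cells over the whole documents-by-subjects matrix, which matches sklearn's micro-averaged confusion counts. A minimal illustrative check with made-up 2-document, 3-subject matrices follows; it is not part of the file and assumes a scipy version with the sparse-array API:

# Illustrative sketch, not part of eval.py: hypothetical y_true/y_pred matrices.
import numpy as np
import scipy.sparse

y_true = scipy.sparse.csr_array(np.array([[1, 0, 1], [0, 1, 0]], dtype=bool))
y_pred = scipy.sparse.csr_array(np.array([[1, 1, 0], [0, 1, 0]], dtype=bool))

print(true_positives(y_true, y_pred))   # 2: predicted and relevant
print(false_positives(y_true, y_pred))  # 1: predicted but not relevant
print(false_negatives(y_true, y_pred))  # 1: relevant but not predicted
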
def dcg_score(y_true, y_pred, limit=None):
    """return the discounted cumulative gain (DCG) score for the selected
    labels vs. relevant labels"""

    n_pred = y_pred.count_nonzero()
    if limit is not None:
        n_pred = min(limit, n_pred)

    top_k = y_pred.data.argsort()[-n_pred:][::-1]
    order = y_pred.indices[top_k]
    gain = y_true[:, order]
    discount = np.log2(np.arange(1, n_pred + 1) + 1)
    return (gain / discount).sum()
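
For reference, dcg_score computes DCG as the sum over ranks i of rel_i / log2(i + 1), where the ranking comes from the predicted scores and rel_i is 1 if the label at rank i is relevant. A hand-worked sketch for a single hypothetical document (illustrative only, not part of the file):

# Illustrative sketch, not part of eval.py. One document, relevant labels {a, c},
# predictions ranked b > a > c; gains in ranked order are therefore [0, 1, 1].
import numpy as np

gain = np.array([0, 1, 1])               # relevance of the ranked predictions
discount = np.log2(np.arange(1, 4) + 1)  # log2(2), log2(3), log2(4)
print((gain / discount).sum())           # 1/log2(3) + 1/log2(4), about 1.13
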
def ndcg_score(y_true, y_pred, limit=None):
    """return the normalized discounted cumulative gain (nDCG) score for the
    selected labels vs. relevant labels"""

    scores = np.ones(y_true.shape[0], dtype=np.float32)
    for i in range(y_true.shape[0]):
        true = y_true.getrow(i)
        idcg = dcg_score(true, true, limit)
        if idcg > 0:
            pred = y_pred.getrow(i)
            dcg = dcg_score(true, pred, limit)
            scores[i] = dcg / idcg

    return float(scores.mean())
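
ndcg_score divides each document's DCG by its ideal DCG (all relevant labels ranked first) and averages the ratio over documents; documents with no relevant subjects keep a score of 1.0. Continuing the single-document sketch above (illustrative only, not part of the file):

# Illustrative sketch, not part of eval.py: nDCG for the document from the
# previous sketch (two relevant labels, ranking b > a > c).
import numpy as np

dcg = 1 / np.log2(3) + 1 / np.log2(4)   # actual ranking
idcg = 1 / np.log2(2) + 1 / np.log2(3)  # ideal ranking: relevant labels first
print(dcg / idcg)                       # about 0.69
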
class EvaluationBatch:
    """A class for evaluating batches of results using all available metrics.
    The evaluate() method is called once per document in the batch or evaluate_many()
    for a list of documents of the batch. Final results can be queried using the
    results() method."""

    def __init__(self, subject_index):
        self._subject_index = subject_index
        self._suggestion_arrays = []
        self._gold_subject_arrays = []

    def evaluate_many(self, suggestion_batch, gold_subject_batch):
        if not isinstance(suggestion_batch, SuggestionBatch):
            suggestion_batch = SuggestionBatch.from_sequence(
                suggestion_batch, self._subject_index
            )
        self._suggestion_arrays.append(suggestion_batch.array)

        # convert gold_subject_batch to sparse matrix
        ar = scipy.sparse.dok_array(
            (len(gold_subject_batch), len(self._subject_index)), dtype=bool
        )
        for idx, subject_set in enumerate(gold_subject_batch):
            for subject_id in subject_set:
                ar[idx, subject_id] = True
        self._gold_subject_arrays.append(ar.tocsr())
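
evaluate_many accumulates one sparse score array per batch and converts the gold standard into a documents-by-subjects boolean matrix. A standalone sketch of that conversion step, using a plain list of subject-id sets in place of Annif's subject set objects (hypothetical data, not part of the file):

# Illustrative sketch, not part of eval.py: gold-standard conversion only.
import scipy.sparse

gold_subject_batch = [{0, 2}, {1}]  # hypothetical subject ids for two documents
vocabulary_size = 4                 # stands in for len(self._subject_index)

ar = scipy.sparse.dok_array((len(gold_subject_batch), vocabulary_size), dtype=bool)
for idx, subject_set in enumerate(gold_subject_batch):
    for subject_id in subject_set:
        ar[idx, subject_id] = True
print(ar.tocsr().toarray().astype(int))
# [[1 0 1 0]
#  [0 1 0 0]]
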
    def _evaluate_samples(self, y_true, y_pred, metrics=[]):
        y_pred_binary = y_pred > 0.0

        # define the available metrics as lazy lambda functions
        # so we can execute only the ones actually requested
        all_metrics = {
            "Precision (doc avg)": lambda: precision_score(
                y_true, y_pred_binary, average="samples"
            ),
            "Recall (doc avg)": lambda: recall_score(
                y_true, y_pred_binary, average="samples"
            ),
            "F1 score (doc avg)": lambda: f1_score(
                y_true, y_pred_binary, average="samples"
            ),
            "Precision (subj avg)": lambda: precision_score(
                y_true, y_pred_binary, average="macro"
            ),
            "Recall (subj avg)": lambda: recall_score(
                y_true, y_pred_binary, average="macro"
            ),
            "F1 score (subj avg)": lambda: f1_score(
                y_true, y_pred_binary, average="macro"
            ),
            "Precision (weighted subj avg)": lambda: precision_score(
                y_true, y_pred_binary, average="weighted"
            ),
            "Recall (weighted subj avg)": lambda: recall_score(
                y_true, y_pred_binary, average="weighted"
            ),
            "F1 score (weighted subj avg)": lambda: f1_score(
                y_true, y_pred_binary, average="weighted"
            ),
            "Precision (microavg)": lambda: precision_score(
                y_true, y_pred_binary, average="micro"
            ),
            "Recall (microavg)": lambda: recall_score(
                y_true, y_pred_binary, average="micro"
            ),
            "F1 score (microavg)": lambda: f1_score(
                y_true, y_pred_binary, average="micro"
            ),
            "F1@5": lambda: f1_score(
                y_true, filter_suggestion(y_pred, 5) > 0.0, average="samples"
            ),
            "NDCG": lambda: ndcg_score(y_true, y_pred),
            "NDCG@5": lambda: ndcg_score(y_true, y_pred, limit=5),
            "NDCG@10": lambda: ndcg_score(y_true, y_pred, limit=10),
            "Precision@1": lambda: precision_score(
                y_true, filter_suggestion(y_pred, 1) > 0.0, average="samples"
            ),
            "Precision@3": lambda: precision_score(
                y_true, filter_suggestion(y_pred, 3) > 0.0, average="samples"
            ),
            "Precision@5": lambda: precision_score(
                y_true, filter_suggestion(y_pred, 5) > 0.0, average="samples"
            ),
            "True positives": lambda: true_positives(y_true, y_pred_binary),
            "False positives": lambda: false_positives(y_true, y_pred_binary),
            "False negatives": lambda: false_negatives(y_true, y_pred_binary),
        }

        if not metrics:
            metrics = all_metrics.keys()

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            return {metric: all_metrics[metric]() for metric in metrics}
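
The metrics above are wrapped in zero-argument lambdas so that only the requested ones are ever evaluated, and the warnings filter silences warnings raised during metric computation (such as sklearn's undefined-metric warnings). A stripped-down, self-contained sketch of the same lazy pattern with hypothetical metric names (not part of the file):

# Illustrative sketch, not part of eval.py: lazy metric evaluation.
import warnings

import numpy as np


def evaluate_lazy(y_true, y_pred, metrics=[]):
    all_metrics = {
        "tp": lambda: int((y_true & y_pred).sum()),
        "fp": lambda: int((~y_true & y_pred).sum()),
        "fn": lambda: int((y_true & ~y_pred).sum()),
    }
    if not metrics:
        metrics = all_metrics.keys()
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        return {metric: all_metrics[metric]() for metric in metrics}


print(evaluate_lazy(np.array([True, False, True]), np.array([True, True, False]), ["tp"]))
# {'tp': 1} -- only the "tp" lambda was executed
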
    def _result_per_subject_header(self, results_file):
        print(
            "\t".join(
                [
                    "URI",
                    "Label",
                    "Support",
                    "True_positives",
                    "False_positives",
                    "False_negatives",
                    "Precision",
                    "Recall",
                    "F1_score",
                ]
            ),
            file=results_file,
        )

    def _result_per_subject_body(self, zipped_results, results_file):
        for row in zipped_results:
            print("\t".join((str(e) for e in row)), file=results_file)

Issue introduced by this pull request: the variable e does not seem to be defined in case the for loop above is not entered. Are you sure this can never be the case?

    def output_result_per_subject(self, y_true, y_pred, results_file, language):
        """Write results per subject (non-aggregated)
        to outputfile results_file, using labels in the given language"""

        y_pred = y_pred.T > 0.0
        y_true = y_true.T

        true_pos = y_true.multiply(y_pred).sum(axis=1)
        false_pos = (y_true < y_pred).sum(axis=1)
        false_neg = (y_true > y_pred).sum(axis=1)

        with np.errstate(invalid="ignore"):
            precision = np.nan_to_num(true_pos / (true_pos + false_pos))
            recall = np.nan_to_num(true_pos / (true_pos + false_neg))
            f1_score = np.nan_to_num(2 * (precision * recall) / (precision + recall))

        zipped = zip(
            [subj.uri for subj in self._subject_index],  # URI
            [subj.labels[language] for subj in self._subject_index],  # Label
            y_true.sum(axis=1),  # Support
            true_pos,  # True positives
            false_pos,  # False positives
            false_neg,  # False negatives
            precision,  # Precision
            recall,  # Recall
            f1_score,  # F1 score
        )
        self._result_per_subject_header(results_file)
        self._result_per_subject_body(zipped, results_file)
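
Per-subject precision, recall and F1 above are derived directly from the per-subject count vectors, with nan_to_num turning 0/0 cases into zeros. A quick worked example for a single subject with hypothetical counts (not part of the file):

# Illustrative sketch, not part of eval.py: one subject with made-up counts.
import numpy as np

true_pos = np.array([3.0])
false_pos = np.array([1.0])
false_neg = np.array([2.0])

with np.errstate(invalid="ignore"):
    precision = np.nan_to_num(true_pos / (true_pos + false_pos))  # [0.75]
    recall = np.nan_to_num(true_pos / (true_pos + false_neg))     # [0.6]
    f1 = np.nan_to_num(2 * (precision * recall) / (precision + recall))

print(precision, recall, f1)  # [0.75] [0.6] [0.66666667]
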
    def results(self, metrics=[], results_file=None, language=None):
        """evaluate a set of selected subjects against a gold standard using
        different metrics. If metrics is empty, use all available metrics.
        If results_file (file object) given, write results per subject to it
        with labels expressed in the given language."""

        if not self._suggestion_arrays:
            raise NotSupportedException("cannot evaluate empty corpus")

        y_pred = scipy.sparse.csr_array(scipy.sparse.vstack(self._suggestion_arrays))
        y_true = scipy.sparse.csr_array(scipy.sparse.vstack(self._gold_subject_arrays))

        results = self._evaluate_samples(y_true, y_pred, metrics)
        results["Documents evaluated"] = int(y_true.shape[0])

        if results_file:
            self.output_result_per_subject(y_true, y_pred, results_file, language)
        return results
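
results() vertically stacks the per-batch sparse arrays accumulated by evaluate_many into single documents-by-subjects matrices before computing the metrics. An illustrative sketch of just that stacking step with hypothetical batch arrays (not part of the file; assumes a scipy version with the sparse-array API):

# Illustrative sketch, not part of eval.py: stacking two hypothetical score batches.
import numpy as np
import scipy.sparse

batch1 = scipy.sparse.csr_array(np.array([[0.9, 0.0], [0.0, 0.4]]))
batch2 = scipy.sparse.csr_array(np.array([[0.2, 0.7]]))

y_pred = scipy.sparse.csr_array(scipy.sparse.vstack([batch1, batch2]))
print(y_pred.shape)  # (3, 2) -- three documents evaluated in total
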