Passed: Pull Request to master (#414) by Osma, created 02:11

annif.eval.false_negatives()   A

Complexity:  Conditions 1
Size:        Total Lines 4, Code Lines 2
Duplication: Lines 0, Ratio 0 %
Importance:  Changes 0

Metric  Value
eloc    2
dl      0
loc     4
rs      10
c       0
b       0
f       0
cc      1
nop     2
"""Evaluation metrics for Annif"""

import statistics
import warnings
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import label_ranking_average_precision_score
from annif.exception import NotSupportedException


def filter_pred_top_k(preds, limit):
    """filter a 2D prediction vector, retaining only the top K suggestions
    for each individual prediction; the rest will be set to zeros"""

    masks = []
    for pred in preds:
        mask = np.zeros_like(pred, dtype=bool)
        top_k = np.argsort(pred)[::-1][:limit]
        mask[top_k] = True
        masks.append(mask)
    return preds * np.array(masks)
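
# Illustrative sketch (not part of the module under review): with limit=2,
# filter_pred_top_k keeps only the two highest scores in each row and zeroes
# out the rest.
#
#   preds = np.array([[0.1, 0.9, 0.4, 0.0],
#                     [0.6, 0.2, 0.0, 0.8]])
#   filter_pred_top_k(preds, 2)
#   # -> [[0.0, 0.9, 0.4, 0.0],
#   #     [0.6, 0.0, 0.0, 0.8]]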


def true_positives(y_true, y_pred):
    """calculate the number of true positives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return (y_true & y_pred).sum()


def false_positives(y_true, y_pred):
    """calculate the number of false positives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return (~y_true & y_pred).sum()


def false_negatives(y_true, y_pred):
    """calculate the number of false negatives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return (y_true & ~y_pred).sum()
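
# Worked toy example (illustrative only); these helpers expect boolean arrays:
#
#   y_true = np.array([True, True, False, False])
#   y_pred = np.array([True, False, True, False])
#   true_positives(y_true, y_pred)    # -> 1
#   false_positives(y_true, y_pred)   # -> 1
#   false_negatives(y_true, y_pred)   # -> 1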


def precision_at_k_score(y_true, y_pred, limit):
    """calculate the precision at K, i.e. the fraction of relevant items
    among the top K predicted ones"""
    scores = []
    for true, pred in zip(y_true, y_pred):
        order = pred.argsort()[::-1]
        orderlimit = min(limit, np.count_nonzero(pred))
        order = order[:orderlimit]
        gain = true[order]
        if orderlimit > 0:
            scores.append(gain.sum() / orderlimit)
        else:
            scores.append(0.0)
    return statistics.mean(scores)
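
# Small illustration (not from the original file): subjects 0 and 2 are
# relevant, but only one of the two highest-scoring predictions is,
# so P@2 = 0.5.
#
#   y_true = np.array([[1, 0, 1, 0]])
#   y_pred = np.array([[0.9, 0.8, 0.1, 0.0]])
#   precision_at_k_score(y_true, y_pred, limit=2)   # -> 0.5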


def dcg_score(y_true, y_pred, limit=None):
    """return the discounted cumulative gain (DCG) score for the selected
    labels vs. relevant labels"""
    order = y_pred.argsort()[::-1]
    n_pred = np.count_nonzero(y_pred)
    if limit is not None:
        n_pred = min(limit, n_pred)
    order = order[:n_pred]
    gain = y_true[order]
    discount = np.log2(np.arange(order.size) + 2)

    return (gain / discount).sum()
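
# For reference: the discount term log2(arange(n) + 2) gives the standard DCG
# weighting sum_i rel_i / log2(i + 1) over 1-based ranks i of the predicted
# (nonzero-scored) labels, truncated to `limit` when given.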


def ndcg_score(y_true, y_pred, limit=None):
    """return the normalized discounted cumulative gain (nDCG) score for the
    selected labels vs. relevant labels"""
    scores = []
    for true, pred in zip(y_true, y_pred):
        idcg = dcg_score(true, true, limit)
        dcg = dcg_score(true, pred, limit)
        if idcg > 0:
            scores.append(dcg / idcg)
        else:
            scores.append(1.0)  # perfect score for no relevant hits case
    return statistics.mean(scores)
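
# Rough worked example (illustrative only): subjects 0 and 2 are relevant and
# are ranked 1st and 3rd by the predictions, so DCG = 1/log2(2) + 1/log2(4) = 1.5
# while the ideal DCG = 1/log2(2) + 1/log2(3) ~ 1.63, giving nDCG ~ 0.92.
#
#   y_true = np.array([[1, 0, 1, 0]])
#   y_pred = np.array([[0.9, 0.8, 0.1, 0.0]])
#   ndcg_score(y_true, y_pred)   # -> approximately 0.92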


class EvaluationBatch:
    """A class for evaluating batches of results using all available metrics.
    The evaluate() method is called once per document in the batch.
    Final results can be queried using the results() method."""

    def __init__(self, subject_index):
        self._subject_index = subject_index
        self._samples = []

    def evaluate(self, hits, gold_subjects):
        self._samples.append((hits, gold_subjects))

    def _evaluate_samples(self, y_true, y_pred, metrics='all'):
        y_pred_binary = y_pred > 0.0

        # define the available metrics as lazy lambda functions
        # so we can execute only the ones actually requested
        all_metrics = {
            'Precision (doc avg)': lambda: precision_score(
                y_true, y_pred_binary, average='samples'),
            'Recall (doc avg)': lambda: recall_score(
                y_true, y_pred_binary, average='samples'),
            'F1 score (doc avg)': lambda: f1_score(
                y_true, y_pred_binary, average='samples'),
            'Precision (subj avg)': lambda: precision_score(
                y_true, y_pred_binary, average='macro'),
            'Recall (subj avg)': lambda: recall_score(
                y_true, y_pred_binary, average='macro'),
            'F1 score (subj avg)': lambda: f1_score(
                y_true, y_pred_binary, average='macro'),
            'Precision (weighted subj avg)': lambda: precision_score(
                y_true, y_pred_binary, average='weighted'),
            'Recall (weighted subj avg)': lambda: recall_score(
                y_true, y_pred_binary, average='weighted'),
            'F1 score (weighted subj avg)': lambda: f1_score(
                y_true, y_pred_binary, average='weighted'),
            'Precision (microavg)': lambda: precision_score(
                y_true, y_pred_binary, average='micro'),
            'Recall (microavg)': lambda: recall_score(
                y_true, y_pred_binary, average='micro'),
            'F1 score (microavg)': lambda: f1_score(
                y_true, y_pred_binary, average='micro'),
            'F1@5': lambda: f1_score(
                y_true, filter_pred_top_k(y_pred, 5) > 0.0, average='samples'),
            'NDCG': lambda: ndcg_score(y_true, y_pred),
            'NDCG@5': lambda: ndcg_score(y_true, y_pred, limit=5),
            'NDCG@10': lambda: ndcg_score(y_true, y_pred, limit=10),
            'Precision@1': lambda: precision_at_k_score(
                y_true, y_pred, limit=1),
            'Precision@3': lambda: precision_at_k_score(
                y_true, y_pred, limit=3),
            'Precision@5': lambda: precision_at_k_score(
                y_true, y_pred, limit=5),
            'LRAP': lambda: label_ranking_average_precision_score(
                y_true, y_pred),
            'True positives': lambda: true_positives(
                y_true, y_pred_binary),
            'False positives': lambda: false_positives(
                y_true, y_pred_binary),
            'False negatives': lambda: false_negatives(
                y_true, y_pred_binary),
        }

        if metrics == 'all':
            metrics = all_metrics.keys()

        with warnings.catch_warnings():
            warnings.simplefilter('ignore')

            return {metric: all_metrics[metric]() for metric in metrics}
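
    # Note (illustrative, not part of the original code): since every metric is
    # wrapped in a lambda, a caller can request a subset and only those
    # functions run, e.g.
    #   self._evaluate_samples(y_true, y_pred, ['NDCG', 'Precision (doc avg)'])
    # would evaluate just those two entries of all_metrics.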

    def _result_per_subject_header(self, results_file):
        print('\t'.join(['URI',
                         'Label',
                         'Support',
                         'True_positives',
                         'False_positives',
                         'False_negatives',
                         'Precision',
                         'Recall',
                         'F1_score']),
              file=results_file)

    def _result_per_subject_body(self, zipped_results, results_file):
        for row in zipped_results:
            print('\t'.join((str(e) for e in row)), file=results_file)

Issue flagged by the analyzer (introduced by this change): "The variable e does not seem to be defined in case the for loop is not entered. Are you sure this can never be the case?"

    def output_result_per_subject(self, y_true, y_pred, results_file):
        """Write results per subject (non-aggregated)
        to the output file results_file"""

        y_pred = y_pred.T > 0.0
        y_true = y_true.T > 0.0

        true_pos = (y_true & y_pred)
        false_pos = (~y_true & y_pred)
        false_neg = (y_true & ~y_pred)

        r = len(y_true)

        zipped = zip(self._subject_index._uris,               # URI
                     self._subject_index._labels,             # Label
                     np.sum((true_pos + false_neg), axis=1),  # Support
                     np.sum(true_pos, axis=1),                # True_positives
                     np.sum(false_pos, axis=1),               # False_positives
                     np.sum(false_neg, axis=1),               # False_negatives
                     [precision_score(y_true[i], y_pred[i], zero_division=0)
                      for i in range(r)],                     # Precision
                     [recall_score(y_true[i], y_pred[i], zero_division=0)
                      for i in range(r)],                     # Recall
                     [f1_score(y_true[i], y_pred[i], zero_division=0)
                      for i in range(r)])                     # F1
        self._result_per_subject_header(results_file)
        self._result_per_subject_body(zipped, results_file)
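
    # Clarifying note (not in the original file): the transposes at the start
    # of output_result_per_subject make each row of y_true/y_pred correspond to
    # a single subject across all documents, so the per-row precision, recall
    # and F1 values written out are per-subject scores.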

    def results(self, metrics='all', results_file=None, warnings=False):
        """evaluate a set of selected subjects against a gold standard using
        different metrics. The set of metrics can be either 'all' or 'simple'.
        If results_file (a file object) is given, write results per subject
        to it"""

        if not self._samples:
            raise NotSupportedException("cannot evaluate empty corpus")

        y_true = np.array([gold_subjects.as_vector(self._subject_index,
                                                   warnings=warnings)
                           for hits, gold_subjects in self._samples])
        y_pred = np.array([hits.as_vector(self._subject_index)
                           for hits, gold_subjects in self._samples],
                          dtype=np.float32)

        results = self._evaluate_samples(
            y_true, y_pred, metrics)
        results['Documents evaluated'] = y_true.shape[0]

        if results_file:
            self.output_result_per_subject(y_true, y_pred, results_file)
        return results
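
# Hedged usage sketch (not part of the module; `subject_index`, `backend` and
# `documents` are hypothetical stand-ins for the corresponding Annif objects):
#
#   batch = EvaluationBatch(subject_index)
#   for doc in documents:
#       hits = backend.suggest(doc.text)         # hypothetical suggestion call
#       batch.evaluate(hits, doc.gold_subjects)  # hypothetical gold-standard attribute
#   metrics = batch.results()
#   print(metrics['Documents evaluated'], metrics['NDCG'])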