Completed: Push to master (299d84...ccff81) by Osma

annif.eval.EvaluationBatch.results()   A

Complexity
  Conditions: 1

Size
  Total Lines: 14
  Code Lines: 9

Duplication
  Lines: 0
  Ratio: 0 %

Importance
  Changes: 0

Metric   Value
cc       1
eloc     9
nop      2
dl       0
loc      14
rs       9.95
c        0
b        0
f        0

"""Evaluation metrics for Annif"""

import collections
import statistics
import warnings
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import label_ranking_average_precision_score


def true_positives(y_true, y_pred):
    """calculate the number of true positives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return (y_true & y_pred).sum()


def false_positives(y_true, y_pred):
    """calculate the number of false positives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return (~y_true & y_pred).sum()


def false_negatives(y_true, y_pred):
    """calculate the number of false negatives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return (y_true & ~y_pred).sum()
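
# Added illustration (not from the original module): a quick sanity check of
# the bitwise helpers on toy boolean vectors.
#   gold = np.array([True, False, True, False])
#   pred = np.array([True, True,  False, False])
#   true_positives(gold, pred)   -> 1  (item 0 is a hit in both)
#   false_positives(gold, pred)  -> 1  (item 1 is predicted but not relevant)
#   false_negatives(gold, pred)  -> 1  (item 2 is relevant but not predicted)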


def precision_at_k_score(y_true, y_pred, limit):
    """calculate the precision at K, i.e. the fraction of relevant items
    among the top K predicted ones"""
    scores = []
    for true, pred in zip(y_true, y_pred):
        order = pred.argsort()[::-1]
        # clamp to the number of actual predictions for this document without
        # overwriting the limit that applies to the remaining documents
        n_pred = min(limit, np.count_nonzero(pred))
        order = order[:n_pred]
        gain = true[order]
        scores.append(gain.sum() / n_pred)
    return statistics.mean(scores)
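
# Added worked example (not from the original module): with predicted scores
# pred = [0.9, 0.6, 0.0, 0.3] and gold labels true = [1, 0, 1, 0], the ranking
# of nonzero predictions is items 0, 1, 3. For limit=2 the top-2 cut is items
# 0 and 1, of which one is relevant, so this document scores 1/2 = 0.5; the
# function returns the mean of these per-document scores.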


def dcg_score(y_true, y_pred, limit=None):
    """return the discounted cumulative gain (DCG) score for the selected
    labels vs. relevant labels"""
    order = y_pred.argsort()[::-1]
    n_pred = np.count_nonzero(y_pred)
    if limit is not None:
        n_pred = min(limit, n_pred)
    order = order[:n_pred]
    gain = y_true[order]
    discount = np.log2(np.arange(order.size) + 2)

    return (gain / discount).sum()
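
# Added note (not from the original module): with relevance gains in ranked
# order r_1, r_2, ..., r_k, the log2 discount above computes
#   DCG@k = sum_i r_i / log2(i + 1)
# e.g. gains [1, 0, 1] give 1/log2(2) + 0/log2(3) + 1/log2(4) = 1 + 0 + 0.5 = 1.5.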


def ndcg_score(y_true, y_pred, limit=None):
    """return the normalized discounted cumulative gain (nDCG) score for the
    selected labels vs. relevant labels"""
    scores = []
    for true, pred in zip(y_true, y_pred):
        idcg = dcg_score(true, true, limit)
        dcg = dcg_score(true, pred, limit)
        if idcg > 0:
            scores.append(dcg / idcg)
        else:
            scores.append(1.0)  # perfect score for no relevant hits case
    return statistics.mean(scores)
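
# Added note (not from the original module): nDCG divides the DCG of the
# predicted ranking by the DCG of the ideal ranking (all relevant items
# ranked first), so a perfect ranking scores 1.0. Continuing the example
# above with two relevant items, the ideal DCG is 1/log2(2) + 1/log2(3),
# roughly 1.63, giving nDCG of about 1.5 / 1.63, i.e. about 0.92.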


class EvaluationBatch:
    """A class for evaluating batches of results using all available metrics.
    The evaluate() method is called once per document in the batch.
    Final results can be queried using the results() method."""

    def __init__(self, subject_index):
        self._subject_index = subject_index
        self._samples = []

    def evaluate(self, hits, gold_subjects):
        self._samples.append((hits, gold_subjects))

    def _evaluate_samples(self, y_true, y_pred, metrics='all'):
        y_pred_binary = y_pred > 0.0
        results = collections.OrderedDict()
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
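
            # Added note (not from the original module) on the three sklearn
            # averaging modes used below:
            #   average='samples': compute the metric per document, then
            #       average over documents ("doc avg")
            #   average='macro': compute the metric per subject/concept, then
            #       average over subjects ("conc avg")
            #   average='micro': pool all individual label decisions into
            #       global counts before computing the metric ("microavg")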

            results['Precision (doc avg)'] = precision_score(
                y_true, y_pred_binary, average='samples')
            results['Recall (doc avg)'] = recall_score(
                y_true, y_pred_binary, average='samples')
            results['F1 score (doc avg)'] = f1_score(
                y_true, y_pred_binary, average='samples')
            if metrics == 'all':
                results['Precision (conc avg)'] = precision_score(
                    y_true, y_pred_binary, average='macro')
                results['Recall (conc avg)'] = recall_score(
                    y_true, y_pred_binary, average='macro')
                results['F1 score (conc avg)'] = f1_score(
                    y_true, y_pred_binary, average='macro')
                results['Precision (microavg)'] = precision_score(
                    y_true, y_pred_binary, average='micro')
                results['Recall (microavg)'] = recall_score(
                    y_true, y_pred_binary, average='micro')
                results['F1 score (microavg)'] = f1_score(
                    y_true, y_pred_binary, average='micro')
            results['NDCG'] = ndcg_score(y_true, y_pred)
            results['NDCG@5'] = ndcg_score(y_true, y_pred, limit=5)
            results['NDCG@10'] = ndcg_score(y_true, y_pred, limit=10)
            if metrics == 'all':
                results['Precision@1'] = precision_at_k_score(
                    y_true, y_pred, limit=1)
                results['Precision@3'] = precision_at_k_score(
                    y_true, y_pred, limit=3)
                results['Precision@5'] = precision_at_k_score(
                    y_true, y_pred, limit=5)
                results['LRAP'] = label_ranking_average_precision_score(
                    y_true, y_pred)
                results['True positives'] = true_positives(
                    y_true, y_pred_binary)
                results['False positives'] = false_positives(
                    y_true, y_pred_binary)
                results['False negatives'] = false_negatives(
                    y_true, y_pred_binary)

        return results

    def results(self, metrics='all'):
        """evaluate a set of selected subjects against a gold standard using
        different metrics. The set of metrics can be either 'all' or
        'simple'."""

        y_true = np.array([gold_subjects.as_vector(self._subject_index)
                           for hits, gold_subjects in self._samples])
        y_pred = np.array([hits.vector
                           for hits, gold_subjects in self._samples])

        results = self._evaluate_samples(
            y_true, y_pred, metrics)
        results['Documents evaluated'] = y_true.shape[0]
        return results
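
A minimal usage sketch (illustrative only, with hypothetical stand-in objects;
the real hits and gold_subjects come from Annif's own classes, but anything
exposing a .vector attribute and an .as_vector(subject_index) method that
return aligned numpy arrays will drive EvaluationBatch the same way):

import numpy as np

class FakeHits:                      # stand-in for an Annif hit list
    def __init__(self, vector):
        self.vector = np.array(vector)

class FakeGold:                      # stand-in for a gold-standard subject set
    def __init__(self, vector):
        self._vector = np.array(vector, dtype=bool)

    def as_vector(self, subject_index):
        return self._vector

batch = EvaluationBatch(subject_index=None)   # index unused by the stand-ins
batch.evaluate(FakeHits([0.9, 0.0, 0.4]), FakeGold([1, 0, 1]))
batch.evaluate(FakeHits([0.0, 0.7, 0.0]), FakeGold([0, 1, 0]))
print(batch.results(metrics='simple'))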