"""Evaluation metrics for Annif"""

import collections
import statistics
import warnings
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import label_ranking_average_precision_score


def true_positives(y_true, y_pred):
    """calculate the number of true positives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return (y_true & y_pred).sum()


def false_positives(y_true, y_pred):
    """calculate the number of false positives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return (~y_true & y_pred).sum()


def false_negatives(y_true, y_pred):
    """calculate the number of false negatives using bitwise operations,
    emulating the way sklearn evaluation metric functions work"""
    return (y_true & ~y_pred).sum()
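
# Illustrative sketch (not part of the original module): with small boolean
# indicator arrays the three counters above behave like this:
#
#     >>> yt = np.array([[True, False, True]])
#     >>> yp = np.array([[True, True, False]])
#     >>> int(true_positives(yt, yp))
#     1
#     >>> int(false_positives(yt, yp))
#     1
#     >>> int(false_negatives(yt, yp))
#     1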


def precision_at_k_score(y_true, y_pred, limit):
    """calculate the precision at K, i.e. the fraction of relevant items
    among the top K predicted ones"""
    scores = []
    for true, pred in zip(y_true, y_pred):
        order = pred.argsort()[::-1]
        # use a per-document cutoff: reassigning the limit parameter here
        # would let a document with few predictions lower the limit for all
        # the documents that follow it
        n_pred = min(limit, np.count_nonzero(pred))
        if n_pred == 0:
            scores.append(0.0)  # no predictions at all for this document
            continue
        order = order[:n_pred]
        gain = true[order]
        scores.append(gain.sum() / n_pred)
    return statistics.mean(scores)
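
# Illustrative sketch (not part of the original module): of the top two
# scored labels only the first one is relevant, so precision@2 is 0.5:
#
#     >>> yt = np.array([[1, 0, 1, 0]])
#     >>> yp = np.array([[0.9, 0.8, 0.0, 0.1]])
#     >>> float(precision_at_k_score(yt, yp, limit=2))
#     0.5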


def dcg_score(y_true, y_pred, limit=None):
    """return the discounted cumulative gain (DCG) score for the selected
    labels vs. relevant labels"""
    order = y_pred.argsort()[::-1]
    n_pred = np.count_nonzero(y_pred)
    if limit is not None:
        n_pred = min(limit, n_pred)
    order = order[:n_pred]
    gain = y_true[order]
    discount = np.log2(np.arange(order.size) + 2)

    return (gain / discount).sum()
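
# Illustrative sketch (not part of the original module): relevant labels at
# ranks 1 and 3 contribute 1/log2(2) + 1/log2(4) = 1.0 + 0.5:
#
#     >>> float(dcg_score(np.array([1, 0, 1]), np.array([0.9, 0.8, 0.5])))
#     1.5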


def ndcg_score(y_true, y_pred, limit=None):
    """return the normalized discounted cumulative gain (nDCG) score for the
    selected labels vs. relevant labels"""
    scores = []
    for true, pred in zip(y_true, y_pred):
        idcg = dcg_score(true, true, limit)
        dcg = dcg_score(true, pred, limit)
        if idcg > 0:
            scores.append(dcg / idcg)
        else:
            scores.append(1.0)  # perfect score for no relevant hits case
    return statistics.mean(scores)
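
# Illustrative sketch (not part of the original module): ranking the single
# relevant label first gives a perfect nDCG of 1.0:
#
#     >>> yt = np.array([[0, 1, 0]])
#     >>> yp = np.array([[0.2, 0.9, 0.1]])
#     >>> float(ndcg_score(yt, yp))
#     1.0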


class EvaluationBatch:
    """A class for evaluating batches of results using all available metrics.
    The evaluate() method is called once per document in the batch.
    Final results can be queried using the results() method."""

    def __init__(self, subject_index):
        self._subject_index = subject_index
        self._samples = []

    def evaluate(self, hits, gold_subjects):
        self._samples.append((hits, gold_subjects))

    def _evaluate_samples(self, y_true, y_pred, metrics='all'):
        y_pred_binary = y_pred > 0.0
        results = collections.OrderedDict()
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')

            results['Precision (doc avg)'] = precision_score(
                y_true, y_pred_binary, average='samples')
            results['Recall (doc avg)'] = recall_score(
                y_true, y_pred_binary, average='samples')
            results['F1 score (doc avg)'] = f1_score(
                y_true, y_pred_binary, average='samples')
            if metrics == 'all':
                results['Precision (conc avg)'] = precision_score(
                    y_true, y_pred_binary, average='macro')
                results['Recall (conc avg)'] = recall_score(
                    y_true, y_pred_binary, average='macro')
                results['F1 score (conc avg)'] = f1_score(
                    y_true, y_pred_binary, average='macro')
                results['Precision (microavg)'] = precision_score(
                    y_true, y_pred_binary, average='micro')
                results['Recall (microavg)'] = recall_score(
                    y_true, y_pred_binary, average='micro')
                results['F1 score (microavg)'] = f1_score(
                    y_true, y_pred_binary, average='micro')
            results['NDCG'] = ndcg_score(y_true, y_pred)
            results['NDCG@5'] = ndcg_score(y_true, y_pred, limit=5)
            results['NDCG@10'] = ndcg_score(y_true, y_pred, limit=10)
            if metrics == 'all':
                results['Precision@1'] = precision_at_k_score(
                    y_true, y_pred, limit=1)
                results['Precision@3'] = precision_at_k_score(
                    y_true, y_pred, limit=3)
                results['Precision@5'] = precision_at_k_score(
                    y_true, y_pred, limit=5)
                results['LRAP'] = label_ranking_average_precision_score(
                    y_true, y_pred)
                results['True positives'] = true_positives(
                    y_true, y_pred_binary)
                results['False positives'] = false_positives(
                    y_true, y_pred_binary)
                results['False negatives'] = false_negatives(
                    y_true, y_pred_binary)

        return results

    def results(self, metrics='all'):
        """evaluate a set of selected subjects against a gold standard using
        different metrics. The set of metrics can be either 'all' or
        'simple'."""

        y_true = np.array([gold_subjects.as_vector(self._subject_index)
                           for hits, gold_subjects in self._samples])
        y_pred = np.array([hits.vector
                           for hits, gold_subjects in self._samples])

        results = self._evaluate_samples(
            y_true, y_pred, metrics)
        results['Documents evaluated'] = y_true.shape[0]
        return results
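
# Typical usage sketch (illustrative, not part of the original module; the
# subject_index, hits and gold_subjects objects come from elsewhere in Annif
# and their interfaces are assumed here, not defined in this file):
#
#     batch = EvaluationBatch(subject_index)
#     for hits, gold_subjects in samples:   # hypothetical iterable of pairs
#         batch.evaluate(hits, gold_subjects)
#     for metric, value in batch.results(metrics='simple').items():
#         print(metric, value)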