import logging

import numpy as np

logger = logging.getLogger(__file__)

per_class_performance_index = ['true_positive', 'true_negative', 'false_positive', 'false_negative',
                               'accuracy', 'misclassification', 'recall', 'false positive rate',
                               'specificity', 'precision', 'prevalence', 'f-1 measure', 'g-measure']

overall_performance_index = ['average accuracy', 'weighed accuracy',
                             'precision (micro)', 'recall (micro)', 'f-1 score (micro)',
                             'precision (macro)', 'recall (macro)', 'f-1 score (macro)',
                             'exact matching ratio']


def get_confusion_matrix(num_classes, label, predicted):
    """Calculate the confusion matrix from ground-truth and predicted labels

    Args:
        num_classes (:obj:`int`): Number of classes
        label (:obj:`list` of :obj:`int`): ground truth labels
        predicted (:obj:`list` of :obj:`int`): predicted labels

    Returns:
        :class:`numpy.ndarray`: Confusion matrix (`num_classes` by `num_classes`); rows are ground-truth
        classes, columns are predicted classes
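
    Example:
        Illustrative usage with three classes (values chosen purely for illustration)::

            matrix = get_confusion_matrix(3, [0, 1, 2, 2], [0, 2, 2, 2])
            # diagonal entries count correct predictions: matrix[0][0] == 1, matrix[2][2] == 2
            # matrix[1][2] == 1: the single class-1 sample was predicted as class 2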
    """
    matrix = np.zeros((num_classes, num_classes))
    for i in range(len(label)):
        matrix[label[i]][predicted[i]] += 1
    return matrix


def get_performance_array(confusion_matrix):
    """Calculate performance arrays based on the given confusion matrix

    Gets a performance array for each class:
        0 - True_Positive: number of samples that belong to the class and were classified correctly
        1 - True_Negative: number of samples correctly classified as not belonging to the class
        2 - False_Positive: number of samples that do not belong to the class but were classified as the class
        3 - False_Negative: number of samples that belong to the class but were not classified as the class
        4 - Accuracy: Overall, how often is the classifier correct? (TP + TN) / (TP + TN + FP + FN)
        5 - Misclassification: Overall, how often is it wrong? (FP + FN) / (TP + TN + FP + FN)
        6 - Recall: When it's actually yes, how often does it predict yes? TP / (TP + FN)
        7 - False Positive Rate: When it's actually no, how often does it predict yes? FP / (FP + TN)
        8 - Specificity: When it's actually no, how often does it predict no? TN / (FP + TN)
        9 - Precision: When it predicts yes, how often is it correct? TP / (TP + FP)
        10 - Prevalence: How often does the yes condition actually occur in our sample? Total(class) / Total(samples)
        11 - F(1) Measure: 2 * (precision * recall) / (precision + recall)
        12 - G Measure: sqrt(precision * recall)

    Gets overall performance for the classifier:
        0 - Average Accuracy: The average per-class effectiveness of a classifier
        1 - Weighed Accuracy: The average effectiveness of a classifier weighed by the prevalence of each class
        2 - Precision (micro): Agreement of the data class labels with those of the classifier, calculated from
            sums of per-text decisions
        3 - Recall (micro): Effectiveness of the classifier at identifying class labels, calculated from sums of
            per-text decisions
        4 - F-Score (micro): Relationship between the data's positive labels and those given by the classifier,
            based on sums of per-text decisions
        5 - Precision (macro): Average per-class agreement of the data class labels with those of the classifier
        6 - Recall (macro): Average per-class effectiveness of the classifier at identifying class labels
        7 - F-Score (macro): Relationship between the data's positive labels and those given by the classifier,
            based on a per-class average
        8 - Exact Matching Ratio: The average per-text exact classification

    Note: In multi-class classification, Micro-Precision == Micro-Recall == Micro-FScore == Exact Matching Ratio
        (multi-class classification: each input is classified into one and only one class)

    Reference:
        Sokolova, M., & Lapalme, G. (2009). A systematic analysis of performance measures for classification tasks.
        Information Processing and Management, 45(4), 427-437.

    Args:
        confusion_matrix (:class:`numpy.ndarray`): Confusion matrix (`num_classes` by `num_classes`), as returned
            by :func:`get_confusion_matrix`

    Returns:
        :obj:`tuple` of :class:`numpy.ndarray`: Tuple of overall performance and per-class performance
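
    Example:
        Illustrative usage (values chosen purely for illustration); individual metrics can be looked up through
        the module-level index lists::

            matrix = get_confusion_matrix(3, [0, 1, 2, 2], [0, 2, 2, 2])
            overall, per_class = get_performance_array(matrix)
            recall_class_2 = per_class[2][per_class_performance_index.index('recall')]
            average_accuracy = overall[overall_performance_index.index('average accuracy')]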
    """
    if confusion_matrix.shape[0] != confusion_matrix.shape[1]:
        logger.error("confusion matrix with shape " + str(confusion_matrix.shape) + " is not square.")
        return None, None

    num_classes = confusion_matrix.shape[0]

    per_class = np.zeros((num_classes, len(per_class_performance_index)), dtype=float)
    overall = np.zeros((len(overall_performance_index),), dtype=float)

    for i in range(num_classes):
        true_positive = confusion_matrix[i][i]
        true_negative = np.sum(confusion_matrix) \
            - np.sum(confusion_matrix[i, :]) \
            - np.sum(confusion_matrix[:, i]) \
            + confusion_matrix[i][i]
        false_positive = np.sum(confusion_matrix[:, i]) - confusion_matrix[i][i]
        false_negative = np.sum(confusion_matrix[i, :]) - confusion_matrix[i][i]
        # Accuracy: (TP + TN) / (TP + TN + FP + FN)
        per_class_accuracy = (true_positive + true_negative) \
            / (true_positive + true_negative + false_positive + false_negative)
        # Mis-classification: (FP + FN) / (TP + TN + FP + FN)
        per_class_misclassification = (false_positive + false_negative) \
            / (true_positive + true_negative + false_positive + false_negative)
        # Recall: TP / (TP + FN)
        if true_positive + false_negative == 0:
            per_class_recall = 0.
        else:
            per_class_recall = true_positive / (true_positive + false_negative)
        # False Positive Rate: FP / (FP + TN)
        if false_positive + true_negative == 0:
            per_class_fpr = 0.
        else:
            per_class_fpr = false_positive / (false_positive + true_negative)
        # Specificity: TN / (FP + TN)
        if false_positive + true_negative == 0:
            per_class_specificity = 0.
        else:
            per_class_specificity = true_negative / (false_positive + true_negative)
        # Precision: TP / (TP + FP)
        if true_positive + false_positive == 0:
            per_class_precision = 0.
        else:
            per_class_precision = true_positive / (true_positive + false_positive)
        # Prevalence: (TP + FN) / (TP + TN + FP + FN)
        per_class_prevalence = (true_positive + false_negative) \
            / (true_positive + true_negative + false_positive + false_negative)
        # F-1 Measure: 2 * (precision * recall) / (precision + recall)
        if per_class_precision + per_class_recall == 0:
            per_class_fscore = 0.
        else:
            per_class_fscore = 2 * (per_class_precision * per_class_recall) / (per_class_precision + per_class_recall)
        # G Measure: sqrt(precision * recall)
        per_class_gscore = np.sqrt(per_class_precision * per_class_recall)
        per_class[i][0] = true_positive
        per_class[i][1] = true_negative
        per_class[i][2] = false_positive
        per_class[i][3] = false_negative
        per_class[i][4] = per_class_accuracy
        per_class[i][5] = per_class_misclassification
        per_class[i][6] = per_class_recall
        per_class[i][7] = per_class_fpr
        per_class[i][8] = per_class_specificity
        per_class[i][9] = per_class_precision
        per_class[i][10] = per_class_prevalence
        per_class[i][11] = per_class_fscore
        per_class[i][12] = per_class_gscore

    # Average Accuracy: Sum{i}{Accuracy_i} / num_classes
    overall[0] = np.sum(per_class[:, per_class_performance_index.index('accuracy')]) / num_classes
    # Weighed Accuracy: Sum{i}{Accuracy_i * Prevalence_i}
    overall[1] = np.dot(per_class[:, per_class_performance_index.index('accuracy')],
                        per_class[:, per_class_performance_index.index('prevalence')])
    # Precision (micro): Sum{i}{TP_i} / Sum{i}{TP_i + FP_i}
    overall[2] = np.sum(per_class[:, per_class_performance_index.index('true_positive')]) / \
        np.sum(per_class[:, per_class_performance_index.index('true_positive')] +
               per_class[:, per_class_performance_index.index('false_positive')])
    # Recall (micro): Sum{i}{TP_i} / Sum{i}{TP_i + FN_i}
    overall[3] = np.sum(per_class[:, per_class_performance_index.index('true_positive')]) / \
        np.sum(per_class[:, per_class_performance_index.index('true_positive')] +
               per_class[:, per_class_performance_index.index('false_negative')])
    # F_Score (micro): 2 * Precision_micro * Recall_micro / (Precision_micro + Recall_micro)
    overall[4] = 2 * overall[2] * overall[3] / (overall[2] + overall[3])
    # Precision (macro): Sum{i}{Precision_i} / num_classes
    overall[5] = np.sum(per_class[:, per_class_performance_index.index('precision')]) / num_classes
    # Recall (macro): Sum{i}{Recall_i} / num_classes
    overall[6] = np.sum(per_class[:, per_class_performance_index.index('recall')]) / num_classes
    # F_Score (macro): 2 * Precision_macro * Recall_macro / (Precision_macro + Recall_macro)
    overall[7] = 2 * overall[5] * overall[6] / (overall[5] + overall[6])
    # Exact Matching Ratio: Trace(confusion_matrix) / Total(samples)
    overall[8] = np.trace(confusion_matrix) / np.sum(confusion_matrix)
    return overall, per_class
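

if __name__ == '__main__':
    # Minimal usage sketch (illustrative only; the labels, predictions, and printed fields below are
    # chosen for demonstration and are not part of the module's original behavior).
    example_labels = [0, 0, 1, 1, 2, 2]
    example_predictions = [0, 1, 1, 1, 2, 0]
    example_matrix = get_confusion_matrix(3, example_labels, example_predictions)
    example_overall, example_per_class = get_performance_array(example_matrix)
    print("confusion matrix:\n", example_matrix)
    print("average accuracy:", example_overall[overall_performance_index.index('average accuracy')])
    print("recall per class:", example_per_class[:, per_class_performance_index.index('recall')])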