"""Reference Document:

Sokolova, M., & Lapalme, G. (2009). A systematic analysis of performance measures
for classification tasks. Information Processing and Management, 45, 427-437.
"""

import logging

import numpy as np

logger = logging.getLogger(__file__)

per_class_performance_index = ['true_positive', 'true_negative', 'false_positive', 'false_negative',
                               'accuracy', 'misclassification', 'recall', 'false positive rate',
                               'specificity', 'precision', 'prevalence', 'f-1 measure', 'g-measure']

overall_performance_index = ['average accuracy', 'weighed accuracy',
                             'precision (micro)', 'recall (micro)', 'f-1 score (micro)',
                             'precision (macro)', 'recall (macro)', 'f-1 score (macro)',
                             'exact matching ratio']


def get_confusion_matrix_by_activity(num_classes, label, predicted):
    """Calculate confusion matrix based on activity accuracy

    Instead of building the confusion matrix by comparing ground truth and
    predicted labels one by one, it checks whether each segment of activity is
    predicted correctly. It also logs the shift between the predicted and
    labeled activity boundaries.
    """
    return


def get_confusion_matrix(num_classes, label, predicted):
    """Calculate confusion matrix based on ground truth and predicted result

    Args:
        num_classes (:obj:`int`): Number of classes
        label (:obj:`list` of :obj:`int`): ground truth labels
        predicted (:obj:`list` of :obj:`int`): predicted labels

    Returns:
        :class:`numpy.array`: Confusion matrix (`num_classes` by `num_classes`)
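
    Example:
        A minimal illustrative call (the labels and predictions below are made up;
        the printed layout may differ slightly across NumPy versions)::

            >>> cm = get_confusion_matrix(3, [0, 1, 2, 2], [0, 2, 2, 2])
            >>> print(cm)
            [[1. 0. 0.]
             [0. 0. 1.]
             [0. 0. 2.]]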
    """
    matrix = np.zeros((num_classes, num_classes))
    for i in range(len(label)):
        matrix[label[i]][predicted[i]] += 1
    return matrix


def get_performance_array(confusion_matrix):
    r"""Calculate per-class and overall performance metrics based on the given confusion matrix

    [Sokolova2009]_ provides a detailed analysis of multi-class performance metrics.

    Per-class performance metrics:

    0. **True_Positive**: number of samples that belong to the class and are classified correctly
    1. **True_Negative**: number of samples that are correctly classified as not belonging to the class
    2. **False_Positive**: number of samples that do not belong to the class but are classified as the class
    3. **False_Negative**: number of samples that belong to the class but are not classified as the class
    4. **Accuracy**: Overall, how often is the classifier correct? (TP + TN) / (TP + TN + FP + FN)
    5. **Misclassification**: Overall, how often is it wrong? (FP + FN) / (TP + TN + FP + FN)
    6. **Recall**: When it's actually yes, how often does it predict yes? TP / (TP + FN)
    7. **False Positive Rate**: When it's actually no, how often does it predict yes? FP / (FP + TN)
    8. **Specificity**: When it's actually no, how often does it predict no? TN / (FP + TN)
    9. **Precision**: When it predicts yes, how often is it correct? TP / (TP + FP)
    10. **Prevalence**: How often does the yes condition actually occur in our sample? Total(class) / Total(samples)
    11. **F(1) Measure**: 2 * (precision * recall) / (precision + recall)
    12. **G Measure**: sqrt(precision * recall)
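
    For illustration, with made-up per-class counts TP=1, TN=2, FP=0, FN=1:
    accuracy = 3/4 = 0.75, misclassification = 0.25, recall = 0.5,
    false positive rate = 0.0, specificity = 1.0, precision = 1.0,
    prevalence = 0.5, F-1 measure = 2/3 and G measure = sqrt(0.5) ≈ 0.707.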

    Overall performance metrics for the classifier:

    0. **Average Accuracy**: The average per-class effectiveness of a classifier
    1. **Weighed Accuracy**: The average effectiveness of a classifier, weighted by the prevalence of each class
    2. **Precision (micro)**: Agreement of the data class labels with those of the classifier, calculated from sums of
       per-text decisions
    3. **Recall (micro)**: Effectiveness of a classifier to identify class labels, calculated from sums of per-text
       decisions
    4. **F-Score (micro)**: Relationship between the data's positive labels and those given by the classifier, based on
       sums of per-text decisions
    5. **Precision (macro)**: An average per-class agreement of the data class labels with those of the classifier
    6. **Recall (macro)**: An average per-class effectiveness of a classifier to identify class labels
    7. **F-Score (macro)**: Relationship between the data's positive labels and those given by the classifier, based on
       a per-class average
    8. **Exact Matching Ratio**: The average per-text exact classification

    .. note::

        In multi-class classification, where each input is assigned to one and only one class,
        Micro-Precision == Micro-Recall == Micro-FScore == Exact Matching Ratio.

    Args:
        confusion_matrix (:class:`numpy.array`): Confusion matrix (`num_classes` by `num_classes`)

    Returns:
        :obj:`tuple` of :class:`numpy.array`: tuple of overall performance and per-class performance
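
    Example:
        A minimal illustrative sketch (the labels and predictions are made up; ``float()``
        is used only to keep the printed value independent of the NumPy version)::

            >>> cm = get_confusion_matrix(2, [0, 0, 1, 1], [0, 1, 1, 1])
            >>> overall, per_class = get_performance_array(cm)
            >>> float(overall[overall_performance_index.index('average accuracy')])
            0.75
            >>> float(per_class[1][per_class_performance_index.index('precision')])
            0.6666666666666666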
    """
    if confusion_matrix.shape[0] != confusion_matrix.shape[1]:
        logger.error("confusion matrix with shape " + str(confusion_matrix.shape) + " is not square.")
        return None, None

    num_classes = confusion_matrix.shape[0]

    per_class = np.zeros((num_classes, len(per_class_performance_index)), dtype=float)
    overall = np.zeros((len(overall_performance_index),), dtype=float)

    for i in range(num_classes):
        true_positive = confusion_matrix[i][i]
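        # Row i of the confusion matrix holds the samples whose ground truth is class i,
        # and column i holds the samples predicted as class i, so:
        #   TN = everything outside row i and column i,
        #   FP = column i total minus the diagonal cell,
        #   FN = row i total minus the diagonal cell.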
        true_negative = np.sum(confusion_matrix)\
            - np.sum(confusion_matrix[i, :])\
            - np.sum(confusion_matrix[:, i])\
            + confusion_matrix[i][i]
        false_positive = np.sum(confusion_matrix[:, i]) - confusion_matrix[i][i]
        false_negative = np.sum(confusion_matrix[i, :]) - confusion_matrix[i][i]
        # Accuracy: (TP + TN) / (TP + TN + FP + FN)
        per_class_accuracy = (true_positive + true_negative)\
            / (true_positive + true_negative + false_positive + false_negative)
        # Mis-classification: (FP + FN) / (TP + TN + FP + FN)
        per_class_misclassification = (false_positive + false_negative)\
            / (true_positive + true_negative + false_positive + false_negative)
        # Recall: TP / (TP + FN)
        if true_positive + false_negative == 0:
            per_class_recall = 0.
        else:
            per_class_recall = true_positive / (true_positive + false_negative)
        # False Positive Rate: FP / (FP + TN)
        if false_positive + true_negative == 0:
            per_class_fpr = 0.
        else:
            per_class_fpr = false_positive / (false_positive + true_negative)
        # Specificity: TN / (FP + TN)
        if false_positive + true_negative == 0:
            per_class_specificity = 0.
        else:
            per_class_specificity = true_negative / (false_positive + true_negative)
        # Precision: TP / (TP + FP)
        if true_positive + false_positive == 0:
            per_class_precision = 0.
        else:
            per_class_precision = true_positive / (true_positive + false_positive)
        # Prevalence: (TP + FN) / (TP + TN + FP + FN)
        per_class_prevalence = (true_positive + false_negative)\
            / (true_positive + true_negative + false_positive + false_negative)
        # F-1 Measure: 2 * (precision * recall) / (precision + recall)
        if per_class_precision + per_class_recall == 0:
            per_class_fscore = 0.
        else:
            per_class_fscore = 2 * (per_class_precision * per_class_recall) / (per_class_precision + per_class_recall)
        # G Measure: sqrt(precision * recall)
        per_class_gscore = np.sqrt(per_class_precision * per_class_recall)
        per_class[i][0] = true_positive
        per_class[i][1] = true_negative
        per_class[i][2] = false_positive
        per_class[i][3] = false_negative
        per_class[i][4] = per_class_accuracy
        per_class[i][5] = per_class_misclassification
        per_class[i][6] = per_class_recall
        per_class[i][7] = per_class_fpr
        per_class[i][8] = per_class_specificity
        per_class[i][9] = per_class_precision
        per_class[i][10] = per_class_prevalence
        per_class[i][11] = per_class_fscore
        per_class[i][12] = per_class_gscore

    # Average Accuracy: Sum{i}{Accuracy{i}} / num_class
    overall[0] = np.sum(per_class[:, per_class_performance_index.index('accuracy')]) / num_classes
    # Weighed Accuracy: Sum{i}{Accuracy{i} * Prevalence{i}} (the prevalence values sum to 1, so no further division is needed)
    overall[1] = np.dot(per_class[:, per_class_performance_index.index('accuracy')],
                        per_class[:, per_class_performance_index.index('prevalence')])
    # Precision (micro): Sum{i}{TP_i} / Sum{i}{TP_i + FP_i}
    overall[2] = np.sum(per_class[:, per_class_performance_index.index('true_positive')]) / \
        np.sum(per_class[:, per_class_performance_index.index('true_positive')] +
               per_class[:, per_class_performance_index.index('false_positive')])
    # Recall (micro): Sum{i}{TP_i} / Sum{i}{TP_i + FN_i}
    overall[3] = np.sum(per_class[:, per_class_performance_index.index('true_positive')]) / \
        np.sum(per_class[:, per_class_performance_index.index('true_positive')] +
               per_class[:, per_class_performance_index.index('false_negative')])
    # F_Score (micro): 2 * Precision_micro * Recall_micro / (Precision_micro + Recall_micro)
    overall[4] = 2 * overall[2] * overall[3] / (overall[2] + overall[3])
    # Precision (macro): Sum{i}{Precision_i} / num_class
    overall[5] = np.sum(per_class[:, per_class_performance_index.index('precision')]) / num_classes
    # Recall (macro): Sum{i}{Recall_i} / num_class
    overall[6] = np.sum(per_class[:, per_class_performance_index.index('recall')]) / num_classes
    # F_Score (macro): 2 * Precision_macro * Recall_macro / (Precision_macro + Recall_macro)
    overall[7] = 2 * overall[5] * overall[6] / (overall[5] + overall[6])
    # Exact Matching Ratio: Trace(confusion_matrix) / Total number of samples
    overall[8] = np.trace(confusion_matrix) / np.sum(confusion_matrix)
    return overall, per_class
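

# Illustrative usage only: the labels below are made up. It prints the overall metrics
# and shows the note in get_performance_array in action, i.e. that micro-precision,
# micro-recall, micro-F-score and the exact matching ratio coincide for single-label
# multi-class classification.
if __name__ == '__main__':
    _labels = [0, 0, 1, 1, 2, 2]
    _predicted = [0, 1, 1, 1, 2, 0]
    _cm = get_confusion_matrix(3, _labels, _predicted)
    _overall, _per_class = get_performance_array(_cm)
    for _name, _value in zip(overall_performance_index, _overall):
        print('%s: %.4f' % (_name, _value))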