"""Reference Document:
Sokolova, M., & Lapalme, G. (2009). A systematic analysis of performance measures for classification tasks.
Information Processing and Management, 45, 427-437.
"""

import logging
import numpy as np

logger = logging.getLogger(__file__)

per_class_performance_index = ['true_positive', 'true_negative', 'false_positive', 'false_negative',
                               'accuracy', 'misclassification', 'recall', 'false positive rate',
                               'specificity', 'precision', 'prevalence', 'f-1 measure', 'g-measure']

overall_performance_index = ['average accuracy', 'weighed accuracy',
                             'precision (micro)', 'recall (micro)', 'f-1 score (micro)',
                             'precision (macro)', 'recall (macro)', 'f-1 score (macro)',
                             'exact matching ratio']
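# The two lists above give the column order of the arrays returned by
# get_performance_array(), so a metric can be looked up by name, e.g.
# per_class[:, per_class_performance_index.index('recall')] selects the per-class recall column.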


def get_confusion_matrix_by_activity(num_classes, label, predicted):
    """Calculate confusion matrix based on activity accuracy

    Instead of calculating the confusion matrix by comparing ground truth and
    predicted results one by one, it checks whether each segment of activity is
    correctly predicted. It also logs the shift of the predicted activity versus
    the labeled activity.

    Not implemented yet: the function currently returns ``None``.
    """
    return


def get_confusion_matrix(num_classes, label, predicted):
    """Calculate confusion matrix based on ground truth and predicted result

    Args:
        num_classes (:obj:`int`): Number of classes
        label (:obj:`list` of :obj:`int`): Ground truth labels
        predicted (:obj:`list` of :obj:`int`): Predicted labels

    Returns:
        :class:`numpy.array`: Confusion matrix (`num_classes` by `num_classes`); rows are indexed
        by the ground truth label and columns by the predicted label
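
    Example:
        A minimal usage sketch (the label and prediction lists are made-up illustrative values)::

            >>> cm = get_confusion_matrix(3, [0, 0, 1, 2, 2], [0, 1, 1, 2, 1])
            >>> cm.shape
            (3, 3)
            >>> int(cm[2][1])  # one sample of class 2 was predicted as class 1
            1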
    """
    matrix = np.zeros((num_classes, num_classes))
    for i in range(len(label)):
        matrix[label[i]][predicted[i]] += 1
    return matrix


def get_performance_array(confusion_matrix):
    r"""Calculate performance arrays based on the given confusion matrix

    [Sokolova2009]_ provides a detailed analysis of multi-class performance metrics.

    Per-class performance metrics (a small worked example follows the list):

    0. **True_Positive**: number of samples that belong to the class and are classified correctly
    1. **True_Negative**: number of samples that are correctly classified as not belonging to the class
    2. **False_Positive**: number of samples that do not belong to the class but are classified as the class
    3. **False_Negative**: number of samples that belong to the class but are not classified as the class
    4. **Accuracy**: Overall, how often is the classifier correct? (TP + TN) / (TP + TN + FP + FN)
    5. **Misclassification**: Overall, how often is it wrong? (FP + FN) / (TP + TN + FP + FN)
    6. **Recall**: When it's actually yes, how often does it predict yes? TP / (TP + FN)
    7. **False Positive Rate**: When it's actually no, how often does it predict yes? FP / (FP + TN)
    8. **Specificity**: When it's actually no, how often does it predict no? TN / (FP + TN)
    9. **Precision**: When it predicts yes, how often is it correct? TP / (TP + FP)
    10. **Prevalence**: How often does the yes condition actually occur in our sample? Total(class) / Total(samples)
    11. **F(1) Measure**: 2 * (precision * recall) / (precision + recall)
    12. **G Measure**: sqrt(precision * recall)

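
    As a worked example (the numbers are made up for illustration), take the 3-class confusion
    matrix ``[[5, 1, 0], [2, 3, 0], [0, 1, 8]]`` with rows as ground truth and columns as
    predictions. For class 0: TP = 5, FP = 2 (column 0 minus the diagonal), FN = 1 (row 0 minus
    the diagonal), TN = 12, and therefore precision = 5/7, recall = 5/6, and prevalence = 6/20.
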
    Overall performance metrics of the classifier:

    0. **Average Accuracy**: The average per-class effectiveness of a classifier
    1. **Weighed Accuracy**: The average effectiveness of a classifier, weighed by the prevalence of each class
    2. **Precision (micro)**: Agreement of the data class labels with those of the classifier, calculated from sums
       of per-text decisions
    3. **Recall (micro)**: Effectiveness of a classifier to identify class labels, calculated from sums of per-text
       decisions
    4. **F-Score (micro)**: Relationship between the data's positive labels and those given by a classifier, based
       on sums of per-text decisions
    5. **Precision (macro)**: An average per-class agreement of the data class labels with those of the classifier
    6. **Recall (macro)**: An average per-class effectiveness of a classifier to identify class labels
    7. **F-Score (macro)**: Relationship between the data's positive labels and those given by a classifier, based
       on a per-class average
    8. **Exact Matching Ratio**: The average per-text exact classification

    .. note::

        In multi-class classification (each input is classified into one and only one class),
        Micro-Precision == Micro-Recall == Micro-FScore == Exact Matching Ratio. This holds because
        every misclassified sample contributes exactly one false positive (to the predicted class)
        and one false negative (to the true class), so the micro-averaged sums coincide and all of
        them reduce to the fraction of exactly matched samples.

    Args:
        confusion_matrix (:class:`numpy.array`): Confusion matrix (`num_classes` by `num_classes`)

    Returns:
        :obj:`tuple` of :class:`numpy.array`: Tuple of the overall performance array and the
        per-class performance array
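
    Example:
        A minimal end-to-end sketch, reusing :func:`get_confusion_matrix` (the labels are made-up
        illustrative values)::

            >>> cm = get_confusion_matrix(3, [0, 0, 1, 2, 2], [0, 0, 1, 2, 1])
            >>> overall, per_class = get_performance_array(cm)
            >>> per_class.shape
            (3, 13)
            >>> float(overall[overall_performance_index.index('exact matching ratio')])
            0.8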
    """
    if confusion_matrix.shape[0] != confusion_matrix.shape[1]:
        logger.error("confusion matrix with shape " + str(confusion_matrix.shape) + " is not square.")
        return None, None

    num_classes = confusion_matrix.shape[0]

    per_class = np.zeros((num_classes, len(per_class_performance_index)), dtype=float)
    overall = np.zeros((len(overall_performance_index),), dtype=float)

    for i in range(num_classes):
        # Rows of the confusion matrix hold the ground truth label and columns the predicted
        # label, so column i sums to the predicted count of class i and row i to its actual count.
        true_positive = confusion_matrix[i][i]
        true_negative = np.sum(confusion_matrix)\
            - np.sum(confusion_matrix[i, :])\
            - np.sum(confusion_matrix[:, i])\
            + confusion_matrix[i][i]
        false_positive = np.sum(confusion_matrix[:, i]) - confusion_matrix[i][i]
        false_negative = np.sum(confusion_matrix[i, :]) - confusion_matrix[i][i]
        # Accuracy: (TP + TN) / (TP + TN + FP + FN)
        per_class_accuracy = (true_positive + true_negative)\
            / (true_positive + true_negative + false_positive + false_negative)
        # Misclassification: (FP + FN) / (TP + TN + FP + FN)
        per_class_misclassification = (false_positive + false_negative)\
            / (true_positive + true_negative + false_positive + false_negative)
        # Recall: TP / (TP + FN)
        if true_positive + false_negative == 0:
            per_class_recall = 0.
        else:
            per_class_recall = true_positive / (true_positive + false_negative)
        # False Positive Rate: FP / (FP + TN)
        if false_positive + true_negative == 0:
            per_class_fpr = 0.
        else:
            per_class_fpr = false_positive / (false_positive + true_negative)
        # Specificity: TN / (FP + TN)
        if false_positive + true_negative == 0:
            per_class_specificity = 0.
        else:
            per_class_specificity = true_negative / (false_positive + true_negative)
        # Precision: TP / (TP + FP)
        if true_positive + false_positive == 0:
            per_class_precision = 0.
        else:
            per_class_precision = true_positive / (true_positive + false_positive)
        # Prevalence: (TP + FN) / (TP + TN + FP + FN)
        per_class_prevalence = (true_positive + false_negative)\
            / (true_positive + true_negative + false_positive + false_negative)
        # F-1 Measure: 2 * (precision * recall) / (precision + recall)
        if per_class_precision + per_class_recall == 0:
            per_class_fscore = 0.
        else:
            per_class_fscore = 2 * (per_class_precision * per_class_recall) / (per_class_precision + per_class_recall)
        # G Measure: sqrt(precision * recall)
        per_class_gscore = np.sqrt(per_class_precision * per_class_recall)
        per_class[i][0] = true_positive
        per_class[i][1] = true_negative
        per_class[i][2] = false_positive
        per_class[i][3] = false_negative
        per_class[i][4] = per_class_accuracy
        per_class[i][5] = per_class_misclassification
        per_class[i][6] = per_class_recall
        per_class[i][7] = per_class_fpr
        per_class[i][8] = per_class_specificity
        per_class[i][9] = per_class_precision
        per_class[i][10] = per_class_prevalence
        per_class[i][11] = per_class_fscore
        per_class[i][12] = per_class_gscore

    # Average Accuracy: Sum{i}{Accuracy_i} / num_class
    overall[0] = np.sum(per_class[:, per_class_performance_index.index('accuracy')]) / num_classes
    # Weighed Accuracy: Sum{i}{Accuracy_i * Prevalence_i} (no division needed: the prevalences sum to 1)
    overall[1] = np.dot(per_class[:, per_class_performance_index.index('accuracy')],
                        per_class[:, per_class_performance_index.index('prevalence')])
    # Precision (micro): Sum{i}{TP_i} / Sum{i}{TP_i + FP_i}
    overall[2] = np.sum(per_class[:, per_class_performance_index.index('true_positive')]) / \
        np.sum(per_class[:, per_class_performance_index.index('true_positive')] +
               per_class[:, per_class_performance_index.index('false_positive')])
    # Recall (micro): Sum{i}{TP_i} / Sum{i}{TP_i + FN_i}
    overall[3] = np.sum(per_class[:, per_class_performance_index.index('true_positive')]) / \
        np.sum(per_class[:, per_class_performance_index.index('true_positive')] +
               per_class[:, per_class_performance_index.index('false_negative')])
    # F_Score (micro): 2 * Precision_micro * Recall_micro / (Precision_micro + Recall_micro)
    overall[4] = 2 * overall[2] * overall[3] / (overall[2] + overall[3])
    # Precision (macro): Sum{i}{Precision_i} / num_class
    overall[5] = np.sum(per_class[:, per_class_performance_index.index('precision')]) / num_classes
    # Recall (macro): Sum{i}{Recall_i} / num_class
    overall[6] = np.sum(per_class[:, per_class_performance_index.index('recall')]) / num_classes
    # F_Score (macro): 2 * Precision_macro * Recall_macro / (Precision_macro + Recall_macro)
    overall[7] = 2 * overall[5] * overall[6] / (overall[5] + overall[6])
    # Exact Matching Ratio: correctly classified samples / total samples
    overall[8] = np.trace(confusion_matrix) / np.sum(confusion_matrix)
    return overall, per_class