gcForest.fit()   A
last analyzed

Complexity

Conditions 1

Size

Total Lines 2

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 1
c 1
b 0
f 0
dl 0
loc 2
rs 10
1
#!/usr/bin/env python
2
import itertools
3
import numpy as np
4
from sklearn.ensemble import RandomForestClassifier
5
from sklearn.ensemble import ExtraTreesClassifier
6
from sklearn.model_selection import train_test_split
7
from sklearn.metrics import accuracy_score
8
9
10
class gcForest(object):
    """Cascade of random-forest layers used as a classifier.

    Each layer holds ``n_cascadeRF`` pairs of estimators (one
    ``RandomForestClassifier`` and one ``ExtraTreesClassifier``).  The class
    scores produced by a layer are concatenated with the raw input features
    and fed to the next layer.  Layers are added during :meth:`fit` while the
    accuracy keeps improving.

    Parameters
    ----------
    n_mgsRFtree : int
        Stored on the instance; not read by any method of this class.
    cascade_test_size : float or int
        Stored on the instance; the train/test split it would control is
        currently disabled, so it is not read by any method of this class.
    n_cascadeRF : int
        Number of (random forest, extra trees) pairs per cascade layer.
    n_cascadeRFtree : int
        Stored on the instance; the forests are built with a fixed number of
        trees, so this value is not read by any method of this class.
    cascade_layer : int or float
        Layer budget: a further layer is trained while
        ``n_layer <= cascade_layer`` (and accuracy improves).
    min_samples_cascade : float or int
        Stored on the instance; not read by any method of this class.
    tolerance : float
        A new layer must beat the previous accuracy by more than this amount
        for the cascade to keep growing.  NOTE: a negative tolerance combined
        with an unbounded ``cascade_layer`` may never stop growing.

    Attributes
    ----------
    n_layer : int
        Number of layers currently kept in the cascade (0 before fitting).
    """

    # Kept for backward compatibility: no method of this class reads or
    # updates these class-level attributes.
    max_acc = 0.0
    max_pred_layer = []

    def __init__(self, n_mgsRFtree=30, cascade_test_size=0.2, n_cascadeRF=2,
                 n_cascadeRFtree=101, cascade_layer=np.inf,
                 min_samples_cascade=0.05, tolerance=0.0):
        self.n_layer = 0
        self._n_samples = 0
        self.n_cascadeRF = int(n_cascadeRF)
        self.cascade_test_size = cascade_test_size
        self.n_mgsRFtree = int(n_mgsRFtree)
        self.n_cascadeRFtree = int(n_cascadeRFtree)
        self.cascade_layer = cascade_layer
        self.min_samples_cascade = min_samples_cascade
        self.tolerance = tolerance

    def fit(self, X, y):
        """Train the cascade on ``X`` / ``y`` and return ``self``."""
        self.cascade_forest(X, y)
        return self

    def predict_proba(self, X):
        """Return per-class probabilities for ``X``.

        The scores of the last layer are averaged over its forest pairs and
        each row is then rescaled to sum to 1 (every pair contributes the sum
        of two probability vectors, so the raw rows do not sum to 1).
        """
        forest_scores = self.cascade_forest(X)
        mean_scores = np.mean(forest_scores, axis=0)
        return mean_scores / mean_scores.sum(axis=1, keepdims=True)

    def predict(self, X):
        """Return, for each row of ``X``, the index of the highest-scoring class."""
        pred_proba = self.predict_proba(X=X)
        return np.argmax(pred_proba, axis=1)

    def cascade_forest(self, X, y=None):
        """Train the cascade (``y`` given) or run ``X`` through it (``y`` is None).

        Returns the list of per-forest-pair class scores of the reference
        layer (training) or of the last kept layer (prediction).

        Raises
        ------
        AttributeError
            In prediction mode when no layer has been trained yet.
        """
        if y is not None:
            self.n_layer = 0
            max_layers = self.cascade_layer
            tol = self.tolerance

            # The held-out split is disabled: the same data is used both to
            # train each layer and to measure its accuracy.
            X_train, y_train = X, y
            X_test, y_test = X, y

            # Layer 1 sees the raw features and provides the reference score.
            self.n_layer += 1
            prf_pred_ref = self._cascade_layer(X_train, y_train)
            accuracy_ref = self._cascade_evaluation(X_test, y_test)
            feat_arr = self._create_feat_arr(X_train, prf_pred_ref)

            # Layer 2 sees the raw features plus layer 1's class scores.
            self.n_layer += 1
            prf_pred_layer = self._cascade_layer(feat_arr, y_train)
            accuracy_layer = self._cascade_evaluation(X_test, y_test)

            # Keep stacking layers while the newest one improves on the
            # reference by more than ``tol`` and the layer budget allows it.
            while (accuracy_layer > (accuracy_ref + tol)
                   and self.n_layer <= max_layers):
                accuracy_ref = accuracy_layer
                prf_pred_ref = prf_pred_layer
                feat_arr = self._create_feat_arr(X_train, prf_pred_ref)
                self.n_layer += 1
                prf_pred_layer = self._cascade_layer(feat_arr, y_train)
                accuracy_layer = self._cascade_evaluation(X_test, y_test)

            # Drop the newest layer if it made things worse.  This check must
            # sit outside the loop so it also applies when the loop body never
            # ran (i.e. layer 2 was already worse than layer 1).
            if accuracy_layer < accuracy_ref:
                for irf in range(self.n_cascadeRF):
                    delattr(self, '_casprf{}_{}'.format(self.n_layer, irf))
                    delattr(self, '_cascrf{}_{}'.format(self.n_layer, irf))
                self.n_layer -= 1

            print("layer %d - accuracy %f ref %f" % (self.n_layer, accuracy_layer, accuracy_ref))
        else:
            if self.n_layer < 1:
                raise AttributeError(
                    "gcForest has no trained cascade layer; call fit() first")
            at_layer = 1
            prf_pred_ref = self._cascade_layer(X, layer=at_layer)
            while at_layer < self.n_layer:
                at_layer += 1
                feat_arr = self._create_feat_arr(X, prf_pred_ref)
                prf_pred_ref = self._cascade_layer(feat_arr, layer=at_layer)

        return prf_pred_ref

    def _cascade_layer(self, X, y=None, layer=0):
        """Train layer ``self.n_layer`` (``y`` given) or apply layer ``layer``.

        Returns a list with one array per forest pair; each array is the
        element-wise sum of the random-forest and extra-trees class scores
        (out-of-bag estimates when training, ``predict_proba`` otherwise).
        """
        n_cascadeRF = self.n_cascadeRF
        prf_pred = []

        if y is not None:
            # A fixed max_features=8 is rejected by scikit-learn when X has
            # fewer than 8 columns, so cap it at the available column count.
            max_features = min(8, np.shape(X)[1])
            for irf in range(n_cascadeRF):
                # Build a fresh pair for every slot: reusing one instance
                # would make all stored slots point at the same (last) fit.
                prf = RandomForestClassifier(
                    n_estimators=100, max_features=max_features,
                    bootstrap=True, criterion="entropy", min_samples_split=20,
                    max_depth=None, class_weight='balanced', oob_score=True)
                crf = ExtraTreesClassifier(
                    n_estimators=100, max_depth=None,
                    bootstrap=True, oob_score=True)
                prf.fit(X, y)
                crf.fit(X, y)
                setattr(self, '_casprf{}_{}'.format(self.n_layer, irf), prf)
                setattr(self, '_cascrf{}_{}'.format(self.n_layer, irf), crf)
                # Non-mutating sum: ``+=`` would overwrite the forest's own
                # ``oob_decision_function_`` array.
                prf_pred.append(
                    prf.oob_decision_function_ + crf.oob_decision_function_)
        else:
            for irf in range(n_cascadeRF):
                prf = getattr(self, '_casprf{}_{}'.format(layer, irf))
                crf = getattr(self, '_cascrf{}_{}'.format(layer, irf))
                prf_pred.append(prf.predict_proba(X) + crf.predict_proba(X))

        return prf_pred

    def _cascade_evaluation(self, X_test, y_test):
        """Return the accuracy of the current cascade on ``X_test`` / ``y_test``."""
        casc_pred_prob = np.mean(self.cascade_forest(X_test), axis=0)
        casc_pred = np.argmax(casc_pred_prob, axis=1)
        return accuracy_score(y_true=y_test, y_pred=casc_pred)

    def _create_feat_arr(self, X, prf_pred):
        """Prepend the flattened per-forest class scores to the columns of ``X``.

        ``prf_pred`` is indexed (forest pair, sample, class); the result has
        one row per sample with all scores first, followed by ``X``'s columns.
        """
        swap_pred = np.swapaxes(prf_pred, 0, 1)
        add_feat = swap_pred.reshape([np.shape(X)[0], -1])
        return np.concatenate([add_feat, X], axis=1)
148