Completed — push to master (582254...17fb6a) by Wouter, created 03:58

RobustBiasAwareClassifier.fit() — rating: F

Complexity:     Conditions 9
Size:           Total Lines 93
Duplication:    Lines 0, Ratio 0 %
Code Coverage:  Tests 31, CRAP Score 9.217
Importance:     Changes 3, Bugs 0, Features 0

Metric  Value
cc      9
c       3
b       0
f       0
dl      0
loc     93
ccs     31
cts     36
cp      0.8611
crap    9.217
rs      3.3201
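
For reference, the CRAP score above is consistent with the standard definition (our reconstruction; the report itself does not spell out the formula), taking cc = 9 as the complexity and cp = ccs/cts = 31/36 ≈ 0.8611 as the coverage fraction:

    CRAP = cc^2 * (1 - cp)^3 + cc
         = 81 * (0.1389)^3 + 9
         ≈ 0.217 + 9
         = 9.217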

How to fix: Long Method

Small methods make your code easier to understand, particularly when combined with a good name. Moreover, when a method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, that is usually a good sign to extract the commented part into a new method, and to use the comment as a starting point when coming up with a good name for the new method.

Commonly applied refactorings include:

- Extract Method (see the sketch below)
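
As an illustration only, here is how Extract Method might be applied to the importance-weight post-processing inside RobustBiasAwareClassifier.fit(); the helper name is hypothetical, not part of the library:

    import numpy as np

    def _invert_and_clip_weights(w, clip):
        # Hypothetical helper extracted from fit(): invert the importance
        # weights to obtain p_S(x)/p_T(x), then bound them from above.
        return np.clip(1. / w, 0, clip)

    # Inside fit(), the three commented steps
    #     w = self.iwe_kernel_densities(X, Z)
    #     w = 1./w
    #     w = np.clip(w, 0, self.clip)
    # would then shrink to
    #     w = _invert_and_clip_weights(self.iwe_kernel_densities(X, Z), self.clip)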

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
import scipy.stats as st


class RobustBiasAwareClassifier(object):
    """
    Class of robust bias-aware classifiers.

    Reference: Liu & Ziebart (2014). Robust Classification under Sample
    Selection Bias. NIPS.

    Methods contain training and prediction functions.
    """

    def __init__(self, l2=0.0, order='first', gamma=1.0, tau=1e-5,
                 max_iter=100, clip=1000, verbose=True):
        """
        Set classifier instance parameters.

        INPUT   (1) float 'l2': l2-regularization parameter value (def: 0.0)
                (2) str 'order': order of feature statistics to employ; options
                    are 'first' or 'second' (def: 'first')
                (3) float 'gamma': decaying learning rate (def: 1.0)
                (4) float 'tau': convergence threshold (def: 1e-5)
                (5) int 'max_iter': maximum number of iterations (def: 100)
                (6) int 'clip': upper bound on importance weights (def: 1000)
                (7) boolean 'verbose': report training progress (def: True)
        OUTPUT  None
        """
        self.l2 = l2
        self.order = order
        self.gamma = gamma
        self.tau = tau
        self.max_iter = max_iter
        self.clip = clip

        # Whether model has been trained
        self.is_trained = False

        # Dimensionality of training data
        self.train_data_dim = ''

        # Classifier parameters
        self.theta = 0

        # Verbosity
        self.verbose = verbose

    def feature_stats(self, X, y, order='first'):
        """
        Compute first- or second-order moment feature statistics.

        INPUT   (1) array 'X': dataset (N samples by D features)
                (2) array 'y': label vector (N samples by 1)
                (3) str 'order': order of feature statistics to employ;
                    options are 'first' or 'second' (def: 'first')
        OUTPUT  (1) array: statistics (N samples by 1 + D (+ D^2) + 1)
        """
        # Data shape
        N, D = X.shape

        # Expand label vector
        if len(y.shape) < 2:
            y = np.atleast_2d(y).T

        if (order == 'first'):

            # First-order consists of data times label
            mom = y * X

        elif (order == 'second'):

            # First-order consists of data times label
            yX = y * X

            # Second-order is label times row-wise Kronecker (outer) product
            # of each sample with itself
            yXX = y * np.einsum('ij,ik->ijk', X, X).reshape(N, D**2)

            # Concatenate moments
            mom = np.concatenate((yX, yXX), axis=1)

        # Concatenate label vector, moments, and ones-augmentation
        return np.concatenate((y, mom, np.ones((N, 1))), axis=1)

    def iwe_kernel_densities(self, X, Z):
        """
        Estimate importance weights based on kernel density estimation.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'Z': target data (M samples by D features)
        OUTPUT  (1) array: importance weights (N samples by 1)
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Compute probabilities based on source kernel densities
        pT = st.gaussian_kde(Z.T).pdf(X.T)
        pS = st.gaussian_kde(X.T).pdf(X.T)

        # Check for numerics
        assert not (np.any(np.isnan(pT)) or np.any(pT == 0))
        assert not (np.any(np.isnan(pS)) or np.any(pS == 0))

        # Return the ratio of probabilities as a column vector
        return (pT / pS)[:, np.newaxis]

    def psi(self, X, theta, w, K=2):
        """
        Compute psi function.

        INPUT   (1) array 'X': data set (N samples by D features)
                (2) array 'theta': classifier parameters (D features by 1)
                (3) array 'w': importance weights (N samples by 1)
                (4) int 'K': number of classes (def: 2)
        OUTPUT  (1) array 'psi' (N samples by K classes)
        """
        # Number of samples
        N = X.shape[0]

        # Preallocate psi array
        psi = np.zeros((N, K))

        # Loop over classes
        for k in range(K):

            # Compute feature statistics, matching the order used in fit()
            Xk = self.feature_stats(X, k*np.ones((N, 1)), order=self.order)

            # Compute psi function
            psi[:, k] = (w*np.dot(Xk, theta))[:, 0]

        return psi

    def posterior(self, psi):
        """
        Class-posterior estimation.

        INPUT   (1) array 'psi': weighted data-classifier output (N samples by
                    K classes)
        OUTPUT  (1) array 'pyx': class-posterior estimate (N samples by
                    K classes)
        """
        # Data shape
        N, K = psi.shape

        # Preallocate array
        pyx = np.zeros((N, K))

        # Subtract maximum value for numerical stability
        psi = (psi.T - np.max(psi, axis=1).T).T

        # Loop over classes
        for k in range(K):

            # Estimate posterior p^(Y=y | x_i)
            pyx[:, k] = np.exp(psi[:, k]) / np.sum(np.exp(psi), axis=1)

        return pyx

    def fit(self, X, y, Z):
        """
        Fit/train a robust bias-aware classifier.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'y': source labels (N samples by 1)
                (3) array 'Z': target data (M samples by D features)
        OUTPUT  None
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Number of classes
        labels = np.unique(y)
        self.K = len(labels)

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Dimensionality of expanded feature space
        if (self.order == 'first'):
            D = 1 + DX + 1
        elif (self.order == 'second'):
            D = 1 + DX + DX**2 + 1
        else:
            raise ValueError("Order must be 'first' or 'second'.")

        # Compute moment-matching constraint
        c = np.mean(self.feature_stats(X, y, order=self.order), axis=0)

        # Estimate importance weights
        w = self.iwe_kernel_densities(X, Z)

        # Invert weights to obtain p_S(x)/p_T(x)
        w = 1./w

        # Clip weights if necessary
        w = np.clip(w, 0, self.clip)

        # Initialize classifier parameters
        theta = np.random.randn(1, D)*0.01

        # Start gradient descent
        for t in range(1, self.max_iter+1):

            # Calculate psi function
            psi = self.psi(X, theta.T, w, K=self.K)

            # Compute posterior
            pyx = self.posterior(psi)

            # Sum product of estimated posterior and feature stats
            pfs = 0
            for k in range(self.K):

                # Compute feature statistics for k-th class
                Xk = self.feature_stats(X, k*np.ones((N, 1)), order=self.order)

                # Element-wise product with posterior and sum over classes
                pfs += (pyx[:, k].T * Xk.T).T

            # Gradient computation and regularization
            dL = c - np.mean(pfs, axis=0) + self.l2*2*theta

            # Apply learning rate to gradient
            dT = dL / (t * self.gamma)

            # Update classifier parameters
            theta += dT

            # Report progress
            if self.verbose:
                if (t % (self.max_iter / 10)) == 1:
                    print('Iteration {:03}/{:03} - Norm gradient: {:.12}'
                          .format(t, self.max_iter, np.linalg.norm(dL)))

            # Check for convergence
            if (np.linalg.norm(dL) <= self.tau):
                if self.verbose:
                    print('Broke at {}'.format(t))
                break

        # Store resultant classifier parameters
        self.theta = theta

        # Store classes
        self.classes = labels

        # Mark classifier as trained
        self.is_trained = True

        # Store training data dimensionality
        self.train_data_dim = DX

    def predict(self, Z_):
        """
        Make predictions on new dataset.

        INPUT   (1) array 'Z_': new data set (M samples by D features)
        OUTPUT  (1) array 'preds': label predictions (M samples by 1)
        """
        # Data shape
        M, D = Z_.shape

        # If classifier is trained, check for same dimensionality
        if self.is_trained:
            assert self.train_data_dim == D
        else:
            raise ValueError('Classifier is not trained yet.')

        # Calculate psi function for target samples
        psi = self.psi(Z_, self.theta.T, np.ones((M, 1)), K=self.K)

        # Compute posteriors for target samples
        pyz = self.posterior(psi)

        # Predictions through max-posteriors
        preds = np.argmax(pyz, axis=1)

        # Map predictions back to original labels
        preds = self.classes[preds]

        # Return predictions array
        return preds

    def get_params(self):
        """Get classifier parameters."""
        # There is no underlying sklearn estimator ('self.clf' is never
        # defined), so return this instance's own hyperparameters.
        return {'l2': self.l2, 'order': self.order, 'gamma': self.gamma,
                'tau': self.tau, 'max_iter': self.max_iter,
                'clip': self.clip, 'verbose': self.verbose}

    def get_is_trained(self):
        """Check whether classifier is trained."""
        # Named get_is_trained to avoid being shadowed by the boolean
        # attribute self.is_trained set in __init__.
        return self.is_trained
305