Completed: Push to master (f3d068...88fa67) by Wouter, created 03:44

RobustBiasAwareClassifier.fit() (rated F)

Complexity
  Conditions: 9

Size
  Total Lines: 102

Duplication
  Lines: 0
  Ratio: 0 %

Code Coverage
  Tests: 31
  CRAP Score: 9.217

Importance
  Changes: 3
  Bugs: 0
  Features: 0

Metric   Value
cc       9
c        3
b        0
f        0
dl       0
loc      102
ccs      31
cts      36
cp       0.8611
crap     9.217
rs       3.1304
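
For reference, the CRAP Score reported above is consistent with the complexity (cc) and coverage (cp) values in this table, assuming the standard C.R.A.P. formula CRAP(m) = comp(m)^2 * (1 - cov(m))^3 + comp(m):

# Worked check of the reported CRAP score (standard formula assumed)
cc, cp = 9, 0.8611            # conditions and coverage ratio from the table
crap = cc**2 * (1 - cp)**3 + cc
print(round(crap, 3))         # prints 9.217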

How to fix: Long Method

Small methods make your code easier to understand, especially when combined with a good name. Moreover, when a method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, that is usually a sign that the commented part should be extracted into a new method, with the comment serving as a starting point for its name.

Commonly applied refactorings include:

- Extract Method (see the sketch below)
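
As a minimal illustration (hypothetical code, not taken from this repository): the fit() method below estimates and clips importance weights under a comment, and Extract Method turns that commented block into a named helper.

# Before: a commented block inside the long fit() method.
def fit(self, X, y, Z):
    ...
    # Estimate importance weights and clip them
    w = self.iwe_kernel_densities(X, Z)
    w = np.clip(1. / w, 0, self.clip)
    ...

# After: the block becomes a helper whose name comes from the comment.
def fit(self, X, y, Z):
    ...
    w = self._clipped_importance_weights(X, Z)  # hypothetical helper name
    ...

def _clipped_importance_weights(self, X, Z):
    """Estimate clipped inverse importance weights."""
    w = self.iwe_kernel_densities(X, Z)
    return np.clip(1. / w, 0, self.clip)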

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
import scipy.stats as st
from scipy.sparse.linalg import eigs
from scipy.spatial.distance import cdist
import sklearn as sk
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_predict
from os.path import basename

from .util import is_pos_def


class RobustBiasAwareClassifier(object):
    """
    Class of robust bias-aware classifiers.

    Reference: Liu & Ziebart (2014). Robust Classification under Sample
    Selection Bias. NIPS.

    Methods contain training and prediction functions.
    """
    def __init__(self, l2=0.0, order='first', gamma=1.0, tau=1e-5,
                 max_iter=100, clip=1000, verbose=True):
        """
        Set classifier instance parameters.

        Parameters
        ----------
        l2 : float
            l2-regularization parameter value (def: 0.0)
        order : str
            order of feature statistics to employ; options are 'first' or
            'second' (def: 'first')
        gamma : float
            decaying learning rate (def: 1.0)
        tau : float
            convergence threshold (def: 1e-5)
        max_iter : int
            maximum number of iterations (def: 100)
        clip : float
            upper bound on importance weights (def: 1000)
        verbose : bool
            report training progress (def: True)

        Returns
        -------
        None

        """
        self.l2 = l2
        self.order = order
        self.gamma = gamma
        self.tau = tau
        self.max_iter = max_iter
        self.clip = clip

        # Whether model has been trained; the trailing underscore avoids
        # shadowing the is_trained() method below
        self.is_trained_ = False

        # Dimensionality of training data
        self.train_data_dim = None

        # Classifier parameters
        self.theta = 0

        # Verbosity
        self.verbose = verbose
    def feature_stats(self, X, y, order='first'):
        """
        Compute moment feature statistics.

        Parameters
        ----------
        X : array
            dataset (N samples by D features)
        y : array
            label vector (N samples by 1)
        order : str
            order of moments to compute; 'first' or 'second' (def: 'first')

        Returns
        -------
        array
            array containing label vector, feature moments and 1-augmentation.

        """
        # Data shape
        N, D = X.shape

        # Expand label vector
        if len(y.shape) < 2:
            y = np.atleast_2d(y).T

        if (order == 'first'):

            # First-order moment consists of data times label
            mom = y * X

        elif (order == 'second'):

            # First-order moment consists of data times label
            yX = y * X

            # Second-order moment is label times row-wise Kronecker product
            # of the data with itself
            yXX = y * np.einsum('ij,ik->ijk', X, X).reshape(N, -1)

            # Concatenate moments
            mom = np.concatenate((yX, yXX), axis=1)

        else:
            raise ValueError('Order of moments not recognized.')

        # Concatenate label vector, moments, and ones-augmentation
        return np.concatenate((y, mom, np.ones((N, 1))), axis=1)
    def iwe_kernel_densities(self, X, Z):
        """
        Estimate importance weights based on kernel density estimation.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        array
            importance weights (N samples by 1)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Compute probabilities based on target and source kernel densities
        pT = st.gaussian_kde(Z.T).pdf(X.T)
        pS = st.gaussian_kde(X.T).pdf(X.T)

        # Check for numerical problems: no NaNs and no zero densities
        assert not np.any(np.isnan(pT)) and not np.any(pT == 0)
        assert not np.any(np.isnan(pS)) and not np.any(pS == 0)

        # Take the ratio of probabilities, as a column vector per docstring
        return (pT / pS).reshape(-1, 1)
    def psi(self, X, theta, w, K=2):
        """
        Compute psi function.

        Parameters
        ----------
        X : array
            data set (N samples by D features)
        theta : array
            classifier parameters (D features by 1)
        w : array
            importance weights (N samples by 1)
        K : int
            number of classes (def: 2)

        Returns
        -------
        psi : array
            array with psi function values (N samples by K classes)

        """
        # Number of samples
        N = X.shape[0]

        # Preallocate psi array
        psi = np.zeros((N, K))

        # Loop over classes
        for k in range(K):

            # Compute feature statistics for hypothetical labeling k
            Xk = self.feature_stats(X, k*np.ones((N, 1)))

            # Compute importance-weighted psi function values
            psi[:, k] = (w*np.dot(Xk, theta))[:, 0]

        return psi
    def posterior(self, psi):
        """
        Class-posterior estimation.

        Parameters
        ----------
        psi : array
            weighted data-classifier output (N samples by K classes)

        Returns
        -------
        pyx : array
            class-posterior estimation (N samples by K classes)

        """
        # Data shape
        N, K = psi.shape

        # Preallocate array
        pyx = np.zeros((N, K))

        # Subtract maximum value for numerical stability
        psi = (psi.T - np.max(psi, axis=1).T).T

        # Loop over classes
        for k in range(K):

            # Estimate posterior p^(Y=y | x_i)
            pyx[:, k] = np.exp(psi[:, k]) / np.sum(np.exp(psi), axis=1)

        return pyx
    def fit(self, X, y, Z):
        """
        Fit/train a robust bias-aware classifier.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        y : array
            source labels (N samples by 1)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        None

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Number of classes
        labels = np.unique(y)
        self.K = len(labels)

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Dimensionality of expanded feature space
        if (self.order == 'first'):
            D = 1 + DX + 1
        elif (self.order == 'second'):
            D = 1 + DX + DX**2 + 1
        else:
            raise ValueError('Order of moments not recognized.')

        # Compute moment-matching constraint
        c = np.mean(self.feature_stats(X, y, order=self.order), axis=0)

        # Estimate importance weights
        w = self.iwe_kernel_densities(X, Z)

        # Invert weights to achieve p_S(x)/p_T(x)
        w = 1./w

        # Clip weights if necessary
        w = np.clip(w, 0, self.clip)

        # Initialize classifier parameters
        theta = np.random.randn(1, D)*0.01

        # Start gradient descent
        for t in range(1, self.max_iter+1):

            # Calculate psi function
            psi = self.psi(X, theta.T, w, K=self.K)

            # Compute posterior
            pyx = self.posterior(psi)

            # Sum product of estimated posterior and feature statistics
            pfs = 0
            for k in range(self.K):

                # Compute feature statistics for k-th class
                Xk = self.feature_stats(X, k*np.ones((N, 1)))

                # Element-wise product with posterior and sum over classes
                pfs += (pyx[:, k].T * Xk.T).T

            # Gradient computation and regularization
            dL = c - np.mean(pfs, axis=0) + self.l2*2*theta

            # Apply decaying learning rate to gradient
            dT = dL / (t * self.gamma)

            # Update classifier parameters
            theta += dT

            # Report progress
            if self.verbose:
                if (t % (self.max_iter / 10)) == 1:
                    print('Iteration {:03}/{:03} - Norm gradient: {:.12}'
                          .format(t, self.max_iter, np.linalg.norm(dL)))

            # Check for convergence
            if (np.linalg.norm(dL) <= self.tau):
                if self.verbose:
                    print('Converged at iteration {}'.format(t))
                break

        # Store resulting classifier parameters
        self.theta = theta

        # Store classes
        self.classes = labels

        # Mark classifier as trained
        self.is_trained_ = True

        # Store training data dimensionality
        self.train_data_dim = DX
    def predict(self, Z):
        """
        Make predictions on new dataset.

        Parameters
        ----------
        Z : array
            new data set (M samples by D features)

        Returns
        -------
        preds : array
            label predictions (M samples by 1)

        """
        # Data shape
        M, D = Z.shape

        # If classifier is trained, check for same dimensionality
        if self.is_trained_:
            if not self.train_data_dim == D:
                raise ValueError('''Test data is of different dimensionality
                                 than training data.''')

        # Calculate psi function for target samples
        psi = self.psi(Z, self.theta.T, np.ones((M, 1)), K=self.K)

        # Compute posteriors for target samples
        pyz = self.posterior(psi)

        # Predictions through maximum posterior probability
        preds = np.argmax(pyz, axis=1)

        # Map predictions back to original labels
        return self.classes[preds]
    def get_params(self):
        """Get classifier parameters."""
        return {'l2': self.l2, 'order': self.order, 'gamma': self.gamma,
                'tau': self.tau, 'max_iter': self.max_iter,
                'clip': self.clip, 'verbose': self.verbose}

    def is_trained(self):
        """Check whether classifier is trained."""
        return self.is_trained_
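
Since this report flags fit() specifically, a minimal usage sketch may help when refactoring it; the data and parameter values below are illustrative, and the class is assumed to be importable from its module.

# Hypothetical usage sketch on synthetic, covariate-shifted data
import numpy as np

np.random.seed(1)
X = np.random.randn(50, 2)            # source samples
y = (X[:, 0] > 0).astype(int)         # binary source labels
Z = np.random.randn(20, 2) + 0.5      # shifted target samples

clf = RobustBiasAwareClassifier(l2=0.01, max_iter=50, verbose=False)
clf.fit(X, y, Z)                      # train on source, adapting to target
preds = clf.predict(Z)                # predictions in the original label set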