Completed
Push — master ( f3d068...88fa67 )
by Wouter
03:44
created

RobustBiasAwareClassifier.predict()   B

Complexity

Conditions 3

Size

Total Lines 35

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 8
CRAP Score 3.0123

Importance

Changes 2
Bugs 0 Features 0
Metric Value
cc 3
c 2
b 0
f 0
dl 0
loc 35
ccs 8
cts 9
cp 0.8889
crap 3.0123
rs 8.8571
1
#!/usr/bin/env python
2
# -*- coding: utf-8 -*-
3
4 1
import numpy as np
5 1
import scipy.stats as st
6 1
from scipy.sparse.linalg import eigs
7 1
from scipy.spatial.distance import cdist
8 1
import sklearn as sk
9 1
from sklearn.svm import LinearSVC
10 1
from sklearn.linear_model import LogisticRegression, LinearRegression
11 1
from sklearn.model_selection import cross_val_predict
12 1
from os.path import basename
13
14 1
from .util import is_pos_def
15
16
17 1
class RobustBiasAwareClassifier(object):
18
    """
19
    Class of robust bias-aware classifiers.
20
21
    Reference: Liu & Ziebart (20140. Robust Classification under Sample
22
    Selection Bias. NIPS.
23
24
    Methods contain training and prediction functions.
25
    """
26
27 1
    def __init__(self, l2=0.0, order='first', gamma=1.0, tau=1e-5,
28
                 max_iter=100, clip=1000, verbose=True):
29
        """
30
        Set classifier instance parameters.
31
32
        Parameters
33
        ----------
34
        l2 : float
35
            l2-regularization parameter value (def:0.01)
36
        order : str
37
            order of feature statistics to employ; options are 'first', or
38
            'second' (def: 'first')
39
        gamma : float
40
            decaying learning rate (def: 1.0)
41
        tau : float
42
            convergence threshold (def: 1e-5)
43
        max_iter : int
44
            maximum number of iterations (def: 100)
45
        clip : float
46
            upper bound on importance weights (def: 1000.)
47
        verbose : bool
48
            report training progress (def: True)
49
50
        Returns
51
        -------
52
        None
53
54
        """
55 1
        self.l2 = l2
56 1
        self.order = order
57 1
        self.gamma = gamma
58 1
        self.tau = tau
59 1
        self.max_iter = max_iter
60 1
        self.clip = clip
61
62
        # Whether model has been trained
63 1
        self.is_trained = False
64
65
        # Dimensionality of training data
66 1
        self.train_data_dim = ''
67
68
        # Classifier parameters
69 1
        self.theta = 0
70
71
        # Verbosity
72 1
        self.verbose = verbose
73
74 1
    def feature_stats(self, X, y, order='first'):
75
        """
76
        Compute first-order moment feature statistics.
77
78
        Parameters
79
        ----------
80
        X : array
81
            dataset (N samples by D features)
82
        y : array
83
            label vector (N samples by 1)
84
85
        Returns
86
        -------
87
        array
88
            array containing label vector, feature moments and 1-augmentation.
89
90
        """
91
        # Data shape
92 1
        N, D = X.shape
93
94
        # Expand label vector
95 1
        if len(y.shape) < 2:
96 1
            y = np.atleast_2d(y).T
97
98 1
        if (order == 'first'):
99
100
            # First-order consists of data times label
101 1
            mom = y * X
102
103
        elif (order == 'second'):
104
105
            # First-order consists of data times label
106
            yX = y * X
107
108
            # Second-order is label times Kronecker delta product of data
109
            yXX = y*np.kron(X, X)
110
111
            # Concatenate moments
112
            mom = np.concatenate((yX, yXX), axis=1)
113
114
        # Concatenate label vector, moments, and ones-augmentation
115 1
        return np.concatenate((y, mom, np.ones((N, 1))), axis=1)
116
117 1
    def iwe_kernel_densities(self, X, Z):
118
        """
119
        Estimate importance weights based on kernel density estimation.
120
121
        Parameters
122
        ----------
123
            X : array
124
                source data (N samples by D features)
125
            Z : array
126
                target data (M samples by D features)
127
128
        Returns
129
        -------
130
        array
131
            importance weights (N samples by 1)
132
133
        """
134
        # Data shapes
135 1
        N, DX = X.shape
136 1
        M, DZ = Z.shape
137
138
        # Assert equivalent dimensionalities
139 1
        assert DX == DZ
140
141
        # Compute probabilities based on source kernel densities
142 1
        pT = st.gaussian_kde(Z.T).pdf(X.T)
143 1
        pS = st.gaussian_kde(X.T).pdf(X.T)
144
145
        # Check for numerics
146 1
        assert not np.any(np.isnan(pT)) or np.any(pT == 0)
147 1
        assert not np.any(np.isnan(pS)) or np.any(pS == 0)
148
149
        # Take the ratio of probabilities
150 1
        return pT / pS
151
152 1
    def psi(self, X, theta, w, K=2):
153
        """
154
        Compute psi function.
155
156
        Parameters
157
        ----------
158
        X : array
159
            data set (N samples by D features)
160
        theta : array
161
            classifier parameters (D features by 1)
162
        w : array
163
            importance-weights (N samples by 1)
164
        K : int
165
            number of classes (def: 2)
166
167
        Returns
168
        -------
169
        psi : array
170
            array with psi function values (N samples by K classes)
171
172
        """
173
        # Number of samples
174 1
        N = X.shape[0]
175
176
        # Preallocate psi array
177 1
        psi = np.zeros((N, K))
178
179
        # Loop over classes
180 1
        for k in range(K):
181
            # Compute feature statistics
182 1
            Xk = self.feature_stats(X, k*np.ones((N, 1)))
183
184
            # Compute psi function
185 1
            psi[:, k] = (w*np.dot(Xk, theta))[:, 0]
186
187 1
        return psi
188
189 1
    def posterior(self, psi):
190
        """
191
        Class-posterior estimation.
192
193
        Parameters
194
        ----------
195
        psi : array
196
            weighted data-classifier output (N samples by K classes)
197
198
        Returns
199
        -------
200
        pyx : array
201
            class-posterior estimation (N samples by K classes)
202
203
        """
204
        # Data shape
205 1
        N, K = psi.shape
206
207
        # Preallocate array
208 1
        pyx = np.zeros((N, K))
209
210
        # Subtract maximum value for numerical stability
211 1
        psi = (psi.T - np.max(psi, axis=1).T).T
212
213
        # Loop over classes
214 1
        for k in range(K):
215
216
            # Estimate posterior p^(Y=y | x_i)
217 1
            pyx[:, k] = np.exp(psi[:, k]) / np.sum(np.exp(psi), axis=1)
218
219 1
        return pyx
220
221 1
    def fit(self, X, y, Z):
222
        """
223
        Fit/train a robust bias-aware classifier.
224
225
        Parameters
226
        ----------
227
        X : array
228
            source data (N samples by D features)
229
        y : array
230
            source labels (N samples by 1)
231
        Z : array
232
            target data (M samples by D features)
233
234
        Returns
235
        -------
236
        None
237
238
        """
239
        # Data shapes
240 1
        N, DX = X.shape
241 1
        M, DZ = Z.shape
242
243
        # Number of classes
244 1
        labels = np.unique(y)
245 1
        self.K = len(labels)
246
247
        # Assert equivalent dimensionalities
248 1
        assert DX == DZ
249
250
        # Dimenionsality of expanded feature space
251 1
        if (self.order == 'first'):
252 1
            D = 1 + DX + 1
253
        elif (self.order == 'second'):
254
            D = 1 + DX + DX**2 + 1
255
        else:
256
            raise ValueError
257
258
        # Compute moment-matching constraint
259 1
        c = np.mean(self.feature_stats(X, y, order=self.order), axis=0)
260
261
        # Estimate importance-weights
262 1
        w = self.iwe_kernel_densities(X, Z)
263
264
        # Inverse weights to achieve p_S(x)/p_T(x)
265 1
        w = 1./w
266
267
        # Clip weights if necessary
268 1
        w = np.clip(w, 0, self.clip)
269
270
        # Initialize classifier parameters
271 1
        theta = np.random.randn(1, D)*0.01
272
273
        # Start gradient descent
274 1
        for t in range(1, self.max_iter+1):
275
276
            # Calculate psi function
277 1
            psi = self.psi(X, theta.T, w, K=self.K)
278
279
            # Compute posterior
280 1
            pyx = self.posterior(psi)
281
282
            # Sum product of estimated posterior and feature stats
283 1
            pfs = 0
284 1
            for k in range(self.K):
285
286
                # Compute feature statistics for k-th class
287 1
                Xk = self.feature_stats(X, k*np.ones((N, 1)))
288
289
                # Element-wise product with posterior and sum over classes
290 1
                pfs += (pyx[:, k].T * Xk.T).T
291
292
            # Gradient computation and regularization
293 1
            dL = c - np.mean(pfs, axis=0) + self.l2*2*theta
294
295
            # Apply learning rate to gradient
296 1
            dT = dL / (t * self.gamma)
297
298
            # Update classifier parameters
299 1
            theta += dT
300
301
            # Report progress
302 1
            if self.verbose:
303 1
                if (t % (self.max_iter / 10)) == 1:
304 1
                    print('Iteration {:03}/{:03} - Norm gradient: {:.12}'
305
                          .format(t, self.max_iter, np.linalg.norm(dL)))
306
307
            # Check for convergence
308 1
            if (np.linalg.norm(dL) <= self.tau):
309
                print('Broke at {}'.format(t))
310
                break
311
312
        # Store resultant classifier parameters
313 1
        self.theta = theta
314
315
        # Store classes
316 1
        self.classes = labels
317
318
        # Mark classifier as trained
319 1
        self.is_trained = True
320
321
        # Store training data dimensionality
322 1
        self.train_data_dim = DX
323
324 1
    def predict(self, Z):
325
        """
326
        Make predictions on new dataset.
327
328
        Parameters
329
        ----------
330
        Z : array
331
            new data set (M samples by D features)
332
333
        Returns
334
        -------
335
        preds : array
336
            label predictions (M samples by 1)
337
338
        """
339
        # Data shape
340 1
        M, D = Z.shape
341
342
        # If classifier is trained, check for same dimensionality
343 1
        if self.is_trained:
344 1
            if not self.train_data_dim == D:
345
                raise ValueError('''Test data is of different dimensionality
346
                                 than training data.''')
347
348
        # Calculate psi function for target samples
349 1
        psi = self.psi(Z, self.theta.T, np.ones((M, 1)), K=self.K)
350
351
        # Compute posteriors for target samples
352 1
        pyz = self.posterior(psi)
353
354
        # Predictions through max-posteriors
355 1
        preds = np.argmax(pyz, axis=1)
356
357
        # Map predictions back to original labels
358 1
        return self.classes[preds]
359
360 1
    def get_params(self):
361
        """Get classifier parameters."""
362
        return self.clf.get_params()
363
364 1
    def is_trained(self):
365
        """Check whether classifier is trained."""
366
        return self.is_trained
367