Completed | Branch master (f50597) | by Wouter | created 52s ago

RobustBiasAwareClassifier   Rating: A

Complexity

Total Complexity 27

Size/Duplication

Total Lines 281
Duplicated Lines 0%

Test Coverage

Coverage 0%

Importance

Changes 3
Bugs 0 Features 0
Metric   Value
c        3
b        0
f        0
dl       0
loc      281
ccs      0
cts      93
cp       0
rs       10
wmc      27

9 Methods

Rating   Name                     Duplication   Size   Complexity
F        fit()                    0             89     9
B        feature_stats()          0             33     4
B        iwe_kernel_densities()   0             25     4
B        __init__()               0             33     1
A        get_params()             0             3      1
B        posterior()              0             25     2
B        psi()                    0             25     2
B        predict()                0             27     3
A        is_trained()             0             3      1

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
import scipy.stats as st


class RobustBiasAwareClassifier(object):
    """
    Class of robust bias-aware classifiers.

    Reference: Liu & Ziebart (2014). Robust Classification under Sample
    Selection Bias. NIPS.

    Methods contain training and prediction functions.
    """

    def __init__(self, l2=0.0, order='first', gamma=1.0, tau=1e-5,
                 max_iter=100, clip=1000, verbose=True):
        """
        Set classifier instance parameters.

        INPUT   (1) float 'l2': l2-regularization parameter value (def: 0.0)
                (2) str 'order': order of feature statistics to employ; options
                    are 'first' or 'second' (def: 'first')
                (3) float 'gamma': decaying learning rate (def: 1.0)
                (4) float 'tau': convergence threshold (def: 1e-5)
                (5) int 'max_iter': maximum number of iterations (def: 100)
                (6) int 'clip': upper bound on importance weights (def: 1000)
                (7) boolean 'verbose': report training progress (def: True)
        OUTPUT  None
        """
        self.l2 = l2
        self.order = order
        self.gamma = gamma
        self.tau = tau
        self.max_iter = max_iter
        self.clip = clip

        # Whether model has been trained; named 'trained' so that it does
        # not shadow the is_trained() method
        self.trained = False

        # Dimensionality of training data
        self.train_data_dim = None

        # Classifier parameters
        self.theta = 0

        # Verbosity
        self.verbose = verbose

    def feature_stats(self, X, y, order='first'):
        """
        Compute moment feature statistics of the given order.

        INPUT   (1) array 'X': dataset (N samples by D features)
                (2) array 'y': label vector (N samples by 1)
                (3) str 'order': order of feature statistics; options are
                    'first' or 'second' (def: 'first')
        OUTPUT  (1) array: label, moments, and ones-augmentation
        """
        # Data shape
        N, D = X.shape

        # Expand label vector
        if len(y.shape) < 2:
            y = np.atleast_2d(y).T

        if (order == 'first'):

            # First-order moment consists of data times label
            mom = y * X

        elif (order == 'second'):

            # First-order moment consists of data times label
            yX = y * X

            # Second-order moment is label times the row-wise Kronecker
            # product of each sample with itself (N samples by D^2 features)
            yXX = y * (X[:, :, None] * X[:, None, :]).reshape((N, D**2))

            # Concatenate moments
            mom = np.concatenate((yX, yXX), axis=1)

        # Concatenate label vector, moments, and ones-augmentation
        return np.concatenate((y, mom, np.ones((N, 1))), axis=1)

    def iwe_kernel_densities(self, X, Z):
        """
        Estimate importance weights based on kernel density estimation.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'Z': target data (M samples by D features)
        OUTPUT  (1) array: importance weights (N samples by 1)
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Compute source-sample probabilities under target and source
        # kernel density estimates
        pT = st.gaussian_kde(Z.T).pdf(X.T)
        pS = st.gaussian_kde(X.T).pdf(X.T)

        # Check for numerical problems
        assert not np.any(np.isnan(pT)) and not np.any(pT == 0)
        assert not np.any(np.isnan(pS)) and not np.any(pS == 0)

        # Return the ratio of probabilities
        return pT / pS

    def psi(self, X, theta, w, K=2):
        """
        Compute psi function.

        INPUT   (1) array 'X': data set (N samples by D features)
                (2) array 'theta': classifier parameters (D features by 1)
                (3) array 'w': importance weights (N samples by 1)
                (4) int 'K': number of classes (def: 2)
        OUTPUT  (1) array 'psi' (N samples by K classes)
        """
        # Number of samples
        N = X.shape[0]

        # Ensure weights form a column vector, to avoid broadcasting to
        # an (N, N) array below
        w = np.asarray(w).reshape(-1, 1)

        # Preallocate psi array
        psi = np.zeros((N, K))

        # Loop over classes
        for k in range(K):
            # Compute feature statistics
            Xk = self.feature_stats(X, k*np.ones((N, 1)), order=self.order)

            # Compute psi function
            psi[:, k] = (w*np.dot(Xk, theta))[:, 0]

        return psi

    def posterior(self, psi):
        """
        Class-posterior estimation.

        INPUT   (1) array 'psi': weighted data-classifier output (N samples by
                    K classes)
        OUTPUT  (1) array 'pyx': class-posterior estimates (N samples by
                    K classes)
        """
        # Data shape
        N, K = psi.shape

        # Preallocate array
        pyx = np.zeros((N, K))

        # Subtract maximum value for numerical stability
        psi = (psi.T - np.max(psi, axis=1).T).T

        # Loop over classes
        for k in range(K):

            # Estimate posterior p^(Y=y | x_i)
            pyx[:, k] = np.exp(psi[:, k]) / np.sum(np.exp(psi), axis=1)

        return pyx

    def fit(self, X, y, Z):
        """
        Fit/train a robust bias-aware classifier.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'y': source labels (N samples by 1)
                (3) array 'Z': target data (M samples by D features)
        OUTPUT  None
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Number of classes
        self.K = len(np.unique(y))

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Dimensionality of expanded feature space
        if (self.order == 'first'):
            D = 1 + DX + 1
        elif (self.order == 'second'):
            D = 1 + DX + DX**2 + 1
        else:
            raise ValueError("Order of feature statistics must be 'first'"
                             " or 'second'.")

        # Compute moment-matching constraint
        c = np.mean(self.feature_stats(X, y, order=self.order), axis=0)

        # Estimate importance weights p_T(x)/p_S(x)
        w = self.iwe_kernel_densities(X, Z)

        # Invert weights to obtain p_S(x)/p_T(x)
        w = 1./w

        # Clip weights if necessary
        w = np.clip(w, 0, self.clip)

        # Initialize classifier parameters
        theta = np.random.randn(1, D)*0.01

        # Start gradient descent
        for t in range(1, self.max_iter+1):

            # Calculate psi function
            psi = self.psi(X, theta.T, w, K=self.K)

            # Compute posterior
            pyx = self.posterior(psi)

            # Sum product of estimated posterior and feature statistics
            pfs = 0
            for k in range(self.K):

                # Compute feature statistics for k-th class
                Xk = self.feature_stats(X, k*np.ones((N, 1)),
                                        order=self.order)

                # Element-wise product with posterior and sum over classes
                pfs += (pyx[:, k].T * Xk.T).T

            # Gradient computation and regularization
            dL = c - np.mean(pfs, axis=0) + self.l2*2*theta

            # Apply decaying learning rate to gradient
            dT = dL / (t * self.gamma)

            # Update classifier parameters
            theta += dT

            # Report progress
            if self.verbose:
                if (t % max(self.max_iter // 10, 1)) == 1:
                    print('Iteration {:03}/{:03} - Norm gradient: {:.12}'
                          .format(t, self.max_iter, np.linalg.norm(dL)))

            # Check for convergence
            if (np.linalg.norm(dL) <= self.tau):
                print('Converged at iteration {}'.format(t))
                break

        # Store resultant classifier parameters
        self.theta = theta

        # Mark classifier as trained
        self.trained = True

        # Store training data dimensionality
        self.train_data_dim = DX

    def predict(self, Z_):
        """
        Make predictions on new dataset.

        INPUT   (1) array 'Z_': new data set (M samples by D features)
        OUTPUT  (1) array 'preds': label predictions (M samples by 1)
        """
        # Data shape
        M, D = Z_.shape

        # If classifier is trained, check for same dimensionality
        if self.trained:
            assert self.train_data_dim == D
        else:
            raise RuntimeError('Classifier is not trained yet.')

        # Calculate psi function for target samples, with uniform weights
        psi = self.psi(Z_, self.theta.T, np.ones((M, 1)), K=self.K)

        # Compute posteriors for target samples
        pyz = self.posterior(psi)

        # Predictions through max-posteriors
        preds = np.argmax(pyz, axis=1)

        # Return predictions array
        return preds

    def get_params(self):
        """Get classifier parameters."""
        return {'l2': self.l2, 'order': self.order, 'gamma': self.gamma,
                'tau': self.tau, 'max_iter': self.max_iter,
                'clip': self.clip, 'verbose': self.verbose}

    def is_trained(self):
        """Check whether classifier is trained."""
        return self.trained
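
For reference, a minimal usage sketch on synthetic two-class data. This is illustrative only: the import path (here assumed to be a module named 'rba' within its package) and the data values are assumptions, not part of the file above.

# Train on biased source data, predict on shifted target data
import numpy as np
from rba import RobustBiasAwareClassifier

np.random.seed(1)

# Source samples: two classes around different means
X = np.vstack((np.random.randn(50, 2), np.random.randn(50, 2) + 2))
y = np.concatenate((np.zeros(50), np.ones(50)))

# Target samples: marginal distribution shifted relative to the source
Z = np.random.randn(100, 2) + 1

# Fit the robust bias-aware classifier and predict on the target samples
clf = RobustBiasAwareClassifier(max_iter=50, verbose=False)
clf.fit(X, y, Z)
preds = clf.predict(Z)
print(preds[:10])

Note that fit() needs the unlabeled target samples Z up front: the importance weights p_S(x)/p_T(x) are estimated from both domains via kernel density estimation before the gradient updates begin.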
298