Completed
Push — master (582254...17fb6a)
by Wouter
03:58
created

RobustBiasAwareClassifier   A

Complexity

Total Complexity 27

Size/Duplication

Total Lines 288
Duplicated Lines 0%

Test Coverage

Coverage 87.5%

Importance

Changes 3
Bugs 0 Features 0
Metric  Meaning                       Value
c       changes                       3
b       bugs                          0
f       features                      0
dl      duplicated lines              0
loc     lines of code                 288
ccs     covered statements            84
cts     total statements              96
cp      coverage (ccs/cts)            0.875
rs      rating score                  10
wmc     weighted method complexity    27
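Note: cp = ccs / cts = 84 / 96 = 0.875, i.e. the 87.5% coverage reported above; wmc matches the Total Complexity of 27.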

9 Methods

Rating  Name                    Duplication  Size  Complexity
F       fit()                   0            93    9
B       feature_stats()         0            33    4
B       iwe_kernel_densities()  0            25    4
B       __init__()              0            33    1
A       get_params()            0            3     1
B       posterior()             0            25    2
B       psi()                   0            25    2
B       predict()               0            30    3
A       is_trained()            0            3     1

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
import scipy.stats as st


class RobustBiasAwareClassifier(object):
    """
    Class of robust bias-aware classifiers.

    Reference: Liu & Ziebart (2014). Robust Classification under Sample
    Selection Bias. NIPS.

    Methods contain training and prediction functions.
    """

    def __init__(self, l2=0.0, order='first', gamma=1.0, tau=1e-5,
                 max_iter=100, clip=1000, verbose=True):
        """
        Set classifier instance parameters.

        INPUT   (1) float 'l2': l2-regularization parameter value (def: 0.0)
                (2) str 'order': order of feature statistics to employ; options
                    are 'first' or 'second' (def: 'first')
                (3) float 'gamma': decaying learning rate (def: 1.0)
                (4) float 'tau': convergence threshold (def: 1e-5)
                (5) int 'max_iter': maximum number of iterations (def: 100)
                (6) int 'clip': upper bound on importance weights (def: 1000)
                (7) boolean 'verbose': report training progress (def: True)
        OUTPUT  None
        """
        self.l2 = l2
        self.order = order
        self.gamma = gamma
        self.tau = tau
        self.max_iter = max_iter
        self.clip = clip

        # Whether model has been trained
        self.trained = False

        # Dimensionality of training data
        self.train_data_dim = None

        # Classifier parameters
        self.theta = 0

        # Verbosity
        self.verbose = verbose

    def feature_stats(self, X, y, order='first'):
        """
        Compute moment feature statistics.

        INPUT   (1) array 'X': dataset (N samples by D features)
                (2) array 'y': label vector (N samples by 1)
                (3) str 'order': order of moments to compute; options are
                    'first' or 'second' (def: 'first')
        OUTPUT  (1) array: feature statistics
        """
        # Data shape
        N, D = X.shape

        # Expand label vector
        if len(y.shape) < 2:
            y = np.atleast_2d(y).T

        if (order == 'first'):

            # First-order consists of data times label
            mom = y * X

        elif (order == 'second'):

            # First-order consists of data times label
            yX = y * X

            # Second-order is label times row-wise outer product of data
            yXX = y * np.einsum('ij,ik->ijk', X, X).reshape(N, D**2)

            # Concatenate moments
            mom = np.concatenate((yX, yXX), axis=1)

        else:
            raise ValueError('Order of feature statistics unknown.')

        # Concatenate label vector, moments, and ones-augmentation
        return np.concatenate((y, mom, np.ones((N, 1))), axis=1)
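
    # For illustration: with D = 2 and first-order statistics, a sample
    # x = [x1, x2] with label y maps to [y, y*x1, y*x2, 1], so the expanded
    # dimensionality is 1 + D + 1, matching the computation in fit().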

    def iwe_kernel_densities(self, X, Z):
        """
        Estimate importance weights based on kernel density estimation.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'Z': target data (M samples by D features)
        OUTPUT  (1) array: importance weights (N samples by 1)
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Compute probabilities based on kernel density estimates
        pT = st.gaussian_kde(Z.T).pdf(X.T)
        pS = st.gaussian_kde(X.T).pdf(X.T)

        # Check for numerical problems
        assert not (np.any(np.isnan(pT)) or np.any(pT == 0))
        assert not (np.any(np.isnan(pS)) or np.any(pS == 0))

        # Return the ratio of probabilities as a column vector
        return (pT / pS)[:, np.newaxis]
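
    # Note: scipy's gaussian_kde selects its bandwidth via Scott's rule by
    # default. The ratio p_T(x)/p_S(x) returned here is the usual
    # covariate-shift importance weight; fit() below inverts it to obtain
    # p_S(x)/p_T(x), the direction the robust bias-aware loss requires.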

    def psi(self, X, theta, w, K=2):
        """
        Compute psi function.

        INPUT   (1) array 'X': data set (N samples by D features)
                (2) array 'theta': classifier parameters (D features by 1)
                (3) array 'w': importance weights (N samples by 1)
                (4) int 'K': number of classes (def: 2)
        OUTPUT  (1) array 'psi' (N samples by K classes)
        """
        # Number of samples
        N = X.shape[0]

        # Preallocate psi array
        psi = np.zeros((N, K))

        # Loop over classes
        for k in range(K):

            # Compute feature statistics
            Xk = self.feature_stats(X, k*np.ones((N, 1)), order=self.order)

            # Compute psi function
            psi[:, k] = (w*np.dot(Xk, theta))[:, 0]

        return psi
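
    # Each column of psi holds the per-class scores
    # psi_k(x) = w(x) * theta^T phi(x, k), where phi(x, k) are the moment
    # feature statistics computed by feature_stats().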

    def posterior(self, psi):
        """
        Class-posterior estimation.

        INPUT   (1) array 'psi': weighted data-classifier output (N samples by
                    K classes)
        OUTPUT  (1) array 'pyx': class-posterior estimates (N samples by
                    K classes)
        """
        # Data shape
        N, K = psi.shape

        # Preallocate array
        pyx = np.zeros((N, K))

        # Subtract maximum value for numerical stability
        psi = (psi.T - np.max(psi, axis=1).T).T

        # Loop over classes
        for k in range(K):

            # Estimate posterior p^(Y=y | x_i)
            pyx[:, k] = np.exp(psi[:, k]) / np.sum(np.exp(psi), axis=1)

        return pyx
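
    # This is a numerically stabilized softmax: subtracting the row-wise
    # maximum leaves p(y=k | x) = exp(psi_k) / sum_j exp(psi_j) unchanged
    # while preventing overflow in np.exp for large psi values.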

    def fit(self, X, y, Z):
        """
        Fit/train a robust bias-aware classifier.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'y': source labels (N samples by 1)
                (3) array 'Z': target data (M samples by D features)
        OUTPUT  None
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Number of classes
        labels = np.unique(y)
        self.K = len(labels)

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Dimensionality of expanded feature space
        if (self.order == 'first'):
            D = 1 + DX + 1
        elif (self.order == 'second'):
            D = 1 + DX + DX**2 + 1
        else:
            raise ValueError('Order of feature statistics unknown.')

        # Compute moment-matching constraint
        c = np.mean(self.feature_stats(X, y, order=self.order), axis=0)

        # Estimate importance weights
        w = self.iwe_kernel_densities(X, Z)

        # Invert weights to achieve p_S(x)/p_T(x)
        w = 1./w

        # Clip weights if necessary
        w = np.clip(w, 0, self.clip)

        # Initialize classifier parameters
        theta = np.random.randn(1, D)*0.01

        # Start gradient descent
        for t in range(1, self.max_iter+1):

            # Calculate psi function
            psi = self.psi(X, theta.T, w, K=self.K)

            # Compute posterior
            pyx = self.posterior(psi)

            # Sum product of estimated posterior and feature statistics
            pfs = 0
            for k in range(self.K):

                # Compute feature statistics for k-th class
                Xk = self.feature_stats(X, k*np.ones((N, 1)),
                                        order=self.order)

                # Element-wise product with posterior and sum over classes
                pfs += (pyx[:, k].T * Xk.T).T

            # Gradient computation and regularization
            dL = c - np.mean(pfs, axis=0) + self.l2*2*theta

            # Apply decaying learning rate to gradient
            dT = dL / (t * self.gamma)

            # Update classifier parameters
            theta += dT

            # Report progress
            if self.verbose:
                if (t % max(1, self.max_iter // 10)) == 1:
                    print('Iteration {:03}/{:03} - Norm gradient: {:.12}'
                          .format(t, self.max_iter, np.linalg.norm(dL)))

            # Check for convergence
            if (np.linalg.norm(dL) <= self.tau):
                print('Converged at iteration {}'.format(t))
                break

        # Store resulting classifier parameters
        self.theta = theta

        # Store classes
        self.classes = labels

        # Mark classifier as trained
        self.trained = True

        # Store training data dimensionality
        self.train_data_dim = DX

    def predict(self, Z_):
        """
        Make predictions on a new dataset.

        INPUT   (1) array 'Z_': new dataset (M samples by D features)
        OUTPUT  (1) array 'preds': label predictions (M samples by 1)
        """
        # Data shape
        M, D = Z_.shape

        # If classifier is trained, check for same dimensionality
        if self.trained:
            assert self.train_data_dim == D
        else:
            raise RuntimeError('Classifier is not trained yet.')

        # Calculate psi function for target samples
        psi = self.psi(Z_, self.theta.T, np.ones((M, 1)), K=self.K)

        # Compute posteriors for target samples
        pyz = self.posterior(psi)

        # Predictions through max-posteriors
        preds = np.argmax(pyz, axis=1)

        # Map predictions back to original labels
        preds = self.classes[preds]

        # Return predictions array
        return preds

    def get_params(self):
        """Get classifier parameters."""
        return {'l2': self.l2, 'order': self.order, 'gamma': self.gamma,
                'tau': self.tau, 'max_iter': self.max_iter,
                'clip': self.clip, 'verbose': self.verbose}

    def is_trained(self):
        """Check whether classifier is trained."""
        return self.trained
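
For reference, a minimal usage sketch on synthetic covariate-shift data (the import path 'rba' is hypothetical; adjust it to wherever this module lives):

import numpy as np
from rba import RobustBiasAwareClassifier

# Source samples, their labels, and unlabeled (shifted) target samples
rnd = np.random.RandomState(42)
X = rnd.randn(100, 2)
y = (X[:, 0] > 0).astype(int)
Z = rnd.randn(50, 2) + 0.5

# Train on the labeled source data, reweighting toward the target domain,
# then predict labels for the target samples
clf = RobustBiasAwareClassifier(l2=0.01, max_iter=100, verbose=False)
clf.fit(X, y, Z)
preds = clf.predict(Z)
print(preds[:10])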