1
|
|
|
#!/usr/bin/env python |
2
|
|
|
# -*- coding: utf-8 -*- |
3
|
|
|
|
4
|
1 |
|
import numpy as np |
5
|
1 |
|
import scipy.stats as st |
6
|
1 |
|
from scipy.optimize import minimize |
7
|
1 |
|
from scipy.sparse.linalg import eigs |
8
|
1 |
|
from scipy.spatial.distance import cdist |
9
|
1 |
|
import sklearn as sk |
10
|
1 |
|
from sklearn.svm import LinearSVC |
11
|
1 |
|
from sklearn.linear_model import LogisticRegression, LinearRegression |
12
|
1 |
|
from sklearn.model_selection import cross_val_predict |
13
|
1 |
|
from os.path import basename |
14
|
|
|
|
15
|
1 |
|
from .util import is_pos_def, one_hot |
16
|
|
|
|
17
|
|
|
|
18
|
1 |
|
class FeatureLevelDomainAdaptiveClassifier(object): |
19
|
|
|
""" |
20
|
|
|
Class of feature-level domain-adaptive classifiers. |
21
|
|
|
|
22
|
|
|
Reference: Kouw, Krijthe, Loog & Van der Maaten (2016). Feature-level |
23
|
|
|
domain adaptation. JMLR. |
24
|
|
|
|
25
|
|
|
Methods contain training and prediction functions. |
26
|
|
|
""" |
27
|
|
|
|
28
|
1 |
|
def __init__(self, l2=0.0, loss='logistic', transfer_model='blankout', |
29
|
|
|
max_iter=100, tolerance=1e-5, verbose=True): |
30
|
|
|
""" |
31
|
|
|
Set classifier instance parameters. |
32
|
|
|
|
33
|
|
|
Parameters |
34
|
|
|
---------- |
35
|
|
|
l2 : float |
36
|
|
|
l2-regularization parameter value (def:0.01) |
37
|
|
|
loss : str |
38
|
|
|
loss function for classifier, options are 'logistic' or 'quadratic' |
39
|
|
|
(def: 'logistic') |
40
|
|
|
transfer_model : str |
41
|
|
|
distribution to use for transfer model, options are 'dropout' and |
42
|
|
|
'blankout' (def: 'blankout') |
43
|
|
|
max_iter : int |
44
|
|
|
maximum number of iterations (def: 100) |
45
|
|
|
tolerance : float |
46
|
|
|
convergence criterion threshold on x (def: 1e-5) |
47
|
|
|
verbose : bool |
48
|
|
|
report training progress (def: True) |
49
|
|
|
|
50
|
|
|
Returns |
51
|
|
|
------- |
52
|
|
|
None |
53
|
|
|
|
54
|
|
|
""" |
55
|
|
|
# Classifier choices |
56
|
1 |
|
self.l2 = l2 |
57
|
1 |
|
self.loss = 'logistic' |
58
|
1 |
|
self.transfer_model = transfer_model |
59
|
|
|
|
60
|
|
|
# Optimization parameters |
61
|
1 |
|
self.max_iter = max_iter |
62
|
1 |
|
self.tolerance = tolerance |
63
|
|
|
|
64
|
|
|
# Whether model has been trained |
65
|
1 |
|
self.is_trained = False |
66
|
|
|
|
67
|
|
|
# Dimensionality of training data |
68
|
1 |
|
self.train_data_dim = 0 |
69
|
|
|
|
70
|
|
|
# Classifier parameters |
71
|
1 |
|
self.theta = 0 |
72
|
|
|
|
73
|
|
|
# Verbosity |
74
|
1 |
|
self.verbose = verbose |
75
|
|
|
|
76
|
1 |
|
def mle_transfer_dist(self, X, Z, dist='blankout'): |
77
|
|
|
""" |
78
|
|
|
Maximum likelihood estimation of transfer model parameters. |
79
|
|
|
|
80
|
|
|
Parameters |
81
|
|
|
---------- |
82
|
|
|
X : array |
83
|
|
|
source data set (N samples by D features) |
84
|
|
|
Z : array |
85
|
|
|
target data set (M samples by D features) |
86
|
|
|
dist : str |
87
|
|
|
distribution of transfer model, options are 'blankout' or 'dropout' |
88
|
|
|
(def: 'blankout') |
89
|
|
|
|
90
|
|
|
Returns |
91
|
|
|
------- |
92
|
|
|
iota : array |
93
|
|
|
estimated transfer model parameters (D features by 1) |
94
|
|
|
|
95
|
|
|
""" |
96
|
|
|
# Data shapes |
97
|
1 |
|
N, DX = X.shape |
98
|
1 |
|
M, DZ = Z.shape |
99
|
|
|
|
100
|
|
|
# Assert equivalent dimensionalities |
101
|
1 |
|
if not DX == DZ: |
102
|
|
|
raise ValueError('Dimensionalities of X and Z should be equal.') |
103
|
|
|
|
104
|
|
|
# Blankout and dropout have same maximum likelihood estimator |
105
|
1 |
|
if (dist == 'blankout') or (dist == 'dropout'): |
106
|
|
|
|
107
|
|
|
# Rate parameters |
108
|
1 |
|
eta = np.mean(X > 0, axis=0) |
109
|
1 |
|
zeta = np.mean(Z > 0, axis=0) |
110
|
|
|
|
111
|
|
|
# Ratio of rate parameters |
112
|
1 |
|
iota = np.clip(1 - zeta / eta, 0, None) |
113
|
|
|
|
114
|
|
|
else: |
115
|
|
|
raise ValueError('Distribution unknown.') |
116
|
|
|
|
117
|
1 |
|
return iota |
118
|
|
|
|
119
|
1 |
|
def moments_transfer_model(self, X, iota, dist='blankout'): |
120
|
|
|
""" |
121
|
|
|
Moments of the transfer model. |
122
|
|
|
|
123
|
|
|
Parameters |
124
|
|
|
---------- |
125
|
|
|
X : array |
126
|
|
|
data set (N samples by D features) |
127
|
|
|
iota : array |
128
|
|
|
transfer model parameters (D samples by 1) |
129
|
|
|
dist : str |
130
|
|
|
transfer model, options are 'dropout' and 'blankout' |
131
|
|
|
(def: 'blankout') |
132
|
|
|
|
133
|
|
|
Returns |
134
|
|
|
------- |
135
|
|
|
E : array |
136
|
|
|
expected value of transfer model (N samples by D feautures) |
137
|
|
|
V : array |
138
|
|
|
variance of transfer model (D features by D features by N samples) |
139
|
|
|
|
140
|
|
|
""" |
141
|
|
|
# Data shape |
142
|
1 |
|
N, D = X.shape |
143
|
|
|
|
144
|
1 |
|
if (dist == 'dropout'): |
145
|
|
|
|
146
|
|
|
# First moment of transfer distribution |
147
|
|
|
E = (1-iota) * X |
148
|
|
|
|
149
|
|
|
# Second moment of transfer distribution |
150
|
|
|
V = np.zeros((D, D, N)) |
151
|
|
|
for i in range(N): |
152
|
|
|
V[:, :, i] = np.diag(iota * (1-iota)) * (X[i, :].T*X[i, :]) |
153
|
|
|
|
154
|
1 |
|
elif (dist == 'blankout'): |
155
|
|
|
|
156
|
|
|
# First moment of transfer distribution |
157
|
1 |
|
E = X |
158
|
|
|
|
159
|
|
|
# Second moment of transfer distribution |
160
|
1 |
|
V = np.zeros((D, D, N)) |
161
|
1 |
|
for i in range(N): |
162
|
1 |
|
V[:, :, i] = np.diag(iota * (1-iota)) * (X[i, :].T * X[i, :]) |
163
|
|
|
|
164
|
|
|
else: |
165
|
|
|
raise NotImplementedError('Transfer distribution not implemented') |
166
|
|
|
|
167
|
1 |
|
return E, V |
168
|
|
|
|
169
|
1 |
|
def flda_log_loss(self, theta, X, y, E, V, l2=0.0): |
170
|
|
|
""" |
171
|
|
|
Compute average loss for flda-log. |
172
|
|
|
|
173
|
|
|
Parameters |
174
|
|
|
---------- |
175
|
|
|
theta : array |
176
|
|
|
classifier parameters (D features by 1) |
177
|
|
|
X : array |
178
|
|
|
source data set (N samples by D features) |
179
|
|
|
y : array |
180
|
|
|
label vector (N samples by 1) |
181
|
|
|
E : array |
182
|
|
|
expected value with respect to transfer model (N samples by |
183
|
|
|
D features) |
184
|
|
|
V : array |
185
|
|
|
variance with respect to transfer model (D features by D features |
186
|
|
|
by N samples) |
187
|
|
|
l2 : float |
188
|
|
|
regularization parameter (def: 0.0) |
189
|
|
|
|
190
|
|
|
Returns |
191
|
|
|
------- |
192
|
|
|
dL : array |
193
|
|
|
Value of loss function. |
194
|
|
|
|
195
|
|
|
""" |
196
|
|
|
# Data shape |
197
|
1 |
|
N, D = X.shape |
198
|
|
|
|
199
|
|
|
# Assert y in {-1,+1} |
200
|
1 |
|
if not np.all(np.sort(np.unique(y)) == (-1, 1)): |
201
|
|
|
raise NotImplementedError('Labels can only be {-1, +1} for now.') |
202
|
|
|
|
203
|
|
|
# Precompute terms |
204
|
1 |
|
Xt = np.dot(X, theta) |
205
|
1 |
|
Et = np.dot(E, theta) |
206
|
1 |
|
alpha = np.exp(Xt) + np.exp(-Xt) |
207
|
1 |
|
beta = np.exp(Xt) - np.exp(-Xt) |
208
|
1 |
|
gamma = (np.exp(Xt).T * X.T).T + (np.exp(-Xt).T * X.T).T |
209
|
1 |
|
delta = (np.exp(Xt).T * X.T).T - (np.exp(-Xt).T * X.T).T |
210
|
|
|
|
211
|
|
|
# Log-partition function |
212
|
1 |
|
A = np.log(alpha) |
213
|
|
|
|
214
|
|
|
# First-order partial derivative of log-partition w.r.t. Xt |
215
|
1 |
|
dA = beta / alpha |
216
|
|
|
|
217
|
|
|
# Second-order partial derivative of log-partition w.r.t. Xt |
218
|
1 |
|
d2A = 1 - beta**2 / alpha**2 |
219
|
|
|
|
220
|
|
|
# Compute pointwise loss (negative log-likelihood) |
221
|
1 |
|
L = np.zeros((N, 1)) |
222
|
1 |
|
for i in range(N): |
223
|
1 |
|
L[i] = -y[i] * Et[i] + A[i] + dA[i] * (Et[i] - Xt[i]) + \ |
224
|
|
|
1./2*d2A[i]*np.dot(np.dot(theta.T, V[:, :, i]), theta) |
225
|
|
|
|
226
|
|
|
# Compute risk (average loss) |
227
|
1 |
|
R = np.mean(L, axis=0) |
228
|
|
|
|
229
|
|
|
# Add regularization |
230
|
1 |
|
return R + l2*np.sum(theta**2, axis=0) |
231
|
|
|
|
232
|
1 |
|
def flda_log_grad(self, theta, X, y, E, V, l2=0.0): |
233
|
|
|
""" |
234
|
|
|
Compute gradient with respect to theta for flda-log. |
235
|
|
|
|
236
|
|
|
Parameters |
237
|
|
|
---------- |
238
|
|
|
theta : array |
239
|
|
|
classifier parameters (D features by 1) |
240
|
|
|
X : array |
241
|
|
|
source data set (N samples by D features) |
242
|
|
|
y : array |
243
|
|
|
label vector (N samples by 1) |
244
|
|
|
E : array |
245
|
|
|
expected value with respect to transfer model (N samples by |
246
|
|
|
D features) |
247
|
|
|
V : array |
248
|
|
|
variance with respect to transfer model (D features by D features |
249
|
|
|
by N samples) |
250
|
|
|
l2 : float |
251
|
|
|
regularization parameter (def: 0.0) |
252
|
|
|
|
253
|
|
|
Returns |
254
|
|
|
------- |
255
|
|
|
dR : array |
256
|
|
|
Value of gradient. |
257
|
|
|
|
258
|
|
|
""" |
259
|
|
|
# Data shape |
260
|
1 |
|
N, D = X.shape |
261
|
|
|
|
262
|
|
|
# Assert y in {-1,+1} |
263
|
1 |
|
if not np.all(np.sort(np.unique(y)) == (-1, 1)): |
264
|
|
|
raise NotImplementedError('Labels can only be {-1, +1} for now.') |
265
|
|
|
|
266
|
|
|
# Precompute common terms |
267
|
1 |
|
Xt = np.dot(X, theta) |
268
|
1 |
|
Et = np.dot(E, theta) |
269
|
1 |
|
alpha = np.exp(Xt) + np.exp(-Xt) |
270
|
1 |
|
beta = np.exp(Xt) - np.exp(-Xt) |
271
|
1 |
|
gamma = (np.exp(Xt).T * X.T).T + (np.exp(-Xt).T * X.T).T |
272
|
1 |
|
delta = (np.exp(Xt).T * X.T).T - (np.exp(-Xt).T * X.T).T |
273
|
|
|
|
274
|
|
|
# Log-partition function |
275
|
1 |
|
A = np.log(alpha) |
276
|
|
|
|
277
|
|
|
# First-order partial derivative of log-partition w.r.t. Xt |
278
|
1 |
|
dA = beta / alpha |
279
|
|
|
|
280
|
|
|
# Second-order partial derivative of log-partition w.r.t. Xt |
281
|
1 |
|
d2A = 1 - beta**2 / alpha**2 |
282
|
|
|
|
283
|
1 |
|
dR = 0 |
284
|
1 |
|
for i in range(N): |
285
|
|
|
|
286
|
|
|
# Compute gradient terms |
287
|
1 |
|
t1 = -y[i]*E[i, :].T |
288
|
|
|
|
289
|
1 |
|
t2 = beta[i] / alpha[i] * X[i, :].T |
290
|
|
|
|
291
|
1 |
|
t3 = (gamma[i, :] / alpha[i] - beta[i]*delta[i, :] / |
292
|
|
|
alpha[i]**2).T * (Et[i] - Xt[i]) |
293
|
|
|
|
294
|
1 |
|
t4 = beta[i] / alpha[i] * (E[i, :] - X[i, :]).T |
295
|
|
|
|
296
|
1 |
|
t5 = (1 - beta[i]**2 / alpha[i]**2) * np.dot(V[:, :, i], theta) |
297
|
|
|
|
298
|
1 |
|
t6 = -(beta[i] * gamma[i, :] / alpha[i]**2 - beta[i]**2 * |
299
|
|
|
delta[i, :] / alpha[i]**3).T * np.dot(np.dot(theta.T, |
300
|
|
|
V[:, :, i]), theta) |
301
|
|
|
|
302
|
1 |
|
dR += t1 + t2 + t3 + t4 + t5 + t6 |
303
|
|
|
|
304
|
|
|
# Add regularization |
305
|
1 |
|
dR += l2*2*theta |
306
|
|
|
|
307
|
1 |
|
return dR |
308
|
|
|
|
309
|
1 |
|
def fit(self, X, y, Z): |
310
|
|
|
""" |
311
|
|
|
Fit/train a robust bias-aware classifier. |
312
|
|
|
|
313
|
|
|
Parameters |
314
|
|
|
---------- |
315
|
|
|
X : array |
316
|
|
|
source data (N samples by D features) |
317
|
|
|
y : array |
318
|
|
|
source labels (N samples by 1) |
319
|
|
|
Z : array |
320
|
|
|
target data (M samples by D features) |
321
|
|
|
|
322
|
|
|
Returns |
323
|
|
|
------- |
324
|
|
|
None |
325
|
|
|
|
326
|
|
|
""" |
327
|
|
|
# Data shapes |
328
|
1 |
|
N, DX = X.shape |
329
|
1 |
|
M, DZ = Z.shape |
330
|
|
|
|
331
|
|
|
# Assert equivalent dimensionalities |
332
|
1 |
|
if not DX == DZ: |
333
|
|
|
raise ValueError('Dimensionalities of X and Z should be equal.') |
334
|
|
|
|
335
|
|
|
# Map to one-not-encoding |
336
|
1 |
|
Y, labels = one_hot(y, one_not=True) |
337
|
|
|
|
338
|
|
|
# Number of classes |
339
|
1 |
|
K = len(labels) |
340
|
|
|
|
341
|
|
|
# Compute transfer distribution parameters |
342
|
1 |
|
iota = self.mle_transfer_dist(X, Z) |
343
|
|
|
|
344
|
|
|
# Compute moments of transfer distribution |
345
|
1 |
|
E, V = self.moments_transfer_model(X, iota) |
346
|
|
|
|
347
|
|
|
# Select loss function |
348
|
1 |
|
if (self.loss == 'logistic'): |
349
|
|
|
|
350
|
|
|
# Preallocate parameter array |
351
|
1 |
|
theta = np.random.randn(DX, K) |
352
|
|
|
|
353
|
|
|
# Train a classifier for each class |
354
|
1 |
|
for k in range(K): |
355
|
|
|
|
356
|
|
|
# Shorthand for loss computation |
357
|
1 |
|
def L(theta): return self.flda_log_loss(theta, X, Y[:, k], |
358
|
|
|
E, V, l2=self.l2) |
359
|
|
|
|
360
|
|
|
# Shorthand for gradient computation |
361
|
1 |
|
def J(theta): return self.flda_log_grad(theta, X, Y[:, k], |
362
|
|
|
E, V, l2=self.l2) |
363
|
|
|
|
364
|
|
|
# Call scipy's minimizer |
365
|
1 |
|
results = minimize(L, theta[:, k], jac=J, method='BFGS', |
366
|
|
|
options={'gtol': self.tolerance, |
367
|
|
|
'disp': self.verbose}) |
368
|
|
|
|
369
|
|
|
# Store resultant classifier parameters |
370
|
1 |
|
theta[:, k] = results.x |
371
|
|
|
|
372
|
|
|
elif (self.loss == 'quadratic'): |
373
|
|
|
|
374
|
|
|
# Compute closed-form least-squares solution |
375
|
|
|
theta = np.inv(E.T*E + np.sum(V, axis=2) + l2*np.eye(D))\ |
376
|
|
|
* (E.T * Y) |
377
|
|
|
|
378
|
|
|
# Store trained classifier parameters |
379
|
1 |
|
self.theta = theta |
380
|
|
|
|
381
|
|
|
# Store classes |
382
|
1 |
|
self.classes = labels |
383
|
|
|
|
384
|
|
|
# Mark classifier as trained |
385
|
1 |
|
self.is_trained = True |
386
|
|
|
|
387
|
|
|
# Store training data dimensionality |
388
|
1 |
|
self.train_data_dim = DX |
389
|
|
|
|
390
|
1 |
|
def predict(self, Z_): |
391
|
|
|
""" |
392
|
|
|
Make predictions on new dataset. |
393
|
|
|
|
394
|
|
|
Parameters |
395
|
|
|
---------- |
396
|
|
|
Z : array |
397
|
|
|
new data set (M samples by D features) |
398
|
|
|
|
399
|
|
|
Returns |
400
|
|
|
------- |
401
|
|
|
preds : array |
402
|
|
|
label predictions (M samples by 1) |
403
|
|
|
|
404
|
|
|
""" |
405
|
|
|
# Data shape |
406
|
1 |
|
M, D = Z_.shape |
407
|
|
|
|
408
|
|
|
# If classifier is trained, check for same dimensionality |
409
|
1 |
|
if self.is_trained: |
410
|
1 |
|
if not self.train_data_dim == D: |
411
|
|
|
raise ValueError('''Test data is of different dimensionality |
412
|
|
|
than training data.''') |
413
|
|
|
|
414
|
|
|
# Predict target labels |
415
|
1 |
|
preds = np.argmax(np.dot(Z_, self.theta), axis=1) |
416
|
|
|
|
417
|
|
|
# Map predictions back to labels |
418
|
1 |
|
preds = self.classes[preds] |
419
|
|
|
|
420
|
|
|
# Return predictions array |
421
|
1 |
|
return preds |
422
|
|
|
|
423
|
1 |
|
def get_params(self): |
424
|
|
|
"""Get classifier parameters.""" |
425
|
|
|
return self.clf.get_params() |
426
|
|
|
|
427
|
1 |
|
def is_trained(self): |
428
|
|
|
"""Check whether classifier is trained.""" |
429
|
|
|
return self.is_trained |
430
|
|
|
|