#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
import scipy.stats as st
from scipy.optimize import minimize
from scipy.sparse.linalg import eigs
from scipy.spatial.distance import cdist
import sklearn as sk
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_predict
from os.path import basename

from .util import is_pos_def, one_hot

class FeatureLevelDomainAdaptiveClassifier(object):
    """
    Class of feature-level domain-adaptive classifiers.

    Reference: Kouw, Krijthe, Loog & Van der Maaten (2016). Feature-level
    domain adaptation. JMLR.

    Methods contain training and prediction functions.
    """

    def __init__(self, l2=0.0, loss='logistic', transfer_model='blankout',
                 max_iter=100, tolerance=1e-5, verbose=True):
        """
        Set classifier instance parameters.

        INPUT   (1) float 'l2': l2-regularization parameter value (def: 0.0)
                (2) str 'loss': loss function for classifier, options are
                    'logistic' or 'quadratic' (def: 'logistic')
                (3) str 'transfer_model': distribution to use for transfer
                    model, options are 'dropout' and 'blankout'
                    (def: 'blankout')
                (4) int 'max_iter': maximum number of iterations (def: 100)
                (5) float 'tolerance': convergence criterion threshold on x
                    (def: 1e-5)
                (6) boolean 'verbose': report training progress (def: True)
        OUTPUT  None
        """
        # Classifier choices
        self.l2 = l2
        self.loss = loss
        self.transfer_model = transfer_model

        # Optimization parameters
        self.max_iter = max_iter
        self.tolerance = tolerance

        # Whether model has been trained
        self.is_trained = False

        # Dimensionality of training data
        self.train_data_dim = 0

        # Classifier parameters
        self.theta = 0

        # Verbosity
        self.verbose = verbose

    def mle_transfer_dist(self, X, Z, dist='blankout'):
        """
        Maximum likelihood estimation of transfer model parameters.

        INPUT   (1) array 'X': source data set (N samples by D features)
                (2) array 'Z': target data set (M samples by D features)
                (3) str 'dist': distribution of transfer model, options are
                    'blankout' or 'dropout' (def: 'blankout')
        OUTPUT  (1) array 'iota': estimated transfer model parameters
                    (D features by 1)
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Blankout and dropout have same maximum likelihood estimator
        if (dist == 'blankout') or (dist == 'dropout'):

            # Rate parameters
            eta = np.mean(X > 0, axis=0)
            zeta = np.mean(Z > 0, axis=0)

            # Ratio of rate parameters
            iota = np.clip(1 - zeta / eta, 0, None)

        else:
            raise ValueError('Distribution unknown.')

        return iota

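    # Illustrative note (added sketch, not part of the original library code):
    # for a feature that is non-zero in 90% of source samples (eta = 0.9) but
    # only 60% of target samples (zeta = 0.6), the estimated transfer
    # parameter is iota = max(0, 1 - 0.6 / 0.9) = 1/3, i.e. that feature is
    # expected to be blanked out in roughly a third of target observations.
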
    def moments_transfer_model(self, X, iota, dist='blankout'):
        """
        Moments of the transfer model.

        INPUT   (1) array 'X': data set (N samples by D features)
                (2) array 'iota': transfer model parameters (D features by 1)
                (3) str 'dist': transfer model, options are 'dropout' and
                    'blankout' (def: 'blankout')
        OUTPUT  (1) array 'E': expected value of transfer model (N samples by
                    D features)
                (2) array 'V': variance of transfer model (D features by D
                    features by N samples)
        """
        # Data shape
        N, D = X.shape

        if (dist == 'dropout'):

            # First moment of transfer distribution
            E = (1-iota) * X

            # Second moment of transfer distribution
            V = np.zeros((D, D, N))
            for i in range(N):
                V[:, :, i] = np.diag(iota * (1-iota)) * (X[i, :].T*X[i, :])

        elif (dist == 'blankout'):

            # First moment of transfer distribution
            E = X

            # Second moment of transfer distribution
            V = np.zeros((D, D, N))
            for i in range(N):
                V[:, :, i] = np.diag(iota * (1-iota)) * (X[i, :].T * X[i, :])

        else:
            raise ValueError('Transfer distribution not implemented')

        return E, V

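    # Added note (sketch-level explanation, not in the original source): in
    # both branches above the per-sample second moment V[:, :, i] ends up
    # diagonal, because multiplying the diagonal matrix diag(iota*(1-iota))
    # elementwise with the broadcast row X[i, :]**2 leaves the off-diagonal
    # zeros untouched; its d-th diagonal entry is iota_d*(1-iota_d)*X[i, d]**2.
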
    def flda_log_loss(self, theta, X, y, E, V, l2=0.0):
        """
        Compute average loss for flda-log.

        INPUT   (1) array 'theta': classifier parameters (D features by 1)
                (2) array 'X': source data set (N samples by D features)
                (3) array 'y': label vector (N samples by 1)
                (4) array 'E': expected value with respect to transfer model
                    (N samples by D features)
                (5) array 'V': variance with respect to transfer model
                    (D features by D features by N samples)
                (6) float 'l2': regularization parameter (def: 0.0)
        OUTPUT  (1) float 'L': loss function value
        """
        # Data shape
        N, D = X.shape

        # Assert y in {-1,+1}
        assert np.all(np.sort(np.unique(y)) == (-1, 1))

        # Precompute terms
        Xt = np.dot(X, theta)
        Et = np.dot(E, theta)
        alpha = np.exp(Xt) + np.exp(-Xt)
        beta = np.exp(Xt) - np.exp(-Xt)
        gamma = (np.exp(Xt).T * X.T).T + (np.exp(-Xt).T * X.T).T
        delta = (np.exp(Xt).T * X.T).T - (np.exp(-Xt).T * X.T).T

        # Log-partition function
        A = np.log(alpha)

        # First-order partial derivative of log-partition w.r.t. Xt
        dA = beta / alpha

        # Second-order partial derivative of log-partition w.r.t. Xt
        d2A = 1 - beta**2 / alpha**2

        # Compute pointwise loss (negative log-likelihood)
        L = np.zeros((N, 1))
        for i in range(N):
            L[i] = -y[i] * Et[i] + A[i] + dA[i] * (Et[i] - Xt[i]) + \
                1./2*d2A[i]*np.dot(np.dot(theta.T, V[:, :, i]), theta)

        # Compute risk (average loss)
        R = np.mean(L, axis=0)

        # Add regularization
        return R + l2*np.sum(theta**2, axis=0)

    def flda_log_grad(self, theta, X, y, E, V, l2=0.0):
        """
        Compute gradient with respect to theta for flda-log.

        INPUT   (1) array 'theta': classifier parameters (D features by 1)
                (2) array 'X': source data set (N samples by D features)
                (3) array 'y': label vector (N samples by 1)
                (4) array 'E': expected value with respect to transfer model
                    (N samples by D features)
                (5) array 'V': variance with respect to transfer model
                    (D features by D features by N samples)
                (6) float 'l2': regularization parameter (def: 0.0)
        OUTPUT  (1) array 'dR': gradient of the loss with respect to theta
                    (D features by 1)
        """
        # Data shape
        N, D = X.shape

        # Assert y in {-1,+1}
        assert np.all(np.sort(np.unique(y)) == (-1, 1))

        # Precompute common terms
        Xt = np.dot(X, theta)
        Et = np.dot(E, theta)
        alpha = np.exp(Xt) + np.exp(-Xt)
        beta = np.exp(Xt) - np.exp(-Xt)
        gamma = (np.exp(Xt).T * X.T).T + (np.exp(-Xt).T * X.T).T
        delta = (np.exp(Xt).T * X.T).T - (np.exp(-Xt).T * X.T).T

        # Log-partition function
        A = np.log(alpha)

        # First-order partial derivative of log-partition w.r.t. Xt
        dA = beta / alpha

        # Second-order partial derivative of log-partition w.r.t. Xt
        d2A = 1 - beta**2 / alpha**2

        dR = 0
        for i in range(N):

            # Compute gradient terms
            t1 = -y[i]*E[i, :].T

            t2 = beta[i] / alpha[i] * X[i, :].T

            t3 = (gamma[i, :] / alpha[i] - beta[i]*delta[i, :] /
                  alpha[i]**2).T * (Et[i] - Xt[i])

            t4 = beta[i] / alpha[i] * (E[i, :] - X[i, :]).T

            t5 = (1 - beta[i]**2 / alpha[i]**2) * np.dot(V[:, :, i], theta)

            t6 = -(beta[i] * gamma[i, :] / alpha[i]**2 - beta[i]**2 *
                   delta[i, :] / alpha[i]**3).T * np.dot(np.dot(theta.T,
                                                         V[:, :, i]), theta)

            dR += t1 + t2 + t3 + t4 + t5 + t6

        # Add regularization
        dR += l2*2*theta

        return dR

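    # Verification sketch (added note, not part of the original library code):
    # the analytical gradient above can be compared against a numerical one
    # via scipy.optimize.check_grad. For an instance 'clf' of this class and
    # small synthetic arrays X, y (labels in {-1, +1}), E and V of matching
    # shapes, something along these lines:
    #
    #   from scipy.optimize import check_grad
    #   err = check_grad(
    #       lambda t: float(clf.flda_log_loss(t, X, y, E, V)),
    #       lambda t: clf.flda_log_grad(t, X, y, E, V),
    #       np.zeros(X.shape[1]))
    #
    # A small 'err' (relative to the gradient norm) suggests loss and
    # gradient are mutually consistent.
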
    def fit(self, X, y, Z):
        """
        Fit/train a feature-level domain-adaptive classifier.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'y': source labels (N samples by 1)
                (3) array 'Z': target data (M samples by D features)
        OUTPUT  None
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Map labels to a one-vs-rest encoding in {-1, +1}
        Y, labels = one_hot(y, one_not=True)

        # Number of classes
        K = len(labels)

        # Compute transfer distribution parameters
        iota = self.mle_transfer_dist(X, Z)

        # Compute moments of transfer distribution
        E, V = self.moments_transfer_model(X, iota)

        # Select loss function
        if (self.loss == 'logistic'):

            # Preallocate parameter array
            theta = np.random.randn(DX, K)

            # Train a classifier for each class
            for k in range(K):

                # Shorthand for loss computation
                def L(theta): return self.flda_log_loss(theta, X, Y[:, k],
                                                        E, V, l2=self.l2)

                # Shorthand for gradient computation
                def J(theta): return self.flda_log_grad(theta, X, Y[:, k],
                                                        E, V, l2=self.l2)

                # Call scipy's minimizer
                results = minimize(L, theta[:, k], jac=J, method='BFGS',
                                   options={'gtol': self.tolerance,
                                            'disp': self.verbose})

                # Store resultant classifier parameters
                theta[:, k] = results.x

        elif (self.loss == 'quadratic'):

            # Compute closed-form least-squares solution
            theta = np.dot(np.linalg.inv(np.dot(E.T, E) + np.sum(V, axis=2) +
                                         self.l2*np.eye(DX)),
                           np.dot(E.T, Y))

        else:
            raise ValueError('Loss function unknown.')

        # Store trained classifier parameters
        self.theta = theta

        # Store classes
        self.classes = labels

        # Mark classifier as trained
        self.is_trained = True

        # Store training data dimensionality
        self.train_data_dim = DX

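    # Added note (not in the original source): the quadratic-loss branch of
    # fit() implements the regularized least-squares solution
    #     theta = (E'E + sum_i V_i + l2*I)^(-1) E'Y,
    # i.e. ridge regression on the expected (transferred) features, with the
    # summed per-sample variances acting as an additional, data-dependent
    # regularizer.
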
    def predict(self, Z_):
        """
        Make predictions on new dataset.

        INPUT   (1) array 'Z_': new data set (M samples by D features)
        OUTPUT  (1) array 'preds': label predictions (M samples by 1)
        """
        # Data shape
        M, D = Z_.shape

        # If classifier is trained, check for same dimensionality
        if self.is_trained:
            assert self.train_data_dim == D
        else:
            raise UserWarning('Classifier is not trained yet.')

        # Predict target labels
        preds = np.argmax(np.dot(Z_, self.theta), axis=1)

        # Map predictions back to labels
        preds = self.classes[preds]

        # Return predictions array
        return preds

    def get_params(self):
        """Get classifier parameters."""
        return {'l2': self.l2, 'loss': self.loss,
                'transfer_model': self.transfer_model,
                'max_iter': self.max_iter, 'tolerance': self.tolerance,
                'verbose': self.verbose}

    def is_trained(self):
        """Check whether classifier is trained."""
        # Note: the boolean attribute 'is_trained' set in __init__ shadows
        # this method on instances, so the flag is typically read directly
        # as an attribute.
        return self.is_trained
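
# Minimal usage sketch (added example, not part of the original module). It
# assumes this file lives inside its usual package so that the relative
# import of 'one_hot' from .util resolves; X, y and Z below are synthetic
# stand-ins for source data, source labels and unlabeled target data.
if __name__ == '__main__':
    rnd = np.random.RandomState(0)

    # Source domain: two classes, 50 samples, 5 non-negative count features
    X = rnd.poisson(lam=2.0, size=(50, 5)).astype(float)
    y = np.concatenate((np.zeros(25, dtype=int), np.ones(25, dtype=int)))

    # Target domain: same features, but values are blanked out more often
    Z = rnd.poisson(lam=1.0, size=(40, 5)).astype(float)

    # Train a feature-level domain-adaptive classifier and predict on Z
    clf = FeatureLevelDomainAdaptiveClassifier(l2=0.1, verbose=False)
    clf.fit(X, y, Z)
    preds = clf.predict(Z)
    print('Predicted label counts:', np.bincount(preds))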