1
|
|
|
#!/usr/bin/env python |
2
|
|
|
# -*- coding: utf-8 -*- |
3
|
|
|
|
4
|
1 |
|
import numpy as np |
5
|
1 |
|
import scipy.stats as st |
6
|
1 |
|
from scipy.optimize import minimize |
7
|
1 |
|
from scipy.sparse.linalg import eigs |
8
|
1 |
|
from scipy.spatial.distance import cdist |
9
|
1 |
|
import sklearn as sk |
10
|
1 |
|
from sklearn.svm import LinearSVC |
11
|
1 |
|
from sklearn.linear_model import LogisticRegression, LinearRegression |
12
|
1 |
|
from sklearn.model_selection import cross_val_predict |
13
|
1 |
|
from os.path import basename |
14
|
|
|
|
15
|
1 |
|
from .util import is_pos_def, one_hot |
16
|
|
|
|
17
|
|
|
|
18
|
1 |
|
class FeatureLevelDomainAdaptiveClassifier(object): |
19
|
|
|
""" |
20
|
|
|
Class of feature-level domain-adaptive classifiers. |
21
|
|
|
|
22
|
|
|
Reference: Kouw, Krijthe, Loog & Van der Maaten (2016). Feature-level |
23
|
|
|
domain adaptation. JMLR. |
24
|
|
|
|
25
|
|
|
Methods contain training and prediction functions. |
26
|
|
|
""" |
27
|
|
|
|
28
|
1 |
|
def __init__(self, l2=0.0, loss='logistic', transfer_model='blankout', |
29
|
|
|
max_iter=100, tolerance=1e-5, verbose=True): |
30
|
|
|
""" |
31
|
|
|
Set classifier instance parameters. |
32
|
|
|
|
33
|
|
|
INPUT (1) float 'l2': l2-regularization parameter value (def:0.01) |
34
|
|
|
(2) str 'loss': loss function for classifier, options are |
35
|
|
|
'logistic' or 'quadratic' (def: 'logistic') |
36
|
|
|
(3) str 'transfer_model': distribution to use for transfer |
37
|
|
|
model, options are 'dropout' and 'blankout' |
38
|
|
|
(def: 'blankout') |
39
|
|
|
(4) int 'max_iter': maximum number of iterations (def: 100) |
40
|
|
|
(5) float 'tolerance': convergence criterion threshold on x |
41
|
|
|
(def: 1e-5) |
42
|
|
|
(7) boolean 'verbose': report training progress (def: True) |
43
|
|
|
OUTPUT None |
44
|
|
|
""" |
45
|
|
|
# Classifier choices |
46
|
1 |
|
self.l2 = l2 |
47
|
1 |
|
self.loss = 'logistic' |
48
|
1 |
|
self.transfer_model = transfer_model |
49
|
|
|
|
50
|
|
|
# Optimization parameters |
51
|
1 |
|
self.max_iter = max_iter |
52
|
1 |
|
self.tolerance = tolerance |
53
|
|
|
|
54
|
|
|
# Whether model has been trained |
55
|
1 |
|
self.is_trained = False |
56
|
|
|
|
57
|
|
|
# Dimensionality of training data |
58
|
1 |
|
self.train_data_dim = 0 |
59
|
|
|
|
60
|
|
|
# Classifier parameters |
61
|
1 |
|
self.theta = 0 |
62
|
|
|
|
63
|
|
|
# Verbosity |
64
|
1 |
|
self.verbose = verbose |
65
|
|
|
|
66
|
1 |
|
def mle_transfer_dist(self, X, Z, dist='blankout'): |
67
|
|
|
""" |
68
|
|
|
Maximum likelihood estimation of transfer model parameters. |
69
|
|
|
|
70
|
|
|
INPUT (1) array 'X': source data set (N samples by D features) |
71
|
|
|
(2) array 'Z': target data set (M samples by D features) |
72
|
|
|
(3) str 'dist': distribution of transfer model, options are |
73
|
|
|
'blankout' or 'dropout' (def: 'blankout') |
74
|
|
|
OUTPUT (1) array 'iota': estimated transfer model parameters |
75
|
|
|
(D features by 1) |
76
|
|
|
""" |
77
|
|
|
# Data shapes |
78
|
1 |
|
N, DX = X.shape |
79
|
1 |
|
M, DZ = Z.shape |
80
|
|
|
|
81
|
|
|
# Assert equivalent dimensionalities |
82
|
1 |
|
if not DX == DZ: |
83
|
|
|
raise ValueError('Dimensionalities of X and Z should be equal.') |
84
|
|
|
|
85
|
|
|
# Blankout and dropout have same maximum likelihood estimator |
86
|
1 |
|
if (dist == 'blankout') or (dist == 'dropout'): |
87
|
|
|
|
88
|
|
|
# Rate parameters |
89
|
1 |
|
eta = np.mean(X > 0, axis=0) |
90
|
1 |
|
zeta = np.mean(Z > 0, axis=0) |
91
|
|
|
|
92
|
|
|
# Ratio of rate parameters |
93
|
1 |
|
iota = np.clip(1 - zeta / eta, 0, None) |
94
|
|
|
|
95
|
|
|
else: |
96
|
|
|
raise ValueError('Distribution unknown.') |
97
|
|
|
|
98
|
1 |
|
return iota |
99
|
|
|
|
100
|
1 |
|
def moments_transfer_model(self, X, iota, dist='blankout'): |
101
|
|
|
""" |
102
|
|
|
Moments of the transfer model. |
103
|
|
|
|
104
|
|
|
INPUT (1) array 'X': data set (N samples by D features) |
105
|
|
|
(2) array 'iota': transfer model parameters (D samples by 1) |
106
|
|
|
(3) str 'dist': transfer model, options are 'dropout' and |
107
|
|
|
'blankout' (def: 'blankout') |
108
|
|
|
OUTPUT (1) array 'E': expected value of transfer model (N samples by |
109
|
|
|
D feautures) |
110
|
|
|
(2) array 'V': variance of transfer model (D features by D |
111
|
|
|
features by N samples) |
112
|
|
|
""" |
113
|
|
|
# Data shape |
114
|
1 |
|
N, D = X.shape |
115
|
|
|
|
116
|
1 |
|
if (dist == 'dropout'): |
117
|
|
|
|
118
|
|
|
# First moment of transfer distribution |
119
|
|
|
E = (1-iota) * X |
120
|
|
|
|
121
|
|
|
# Second moment of transfer distribution |
122
|
|
|
V = np.zeros((D, D, N)) |
123
|
|
|
for i in range(N): |
124
|
|
|
V[:, :, i] = np.diag(iota * (1-iota)) * (X[i, :].T*X[i, :]) |
125
|
|
|
|
126
|
1 |
|
elif (dist == 'blankout'): |
127
|
|
|
|
128
|
|
|
# First moment of transfer distribution |
129
|
1 |
|
E = X |
130
|
|
|
|
131
|
|
|
# Second moment of transfer distribution |
132
|
1 |
|
V = np.zeros((D, D, N)) |
133
|
1 |
|
for i in range(N): |
134
|
1 |
|
V[:, :, i] = np.diag(iota * (1-iota)) * (X[i, :].T * X[i, :]) |
135
|
|
|
|
136
|
|
|
else: |
137
|
|
|
raise ValueError('Transfer distribution not implemented') |
138
|
|
|
|
139
|
1 |
|
return E, V |
140
|
|
|
|
141
|
1 |
|
def flda_log_loss(self, theta, X, y, E, V, l2=0.0): |
142
|
|
|
""" |
143
|
|
|
Compute average loss for flda-log. |
144
|
|
|
|
145
|
|
|
INPUT (1) array 'theta': classifier parameters (D features by 1) |
146
|
|
|
(2) array 'X': source data set () |
147
|
|
|
(3) array 'y': label vector (N samples by 1) |
148
|
|
|
(4) array 'E': expected value with respect to transfer model |
149
|
|
|
(N samples by D features) |
150
|
|
|
(5) array 'V': variance with respect to transfer model |
151
|
|
|
(D features by D features by N samples) |
152
|
|
|
(6) float 'l2': regularization parameter (def: 0.0) |
153
|
|
|
OUTPUT (1) float 'L': loss function value |
154
|
|
|
""" |
155
|
|
|
# Data shape |
156
|
1 |
|
N, D = X.shape |
157
|
|
|
|
158
|
|
|
# Assert y in {-1,+1} |
159
|
1 |
|
assert np.all(np.sort(np.unique(y)) == (-1, 1)) |
160
|
|
|
|
161
|
|
|
# Precompute terms |
162
|
1 |
|
Xt = np.dot(X, theta) |
163
|
1 |
|
Et = np.dot(E, theta) |
164
|
1 |
|
alpha = np.exp(Xt) + np.exp(-Xt) |
165
|
1 |
|
beta = np.exp(Xt) - np.exp(-Xt) |
166
|
1 |
|
gamma = (np.exp(Xt).T * X.T).T + (np.exp(-Xt).T * X.T).T |
167
|
1 |
|
delta = (np.exp(Xt).T * X.T).T - (np.exp(-Xt).T * X.T).T |
168
|
|
|
|
169
|
|
|
# Log-partition function |
170
|
1 |
|
A = np.log(alpha) |
171
|
|
|
|
172
|
|
|
# First-order partial derivative of log-partition w.r.t. Xt |
173
|
1 |
|
dA = beta / alpha |
174
|
|
|
|
175
|
|
|
# Second-order partial derivative of log-partition w.r.t. Xt |
176
|
1 |
|
d2A = 1 - beta**2 / alpha**2 |
177
|
|
|
|
178
|
|
|
# Compute pointwise loss (negative log-likelihood) |
179
|
1 |
|
L = np.zeros((N, 1)) |
180
|
1 |
|
for i in range(N): |
181
|
1 |
|
L[i] = -y[i] * Et[i] + A[i] + dA[i] * (Et[i] - Xt[i]) + \ |
182
|
|
|
1./2*d2A[i]*np.dot(np.dot(theta.T, V[:, :, i]), theta) |
183
|
|
|
|
184
|
|
|
# Compute risk (average loss) |
185
|
1 |
|
R = np.mean(L, axis=0) |
186
|
|
|
|
187
|
|
|
# Add regularization |
188
|
1 |
|
return R + l2*np.sum(theta**2, axis=0) |
189
|
|
|
|
190
|
1 |
|
def flda_log_grad(self, theta, X, y, E, V, l2=0.0): |
191
|
|
|
""" |
192
|
|
|
Compute gradient with respect to theta for flda-log. |
193
|
|
|
|
194
|
|
|
INPUT (1) array 'theta': classifier parameters (D features by 1) |
195
|
|
|
(2) array 'X': source data set () |
196
|
|
|
(3) array 'y': label vector (N samples by 1) |
197
|
|
|
(4) array 'E': expected value with respect to transfer model |
198
|
|
|
(N samples by D features) |
199
|
|
|
(5) array 'V': variance with respect to transfer model |
200
|
|
|
(D features by D features by N samples) |
201
|
|
|
(6) float 'l2': regularization parameter (def: 0.0) |
202
|
|
|
OUTPUT (1) float |
203
|
|
|
""" |
204
|
|
|
# Data shape |
205
|
1 |
|
N, D = X.shape |
206
|
|
|
|
207
|
|
|
# Assert y in {-1,+1} |
208
|
1 |
|
assert np.all(np.sort(np.unique(y)) == (-1, 1)) |
209
|
|
|
|
210
|
|
|
# Precompute common terms |
211
|
1 |
|
Xt = np.dot(X, theta) |
212
|
1 |
|
Et = np.dot(E, theta) |
213
|
1 |
|
alpha = np.exp(Xt) + np.exp(-Xt) |
214
|
1 |
|
beta = np.exp(Xt) - np.exp(-Xt) |
215
|
1 |
|
gamma = (np.exp(Xt).T * X.T).T + (np.exp(-Xt).T * X.T).T |
216
|
1 |
|
delta = (np.exp(Xt).T * X.T).T - (np.exp(-Xt).T * X.T).T |
217
|
|
|
|
218
|
|
|
# Log-partition function |
219
|
1 |
|
A = np.log(alpha) |
220
|
|
|
|
221
|
|
|
# First-order partial derivative of log-partition w.r.t. Xt |
222
|
1 |
|
dA = beta / alpha |
223
|
|
|
|
224
|
|
|
# Second-order partial derivative of log-partition w.r.t. Xt |
225
|
1 |
|
d2A = 1 - beta**2 / alpha**2 |
226
|
|
|
|
227
|
1 |
|
dR = 0 |
228
|
1 |
|
for i in range(N): |
229
|
|
|
|
230
|
|
|
# Compute gradient terms |
231
|
1 |
|
t1 = -y[i]*E[i, :].T |
232
|
|
|
|
233
|
1 |
|
t2 = beta[i] / alpha[i] * X[i, :].T |
234
|
|
|
|
235
|
1 |
|
t3 = (gamma[i, :] / alpha[i] - beta[i]*delta[i, :] / |
236
|
|
|
alpha[i]**2).T * (Et[i] - Xt[i]) |
237
|
|
|
|
238
|
1 |
|
t4 = beta[i] / alpha[i] * (E[i, :] - X[i, :]).T |
239
|
|
|
|
240
|
1 |
|
t5 = (1 - beta[i]**2 / alpha[i]**2) * np.dot(V[:, :, i], theta) |
241
|
|
|
|
242
|
1 |
|
t6 = -(beta[i] * gamma[i, :] / alpha[i]**2 - beta[i]**2 * |
243
|
|
|
delta[i, :] / alpha[i]**3).T * np.dot(np.dot(theta.T, |
244
|
|
|
V[:, :, i]), theta) |
245
|
|
|
|
246
|
1 |
|
dR += t1 + t2 + t3 + t4 + t5 + t6 |
247
|
|
|
|
248
|
|
|
# Add regularization |
249
|
1 |
|
dR += l2*2*theta |
250
|
|
|
|
251
|
1 |
|
return dR |
252
|
|
|
|
253
|
1 |
|
def fit(self, X, y, Z): |
254
|
|
|
""" |
255
|
|
|
Fit/train a robust bias-aware classifier. |
256
|
|
|
|
257
|
|
|
INPUT (1) array 'X': source data (N samples by D features) |
258
|
|
|
(2) array 'y': source labels (N samples by 1) |
259
|
|
|
(3) array 'Z': target data (M samples by D features) |
260
|
|
|
OUTPUT None |
261
|
|
|
""" |
262
|
|
|
# Data shapes |
263
|
1 |
|
N, DX = X.shape |
264
|
1 |
|
M, DZ = Z.shape |
265
|
|
|
|
266
|
|
|
# Assert equivalent dimensionalities |
267
|
1 |
|
if not DX == DZ: |
268
|
|
|
raise ValueError('Dimensionalities of X and Z should be equal.') |
269
|
|
|
|
270
|
|
|
# Map to one-not-encoding |
271
|
1 |
|
Y, labels = one_hot(y, one_not=True) |
272
|
|
|
|
273
|
|
|
# Number of classes |
274
|
1 |
|
K = len(labels) |
275
|
|
|
|
276
|
|
|
# Compute transfer distribution parameters |
277
|
1 |
|
iota = self.mle_transfer_dist(X, Z) |
278
|
|
|
|
279
|
|
|
# Compute moments of transfer distribution |
280
|
1 |
|
E, V = self.moments_transfer_model(X, iota) |
281
|
|
|
|
282
|
|
|
# Select loss function |
283
|
1 |
|
if (self.loss == 'logistic'): |
284
|
|
|
|
285
|
|
|
# Preallocate parameter array |
286
|
1 |
|
theta = np.random.randn(DX, K) |
287
|
|
|
|
288
|
|
|
# Train a classifier for each class |
289
|
1 |
|
for k in range(K): |
290
|
|
|
|
291
|
|
|
# Shorthand for loss computation |
292
|
1 |
|
def L(theta): return self.flda_log_loss(theta, X, Y[:, k], |
293
|
|
|
E, V, l2=self.l2) |
294
|
|
|
|
295
|
|
|
# Shorthand for gradient computation |
296
|
1 |
|
def J(theta): return self.flda_log_grad(theta, X, Y[:, k], |
297
|
|
|
E, V, l2=self.l2) |
298
|
|
|
|
299
|
|
|
# Call scipy's minimizer |
300
|
1 |
|
results = minimize(L, theta[:, k], jac=J, method='BFGS', |
301
|
|
|
options={'gtol': self.tolerance, |
302
|
|
|
'disp': self.verbose}) |
303
|
|
|
|
304
|
|
|
# Store resultant classifier parameters |
305
|
1 |
|
theta[:, k] = results.x |
306
|
|
|
|
307
|
|
|
elif (self.loss == 'quadratic'): |
308
|
|
|
|
309
|
|
|
# Compute closed-form least-squares solution |
310
|
|
|
theta = np.inv(E.T*E + np.sum(V, axis=2) + l2*np.eye(D))\ |
311
|
|
|
* (E.T * Y) |
312
|
|
|
|
313
|
|
|
# Store trained classifier parameters |
314
|
1 |
|
self.theta = theta |
315
|
|
|
|
316
|
|
|
# Store classes |
317
|
1 |
|
self.classes = labels |
318
|
|
|
|
319
|
|
|
# Mark classifier as trained |
320
|
1 |
|
self.is_trained = True |
321
|
|
|
|
322
|
|
|
# Store training data dimensionality |
323
|
1 |
|
self.train_data_dim = DX |
324
|
|
|
|
325
|
1 |
|
def predict(self, Z_): |
326
|
|
|
""" |
327
|
|
|
Make predictions on new dataset. |
328
|
|
|
|
329
|
|
|
INPUT (1) array 'Z_': new data set (M samples by D features) |
330
|
|
|
OUTPUT (1) array 'preds': label predictions (M samples by 1) |
331
|
|
|
""" |
332
|
|
|
# Data shape |
333
|
1 |
|
M, D = Z_.shape |
334
|
|
|
|
335
|
|
|
# If classifier is trained, check for same dimensionality |
336
|
1 |
|
if self.is_trained: |
337
|
1 |
|
assert self.train_data_dim == D |
338
|
|
|
else: |
339
|
|
|
raise UserWarning('Classifier is not trained yet.') |
340
|
|
|
|
341
|
|
|
# Predict target labels |
342
|
1 |
|
preds = np.argmax(np.dot(Z_, self.theta), axis=1) |
343
|
|
|
|
344
|
|
|
# Map predictions back to labels |
345
|
1 |
|
preds = self.classes[preds] |
346
|
|
|
|
347
|
|
|
# Return predictions array |
348
|
1 |
|
return preds |
349
|
|
|
|
350
|
1 |
|
def get_params(self): |
351
|
|
|
"""Get classifier parameters.""" |
352
|
|
|
return self.clf.get_params() |
353
|
|
|
|
354
|
1 |
|
def is_trained(self): |
355
|
|
|
"""Check whether classifier is trained.""" |
356
|
|
|
return self.is_trained |
357
|
|
|
|