#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
import scipy.stats as st
from scipy.optimize import minimize
from scipy.sparse.linalg import eigs
from scipy.spatial.distance import cdist
import sklearn as sk
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_predict
from os.path import basename

from .util import is_pos_def, one_hot


class FeatureLevelDomainAdaptiveClassifier(object):
    """
    Class of feature-level domain-adaptive classifiers.

    Reference: Kouw, Krijthe, Loog & Van der Maaten (2016). Feature-level
    domain adaptation. JMLR.

    Methods contain training and prediction functions.
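
    Example (illustrative sketch, not from the original documentation;
    assumes `X` and `Z` are numpy arrays of shape (N, D) and (M, D), and
    `y` holds the N source labels):

    >>> clf = FeatureLevelDomainAdaptiveClassifier(l2=0.1, loss='logistic',
    ...                                            transfer_model='blankout')
    >>> clf.fit(X, y, Z)
    >>> preds = clf.predict(Z)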
    """

    def __init__(self, l2=0.0, loss='logistic', transfer_model='blankout',
                 max_iter=100, tolerance=1e-5, verbose=True):
        """
        Set classifier instance parameters.

        INPUT   (1) float 'l2': l2-regularization parameter value (def: 0.0)
                (2) str 'loss': loss function for classifier, options are
                    'logistic' or 'quadratic' (def: 'logistic')
                (3) str 'transfer_model': distribution to use for transfer
                    model, options are 'dropout' and 'blankout'
                    (def: 'blankout')
                (4) int 'max_iter': maximum number of iterations (def: 100)
                (5) float 'tolerance': convergence criterion threshold on x
                    (def: 1e-5)
                (6) boolean 'verbose': report training progress (def: True)
        OUTPUT  None
        """
        # Classifier choices
        self.l2 = l2
        self.loss = loss
        self.transfer_model = transfer_model

        # Optimization parameters
        self.max_iter = max_iter
        self.tolerance = tolerance

        # Whether model has been trained
        self.is_trained = False

        # Dimensionality of training data
        self.train_data_dim = 0

        # Classifier parameters
        self.theta = 0

        # Verbosity
        self.verbose = verbose

    def mle_transfer_dist(self, X, Z, dist='blankout'):
        """
        Maximum likelihood estimation of transfer model parameters.

        INPUT   (1) array 'X': source data set (N samples by D features)
                (2) array 'Z': target data set (M samples by D features)
                (3) str 'dist': distribution of transfer model, options are
                    'blankout' or 'dropout' (def: 'blankout')
        OUTPUT  (1) array 'iota': estimated transfer model parameters
                    (D features by 1)
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Blankout and dropout have same maximum likelihood estimator
        if (dist == 'blankout') or (dist == 'dropout'):

            # Rate parameters
            eta = np.mean(X > 0, axis=0)
            zeta = np.mean(Z > 0, axis=0)

            # Ratio of rate parameters
            iota = np.clip(1 - zeta / eta, 0, None)
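            # eta and zeta are the empirical probabilities of a feature being
            # nonzero in the source and target data. Under dropout/blankout
            # corruption with rate iota, the target nonzero-rate is
            # (1 - iota) * eta, so solving zeta = (1 - iota) * eta yields
            # iota = 1 - zeta / eta, clipped at zero to keep a valid rate.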

        else:
            raise ValueError('Distribution unknown.')

        return iota

    def moments_transfer_model(self, X, iota, dist='blankout'):
        """
        Moments of the transfer model.

        INPUT   (1) array 'X': data set (N samples by D features)
                (2) array 'iota': transfer model parameters (D features by 1)
                (3) str 'dist': transfer model, options are 'dropout' and
                    'blankout' (def: 'blankout')
        OUTPUT  (1) array 'E': expected value of transfer model (N samples by
                    D features)
                (2) array 'V': variance of transfer model (D features by D
                    features by N samples)
        """
        # Data shape
        N, D = X.shape

        if (dist == 'dropout'):

            # First moment of transfer distribution
            E = (1-iota) * X

            # Second moment of transfer distribution
            V = np.zeros((D, D, N))
            for i in range(N):
                V[:, :, i] = np.diag(iota * (1-iota)) * (X[i, :].T*X[i, :])

        elif (dist == 'blankout'):

            # First moment of transfer distribution
            E = X

            # Second moment of transfer distribution
            V = np.zeros((D, D, N))
            for i in range(N):
                V[:, :, i] = np.diag(iota * (1+iota)) * (X[i, :].T * X[i, :])

        else:
            raise ValueError('Transfer distribution not implemented')

        return E, V

    def flda_log_loss(self, theta, X, y, E, V, l2=0.0):
        """
        Compute average loss for flda-log.

        INPUT   (1) array 'theta': classifier parameters (D features by 1)
                (2) array 'X': source data set (N samples by D features)
                (3) array 'y': label vector (N samples by 1)
                (4) array 'E': expected value with respect to transfer model
                    (N samples by D features)
                (5) array 'V': variance with respect to transfer model
                    (D features by D features by N samples)
                (6) float 'l2': regularization parameter (def: 0.0)
        OUTPUT  (1) float 'L': loss function value
        """
        # Data shape
        N, D = X.shape

        # Assert y in {-1,+1}
        assert np.all(np.sort(np.unique(y)) == (-1, 1))

        # Precompute terms
        Xt = np.dot(X, theta)
        Et = np.dot(E, theta)
        alpha = np.exp(Xt) + np.exp(-Xt)
        beta = np.exp(Xt) - np.exp(-Xt)
        gamma = (np.exp(Xt).T * X.T).T + (np.exp(-Xt).T * X.T).T
        delta = (np.exp(Xt).T * X.T).T - (np.exp(-Xt).T * X.T).T

        # Log-partition function
        A = np.log(alpha)

        # First-order partial derivative of log-partition w.r.t. Xt
        dA = beta / alpha

        # Second-order partial derivative of log-partition w.r.t. Xt
        d2A = 1 - beta**2 / alpha**2

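        # The per-sample loss below is the expected logistic loss under the
        # transfer model, approximated by a second-order Taylor expansion of
        # the log-partition function A around the observed score Xt:
        #   L_i ~= -y_i*Et_i + A_i + dA_i*(Et_i - Xt_i)
        #          + 1/2 * d2A_i * theta' V_i theta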
        # Compute pointwise loss (negative log-likelihood)
        L = np.zeros((N, 1))
        for i in range(N):
            L[i] = -y[i] * Et[i] + A[i] + dA[i] * (Et[i] - Xt[i]) + \
                1./2*d2A[i]*np.dot(np.dot(theta.T, V[:, :, i]), theta)

        # Compute risk (average loss)
        R = np.mean(L, axis=0)

        # Add regularization
        return R + l2*np.sum(theta**2, axis=0)

    def flda_log_grad(self, theta, X, y, E, V, l2=0.0):
        """
        Compute gradient with respect to theta for flda-log.

        INPUT   (1) array 'theta': classifier parameters (D features by 1)
                (2) array 'X': source data set (N samples by D features)
                (3) array 'y': label vector (N samples by 1)
                (4) array 'E': expected value with respect to transfer model
                    (N samples by D features)
                (5) array 'V': variance with respect to transfer model
                    (D features by D features by N samples)
                (6) float 'l2': regularization parameter (def: 0.0)
        OUTPUT  (1) array 'dR': gradient of the loss with respect to theta
                    (D features by 1)
        """
        # Data shape
        N, D = X.shape

        # Assert y in {-1,+1}
        assert np.all(np.sort(np.unique(y)) == (-1, 1))

        # Precompute common terms
        Xt = np.dot(X, theta)
        Et = np.dot(E, theta)
        alpha = np.exp(Xt) + np.exp(-Xt)
        beta = np.exp(Xt) - np.exp(-Xt)
        gamma = (np.exp(Xt).T * X.T).T + (np.exp(-Xt).T * X.T).T
        delta = (np.exp(Xt).T * X.T).T - (np.exp(-Xt).T * X.T).T

        # Log-partition function
        A = np.log(alpha)

        # First-order partial derivative of log-partition w.r.t. Xt
        dA = beta / alpha

        # Second-order partial derivative of log-partition w.r.t. Xt
        d2A = 1 - beta**2 / alpha**2

        dR = 0
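        # Accumulate per-sample gradients. The terms t1-t6 below differentiate
        # the successive terms of the Taylor-approximated loss (see
        # flda_log_loss) with respect to theta; per sample, gamma is the
        # derivative of beta and delta is the derivative of alpha.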
        for i in range(N):

            # Compute gradient terms
            t1 = -y[i]*E[i, :].T

            t2 = beta[i] / alpha[i] * X[i, :].T

            t3 = (gamma[i, :] / alpha[i] - beta[i]*delta[i, :] /
                  alpha[i]**2).T * (Et[i] - Xt[i])

            t4 = beta[i] / alpha[i] * (E[i, :] - X[i, :]).T

            t5 = (1 - beta[i]**2 / alpha[i]**2) * np.dot(V[:, :, i], theta)

            t6 = -(beta[i] * gamma[i, :] / alpha[i]**2 - beta[i]**2 *
                   delta[i, :] / alpha[i]**3).T * \
                np.dot(np.dot(theta.T, V[:, :, i]), theta)

            dR += t1 + t2 + t3 + t4 + t5 + t6

        # Average over samples so the gradient matches the mean loss
        dR /= N

        # Add regularization
        dR += l2*2*theta

        return dR

    def fit(self, X, y, Z):
        """
        Fit/train a feature-level domain-adaptive classifier.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'y': source labels (N samples by 1)
                (3) array 'Z': target data (M samples by D features)
        OUTPUT  None
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Number of classes
        K = len(np.unique(y))

        # Map to one-not encoding (one column per class)
        Y = one_hot(y, one_not=True)
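        # Note: the per-class losses below assert labels in {-1, +1}, so each
        # column Y[:, k] is assumed to hold +1 for samples of class k and -1
        # otherwise (the 'one-not' variant of one-hot encoding).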

        # Compute transfer distribution parameters
        iota = self.mle_transfer_dist(X, Z, dist=self.transfer_model)

        # Compute moments of transfer distribution
        E, V = self.moments_transfer_model(X, iota, dist=self.transfer_model)

        # Select loss function
        if (self.loss == 'logistic'):

            # Preallocate parameter array
            theta = np.random.randn(DX, K)

            # Train a classifier for each class
            for k in range(K):

                # Shorthand for loss computation
                def L(theta): return self.flda_log_loss(theta, X, Y[:, k],
                                                        E, V, l2=self.l2)

                # Shorthand for gradient computation
                def J(theta): return self.flda_log_grad(theta, X, Y[:, k],
                                                        E, V, l2=self.l2)

                # Call scipy's minimizer
                results = minimize(L, theta[:, k], jac=J, method='BFGS',
                                   options={'gtol': self.tolerance,
                                            'maxiter': self.max_iter,
                                            'disp': self.verbose})

                # Store resultant classifier parameters
                theta[:, k] = results.x

        elif (self.loss == 'quadratic'):

            # Compute closed-form least-squares solution
            theta = np.dot(np.linalg.inv(np.dot(E.T, E) + np.sum(V, axis=2) +
                                         self.l2*np.eye(DX)), np.dot(E.T, Y))
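            # These are the normal equations of the expected quadratic loss
            # under the transfer model:
            #   (E'E + sum_i V_i + l2*I) theta = E'Y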

        else:
            raise ValueError('Loss function not implemented.')

        # Store trained classifier parameters
        self.theta = theta

        # Mark classifier as trained
        self.is_trained = True

        # Store training data dimensionality
        self.train_data_dim = DX

    def predict(self, Z_):
        """
        Make predictions on new dataset.

        INPUT   (1) array 'Z_': new data set (M samples by D features)
        OUTPUT  (1) array 'preds': label predictions (M samples by 1)
        """
        # Data shape
        M, D = Z_.shape

        # If classifier is trained, check for same dimensionality
        if self.is_trained:
            assert self.train_data_dim == D
        else:
            raise UserWarning('Classifier is not trained yet.')

        # Predict target labels
        preds = np.argmax(np.dot(Z_, self.theta), axis=1)
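        # Note: argmax returns column indices 0..K-1; mapping these back to
        # the original label values depends on the column ordering produced
        # by one_hot during training.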

        # Return predictions array
        return preds

    def get_params(self):
        """Get classifier hyperparameters."""
        return {'l2': self.l2, 'loss': self.loss,
                'transfer_model': self.transfer_model,
                'max_iter': self.max_iter, 'tolerance': self.tolerance,
                'verbose': self.verbose}

    def is_trained(self):
        """Check whether classifier is trained."""
        return self.is_trained