#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
import numpy.linalg as al
from scipy.stats import multivariate_normal as mvn
import sklearn as sk
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_predict
from os.path import basename

from .util import one_hot, regularize_matrix

class TargetContrastivePessimisticClassifier(object):
    """
    Classifiers based on Target Contrastive Pessimistic Risk minimization.

    Methods contain models, risk functions, parameter estimation, etc.
    """

    def __init__(self, loss='lda', l2=1.0, max_iter=500, tolerance=1e-12,
                 learning_rate=1.0, rate_decay='linear', verbosity=0):
        """
        Select a particular type of TCPR classifier.

        Parameters
        ----------
        loss : str
            loss function for TCP Risk, options: 'lda',
            'linear discriminant analysis', 'qda',
            'quadratic discriminant analysis' (def: 'lda')
        l2 : float
            l2-regularization parameter value (def: 1.0)
        max_iter : int
            maximum number of iterations for optimization (def: 500)
        tolerance : float
            convergence criterion on the TCP parameters (def: 1e-12)
        learning_rate : float
            parameter for size of update of gradient (def: 1.0)
        rate_decay : str
            type of learning rate decay, options: 'linear', 'quadratic',
            'geometric', 'exponential' (def: 'linear')
        verbosity : int
            verbosity during optimization (def: 0)

        Returns
        -------
        None

        """
        # Classifier options
        self.loss = loss
        self.l2 = l2

        # Optimization parameters
        self.max_iter = max_iter
        self.tolerance = tolerance
        self.learning_rate = learning_rate
        self.rate_decay = rate_decay
        self.verbosity = verbosity

        if self.loss in ['linear discriminant analysis', 'lda']:

            # Set to short name
            self.loss = 'lda'

        elif self.loss in ['quadratic discriminant analysis', 'qda']:

            # Set to short name
            self.loss = 'qda'

        else:
            # Other loss functions are not implemented
            raise ValueError('Model not implemented.')

        # Initialize classifier and classes parameters
        self.parameters = []
        self.classes = []

        # Whether model has been trained
        self.is_trained = False

        # Dimensionality of training data
        self.train_data_dim = 0

    def add_intercept(self, X):
        """Add a column of 1's to the data as the last feature."""
        # Data shape
        N, D = X.shape

        # Check if there's not already an intercept column
        if np.any(np.sum(X, axis=0) == N):

            # Report
            print('Intercept is not the last feature. Swapping...')

            # Find which column contains the intercept
            intercept_index = np.argwhere(np.sum(X, axis=0) == N)

            # Remove the existing intercept column
            X = X[:, np.setdiff1d(np.arange(D), intercept_index)]

        # Append column of 1's to data as the last feature
        X = np.hstack((X, np.ones((N, 1))))

        # Return augmented data and its new dimensionality
        return X, X.shape[1]

    def remove_intercept(self, X):
        """Remove the intercept column of 1's from the data."""
        # Data shape
        N, D = X.shape

        # Find which column contains the intercept
        intercept_index = np.argwhere(np.sum(X, axis=0) == N)

        # Remove the intercept column
        X = X[:, np.setdiff1d(np.arange(D), intercept_index)]

        # Return reduced data and its new dimensionality
        return X, X.shape[1]

    def project_simplex(self, v, z=1.0):
        """
        Project vector onto simplex using sorting.

        Reference: "Efficient Projections onto the L1-Ball for Learning in
        High Dimensions" (Duchi, Shalev-Shwartz, Singer and Chandra, 2008).

        Parameters
        ----------
        v : array
            vector to be projected (n dimensions by 0)
        z : float
            constant (def: 1.0)

        Returns
        -------
        w : array
            projected vector (n dimensions by 0)

        """
        # Number of dimensions
        n = v.shape[0]

        # Sort vector in descending order
        mu = np.sort(v, axis=0)[::-1]

        # Find rho
        C = np.cumsum(mu) - z
        j = np.arange(n) + 1
        rho = j[mu - C/j > 0][-1]

        # Define theta
        theta = C[mu - C/j > 0][-1] / float(rho)

        # Subtract theta from original vector and cap at 0
        w = np.maximum(v - theta, 0)

        # Return projected vector
        return w

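    # Quick illustrative check of project_simplex (hypothetical numbers, not
    # taken from the package's test suite): for v = [0.7, 0.6, -0.1] and
    # z = 1.0, the method finds rho = 2 and theta = 0.15 and returns
    # w = [0.55, 0.45, 0.0], which is non-negative and sums to one, as a
    # soft-label vector should.
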
    def learning_rate_t(self, t):
        """
        Compute current learning rate after decay.

        Parameters
        ----------
        t : int
            current iteration

        Returns
        -------
        alpha : float
            current learning rate

        """
        # Select rate decay
        if self.rate_decay == 'linear':

            # Linear dropoff between t=0 and t=T
            alpha = (self.max_iter - t)/(self.learning_rate*self.max_iter)

        elif self.rate_decay == 'quadratic':

            # Quadratic dropoff between t=0 and t=T
            alpha = ((self.max_iter - t)/(self.learning_rate*self.max_iter))**2

        elif self.rate_decay == 'geometric':

            # Drop rate off inversely to time (guard against division by
            # zero at t=0)
            alpha = 1 / (self.learning_rate * max(t, 1))

        elif self.rate_decay == 'exponential':

            # Exponential dropoff
            alpha = np.exp(-self.learning_rate * t)

        else:
            raise ValueError('Rate decay type unknown.')

        return alpha

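    # For intuition (hypothetical settings, not defaults enforced anywhere):
    # with learning_rate=1.0 and max_iter=500, 'linear' decay starts at
    # alpha=1.0 for t=0 and ends at alpha=0.002 for t=499, 'quadratic' decays
    # to the square of that, and 'exponential' decays as exp(-t), reaching
    # roughly 4.5e-5 by t=10.
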
    def risk(self, Z, theta, q):
        """
        Compute target contrastive pessimistic risk.

        Parameters
        ----------
        Z : array
            target samples (M samples by D features)
        theta : tuple(array, array, array)
            classifier parameters (class proportions, means and covariances)
        q : array
            soft labels (M samples by K classes)

        Returns
        -------
        float
            Value of risk function.

        """
        # Number of classes
        K = q.shape[1]

        # Compute negative log-likelihood
        L = self.neg_log_likelihood(Z, theta)

        # Weight loss by soft labels
        for k in range(K):
            L[:, k] *= q[:, k]

        # Sum over weighted losses
        L = np.sum(L, axis=1)

        # Risk is average loss
        return np.mean(L, axis=0)

    def neg_log_likelihood(self, X, theta):
        """
        Compute negative log-likelihood under Gaussian distributions.

        Parameters
        ----------
        X : array
            data (N samples by D features)
        theta : tuple(array, array, array)
            tuple containing class proportions 'pi', class means 'mu',
            and class-covariances 'Si'

        Returns
        -------
        L : array
            loss (N samples by K classes)

        """
        # Unpack parameters
        pi, mu, Si = theta

        # Check if parameter sets match
        if not pi.shape[1] == mu.shape[0]:
            raise ValueError('Number of proportions does not match number '
                             'of means.')

        if not mu.shape[1] == Si.shape[0] == Si.shape[1]:
            raise ValueError('''Dimensions of means do not match dimensions of
                             covariance matrix.''')

        # Number of classes
        K = pi.shape[1]

        # Data shape
        N, D = X.shape

        # Preallocate loss array
        L = np.zeros((N, K))

        for k in range(K):

            # Check for linear or quadratic
            if self.loss == 'lda':

                try:
                    # Probability under k-th Gaussian with shared covariance
                    probs = mvn.pdf(X, mu[k, :], Si)

                except al.LinAlgError as err:
                    print('Covariance matrix is singular. Add regularization.')
                    raise err

            elif self.loss == 'qda':

                try:
                    # Probability under k-th Gaussian with own covariance
                    probs = mvn.pdf(X, mu[k, :], Si[:, :, k])

                except al.LinAlgError as err:
                    print('Covariance matrix is singular. Add regularization.')
                    raise err

            else:
                raise ValueError('Loss unknown.')

            # Negative log-likelihood of the k-th class, including its prior
            L[:, k] = -np.log(pi[0, k]) - np.log(probs)

        return L

    def discriminant_parameters(self, X, Y):
        """
        Estimate parameters of Gaussian distribution for discriminant analysis.

        Parameters
        ----------
        X : array
            data array (N samples by D features)
        Y : array
            label array (N samples by K classes)

        Returns
        -------
        pi : array
            class proportions (1 by K classes)
        mu : array
            class means (K classes by D features)
        Si : array
            class covariances (D features by D features by K classes)

        """
        # Number of classes
        K = Y.shape[1]

        # Check for sufficient labels
        if not K > 1:
            raise ValueError('Number of classes should be larger than 1.')

        # Data shape
        N, D = X.shape

        # Preallocate parameter arrays
        pi = np.zeros((1, K))
        mu = np.zeros((K, D))
        Si = np.zeros((D, D, K))

        # For each class
        for k in range(K):

            # Number of samples for current class
            Nk = np.sum(Y[:, k], axis=0)

            # Check for no samples assigned to certain class
            if Nk == 0:

                # Proportion of samples for current class
                pi[0, k] = 0

                # Mean of current class
                mu[k, :] = np.zeros((1, D))

                # Covariance of current class
                Si[:, :, k] = np.eye(D, D)

            else:

                # Proportion of samples for current class
                pi[0, k] = Nk / N

                # Mean of current class
                mu[k, :] = np.dot(Y[:, k].T, X) / Nk

                # Subtract mean from data
                X_ = X - mu[k, :]

                # Diagonalize current label vector
                dYk = np.diag(Y[:, k])

                # Covariance of current class
                Si[:, :, k] = np.dot(np.dot(X_.T, dYk), X_) / Nk

            # Regularization
            Si[:, :, k] = regularize_matrix(Si[:, :, k], a=self.l2)

        # Check for linear or quadratic discriminant analysis
        if self.loss == 'lda':

            # In LDA, the class-covariance matrices are combined
            Si = self.combine_class_covariances(Si, pi)

        # Check for numerical problems
        if np.any(np.isnan(mu)) or np.any(np.isnan(Si)):
            raise RuntimeError('Parameter estimate is NaN.')

        return pi, mu, Si

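    # In formula form, the estimates above are the soft-label (weighted)
    # maximum-likelihood estimates: pi_k = (1/N) sum_i Y_ik,
    # mu_k = sum_i Y_ik x_i / sum_i Y_ik, and
    # Si_k = sum_i Y_ik (x_i - mu_k)(x_i - mu_k)^T / sum_i Y_ik,
    # followed by regularization of each covariance; for LDA the Si_k are
    # then pooled with weights pi_k (see combine_class_covariances below).
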
    def combine_class_covariances(self, Si, pi):
        """
        Linear combination of class covariance matrices.

        Parameters
        ----------
        Si : array
            covariance matrices (D features by D features by K classes)
        pi : array
            class proportions (1 by K classes)

        Returns
        -------
        Si : array
            Combined covariance matrix (D by D)

        """
        # Number of classes
        K = Si.shape[2]

        # Check if pi has an entry for each class
        if not pi.shape[1] == K:
            raise ValueError('''Number of proportions does not match number of
                             classes of covariance matrix.''')

        # For each class
        for k in range(K):

            # Weight each class-covariance
            Si[:, :, k] = Si[:, :, k] * pi[0, k]

        # Sum over weighted class-covariances
        return np.sum(Si, axis=2)

    def tcpr_da(self, X, y, Z):
        """
        Target Contrastive Pessimistic Risk - discriminant analysis.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        y : array
            source labels (N samples by 1)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        theta : tuple(array, array, array)
            estimated classifier parameters (proportions, means, covariances)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Remove intercept columns from the data, if present
        X, DX = self.remove_intercept(X)
        Z, DZ = self.remove_intercept(Z)

        # Map labels to one-hot-encoding
        Y, classes = one_hot(y)

        # Number of classes
        K = len(classes)

        # Check for at least 2 classes
        if not K > 1:
            raise ValueError('Number of classes should be larger than 1.')

        # Estimate parameters of source model
        theta_ref = self.discriminant_parameters(X, Y)

        # Loss is negative log-likelihood under reference parameters
        L_ref = self.neg_log_likelihood(Z, theta_ref)

        # Initialize target posterior
        q = np.ones((M, K)) / K

        print('Starting TCP optimization')

        TCPRt = np.inf
        for t in range(self.max_iter):

            # Maximization phase

            # Estimate parameters using TCP risk
            theta_tcp = self.discriminant_parameters(Z, q)

            # Minimization phase

            # Compute loss under new parameters
            L_tcp = self.neg_log_likelihood(Z, theta_tcp)

            # Gradient is difference in losses
            Dq = L_tcp - L_ref

            # Update learning rate
            alpha = self.learning_rate_t(t)

            # Steepest descent step
            q -= alpha*Dq

            # Project back onto simplex
            for m in range(M):
                q[m, :] = self.project_simplex(q[m, :])

            # Monitor progress

            # Risks of current parameters
            R_tcp = self.risk(Z, theta_tcp, q)
            R_ref = self.risk(Z, theta_ref, q)

            # Assert no numerical problems
            if np.isnan(R_tcp) or np.isnan(R_ref):
                raise RuntimeError('Objective is NaN.')

            # Current TCP risk
            TCPRt_ = R_tcp - R_ref

            # Change in risk difference for this iteration
            dR = al.norm(TCPRt - TCPRt_)

            # Check for change smaller than tolerance
            if (dR < self.tolerance):
                print('Broke at iteration '+str(t)+', TCP Risk = '+str(TCPRt_))
                break

            # Report progress
            if (t % 100) == 1:
                print('Iteration ' + str(t) + '/' + str(self.max_iter) +
                      ', TCP Risk = ' + str(TCPRt_))

            # Update risk difference
            TCPRt = TCPRt_

        # Return estimated parameters
        return theta_tcp

    def fit(self, X, y, Z):
        """
        Fit/train a target contrastive pessimistic classifier.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        y : array
            source labels (N samples by 1)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        None

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        if self.loss in ['lda', 'qda']:

            # Discriminant analysis model for TCPR
            self.parameters = self.tcpr_da(X, y, Z)

        else:
            # Other loss functions are not implemented
            raise ValueError('Loss function unknown.')

        # Extract and store classes
        self.classes = np.unique(y)

        # Mark classifier as trained
        self.is_trained = True

        # Store training data dimensionality
        self.train_data_dim = DX

    def predict(self, Z_):
        """
        Make predictions on new dataset.

        Parameters
        ----------
        Z_ : array
            new data set (M samples by D features)

        Returns
        -------
        preds : array
            label predictions (M samples by 1)

        """
        # Data shape
        M, D = Z_.shape

        # If classifier is trained, check for same dimensionality
        if self.is_trained:
            if not self.train_data_dim == D:
                raise ValueError('''Test data is of different dimensionality
                                 than training data.''')

        if self.loss in ['lda', 'qda']:

            # Compute negative log-likelihood under each class distribution
            probs = self.neg_log_likelihood(Z_, self.parameters)

            # Lowest negative log-likelihood corresponds to highest
            # probability, so take the argmin as the prediction
            preds = np.argmin(probs, axis=1)

        # Map predictions back to original labels
        preds = self.classes[preds]

        # Return predictions array
        return preds

    def get_params(self):
        """Return classifier parameters."""
        # Check if classifier is trained
        if self.is_trained:
            return self.parameters

        else:
            # Throw soft error
            print('Classifier is not trained yet.')
            return []

    def error_rate(self, preds, u_):
        """Compute classification error rate."""
        return np.mean(preds != u_, axis=0)
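
# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only, not part of the original module):
# fit the classifier on labelled source data and unlabelled target data drawn
# from two shifted Gaussian classes, then report the target error rate. The
# data and settings below are made up; run this with 'python -m' from inside
# its package so the relative '.util' import resolves.
# ---------------------------------------------------------------------------
if __name__ == '__main__':

    # Reproducible synthetic data
    rnd = np.random.RandomState(42)

    # Labelled source domain: two Gaussian classes in 2 dimensions
    X = np.vstack((rnd.randn(50, 2) - 2, rnd.randn(50, 2) + 2))
    y = np.concatenate((np.zeros(50, dtype=int), np.ones(50, dtype=int)))

    # Unlabelled target domain: same classes, slightly shifted
    Z = np.vstack((rnd.randn(50, 2) - 1.5, rnd.randn(50, 2) + 2.5))
    u = np.concatenate((np.zeros(50, dtype=int), np.ones(50, dtype=int)))

    # Train a TCP-LDA classifier and evaluate on the target domain
    clf = TargetContrastivePessimisticClassifier(loss='lda', max_iter=100)
    clf.fit(X, y, Z)
    preds = clf.predict(Z)
    print('Target error rate:', clf.error_rate(preds, u))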