#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
import scipy.stats as st
from scipy.spatial.distance import cdist
import sklearn as sk
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_predict
from os.path import basename
from cvxopt import matrix, solvers

from .util import is_pos_def


class ImportanceWeightedClassifier(object):
    """
    Class of importance-weighted classifiers.

    Methods contain different importance-weight estimators and different loss
    functions.
    """

    def __init__(self, loss='logistic', l2=1.0, iwe='lr', smoothing=True,
                 clip=-1, kernel_type='rbf', bandwidth=1):
        """
        Select a particular type of importance-weighted classifier.

        Parameters
        ----------
        loss : str
            loss function for weighted classifier, options: 'logistic',
            'quadratic', 'hinge' (def: 'logistic')
        l2 : float
            l2-regularization parameter value (def: 1.0)
        iwe : str
            importance weight estimator, options: 'lr', 'nn', 'rg', 'kmm',
            'kde' (def: 'lr')
        smoothing : bool
            whether to apply Laplace smoothing to the nearest-neighbour
            importance-weight estimator (def: True)
        clip : float
            maximum allowable importance-weight value; if set to -1, then the
            weights are not clipped (def: -1)
        kernel_type : str
            what type of kernel to use for kernel density estimation or
            kernel mean matching, options: 'diste', 'rbf' (def: 'rbf')
        bandwidth : float
            kernel bandwidth parameter value for kernel-based weight
            estimators (def: 1)

        Returns
        -------
        None

        Examples
        --------
        >>> clf = ImportanceWeightedClassifier()

        """
        self.loss = loss
        self.l2 = l2
        self.iwe = iwe
        self.smoothing = smoothing
        self.clip = clip
        self.kernel_type = kernel_type
        self.bandwidth = bandwidth

        # Initialize untrained classifiers based on choice of loss function
        if self.loss == 'logistic':
            # Logistic regression model
            self.clf = LogisticRegression()
        elif self.loss == 'quadratic':
            # Least-squares model
            self.clf = LinearRegression()
        elif self.loss == 'hinge':
            # Linear support vector machine
            self.clf = LinearSVC()
        else:
            # Other loss functions are not implemented
            raise NotImplementedError('Loss function not implemented.')

        # Whether model has been trained
        self._is_trained = False

        # Dimensionality of training data
        self.train_data_dim = None

    def iwe_ratio_gaussians(self, X, Z):
        """
        Estimate importance weights based on a ratio of Gaussian distributions.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        iw : array
            importance weights (N samples by 1)

        Examples
        --------
        X = np.random.randn(10, 2)
        Z = np.random.randn(10, 2)
        clf = ImportanceWeightedClassifier()
        iw = clf.iwe_ratio_gaussians(X, Z)

        """
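        # Parametric estimation: fit a Gaussian to each domain and weight
        # each source sample by the density ratio
        # w(x) = N(x | mu_Z, Si_Z) / N(x | mu_X, Si_X).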
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Sample means in each domain
        mu_X = np.mean(X, axis=0)
        mu_Z = np.mean(Z, axis=0)

        # Sample covariances
        Si_X = np.cov(X.T)
        Si_Z = np.cov(Z.T)

        # Check for positive-definiteness of covariance matrices
        if not (is_pos_def(Si_X) and is_pos_def(Si_Z)):
            print('Warning: covariance matrices not positive-definite.')

            regct = -6
            while not (is_pos_def(Si_X) and is_pos_def(Si_Z)):
                print('Adding regularization: ' + str(10.**regct))

                # Add regularization
                Si_X += np.eye(DX)*10.**regct
                Si_Z += np.eye(DZ)*10.**regct

                # Increment regularization counter
                regct += 1

        # Compute probability of X under each domain
        pT = st.multivariate_normal.pdf(X, mu_Z, Si_Z)
        pS = st.multivariate_normal.pdf(X, mu_X, Si_X)

        # Check for numerical problems
        if np.any(np.isnan(pT)) or np.any(pT == 0):
            raise ValueError('Target probabilities are NaN or 0.')
        if np.any(np.isnan(pS)) or np.any(pS == 0):
            raise ValueError('Source probabilities are NaN or 0.')

        # Return the ratio of probabilities
        return pT / pS

    def iwe_kernel_densities(self, X, Z):
        """
        Estimate importance weights based on kernel density estimation.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        iw : array
            importance weights (N samples by 1)

        Examples
        --------
        X = np.random.randn(10, 2)
        Z = np.random.randn(10, 2)
        clf = ImportanceWeightedClassifier()
        iw = clf.iwe_kernel_densities(X, Z)

        """
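        # Nonparametric analogue of the Gaussian ratio: fit a Gaussian kernel
        # density estimate to each domain and weight each source sample by
        # pT(x) / pS(x).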
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Evaluate each domain's kernel density estimate on the source samples
        pT = st.gaussian_kde(Z.T).pdf(X.T)
        pS = st.gaussian_kde(X.T).pdf(X.T)

        # Check for numerical problems
        if np.any(np.isnan(pT)) or np.any(pT == 0):
            raise ValueError('Target probabilities are NaN or 0.')
        if np.any(np.isnan(pS)) or np.any(pS == 0):
            raise ValueError('Source probabilities are NaN or 0.')

        # Return the ratio of probabilities
        return pT / pS

    def iwe_logistic_discrimination(self, X, Z):
        """
        Estimate importance weights based on logistic regression.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        iw : array
            importance weights (N samples by 1)

        Examples
        --------
        X = np.random.randn(10, 2)
        Z = np.random.randn(10, 2)
        clf = ImportanceWeightedClassifier()
        iw = clf.iwe_logistic_discrimination(X, Z)

        """
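        # Discriminative estimation: train a classifier to separate source
        # (label 0) from target (label 1) samples and use its cross-validated
        # probability of the target domain as the importance weight.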
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Make domain-label variable
        y = np.concatenate((np.zeros((N, 1)),
                            np.ones((M, 1))), axis=0)

        # Concatenate data
        XZ = np.concatenate((X, Z), axis=0)

        # Call a logistic regressor (note: sklearn's C is an inverse
        # regularization strength)
        lr = LogisticRegression(C=self.l2)

        # Predict probability of belonging to target using cross-validation
        preds = cross_val_predict(lr, XZ, y[:, 0], method='predict_proba')

        # Return target-domain probabilities for the source samples
        return preds[:N, 1]

    def iwe_nearest_neighbours(self, X, Z):
        """
        Estimate importance weights based on nearest-neighbours.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        iw : array
            importance weights (N samples by 1)

        Examples
        --------
        X = np.random.randn(10, 2)
        Z = np.random.randn(10, 2)
        clf = ImportanceWeightedClassifier()
        iw = clf.iwe_nearest_neighbours(X, Z)

        """
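        # Voronoi-based estimation: each source sample is weighted by the
        # number of target samples for which it is the nearest source sample.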
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Compute Euclidean distances between all source and target samples
        d = cdist(X, Z, metric='euclidean')

        # Count target samples within each source Voronoi cell
        ix = np.argmin(d, axis=0)
        iw, _ = np.histogram(ix, np.arange(N+1))

        # Laplace smoothing
        if self.smoothing:
            iw = (iw + 1.) / (N + 1)

        # Weight clipping
        if self.clip > 0:
            iw = np.minimum(self.clip, np.maximum(0, iw))

        # Return weights
        return iw

    def iwe_kernel_mean_matching(self, X, Z):
        """
        Estimate importance weights based on kernel mean matching.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        iw : array
            importance weights (N samples by 1)

        Examples
        --------
        X = np.random.randn(10, 2)
        Z = np.random.randn(10, 2)
        clf = ImportanceWeightedClassifier()
        iw = clf.iwe_kernel_mean_matching(X, Z)

        """
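        # Kernel mean matching minimizes the distance in feature space
        # between the weighted mean of the source samples and the mean of
        # the target samples:
        #
        #   min_w || (1/N) sum_i w_i phi(x_i) - (1/M) sum_j phi(z_j) ||^2
        #
        # Expanding the square yields the quadratic program
        #
        #   min_w (1/2) w'K_XX w - kappa'w,  kappa_i = (N/M) sum_j k(x_i, z_j)
        #
        # subject to w_i >= 0 and |sum_i w_i - N| <= N*eps, with eps = 1/sqrt(N).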
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Compute squared pairwise distances between samples
        KXX = cdist(X, X, metric='sqeuclidean')
        KXZ = cdist(X, Z, metric='sqeuclidean')

        # Check non-negative distances
        if not np.all(KXX >= 0):
            raise ValueError('Negative distance in source kernel.')
        if not np.all(KXZ >= 0):
            raise ValueError('Negative distance in source-target kernel.')

        # Compute kernels
        if self.kernel_type == 'rbf':
            # Radial basis functions
            KXX = np.exp(-KXX / (2*self.bandwidth**2))
            KXZ = np.exp(-KXZ / (2*self.bandwidth**2))

        # Collapse second kernel and normalize
        KXZ = float(N)/M * np.sum(KXZ, axis=1)

        # Prepare for CVXOPT, which solves min_x (1/2) x'Qx + p'x s.t. Gx <= h
        Q = matrix(KXX, tc='d')
        p = matrix(-KXZ, tc='d')
        G = matrix(np.concatenate((np.ones((1, N)), -1.*np.ones((1, N)),
                                   -1.*np.eye(N)), axis=0), tc='d')
        h = matrix(np.concatenate((np.array([N/np.sqrt(N) + N], ndmin=2),
                                   np.array([N/np.sqrt(N) - N], ndmin=2),
                                   np.zeros((N, 1))), axis=0), tc='d')

        # Call quadratic program solver
        sol = solvers.qp(Q, p, G, h)

        # Return optimal coefficients as importance weights
        return np.array(sol['x'])[:, 0]

    def fit(self, X, y, Z):
        """
        Fit/train an importance-weighted classifier.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        y : array
            source labels (N samples by 1)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        None

        Examples
        --------
        X = np.random.randn(10, 2)
        y = np.hstack((-np.ones((5,)), np.ones((5,))))
        Z = np.random.randn(10, 2)
        clf = ImportanceWeightedClassifier()
        clf.fit(X, y, Z)

        """
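        # Note: each estimated weight multiplies the corresponding source
        # sample's loss during training, so source samples that resemble
        # target samples count more heavily.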
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Find importance-weights
        if self.iwe == 'lr':
            w = self.iwe_logistic_discrimination(X, Z)
        elif self.iwe == 'rg':
            w = self.iwe_ratio_gaussians(X, Z)
        elif self.iwe == 'nn':
            w = self.iwe_nearest_neighbours(X, Z)
        elif self.iwe == 'kde':
            w = self.iwe_kernel_densities(X, Z)
        elif self.iwe == 'kmm':
            w = self.iwe_kernel_mean_matching(X, Z)
        else:
            raise NotImplementedError('Estimator not implemented.')

        # Train the classifier chosen in __init__; the logistic, quadratic
        # and hinge models all accept per-sample weights
        self.clf.fit(X, y, sample_weight=w)

        # Mark classifier as trained
        self._is_trained = True

        # Store training data dimensionality
        self.train_data_dim = DX

    def predict(self, Z_):
        """
        Make predictions on new dataset.

        Parameters
        ----------
        Z_ : array
            new data set (M samples by D features)

        Returns
        -------
        preds : array
            label predictions (M samples by 1)

        Examples
        --------
        X = np.random.randn(10, 2)
        y = np.hstack((-np.ones((5,)), np.ones((5,))))
        Z = np.random.randn(10, 2)
        clf = ImportanceWeightedClassifier()
        clf.fit(X, y, Z)
        u_pred = clf.predict(Z)

        """
        # Data shape
        M, D = Z_.shape

        # If classifier is trained, check for same dimensionality
        if self._is_trained:
            if not self.train_data_dim == D:
                raise ValueError('''Test data is of different dimensionality
                                 than training data.''')

        # Call scikit-learn's predict function
        preds = self.clf.predict(Z_)

        # For the quadratic loss, map regression outputs to hard labels
        if self.loss == 'quadratic':
            preds = (np.sign(preds)+1)/2.

        # Return predictions array
        return preds

    def get_params(self):
        """Get classifier parameters."""
        return self.clf.get_params()

    def is_trained(self):
        """Check whether classifier is trained."""
        return self._is_trained
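

# Minimal usage sketch (illustrative, not part of the library): the synthetic
# Gaussian covariate shift below is made up for demonstration. Run within the
# package (e.g. `python -m <package>.<module>`) so the relative import of
# .util resolves.
if __name__ == '__main__':
    # Source domain: two Gaussian classes, labels 0 and 1
    X = np.vstack((np.random.randn(50, 2) - 1., np.random.randn(50, 2) + 1.))
    y = np.hstack((np.zeros(50), np.ones(50)))

    # Target domain: same classes, shifted marginal distribution
    Z = np.vstack((np.random.randn(50, 2) - .5, np.random.randn(50, 2) + 1.5))

    # Fit an importance-weighted logistic regressor and predict target labels
    clf = ImportanceWeightedClassifier(loss='logistic', iwe='lr')
    clf.fit(X, y, Z)
    preds = clf.predict(Z)
    print('Predicted target labels:', preds)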