Completed: push to master ( f50597...748d0e ) by Wouter at 06:51

iwe_logistic_discrimination()   A

Complexity      Conditions: 2
Size            Total Lines: 47
Duplication     Lines: 0, Ratio: 0 %
Code Coverage   Tests: 9, CRAP Score: 2.004
Importance      Changes: 0

Metric  Value
cc      2
c       0
b       0
f       0
dl      0
loc     47
ccs     9
cts     10
cp      0.9
crap    2.004
rs      9.0303
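
The reported CRAP score is consistent with the standard formula,
CRAP = cc^2 * (1 - cp)^3 + cc, where cc is the cyclomatic complexity and cp
is the coverage fraction from the table above. A quick check (illustrative
Python, not part of the file below):

cc = 2                              # cyclomatic complexity
cp = 0.9                            # coverage fraction (ccs/cts = 9/10)
crap = cc**2 * (1 - cp)**3 + cc
print(crap)                         # 2.004, matching the reported value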
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
import scipy.stats as st
from scipy.spatial.distance import cdist
import sklearn as sk
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_predict
from os.path import basename
from cvxopt import matrix, solvers

from .util import is_pos_def


class ImportanceWeightedClassifier(object):
    """
    Class of importance-weighted classifiers.

    Methods contain different importance-weight estimators and different loss
    functions.
    """

    def __init__(self, loss='logistic', l2=1.0, iwe='lr', smoothing=True,
                 clip=-1, kernel_type='rbf', bandwidth=1):
        """
        Select a particular type of importance-weighted classifier.

        Parameters
        ----------
        loss : str
            loss function for weighted classifier, options: 'logistic',
            'quadratic', 'hinge' (def: 'logistic')
        l2 : float
            l2-regularization parameter value (def: 1.0)
        iwe : str
            importance weight estimator, options: 'lr', 'nn', 'rg', 'kmm',
            'kde' (def: 'lr')
        smoothing : bool
            whether to apply Laplace smoothing to the nearest-neighbour
            importance-weight estimator (def: True)
        clip : float
            maximum allowable importance-weight value; if set to -1, the
            weights are not clipped (def: -1)
        kernel_type : str
            type of kernel to use for kernel density estimation or kernel
            mean matching, options: 'diste', 'rbf' (def: 'rbf')
        bandwidth : float
            kernel bandwidth parameter value for kernel-based weight
            estimators (def: 1)

        Returns
        -------
        None

        Examples
        --------
        >>> clf = ImportanceWeightedClassifier()

        """
        self.loss = loss
        self.l2 = l2
        self.iwe = iwe
        self.smoothing = smoothing
        self.clip = clip
        self.kernel_type = kernel_type
        self.bandwidth = bandwidth

        # Initialize untrained classifiers based on choice of loss function
        if self.loss == 'logistic':
            # Logistic regression model
            self.clf = LogisticRegression()
        elif self.loss == 'quadratic':
            # Least-squares model
            self.clf = LinearRegression()
        elif self.loss == 'hinge':
            # Linear support vector machine
            self.clf = LinearSVC()
        else:
            # Other loss functions are not implemented
            raise NotImplementedError('Loss function not implemented.')

        # Whether model has been trained (trailing underscore avoids
        # shadowing the is_trained() method below)
        self.is_trained_ = False

        # Dimensionality of training data
        self.train_data_dim = None

    def iwe_ratio_gaussians(self, X, Z):
        """
        Estimate importance weights based on a ratio of Gaussian distributions.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        iw : array
            importance weights (N samples by 1)

        Examples
        --------
        >>> X = np.random.randn(10, 2)
        >>> Z = np.random.randn(10, 2)
        >>> clf = ImportanceWeightedClassifier()
        >>> iw = clf.iwe_ratio_gaussians(X, Z)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Sample means in each domain
        mu_X = np.mean(X, axis=0)
        mu_Z = np.mean(Z, axis=0)

        # Sample covariances
        Si_X = np.cov(X.T)
        Si_Z = np.cov(Z.T)

        # Check for positive-definiteness of covariance matrices
        if not (is_pos_def(Si_X) and is_pos_def(Si_Z)):
            print('Warning: covariance matrices not positive-definite.')

            regct = -6
            while not (is_pos_def(Si_X) and is_pos_def(Si_Z)):
                print('Adding regularization: ' + str(10.**regct))

                # Add regularization
                Si_X += np.eye(DX)*10.**regct
                Si_Z += np.eye(DZ)*10.**regct

                # Increment regularization counter
                regct += 1

        # Compute probability of X under each domain
        pT = st.multivariate_normal.pdf(X, mu_Z, Si_Z)
        pS = st.multivariate_normal.pdf(X, mu_X, Si_X)

        # Check for numerical problems
        if np.any(np.isnan(pT)) or np.any(pT == 0):
            raise ValueError('Target probabilities are NaN or 0.')
        if np.any(np.isnan(pS)) or np.any(pS == 0):
            raise ValueError('Source probabilities are NaN or 0.')

        # Return the ratio of probabilities
        return pT / pS

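    # A note on the estimator above: under covariate shift the ideal weights
    # are w(x) = p_Z(x) / p_X(x). iwe_ratio_gaussians approximates each
    # domain's density with a single Gaussian, i.e. w(x) =
    # N(x | mu_Z, Si_Z) / N(x | mu_X, Si_X), which is exact only when both
    # domains really are Gaussian.
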
    def iwe_kernel_densities(self, X, Z):
        """
        Estimate importance weights based on kernel density estimation.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        iw : array
            importance weights (N samples by 1)

        Examples
        --------
        >>> X = np.random.randn(10, 2)
        >>> Z = np.random.randn(10, 2)
        >>> clf = ImportanceWeightedClassifier()
        >>> iw = clf.iwe_kernel_densities(X, Z)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Kernel density estimates of both domains, evaluated on source data
        pT = st.gaussian_kde(Z.T).pdf(X.T)
        pS = st.gaussian_kde(X.T).pdf(X.T)

        # Check for numerical problems
        if np.any(np.isnan(pT)) or np.any(pT == 0):
            raise ValueError('Target probabilities are NaN or 0.')
        if np.any(np.isnan(pS)) or np.any(pS == 0):
            raise ValueError('Source probabilities are NaN or 0.')

        # Return the ratio of probabilities
        return pT / pS

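    # A note on the estimator above: scipy's gaussian_kde chooses its own
    # bandwidth (Scott's rule by default), so the classifier's bandwidth
    # parameter does not influence this particular estimator.
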
    def iwe_logistic_discrimination(self, X, Z):
        """
        Estimate importance weights based on logistic regression.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        iw : array
            importance weights (N samples by 1)

        Examples
        --------
        >>> X = np.random.randn(10, 2)
        >>> Z = np.random.randn(10, 2)
        >>> clf = ImportanceWeightedClassifier()
        >>> iw = clf.iwe_logistic_discrimination(X, Z)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Make domain-label variable: 0 for source, 1 for target samples
        y = np.concatenate((np.zeros((N, 1)),
                            np.ones((M, 1))), axis=0)

        # Concatenate data
        XZ = np.concatenate((X, Z), axis=0)

        # Set up a logistic regressor (sklearn's C is the inverse
        # regularization strength)
        lr = LogisticRegression(C=self.l2)

        # Predict probability of belonging to the target domain, using
        # out-of-sample predictions from cross-validation
        probs = cross_val_predict(lr, XZ, y[:, 0], method='predict_proba')

        # Return target-domain probabilities of the source samples
        return probs[:N, 1]

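    # A note on the estimator above: with domain labels t in {0, 1}, Bayes'
    # rule gives p_Z(x) / p_X(x) = (N / M) * p(t=1|x) / p(t=0|x), so the
    # cross-validated target-domain posterior acts as a (rescaled) estimate
    # of the density ratio.
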
    def iwe_nearest_neighbours(self, X, Z):
        """
        Estimate importance weights based on nearest-neighbours.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        iw : array
            importance weights (N samples by 1)

        Examples
        --------
        >>> X = np.random.randn(10, 2)
        >>> Z = np.random.randn(10, 2)
        >>> clf = ImportanceWeightedClassifier()
        >>> iw = clf.iwe_nearest_neighbours(X, Z)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Compute Euclidean distances between source and target samples
        d = cdist(X, Z, metric='euclidean')

        # Assign each target sample to its nearest source sample
        ix = np.argmin(d, axis=0)

        # Count target samples within each source Voronoi cell
        iw, _ = np.histogram(ix, np.arange(N+1))

        # Laplace smoothing
        if self.smoothing:
            iw = (iw + 1.) / (N + 1)

        # Weight clipping
        if self.clip > 0:
            iw = np.minimum(self.clip, np.maximum(0, iw))

        # Return weights
        return iw

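    # A note on the estimator above: the histogram of nearest-neighbour
    # assignments is a simple non-parametric density-ratio estimate. Source
    # samples whose Voronoi cells capture many target samples get large
    # weights; cells without any target samples get weight zero, unless
    # Laplace smoothing lifts them to the minimum value.
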
    def iwe_kernel_mean_matching(self, X, Z):
        """
        Estimate importance weights based on kernel mean matching.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        iw : array
            importance weights (N samples by 1)

        Examples
        --------
        >>> X = np.random.randn(10, 2)
        >>> Z = np.random.randn(10, 2)
        >>> clf = ImportanceWeightedClassifier()
        >>> iw = clf.iwe_kernel_mean_matching(X, Z)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Compute sample pairwise distances
        KXX = cdist(X, X, metric='euclidean')
        KXZ = cdist(X, Z, metric='euclidean')

        # Check for negative distances
        if not np.all(KXX >= 0):
            raise ValueError('Negative distance in source kernel.')
        if not np.all(KXZ >= 0):
            raise ValueError('Negative distance in source-target kernel.')

        # Compute kernels
        if self.kernel_type == 'rbf':
            # Radial basis functions
            KXX = np.exp(-KXX / (2*self.bandwidth**2))
            KXZ = np.exp(-KXZ / (2*self.bandwidth**2))

        # Collapse second kernel and normalize
        KXZ = float(N)/M * np.sum(KXZ, axis=1)

        # Prepare for CVXOPT, which solves min_x (1/2) x'Qx + p'x subject to
        # Gx <= h; kernel mean matching minimizes (1/2) w'Kw - kappa'w, so
        # the linear term enters with a minus sign
        Q = matrix(KXX, tc='d')
        p = matrix(-KXZ, tc='d')

        # Constrain the total weight to N +/- sqrt(N) and each weight to be
        # non-negative
        G = matrix(np.concatenate((np.ones((1, N)), -1.*np.ones((1, N)),
                                   -1.*np.eye(N)), axis=0), tc='d')
        h = matrix(np.concatenate((np.array([np.sqrt(N) + N], ndmin=2),
                                   np.array([np.sqrt(N) - N], ndmin=2),
                                   np.zeros((N, 1))), axis=0), tc='d')

        # Call quadratic program solver
        sol = solvers.qp(Q, p, G, h)

        # Return optimal coefficients as importance weights
        return np.array(sol['x'])[:, 0]

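    # A note on the estimator above: kernel mean matching minimizes the
    # squared maximum mean discrepancy between reweighted source samples and
    # the target samples,
    #
    #   || (1/N) sum_i w_i phi(x_i) - (1/M) sum_j phi(z_j) ||^2,
    #
    # which expands (up to constants) to (1/2) w'Kw - kappa'w with
    # K = k(X, X) and kappa_i = (N/M) sum_j k(x_i, z_j): exactly the Q and p
    # handed to CVXOPT above.
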
    def fit(self, X, y, Z):
        """
        Fit/train an importance-weighted classifier.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        y : array
            source labels (N samples by 1)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        None

        Examples
        --------
        >>> X = np.random.randn(10, 2)
        >>> y = np.concatenate((-np.ones(5), np.ones(5)))
        >>> Z = np.random.randn(10, 2)
        >>> clf = ImportanceWeightedClassifier()
        >>> clf.fit(X, y, Z)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Estimate the importance weights
        if self.iwe == 'lr':
            w = self.iwe_logistic_discrimination(X, Z)
        elif self.iwe == 'rg':
            w = self.iwe_ratio_gaussians(X, Z)
        elif self.iwe == 'nn':
            w = self.iwe_nearest_neighbours(X, Z)
        elif self.iwe == 'kde':
            w = self.iwe_kernel_densities(X, Z)
        elif self.iwe == 'kmm':
            w = self.iwe_kernel_mean_matching(X, Z)
        else:
            raise NotImplementedError('Estimator not implemented.')

        # Train the classifier on importance-weighted source data; all three
        # supported scikit-learn models take per-sample weights in fit()
        if self.loss in ('logistic', 'quadratic', 'hinge'):
            self.clf.fit(X, y, sample_weight=w)
        else:
            # Other loss functions are not implemented
            raise NotImplementedError('Loss function not implemented.')

        # Mark classifier as trained
        self.is_trained_ = True

        # Store training data dimensionality
        self.train_data_dim = DX

    def predict(self, Z_):
        """
        Make predictions on a new dataset.

        Parameters
        ----------
        Z_ : array
            new data set (M samples by D features)

        Returns
        -------
        preds : array
            label predictions (M samples by 1)

        Examples
        --------
        >>> X = np.random.randn(10, 2)
        >>> y = np.concatenate((-np.ones(5), np.ones(5)))
        >>> Z = np.random.randn(10, 2)
        >>> clf = ImportanceWeightedClassifier()
        >>> clf.fit(X, y, Z)
        >>> u_pred = clf.predict(Z)

        """
        # Data shape
        M, D = Z_.shape

        # If classifier is trained, check for same dimensionality
        if self.is_trained_:
            if not self.train_data_dim == D:
                raise ValueError('Test data is of different dimensionality '
                                 'than training data.')

        # Call scikit-learn's predict function
        preds = self.clf.predict(Z_)

        # For the quadratic loss, map regression outputs to {0, 1} labels
        if self.loss == 'quadratic':
            preds = (np.sign(preds)+1)/2.

        # Return predictions array
        return preds

    def get_params(self):
        """Get classifier parameters."""
        return self.clf.get_params()

    def is_trained(self):
        """Check whether classifier is trained."""
        return self.is_trained_
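
For reference, a minimal end-to-end sketch of how the class above is meant
to be used, on synthetic data with a small covariate shift (assuming the
class is importable from the surrounding package, e.g. libtlda; the import
path and names below are illustrative):

import numpy as np
from libtlda.iw import ImportanceWeightedClassifier

# Source domain: standard normal; target domain: shifted mean
X = np.random.randn(100, 2)
y = np.concatenate((np.zeros(50), np.ones(50)))
Z = np.random.randn(50, 2) + 0.5

# Estimate weights by logistic discrimination, fit a weighted logistic
# regression on the source data, then predict on the target samples
clf = ImportanceWeightedClassifier(loss='logistic', iwe='lr')
clf.fit(X, y, Z)
preds = clf.predict(Z)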