Completed — Push on master ( f3d068...88fa67 ) by Wouter, created 03:44

iwe_logistic_discrimination()   B

Complexity:     Conditions 2
Size:           Total Lines 40
Duplication:    Lines 0, Ratio 0 %
Code Coverage:  Tests 9, CRAP Score 2.004
Importance:     Changes 0

Metric  Value
------  ------
cc      2
c       0
b       0
f       0
dl      0
loc     40
ccs     9
cts     10
cp      0.9
crap    2.004
rs      8.8571
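
For reference, the CRAP score reported above is consistent with its standard
formula, CRAP(m) = comp(m)^2 * (1 - cov(m))^3 + comp(m): with cyclomatic
complexity comp = 2 and coverage cov = 0.9, this gives 4 * 0.001 + 2 = 2.004.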
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
import scipy.stats as st
from scipy.spatial.distance import cdist
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_predict
from cvxopt import matrix, solvers

from .util import is_pos_def


class ImportanceWeightedClassifier(object):
    """
    Class of importance-weighted classifiers.

    Methods contain different importance-weight estimators and different loss
    functions.

    Examples
    --------
    >>> X = np.random.randn(10, 2)
    >>> y = np.concatenate((-np.ones(5), np.ones(5)))
    >>> Z = np.random.randn(10, 2)
    >>> clf = ImportanceWeightedClassifier()
    >>> clf.fit(X, y, Z)
    >>> u_pred = clf.predict(Z)

    """

    def __init__(self, loss='logistic', l2=1.0, iwe='lr', smoothing=True,
                 clip=-1, kernel_type='rbf', bandwidth=1):
        """
        Select a particular type of importance-weighted classifier.

        Parameters
        ----------
        loss : str
            loss function for weighted classifier, options: 'logistic',
            'quadratic', 'hinge' (def: 'logistic')
        l2 : float
            l2-regularization parameter value (def: 1.0)
        iwe : str
            importance weight estimator, options: 'lr', 'nn', 'rg', 'kmm',
            'kde' (def: 'lr')
        smoothing : bool
            whether to apply Laplace smoothing to the nearest-neighbour
            importance-weight estimator (def: True)
        clip : float
            maximum allowable importance-weight value; if set to -1, then the
            weights are not clipped (def: -1)
        kernel_type : str
            what type of kernel to use for kernel density estimation or kernel
            mean matching, options: 'diste', 'rbf' (def: 'rbf')
        bandwidth : float
            kernel bandwidth parameter value for kernel-based weight
            estimators (def: 1)

        Returns
        -------
        None

        """
        self.loss = loss
        self.l2 = l2
        self.iwe = iwe
        self.smoothing = smoothing
        self.clip = clip
        self.kernel_type = kernel_type
        self.bandwidth = bandwidth

        # Initialize untrained classifiers based on choice of loss function
        if self.loss == 'logistic':
            # Logistic regression model
            self.clf = LogisticRegression()
        elif self.loss == 'quadratic':
            # Least-squares model
            self.clf = LinearRegression()
        elif self.loss == 'hinge':
            # Linear support vector machine
            self.clf = LinearSVC()
        else:
            # Other loss functions are not implemented
            raise NotImplementedError('Loss function not implemented.')

        # Whether model has been trained; stored privately so that the
        # is_trained() method below is not shadowed by the attribute
        self._is_trained = False

        # Dimensionality of training data
        self.train_data_dim = None

    def iwe_ratio_gaussians(self, X, Z):
        """
        Estimate importance weights based on a ratio of Gaussian distributions.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        iw : array
            importance weights (N samples by 1)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Sample means in each domain
        mu_X = np.mean(X, axis=0)
        mu_Z = np.mean(Z, axis=0)

        # Sample covariances
        Si_X = np.cov(X.T)
        Si_Z = np.cov(Z.T)

        # Check for positive-definiteness of covariance matrices
        if not (is_pos_def(Si_X) and is_pos_def(Si_Z)):
            print('Warning: covariance matrices not PSD.')

            regct = -6
            while not (is_pos_def(Si_X) and is_pos_def(Si_Z)):
                print('Adding regularization: ' + str(10.**regct))

                # Add regularization
                Si_X += np.eye(DX)*10.**regct
                Si_Z += np.eye(DZ)*10.**regct

                # Increment regularization counter
                regct += 1

        # Compute probability of X under each domain
        pT = st.multivariate_normal.pdf(X, mu_Z, Si_Z)
        pS = st.multivariate_normal.pdf(X, mu_X, Si_X)

        # Check for numerical problems
        if np.any(np.isnan(pT)) or np.any(pT == 0):
            raise ValueError('Target probabilities are NaN or 0.')
        if np.any(np.isnan(pS)) or np.any(pS == 0):
            raise ValueError('Source probabilities are NaN or 0.')

        # Return the ratio of probabilities
        return pT / pS

    def iwe_kernel_densities(self, X, Z):
        """
        Estimate importance weights based on kernel density estimation.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        array
            importance weights (N samples by 1)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Compute kernel density estimates of each domain, evaluated at the
        # source samples
        pT = st.gaussian_kde(Z.T).pdf(X.T)
        pS = st.gaussian_kde(X.T).pdf(X.T)

        # Check for numerical problems
        if np.any(np.isnan(pT)) or np.any(pT == 0):
            raise ValueError('Target probabilities are NaN or 0.')
        if np.any(np.isnan(pS)) or np.any(pS == 0):
            raise ValueError('Source probabilities are NaN or 0.')

        # Return the ratio of probabilities
        return pT / pS

    def iwe_logistic_discrimination(self, X, Z):
        """
        Estimate importance weights based on logistic regression.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        array
            importance weights (N samples by 1)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Make domain-label variable (0 = source, 1 = target)
        y = np.concatenate((np.zeros((N, 1)),
                            np.ones((M, 1))), axis=0)

        # Concatenate data
        XZ = np.concatenate((X, Z), axis=0)

        # Call a logistic regressor (note: scikit-learn's C is the inverse of
        # the regularization strength)
        lr = LogisticRegression(C=self.l2)

        # Predict probability of belonging to the target domain, using
        # out-of-fold predictions from cross-validation
        preds = cross_val_predict(lr, XZ, y[:, 0],
                                  method='predict_proba')[:, 1]

        # Return predictions for source samples
        return preds[:N]

    def iwe_nearest_neighbours(self, X, Z):
        """
        Estimate importance weights based on nearest-neighbours.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        iw : array
            importance weights (N samples by 1)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Compute Euclidean distances between all source and target samples
        d = cdist(X, Z, metric='euclidean')

        # Count target samples within each source Voronoi cell: for each
        # target sample, find the index of its nearest source sample
        ix = np.argmin(d, axis=0)
        iw, _ = np.histogram(ix, np.arange(N+1))

        # Laplace smoothing
        if self.smoothing:
            iw = (iw + 1.) / (N + 1)

        # Weight clipping
        if self.clip > 0:
            iw = np.minimum(self.clip, np.maximum(0, iw))

        # Return weights
        return iw

    def iwe_kernel_mean_matching(self, X, Z):
        """
        Estimate importance weights based on kernel mean matching.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        iw : array
            importance weights (N samples by 1)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Compute sample pairwise distances
        KXX = cdist(X, X, metric='euclidean')
        KXZ = cdist(X, Z, metric='euclidean')

        # Check for negative distances
        if not np.all(KXX >= 0):
            raise ValueError('Negative distance in source kernel.')
        if not np.all(KXZ >= 0):
            raise ValueError('Negative distance in source-target kernel.')

        # Compute kernels
        if self.kernel_type == 'rbf':
            # Gaussian radial basis functions on the squared distances
            KXX = np.exp(-KXX**2 / (2*self.bandwidth**2))
            KXZ = np.exp(-KXZ**2 / (2*self.bandwidth**2))

        # Collapse second kernel and normalize
        KXZ = N/M * np.sum(KXZ, axis=1)
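
        # Sketch of the optimization set up below (kernel mean matching,
        # Huang et al., 2007): find weights w minimizing the maximum mean
        # discrepancy
        #
        #   || (1/N) * sum_i w_i phi(x_i) - (1/M) * sum_j phi(z_j) ||^2
        #
        # in the kernel-induced feature space. Expanding the norm and dropping
        # constants gives the quadratic program
        #
        #   min_w  (1/2) w' KXX w - kappa' w,  kappa_i = (N/M) sum_j k(x_i, z_j),
        #
        # subject to w >= 0 and |sum_i w_i - N| <= sqrt(N), which is encoded
        # in the constraint matrices G and h below.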
        # Prepare for CVXOPT (p = -kappa, since solvers.qp minimizes
        # (1/2) x'Qx + p'x)
        Q = matrix(KXX, tc='d')
        p = matrix(-KXZ, tc='d')
        G = matrix(np.concatenate((np.ones((1, N)), -1.*np.ones((1, N)),
                                   -1.*np.eye(N)), axis=0), tc='d')
        h = matrix(np.concatenate((np.array([N/np.sqrt(N) + N], ndmin=2),
                                   np.array([N/np.sqrt(N) - N], ndmin=2),
                                   np.zeros((N, 1))), axis=0), tc='d')

        # Call quadratic program solver
        sol = solvers.qp(Q, p, G, h)

        # Return optimal coefficients as importance weights
        return np.array(sol['x'])[:, 0]

    def fit(self, X, y, Z):
        """
        Fit/train an importance-weighted classifier.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        y : array
            source labels (N samples by 1)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        None

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Find importance-weights
        if self.iwe == 'lr':
            w = self.iwe_logistic_discrimination(X, Z)
        elif self.iwe == 'rg':
            w = self.iwe_ratio_gaussians(X, Z)
        elif self.iwe == 'nn':
            w = self.iwe_nearest_neighbours(X, Z)
        elif self.iwe == 'kde':
            w = self.iwe_kernel_densities(X, Z)
        elif self.iwe == 'kmm':
            w = self.iwe_kernel_mean_matching(X, Z)
        else:
            raise NotImplementedError('Estimator not implemented.')

        # Train the chosen classifier with the importance weights as sample
        # weights; all three scikit-learn estimators accept sample_weight
        self.clf.fit(X, y, sample_weight=w)

        # Mark classifier as trained
        self._is_trained = True

        # Store training data dimensionality
        self.train_data_dim = DX

    def predict(self, Z):
        """
        Make predictions on new dataset.

        Parameters
        ----------
        Z : array
            new data set (M samples by D features)

        Returns
        -------
        preds : array
            label predictions (M samples by 1)

        """
        # Data shape
        M, D = Z.shape

        # If classifier is trained, check for same dimensionality
        if self._is_trained:
            if not self.train_data_dim == D:
                raise ValueError('Test data is of different dimensionality '
                                 'than training data.')

        # Call scikit-learn's predict function
        preds = self.clf.predict(Z)

        # For the quadratic loss, threshold the real-valued output at zero
        # and map the sign to {0, 1} labels
        if self.loss == 'quadratic':
            preds = (np.sign(preds)+1)/2.

        # Return predictions array
        return preds

    def get_params(self):
        """Get classifier parameters."""
        return self.clf.get_params()

    def is_trained(self):
        """Check whether classifier is trained."""
        return self._is_trained
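
A minimal usage sketch for the class above (hypothetical data; assumes the
module is importable from its package, since it uses the relative import
`from .util import is_pos_def`):

    import numpy as np

    # Labelled source samples and unlabelled, covariate-shifted target samples
    X = np.random.randn(100, 2)
    y = np.concatenate((np.zeros(50), np.ones(50)))
    Z = np.random.randn(50, 2) + 0.5

    # Importance weights via logistic discrimination, hinge-loss classifier
    clf = ImportanceWeightedClassifier(loss='hinge', iwe='lr')
    clf.fit(X, y, Z)
    preds = clf.predict(Z)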