Completed
Push to master (582254...17fb6a) by Wouter, created 03:58

discriminant_parameters(), rated C

Complexity:     Conditions 7
Size:           Total Lines 72
Duplication:    Lines 0, Ratio 0%
Code Coverage:  Tests 20, CRAP Score 7.2269
Importance:     Changes 1, Bugs 0, Features 0
Metric  Value
cc      7        (cyclomatic complexity)
c       1        (changes)
b       0        (bugs)
f       0        (features)
dl      0        (duplicated lines)
loc     72       (lines of code)
ccs     20       (covered statements)
cts     24       (total statements)
cp      0.8333   (coverage proportion, 20/24)
crap    7.2269   (CRAP score)
rs      5.5637
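
The CRAP score follows directly from the complexity and coverage figures above: CRAP = cc^2 * (1 - cp)^3 + cc = 7^2 * (1 - 0.8333)^3 + 7 ≈ 0.227 + 7 ≈ 7.2269. The score is therefore dominated by the method's complexity, not by missing coverage.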

How to fix: Long Method

Small methods make your code easier to understand, particularly when combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, that is usually a good sign that the commented part should be extracted into a new method, with the comment as a starting point for naming it.

Commonly applied refactorings include: Extract Method (sketched below).
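
A minimal before/after sketch of Extract Method in Python; the function names and the discount rule are hypothetical, chosen only to illustrate turning a comment into a method name:

# Before: one long function with a commented block
def order_total(orders):
    total = sum(price * qty for price, qty in orders)
    # Apply a 10% bulk discount for orders over 100 units
    units = sum(qty for _, qty in orders)
    if units > 100:
        total *= 0.9
    return total

# After: the commented block is extracted and the comment becomes the name
def order_total(orders):
    total = sum(price * qty for price, qty in orders)
    return apply_bulk_discount(total, orders)

def apply_bulk_discount(total, orders):
    """Apply a 10% bulk discount for orders over 100 units."""
    units = sum(qty for _, qty in orders)
    return total * 0.9 if units > 100 else total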

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
import numpy.linalg as al
from scipy.stats import multivariate_normal as mvn
import sklearn as sk
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_predict
from os.path import basename

from .util import one_hot, regularize_matrix


class TargetContrastivePessimisticClassifier(object):
    """
    Classifiers based on Target Contrastive Pessimistic Risk minimization.

    Methods contain models, risk functions, parameter estimation, etc.
    """

    def __init__(self, loss='lda', l2=1.0, max_iter=500, tolerance=1e-12,
                 learning_rate=1.0, rate_decay='linear', verbosity=0):
        """
        Select a particular type of TCPR classifier.

        INPUT   (1) str 'loss': loss function for TCP Risk, options: 'lda',
                    'linear discriminant analysis', 'qda', 'quadratic
                    discriminant analysis' (def: 'lda')
                (2) float 'l2': l2-regularization parameter value (def: 1.0)
                (3) int 'max_iter': maximum number of iterations for
                    optimization (def: 500)
                (4) float 'tolerance': convergence criterion on the TCP
                    parameters (def: 1e-12)
                (5) float 'learning_rate': parameter for size of update of
                    gradient (def: 1.0)
                (6) str 'rate_decay': type of learning rate decay, options:
                    'linear', 'quadratic', 'geometric', 'exponential'
                    (def: 'linear')
                (7) int 'verbosity': amount of progress reporting (def: 0)
        """
        # Classifier options
        self.loss = loss
        self.l2 = l2

        # Optimization parameters
        self.max_iter = max_iter
        self.tolerance = tolerance
        self.learning_rate = learning_rate
        self.rate_decay = rate_decay
        self.verbosity = verbosity

        if self.loss in ['linear discriminant analysis', 'lda']:

            # Set to short name
            self.loss = 'lda'

        elif self.loss in ['quadratic discriminant analysis', 'qda']:

            # Set to short name
            self.loss = 'qda'

        else:
            # Other loss functions are not implemented
            raise ValueError('Model not implemented.')

        # Initialize classifier and classes parameters
        self.parameters = []
        self.classes = []

        # Whether model has been trained
        self.is_trained = False

        # Dimensionality of training data
        self.train_data_dim = 0

    def add_intercept(self, X):
        """Add 1's to data as last features."""
        # Data shape
        N, D = X.shape

        # Check whether an intercept column is already present
        if np.any(np.sum(X, axis=0) == N):

            # Report
            print('Intercept is not the last feature. Swapping..')

            # Find which column contains the intercept
            intercept_index = np.argwhere(np.sum(X, axis=0) == N)

            # Drop it here; it is re-appended as the last column below
            X = X[:, np.setdiff1d(np.arange(D), intercept_index)]

        # Add intercept as last column
        X = np.hstack((X, np.ones((N, 1))))

        # Return augmented data and its actual new dimensionality
        # (D+1 would be wrong when an existing intercept was dropped above)
        return X, X.shape[1]

    def remove_intercept(self, X):
        """Remove 1's from data as last features."""
        # Data shape
        N, D = X.shape

        # Find which column contains the intercept
        intercept_index = np.argwhere(np.sum(X, axis=0) == N)

        # Drop the intercept column, if any
        X = X[:, np.setdiff1d(np.arange(D), intercept_index)]

        # Return reduced data and its actual new dimensionality
        # (D-1 would be wrong when no intercept column was present)
        return X, X.shape[1]

    def project_simplex(self, v, z=1.0):
        """
        Project vector onto simplex using sorting.

        Reference: "Efficient Projections onto the L1-Ball for Learning in
        High Dimensions" (Duchi, Shalev-Shwartz, Singer & Chandra, 2008).

        INPUT   (1) array 'v': vector to be projected (n dimensions by 0)
                (2) float 'z': constant (def: 1.0)
        OUTPUT  (1) array 'w': projected vector (n dimensions by 0)
        """
        # Number of dimensions
        n = v.shape[0]

        # Sort vector in descending order
        mu = np.sort(v, axis=0)[::-1]

        # Find rho
        C = np.cumsum(mu) - z
        j = np.arange(n) + 1
        rho = j[mu - C/j > 0][-1]

        # Define theta
        theta = C[mu - C/j > 0][-1] / float(rho)

        # Subtract theta from original vector and cap at 0
        w = np.maximum(v - theta, 0)

        # Return projected vector
        return w

    def learning_rate_t(self, t):
        """Compute current learning rate after decay."""
        # Select rate decay
        if self.rate_decay == 'linear':

            # Linear dropoff between t=0 and t=T
            alpha = (self.max_iter - t)/(self.learning_rate*self.max_iter)

        elif self.rate_decay == 'quadratic':

            # Quadratic dropoff between t=0 and t=T
            alpha = ((self.max_iter - t)/(self.learning_rate*self.max_iter))**2

        elif self.rate_decay == 'geometric':

            # Drop rate off inversely to time; t+1 avoids division by
            # zero on the first iteration (t=0)
            alpha = 1 / (self.learning_rate * (t + 1))

        elif self.rate_decay == 'exponential':

            # Exponential dropoff
            alpha = np.exp(-self.learning_rate * t)

        else:
            raise ValueError('Rate decay type unknown.')

        return alpha

    def risk(self, Z, theta, q):
        """
        Compute target contrastive pessimistic risk.

        INPUT   (1) array 'Z': target samples (M samples by D features)
                (2) tuple 'theta': classifier parameters (pi, mu, Si)
                (3) array 'q': soft labels (M samples by K classes)
        OUTPUT  (1) float: risk
        """
        # Number of classes
        K = q.shape[1]

        # Compute negative log-likelihood
        L = self.neg_log_likelihood(Z, theta)

        # Weight loss by soft labels
        for k in range(K):
            L[:, k] *= q[:, k]

        # Sum over weighted losses
        L = np.sum(L, axis=1)

        # Risk is average loss
        return np.mean(L, axis=0)

    def neg_log_likelihood(self, X, theta):
        """
        Compute negative log-likelihood under Gaussian distributions.

        INPUT   (1) array 'X': data (N samples by D features)
                (2) tuple 'theta': containing class proportions 'pi', class
                    means 'mu', and class-covariances 'Si'
        OUTPUT  (1) array 'L': loss (N samples by K classes)
        """
        # Unpack parameters
        pi, mu, Si = theta

        # Check if parameter sets match
        assert pi.shape[1] == mu.shape[0]
        assert mu.shape[1] == Si.shape[0]
        assert Si.shape[0] == Si.shape[1]

        # Number of classes
        K = pi.shape[1]

        # Data shape
        N, D = X.shape

        # Preallocate loss array
        L = np.zeros((N, K))

        for k in range(K):

            # Check for linear or quadratic
            if self.loss == 'lda':

                try:
                    # Probability under k-th Gaussian with shared covariance
                    probs = mvn.pdf(X, mu[k, :], Si)

                except al.LinAlgError as err:
                    print('Covariance matrix is singular. Add regularization.')
                    raise err

            elif self.loss == 'qda':

                try:
                    # Probability under k-th Gaussian with own covariance
                    probs = mvn.pdf(X, mu[k, :], Si[:, :, k])

                except al.LinAlgError as err:
                    print('Covariance matrix is singular. Add regularization.')
                    raise err

            else:
                raise ValueError('Loss unknown.')

            # Negative log-likelihood
            L[:, k] = -np.log(pi[0, k]) - np.log(probs)

        return L

    def discriminant_parameters(self, X, Y):
        """
        Estimate parameters of Gaussian distribution for discriminant analysis.

        INPUT   (1) array 'X': data array (N samples by D features)
                (2) array 'Y': label array (N samples by K classes)
        OUTPUT  (1) array 'pi': class proportions (1 by K classes)
                (2) array 'mu': class means (K classes by D features)
                (3) array 'Si': class covariances (D features by D features
                    by K classes)
        """
        # Check labels
        K = Y.shape[1]
        assert K > 1

        # Data shape
        N, D = X.shape

        # Preallocate parameter arrays
        pi = np.zeros((1, K))
        mu = np.zeros((K, D))
        Si = np.zeros((D, D, K))

        # For each class
        for k in range(K):

            # Number of samples for current class
            Nk = np.sum(Y[:, k], axis=0)

            # Check for no samples assigned to certain class
            if Nk == 0:

                # Proportion of samples for current class
                pi[0, k] = 0

                # Mean of current class
                mu[k, :] = np.zeros((1, D))

                # Covariance of current class
                Si[:, :, k] = np.eye(D, D)

            else:

                # Proportion of samples for current class
                pi[0, k] = Nk / N

                # Mean of current class
                mu[k, :] = np.dot(Y[:, k].T, X) / Nk

                # Subtract mean from data
                X_ = X - mu[k, :]

                # Diagonalize current label vector
                dYk = np.diag(Y[:, k])

                # Covariance of current class
                Si[:, :, k] = np.dot(np.dot(X_.T, dYk), X_) / Nk

                # Regularization
                Si[:, :, k] = regularize_matrix(Si[:, :, k], a=self.l2)

        # Check for linear or quadratic discriminant analysis
        if self.loss == 'lda':

            # In LDA, the class-covariance matrices are combined
            Si = self.combine_class_covariances(Si, pi)

        # Check for numerical problems
        if np.any(np.isnan(mu)) or np.any(np.isnan(Si)):
            raise RuntimeError('Parameter estimate is NaN.')

        return pi, mu, Si

    def combine_class_covariances(self, Si, pi):
        """
        Linear combination of class covariance matrices.

        INPUT   (1) array 'Si': covariance matrices (D features by D features
                    by K classes)
                (2) array 'pi': class proportions (1 by K classes)
        OUTPUT  (1) array 'Si': combined covariance matrix (D by D)
        """
        # Number of classes
        K = Si.shape[2]

        # Check that pi contains one proportion per class
        assert pi.shape[1] == K

        # For each class
        for k in range(K):

            # Weight each class-covariance
            Si[:, :, k] = Si[:, :, k] * pi[0, k]

        # Sum over weighted class-covariances
        return np.sum(Si, axis=2)

    def tcpr_da(self, X, y, Z):
        """
        Target Contrastive Pessimistic Risk - discriminant analysis.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'y': source labels (N samples by 1)
                (3) array 'Z': target data (M samples by D features)
        OUTPUT  (1) tuple 'theta': estimated classifier parameters
                    (pi, mu, Si)
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Remove intercept columns, if present
        X, DX = self.remove_intercept(X)
        Z, DZ = self.remove_intercept(Z)

        # Map labels to one-hot-encoding
        Y, classes = one_hot(y)

        # Number of classes
        K = len(classes)

        # Check for at least 2 classes
        assert K > 1

        # Estimate parameters of source model
        theta_ref = self.discriminant_parameters(X, Y)

        # Loss is negative log-likelihood under reference parameters
        L_ref = self.neg_log_likelihood(Z, theta_ref)

        # Initialize target posterior
        q = np.ones((M, K)) / K

        print('Starting TCP optimization')

        TCPRt = np.inf
        for t in range(self.max_iter):

            # Maximization phase: estimate parameters using TCP risk
            theta_tcp = self.discriminant_parameters(Z, q)

            # Minimization phase: compute loss under new parameters
            L_tcp = self.neg_log_likelihood(Z, theta_tcp)

            # Gradient is difference in losses
            Dq = L_tcp - L_ref

            # Update learning rate
            alpha = self.learning_rate_t(t)

            # Steepest descent step
            q -= alpha*Dq

            # Project back onto simplex
            for m in range(M):
                q[m, :] = self.project_simplex(q[m, :])

            # Monitor progress: risks of current parameters
            R_tcp = self.risk(Z, theta_tcp, q)
            R_ref = self.risk(Z, theta_ref, q)

            # Assert no numerical problems
            if np.isnan(R_tcp) or np.isnan(R_ref):
                raise RuntimeError('Objective is NaN.')

            # Current TCP risk
            TCPRt_ = R_tcp - R_ref

            # Change in risk difference for this iteration
            dR = al.norm(TCPRt - TCPRt_)

            # Check for change smaller than tolerance
            if (dR < self.tolerance):
                print('Broke at iteration '+str(t)+', TCP Risk = '+str(TCPRt_))
                break

            # Report progress
            if (t % 100) == 1:
                print('Iteration ' + str(t) + '/' + str(self.max_iter) +
                      ', TCP Risk = ' + str(TCPRt_))

            # Update
            TCPRt = TCPRt_

        # Return estimated parameters
        return theta_tcp

    def fit(self, X, y, Z):
        """
        Fit/train a Target Contrastive Pessimistic classifier.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'y': source labels (N samples by 1)
                (3) array 'Z': target data (M samples by D features)
        OUTPUT  None
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        if self.loss in ['lda', 'qda']:

            # Discriminant analysis model for TCPR
            self.parameters = self.tcpr_da(X, y, Z)

        else:
            # Other loss functions are not implemented
            raise ValueError('Loss function unknown.')

        # Extract and store classes
        self.classes = np.unique(y)

        # Mark classifier as trained
        self.is_trained = True

        # Store training data dimensionality
        self.train_data_dim = DX

    def predict(self, Z_):
        """
        Make predictions on new dataset.

        INPUT   (1) array 'Z_': new data set (M samples by D features)
        OUTPUT  (1) array 'preds': label predictions (M samples by 1)
        """
        # Data shape
        M, D = Z_.shape

        # If classifier is trained, check for same dimensionality
        if self.is_trained:
            assert D == self.train_data_dim

        if self.loss in ['lda', 'qda']:

            # Compute negative log-likelihood under each class distribution
            L = self.neg_log_likelihood(Z_, self.parameters)

            # Lowest negative log-likelihood corresponds to the most
            # probable class (argmax on a loss would pick the least
            # probable one)
            preds = np.argmin(L, axis=1)

        else:
            # Guard against 'preds' being undefined below
            raise ValueError('Loss function unknown.')

        # Map predictions back to original labels
        preds = self.classes[preds]

        # Return predictions array
        return preds

    def get_params(self):
        """Return classifier parameters."""
        # Check if classifier is trained
        if self.is_trained:
            return self.parameters

        else:
            # Throw soft error
            print('Classifier is not trained yet.')
            return []

    def error_rate(self, preds, u_):
        """Compute classification error rate."""
        return np.mean(preds != u_, axis=0)
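
For context, a minimal usage sketch of the class above on synthetic two-class data. The import path follows libTLDA's package layout and is an assumption here, as is the use of source labels as a rough proxy for the unavailable target labels:

import numpy as np
from libtlda.tcpr import TargetContrastivePessimisticClassifier  # assumed path

# Synthetic source domain: two Gaussian classes
rnd = np.random.RandomState(1)
X = np.vstack((rnd.randn(50, 2), rnd.randn(50, 2) + 2))
y = np.concatenate((np.zeros(50, dtype=int), np.ones(50, dtype=int)))

# Target domain: the same classes under a small covariate shift
Z = X + 0.5

clf = TargetContrastivePessimisticClassifier(loss='lda', l2=0.1)
clf.fit(X, y, Z)
preds = clf.predict(Z)

# True target labels are unavailable; source labels give a rough proxy
print('proxy error rate:', clf.error_rate(preds, y))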