Completed: Push — master (f3d068...88fa67) by Wouter, created 03:44

TargetContrastivePessimisticClassifier.tcpr_da() (grade: F)

Complexity
    Conditions: 9

Size
    Total Lines: 109

Duplication
    Lines: 0
    Ratio: 0 %

Code Coverage
    Tests: 34
    CRAP Score: 9.0432

Importance
    Changes: 1
    Bugs: 0
    Features: 0
Metric    Value
cc        9
c         1
b         0
f         0
dl        0
loc       109
ccs       34
cts       37
cp        0.9189
crap      9.0432
rs        3.1304
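For reference, the reported CRAP Score is consistent with the standard CRAP formula, crap = cc^2 * (1 - cp)^3 + cc, applied to the cyclomatic complexity (cc = 9) and the coverage ratio (cp = 0.9189) listed above. A quick check in Python, using only values taken from this report:

    # CRAP(m) = comp(m)^2 * (1 - cov(m))^3 + comp(m), with coverage as a fraction
    cc = 9           # cyclomatic complexity ("Conditions" / "cc" above)
    cp = 0.9189      # statement coverage ratio ("cp" above)
    crap = cc**2 * (1 - cp)**3 + cc
    print(round(crap, 4))   # 9.0432, matching the reported CRAP Score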

How to fix: Long Method

Small methods make your code easier to understand, especially when combined with a good name. And if a method is small, finding a good name for it is usually much easier.

For example, if you find yourself adding comments to a method's body, that is usually a good sign that you should extract the commented part into a new method, using the comment as a starting point for naming it.

Commonly applied refactorings include Extract Method: pull a coherent block of the long method into its own well-named method, as sketched below.
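As an illustration only (not code from the repository), the per-row simplex projection inside tcpr_da() in the listing below could be extracted into a small helper on the same class; a minimal sketch, assuming the hypothetical helper name project_rows_onto_simplex:

    # Hypothetical Extract Method sketch; the helper name is illustrative,
    # and the method is meant to live on TargetContrastivePessimisticClassifier.
    def project_rows_onto_simplex(self, q):
        """Project each row of the soft-label matrix back onto the simplex."""
        for m in range(q.shape[0]):
            q[m, :] = self.project_simplex(q[m, :])
        return q

    # In tcpr_da(), the projection loop then collapses to a single named call:
    #     q = self.project_rows_onto_simplex(q)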

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
import numpy.linalg as al
from scipy.stats import multivariate_normal as mvn
import sklearn as sk
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_predict
from os.path import basename

from .util import one_hot, regularize_matrix

class TargetContrastivePessimisticClassifier(object):
    """
    Classifiers based on Target Contrastive Pessimistic Risk minimization.

    Methods contain models, risk functions, parameter estimation, etc.
    """

    def __init__(self, loss='lda', l2=1.0, max_iter=500, tolerance=1e-12,
                 learning_rate=1.0, rate_decay='linear', verbosity=0):
        """
        Select a particular type of TCPR classifier.

        Parameters
        ----------
        loss : str
            loss function for TCP Risk, options: 'lda',
            'linear discriminant analysis', 'qda',
            'quadratic discriminant analysis' (def: 'lda')
        l2 : float
            l2-regularization parameter value (def: 1.0)
        max_iter : int
            maximum number of iterations for optimization (def: 500)
        tolerance : float
            convergence criterion on the TCP parameters (def: 1e-12)
        learning_rate : float
            parameter for size of update of gradient (def: 1.0)
        rate_decay : str
            type of learning rate decay, options: 'linear', 'quadratic',
            'geometric', 'exponential' (def: 'linear')
        verbosity : int
            verbosity level during optimization (def: 0)

        Returns
        -------
        None

        """
        # Classifier options
        self.loss = loss
        self.l2 = l2

        # Optimization parameters
        self.max_iter = max_iter
        self.tolerance = tolerance
        self.learning_rate = learning_rate
        self.rate_decay = rate_decay
        self.verbosity = verbosity

        if self.loss in ['linear discriminant analysis', 'lda']:

            # Set to short name
            self.loss = 'lda'

        elif self.loss in ['quadratic discriminant analysis', 'qda']:

            # Set to short name
            self.loss = 'qda'

        else:
            # Other loss functions are not implemented
            raise ValueError('Model not implemented.')

        # Initialize classifier and classes parameters
        self.parameters = []
        self.classes = []

        # Whether model has been trained
        self.is_trained = False

        # Dimensionality of training data
        self.train_data_dim = 0
    def add_intercept(self, X):
        """Add 1's to data as last features."""
        # Data shape
        N, D = X.shape

        # Check whether an intercept column is already present
        if np.any(np.sum(X, axis=0) == N):

            # Report
            print('Intercept is not the last feature. Swapping..')

            # Find which column contains the intercept
            intercept_index = np.argwhere(np.sum(X, axis=0) == N)

            # Remove the existing intercept column
            X = X[:, np.setdiff1d(np.arange(D), intercept_index)]

        # Add intercept as last column
        X = np.hstack((X, np.ones((N, 1))))

        # Return augmented data and incremented dimensionality
        return X, D+1
    def remove_intercept(self, X):
        """Remove 1's from data as last features."""
        # Data shape
        N, D = X.shape

        # Find which column contains the intercept
        intercept_index = np.argwhere(np.sum(X, axis=0) == N)

        # Remove the intercept column
        X = X[:, np.setdiff1d(np.arange(D), intercept_index)]

        return X, D-1
    def project_simplex(self, v, z=1.0):
        """
        Project vector onto simplex using sorting.

        Reference: "Efficient Projections onto the L1-Ball for Learning in High
        Dimensions (Duchi, Shalev-Shwartz, Singer, Chandra, 2006)."

        Parameters
        ----------
        v : array
            vector to be projected (n,)
        z : float
            constant (def: 1.0)

        Returns
        -------
        w : array
            projected vector (n,)

        """
        # Number of dimensions
        n = v.shape[0]

        # Sort vector in descending order
        mu = np.sort(v, axis=0)[::-1]

        # Find rho
        C = np.cumsum(mu) - z
        j = np.arange(n) + 1
        rho = j[mu - C/j > 0][-1]

        # Define theta
        theta = C[mu - C/j > 0][-1] / float(rho)

        # Subtract theta from original vector and cap at 0
        w = np.maximum(v - theta, 0)

        # Return projected vector
        return w
    def learning_rate_t(self, t):
        """
        Compute current learning rate after decay.

        Parameters
        ----------
        t : int
            current iteration

        Returns
        -------
        alpha : float
            current learning rate

        """
        # Select rate decay
        if self.rate_decay == 'linear':

            # Linear dropoff between t=0 and t=T
            alpha = (self.max_iter - t)/(self.learning_rate*self.max_iter)

        elif self.rate_decay == 'quadratic':

            # Quadratic dropoff between t=0 and t=T
            alpha = ((self.max_iter - t)/(self.learning_rate*self.max_iter))**2

        elif self.rate_decay == 'geometric':

            # Drop rate off inversely with time; t+1 avoids division by zero at t=0
            alpha = 1 / (self.learning_rate * (t + 1))

        elif self.rate_decay == 'exponential':

            # Exponential dropoff
            alpha = np.exp(-self.learning_rate * t)

        else:
            raise ValueError('Rate decay type unknown.')

        return alpha
    def risk(self, Z, theta, q):
        """
        Compute target contrastive pessimistic risk.

        Parameters
        ----------
        Z : array
            target samples (M samples by D features)
        theta : tuple(array, array, array)
            classifier parameters: class proportions 'pi', class means 'mu',
            and class-covariances 'Si'
        q : array
            soft labels (M samples by K classes)

        Returns
        -------
        float
            Value of risk function.

        """
        # Number of classes
        K = q.shape[1]

        # Compute negative log-likelihood
        L = self.neg_log_likelihood(Z, theta)

        # Weight loss by soft labels
        for k in range(K):
            L[:, k] *= q[:, k]

        # Sum over weighted losses
        L = np.sum(L, axis=1)

        # Risk is average loss
        return np.mean(L, axis=0)
    def neg_log_likelihood(self, X, theta):
        """
        Compute negative log-likelihood under Gaussian distributions.

        Parameters
        ----------
        X : array
            data (N samples by D features)
        theta : tuple(array, array, array)
            tuple containing class proportions 'pi', class means 'mu',
            and class-covariances 'Si'

        Returns
        -------
        L : array
            loss (N samples by K classes)

        """
        # Unpack parameters
        pi, mu, Si = theta

        # Check if parameter sets match
        if not pi.shape[1] == mu.shape[0]:
            raise ValueError('Number of proportions does not match number of means.')

        if not mu.shape[1] == Si.shape[0] == Si.shape[1]:
            raise ValueError('''Dimensions of mean do not match dimensions of
                              covariance matrix.''')

        # Number of classes
        K = pi.shape[1]

        # Data shape
        N, D = X.shape

        # Preallocate loss array
        L = np.zeros((N, K))

        for k in range(K):

            # Check for linear or quadratic
            if self.loss == 'lda':

                try:
                    # Probability under k-th Gaussian with shared covariance
                    probs = mvn.pdf(X, mu[k, :], Si)

                except al.LinAlgError as err:
                    print('Covariance matrix is singular. Add regularization.')
                    raise err

            elif self.loss == 'qda':

                try:
                    # Probability under k-th Gaussian with own covariance
                    probs = mvn.pdf(X, mu[k, :], Si[:, :, k])

                except al.LinAlgError as err:
                    print('Covariance matrix is singular. Add regularization.')
                    raise err

            else:
                raise ValueError('Loss unknown.')

            # Negative log-likelihood
            L[:, k] = -np.log(pi[0, k]) - np.log(probs)

        return L
    def discriminant_parameters(self, X, Y):
        """
        Estimate parameters of Gaussian distribution for discriminant analysis.

        Parameters
        ----------
        X : array
            data array (N samples by D features)
        Y : array
            label array (N samples by K classes)

        Returns
        -------
        pi : array
            class proportions (1 by K classes)
        mu : array
            class means (K classes by D features)
        Si : array
            class covariances (D features by D features by K classes)

        """
        # Check labels
        K = Y.shape[1]

        # Check for sufficient labels
        if not K > 1:
            raise ValueError('Number of classes should be larger than 1.')

        # Data shape
        N, D = X.shape

        # Preallocate parameter arrays
        pi = np.zeros((1, K))
        mu = np.zeros((K, D))
        Si = np.zeros((D, D, K))

        # For each class
        for k in range(K):

            # Number of samples for current class
            Nk = np.sum(Y[:, k], axis=0)

            # Check for no samples assigned to certain class
            if Nk == 0:

                # Proportion of samples for current class
                pi[0, k] = 0

                # Mean of current class
                mu[k, :] = np.zeros((1, D))

                # Covariance of current class
                Si[:, :, k] = np.eye(D, D)

            else:

                # Proportion of samples for current class
                pi[0, k] = Nk / N

                # Mean of current class
                mu[k, :] = np.dot(Y[:, k].T, X) / Nk

                # Subtract mean from data
                X_ = X - mu[k, :]

                # Diagonalize current label vector
                dYk = np.diag(Y[:, k])

                # Covariance of current class
                Si[:, :, k] = np.dot(np.dot(X_.T, dYk), X_) / Nk

                # Regularization
                Si[:, :, k] = regularize_matrix(Si[:, :, k], a=self.l2)

        # Check for linear or quadratic discriminant analysis
        if self.loss == 'lda':

            # In LDA, the class-covariance matrices are combined
            Si = self.combine_class_covariances(Si, pi)

        # Check for numerical problems
        if np.any(np.isnan(mu)) or np.any(np.isnan(Si)):
            raise RuntimeError('Parameter estimate is NaN.')

        return pi, mu, Si
    def combine_class_covariances(self, Si, pi):
        """
        Linear combination of class covariance matrices.

        Parameters
        ----------
        Si : array
            class covariances (D features by D features by K classes)
        pi : array
            class proportions (1 by K classes)

        Returns
        -------
        Si : array
            combined covariance matrix (D by D)

        """
        # Number of classes
        K = Si.shape[2]

        # Check that pi contains one proportion per class
        if not pi.shape[1] == K:
            raise ValueError('''Number of proportions does not match the number
                             of classes of the covariance matrix.''')

        # For each class
        for k in range(K):

            # Weight each class-covariance
            Si[:, :, k] = Si[:, :, k] * pi[0, k]

        # Sum over weighted class-covariances
        return np.sum(Si, axis=2)
    def tcpr_da(self, X, y, Z):
        """
        Target Contrastive Pessimistic Risk - discriminant analysis.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        y : array
            source labels (N samples by 1)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        theta : tuple(array, array, array)
            estimated classifier parameters: class proportions 'pi',
            class means 'mu', and class-covariances 'Si'

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Remove intercept columns if present
        X, DX = self.remove_intercept(X)
        Z, DZ = self.remove_intercept(Z)

        # Map labels to one-hot-encoding
        Y, classes = one_hot(y)

        # Number of classes
        K = len(classes)

        # Check for at least 2 classes
        if not K > 1:
            raise ValueError('Number of classes should be larger than 1.')

        # Estimate parameters of source model
        theta_ref = self.discriminant_parameters(X, Y)

        # Loss is negative log-likelihood under reference parameters
        L_ref = self.neg_log_likelihood(Z, theta_ref)

        # Initialize target posterior
        q = np.ones((M, K)) / K

        print('Starting TCP optimization')

        TCPRt = np.inf
        for t in range(self.max_iter):

            # Maximization phase

            # Estimate parameters using TCP risk
            theta_tcp = self.discriminant_parameters(Z, q)

            # Minimization phase

            # Compute loss under new parameters
            L_tcp = self.neg_log_likelihood(Z, theta_tcp)

            # Gradient is difference in losses
            Dq = L_tcp - L_ref

            # Update learning rate
            alpha = self.learning_rate_t(t)

            # Steepest descent step
            q -= alpha*Dq

            # Project back onto simplex
            for m in range(M):
                q[m, :] = self.project_simplex(q[m, :])

            # Monitor progress

            # Risks of current parameters
            R_tcp = self.risk(Z, theta_tcp, q)
            R_ref = self.risk(Z, theta_ref, q)

            # Assert no numerical problems
            if np.isnan(R_tcp) or np.isnan(R_ref):
                raise RuntimeError('Objective is NaN.')

            # Current TCP risk
            TCPRt_ = R_tcp - R_ref

            # Change in risk difference for this iteration
            dR = al.norm(TCPRt - TCPRt_)

            # Check for change smaller than tolerance
            if (dR < self.tolerance):
                print('Broke at iteration '+str(t)+', TCP Risk = '+str(TCPRt_))
                break

            # Report progress
            if (t % 100) == 1:
                print('Iteration ' + str(t) + '/' + str(self.max_iter) +
                      ', TCP Risk = ' + str(TCPRt_))

            # Update
            TCPRt = TCPRt_

        # Return estimated parameters
        return theta_tcp
    def fit(self, X, y, Z):
        """
        Fit/train a target contrastive pessimistic classifier.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        y : array
            source labels (N samples by 1)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        None

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        if self.loss in ['lda', 'qda']:

            # Discriminant analysis model for TCPR
            self.parameters = self.tcpr_da(X, y, Z)

        else:
            # Other loss functions are not implemented
            raise ValueError('Loss function unknown.')

        # Extract and store classes
        self.classes = np.unique(y)

        # Mark classifier as trained
        self.is_trained = True

        # Store training data dimensionality
        self.train_data_dim = DX
    def predict(self, Z_):
        """
        Make predictions on new dataset.

        Parameters
        ----------
        Z_ : array
            new data set (M samples by D features)

        Returns
        -------
        preds : array
            label predictions (M samples by 1)

        """
        # Data shape
        M, D = Z_.shape

        # If classifier is trained, check for same dimensionality
        if self.is_trained:
            if not self.train_data_dim == D:
                raise ValueError('''Test data is of different dimensionality
                                 than training data.''')

        if self.loss in ['lda', 'qda']:

            # Compute negative log-likelihood under each class model
            probs = self.neg_log_likelihood(Z_, self.parameters)

            # Take class with smallest negative log-likelihood as prediction
            preds = np.argmin(probs, axis=1)

        # Map predictions back to original labels
        preds = self.classes[preds]

        # Return predictions array
        return preds
    def get_params(self):
        """Return classifier parameters."""
        # Check if classifier is trained
        if self.is_trained:
            return self.parameters

        else:
            # Throw soft error
            print('Classifier is not trained yet.')
            return []

    def error_rate(self, preds, u_):
        """Compute classification error rate."""
        return np.mean(preds != u_, axis=0)
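A minimal usage sketch (not part of the file above), assuming the class is importable from the libtlda package and using synthetic source/target arrays; the import path and data are illustrative assumptions:

    import numpy as np
    # Assumed import path; adjust to wherever this module lives in the package.
    from libtlda.tcpr import TargetContrastivePessimisticClassifier

    rng = np.random.RandomState(0)
    X = np.vstack((rng.randn(50, 2) - 1, rng.randn(50, 2) + 1))      # source data
    y = np.concatenate((np.zeros(50), np.ones(50)))                  # source labels
    Z = np.vstack((rng.randn(50, 2) - 0.5, rng.randn(50, 2) + 1.5))  # target data

    clf = TargetContrastivePessimisticClassifier(loss='lda', l2=0.1, max_iter=100)
    clf.fit(X, y, Z)           # runs tcpr_da() internally for 'lda'/'qda' losses
    preds = clf.predict(Z)     # predicted labels for the target samples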