Completed
Push — master ( f3d068...88fa67 ) by Wouter, created 03:44

StructuralCorrespondenceClassifier   A

Complexity

Total Complexity 23

Size/Duplication

Total Lines 297
Duplicated Lines 29.29%

Test Coverage

Coverage 23.08%

Importance

Changes 0
Metric Value
c 0
b 0
f 0
dl 87
loc 297
ccs 18
cts 78
cp 0.2308
rs 10
wmc 23

10 Methods

Rating  Name                 Duplication  Size  Complexity
A       get_params()         0            3     1
B       fit()                0            48    5
B       Huber_loss()         0            34    1
B       Huber_grad()         0            34    1
A       is_trained()         0            3     1
A       L()                  0            1     1
B       __init__()           50           50    4
B       augment_features()   0            73    5
A       J()                  0            1     1
B       predict()            37           37    5

How to fix: Duplicated Code

Duplicate code is one of the most pungent code smells. A rule of thumb that is often used is to restructure code once it is duplicated in three or more places.

The most common solution is to extract the duplicated logic into a shared method or class, so that every call site reuses a single implementation.
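For instance, the loss-function branching flagged as duplicated in __init__ below (and presumably repeated in the project's other classifiers) could be extracted into one shared helper. A minimal sketch; the helper name _make_classifier is illustrative, not part of the project:

from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LinearRegression


def _make_classifier(loss):
    """Map a loss-function name to an untrained scikit-learn estimator."""
    classifiers = {
        'logistic': LogisticRegression,   # logistic regression model
        'quadratic': LinearRegression,    # least-squares model
        'hinge': LinearSVC,               # linear support vector machine
    }
    if loss not in classifiers:
        raise NotImplementedError('Loss not implemented yet.')
    return classifiers[loss]()

Each classifier's __init__ then reduces to a single line, self.clf = _make_classifier(loss), and the duplication disappears.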

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
from scipy.optimize import minimize
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LinearRegression


class StructuralCorrespondenceClassifier(object):
    """
    Class of classifiers based on structural correspondence learning.

    Methods contain different importance-weight estimators and different loss
    functions.
    """

    def __init__(self, loss='logistic', l2=1.0, num_pivots=1,
                 num_components=1):
        """
        Select a particular type of importance-weighted classifier.

        Parameters
        ----------
        loss : str
            loss function for weighted classifier, options: 'logistic',
            'quadratic', 'hinge' (def: 'logistic')
        l2 : float
            l2-regularization parameter value (def: 1.0)
        num_pivots : int
            number of pivot features to use (def: 1)
        num_components : int
            number of components to use after extracting pivot features
            (def: 1)

        Returns
        -------
        None

        """
        self.loss = loss
        self.l2 = l2
        self.num_pivots = num_pivots
        self.num_components = num_components

        # Initialize untrained classifiers based on choice of loss function
        if self.loss == 'logistic':
            # Logistic regression model
            self.clf = LogisticRegression()
        elif self.loss == 'quadratic':
            # Least-squares model
            self.clf = LinearRegression()
        elif self.loss == 'hinge':
            # Linear support vector machine
            self.clf = LinearSVC()
        else:
            # Other loss functions are not implemented
            raise NotImplementedError('Loss not implemented yet.')

        # Whether model has been trained; stored under a different name than
        # the is_trained method so that the attribute does not shadow it
        self.trained = False

        # Maintain pivot component matrix
        self.C = 0

        # Dimensionality of training data
        self.train_data_dim = None

    def augment_features(self, X, Z, l2=0.0):
        """
        Find a set of pivot features, train predictors and extract bases.

        Parameters
        ----------
        X : array
            source data array (N samples by D features)
        Z : array
            target data array (M samples by D features)
        l2 : float
            regularization parameter value (def: 0.0)

        Returns
        -------
        Xa : array
            augmented source data (N samples by D + num_components features)
        Za : array
            augmented target data (M samples by D + num_components features)
        C : array
            pivot component matrix (D features by num_components)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Concatenate source and target data
        XZ = np.concatenate((X, Z), axis=0)

        # Sort indices based on frequency of features (assumes BoW encoding)
        ix = np.argsort(np.sum(XZ, axis=0))

        # Keep most frequent features
        ix = ix[::-1][:self.num_pivots]

        # Slice out pivot features and relabel them as present(=+1) or
        # absent(=-1), since the Huber loss below expects labels in {-1, +1}
        pivot = 2.0*(XZ[:, ix] > 0) - 1.0

        # Parameters of the pivot predictors, one column per pivot
        P = np.zeros((DX, self.num_pivots))

        # Loop over pivot features
        for p in range(self.num_pivots):

            # Setup loss function for single pivot
            def L(theta): return self.Huber_loss(theta, XZ, pivot[:, p])

            # Setup gradient function for single pivot
            def J(theta): return self.Huber_grad(theta, XZ, pivot[:, p])

            # Make pivot predictor with a Huber loss function
            results = minimize(L, np.random.randn(DX), jac=J, method='BFGS',
                               options={'gtol': 1e-6, 'disp': True})

            # Store optimal parameters
            P[:, p] = results.x

        # Compute covariance matrix of the pivot predictor parameters
        SP = np.cov(P)

        # Add regularization to ensure positive-definiteness
        SP += l2*np.eye(DX)

        # Eigendecomposition of the symmetric covariance matrix; eigh returns
        # eigenvalues in ascending order, so flip to descending
        V, C = np.linalg.eigh(SP)
        C = C[:, ::-1]

        # Reduce number of components
        C = C[:, :self.num_components]

        # Augment features
        Xa = np.concatenate((np.dot(X, C), X), axis=1)
        Za = np.concatenate((np.dot(Z, C), Z), axis=1)

        return Xa, Za, C
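
    # The eigendecomposition above plays the role of the low-rank SVD step in
    # structural correspondence learning (Blitzer et al., 2006): the leading
    # eigenvectors capture the dominant shared structure among the pivot
    # predictors, and projecting the data onto them yields the augmenting
    # components.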

    def Huber_loss(self, theta, X, y, l2=0.0):
        """
        Huber loss function.

        Reference: Ando & Zhang (2005a). A framework for learning predictive
        structures from multiple tasks and unlabeled data. JMLR.

        Parameters
        ----------
        theta : array
            classifier parameters (D features by 1)
        X : array
            data (N samples by D features)
        y : array
            label vector (N samples by 1)
        l2 : float
            l2-regularization parameter (def: 0.0)

        Returns
        -------
        array
            Objective function value.

        """
        # Precompute terms
        Xy = (X.T*y.T).T
        Xyt = np.dot(Xy, theta)

        # Indices of discontinuity
        ix = (Xyt >= -1)

        # Loss function
        return np.sum(np.clip(1 - Xyt[ix], 0, None)**2, axis=0) \
            + np.sum(-4*Xyt[~ix], axis=0) + l2*np.sum(theta**2, axis=0)
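
    # In formula form, the modified Huber loss above (Ando & Zhang, 2005) is
    #     L(p, y) = max(0, 1 - p*y)^2   if p*y >= -1,
    #             = -4*p*y              otherwise,
    # with margin p = x'theta and labels y in {-1, +1}. Its first derivative
    # is continuous at p*y = -1, so the analytic gradient below can be handed
    # directly to a quasi-Newton optimizer such as BFGS.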

    def Huber_grad(self, theta, X, y, l2=0.0):
        """
        Huber gradient computation.

        Reference: Ando & Zhang (2005a). A framework for learning predictive
        structures from multiple tasks and unlabeled data. JMLR.

        Parameters
        ----------
        theta : array
            classifier parameters (D features by 1)
        X : array
            data (N samples by D features)
        y : array
            label vector (N samples by 1)
        l2 : float
            l2-regularization parameter (def: 0.0)

        Returns
        -------
        array
            Gradient with respect to classifier parameters

        """
        # Precompute terms
        Xy = (X.T*y.T).T
        Xyt = np.dot(Xy, theta)

        # Indices of discontinuity
        ix = (Xyt >= -1)

        # Gradient
        return np.sum(2*np.clip(1 - Xyt[ix], 0, None).T * -Xy[ix, :].T,
                      axis=1).T + np.sum(-4*Xy[~ix, :], axis=0) + 2*l2*theta

    def fit(self, X, y, Z):
        """
        Fit/train a structural correspondence classifier.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        y : array
            source labels (N samples by 1)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        None

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Augment source features; store the component matrix C so that
        # predict can apply the same augmentation to new data
        X, _, self.C = self.augment_features(X, Z, l2=self.l2)

        # Train the classifier selected in __init__ (every loss function maps
        # to a scikit-learn estimator with the same fit interface)
        self.clf.fit(X, y)

        # Mark classifier as trained
        self.trained = True

        # Store training data dimensionality
        self.train_data_dim = DX + self.num_components

    def predict(self, Z):
        """
        Make predictions on new dataset.

        Parameters
        ----------
        Z : array
            new data set (M samples by D features)

        Returns
        -------
        preds : array
            label predictions (M samples by 1)

        """
        # Data shape
        M, D = Z.shape

        # If the classifier is trained, augment raw data with the stored
        # pivot components and check the resulting dimensionality
        if self.trained:
            if D + self.num_components == self.train_data_dim:
                # Apply the same augmentation as during training
                Z = np.concatenate((np.dot(Z, self.C), Z), axis=1)
            elif not D == self.train_data_dim:
                raise ValueError('Test data is of different dimensionality '
                                 'than training data.')

        # Call scikit's predict function
        preds = self.clf.predict(Z)

        # For quadratic loss, map the regression outputs back to {0, 1} labels
        if self.loss == 'quadratic':
            preds = (np.sign(preds)+1)/2.

        # Return predictions array
        return preds

    def get_params(self):
        """Get classifier parameters."""
        return self.clf.get_params()

    def is_trained(self):
        """Check whether classifier is trained."""
        return self.trained
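
For reference, a minimal usage sketch (not part of the reviewed file; the data sizes, hyperparameters, and the Poisson counts standing in for bag-of-words data are illustrative):

import numpy as np

# Synthetic bag-of-words counts for a labeled source and an unlabeled target
X = np.random.poisson(1.0, size=(100, 20)).astype('float')
y = np.random.randint(0, 2, size=(100,))
Z = np.random.poisson(1.0, size=(80, 20)).astype('float')

# Train on source data while adapting to the target domain
clf = StructuralCorrespondenceClassifier(loss='logistic', num_pivots=5,
                                         num_components=2)
clf.fit(X, y, Z)

# Predict labels for the raw (unaugmented) target data
preds = clf.predict(Z)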