Completed in 52s on branch master (f50597), by Wouter

StructuralCorrespondenceClassifier (rating: A)

Complexity
    Total Complexity    23

Size/Duplication
    Total Lines         230
    Duplicated Lines    33.48%

Test Coverage
    Coverage            0%

Importance
    Changes             0

Metric   Value
c        0
b        0
f        0
dl       77
loc      230
ccs      0
cts      73
cp       0
rs       10
wmc      23

10 Methods

Rating   Name                 Duplication   Size   Complexity
A        get_params()         0             3      1
B        fit()                38            38     5
B        Huber_loss()         0             24     1
B        Huber_grad()         0             24     1
A        is_trained()         0             3      1
A        L()                  0             1      1
B        __init__()           39            39     4
B        augment_features()   0             56     5
A        J()                  0             1      1
B        predict()            0             28     5

How to fix: Duplicated Code

Duplicate code is one of the most pungent code smells. A rule of thumb that is often used is to restructure code once it is duplicated in three or more places. Common fixes are to extract the duplicated logic into a shared function, or to pull it up into a common base class, as in the sketch below.
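Here the flagged blocks are __init__() and fit(): both repeat the same loss-name-to-estimator dispatch, which sibling classifiers in the project presumably duplicate as well. A minimal sketch of pulling that dispatch into one shared place follows; the LossMixin name and its use as a base class are assumptions for illustration, not part of the project:

from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LinearRegression


class LossMixin(object):
    """Hypothetical shared base class mapping a loss name to an estimator."""

    # Register supported losses and their scikit-learn estimators in one place
    _estimators = {'logistic': LogisticRegression,
                   'quadratic': LinearRegression,
                   'hinge': LinearSVC}

    def make_estimator(self, loss):
        """Return an untrained estimator for the given loss name."""
        if loss not in self._estimators:
            # Other loss functions are not implemented
            raise NotImplementedError('Unknown loss: {}'.format(loss))
        return self._estimators[loss]()

With this in place, __init__() reduces to self.clf = self.make_estimator(loss), and the identical if/elif branches in fit() collapse to a single self.clf.fit(X, y) call. The annotated source from the report follows.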

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
import scipy.stats as st
from scipy.sparse import linalg
from scipy.optimize import minimize
import sklearn as sk
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_predict
from os.path import basename

from .util import is_pos_def


class StructuralCorrespondenceClassifier(object):
    """
    Class of classifiers based on structural correspondence learning.

    Methods contain different importance-weight estimators and different loss
    functions.
    """

    def __init__(self, loss='logistic', l2=1.0, num_pivots=1,
                 num_components=1):
        """
        Select a particular type of importance-weighted classifier.

        INPUT   (1) str 'loss': loss function for weighted classifier, options:
                    'logistic', 'quadratic', 'hinge' (def: 'logistic')
                (2) float 'l2': l2-regularization parameter value (def: 1.0)
                (3) int 'num_pivots': number of pivot features to use (def: 1)
                (4) int 'num_components': number of components to use after
                    extracting pivot features (def: 1)
        """
        self.loss = loss
        self.l2 = l2
        self.num_pivots = num_pivots
        self.num_components = num_components

        # Initialize untrained classifiers based on choice of loss function
        if self.loss == 'logistic':
            # Logistic regression model
            self.clf = LogisticRegression()
        elif self.loss == 'quadratic':
            # Least-squares model
            self.clf = LinearRegression()
        elif self.loss == 'hinge':
            # Linear support vector machine
            self.clf = LinearSVC()
        else:
            # Other loss functions are not implemented
            raise NotImplementedError

        # Whether model has been trained; named 'trained' so the attribute
        # does not shadow the is_trained() method defined below
        self.trained = False

        # Maintain pivot component matrix
        self.C = 0

        # Dimensionality of training data
        self.train_data_dim = ''

    def augment_features(self, X, Z):
        """
        Find a set of pivot features, train predictors and extract bases.

        INPUT   (1) array 'X': source data array (N samples by D features)
                (2) array 'Z': target data array (M samples by D features)
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Concatenate source and target data
        XZ = np.concatenate((X, Z), axis=0)

        # Sort indices based on frequency of features (assumes BoW encoding)
        ix = np.argsort(np.sum(XZ, axis=0))

        # Keep most frequent features
        ix = ix[::-1][:self.num_pivots]

        # Slice out pivot features and relabel them as present(=1)/absent(=0)
        pivot = (XZ[:, ix] > 0).astype('float')

        # Solve prediction tasks with a Huber loss function
        P = np.zeros((DX, self.num_pivots))

        # Loop over pivot features
        for l in range(self.num_pivots):

            # Setup loss function for single pivot
            def L(theta): return self.Huber_loss(theta, XZ, pivot[:, l])

            # Setup gradient function for single pivot
            def J(theta): return self.Huber_grad(theta, XZ, pivot[:, l])

            # Make pivot predictor with a Huber loss function
            results = minimize(L, np.random.randn(DX, 1), jac=J, method='BFGS',
                               options={'gtol': 1e-6, 'disp': True})

            # Store optimal parameters
            P[:, l] = results.x

        # Eigendecomposition of the pivot predictors' covariance matrix
        # (symmetric, so eigh applies and returns real eigenvalues)
        V, C = np.linalg.eigh(np.cov(P))

        # Keep the components with the largest eigenvalues; eigh returns
        # eigenvalues in ascending order, so sort them descending first
        C = C[:, np.argsort(V)[::-1][:self.num_components]]

        # Augment features
        Xa = np.concatenate((np.dot(X, C), X), axis=1)
        Za = np.concatenate((np.dot(Z, C), Z), axis=1)

        return Xa, Za, C

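    # Note: C has shape (D, num_components), so the augmented arrays Xa and
    # Za each have D + num_components columns: the component projections
    # followed by the original features.
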
    def Huber_loss(self, theta, X, y, l2=0.0):
        """
        Huber loss function.

        Reference: Ando & Zhang (2005a). A framework for learning predictive
        structures from multiple tasks and unlabeled data. JMLR.

        INPUT   (1) array 'theta': classifier parameters (D features by 1)
                (2) array 'X': data (N samples by D features)
                (3) array 'y': label vector (N samples by 1)
                (4) float 'l2': l2-regularization parameter (def: 0.0)
        OUTPUT  (1) Loss/objective function value
        """
        # Precompute terms
        Xy = (X.T*y.T).T
        Xyt = np.dot(Xy, theta)

        # Indices of discontinuity
        ix = (Xyt >= -1)

        # Loss function
        return np.sum(np.clip(1 - Xyt[ix], 0, None)**2, axis=0) \
            + np.sum(-4*Xyt[~ix], axis=0) + l2*np.sum(theta**2, axis=0)

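    # For reference: per sample, with margin m = y * (x . theta), the loss
    # above is the modified Huber loss of Ando & Zhang (2005):
    #
    #   L(m) = max(0, 1 - m)^2   if m >= -1
    #   L(m) = -4 * m            otherwise
    #
    # Huber_grad below implements its gradient, plus the l2 term.
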
    def Huber_grad(self, theta, X, y, l2=0.0):
        """
        Huber gradient computation.

        Reference: Ando & Zhang (2005a). A framework for learning predictive
        structures from multiple tasks and unlabeled data. JMLR.

        INPUT   (1) array 'theta': classifier parameters (D features by 1)
                (2) array 'X': data (N samples by D features)
                (3) array 'y': label vector (N samples by 1)
                (4) float 'l2': l2-regularization parameter (def: 0.0)
        OUTPUT  (1) Gradient with respect to classifier parameters
        """
        # Precompute terms
        Xy = (X.T*y.T).T
        Xyt = np.dot(Xy, theta)

        # Indices of discontinuity
        ix = (Xyt >= -1)

        # Gradient
        return np.sum(2*np.clip(1-Xyt[ix], 0, None).T * -Xy[ix, :].T,
                      axis=1).T + np.sum(-4*Xy[~ix, :], axis=0) + 2*l2*theta

    def fit(self, X, y, Z):
        """
        Fit/train a structural correspondence classifier.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'y': source labels (N samples by 1)
                (3) array 'Z': target data (M samples by D features)
        OUTPUT  None
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Augment features
        X, _, self.C = self.augment_features(X, Z)

        # Train a classifier
        if self.loss == 'logistic':
            # Logistic regression model
            self.clf.fit(X, y)
        elif self.loss == 'quadratic':
            # Least-squares model
            self.clf.fit(X, y)
        elif self.loss == 'hinge':
            # Linear support vector machine
            self.clf.fit(X, y)
        else:
            # Other loss functions are not implemented
            raise NotImplementedError

        # Mark classifier as trained
        self.trained = True

        # Store training data dimensionality
        self.train_data_dim = DX + self.num_components

    def predict(self, Z_):
        """
        Make predictions on new dataset.

        INPUT   (1) array 'Z_': new data set (M samples by D features)
        OUTPUT  (1) array 'preds': label predictions (M samples by 1)
        """
        # Data shape
        M, D = Z_.shape

        # If classifier is trained, check for same dimensionality
        if self.trained:
            assert self.train_data_dim == D or \
                   self.train_data_dim == D + self.num_components

        # Check for augmentation
        if self.train_data_dim != D:
            Z_ = np.concatenate((np.dot(Z_, self.C), Z_), axis=1)

        # Call scikit's predict function
        preds = self.clf.predict(Z_)

        # For quadratic loss function, correct predictions
        if self.loss == 'quadratic':
            preds = (np.sign(preds)+1)/2.

        # Return predictions array
        return preds

    def get_params(self):
        """Get classifier parameters."""
        return self.clf.get_params()

    def is_trained(self):
        """Check whether classifier is trained."""
        return self.trained
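
For context, a minimal usage sketch on synthetic data; the shapes, label coding, and hyperparameter values below are illustrative assumptions, not taken from the report:

import numpy as np

# Synthetic source data with 0/1 labels, plus unlabeled target data
X = np.random.rand(100, 20)
y = np.random.randint(0, 2, size=100)
Z = np.random.rand(80, 20)

# Train with 5 pivot features, keeping 3 extracted components
clf = StructuralCorrespondenceClassifier(loss='logistic', num_pivots=5,
                                         num_components=3)
clf.fit(X, y, Z)

# predict() detects that Z is unaugmented and applies the stored pivot
# components C before calling the underlying scikit-learn estimator
preds = clf.predict(Z)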