#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
import scipy.stats as st
from scipy.sparse import linalg
from scipy.optimize import minimize
import sklearn as sk
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_predict
from os.path import basename

from .util import is_pos_def


class StructuralCorrespondenceClassifier(object):
    """
    Class of classifiers based on structural correspondence learning.

    Contains a pivot feature augmentation step and classifiers with
    different loss functions.
    """

    def __init__(self, loss='logistic', l2=1.0, num_pivots=1,
                 num_components=1):
        """
        Select a particular type of structural correspondence classifier.

        Parameters
        ----------
        loss : str
            loss function for the classifier, options: 'logistic',
            'quadratic', 'hinge' (def: 'logistic')
        l2 : float
            l2-regularization parameter value (def: 1.0)
        num_pivots : int
            number of pivot features to use (def: 1)
        num_components : int
            number of components to use after extracting pivot features
            (def: 1)

        Returns
        -------
        None

        """
        self.loss = loss
        self.l2 = l2
        self.num_pivots = num_pivots
        self.num_components = num_components

        # Initialize untrained classifiers based on choice of loss function
        if self.loss == 'logistic':
            # Logistic regression model
            self.clf = LogisticRegression()
        elif self.loss == 'quadratic':
            # Least-squares model
            self.clf = LinearRegression()
        elif self.loss == 'hinge':
            # Linear support vector machine
            self.clf = LinearSVC()
        else:
            # Other loss functions are not implemented
            raise NotImplementedError('Loss not implemented yet.')

        # Whether model has been trained (exposed through is_trained())
        self._is_trained = False

        # Maintain pivot component matrix
        self.C = 0

        # Dimensionality of training data
        self.train_data_dim = ''

    def augment_features(self, X, Z, l2=0.0):
        """
        Find a set of pivot features, train predictors and extract bases.

        Parameters
        ----------
        X : array
            source data array (N samples by D features)
        Z : array
            target data array (M samples by D features)
        l2 : float
            regularization parameter value (def: 0.0)

        Returns
        -------
        Xa : array
            augmented source data (N samples by D + num_components features)
        Za : array
            augmented target data (M samples by D + num_components features)
        C : array
            pivot component matrix (D features by num_components)

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Concatenate source and target data
        XZ = np.concatenate((X, Z), axis=0)

        # Sort indices based on frequency of features (assumes BoW encoding)
        ix = np.argsort(np.sum(XZ, axis=0))

        # Keep most frequent features
        ix = ix[::-1][:self.num_pivots]

        # Slice out pivot features and relabel them as present(=1)/absent(=0)
        pivot = (XZ[:, ix] > 0).astype('float')

        # Solve prediction tasks with a Huber loss function
        P = np.zeros((DX, self.num_pivots))

        # Loop over pivot features
        for l in range(self.num_pivots):

            # Setup loss function for single pivot
            def L(theta): return self.Huber_loss(theta, XZ, pivot[:, l])

            # Setup gradient function for single pivot
            def J(theta): return self.Huber_grad(theta, XZ, pivot[:, l])

            # Make pivot predictor with a Huber loss function
            results = minimize(L, np.random.randn(DX), jac=J, method='BFGS',
                               options={'gtol': 1e-6, 'disp': True})

            # Store optimal parameters
            P[:, l] = results.x

        # Compute covariance matrix of predictors
        SP = np.cov(P)

        # Add regularization to ensure positive-definiteness
        SP += l2*np.eye(DX)

        # Eigendecomposition of the symmetric pivot predictor covariance
        V, C = np.linalg.eigh(SP)

        # Keep the components with the largest eigenvalues
        C = C[:, np.argsort(V)[::-1][:self.num_components]]

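        # C projects the original features onto the pivot-component subspace;
        # it is stored by fit() and reused at prediction time.
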
        # Augment features
        Xa = np.concatenate((np.dot(X, C), X), axis=1)
        Za = np.concatenate((np.dot(Z, C), Z), axis=1)

        return Xa, Za, C

    def Huber_loss(self, theta, X, y, l2=0.0):
        """
        Huber loss function.

        Reference: Ando & Zhang (2005a). A framework for learning predictive
        structures from multiple tasks and unlabeled data. JMLR.

        Parameters
        ----------
        theta : array
            classifier parameters (D features by 1)
        X : array
            data (N samples by D features)
        y : array
            label vector (N samples by 1)
        l2 : float
            l2-regularization parameter (def: 0.0)

        Returns
        -------
        array
            Objective function value.

        """
        # Precompute terms
        Xy = (X.T*y.T).T
        Xyt = np.dot(Xy, theta)

        # Indices of discontinuity
        ix = (Xyt >= -1)

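        # Piecewise form of the loss, for margins m = y*x'theta:
        #   m >= -1 : max(0, 1 - m)^2
        #   m <  -1 : -4*m
        # summed over samples, plus an l2 penalty on theta.
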
        # Loss function
        return np.sum(np.clip(1 - Xyt[ix], 0, None)**2, axis=0) \
            + np.sum(-4*Xyt[~ix], axis=0) + l2*np.sum(theta**2, axis=0)

    def Huber_grad(self, theta, X, y, l2=0.0):
        """
        Huber gradient computation.

        Reference: Ando & Zhang (2005a). A framework for learning predictive
        structures from multiple tasks and unlabeled data. JMLR.

        Parameters
        ----------
        theta : array
            classifier parameters (D features by 1)
        X : array
            data (N samples by D features)
        y : array
            label vector (N samples by 1)
        l2 : float
            l2-regularization parameter (def: 0.0)

        Returns
        -------
        array
            Gradient with respect to classifier parameters.

        """
        # Precompute terms
        Xy = (X.T*y.T).T
        Xyt = np.dot(Xy, theta)

        # Indices of discontinuity
        ix = (Xyt >= -1)

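        # Piecewise gradient of the loss with respect to theta:
        #   m >= -1 : -2*max(0, 1 - m)*y*x
        #   m <  -1 : -4*y*x
        # summed over samples, plus 2*l2*theta from the penalty.
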
        # Gradient
        return np.sum(2*np.clip(1-Xyt[ix], 0, None).T * -Xy[ix, :].T,
                      axis=1).T + np.sum(-4*Xy[~ix, :], axis=0) + 2*l2*theta

    def fit(self, X, y, Z):
        """
        Fit/train a structural correspondence classifier.

        Parameters
        ----------
        X : array
            source data (N samples by D features)
        y : array
            source labels (N samples by 1)
        Z : array
            target data (M samples by D features)

        Returns
        -------
        None

        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        if not DX == DZ:
            raise ValueError('Dimensionalities of X and Z should be equal.')

        # Augment features
        X, _, self.C = self.augment_features(X, Z, l2=self.l2)
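        # Note: the augmented target data is discarded here; target data is
        # re-augmented at prediction time using the stored component matrix.
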
        # Train a classifier
        if self.loss == 'logistic':
            # Logistic regression model
            self.clf.fit(X, y)
        elif self.loss == 'quadratic':
            # Least-squares model
            self.clf.fit(X, y)
        elif self.loss == 'hinge':
            # Linear support vector machine
            self.clf.fit(X, y)
        else:
            # Other loss functions are not implemented
            raise NotImplementedError('Loss not implemented.')

        # Mark classifier as trained
        self._is_trained = True

        # Store dimensionality of the augmented training data
        self.train_data_dim = DX + self.num_components

    def predict(self, Z):
        """
        Make predictions on new dataset.

        Parameters
        ----------
        Z : array
            new data set (M samples by D features)

        Returns
        -------
        preds : array
            label predictions (M samples by 1)

        """
        # Data shape
        M, D = Z.shape

        # If the test data still has the original dimensionality, augment it
        # with pivot components; reject any other dimensionality mismatch.
        if self._is_trained:
            if D == self.train_data_dim - self.num_components:
                Z = np.concatenate((np.dot(Z, self.C), Z), axis=1)
            elif not D == self.train_data_dim:
                raise ValueError('''Test data is of different dimensionality
                                 than training data.''')

        # Call scikit's predict function
        preds = self.clf.predict(Z)

        # For quadratic loss, threshold regression outputs at zero to
        # obtain 0/1 label predictions
        if self.loss == 'quadratic':
            preds = (np.sign(preds)+1)/2.

        # Return predictions array
        return preds

    def get_params(self):
        """Get classifier parameters."""
        return self.clf.get_params()

    def is_trained(self):
        """Check whether classifier is trained."""
        return self._is_trained
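

# A minimal usage sketch (illustrative only; the arrays below are synthetic
# stand-ins for nonnegative bag-of-words counts with binary 0/1 labels).
if __name__ == '__main__':
    rnd = np.random.RandomState(0)

    # Labeled source data (X, y) and unlabeled target data Z
    X = rnd.poisson(1.0, size=(50, 20)).astype('float')
    y = rnd.randint(0, 2, size=(50,))
    Z = rnd.poisson(1.5, size=(40, 20)).astype('float')

    # Train on source data, adapt with target data, predict on target data
    clf = StructuralCorrespondenceClassifier(
        loss='logistic', l2=1.0, num_pivots=5, num_components=2)
    clf.fit(X, y, Z)
    preds = clf.predict(Z)
    print(preds.shape)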