Completed in 52s on branch master (f50597), by Wouter

StructuralCorrespondenceClassifier (rating: A)

Complexity
    Total Complexity    23

Size/Duplication
    Total Lines         230
    Duplicated Lines    33.48%

Test Coverage
    Coverage            0%

Importance
    Changes             0

Metric   Value
c        0
b        0
f        0
dl       77
loc      230
ccs      0
cts      73
cp       0
rs       10
wmc      23

10 Methods

Rating   Name                 Duplication   Size   Complexity
A        get_params()         0             3      1
B        fit()                38            38     5
B        Huber_loss()         0             24     1
B        Huber_grad()         0             24     1
A        is_trained()         0             3      1
A        L()                  0             1      1
B        __init__()           39            39     4
B        augment_features()   0             56     5
A        J()                  0             1      1
B        predict()            0             28     5

How to fix: Duplicated Code

Duplicate code is one of the most pungent code smells. A rule of thumb that is often used is to restructure code once it is duplicated in three or more places. Common fixes are to extract the duplicated logic into a shared function, or to pull it up into a common base class, as in the sketch below.
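Here the flagged blocks are __init__() and fit(): both repeat the same loss-name-to-estimator dispatch, which sibling classifiers in the project presumably duplicate as well. A minimal sketch of pulling that dispatch into one shared place follows; the LossMixin name and its use as a base class are assumptions for illustration, not part of the project:

from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LinearRegression


class LossMixin(object):
    """Hypothetical shared base class mapping a loss name to an estimator."""

    # Register supported losses and their scikit-learn estimators in one place
    _estimators = {'logistic': LogisticRegression,
                   'quadratic': LinearRegression,
                   'hinge': LinearSVC}

    def make_estimator(self, loss):
        """Return an untrained estimator for the given loss name."""
        if loss not in self._estimators:
            # Other loss functions are not implemented
            raise NotImplementedError('Unknown loss: {}'.format(loss))
        return self._estimators[loss]()

With this in place, __init__() reduces to self.clf = self.make_estimator(loss), and the identical if/elif branches in fit() collapse to a single self.clf.fit(X, y) call. The annotated source from the report follows.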

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
import scipy.stats as st
from scipy.sparse import linalg
from scipy.optimize import minimize
import sklearn as sk
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import cross_val_predict
from os.path import basename

from .util import is_pos_def


class StructuralCorrespondenceClassifier(object):
    """
    Class of classifiers based on structural correspondence learning.

    Methods contain different importance-weight estimators and different loss
    functions.
    """

    def __init__(self, loss='logistic', l2=1.0, num_pivots=1,
                 num_components=1):
        """
        Select a particular type of importance-weighted classifier.

        INPUT   (1) str 'loss': loss function for weighted classifier, options:
                    'logistic', 'quadratic', 'hinge' (def: 'logistic')
                (2) float 'l2': l2-regularization parameter value (def: 1.0)
                (3) int 'num_pivots': number of pivot features to use (def: 1)
                (4) int 'num_components': number of components to use after
                    extracting pivot features (def: 1)
        """
        self.loss = loss
        self.l2 = l2
        self.num_pivots = num_pivots
        self.num_components = num_components

        # Initialize untrained classifiers based on choice of loss function
        if self.loss == 'logistic':
            # Logistic regression model
            self.clf = LogisticRegression()
        elif self.loss == 'quadratic':
            # Least-squares model
            self.clf = LinearRegression()
        elif self.loss == 'hinge':
            # Linear support vector machine
            self.clf = LinearSVC()
        else:
            # Other loss functions are not implemented
            raise NotImplementedError

        # Whether model has been trained; named 'trained' so the attribute
        # does not shadow the is_trained() method defined below
        self.trained = False

        # Maintain pivot component matrix
        self.C = 0

        # Dimensionality of training data
        self.train_data_dim = ''

    def augment_features(self, X, Z):
        """
        Find a set of pivot features, train predictors and extract bases.

        INPUT   (1) array 'X': source data array (N samples by D features)
                (2) array 'Z': target data array (M samples by D features)
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Concatenate source and target data
        XZ = np.concatenate((X, Z), axis=0)

        # Sort indices based on frequency of features (assumes BoW encoding)
        ix = np.argsort(np.sum(XZ, axis=0))

        # Keep most frequent features
        ix = ix[::-1][:self.num_pivots]

        # Slice out pivot features and relabel them as present(=1)/absent(=0)
        pivot = (XZ[:, ix] > 0).astype('float')

        # Solve prediction tasks with a Huber loss function
        P = np.zeros((DX, self.num_pivots))

        # Loop over pivot features
        for l in range(self.num_pivots):

            # Setup loss function for single pivot
            def L(theta): return self.Huber_loss(theta, XZ, pivot[:, l])

            # Setup gradient function for single pivot
            def J(theta): return self.Huber_grad(theta, XZ, pivot[:, l])

            # Make pivot predictor with a Huber loss function
            results = minimize(L, np.random.randn(DX, 1), jac=J, method='BFGS',
                               options={'gtol': 1e-6, 'disp': True})

            # Store optimal parameters
            P[:, l] = results.x

        # Eigendecomposition of the pivot predictors' covariance matrix
        # (symmetric, so eigh applies and returns real eigenvalues)
        V, C = np.linalg.eigh(np.cov(P))

        # Keep the components with the largest eigenvalues; eigh returns
        # eigenvalues in ascending order, so sort them descending first
        C = C[:, np.argsort(V)[::-1][:self.num_components]]

        # Augment features
        Xa = np.concatenate((np.dot(X, C), X), axis=1)
        Za = np.concatenate((np.dot(Z, C), Z), axis=1)

        return Xa, Za, C

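    # Note: C has shape (D, num_components), so the augmented arrays Xa and
    # Za each have D + num_components columns: the component projections
    # followed by the original features.
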
    def Huber_loss(self, theta, X, y, l2=0.0):
        """
        Huber loss function.

        Reference: Ando & Zhang (2005a). A framework for learning predictive
        structures from multiple tasks and unlabeled data. JMLR.

        INPUT   (1) array 'theta': classifier parameters (D features by 1)
                (2) array 'X': data (N samples by D features)
                (3) array 'y': label vector (N samples by 1)
                (4) float 'l2': l2-regularization parameter (def: 0.0)
        OUTPUT  (1) Loss/objective function value
        """
        # Precompute terms
        Xy = (X.T*y.T).T
        Xyt = np.dot(Xy, theta)

        # Indices of discontinuity
        ix = (Xyt >= -1)

        # Loss function
        return np.sum(np.clip(1 - Xyt[ix], 0, None)**2, axis=0) \
            + np.sum(-4*Xyt[~ix], axis=0) + l2*np.sum(theta**2, axis=0)

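    # For reference: per sample, with margin m = y * (x . theta), the loss
    # above is the modified Huber loss of Ando & Zhang (2005):
    #
    #   L(m) = max(0, 1 - m)^2   if m >= -1
    #   L(m) = -4 * m            otherwise
    #
    # Huber_grad below implements its gradient, plus the l2 term.
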
    def Huber_grad(self, theta, X, y, l2=0.0):
        """
        Huber gradient computation.

        Reference: Ando & Zhang (2005a). A framework for learning predictive
        structures from multiple tasks and unlabeled data. JMLR.

        INPUT   (1) array 'theta': classifier parameters (D features by 1)
                (2) array 'X': data (N samples by D features)
                (3) array 'y': label vector (N samples by 1)
                (4) float 'l2': l2-regularization parameter (def: 0.0)
        OUTPUT  (1) Gradient with respect to classifier parameters
        """
        # Precompute terms
        Xy = (X.T*y.T).T
        Xyt = np.dot(Xy, theta)

        # Indices of discontinuity
        ix = (Xyt >= -1)

        # Gradient
        return np.sum(2*np.clip(1-Xyt[ix], 0, None).T * -Xy[ix, :].T,
                      axis=1).T + np.sum(-4*Xy[~ix, :], axis=0) + 2*l2*theta

    def fit(self, X, y, Z):
        """
        Fit/train a structural correspondence classifier.

        INPUT   (1) array 'X': source data (N samples by D features)
                (2) array 'y': source labels (N samples by 1)
                (3) array 'Z': target data (M samples by D features)
        OUTPUT  None
        """
        # Data shapes
        N, DX = X.shape
        M, DZ = Z.shape

        # Assert equivalent dimensionalities
        assert DX == DZ

        # Augment features
        X, _, self.C = self.augment_features(X, Z)

        # Train a classifier
        if self.loss == 'logistic':
            # Logistic regression model
            self.clf.fit(X, y)
        elif self.loss == 'quadratic':
            # Least-squares model
            self.clf.fit(X, y)
        elif self.loss == 'hinge':
            # Linear support vector machine
            self.clf.fit(X, y)
        else:
            # Other loss functions are not implemented
            raise NotImplementedError

        # Mark classifier as trained
        self.trained = True

        # Store training data dimensionality
        self.train_data_dim = DX + self.num_components

    def predict(self, Z_):
        """
        Make predictions on new dataset.

        INPUT   (1) array 'Z_': new data set (M samples by D features)
        OUTPUT  (1) array 'preds': label predictions (M samples by 1)
        """
        # Data shape
        M, D = Z_.shape

        # If classifier is trained, check for same dimensionality
        if self.trained:
            assert self.train_data_dim == D or \
                   self.train_data_dim == D + self.num_components

        # Check for augmentation
        if self.train_data_dim != D:
            Z_ = np.concatenate((np.dot(Z_, self.C), Z_), axis=1)

        # Call scikit's predict function
        preds = self.clf.predict(Z_)

        # For quadratic loss function, correct predictions
        if self.loss == 'quadratic':
            preds = (np.sign(preds)+1)/2.

        # Return predictions array
        return preds

    def get_params(self):
        """Get classifier parameters."""
        return self.clf.get_params()

    def is_trained(self):
        """Check whether classifier is trained."""
        return self.trained
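
For context, a minimal usage sketch on synthetic data; the shapes, label coding, and hyperparameter values below are illustrative assumptions, not taken from the report:

import numpy as np

# Synthetic source data with 0/1 labels, plus unlabeled target data
X = np.random.rand(100, 20)
y = np.random.randint(0, 2, size=100)
Z = np.random.rand(80, 20)

# Train with 5 pivot features, keeping 3 extracted components
clf = StructuralCorrespondenceClassifier(loss='logistic', num_pivots=5,
                                         num_components=3)
clf.fit(X, y, Z)

# predict() detects that Z is unaugmented and applies the stored pivot
# components C before calling the underlying scikit-learn estimator
preds = clf.predict(Z)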