1
|
|
|
import numbers |
2
|
|
|
|
3
|
|
|
import numpy as np |
|
|
|
|
4
|
|
|
import scipy.sparse.linalg as sla |
|
|
|
|
5
|
|
|
|
6
|
|
|
import Orange.data |
7
|
|
|
from Orange.projection import Projector, Projection |
8
|
|
|
|
9
|
|
|
__all__ = ["CUR"] |
10
|
|
|
|
11
|
|
|
|
12
|
|
|
class CUR(Projector):
    """CUR matrix decomposition

    Parameters
    ----------
    rank : int, optional, default: 3
        number of most significant right singular vectors considered
        for computing feature statistical leverage scores

    max_error : float, optional, default: 1
        relative error w.r.t. optimal `rank`-rank SVD approximation

    compute_U : boolean, optional, default: False
        Compute matrix U in the CUR decomposition or set it to None.

        If True matrix U is computed from C and R through Moore-Penrose
        generalized inverse as U = pinv(C) * X * pinv(R).

    random_state : integer or numpy.RandomState, optional
        The generator used in importance sampling. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    preprocessors : list, optional (default="[]")
        An ordered list of preprocessors applied to data before
        training or testing.

    Attributes
    ----------
    lev_features_ : array-like, shape [n_features]
        Stores normalized statistical leverage scores of features

    features_ : array-like, shape [O(k log(k) / max_error^2)]
        Stores indices of features selected by the CUR algorithm

    lev_samples_ : array-like, shape [n_samples]
        Stores normalized statistical leverage scores of samples

    samples_ : array-like, shape [O(k log(k) / max_error^2)]
        Stores indices of samples selected by the CUR algorithm

    C_ : array-like, shape [n_samples, O(k log(k) / max_error^2)]
        Stores matrix C as defined in the CUR, a small number of
        columns from the data

    U_ : array-like, shape [O(k log(k) / max_error^2), O(k log(k) / max_error^2)]
        Stores matrix U as defined in the CUR

    R_ : array-like, shape [O(k log(k) / max_error^2), n_features]
        Stores matrix R as defined in the CUR, a small number of rows from the
        data

    References
    ----------
    "CUR matrix decompositions for improved data analysis" Mahoney, M.W;
    Drineas P. PNAS (2009)
    """

    name = 'cur'

    def __init__(self, rank=3, max_error=1, compute_U=False,
                 random_state=None, preprocessors=None):
        super().__init__(preprocessors=preprocessors)
        self.rank = rank
        self.compute_U = compute_U
        self.max_error = max_error
        # Normalize random_state to a RandomState instance.  When neither an
        # integer seed nor a RandomState is given, fall back to numpy's global
        # generator (the private np.random.mtrand._rand singleton).
        if isinstance(random_state, numbers.Integral):
            self.random_state = np.random.RandomState(random_state)
        elif isinstance(random_state, np.random.RandomState):
            self.random_state = random_state
        else:
            self.random_state = np.random.mtrand._rand

    def fit(self, X, Y=None):
        """Compute the CUR decomposition of X and return a fitted CURModel.

        Y is accepted for API symmetry but unused.  Note: sla.svds requires
        1 <= rank < min(X.shape) — TODO confirm callers guarantee this.
        """
        # Truncated SVD used to compute statistical leverage scores.
        U, s, V = sla.svds(X, self.rank)
        # Features are sampled as columns of X; samples as columns of X.T
        # (the roles of U and V swap accordingly).
        self.lev_features_, self.features_ = self._select_columns(X, [U.T, s, V.T])
        self.lev_samples_, self.samples_ = self._select_columns(X.T, [V.T, s, U])
        self.C_ = X[:, self.features_]
        self.R_ = X.T[:, self.samples_].T
        if self.compute_U:
            # U = pinv(C) * X * pinv(R), the Moore-Penrose based middle factor.
            pinvC = np.linalg.pinv(self.C_)
            pinvR = np.linalg.pinv(self.R_)
            self.U_ = np.dot(np.dot(pinvC, X), pinvR)
        else:
            self.U_ = None
        return CURModel(self)

    def transform(self, X, axis):
        # axis == 0 selects the CUR-chosen columns (features); any other
        # value selects the CUR-chosen rows (samples).
        if axis == 0:
            return X[:, self.features_]
        else:
            return X[self.samples_, :]

    def _select_columns(self, X, UsV):
        """Importance-sample column indices of X by statistical leverage.

        Each column j is kept with probability min(1, c * lev_j) where
        c = rank * log(rank) / max_error**2.
        NOTE: for rank == 1, log(rank) == 0, so no columns are ever selected.
        """
        U, s, V = UsV
        lev = self._score_leverage(V)
        c = self.rank * np.log(self.rank) / self.max_error**2
        prob = np.minimum(1., c * lev)
        # One uniform draw per column; column kept when its draw falls
        # below its inclusion probability.
        rnd = self.random_state.rand(X.shape[1])
        cols = np.nonzero(rnd < prob)[0]
        return lev, cols

    def _score_leverage(self, V):
        # Normalized statistical leverage scores: (1/rank) * row-wise sum of
        # squared entries of the top-`rank` singular-vector matrix V.
        return np.array(1. / self.rank * np.sum(np.power(V, 2), 1))
117
|
|
|
|
118
|
|
|
|
119
|
|
|
class CURModel(Projection):
    """Projection model produced by :obj:`CUR.fit`.

    Wraps a fitted CUR projector and applies it to data tables, selecting
    either the CUR columns (features) or the CUR rows (samples).
    """

    def __init__(self, proj):
        super().__init__(proj=proj)

    def __call__(self, data, axis=0):
        """Return `data` reduced to the CUR-selected columns (axis=0)
        or rows (axis=1).

        Raises
        ------
        TypeError
            If `axis` is neither 0 nor 1.
        """
        if data.domain is not self.domain:
            data = Orange.data.Table(self.domain, data)
        Xt = self.proj.transform(data.X, axis)

        if axis == 0:
            # Build a reduced domain keeping only the selected features.
            # Each copied variable carries a compute_value so it can be
            # recomputed from the original data.  NOTE(review): `Projector`
            # here resolves to the module-level class defined below, which
            # shadows the imported Orange.projection.Projector.
            def cur_variable(i):
                var = data.domain.variables[i]
                return var.copy(compute_value=Projector(self, i))

            # Consistency fix: access fitted attributes through self.proj
            # explicitly (the rest of this method already does), instead of
            # relying on delegated lookup via `self.features_`.
            domain = Orange.data.Domain(
                [cur_variable(org_idx) for org_idx in self.proj.features_],
                class_vars=data.domain.class_vars)
            transformed_data = Orange.data.Table(domain, Xt, data.Y)
        elif axis == 1:
            # Row selection: subset class values and metas to the same rows.
            Y = data.Y[self.proj.samples_]
            metas = data.metas[self.proj.samples_]
            transformed_data = Orange.data.Table(data.domain, Xt, Y, metas=metas)
        else:
            raise TypeError('CUR can select either columns '
                            '(axis = 0) or rows (axis = 1).')

        return transformed_data
146
|
|
|
|
147
|
|
|
|
148
|
|
|
class Projector:
    """Compute-value callable that extracts one CUR-selected feature.

    NOTE(review): this class shadows `Orange.projection.Projector` imported
    at the top of the file; consider renaming it (e.g. `CURComputeValue`).
    """

    def __init__(self, projection, feature):
        self.projection = projection   # object providing .transform(X)
        self.feature = feature         # column index into the transformed data
        self.transformed = None        # cached transform result
        self._transformed_for = None   # input object the cache was computed from

    def __call__(self, data):
        # BUG FIX: the original guard was `if data is not self.transformed:`,
        # comparing the *input* table against the cached *output* array —
        # those are never the same object, so the cache never hit and the
        # projection was recomputed on every call.  Cache on the identity of
        # the input instead.
        if data is not self._transformed_for:
            self.transformed = self.projection.transform(data.X)
            self._transformed_for = data
        return self.transformed[:, self.feature]

    def __getstate__(self):
        # Drop the (potentially large) cache when pickling.
        d = dict(self.__dict__)
        d['transformed'] = None
        d['_transformed_for'] = None
        return d
163
|
|
|
|
164
|
|
|
|
165
|
|
|
if __name__ == '__main__':
    # Demo / smoke test: compare the CUR reconstruction error against the
    # optimal rank-k truncated-SVD error on a random matrix.
    # Cleanup: the redundant re-imports of numpy, scipy.sparse.linalg and
    # Orange.data were removed — they are already imported at module level.
    import Orange.projection

    np.random.seed(42)
    X = np.random.rand(60, 100)
    rank = 5
    max_error = 1

    data = Orange.data.Table(X)
    cur = Orange.projection.CUR(rank=rank, max_error=max_error, compute_U=True)
    cur_model = cur(data)

    # Selecting columns of the fitted table must reproduce matrix C exactly.
    transformed_data = cur_model(data, axis=0)
    np.testing.assert_array_equal(transformed_data.X, cur_model.C_)

    # Optimal rank-k approximation error (Frobenius norm) via truncated SVD.
    U, s, V = sla.svds(X, rank)
    S = np.diag(s)
    X_k = np.dot(U, np.dot(S, V))
    err_svd = np.linalg.norm(X - X_k, 'fro')
    print('Fro. error (optimal SVD): %5.4f' % err_svd)

    # CUR reconstruction error from C * U * R.
    X_hat = np.dot(cur_model.C_, np.dot(cur_model.U_, cur_model.R_))
    err_cur = np.linalg.norm(X - X_hat, 'fro')
    print('Fro. error (CUR): %5.4f' % err_cur)
    # CUR guarantees with high probability err_cur <= (2+eps) err_svd
    assert err_cur < (3 + cur_model.max_error) * err_svd
195
|
|
|
|
# NOTE(review): the following text appears to be editor/Pylint help output
# accidentally pasted into the source file; it is not Python and broke the
# module.  Preserved here as a comment:
#   This can be caused by one of the following:
#   1. Missing Dependencies
#      This error could indicate a configuration issue of Pylint. Make sure
#      that your libraries are available by adding the necessary commands.
#   2. Missing __init__.py files
#      This error could also result from missing __init__.py files in your
#      module folders. Make sure that you place one file in each sub-folder.