|
1
|
|
|
import random |
|
2
|
|
|
import zlib |
|
3
|
|
|
import math |
|
4
|
|
|
from numbers import Real |
|
5
|
|
|
import numpy as np |
|
|
|
|
|
|
6
|
|
|
from Orange import data |
|
7
|
|
|
|
|
8
|
|
|
|
|
9
|
|
|
def _get_variable(dat, variable, expected_type=None, expected_name=""):
    """
    Resolve `variable` to a variable descriptor belonging to `dat`.

    :param dat: the object the variable refers to; it may expose a `domain`
        (used to look the variable up) or a `variable` attribute
    :param variable: an instance of `data.Variable`, or a key accepted by
        `dat.domain[...]`
    :param expected_type: if given, the resolved variable must be an
        instance of this type
    :param expected_name: human-readable name of the expected kind of
        variable; used only in error messages
    :return: the resolved variable
    :raises ValueError: if `variable` disagrees with `dat.variable`, if it
        cannot be resolved, or if it is not of `expected_type`
    """
    failed = False
    if isinstance(variable, data.Variable):
        # An explicit descriptor must be the very same object as the one the
        # data carries (if it carries any).
        datvar = getattr(dat, "variable", None)
        if datvar is not None and datvar is not variable:
            raise ValueError("variable does not match the variable "
                             "in the data")
    elif hasattr(dat, "domain"):
        variable = dat.domain[variable]
    elif hasattr(dat, "variable"):
        variable = dat.variable
    else:
        failed = True

    wrong_type = (expected_type is not None
                  and not isinstance(variable, expected_type))
    if failed or wrong_type:
        if isinstance(variable, data.Variable):
            raise ValueError(
                "expected %s variable not %s" % (expected_name, variable))
        # `expected_type` may be None here (resolution failed without any
        # type constraint); fall back to a generic description instead of
        # crashing with AttributeError on `None.__name__`.
        if expected_type is not None:
            expected = expected_type.__name__
        else:
            expected = expected_name or "Variable"
        raise ValueError("expected %s, not '%s'" %
                         (expected, type(variable).__name__))
    return variable
|
31
|
|
|
|
|
32
|
|
|
|
|
33
|
|
|
class Discrete(np.ndarray):
    """
    Distribution of a discrete variable: a 1-d array of (possibly weighted)
    counts, one element per value of the variable.

    Besides the array data, every instance carries two attributes:
    `variable` (the variable the distribution describes, or `None`) and
    `unknowns` (the count or total weight of missing values).
    """

    def __new__(cls, dat, variable=None, unknowns=None):
        """
        Construct a distribution.

        :param dat: a `data.Storage` to compute the distribution from, a
            sequence of counts, or `None` for an all-zero distribution
            (which requires `variable`, since it determines the length)
        :param variable: the described variable (or a key resolving to it)
        :param unknowns: count of missing values; must not be given
            together with a data storage
        :raises TypeError: if `unknowns` is given with a data storage
        """
        if isinstance(dat, data.Storage):
            if unknowns is not None:
                raise TypeError(
                    "incompatible arguments (data storage and 'unknowns'")
            return cls.from_data(dat, variable)

        if variable is not None:
            variable = _get_variable(dat, variable)
            n = len(variable.values)
        else:
            n = len(dat)

        self = super().__new__(cls, n)
        self.variable = variable
        if dat is None:
            self[:] = 0
            self.unknowns = unknowns or 0
        else:
            self[:] = dat
            self.unknowns = (unknowns if unknowns is not None
                             else getattr(dat, "unknowns", 0))
        return self

    def __array_finalize__(self, obj):
        # Called by numpy for every new instance, including views, copies
        # and ufunc results. Without it those objects would lack `variable`
        # and `unknowns`, and methods such as __hash__ or __iadd__ would
        # fail with AttributeError.
        self.variable = getattr(obj, "variable", None)
        self.unknowns = getattr(obj, "unknowns", 0)

    @classmethod
    def from_data(cls, data, variable):
        """
        Compute the distribution of `variable` in `data`.

        The storage's own `_compute_distributions` is tried first; if it
        raises `NotImplementedError`, the instances are counted one by one.

        :param data: the data storage (note: shadows the module `data`)
        :param variable: the variable, or a key resolving to it
        """
        variable = _get_variable(data, variable)
        try:
            dist, unknowns = data._compute_distributions([variable])[0]
        except NotImplementedError:
            self = super().__new__(cls, len(variable.values))
            self[:] = 0
            self.unknowns = 0
            # NOTE(review): both branches now read the value the same way
            # (`inst[variable]`); the weighted branch used to iterate over
            # `data[:, variable]` and pass each item straight to
            # math.isnan -- confirm against the Storage implementations.
            if data.has_weights():
                pairs = zip(data, data.W)
            else:
                pairs = ((inst, 1) for inst in data)
            for inst, w in pairs:
                val = inst[variable]
                if math.isnan(val):
                    self.unknowns += w
                else:
                    # Values are float-like; numpy requires an integer index.
                    self[int(val)] += w
        else:
            self = super().__new__(cls, len(dist))
            self[:] = dist
            self.unknowns = unknowns
        self.variable = variable
        return self

    def __eq__(self, other):
        """Equal if the counts match and, when `other` has `unknowns`,
        the unknowns match as well. Returns a single bool."""
        return np.array_equal(self, other) and (
            not hasattr(other, "unknowns") or self.unknowns == other.unknowns)

    def __ne__(self, other):
        return not self == other

    def __getitem__(self, index):
        # String indices are translated by the variable into value indices.
        if isinstance(index, str):
            index = self.variable.to_val(index)
        return super().__getitem__(index)

    def __setitem__(self, index, value):
        if isinstance(index, str):
            index = self.variable.to_val(index)
        super().__setitem__(index, value)

    def __hash__(self):
        # tobytes() also works for non-contiguous views, which cannot be
        # passed to adler32 directly; for contiguous arrays the checksum is
        # computed over the same bytes as before.
        return zlib.adler32(self.tobytes()) ^ hash(self.unknowns)

    def __add__(self, other):
        s = super().__add__(other)
        s.unknowns = self.unknowns + getattr(other, "unknowns", 0)
        return s

    def __iadd__(self, other):
        super().__iadd__(other)
        self.unknowns += getattr(other, "unknowns", 0)
        return self

    def __sub__(self, other):
        s = super().__sub__(other)
        s.unknowns = self.unknowns - getattr(other, "unknowns", 0)
        return s

    def __isub__(self, other):
        super().__isub__(other)
        self.unknowns -= getattr(other, "unknowns", 0)
        return self

    def __mul__(self, other):
        """Multiply the counts; a real scalar also scales `unknowns`."""
        s = super().__mul__(other)
        if isinstance(other, Real):
            s.unknowns = self.unknowns * other
        return s

    def __imul__(self, other):
        super().__imul__(other)
        if isinstance(other, Real):
            self.unknowns *= other
        return self

    def __truediv__(self, other):
        """Divide the counts; a real scalar also divides `unknowns`."""
        s = super().__truediv__(other)
        if isinstance(other, Real):
            s.unknowns = self.unknowns / other
        return s

    def __itruediv__(self, other):
        super().__itruediv__(other)
        if isinstance(other, Real):
            self.unknowns /= other
        return self

    # Python 3 dispatches `/` and `/=` to __truediv__ / __itruediv__; the old
    # names are kept as aliases for code that calls them explicitly.
    __div__ = __truediv__
    __idiv__ = __itruediv__

    def normalize(self):
        """
        Scale the counts in place so that they sum to 1, dividing `unknowns`
        by the same total. If the total is not above 1e-6, the distribution
        is made uniform instead and `unknowns` is left unchanged.
        """
        # Work on a plain ndarray view (sharing memory with self) so that
        # the arithmetic does not go through the overridden operators.
        counts = self.view(np.ndarray)
        t = counts.sum()
        if t > 1e-6:
            counts /= t
            self.unknowns /= t
        elif self.shape[0]:
            counts[:] = 1 / self.shape[0]

    def modus(self):
        """Return the most frequent value: a `data.Value` if the variable is
        known, otherwise the index."""
        val = np.argmax(self)
        return data.Value(self.variable,
                          val) if self.variable is not None else val

    def random(self):
        """Return a random value drawn with probability proportional to its
        count: a `data.Value` if the variable is known, otherwise the
        index."""
        counts = self.view(np.ndarray)
        v = random.random() * counts.sum()
        s = i = 0
        for i, e in enumerate(counts):
            s += e
            if s > v:
                break
        return data.Value(self.variable, i) if self.variable is not None else i
|
186
|
|
|
|
|
187
|
|
|
|
|
188
|
|
|
class Continuous(np.ndarray):
    """
    Distribution of a continuous variable: a 2-d array of shape (2, N) in
    which row 0 holds the values and row 1 the corresponding (possibly
    weighted) counts.

    Besides the array data, every instance carries two attributes:
    `variable` (the variable the distribution describes, or `None`) and
    `unknowns` (the count or total weight of missing values).
    """

    def __new__(cls, dat, variable=None, unknowns=None):
        """
        Construct a distribution.

        :param dat: a `data.Storage` to compute the distribution from, an
            `int` (the number of columns of an all-zero distribution), or
            array-like data to copy
        :param variable: the described variable
        :param unknowns: count of missing values; must not be given
            together with a data storage
        :raises TypeError: if `unknowns` is given with a data storage
        """
        if isinstance(dat, data.Storage):
            if unknowns is not None:
                raise TypeError(
                    "incompatible arguments (data storage and 'unknowns'")
            return cls.from_data(variable, dat)
        if isinstance(dat, int):
            self = super().__new__(cls, (2, dat))
            self[:] = 0
            self.unknowns = unknowns or 0
        else:
            if not isinstance(dat, np.ndarray):
                dat = np.asarray(dat)
            self = super().__new__(cls, dat.shape)
            self[:] = dat
            self.unknowns = (unknowns if unknowns is not None
                             else getattr(dat, "unknowns", 0))
        self.variable = variable
        return self

    def __array_finalize__(self, obj):
        # Called by numpy for every new instance, including views, copies
        # and ufunc results; gives them the two extra attributes so that
        # e.g. __hash__ does not fail with AttributeError on a copy.
        self.variable = getattr(obj, "variable", None)
        self.unknowns = getattr(obj, "unknowns", 0)

    @classmethod
    def from_data(cls, variable, data):
        """
        Compute the distribution of `variable` in `data`.

        The storage's own `_compute_distributions` is tried first; if it
        raises `NotImplementedError`, the distribution is built from the
        column of values (and the weights, if the data has them).

        :param variable: the variable, or a key resolving to it
        :param data: the data storage (note: shadows the module `data`)
        """
        variable = _get_variable(data, variable)
        try:
            dist, unknowns = data._compute_distributions([variable])[0]
        except NotImplementedError:
            col = data[:, variable]
            dtype = col.dtype
            if data.has_weights():
                weights = np.asarray(data.W)
                # Promote to the weights' float type when the values are not
                # floats, so that weights are not truncated. (The test used
                # to compare the column dtype with itself and never fired.)
                if "float" not in dtype.name and "float" in weights.dtype.name:
                    dtype = weights.dtype
                dist = np.empty((2, len(col)), dtype=dtype)
                dist[0, :] = col
                dist[1, :] = weights
            else:
                dist = np.ones((2, len(col)), dtype=dtype)
                dist[0, :] = col
            # Order the columns by value (row 0), keeping every weight with
            # its value; sorting along axis 0 would instead swap a value
            # with its weight within a column.
            dist = dist[:, np.argsort(dist[0])]
            # NOTE(review): `_orange` is not imported in the visible part of
            # this file; presumably `valuecount` merges equal values --
            # confirm where it is supposed to come from.
            dist = np.array(_orange.valuecount(dist))
            # NOTE(review): this counts every column removed by valuecount
            # as unknown -- verify that this is the intended definition.
            unknowns = len(col) - dist.shape[1]

        self = super().__new__(cls, dist.shape)
        self[:] = dist
        self.unknowns = unknowns
        self.variable = variable
        return self

    def __eq__(self, other):
        """Equal if the arrays match and, when `other` has `unknowns`,
        the unknowns match as well. Returns a single bool."""
        return np.array_equal(self, other) and (
            not hasattr(other, "unknowns") or self.unknowns == other.unknowns)

    def __ne__(self, other):
        # ndarray's own __ne__ is elementwise; keep != consistent with the
        # scalar result of __eq__.
        return not self == other

    def __hash__(self):
        # tobytes() also works for non-contiguous views, which cannot be
        # passed to adler32 directly.
        return zlib.adler32(self.tobytes()) ^ hash(self.unknowns)

    def normalize(self):
        """
        Scale the weights (row 1) in place so that they sum to 1, dividing
        `unknowns` by the same total. If the total is not above 1e-6, the
        weights are made uniform instead and `unknowns` is left unchanged.
        """
        weights = self.view(np.ndarray)[1]  # shares memory with self
        t = weights.sum()
        if t > 1e-6:
            weights /= t
            self.unknowns /= t
        elif self.shape[1]:
            weights[:] = 1 / self.shape[1]

    def modus(self):
        """Return the value with the largest weight."""
        arr = self.view(np.ndarray)
        return arr[0, np.argmax(arr[1])]

    # TODO implement __getitem__ that will return a normal array, not Continuous

    def min(self):
        """Return the first value (row 0, first column)."""
        return self[0, 0]

    def max(self):
        """Return the last value (row 0, last column)."""
        return self[0, -1]

    def random(self):
        """
        Return a random value drawn with probability proportional to its
        weight. If the running sum never exceeds the drawn threshold
        (rounding, or all weights zero) the last value is returned; an
        empty distribution yields `None`.
        """
        arr = self.view(np.ndarray)
        v = random.random() * arr[1].sum()
        s = 0
        x = None
        for x, prob in zip(arr[0], arr[1]):
            s += prob
            if s > v:
                break
        return x

    def mean(self):
        """Return the weighted mean of the values."""
        arr = self.view(np.ndarray)
        return np.average(arr[0], weights=arr[1])

    def variance(self):
        """Return the weighted (population) variance of the values."""
        arr = self.view(np.ndarray)
        avg = np.average(arr[0], weights=arr[1])
        return np.average((arr[0] - avg) ** 2, weights=arr[1])

    def standard_deviation(self):
        """Return the square root of the variance."""
        return math.sqrt(self.variance())
|
279
|
|
|
|
|
280
|
|
|
|
|
281
|
|
|
|
|
282
|
|
|
def class_distribution(data):
    """
    Return the distribution of the class variable(s) of `data`.

    :param data: data whose `domain` has `class_var` and `class_vars`
    :return: a single distribution if the domain has a `class_var`;
        otherwise a list with one distribution per variable in `class_vars`
    :raises ValueError: if the domain has neither
    """
    domain = data.domain
    if domain.class_var:
        return get_distribution(data, domain.class_var)
    if domain.class_vars:
        # get_distribution takes the data first and the variable second.
        return [get_distribution(data, cls) for cls in domain.class_vars]
    raise ValueError("domain has no class attribute")
|
289
|
|
|
|
|
290
|
|
|
|
|
291
|
|
|
def get_distribution(dat, variable, unknowns=None):
    """
    Build the distribution object matching the type of `variable`.

    :param dat: data passed on to the distribution constructor
    :param variable: the variable, or a key that `_get_variable` can
        resolve against `dat`
    :param unknowns: count of missing values, passed on unchanged
    :return: a `Discrete` or a `Continuous` distribution
    :raises TypeError: if the variable is neither discrete nor continuous
    """
    variable = _get_variable(dat, variable)
    if variable.is_discrete:
        factory = Discrete
    elif variable.is_continuous:
        factory = Continuous
    else:
        kind = type(variable).__name__
        raise TypeError("cannot compute distribution of '%s'" % kind)
    return factory(dat, variable, unknowns)
|
300
|
|
|
|
|
301
|
|
|
|
|
302
|
|
|
def get_distributions(dat, skipDiscrete=False, skipContinuous=False):
    """
    Compute the distributions of the variables in `dat.domain.variables`.

    :param dat: the data; its `_compute_distributions` is used when
        implemented, otherwise each distribution is computed separately
    :param bool skipDiscrete: leave out discrete variables
    :param bool skipContinuous: leave out continuous variables
    :return: a list of distributions, in domain order; empty if both kinds
        are skipped
    """
    variables = dat.domain.variables
    if skipDiscrete and skipContinuous:
        return []
    if skipDiscrete:
        columns = [i for i, var in enumerate(variables) if var.is_continuous]
    elif skipContinuous:
        columns = [i for i, var in enumerate(variables) if var.is_discrete]
    else:
        columns = None  # None asks the storage for all columns

    # Only the storage query belongs in the try block: it is the call that
    # signals a missing fast path with NotImplementedError.
    try:
        dist_unks = dat._compute_distributions(columns)
    except NotImplementedError:
        dist_unks = None

    if columns is None:
        # Plain ints, so that they can be used as domain keys in the
        # fallback below.
        columns = range(len(variables))
    if dist_unks is None:
        return [get_distribution(dat, i) for i in columns]
    return [get_distribution(dist, variables[col], unks)
            for col, (dist, unks) in zip(columns, dist_unks)]
|
324
|
|
|
|
|
325
|
|
|
|
|
326
|
|
|
def get_distributions_for_columns(data, columns):
    """
    Compute the distributions for the given columns.

    :param Orange.data.Table data:
    :param list columns:
        List of column indices into the `data.domain` (indices can be
        :class:`int` or instances of `Orange.data.Variable`)

    """
    domain = data.domain
    # Normalize the columns to int indices.
    indices = []
    for col in columns:
        indices.append(col if isinstance(col, int) else domain.index(col))

    try:
        # Optimized code path: query the table/storage directly.
        dist_unks = data._compute_distributions(indices)
    except NotImplementedError:
        # Default, slow(er) implementation: one column at a time.
        return [get_distribution(data, idx) for idx in indices]

    # dist_unks is a list of (values, unknowns) pairs, one per column.
    result = []
    for idx, (dist, unknown) in zip(indices, dist_unks):
        result.append(get_distribution(dist, domain[idx], unknown))
    return result
|
350
|
|
|
|
This can be caused by one of the following:
1. Missing Dependencies
This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.
2. Missing __init__.py files
This error could also result from missing
`__init__.py` files in your module folders. Make sure that you place one file in each sub-folder.