1
|
|
|
import random |
2
|
|
|
import zlib |
3
|
|
|
import math |
4
|
|
|
from numbers import Real |
5
|
|
|
import numpy as np |
|
|
|
|
6
|
|
|
from Orange import data |
7
|
|
|
|
8
|
|
|
|
9
|
|
|
def _get_variable(dat, variable, expected_type=None, expected_name=""):
    """
    Resolve `variable` to the variable descriptor that belongs to `dat`.

    Resolution rules, in order:

    - if `variable` is already a `data.Variable`, it is used as is, after
      checking that it is the same object as `dat.variable` (when `dat`
      has such an attribute and it is not None);
    - otherwise, if `dat` has a `domain`, the result is
      `dat.domain[variable]`;
    - otherwise, if `dat` has a `variable` attribute, that attribute is used;
    - otherwise resolution fails.

    :param dat: object the variable refers to (anything with an optional
        `domain` and/or `variable` attribute)
    :param variable: a `data.Variable` or a key accepted by `dat.domain`
    :param expected_type: optional class that the resolved variable must be
        an instance of
    :param str expected_name: human-readable name of the expected kind of
        variable, used in the error message
    :return: the resolved variable
    :raises ValueError: if the variable does not match `dat.variable`, if it
        cannot be resolved, or if it is not of `expected_type`
    """
    failed = False
    if isinstance(variable, data.Variable):
        datvar = getattr(dat, "variable", None)
        if datvar is not None and datvar is not variable:
            raise ValueError("variable does not match the variable "
                             "in the data")
    elif hasattr(dat, "domain"):
        variable = dat.domain[variable]
    elif hasattr(dat, "variable"):
        variable = dat.variable
    else:
        failed = True

    type_mismatch = (expected_type is not None
                     and not isinstance(variable, expected_type))
    if failed or type_mismatch:
        if isinstance(variable, data.Variable):
            raise ValueError(
                "expected %s variable not %s" % (expected_name, variable))
        # `expected_type` is None when resolution failed without a type
        # constraint; fall back to a generic description instead of
        # dereferencing None.
        if expected_type is None:
            expected = "a variable"
        else:
            expected = getattr(expected_type, "__name__", str(expected_type))
        raise ValueError("expected %s, not '%s'" %
                         (expected, type(variable).__name__))
    return variable
31
|
|
|
|
32
|
|
|
|
33
|
|
|
class Discrete(np.ndarray):
    """
    Distribution of a discrete variable.

    A 1-d array with one element per value. Two extra attributes are
    carried along: `variable` (the variable descriptor, or None) and
    `unknowns` (the count or weight of unknown values).
    """

    def __new__(cls, dat, variable=None, unknowns=None):
        """
        Construct a distribution.

        :param dat: a `data.Storage` (the distribution is then computed by
            :meth:`from_data`), a sequence of per-value counts, or None for
            a zero-filled distribution (which requires `variable`)
        :param variable: variable descriptor or a key resolvable by
            `_get_variable`; when given, its `values` define the length
        :param unknowns: count of unknown values; must be None when `dat`
            is a `data.Storage`
        :raises TypeError: if `unknowns` is given together with a storage
        """
        if isinstance(dat, data.Storage):
            if unknowns is not None:
                raise TypeError(
                    "incompatible arguments (data storage and 'unknowns')")
            return cls.from_data(dat, variable)

        if variable is not None:
            variable = _get_variable(dat, variable)
            n = len(variable.values)
        else:
            n = len(dat)

        self = super().__new__(cls, n)
        self.variable = variable
        if dat is None:
            self[:] = 0
            self.unknowns = unknowns or 0
        else:
            self[:] = dat
            self.unknowns = (unknowns if unknowns is not None
                             else getattr(dat, "unknowns", 0))
        return self

    def __array_finalize__(self, obj):
        """
        Propagate `variable` and `unknowns` to views and operation results.

        NumPy calls this for every new instance; `obj` is None when the
        array is created through `__new__`, which sets the attributes itself.
        """
        if obj is None:
            return
        self.variable = getattr(obj, "variable", None)
        self.unknowns = getattr(obj, "unknowns", 0)

    @classmethod
    def from_data(cls, data, variable):
        """
        Compute the distribution of `variable` in `data`.

        `data._compute_distributions` is tried first; if it raises
        NotImplementedError, the instances are counted one by one.

        :param data: the data storage
        :param variable: variable descriptor or a key resolvable by
            `_get_variable`
        :return: the new distribution
        """
        variable = _get_variable(data, variable)
        try:
            dist, unknowns = data._compute_distributions([variable])[0]
            self = super().__new__(cls, len(dist))
            self[:] = dist
            self.unknowns = unknowns
        except NotImplementedError:
            self = super().__new__(cls, len(variable.values))
            self[:] = 0
            self.unknowns = 0
            # NOTE(review): assumes iterating `data` yields instances that
            # can be indexed by `variable` and that the resulting value
            # converts to an int index (NaN marks an unknown) -- this is
            # what the unweighted branch already relied on.
            if data.has_weights():
                for inst, w in zip(data, data.W):
                    val = inst[variable]
                    if not math.isnan(val):
                        # array indices must be integers, not floats
                        self[int(val)] += w
                    else:
                        self.unknowns += w
            else:
                for inst in data:
                    val = inst[variable]
                    if val == val:  # False only for NaN
                        self[int(val)] += 1
                    else:
                        self.unknowns += 1
        self.variable = variable
        return self

    def __eq__(self, other):
        """Return True if the counts match and, when `other` has
        `unknowns`, the unknowns match too."""
        return np.array_equal(self, other) and (
            not hasattr(other, "unknowns") or self.unknowns == other.unknowns)

    def __ne__(self, other):
        return not self == other

    def __getitem__(self, index):
        """Index by position or, for a string, by the value it denotes
        (translated with `self.variable.to_val`)."""
        if isinstance(index, str):
            index = self.variable.to_val(index)
        return super().__getitem__(index)

    def __setitem__(self, index, value):
        """Assign by position or, for a string, by the value it denotes."""
        if isinstance(index, str):
            index = self.variable.to_val(index)
        super().__setitem__(index, value)

    def __hash__(self):
        return zlib.adler32(self) ^ hash(self.unknowns)

    def __add__(self, other):
        s = super().__add__(other)
        s.unknowns = self.unknowns + getattr(other, "unknowns", 0)
        return s

    def __iadd__(self, other):
        super().__iadd__(other)
        self.unknowns += getattr(other, "unknowns", 0)
        return self

    def __sub__(self, other):
        s = super().__sub__(other)
        s.unknowns = self.unknowns - getattr(other, "unknowns", 0)
        return s

    def __isub__(self, other):
        super().__isub__(other)
        self.unknowns -= getattr(other, "unknowns", 0)
        return self

    def __mul__(self, other):
        """Multiply the counts; a real scalar also scales `unknowns`."""
        s = super().__mul__(other)
        if isinstance(other, Real):
            s.unknowns = self.unknowns * other
        return s

    def __imul__(self, other):
        super().__imul__(other)
        if isinstance(other, Real):
            self.unknowns *= other
        return self

    def __truediv__(self, other):
        """Divide the counts; a real scalar also divides `unknowns`."""
        s = super().__truediv__(other)
        if isinstance(other, Real):
            s.unknowns = self.unknowns / other
        return s

    def __itruediv__(self, other):
        super().__itruediv__(other)
        if isinstance(other, Real):
            self.unknowns /= other
        return self

    # Python 3 never dispatches `/` to __div__/__idiv__; the old names are
    # kept as aliases for code that calls them explicitly.
    __div__ = __truediv__
    __idiv__ = __itruediv__

    def normalize(self):
        """
        Scale the distribution in place so that the counts sum to 1.

        `unknowns` is divided by the same total. If the total is not above
        1e-6, a non-empty distribution becomes uniform instead.
        """
        t = np.asarray(self).sum()
        if t > 1e-6:
            # Use the base-class operator: `unknowns` is scaled explicitly.
            super().__itruediv__(t)
            self.unknowns /= t
        elif self.shape[0]:
            self[:] = 1 / self.shape[0]

    def modus(self):
        """Return the most frequent value: a `data.Value` if `variable` is
        set, otherwise the index."""
        val = np.argmax(np.asarray(self))
        return data.Value(self.variable,
                          val) if self.variable is not None else val

    def random(self):
        """Return a random value drawn with probabilities proportional to
        the counts: a `data.Value` if `variable` is set, otherwise the
        index."""
        counts = np.asarray(self)
        v = random.random() * counts.sum()
        s = i = 0
        for i, e in enumerate(counts):
            s += e
            if s > v:
                break
        return data.Value(self.variable, i) if self.variable is not None else i
186
|
|
|
|
187
|
|
|
|
188
|
|
|
class Continuous(np.ndarray):
    """
    Distribution of a continuous variable.

    A 2-d array of shape (2, N): row 0 holds the values and row 1 their
    counts or weights. Two extra attributes are carried along: `variable`
    (the variable descriptor, or None) and `unknowns` (the count or weight
    of unknown values).
    """

    def __new__(cls, dat, variable=None, unknowns=None):
        """
        Construct a distribution.

        :param dat: a `data.Storage` (the distribution is then computed by
            :meth:`from_data`), an int N (a zero-filled (2, N) distribution
            is created), or array-like data copied as is
        :param variable: variable descriptor stored on the result
        :param unknowns: count of unknown values; must be None when `dat`
            is a `data.Storage`
        :raises TypeError: if `unknowns` is given together with a storage
        """
        if isinstance(dat, data.Storage):
            if unknowns is not None:
                raise TypeError(
                    "incompatible arguments (data storage and 'unknowns')")
            return cls.from_data(variable, dat)
        if isinstance(dat, int):
            self = super().__new__(cls, (2, dat))
            self[:] = 0
            self.unknowns = unknowns or 0
        else:
            if not isinstance(dat, np.ndarray):
                dat = np.asarray(dat)
            self = super().__new__(cls, dat.shape)
            self[:] = dat
            self.unknowns = (unknowns if unknowns is not None
                             else getattr(dat, "unknowns", 0))
        self.variable = variable
        return self

    def __array_finalize__(self, obj):
        """
        Propagate `variable` and `unknowns` to views and operation results.

        NumPy calls this for every new instance; `obj` is None when the
        array is created through `__new__`, which sets the attributes itself.
        """
        if obj is None:
            return
        self.variable = getattr(obj, "variable", None)
        self.unknowns = getattr(obj, "unknowns", 0)

    @staticmethod
    def _value_counts(values, weights=None):
        """
        Count (weighted) occurrences of the distinct values.

        :param values: array-like of numbers; NaN marks an unknown value
        :param weights: array-like of per-value weights, or None for 1's
        :return: a tuple `(dist, unknowns)`: `dist` is a (2, M) array of
            the sorted distinct non-NaN values and their total weights,
            `unknowns` is the total weight of the NaN entries
        """
        values = np.ravel(np.asarray(values, dtype=float))
        if weights is None:
            weights = np.ones(len(values))
        else:
            weights = np.ravel(np.asarray(weights, dtype=float))
        known = ~np.isnan(values)
        unknowns = weights[~known].sum()
        distinct, inverse = np.unique(values[known], return_inverse=True)
        # `inverse` maps each known value to its slot in `distinct`, so
        # the weighted bin counts are the per-value totals.
        totals = np.bincount(np.ravel(inverse), weights=weights[known])
        return np.vstack((distinct, totals)), unknowns

    @classmethod
    def from_data(cls, variable, data):
        """
        Compute the distribution of `variable` in `data`.

        `data._compute_distributions` is tried first; if it raises
        NotImplementedError, the column is counted with NumPy.

        Note the argument order (`variable` first), which differs from
        `Discrete.from_data`.

        :param variable: variable descriptor or a key resolvable by
            `_get_variable`
        :param data: the data storage
        :return: the new distribution
        """
        variable = _get_variable(data, variable)
        try:
            dist, unknowns = data._compute_distributions([variable])[0]
        except NotImplementedError:
            # NOTE(review): assumes `data[:, variable]` is array-like with
            # NaN for unknown values (the previous fallback assigned it
            # into a numeric array too) -- confirm against the storage.
            col = data[:, variable]
            weights = data.W if data.has_weights() else None
            dist, unknowns = cls._value_counts(col, weights)

        self = super().__new__(cls, dist.shape)
        self[:] = dist
        self.unknowns = unknowns
        self.variable = variable
        return self

    def __eq__(self, other):
        """Return True if the arrays match and, when `other` has
        `unknowns`, the unknowns match too."""
        return np.array_equal(self, other) and (
            not hasattr(other, "unknowns") or self.unknowns == other.unknowns)

    def __ne__(self, other):
        # Without this, `!=` would fall back to the element-wise ndarray
        # comparison and disagree with `__eq__`.
        return not self == other

    def __hash__(self):
        return zlib.adler32(self) ^ hash(self.unknowns)

    def normalize(self):
        """
        Scale the weights (row 1) in place so that they sum to 1.

        `unknowns` is divided by the same total. If the total is not above
        1e-6, the weights of a non-empty distribution become uniform.
        """
        weights = np.asarray(self)[1]  # base-class view sharing the data
        t = weights.sum()
        if t > 1e-6:
            weights /= t
            self.unknowns /= t
        elif self.shape[1]:
            weights[:] = 1 / self.shape[1]

    def modus(self):
        """Return the value with the largest weight."""
        arr = np.asarray(self)
        return arr[0, np.argmax(arr[1])]

    # TODO implement __getitem__ that will return a normal array, not Continuous

    def min(self):
        """Return the first value (row 0 is expected to be sorted)."""
        return self[0, 0]

    def max(self):
        """Return the last value (row 0 is expected to be sorted)."""
        return self[0, -1]

    def random(self):
        """
        Return a random value drawn with probabilities proportional to the
        weights.

        If rounding keeps the running sum from exceeding the threshold, the
        last value is returned; an empty distribution yields None.
        """
        arr = np.asarray(self)
        v = random.random() * arr[1].sum()
        s = 0
        x = None
        for x, prob in arr.T:
            s += prob
            if s > v:
                break
        return x

    def mean(self):
        """Return the weighted mean of the values."""
        arr = np.asarray(self)
        return np.average(arr[0], weights=arr[1])

    def variance(self):
        """Return the weighted variance of the values."""
        arr = np.asarray(self)
        avg = self.mean()
        return np.average((arr[0] - avg) ** 2, weights=arr[1])

    def standard_deviation(self):
        """Return the square root of :meth:`variance`."""
        return math.sqrt(self.variance())
279
|
|
|
|
280
|
|
|
|
281
|
|
|
|
282
|
|
|
def class_distribution(data):
    """
    Return the distribution(s) of the class variable(s) of `data`.

    :param data: data whose `domain` provides `class_var` and `class_vars`
    :return: a single distribution if the domain has a `class_var`;
        otherwise a list with one distribution per variable in `class_vars`
    :raises ValueError: if the domain has neither
    """
    domain = data.domain
    if domain.class_var:
        return get_distribution(data, domain.class_var)
    if domain.class_vars:
        # get_distribution takes the data first, then the variable.
        return [get_distribution(data, cls) for cls in domain.class_vars]
    raise ValueError("domain has no class attribute")
289
|
|
|
|
290
|
|
|
|
291
|
|
|
def get_distribution(dat, variable, unknowns=None):
    """
    Build the distribution of `variable`, choosing the class by its type.

    :param dat: data or precomputed counts, passed on to the distribution
        class
    :param variable: variable descriptor or a key resolvable by
        `_get_variable`
    :param unknowns: count of unknown values, passed on unchanged
    :return: a `Discrete` distribution for a discrete variable, a
        `Continuous` one for a continuous variable
    :raises TypeError: if the variable is neither discrete nor continuous
    """
    resolved = _get_variable(dat, variable)
    if resolved.is_discrete:
        return Discrete(dat, resolved, unknowns)
    if resolved.is_continuous:
        return Continuous(dat, resolved, unknowns)
    raise TypeError("cannot compute distribution of '%s'" %
                    type(resolved).__name__)
300
|
|
|
|
301
|
|
|
|
302
|
|
|
def get_distributions(dat, skipDiscrete=False, skipContinuous=False):
    """
    Compute the distributions of the variables in `dat.domain.variables`.

    `dat._compute_distributions` is tried first; if it raises
    NotImplementedError, each distribution is computed separately with
    `get_distribution`.

    :param dat: data with a `domain` and a `_compute_distributions` method
    :param bool skipDiscrete: leave out discrete variables
    :param bool skipContinuous: leave out continuous variables
    :return: list of distributions, in domain order; empty if both kinds
        are skipped
    """
    variables = dat.domain.variables
    if skipDiscrete and skipContinuous:
        return []
    if skipDiscrete:
        columns = [i for i, var in enumerate(variables) if var.is_continuous]
    elif skipContinuous:
        columns = [i for i, var in enumerate(variables) if var.is_discrete]
    else:
        columns = None  # lets the storage compute all columns

    # Plain Python ints, so that the indices behave like the ones produced
    # by the list comprehensions above.
    selected = range(len(variables)) if columns is None else columns
    try:
        # Only this call signals "not implemented"; keep the guard narrow.
        dist_unks = dat._compute_distributions(columns)
    except NotImplementedError:
        return [get_distribution(dat, i) for i in selected]
    return [get_distribution(dist, variables[col], unks)
            for col, (dist, unks) in zip(selected, dist_unks)]
324
|
|
|
|
325
|
|
|
|
326
|
|
|
def get_distributions_for_columns(data, columns):
    """
    Compute the distributions for columns.

    :param Orange.data.Table data:
    :param list columns:
        List of column indices into the `data.domain` (indices can be
        :class:`int` or instances of `Orange.data.Variable`)

    """
    domain = data.domain
    # Normalize the columns to int indices
    indices = []
    for col in columns:
        indices.append(col if isinstance(col, int) else domain.index(col))

    try:
        # Fast path: ask the table/storage directly.
        dist_unks = data._compute_distributions(indices)
    except NotImplementedError:
        # Slow(er) default: one distribution at a time.
        return [get_distribution(data, idx) for idx in indices]

    # dist_unks is a list of (values, unknowns) pairs, one per column
    result = []
    for idx, (dist, unknown) in zip(indices, dist_unks):
        result.append(get_distribution(dist, domain[idx], unknown))
    return result
350
|
|
|
|
This can be caused by one of the following:
1. Missing Dependencies
This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.
2. Missing __init__.py files
This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one such file in each sub-folder.