1 | import numpy as np |
||
0 ignored issues
–
show
|
|||
2 | |||
3 | from Orange.data import DiscreteVariable, Domain |
||
4 | from Orange.data.sql.table import SqlTable |
||
5 | from Orange.statistics import distribution, contingency |
||
6 | from .transformation import Transformation |
||
7 | from . import _discretize |
||
0 ignored issues
–
show
|
|||
8 | |||
9 | __all__ = ["EqualFreq", "EqualWidth", "EntropyMDL", "DomainDiscretizer"] |
||
10 | |||
11 | |||
class Discretizer(Transformation):
    """Value transformer that returns an index of the bin for the given value.
    """
    def __init__(self, variable, points):
        super().__init__(variable)
        # Sorted cut-off thresholds separating the bins.
        self.points = points

    def transform(self, c):
        """Map each value of array `c` to the index of its bin.

        NaN inputs are propagated as NaN (so the result is a float array
        whenever `c` contains NaNs); an empty input yields an empty int
        array.
        """
        if not c.size:
            return np.array([], dtype=int)
        # HB 20151202: numpy 1.10+ needs some points, so np.digitize
        # cannot be called with an empty `points` list.
        if len(self.points):
            bins = np.digitize(c, self.points)
        else:
            # No cut points: every value falls into the single bin 0.
            bins = np.zeros(len(c), dtype=int)
        # np.nan (not the np.NaN alias, which was removed in NumPy 2.0).
        return np.where(np.isnan(c), np.nan, bins)

    @staticmethod
    def _fmt_interval(low, high, decimals):
        """Format one bin's interval as a human-readable label.

        A `None` or infinite bound is treated as unbounded, producing
        "< high" or "≥ low"; otherwise "low - high" is returned. Values
        are rounded to `decimals` places and trailing ".0" is dropped.
        """
        assert low is not None or high is not None
        assert low is None or high is None or low < high
        assert decimals >= 0

        def fmt_value(value):
            # None/±inf mean the interval is unbounded on that side.
            if value is None or np.isinf(value):
                return None
            val = str(round(value, decimals))
            # Integral bounds read more naturally without ".0".
            if val.endswith(".0"):
                return val[:-2]
            return val

        low, high = fmt_value(low), fmt_value(high)
        if not low:
            return "< {}".format(high)
        if not high:
            return "≥ {}".format(low)
        return "{} - {}".format(low, high)

    @classmethod
    def create_discretized_var(cls, var, points):
        """Return a :class:`DiscreteVariable` that discretizes `var`.

        The new variable's values are interval labels derived from
        `points`; its `compute_value` is an instance of this class. When
        `points` is empty the variable has the single value
        "single_value" (i.e. it is constant).
        """
        lpoints = list(points)
        if lpoints:
            values = [
                cls._fmt_interval(low, high, var.number_of_decimals)
                for low, high in zip([-np.inf] + lpoints, lpoints + [np.inf])]
            to_sql = BinSql(var, lpoints)
        else:
            values = ["single_value"]
            to_sql = SingleValueSql(values[0])

        dvar = DiscreteVariable(name=var.name, values=values,
                                compute_value=cls(var, points))
        dvar.source_variable = var
        dvar.to_sql = to_sql
        return dvar
||
68 | |||
69 | |||
class BinSql:
    """Callable building the SQL expression that bins a continuous column
    at fixed cut points (PostgreSQL's `width_bucket`)."""

    def __init__(self, var, points):
        self.var = var
        self.points = points

    def __call__(self):
        column = self.var.to_sql()
        return 'width_bucket({}, ARRAY{}::double precision[])'.format(
            column, self.points)
||
78 | |||
79 | |||
class SingleValueSql:
    """Callable producing the SQL literal for a variable discretized
    into a single constant bin."""

    def __init__(self, value):
        self.value = value

    def __call__(self):
        return "'{}'".format(self.value)
||
86 | |||
87 | |||
class Discretization:
    """Abstract base class for discretization classes."""

    def __call__(self, data, variable):
        """
        Discretize `variable` using statistics gathered from `data`.

        Concrete subclasses return a new variable whose domain
        (:obj:`Orange.data.DiscreteVariable.values`) and transformer
        (:obj:`Orange.data.Variable.compute_value`) encode the binning.
        """
        raise NotImplementedError(
            "Subclasses of 'Discretization' need to implement the call operator")
||
100 | |||
101 | |||
class EqualFreq(Discretization):
    """Discretization into bins with approximately equal number of data
    instances.

    .. attribute:: n

        Number of bins (default: 4). The actual number may be lower if the
        variable has less than n distinct values.
    """
    def __init__(self, n=4):
        self.n = n

    # noinspection PyProtectedMember
    def __call__(self, data, attribute):
        if type(data) == SqlTable:
            # Ask the database for the n-1 interior quantiles directly.
            column = attribute.to_sql()
            fractions = [(i + 1) / self.n for i in range(self.n - 1)]
            query = data._sql_query(
                ['quantile(%s, ARRAY%s)' % (column, str(fractions))])
            with data._execute_sql_query(query) as cur:
                # Deduplicate: repeated quantiles collapse into one cut.
                points = sorted(set(cur.fetchone()[0]))
        else:
            dist = distribution.get_distribution(data, attribute)
            points = _discretize.split_eq_freq(dist, self.n)
        return Discretizer.create_discretized_var(
            data.domain[attribute], points)
||
128 | |||
129 | |||
class EqualWidth(Discretization):
    """Discretization into a fixed number of bins with equal widths.

    .. attribute:: n

        Number of bins (default: 4).
    """
    def __init__(self, n=4):
        self.n = n

    # noinspection PyProtectedMember
    def __call__(self, data, attribute, fixed=None):
        """Discretize `attribute` on `data` into `self.n` equal-width bins.

        :param fixed: optional dict mapping the attribute's name to a
            (min, max) pair used instead of the data's actual range.
        """
        if fixed:
            vmin, vmax = fixed[attribute.name]
            points = self._split_eq_width_fixed(vmin, vmax, n=self.n)
        else:
            if type(data) == SqlTable:
                # Fetch only min/max from the database.
                att = attribute.to_sql()
                query = data._sql_query(['min(%s)::double precision' % att,
                                         'max(%s)::double precision' % att])
                with data._execute_sql_query(query) as cur:
                    vmin, vmax = cur.fetchone()
                points = self._split_eq_width_fixed(vmin, vmax, n=self.n)
            else:
                # TODO: why is the whole distribution computed instead of
                # just min/max
                d = distribution.get_distribution(data, attribute)
                points = self._split_eq_width(d, n=self.n)
        return Discretizer.create_discretized_var(
            data.domain[attribute], points)

    @staticmethod
    def _split_eq_width(dist, n):
        # Take the range from the distribution's support (dist[0] holds
        # the sorted distinct values) and delegate to the common helper.
        return EqualWidth._split_eq_width_fixed(dist[0][0], dist[0][-1], n=n)

    @staticmethod
    def _split_eq_width_fixed(vmin, vmax, n):
        """Return n - 1 equally spaced cut points in (vmin, vmax).

        A degenerate range (vmin == vmax, i.e. a constant column) yields
        no cut points, giving a single bin. Parameters were renamed from
        `min`/`max` to avoid shadowing the builtins.
        """
        if vmin == vmax:
            return []
        step = (vmax - vmin) / n
        return [vmin + (i + 1) * step for i in range(n - 1)]
||
177 | |||
178 | |||
# noinspection PyPep8Naming
class EntropyMDL(Discretization):
    """
    Discretization into bins inferred by recursively splitting the values to
    minimize the class-entropy. The procedure stops when further splits would
    decrease the entropy for less than the corresponding increase of minimal
    description length (MDL). [FayyadIrani93].

    If there are no suitable cut-off points, the procedure returns a single bin,
    which means that the new feature is constant and can be removed.

    .. attribute:: force

        Induce at least one cut-off point, even when its information
        gain is lower than MDL (default: False).

    """
    def __init__(self, force=False):
        self.force = force

    def __call__(self, data, attribute):
        # Contingency of attribute values vs. class; counts are transposed
        # so rows correspond to (sorted) attribute values, columns to classes.
        cont = contingency.get_contingency(data, attribute)
        values, I = cont.values, cont.counts.T
        # Indices into `values` where the MDL criterion accepts a split.
        cut_ind = np.array(self._entropy_discretize_sorted(I, self.force))
        if len(cut_ind) > 0:
            # "the midpoint between each successive pair of examples" (FI p.1)
            points = (values[cut_ind] + values[cut_ind - 1]) / 2.
        else:
            # No acceptable cut points: the discretized variable is constant.
            points = []
        return Discretizer.create_discretized_var(
            data.domain[attribute], points)

    @classmethod
    def _normalize(cls, X, axis=None, out=None):
        """
        Normalize `X` array so it sums to 1.0 over the `axis`.

        Parameters
        ----------
        X : array
            Array to normalize.
        axis : optional int
            Axis over which the resulting array sums to 1.
        out : optional array
            Output array of the same shape as X.
        """
        X = np.asarray(X, dtype=float)
        # keepdims so the scale broadcasts back against X.
        scale = np.sum(X, axis=axis, keepdims=True)
        if out is None:
            return X / scale
        else:
            if out is not X:
                # Copy X into out first so the in-place divide below is valid.
                assert out.shape == X.shape
                out[:] = X
            out /= scale
            return out

    @classmethod
    def _entropy_normalized(cls, D, axis=None):
        """
        Compute the entropy of distribution array `D`.

        `D` must be a distribution (i.e. sum to 1.0 over `axis`)

        Parameters
        ----------
        D : array
            Distribution.
        axis : optional int
            Axis of `D` along which to compute the entropy.

        """
        # req: (np.sum(D, axis=axis) >= 0).all()
        # req: (np.sum(D, axis=axis) <= 1).all()
        # req: np.all(np.abs(np.sum(D, axis=axis) - 1) < 1e-9)

        D = np.asarray(D)
        # Clip zeros to eps before the log so 0 * log(0) contributes 0
        # instead of NaN (the multiplier D is left unclipped).
        Dc = np.clip(D, np.finfo(D.dtype).eps, 1.0)
        return - np.sum(D * np.log2(Dc), axis=axis)

    @classmethod
    def _entropy(cls, D, axis=None):
        """
        Compute the entropy of distribution `D`.

        Parameters
        ----------
        D : array
            Distribution.
        axis : optional int
            Axis of `D` along which to compute the entropy.

        """
        D = cls._normalize(D, axis=axis)
        return cls._entropy_normalized(D, axis=axis)

    @classmethod
    def _entropy1(cls, D):
        """
        Compute the entropy of distributions in `D`
        (one per each row).
        """
        # Delegates to the compiled helper for speed.
        D = cls._normalize(D)
        return _discretize.entropy_normalized1(D)

    @classmethod
    def _entropy2(cls, D):
        """
        Compute the entropy of distributions in `D`
        (one per each row).
        """
        # Row-wise normalization, then the compiled row-wise entropy.
        D = cls._normalize(D, axis=1)
        return _discretize.entropy_normalized2(D)

    @classmethod
    def _entropy_cuts_sorted(cls, CS):
        """
        Return the class information entropy induced by partitioning
        the `CS` distribution at all N-1 candidate cut points.

        Parameters
        ----------
        CS : (N, K) array of class distributions.
        """
        CS = np.asarray(CS)
        # |--|-------|--------|
        #      S1 ^ S2
        # S1 contains all points which are <= to cut point
        # Cumulative distributions for S1 and S2 (left right set)
        # i.e. a cut at index i separates the CS into S1Dist[i] and S2Dist[i]
        S1Dist = np.cumsum(CS, axis=0)[:-1]
        S2Dist = np.cumsum(CS[::-1], axis=0)[-2::-1]

        # Entropy of S1[i] and S2[i] sets
        ES1 = cls._entropy2(S1Dist)
        ES2 = cls._entropy2(S2Dist)

        # Number of cases in S1[i] and S2[i] sets
        S1_count = np.sum(S1Dist, axis=1)
        S2_count = np.sum(S2Dist, axis=1)

        # Number of all cases
        S_count = np.sum(CS)

        # Entropies weighted by the fraction of cases in each side.
        ES1w = ES1 * S1_count / S_count
        ES2w = ES2 * S2_count / S_count

        # E(A, T; S) Class information entropy of the partition S
        E = ES1w + ES2w

        return E, ES1, ES2

    @classmethod
    def _entropy_discretize_sorted(cls, C, force=False):
        """
        Entropy discretization on a sorted C.

        :param C: (N, K) array of class distributions.

        Returns a (possibly empty) sorted list of cut indices into C.
        """
        E, ES1, ES2 = cls._entropy_cuts_sorted(C)
        # TODO: Also get the left right distribution counts from
        # entropy_cuts_sorted,

        # Note the + 1
        if len(E) == 0:
            return []
        # Best candidate cut minimizes the partition entropy; +1 converts
        # the candidate index (between rows i and i+1) to a row index.
        cut_index = np.argmin(E) + 1

        # Distribution of classed in S1, S2 and S
        S1_c = np.sum(C[:cut_index], axis=0)
        S2_c = np.sum(C[cut_index:], axis=0)
        S_c = S1_c + S2_c

        ES = cls._entropy1(np.sum(C, axis=0))
        ES1, ES2 = ES1[cut_index - 1], ES2[cut_index - 1]

        # Information gain of the best split
        Gain = ES - E[cut_index - 1]
        # Number of different classes in S, S1 and S2
        k = float(np.sum(S_c > 0))
        k1 = float(np.sum(S1_c > 0))
        k2 = float(np.sum(S2_c > 0))

        assert k > 0
        # MDL correction term from Fayyad & Irani (1993), Eq. for delta.
        delta = np.log2(3 ** k - 2) - (k * ES - k1 * ES1 - k2 * ES2)
        N = float(np.sum(S_c))

        if Gain > np.log2(N - 1) / N + delta / N:
            # Accept the cut point and recursively split the subsets.
            left, right = [], []
            if k1 > 1 and cut_index > 1:
                left = cls._entropy_discretize_sorted(C[:cut_index, :])
            if k2 > 1 and cut_index < len(C) - 1:
                right = cls._entropy_discretize_sorted(C[cut_index:, :])
            # Right-hand indices are relative to C[cut_index:]; shift them.
            return left + [cut_index] + [i + cut_index for i in right]
        elif force:
            # Caller insists on at least one cut even if MDL rejects it.
            return [cut_index]
        else:
            return []
||
379 | |||
380 | |||
class DomainDiscretizer:
    """Discretizes all continuous features in the data.

    .. attribute:: method

        Feature discretization method (instance of
        :obj:`Orange.preprocess.Discretization`). If `None` (default),
        :class:`Orange.preprocess.EqualFreq` with 4 intervals is
        used.

    .. attribute:: clean

        If `True`, features discretized into a single interval constant are
        removed. This is useful for discretization methods that infer the
        number of intervals from the data, such as
        :class:`Orange.preprocess.EntropyMDL` (default: `True`).

    .. attribute:: discretize_class

        Determines whether a target is also discretized if it is continuous.
        (default: `False`)
    """
    def __new__(cls, data=None,
                discretize_class=False, method=None, clean=True, fixed=None):
        self = super().__new__(cls)
        self.discretize_class = discretize_class
        self.method = method
        self.clean = clean
        if data is None:
            return self
        else:
            # Convenience shortcut: DomainDiscretizer(data, ...) returns
            # the discretized domain directly instead of the instance.
            return self(data, fixed)

    def __call__(self, data, fixed=None):
        """
        Compute and return discretized domain.

        :param data: Data to discretize.
        :param fixed: Optional dict mapping variable names to (min, max)
            pairs to be used as fixed ranges for those variables.
        """
        # Resolve the method up front (previously the closure below
        # referenced `method` before it was bound, relying on call order).
        if self.method is None:
            method = EqualFreq(n=4)
        else:
            method = self.method

        def transform_list(variables, fixed=None):
            # Discretize each continuous variable; pass others through.
            new_vars = []
            for var in variables:
                if var.is_continuous:
                    if fixed and var.name in fixed:
                        nv = method(data, var, fixed)
                    else:
                        nv = method(data, var)
                    # Drop constant (single-interval) features if cleaning.
                    if not self.clean or len(nv.values) > 1:
                        new_vars.append(nv)
                else:
                    new_vars.append(var)
            return new_vars

        domain = data.domain
        new_attrs = transform_list(domain.attributes, fixed)
        if self.discretize_class:
            new_classes = transform_list(domain.class_vars)
        else:
            new_classes = domain.class_vars
        return Domain(new_attrs, new_classes)
||
445 |
This can be caused by one of the following:
1. Missing Dependencies
This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.
2. Missing __init__.py files
This error could also result from missing
__init__.py
files in your module folders. Make sure that you place one file in each sub-folder.