Issues (4082)

Orange/preprocess/discretize.py (15 issues)

import numpy as np
Issue: The import numpy could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error can indicate a Pylint configuration issue. Make sure your libraries are available by adding the necessary install commands:

# .scrutinizer.yml
before_commands:
    - sudo pip install abc  # Python 2
    - sudo pip3 install abc  # Python 3

Tip: Pylint is currently not run inside a virtualenv, so when installing your modules make sure to use the command for the correct Python version.

2. Missing __init__.py files

This error can also result from missing __init__.py files in your module folders. Make sure you place one such file in each sub-folder; see the layout sketch below.
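For this repository, the expected layout would look roughly like the sketch below (only the path that appears in this report is shown; other packages and modules are omitted):

Orange/
    __init__.py
    preprocess/
        __init__.py
        discretize.py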
from Orange.data import DiscreteVariable, Domain
from Orange.data.sql.table import SqlTable
from Orange.statistics import distribution, contingency
from .transformation import Transformation
from . import _discretize
Issue: The name _discretize does not seem to exist in module Orange.preprocess.
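_discretize is most likely a compiled (Cython/C) extension module, which Pylint cannot resolve through static analysis alone. Assuming a standard Pylint setup, one common workaround is to whitelist the extension package so Pylint is allowed to import it for introspection; the snippet below is an illustration and is not taken from this repository's configuration:

# .pylintrc (sketch)
[MASTER]
extension-pkg-whitelist=Orange.preprocess._discretize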
__all__ = ["EqualFreq", "EqualWidth", "EntropyMDL", "DomainDiscretizer"]


class Discretizer(Transformation):
    """Value transformer that returns an index of the bin for the given value.
    """
    def __init__(self, variable, points):
        super().__init__(variable)
        self.points = points

    def transform(self, c):
        if c.size:
            # HB 20151202: numpy 1.10+ needs some points.
            if len(self.points):
                aa = np.digitize(c, self.points)
            else:
                aa = np.array([0] * len(c))
            return np.where(np.isnan(c), np.NaN, aa)
        else:
            return np.array([], dtype=int)

    @staticmethod
    def _fmt_interval(low, high, decimals):
        assert low is not None or high is not None
        assert low is None or high is None or low < high
        assert decimals >= 0

        def fmt_value(value):
            if value is None or np.isinf(value):
                return None
            val = str(round(value, decimals))
            if val.endswith(".0"):
                return val[:-2]
            return val

        low, high = fmt_value(low), fmt_value(high)
        if not low:
            return "< {}".format(high)
        if not high:
            return "≥ {}".format(low)
        return "{} - {}".format(low, high)

    @classmethod
    def create_discretized_var(cls, var, points):
        lpoints = list(points)
        if lpoints:
            values = [
                cls._fmt_interval(low, high, var.number_of_decimals)
                for low, high in zip([-np.inf] + lpoints, lpoints + [np.inf])]
            to_sql = BinSql(var, lpoints)
        else:
            values = ["single_value"]
            to_sql = SingleValueSql(values[0])

        dvar = DiscreteVariable(name=var.name, values=values,
                                compute_value=cls(var, points))
        dvar.source_variable = var
        dvar.to_sql = to_sql
        return dvar


class BinSql:
    def __init__(self, var, points):
        self.var = var
        self.points = points

    def __call__(self):
        return 'width_bucket(%s, ARRAY%s::double precision[])' % (
            self.var.to_sql(), str(self.points))


class SingleValueSql:
    def __init__(self, value):
        self.value = value

    def __call__(self):
        return "'%s'" % self.value


class Discretization:
    """Abstract base class for discretization classes."""
    def __call__(self, data, variable):
        """
        Compute discretization of the given variable on the given data.
        Return a new variable with the appropriate domain
        (:obj:`Orange.data.DiscreteVariable.values`) and transformer
        (:obj:`Orange.data.Variable.compute_value`).
        """
        raise NotImplementedError(
            "Subclasses of 'Discretization' need to implement "
            "the call operator")


class EqualFreq(Discretization):
    """Discretization into bins with approximately equal number of data
    instances.

    .. attribute:: n

        Number of bins (default: 4). The actual number may be lower if the
        variable has less than n distinct values.
    """
    def __init__(self, n=4):
        self.n = n

    # noinspection PyProtectedMember
    def __call__(self, data, attribute):
        if type(data) == SqlTable:
            att = attribute.to_sql()
            quantiles = [(i + 1) / self.n for i in range(self.n - 1)]
            query = data._sql_query(
Coding Style / Best Practice: It seems like _sql_query was declared protected and should not be accessed from this context.

Prefixing a member with an underscore is usually regarded as the equivalent of declaring it with the protected visibility that exists in other languages. Consequently, such a member should only be accessed from the same class or a child class:

class MyParent:
    def __init__(self):
        self._x = 1
        self.y = 2

class MyChild(MyParent):
    def some_method(self):
        return self._x    # OK, since it is accessed from a child class

class AnotherClass:
    def some_method(self, instance_of_my_child):
        return instance_of_my_child._x   # Would be flagged, as AnotherClass is not
                                         # a child class of MyParent
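If this access is intentional (SqlTable is part of the same project), one option is to suppress the check locally with a Pylint pragma; protected-access is the message name behind this warning. A sketch based on the flagged line:

            query = data._sql_query(  # pylint: disable=protected-access
                ['quantile(%s, ARRAY%s)' % (att, str(quantiles))])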
                ['quantile(%s, ARRAY%s)' % (att, str(quantiles))])
            with data._execute_sql_query(query) as cur:
Coding Style / Best Practice: It seems like _execute_sql_query was declared protected and should not be accessed from this context (see the explanation above).
                points = sorted(set(cur.fetchone()[0]))
        else:
            d = distribution.get_distribution(data, attribute)
            points = _discretize.split_eq_freq(d, self.n)
        return Discretizer.create_discretized_var(
            data.domain[attribute], points)


class EqualWidth(Discretization):
    """Discretization into a fixed number of bins with equal widths.

    .. attribute:: n

        Number of bins (default: 4).
    """
    def __init__(self, n=4):
        self.n = n

    # noinspection PyProtectedMember
    def __call__(self, data, attribute, fixed=None):
Issue: The number of arguments differs from the overridden '__call__' method (see the sketch below).
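A minimal sketch of what triggers this warning (signatures copied from this file, bodies elided): the override adds a parameter that the base class's call operator does not declare.

class Discretization:
    def __call__(self, data, variable):
        ...

class EqualWidth(Discretization):
    def __call__(self, data, attribute, fixed=None):  # extra 'fixed' parameter
        ...

Code written against the Discretization interface never passes fixed; one possible remedy (an assumption, not a change made in this file) is to add the same optional parameter to the base signature.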
        if fixed:
            min, max = fixed[attribute.name]
Bug / Best Practice: This seems to re-define the built-ins min and max.

It is generally discouraged to redefine built-ins, as this makes code very hard to read (see the illustration below).
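A minimal, self-contained illustration (hypothetical values, not taken from discretize.py) of why shadowing built-ins hurts readability:

values = [3.5, 1.2, 2.8]
min = values[0]           # 'min' now names a float and shadows the built-in function
# ... later in the same scope ...
smallest = min(values)    # TypeError: 'float' object is not callable

Renaming such locals (for example to min_/max_ or lo/hi) keeps the built-ins usable and the intent clearer.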
            points = self._split_eq_width_fixed(min, max, n=self.n)
        else:
            if type(data) == SqlTable:
                att = attribute.to_sql()
                query = data._sql_query(['min(%s)::double precision' % att,
Coding Style / Best Practice: It seems like _sql_query was declared protected and should not be accessed from this context (see the explanation above).
                                         'max(%s)::double precision' % att])
                with data._execute_sql_query(query) as cur:
Coding Style / Best Practice: It seems like _execute_sql_query was declared protected and should not be accessed from this context (see the explanation above).
                    min, max = cur.fetchone()
                dif = (max - min) / self.n
                points = [min + (i + 1) * dif for i in range(self.n - 1)]
            else:
                # TODO: why is the whole distribution computed instead of
Issue: TODO and FIXME comments should generally be avoided.
                # just min/max
                d = distribution.get_distribution(data, attribute)
                points = self._split_eq_width(d, n=self.n)
        return Discretizer.create_discretized_var(
            data.domain[attribute], points)

    @staticmethod
    def _split_eq_width(dist, n):
        min = dist[0][0]
Bug / Best Practice: This seems to re-define the built-in min (see the explanation above).
        max = dist[0][-1]
Bug / Best Practice: This seems to re-define the built-in max (see the explanation above).
        if min == max:
            return []
        dif = (max - min) / n
        return [min + (i + 1) * dif for i in range(n - 1)]

    @staticmethod
    def _split_eq_width_fixed(min, max, n):
Bug / Best Practice: The parameters min and max re-define the built-ins of the same name (see the explanation above).
        if min == max:
            return []
        dif = (max - min) / n
        return [min + (i + 1) * dif for i in range(n - 1)]


# noinspection PyPep8Naming
class EntropyMDL(Discretization):
    """
    Discretization into bins inferred by recursively splitting the values to
    minimize the class-entropy. The procedure stops when further splits would
    decrease the entropy for less than the corresponding increase of minimal
    description length (MDL). [FayyadIrani93].

    If there are no suitable cut-off points, the procedure returns a single bin,
    which means that the new feature is constant and can be removed.

    .. attribute:: force

        Induce at least one cut-off point, even when its information
        gain is lower than MDL (default: False).

    """
    def __init__(self, force=False):
        self.force = force

    def __call__(self, data, attribute):
        cont = contingency.get_contingency(data, attribute)
        values, I = cont.values, cont.counts.T
        cut_ind = np.array(self._entropy_discretize_sorted(I, self.force))
        if len(cut_ind) > 0:
            # "the midpoint between each successive pair of examples" (FI p.1)
            points = (values[cut_ind] + values[cut_ind - 1]) / 2.
        else:
            points = []
        return Discretizer.create_discretized_var(
            data.domain[attribute], points)

    @classmethod
    def _normalize(cls, X, axis=None, out=None):
        """
        Normalize `X` array so it sums to 1.0 over the `axis`.

        Parameters
        ----------
        X : array
            Array to normalize.
        axis : optional int
            Axis over which the resulting array sums to 1.
        out : optional array
            Output array of the same shape as X.
        """
        X = np.asarray(X, dtype=float)
        scale = np.sum(X, axis=axis, keepdims=True)
        if out is None:
            return X / scale
        else:
            if out is not X:
                assert out.shape == X.shape
                out[:] = X
            out /= scale
            return out

    @classmethod
    def _entropy_normalized(cls, D, axis=None):
        """
        Compute the entropy of distribution array `D`.

        `D` must be a distribution (i.e. sum to 1.0 over `axis`)

        Parameters
        ----------
        D : array
            Distribution.
        axis : optional int
            Axis of `D` along which to compute the entropy.

        """
        # req: (np.sum(D, axis=axis) >= 0).all()
        # req: (np.sum(D, axis=axis) <= 1).all()
        # req: np.all(np.abs(np.sum(D, axis=axis) - 1) < 1e-9)

        D = np.asarray(D)
        Dc = np.clip(D, np.finfo(D.dtype).eps, 1.0)
        return - np.sum(D * np.log2(Dc), axis=axis)

    @classmethod
    def _entropy(cls, D, axis=None):
        """
        Compute the entropy of distribution `D`.

        Parameters
        ----------
        D : array
            Distribution.
        axis : optional int
            Axis of `D` along which to compute the entropy.

        """
        D = cls._normalize(D, axis=axis)
        return cls._entropy_normalized(D, axis=axis)

    @classmethod
    def _entropy1(cls, D):
        """
        Compute the entropy of distributions in `D`
        (one per each row).
        """
        D = cls._normalize(D)
        return _discretize.entropy_normalized1(D)

    @classmethod
    def _entropy2(cls, D):
        """
        Compute the entropy of distributions in `D`
        (one per each row).
        """
        D = cls._normalize(D, axis=1)
        return _discretize.entropy_normalized2(D)

    @classmethod
    def _entropy_cuts_sorted(cls, CS):
        """
        Return the class information entropy induced by partitioning
        the `CS` distribution at all N-1 candidate cut points.

        Parameters
        ----------
        CS : (N, K) array of class distributions.
        """
        CS = np.asarray(CS)
        # |--|-------|--------|
        #  S1    ^       S2
        # S1 contains all points which are <= to cut point
        # Cumulative distributions for S1 and S2 (left right set)
        # i.e. a cut at index i separates the CS into S1Dist[i] and S2Dist[i]
        S1Dist = np.cumsum(CS, axis=0)[:-1]
        S2Dist = np.cumsum(CS[::-1], axis=0)[-2::-1]

        # Entropy of S1[i] and S2[i] sets
        ES1 = cls._entropy2(S1Dist)
        ES2 = cls._entropy2(S2Dist)

        # Number of cases in S1[i] and S2[i] sets
        S1_count = np.sum(S1Dist, axis=1)
        S2_count = np.sum(S2Dist, axis=1)

        # Number of all cases
        S_count = np.sum(CS)

        ES1w = ES1 * S1_count / S_count
        ES2w = ES2 * S2_count / S_count

        # E(A, T; S) Class information entropy of the partition S
        E = ES1w + ES2w

        return E, ES1, ES2

    @classmethod
    def _entropy_discretize_sorted(cls, C, force=False):
        """
        Entropy discretization on a sorted C.

        :param C: (N, K) array of class distributions.

        """
        E, ES1, ES2 = cls._entropy_cuts_sorted(C)
        # TODO: Also get the left right distribution counts from
Issue: TODO and FIXME comments should generally be avoided.
        # entropy_cuts_sorted,

        # Note the + 1
        if len(E) == 0:
            return []
        cut_index = np.argmin(E) + 1

        # Distribution of classed in S1, S2 and S
        S1_c = np.sum(C[:cut_index], axis=0)
        S2_c = np.sum(C[cut_index:], axis=0)
        S_c = S1_c + S2_c

        ES = cls._entropy1(np.sum(C, axis=0))
        ES1, ES2 = ES1[cut_index - 1], ES2[cut_index - 1]

        # Information gain of the best split
        Gain = ES - E[cut_index - 1]
        # Number of different classes in S, S1 and S2
        k = float(np.sum(S_c > 0))
        k1 = float(np.sum(S1_c > 0))
        k2 = float(np.sum(S2_c > 0))

        assert k > 0
        delta = np.log2(3 ** k - 2) - (k * ES - k1 * ES1 - k2 * ES2)
        N = float(np.sum(S_c))

        if Gain > np.log2(N - 1) / N + delta / N:
            # Accept the cut point and recursively split the subsets.
            left, right = [], []
            if k1 > 1 and cut_index > 1:
                left = cls._entropy_discretize_sorted(C[:cut_index, :])
            if k2 > 1 and cut_index < len(C) - 1:
                right = cls._entropy_discretize_sorted(C[cut_index:, :])
            return left + [cut_index] + [i + cut_index for i in right]
        elif force:
            return [cut_index]
        else:
            return []


class DomainDiscretizer:
    """Discretizes all continuous features in the data.

    .. attribute:: method

        Feature discretization method (instance of
        :obj:`Orange.preprocess.Discretization`). If `None` (default),
        :class:`Orange.preprocess.EqualFreq` with 4 intervals is
        used.

    .. attribute:: clean

        If `True`, features discretized into a single interval constant are
        removed. This is useful for discretization methods that infer the
        number of intervals from the data, such as
        :class:`Orange.preprocess.EntropyMDL` (default: `True`).

    .. attribute:: discretize_class

        Determines whether a target is also discretized if it is continuous.
        (default: `False`)
    """
    def __new__(cls, data=None,
                discretize_class=False, method=None, clean=True, fixed=None):
        self = super().__new__(cls)
        self.discretize_class = discretize_class
        self.method = method
        self.clean = clean
        if data is None:
            return self
        else:
            return self(data, fixed)

    def __call__(self, data, fixed=None):
        """
        Compute and return discretized domain.

        :param data: Data to discretize.
        """

        def transform_list(s, fixed=None):
            new_vars = []
            for var in s:
                if var.is_continuous:
                    if fixed and var.name in fixed.keys():
                        nv = method(data, var, fixed)
                    else:
                        nv = method(data, var)
                    if not self.clean or len(nv.values) > 1:
                        new_vars.append(nv)
                else:
                    new_vars.append(var)
            return new_vars
        if self.method is None:
            method = EqualFreq(n=4)
        else:
            method = self.method
        domain = data.domain
        new_attrs = transform_list(domain.attributes, fixed)
        if self.discretize_class:
            new_classes = transform_list(domain.class_vars)
        else:
            new_classes = domain.class_vars
        return Domain(new_attrs, new_classes)
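For reference, a short usage sketch of the classes in this file (a sketch, assuming Orange is installed and its bundled "iris" dataset is available; Table and Table.from_table come from Orange's public API and are not defined in this file):

from Orange.data import Table
from Orange.preprocess.discretize import EqualFreq, DomainDiscretizer

data = Table("iris")                  # any table with continuous features
var = data.domain.attributes[0]       # a continuous variable

# Discretize a single variable into (approximately) equal-frequency bins;
# the result is a DiscreteVariable whose compute_value is a Discretizer.
dvar = EqualFreq(n=4)(data, var)
print(dvar.values)                    # interval labels built by Discretizer._fmt_interval

# Discretize every continuous feature at once: DomainDiscretizer called with
# data returns a new Domain (see __new__ above), which can then be applied.
new_domain = DomainDiscretizer(data, method=EqualFreq(n=4))
discretized = Table.from_table(new_domain, data)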