Issues (4082)

Orange/preprocess/discretize.py (15 issues)

import numpy as np
Issue: The import numpy could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error can indicate a Pylint configuration issue. Make sure your libraries are available by adding the necessary install commands:

# .scrutinizer.yml
before_commands:
    - sudo pip install abc  # Python 2
    - sudo pip3 install abc  # Python 3

Tip: Pylint is currently not run inside a virtualenv, so when installing your modules make sure to use the command for the correct Python version.

2. Missing __init__.py files

This error can also result from missing __init__.py files in your module folders. Make sure you place one such file in each sub-folder; see the layout sketch below.
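For this repository, the expected layout would look roughly like the sketch below (only the path that appears in this report is shown; other packages and modules are omitted):

Orange/
    __init__.py
    preprocess/
        __init__.py
        discretize.py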
from Orange.data import DiscreteVariable, Domain
from Orange.data.sql.table import SqlTable
from Orange.statistics import distribution, contingency
from .transformation import Transformation
from . import _discretize
Issue: The name _discretize does not seem to exist in module Orange.preprocess.
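_discretize is most likely a compiled (Cython/C) extension module, which Pylint cannot resolve through static analysis alone. Assuming a standard Pylint setup, one common workaround is to whitelist the extension package so Pylint is allowed to import it for introspection; the snippet below is an illustration and is not taken from this repository's configuration:

# .pylintrc (sketch)
[MASTER]
extension-pkg-whitelist=Orange.preprocess._discretize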
__all__ = ["EqualFreq", "EqualWidth", "EntropyMDL", "DomainDiscretizer"]


class Discretizer(Transformation):
    """Value transformer that returns an index of the bin for the given value.
    """
    def __init__(self, variable, points):
        super().__init__(variable)
        self.points = points

    def transform(self, c):
        if c.size:
            # HB 20151202: numpy 1.10+ needs some points.
            if len(self.points):
                aa = np.digitize(c, self.points)
            else:
                aa = np.array([0] * len(c))
            return np.where(np.isnan(c), np.NaN, aa)
        else:
            return np.array([], dtype=int)

    @staticmethod
    def _fmt_interval(low, high, decimals):
        assert low is not None or high is not None
        assert low is None or high is None or low < high
        assert decimals >= 0

        def fmt_value(value):
            if value is None or np.isinf(value):
                return None
            val = str(round(value, decimals))
            if val.endswith(".0"):
                return val[:-2]
            return val

        low, high = fmt_value(low), fmt_value(high)
        if not low:
            return "< {}".format(high)
        if not high:
            return "≥ {}".format(low)
        return "{} - {}".format(low, high)

    @classmethod
    def create_discretized_var(cls, var, points):
        lpoints = list(points)
        if lpoints:
            values = [
                cls._fmt_interval(low, high, var.number_of_decimals)
                for low, high in zip([-np.inf] + lpoints, lpoints + [np.inf])]
            to_sql = BinSql(var, lpoints)
        else:
            values = ["single_value"]
            to_sql = SingleValueSql(values[0])

        dvar = DiscreteVariable(name=var.name, values=values,
                                compute_value=cls(var, points))
        dvar.source_variable = var
        dvar.to_sql = to_sql
        return dvar


class BinSql:
    def __init__(self, var, points):
        self.var = var
        self.points = points

    def __call__(self):
        return 'width_bucket(%s, ARRAY%s::double precision[])' % (
            self.var.to_sql(), str(self.points))


class SingleValueSql:
    def __init__(self, value):
        self.value = value

    def __call__(self):
        return "'%s'" % self.value


class Discretization:
    """Abstract base class for discretization classes."""
    def __call__(self, data, variable):
        """
        Compute discretization of the given variable on the given data.
        Return a new variable with the appropriate domain
        (:obj:`Orange.data.DiscreteVariable.values`) and transformer
        (:obj:`Orange.data.Variable.compute_value`).
        """
        raise NotImplementedError(
            "Subclasses of 'Discretization' need to implement "
            "the call operator")


class EqualFreq(Discretization):
    """Discretization into bins with approximately equal number of data
    instances.

    .. attribute:: n

        Number of bins (default: 4). The actual number may be lower if the
        variable has less than n distinct values.
    """
    def __init__(self, n=4):
        self.n = n

    # noinspection PyProtectedMember
    def __call__(self, data, attribute):
        if type(data) == SqlTable:
            att = attribute.to_sql()
            quantiles = [(i + 1) / self.n for i in range(self.n - 1)]
            query = data._sql_query(
Coding Style / Best Practice: It seems like _sql_query was declared protected and should not be accessed from this context.

Prefixing a member with an underscore is usually regarded as the equivalent of declaring it with the protected visibility that exists in other languages. Consequently, such a member should only be accessed from the same class or a child class:

class MyParent:
    def __init__(self):
        self._x = 1
        self.y = 2

class MyChild(MyParent):
    def some_method(self):
        return self._x    # OK, since it is accessed from a child class

class AnotherClass:
    def some_method(self, instance_of_my_child):
        return instance_of_my_child._x   # Would be flagged, as AnotherClass is not
                                         # a child class of MyParent
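If this access is intentional (SqlTable is part of the same project), one option is to suppress the check locally with a Pylint pragma; protected-access is the message name behind this warning. A sketch based on the flagged line:

            query = data._sql_query(  # pylint: disable=protected-access
                ['quantile(%s, ARRAY%s)' % (att, str(quantiles))])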
                ['quantile(%s, ARRAY%s)' % (att, str(quantiles))])
            with data._execute_sql_query(query) as cur:
Coding Style / Best Practice: It seems like _execute_sql_query was declared protected and should not be accessed from this context (see the explanation above).
                points = sorted(set(cur.fetchone()[0]))
        else:
            d = distribution.get_distribution(data, attribute)
            points = _discretize.split_eq_freq(d, self.n)
        return Discretizer.create_discretized_var(
            data.domain[attribute], points)


class EqualWidth(Discretization):
    """Discretization into a fixed number of bins with equal widths.

    .. attribute:: n

        Number of bins (default: 4).
    """
    def __init__(self, n=4):
        self.n = n

    # noinspection PyProtectedMember
    def __call__(self, data, attribute, fixed=None):
Issue: The number of arguments differs from the overridden '__call__' method (see the sketch below).
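A minimal sketch of what triggers this warning (signatures copied from this file, bodies elided): the override adds a parameter that the base class's call operator does not declare.

class Discretization:
    def __call__(self, data, variable):
        ...

class EqualWidth(Discretization):
    def __call__(self, data, attribute, fixed=None):  # extra 'fixed' parameter
        ...

Code written against the Discretization interface never passes fixed; one possible remedy (an assumption, not a change made in this file) is to add the same optional parameter to the base signature.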
        if fixed:
            min, max = fixed[attribute.name]
Bug / Best Practice: This seems to re-define the built-ins min and max.

It is generally discouraged to redefine built-ins, as this makes code very hard to read (see the illustration below).
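A minimal, self-contained illustration (hypothetical values, not taken from discretize.py) of why shadowing built-ins hurts readability:

values = [3.5, 1.2, 2.8]
min = values[0]           # 'min' now names a float and shadows the built-in function
# ... later in the same scope ...
smallest = min(values)    # TypeError: 'float' object is not callable

Renaming such locals (for example to min_/max_ or lo/hi) keeps the built-ins usable and the intent clearer.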
            points = self._split_eq_width_fixed(min, max, n=self.n)
        else:
            if type(data) == SqlTable:
                att = attribute.to_sql()
                query = data._sql_query(['min(%s)::double precision' % att,
Coding Style / Best Practice: It seems like _sql_query was declared protected and should not be accessed from this context (see the explanation above).
                                         'max(%s)::double precision' % att])
                with data._execute_sql_query(query) as cur:
Coding Style / Best Practice: It seems like _execute_sql_query was declared protected and should not be accessed from this context (see the explanation above).
                    min, max = cur.fetchone()
                dif = (max - min) / self.n
                points = [min + (i + 1) * dif for i in range(self.n - 1)]
            else:
                # TODO: why is the whole distribution computed instead of
Issue: TODO and FIXME comments should generally be avoided.
                # just min/max
                d = distribution.get_distribution(data, attribute)
                points = self._split_eq_width(d, n=self.n)
        return Discretizer.create_discretized_var(
            data.domain[attribute], points)

    @staticmethod
    def _split_eq_width(dist, n):
        min = dist[0][0]
Bug / Best Practice: This seems to re-define the built-in min (see the explanation above).
        max = dist[0][-1]
Bug / Best Practice: This seems to re-define the built-in max (see the explanation above).
        if min == max:
            return []
        dif = (max - min) / n
        return [min + (i + 1) * dif for i in range(n - 1)]

    @staticmethod
    def _split_eq_width_fixed(min, max, n):
Bug / Best Practice: The parameters min and max re-define the built-ins of the same name (see the explanation above).
        if min == max:
            return []
        dif = (max - min) / n
        return [min + (i + 1) * dif for i in range(n - 1)]


# noinspection PyPep8Naming
class EntropyMDL(Discretization):
    """
    Discretization into bins inferred by recursively splitting the values to
    minimize the class-entropy. The procedure stops when further splits would
    decrease the entropy for less than the corresponding increase of minimal
    description length (MDL). [FayyadIrani93].

    If there are no suitable cut-off points, the procedure returns a single bin,
    which means that the new feature is constant and can be removed.

    .. attribute:: force

        Induce at least one cut-off point, even when its information
        gain is lower than MDL (default: False).

    """
    def __init__(self, force=False):
        self.force = force

    def __call__(self, data, attribute):
        cont = contingency.get_contingency(data, attribute)
        values, I = cont.values, cont.counts.T
        cut_ind = np.array(self._entropy_discretize_sorted(I, self.force))
        if len(cut_ind) > 0:
            # "the midpoint between each successive pair of examples" (FI p.1)
            points = (values[cut_ind] + values[cut_ind - 1]) / 2.
        else:
            points = []
        return Discretizer.create_discretized_var(
            data.domain[attribute], points)

    @classmethod
    def _normalize(cls, X, axis=None, out=None):
        """
        Normalize `X` array so it sums to 1.0 over the `axis`.

        Parameters
        ----------
        X : array
            Array to normalize.
        axis : optional int
            Axis over which the resulting array sums to 1.
        out : optional array
            Output array of the same shape as X.
        """
        X = np.asarray(X, dtype=float)
        scale = np.sum(X, axis=axis, keepdims=True)
        if out is None:
            return X / scale
        else:
            if out is not X:
                assert out.shape == X.shape
                out[:] = X
            out /= scale
            return out

    @classmethod
    def _entropy_normalized(cls, D, axis=None):
        """
        Compute the entropy of distribution array `D`.

        `D` must be a distribution (i.e. sum to 1.0 over `axis`)

        Parameters
        ----------
        D : array
            Distribution.
        axis : optional int
            Axis of `D` along which to compute the entropy.

        """
        # req: (np.sum(D, axis=axis) >= 0).all()
        # req: (np.sum(D, axis=axis) <= 1).all()
        # req: np.all(np.abs(np.sum(D, axis=axis) - 1) < 1e-9)

        D = np.asarray(D)
        Dc = np.clip(D, np.finfo(D.dtype).eps, 1.0)
        return - np.sum(D * np.log2(Dc), axis=axis)

    @classmethod
    def _entropy(cls, D, axis=None):
        """
        Compute the entropy of distribution `D`.

        Parameters
        ----------
        D : array
            Distribution.
        axis : optional int
            Axis of `D` along which to compute the entropy.

        """
        D = cls._normalize(D, axis=axis)
        return cls._entropy_normalized(D, axis=axis)

    @classmethod
    def _entropy1(cls, D):
        """
        Compute the entropy of distributions in `D`
        (one per each row).
        """
        D = cls._normalize(D)
        return _discretize.entropy_normalized1(D)

    @classmethod
    def _entropy2(cls, D):
        """
        Compute the entropy of distributions in `D`
        (one per each row).
        """
        D = cls._normalize(D, axis=1)
        return _discretize.entropy_normalized2(D)

    @classmethod
    def _entropy_cuts_sorted(cls, CS):
        """
        Return the class information entropy induced by partitioning
        the `CS` distribution at all N-1 candidate cut points.

        Parameters
        ----------
        CS : (N, K) array of class distributions.
        """
        CS = np.asarray(CS)
        # |--|-------|--------|
        #  S1    ^       S2
        # S1 contains all points which are <= to cut point
        # Cumulative distributions for S1 and S2 (left right set)
        # i.e. a cut at index i separates the CS into S1Dist[i] and S2Dist[i]
        S1Dist = np.cumsum(CS, axis=0)[:-1]
        S2Dist = np.cumsum(CS[::-1], axis=0)[-2::-1]

        # Entropy of S1[i] and S2[i] sets
        ES1 = cls._entropy2(S1Dist)
        ES2 = cls._entropy2(S2Dist)

        # Number of cases in S1[i] and S2[i] sets
        S1_count = np.sum(S1Dist, axis=1)
        S2_count = np.sum(S2Dist, axis=1)

        # Number of all cases
        S_count = np.sum(CS)

        ES1w = ES1 * S1_count / S_count
        ES2w = ES2 * S2_count / S_count

        # E(A, T; S) Class information entropy of the partition S
        E = ES1w + ES2w

        return E, ES1, ES2

    @classmethod
    def _entropy_discretize_sorted(cls, C, force=False):
        """
        Entropy discretization on a sorted C.

        :param C: (N, K) array of class distributions.

        """
        E, ES1, ES2 = cls._entropy_cuts_sorted(C)
        # TODO: Also get the left right distribution counts from
Issue: TODO and FIXME comments should generally be avoided.
        # entropy_cuts_sorted,

        # Note the + 1
        if len(E) == 0:
            return []
        cut_index = np.argmin(E) + 1

        # Distribution of classed in S1, S2 and S
        S1_c = np.sum(C[:cut_index], axis=0)
        S2_c = np.sum(C[cut_index:], axis=0)
        S_c = S1_c + S2_c

        ES = cls._entropy1(np.sum(C, axis=0))
        ES1, ES2 = ES1[cut_index - 1], ES2[cut_index - 1]

        # Information gain of the best split
        Gain = ES - E[cut_index - 1]
        # Number of different classes in S, S1 and S2
        k = float(np.sum(S_c > 0))
        k1 = float(np.sum(S1_c > 0))
        k2 = float(np.sum(S2_c > 0))

        assert k > 0
        delta = np.log2(3 ** k - 2) - (k * ES - k1 * ES1 - k2 * ES2)
        N = float(np.sum(S_c))

        if Gain > np.log2(N - 1) / N + delta / N:
            # Accept the cut point and recursively split the subsets.
            left, right = [], []
            if k1 > 1 and cut_index > 1:
                left = cls._entropy_discretize_sorted(C[:cut_index, :])
            if k2 > 1 and cut_index < len(C) - 1:
                right = cls._entropy_discretize_sorted(C[cut_index:, :])
            return left + [cut_index] + [i + cut_index for i in right]
        elif force:
            return [cut_index]
        else:
            return []


class DomainDiscretizer:
    """Discretizes all continuous features in the data.

    .. attribute:: method

        Feature discretization method (instance of
        :obj:`Orange.preprocess.Discretization`). If `None` (default),
        :class:`Orange.preprocess.EqualFreq` with 4 intervals is
        used.

    .. attribute:: clean

        If `True`, features discretized into a single interval constant are
        removed. This is useful for discretization methods that infer the
        number of intervals from the data, such as
        :class:`Orange.preprocess.EntropyMDL` (default: `True`).

    .. attribute:: discretize_class

        Determines whether a target is also discretized if it is continuous.
        (default: `False`)
    """
    def __new__(cls, data=None,
                discretize_class=False, method=None, clean=True, fixed=None):
        self = super().__new__(cls)
        self.discretize_class = discretize_class
        self.method = method
        self.clean = clean
        if data is None:
            return self
        else:
            return self(data, fixed)

    def __call__(self, data, fixed=None):
        """
        Compute and return discretized domain.

        :param data: Data to discretize.
        """

        def transform_list(s, fixed=None):
            new_vars = []
            for var in s:
                if var.is_continuous:
                    if fixed and var.name in fixed.keys():
                        nv = method(data, var, fixed)
                    else:
                        nv = method(data, var)
                    if not self.clean or len(nv.values) > 1:
                        new_vars.append(nv)
                else:
                    new_vars.append(var)
            return new_vars
        if self.method is None:
            method = EqualFreq(n=4)
        else:
            method = self.method
        domain = data.domain
        new_attrs = transform_list(domain.attributes, fixed)
        if self.discretize_class:
            new_classes = transform_list(domain.class_vars)
        else:
            new_classes = domain.class_vars
        return Domain(new_attrs, new_classes)
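For reference, a short usage sketch of the classes in this file (a sketch, assuming Orange is installed and its bundled "iris" dataset is available; Table and Table.from_table come from Orange's public API and are not defined in this file):

from Orange.data import Table
from Orange.preprocess.discretize import EqualFreq, DomainDiscretizer

data = Table("iris")                  # any table with continuous features
var = data.domain.attributes[0]       # a continuous variable

# Discretize a single variable into (approximately) equal-frequency bins;
# the result is a DiscreteVariable whose compute_value is a Discretizer.
dvar = EqualFreq(n=4)(data, var)
print(dvar.values)                    # interval labels built by Discretizer._fmt_interval

# Discretize every continuous feature at once: DomainDiscretizer called with
# data returns a new Domain (see __new__ above), which can then be applied.
new_domain = DomainDiscretizer(data, method=EqualFreq(n=4))
discretized = Table.from_table(new_domain, data)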