GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to renew the access token on this website.

Issues (4082)

Orange/statistics/distribution.py (11 issues)

1
import random
2
import zlib
3
import math
4
from numbers import Real
5
import numpy as np
0 ignored issues
show
The import numpy could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint; when installing your modules, make sure to use the command for the correct Python version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
6
from Orange import data
7
8
9
def _get_variable(dat, variable, expected_type=None, expected_name=""):
    """Resolve `variable` against `dat` and optionally check its type.

    Resolution order:

    1. If `variable` is already a `data.Variable`, it is used as is, after
       checking that it is the same object as `dat.variable` when `dat` has
       such an attribute (and it is not None).
    2. Otherwise, if `dat` has a `domain`, the result is
       `dat.domain[variable]`.
    3. Otherwise, if `dat` has a `variable` attribute, that attribute is
       returned and the `variable` argument is ignored.
    4. Otherwise resolution fails.

    :param dat: object the variable is looked up in
    :param variable: a `data.Variable` or a key accepted by `dat.domain`
    :param expected_type: if given, the resolved variable must be an
        instance of this type
    :param expected_name: human-readable type name used in the error message
    :return: the resolved variable
    :raises ValueError: if the variable does not match `dat.variable`, cannot
        be resolved, or is not an instance of `expected_type`
    """
    resolved = True
    if isinstance(variable, data.Variable):
        datvar = getattr(dat, "variable", None)
        if datvar is not None and datvar is not variable:
            # The two fragments used to be joined without a space.
            raise ValueError("variable does not match the variable "
                             "in the data")
    elif hasattr(dat, "domain"):
        variable = dat.domain[variable]
    elif hasattr(dat, "variable"):
        variable = dat.variable
    else:
        resolved = False

    type_ok = expected_type is None or isinstance(variable, expected_type)
    if resolved and type_ok:
        return variable

    if isinstance(variable, data.Variable):
        raise ValueError(
            "expected %s variable not %s" % (expected_name, variable))
    # Resolution can fail with no expected type given; do not dereference
    # None while building the message.
    expected = (expected_type.__name__ if expected_type is not None
                else "a variable")
    raise ValueError("expected %s, not '%s'" %
                     (expected, type(variable).__name__))
31
32
33
class Discrete(np.ndarray):
    """Distribution of a discrete variable, stored as a 1-d array.

    Element ``i`` holds the (possibly weighted) count for value index ``i``.
    Two extra attributes travel with the array:

    - ``variable``: the variable the distribution belongs to, or None;
    - ``unknowns``: the (possibly weighted) count of missing values.
    """

    def __new__(cls, dat, variable=None, unknowns=None):
        """Build a distribution from a data storage, a sequence, or None.

        :param dat: a `data.Storage` (delegates to :meth:`from_data`), a
            sequence of counts, or None for an all-zero distribution (which
            requires `variable` to determine the length)
        :param variable: variable descriptor or a key resolved by
            `_get_variable`; its `values` define the array length
        :param unknowns: count of missing values; must be None when `dat`
            is a data storage
        :raises TypeError: if `dat` is a storage and `unknowns` is given
        """
        if isinstance(dat, data.Storage):
            if unknowns is not None:
                raise TypeError(
                    "incompatible arguments (data storage and 'unknowns'")
            return cls.from_data(dat, variable)

        if variable is not None:
            variable = _get_variable(dat, variable)
            n = len(variable.values)
        else:
            n = len(dat)

        self = super().__new__(cls, n)
        self.variable = variable
        if dat is None:
            self[:] = 0
            self.unknowns = unknowns or 0
        else:
            self[:] = dat
            self.unknowns = (unknowns if unknowns is not None
                             else getattr(dat, "unknowns", 0))
        return self

    def __array_finalize__(self, obj):
        # Views, slices and ufunc results are created without going through
        # __new__. Give them the extra attributes (copied from the source
        # array when it has them) so that the in-place operators below do
        # not fail with AttributeError on such arrays.
        self.unknowns = getattr(obj, "unknowns", 0)
        self.variable = getattr(obj, "variable", None)

    @classmethod
    def from_data(cls, data, variable):
        """Compute the distribution of `variable` over the storage `data`.

        The storage is first asked via `_compute_distributions`; if it raises
        NotImplementedError, the counts are accumulated by iterating over
        the data (using `data.W` as weights when `data.has_weights()`).

        Note: the `data` parameter shadows the module-level `data` import
        inside this method; the name is kept because it is part of the
        signature.
        """
        variable = _get_variable(data, variable)
        try:
            # Relies on the storage's protected `_compute_distributions` hook.
            dist, unknowns = data._compute_distributions([variable])[0]
            self = super().__new__(cls, len(dist))
            self[:] = dist
            self.unknowns = unknowns
        except NotImplementedError:
            self = super().__new__(cls, len(variable.values))
            self[:] = np.zeros(len(variable.values))
            self.unknowns = 0
            if data.has_weights():
                for val, w in zip(data[:, variable], data.W):
                    if not math.isnan(val):
                        self[val] += w
                    else:
                        self.unknowns += w
            else:
                for inst in data:
                    val = inst[variable]
                    if val == val:  # false only for NaN, i.e. missing
                        self[val] += 1
                    else:
                        self.unknowns += 1
        self.variable = variable
        return self

    def __eq__(self, other):
        """Return True if the counts match and, when `other` has an
        `unknowns` attribute, the unknowns match too."""
        return np.array_equal(self, other) and (
            not hasattr(other, "unknowns") or self.unknowns == other.unknowns)

    def __ne__(self, other):
        return not self == other

    def __getitem__(self, index):
        # A string index is translated through the variable's `to_val`.
        if isinstance(index, str):
            index = self.variable.to_val(index)
        return super().__getitem__(index)

    def __setitem__(self, index, value):
        # A string index is translated through the variable's `to_val`.
        if isinstance(index, str):
            index = self.variable.to_val(index)
        super().__setitem__(index, value)

    def __hash__(self):
        # Adler-32 checksum of the array buffer combined with the hash of
        # the unknowns count.
        return zlib.adler32(self) ^ hash(self.unknowns)

    def __add__(self, other):
        s = super().__add__(other)
        s.unknowns = self.unknowns + getattr(other, "unknowns", 0)
        return s

    def __iadd__(self, other):
        super().__iadd__(other)
        self.unknowns += getattr(other, "unknowns", 0)
        return self

    def __sub__(self, other):
        s = super().__sub__(other)
        s.unknowns = self.unknowns - getattr(other, "unknowns", 0)
        return s

    def __isub__(self, other):
        super().__isub__(other)
        self.unknowns -= getattr(other, "unknowns", 0)
        return self

    def __mul__(self, other):
        """Multiply the counts; a real factor also scales `unknowns`."""
        s = super().__mul__(other)
        if isinstance(other, Real):
            # Was `self.unknowns / other`, which scaled the unknowns in the
            # opposite direction to the counts.
            s.unknowns = self.unknowns * other
        return s

    def __imul__(self, other):
        super().__imul__(other)
        if isinstance(other, Real):
            self.unknowns *= other
        return self

    def __truediv__(self, other):
        """Divide the counts; a real divisor also scales `unknowns`."""
        s = super().__truediv__(other)
        if isinstance(other, Real):
            s.unknowns = self.unknowns / other
        return s

    def __itruediv__(self, other):
        super().__itruediv__(other)
        if isinstance(other, Real):
            self.unknowns /= other
        return self

    # Python 3 never dispatches `/` to __div__/__idiv__ (and the originals
    # multiplied the counts instead of dividing them). The names are kept as
    # aliases for code that calls them explicitly.
    __div__ = __truediv__
    __idiv__ = __itruediv__

    def normalize(self):
        """Scale the counts in place so that they sum to 1.

        `unknowns` is divided by the same total. If the total is not above
        1e-6, every element is set to ``1 / len`` instead (when the array is
        not empty) and `unknowns` is left untouched.
        """
        t = np.sum(self)
        if t > 1e-6:
            # Divide the buffer directly; `unknowns` is handled explicitly.
            super().__itruediv__(t)
            self.unknowns /= t
        elif self.shape[0]:
            self[:] = 1 / self.shape[0]

    def modus(self):
        """Return the index with the largest count, wrapped in a
        `data.Value` when the distribution has a variable."""
        val = np.argmax(self)
        return data.Value(self.variable,
                          val) if self.variable is not None else val

    def random(self):
        """Draw an index with probability proportional to its count.

        The result is wrapped in a `data.Value` when the distribution has a
        variable.
        """
        v = random.random() * np.sum(self)
        s = i = 0
        for i, e in enumerate(self):
            s += e
            if s > v:
                break
        return data.Value(self.variable, i) if self.variable is not None else i
186
187
188
class Continuous(np.ndarray):
    """Distribution of a continuous variable, stored as a 2 x N array.

    Row 0 holds the values and row 1 the corresponding weights (counts).
    Two extra attributes travel with the array:

    - ``variable``: the variable the distribution belongs to, or None;
    - ``unknowns``: the count of missing values.
    """

    def __new__(cls, dat, variable=None, unknowns=None):
        """Build a distribution from a data storage, an int, or array data.

        :param dat: a `data.Storage` (delegates to :meth:`from_data`), an
            int N (creates a zero-filled 2 x N distribution), or anything
            `np.asarray` accepts, whose shape is used as is
        :param variable: variable the distribution belongs to
        :param unknowns: count of missing values; must be None when `dat`
            is a data storage
        :raises TypeError: if `dat` is a storage and `unknowns` is given
        """
        if isinstance(dat, data.Storage):
            if unknowns is not None:
                raise TypeError(
                    "incompatible arguments (data storage and 'unknowns'")
            return cls.from_data(variable, dat)
        if isinstance(dat, int):
            self = super().__new__(cls, (2, dat))
            self[:] = 0
            self.unknowns = unknowns or 0
        else:
            if not isinstance(dat, np.ndarray):
                dat = np.asarray(dat)
            self = super().__new__(cls, dat.shape)
            self[:] = dat
            self.unknowns = (unknowns if unknowns is not None
                             else getattr(dat, "unknowns", 0))
        self.variable = variable
        return self

    @classmethod
    def from_data(cls, variable, data):
        """Compute the distribution of `variable` over the storage `data`.

        Note the argument order (`variable` first), which differs from
        `Discrete.from_data`. The `data` parameter shadows the module-level
        `data` import inside this method; the name is kept because it is
        part of the signature.
        """
        variable = _get_variable(data, variable)
        try:
            # Relies on the storage's protected `_compute_distributions` hook.
            dist, unknowns = data._compute_distributions([variable])[0]
        except NotImplementedError:
            col = data[:, variable]
            dtype = col.dtype
            if data.has_weights():
                # NOTE(review): `dtype` is `col.dtype`, so this condition can
                # never hold; it presumably meant to look at the dtype of
                # the weights -- confirm before changing.
                if "float" not in dtype.name and "float" in col.dtype.name:
                    dtype = col.dtype.name
                dist = np.empty((2, len(col)), dtype=dtype)
                dist[0, :] = col
                dist[1, :] = data.W
            else:
                dist = np.ones((2, len(col)), dtype=dtype)
                dist[0, :] = col
            # NOTE(review): sort(axis=0) sorts within each column (i.e.
            # between the value and weight rows), not by value -- confirm
            # this is what `valuecount` expects.
            dist.sort(axis=0)
            # NOTE(review): `_orange` is not imported anywhere in the
            # visible module, so this fallback raises NameError as written.
            dist = np.array(_orange.valuecount(dist))
            unknowns = len(col) - dist.shape[1]

        self = super().__new__(cls, dist.shape)
        self[:] = dist
        self.unknowns = unknowns
        self.variable = variable
        return self

    def __eq__(self, other):
        """Return True if the arrays match and, when `other` has an
        `unknowns` attribute, the unknowns match too."""
        return np.array_equal(self, other) and (
            not hasattr(other, "unknowns") or self.unknowns == other.unknowns)

    def __hash__(self):
        # Adler-32 checksum of the array buffer combined with the hash of
        # the unknowns count.
        return zlib.adler32(self) ^ hash(self.unknowns)

    def normalize(self):
        """Scale the weights (row 1) in place so that they sum to 1.

        `unknowns` is divided by the same total. If the total is not above
        1e-6, every weight is set to ``1 / N`` instead (when N is not zero)
        and `unknowns` is left untouched.
        """
        t = np.sum(self[1, :])
        if t > 1e-6:
            self[1, :] /= t
            self.unknowns /= t
        elif self.shape[1]:
            self[1, :] = 1 / self.shape[1]

    def modus(self):
        """Return the value with the largest weight."""
        val = np.argmax(self[1, :])
        return self[0, val]

    # TODO: implement __getitem__ that returns a normal array, not Continuous

    def min(self):
        """Return the first value (assumes row 0 is sorted ascending)."""
        return self[0, 0]

    def max(self):
        """Return the last value (assumes row 0 is sorted ascending)."""
        return self[0, -1]

    def random(self):
        """Draw a value with probability proportional to its weight.

        Returns None only for an empty distribution.
        """
        v = random.random() * np.sum(self[1, :])
        s = 0
        for x, prob in self.T:
            s += prob
            if s > v:
                return x
        # The running sum can fail to exceed `v` (all-zero weights, or
        # rounding in the accumulation); previously this fell off the end and
        # returned None even for a non-empty distribution.
        return self[0, -1] if self.shape[1] else None

    def mean(self):
        """Return the weighted average of the values."""
        return np.average(self[0], weights=self[1])

    def variance(self):
        """Return the weighted (population) variance of the values."""
        # Work on plain arrays so the arithmetic is vectorized and does not
        # go through this class's overridden comparison operators.
        values = np.asarray(self[0], dtype=float)
        weights = np.asarray(self[1], dtype=float)
        avg = np.asarray(self.mean(), dtype=float)
        return np.average((values - avg) ** 2, weights=weights)

    def standard_deviation(self):
        """Return the square root of :meth:`variance`."""
        return math.sqrt(self.variance())
279
280
281
282
def class_distribution(data):
    """Return the distribution(s) of the class variable(s) of `data`.

    If the domain has a single class variable (`domain.class_var`), its
    distribution is returned. Otherwise, if the domain has `class_vars`, a
    list with one distribution per class variable is returned.

    Note: the `data` parameter shadows the module-level `data` import inside
    this function; the name is kept because it is part of the signature.

    :raises ValueError: if the domain has no class variable at all
    """
    domain = data.domain
    if domain.class_var:
        return get_distribution(data, domain.class_var)
    if domain.class_vars:
        # The arguments used to be passed as (cls, data); get_distribution
        # takes the data first and the variable second.
        return [get_distribution(data, cls) for cls in domain.class_vars]
    raise ValueError("domain has no class attribute")
289
290
291
def get_distribution(dat, variable, unknowns=None):
    """Build the distribution of `variable` from `dat`.

    The variable is resolved with `_get_variable`; a discrete variable
    yields a `Discrete` distribution and a continuous one a `Continuous`
    distribution. `dat`, the resolved variable and `unknowns` are passed to
    the chosen class unchanged.

    :raises TypeError: if the variable is neither discrete nor continuous
    """
    variable = _get_variable(dat, variable)
    if variable.is_discrete:
        factory = Discrete
    elif variable.is_continuous:
        factory = Continuous
    else:
        raise TypeError("cannot compute distribution of '%s'" %
                        type(variable).__name__)
    return factory(dat, variable, unknowns)
300
301
302
def get_distributions(dat, skipDiscrete=False, skipContinuous=False):
    """Return the distributions of the variables in `dat.domain.variables`.

    :param dat: data whose `domain.variables` are inspected
    :param skipDiscrete: leave out discrete variables
    :param skipContinuous: leave out continuous variables
    :return: list of distributions, one per selected variable (empty if
        both kinds are skipped)
    """
    # Named `variables` rather than `vars` to avoid shadowing the builtin.
    variables = dat.domain.variables
    if skipDiscrete and skipContinuous:
        return []
    if skipDiscrete:
        columns = [i for i, var in enumerate(variables) if var.is_continuous]
    elif skipContinuous:
        columns = [i for i, var in enumerate(variables) if var.is_discrete]
    else:
        columns = None  # None asks the storage for every column

    selected = np.arange(len(variables)) if columns is None else columns
    try:
        # Relies on the storage's protected `_compute_distributions` hook.
        # Only this call is guarded, so an error raised while building the
        # result objects is not mistaken for "not implemented".
        dist_unks = dat._compute_distributions(columns)
    except NotImplementedError:
        return [get_distribution(dat, i) for i in selected]
    return [get_distribution(dist, variables[col], unks)
            for col, (dist, unks) in zip(selected, dist_unks)]
324
325
326
def get_distributions_for_columns(data, columns):
    """
    Compute the distributions for columns.

    :param Orange.data.Table data:
    :param list columns:
        List of column indices into the `data.domain` (indices can be
        :class:`int` or instances of `Orange.data.Variable`)

    Note: the `data` parameter shadows the module-level `data` import inside
    this function; the name is kept because it is part of the signature.
    """
    domain = data.domain
    # Normalize the columns to int indices.
    indices = []
    for col in columns:
        indices.append(col if isinstance(col, int) else domain.index(col))

    try:
        # Try the optimized code path (query the table|storage directly);
        # relies on the storage's protected `_compute_distributions` hook.
        dist_unks = data._compute_distributions(indices)
    except NotImplementedError:
        # Use default slow(er) implementation.
        return [get_distribution(data, idx) for idx in indices]

    # dist_unks is a list of (values, unknowns) pairs.
    result = []
    for idx, (dist, unknown) in zip(indices, dist_unks):
        result.append(get_distribution(dist, domain[idx], unknown))
    return result
350