GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to re-new the access token on this website.

discrete_goodness_of_fit()   F
last analyzed

Complexity

Conditions 11

Size

Total Lines 31

Duplication

Lines 0
Ratio 0 %

Importance

Changes 2
Bugs 0 Features 0
Metric Value
cc 11
c 2
b 0
f 0
dl 0
loc 31
rs 3.1764

How to fix   Complexity   

Complexity

Complex classes like discrete_goodness_of_fit() often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to find such a component is to look for fields/methods that share the same prefixes, or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.

1
# Copyright (c) 2014, Salesforce.com, Inc.  All rights reserved.
2
# Copyright (c) 2015, Gamelan Labs, Inc.
3
# Copyright (c) 2016, Google, Inc.
4
#
5
# Redistribution and use in source and binary forms, with or without
6
# modification, are permitted provided that the following conditions
7
# are met:
8
#
9
# - Redistributions of source code must retain the above copyright
10
#   notice, this list of conditions and the following disclaimer.
11
# - Redistributions in binary form must reproduce the above copyright
12
#   notice, this list of conditions and the following disclaimer in the
13
#   documentation and/or other materials provided with the distribution.
14
# - Neither the name of Salesforce.com nor the names of its contributors
15
#   may be used to endorse or promote products derived from this
16
#   software without specific prior written permission.
17
#
18
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
22
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
25
# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
26
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
27
# TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
28
# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30
from __future__ import division
31
from collections import defaultdict
32
from math import gamma
33
try:
34
    from itertools import izip as zip
35
except ImportError:
36
    pass
37
import random
38
import sys
39
40
import numpy
41
import numpy.random
42
from numpy import pi
43
from scipy.spatial import cKDTree
44
45
from .utils import chi2sf
46
47
NoneType = type(None)
48
49
#: Data types for integral random variables.
50
#:
51
#: For Python 2.7, this tuple also includes `long`.
52
INTEGRAL_TYPES = (int, )
53
54
#: Data types for continuous random variables.
55
CONTINUOUS_TYPES = (float, numpy.float32, numpy.float64)
56
57
#: Data types for discrete random variables.
58
#:
59
#: For Python 2.7, this tuple also includes `long` and `basestring`.
60
DISCRETE_TYPES = (NoneType, bool, int, str, numpy.int32, numpy.int64)
61
62
if sys.version_info < (3, ):
63
    # `str` is a subclass of `basestring`, so this is doing a little
64
    # more work than is necessary, but it should not cause a problem.
65
    DISCRETE_TYPES += (long, basestring)  # noqa
66
    INTEGRAL_TYPES += (long, )  # noqa
67
68
69
def seed_all(seed):
70
    random.seed(seed)
71
    numpy.random.seed(seed)
72
73
74
def get_dim(thing):
75
    if hasattr(thing, '__len__'):
76
        return len(thing)
77
    else:
78
        return 1
79
80
81
def print_histogram(probs, counts):
82
    WIDTH = 60.0
83
    max_count = max(counts)
84
    print('{: >8} {: >8}'.format('Prob', 'Count'))
85
    for prob, count in sorted(zip(probs, counts), reverse=True):
86
        width = int(round(WIDTH * count / max_count))
87
        print('{: >8.3f} {: >8d} {}'.format(prob, count, '-' * width))
88
89
90
def multinomial_goodness_of_fit(
91
        probs,
92
        counts,
93
        total_count,
94
        truncated=False,
95
        plot=False):
96
    """
97
    Pearson's chi^2 test, on possibly truncated data.
98
    http://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
99
100
    Returns:
101
        p-value of truncated multinomial sample.
102
    """
103
    assert len(probs) == len(counts)
104
    assert truncated or total_count == sum(counts)
105
    chi_squared = 0
106
    dof = 0
107
    if plot:
108
        print_histogram(probs, counts)
109
    for p, c in zip(probs, counts):
110
        if p == 1:
111
            return 1 if c == total_count else 0
112
        assert p < 1, 'bad probability: %g' % p
113
        if p > 0:
114
            mean = total_count * p
115
            variance = total_count * p * (1 - p)
116
            assert variance > 1,\
117
                'WARNING goodness of fit is inaccurate; use more samples'
118
            chi_squared += (c - mean) ** 2 / variance
119
            dof += 1
120
        else:
121
            print('WARNING zero probability in goodness-of-fit test')
122
            if c > 0:
123
                return float('inf')
124
125
    if not truncated:
126
        dof -= 1
127
128
    survival = chi2sf(chi_squared, dof)
129
    return survival
130
131
132
def unif01_goodness_of_fit(samples, plot=False):
133
    """
134
    Bin uniformly distributed samples and apply Pearson's chi^2 test.
135
    """
136
    samples = numpy.array(samples, dtype=float)
137
    assert samples.min() >= 0.0
138
    assert samples.max() <= 1.0
139
    bin_count = int(round(len(samples) ** 0.333))
140
    assert bin_count >= 7, 'WARNING imprecise test, use more samples'
141
    probs = numpy.ones(bin_count, dtype=numpy.float) / bin_count
142
    counts = numpy.zeros(bin_count, dtype=numpy.int)
143
    for sample in samples:
144
        counts[min(bin_count - 1, int(bin_count * sample))] += 1
145
    return multinomial_goodness_of_fit(probs, counts, len(samples), plot=plot)
146
147
148
def exp_goodness_of_fit(
149
        samples,
150
        plot=False,
151
        normalized=True,
152
        return_dict=False):
153
    """
154
    Transform exponentially distribued samples to unif01 distribution
155
    and assess goodness of fit via binned Pearson's chi^2 test.
156
157
    Inputs:
158
        samples - a list of real-valued samples from a candidate distribution
159
    """
160
    result = {}
161
    if not normalized:
162
        result['norm'] = numpy.mean(samples)
163
        samples /= result['norm']
164
    unif01_samples = numpy.exp(-samples)
165
    result['gof'] = unif01_goodness_of_fit(unif01_samples, plot=plot)
166
    return result if return_dict else result['gof']
167
168
169
def density_goodness_of_fit(
170
        samples,
171
        probs,
172
        plot=False,
173
        normalized=True,
174
        return_dict=False):
175
    """
176
    Transform arbitrary continuous samples to unif01 distribution
177
    and assess goodness of fit via binned Pearson's chi^2 test.
178
179
    Inputs:
180
        samples - a list of real-valued samples from a distribution
181
        probs - a list of probability densities evaluated at those samples
182
    """
183
    assert len(samples) == len(probs)
184
    assert len(samples) > 100, 'WARNING imprecision; use more samples'
185
    pairs = list(zip(samples, probs))
186
    pairs.sort()
187
    samples = numpy.array([x for x, p in pairs])
188
    probs = numpy.array([p for x, p in pairs])
189
    density = len(samples) * numpy.sqrt(probs[1:] * probs[:-1])
190
    gaps = samples[1:] - samples[:-1]
191
    exp_samples = density * gaps
192
    return exp_goodness_of_fit(exp_samples, plot, normalized, return_dict)
193
194
195
def volume_of_sphere(dim, radius):
196
    assert isinstance(dim, INTEGRAL_TYPES)
197
    return radius ** dim * pi ** (0.5 * dim) / gamma(0.5 * dim + 1)
198
199
200
def get_nearest_neighbor_distances(samples):
201
    if not hasattr(samples[0], '__iter__'):
202
        samples = numpy.array([samples]).T
203
    distances, indices = cKDTree(samples).query(samples, k=2)
204
    return distances[:, 1]
205
206
207
def vector_density_goodness_of_fit(
208
        samples,
209
        probs,
210
        plot=False,
211
        normalized=True,
212
        return_dict=False):
213
    """
214
    Transform arbitrary multivariate continuous samples
215
    to unif01 distribution via nearest neighbor distribution [1,2,3]
216
    and assess goodness of fit via binned Pearson's chi^2 test.
217
218
    [1] http://projecteuclid.org/download/pdf_1/euclid.aop/1176993668
219
    [2] http://arxiv.org/pdf/1006.3019v2.pdf
220
    [3] http://en.wikipedia.org/wiki/Nearest_neighbour_distribution
221
222
    Inputs:
223
        samples - a list of real-vector-valued samples from a distribution
224
        probs - a list of probability densities evaluated at those samples
225
    """
226
    assert len(samples)
227
    assert len(samples) == len(probs)
228
    dim = get_dim(samples[0])
229
    assert dim
230
    assert len(samples) > 1000 * dim, 'WARNING imprecision; use more samples'
231
    radii = get_nearest_neighbor_distances(samples)
232
    density = len(samples) * numpy.array(probs)
233
    volume = volume_of_sphere(dim, radii)
234
    exp_samples = density * volume
235
    return exp_goodness_of_fit(exp_samples, plot, normalized, return_dict)
236
237
238
def trivial_density_goodness_of_fit(
239
        samples,
240
        probs,
241
        plot=False,
242
        normalized=True,
243
        return_dict=False):
244
    assert len(samples)
245
    assert all(sample == samples[0] for sample in samples)
246
    result = {'gof': 1.0}
247
    if not normalized:
248
        result['norm'] = probs[0]
249
    if return_dict:
250
        return result
251
    else:
252
        return result['gof']
253
254
255
def auto_density_goodness_of_fit(
256
        samples,
257
        probs,
258
        plot=False,
259
        normalized=True,
260
        return_dict=False):
261
    """
262
    Dispatch on sample dimention and delegate to one of:
263
    - density_goodness_of_fit
264
    - vector_density_goodness_of_fit
265
    - trivial_density_goodness_of_fit
266
    """
267
    assert len(samples)
268
    dim = get_dim(samples[0])
269
    if dim == 0:
270
        fun = trivial_density_goodness_of_fit
271
    elif dim == 1:
272
        fun = density_goodness_of_fit
273
        if hasattr(samples[0], '__len__'):
274
            samples = [sample[0] for sample in samples]
275
    else:
276
        fun = vector_density_goodness_of_fit
277
    return fun(samples, probs, plot, normalized, return_dict)
278
279
280
def discrete_goodness_of_fit(
281
        samples,
282
        probs_dict,
283
        truncate_beyond=8,
284
        plot=False,
285
        normalized=True):
286
    """
287
    Transform arbitrary discrete data to multinomial
288
    and assess goodness of fit via Pearson's chi^2 test.
289
    """
290
    if not normalized:
291
        norm = sum(probs_dict.values())
292
        probs_dict = {i: p / norm for i, p in probs_dict.items()}
293
    counts = defaultdict(lambda: 0)
294
    for sample in samples:
295
        assert sample in probs_dict
296
        counts[sample] += 1
297
    items = [(prob, counts.get(i, 0)) for i, prob in probs_dict.items()]
298
    items.sort(reverse=True)
299
    truncated = (truncate_beyond and truncate_beyond < len(items))
300
    if truncated:
301
        items = items[:truncate_beyond]
302
    probs = [prob for prob, count in items]
303
    counts = [count for prob, count in items]
304
    assert sum(counts) > 100, 'WARNING imprecision; use more samples'
305
    return multinomial_goodness_of_fit(
306
        probs,
307
        counts,
308
        len(samples),
309
        truncated=truncated,
310
        plot=plot)
311
312
313
def split_discrete_continuous(data):
314
    """
315
    Convert arbitrary data to a pair `(discrete, continuous)`
316
    where `discrete` is hashable and `continuous` is a list of floats.
317
    """
318
    if isinstance(data, DISCRETE_TYPES):
319
        return data, []
320
    elif isinstance(data, CONTINUOUS_TYPES):
321
        return None, [data]
322
    elif isinstance(data, (tuple, list)):
323
        discrete = []
324
        continuous = []
325
        for part in data:
326
            d, c = split_discrete_continuous(part)
327
            discrete.append(d)
328
            continuous += c
329
        return tuple(discrete), continuous
330
    elif isinstance(data, numpy.ndarray):
331
        assert data.dtype in [numpy.float64, numpy.float32]
332
        return (None,) * len(data), list(map(float, data))
333
    else:
334
        raise TypeError(
335
            'split_discrete_continuous does not accept {} of type {}'.format(
336
                repr(data), str(type(data))))
337
338
339
def mixed_density_goodness_of_fit(samples, probs, plot=False, normalized=True):
340
    """
341
    Test general mixed discrete+continuous datatypes by
342
    (1) testing the continuous part conditioned on each discrete value
343
    (2) testing the discrete part marginalizing over the continuous part
344
    (3) testing the estimated total probability (if normalized = True)
345
346
    Inputs:
347
        samples - a list of plain-old-data samples from a distribution
348
        probs - a list of probability densities evaluated at those samples
349
    """
350
    assert len(samples)
351
    discrete_samples = []
352
    strata = defaultdict(lambda: ([], []))
353
    for sample, prob in zip(samples, probs):
354
        d, c = split_discrete_continuous(sample)
355
        discrete_samples.append(d)
356
        samples, probs = strata[d]
357
        samples.append(c)
358
        probs.append(prob)
359
360
    # Continuous part
361
    gofs = []
362
    discrete_probs = {}
363
    for key, (samples, probs) in strata.items():
364
        result = auto_density_goodness_of_fit(
365
            samples,
366
            probs,
367
            plot=plot,
368
            normalized=False,
369
            return_dict=True)
370
        gofs.append(result['gof'])
371
        discrete_probs[key] = result['norm']
372
373
    # Discrete part
374
    if len(strata) > 1:
375
        gofs.append(discrete_goodness_of_fit(
376
            discrete_samples,
377
            discrete_probs,
378
            plot=plot,
379
            normalized=False))
380
381
    # Normalization
382
    if normalized:
383
        norm = sum(discrete_probs.values())
384
        discrete_counts = [len(samples) for samples, _ in strata.values()]
385
        norm_variance = sum(1.0 / count for count in discrete_counts)
386
        dof = len(discrete_counts)
387
        chi_squared = (1 - norm) ** 2 / norm_variance
388
        gofs.append(chi2sf(chi_squared, dof))
389
        if plot:
390
            print('norm = {:.4g} +- {:.4g}'.format(norm, norm_variance ** 0.5))
391
            print('     = {}'.format(
392
                ' + '.join(map('{:.4g}'.format, discrete_probs.values()))))
393
394
    return min(gofs)
395