GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to re-new the access token on this website.
Completed
Push — master ( a41705...c23734 )
by Fritz
10s
created

density_goodness_of_fit()   B

Complexity

Conditions 5

Size

Total Lines 24

Duplication

Lines 0
Ratio 0 %

Importance

Changes 2
Bugs 0 Features 0
Metric Value
cc 5
c 2
b 0
f 0
dl 0
loc 24
rs 8.1671
1
# Copyright (c) 2014, Salesforce.com, Inc.  All rights reserved.
2
# Copyright (c) 2015, Gamelan Labs, Inc.
3
# Copyright (c) 2016, Google, Inc.
4
#
5
# Redistribution and use in source and binary forms, with or without
6
# modification, are permitted provided that the following conditions
7
# are met:
8
#
9
# - Redistributions of source code must retain the above copyright
10
#   notice, this list of conditions and the following disclaimer.
11
# - Redistributions in binary form must reproduce the above copyright
12
#   notice, this list of conditions and the following disclaimer in the
13
#   documentation and/or other materials provided with the distribution.
14
# - Neither the name of Salesforce.com nor the names of its contributors
15
#   may be used to endorse or promote products derived from this
16
#   software without specific prior written permission.
17
#
18
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
22
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
25
# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
26
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
27
# TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
28
# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
from __future__ import division
30
31
import random
32
import sys
33
import numpy
34
import numpy.random
35
from numpy import pi
36
import scipy.stats
37
from scipy.special import gamma
38
try:
39
    from itertools import izip as zip
40
except ImportError:
41
    pass
42
from collections import defaultdict
43
44
NoneType = type(None)
45
46
#: Data types for integral random variables.
47
#:
48
#: For Python 2.7, this tuple also includes `long`.
49
INTEGRAL_TYPES = (int, )
50
51
#: Data types for continuous random variables.
52
CONTINUOUS_TYPES = (float, numpy.float32, numpy.float64)
53
54
#: Data types for discrete random variables.
55
#:
56
#: For Python 2.7, this tuple also includes `long` and `basestring`.
57
DISCRETE_TYPES = (NoneType, bool, int, str, numpy.int32, numpy.int64)
58
59
if sys.version_info < (3, ):
60
    # `str` is a subclass of `basestring`, so this is doing a little
61
    # more work than is necessary, but it should not cause a problem.
62
    DISCRETE_TYPES += (long, basestring)  # noqa
63
    INTEGRAL_TYPES += (long, )  # noqa
64
65
66
def seed_all(seed):
67
    random.seed(seed)
68
    numpy.random.seed(seed)
69
70
71
def get_dim(thing):
72
    if hasattr(thing, '__len__'):
73
        return len(thing)
74
    else:
75
        return 1
76
77
78
def print_histogram(probs, counts):
79
    WIDTH = 60.0
80
    max_count = max(counts)
81
    print('{: >8} {: >8}'.format('Prob', 'Count'))
82
    for prob, count in sorted(zip(probs, counts), reverse=True):
83
        width = int(round(WIDTH * count / max_count))
84
        print('{: >8.3f} {: >8d} {}'.format(prob, count, '-' * width))
85
86
87
def multinomial_goodness_of_fit(
88
        probs,
89
        counts,
90
        total_count,
91
        truncated=False,
92
        plot=False):
93
    """
94
    Pearson's chi^2 test, on possibly truncated data.
95
    http://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
96
97
    Returns:
98
        p-value of truncated multinomial sample.
99
    """
100
    assert len(probs) == len(counts)
101
    assert truncated or total_count == sum(counts)
102
    chi_squared = 0
103
    dof = 0
104
    if plot:
105
        print_histogram(probs, counts)
106
    for p, c in zip(probs, counts):
107
        if p == 1:
108
            return 1 if c == total_count else 0
109
        assert p < 1, 'bad probability: %g' % p
110
        if p > 0:
111
            mean = total_count * p
112
            variance = total_count * p * (1 - p)
113
            assert variance > 1,\
114
                'WARNING goodness of fit is inaccurate; use more samples'
115
            chi_squared += (c - mean) ** 2 / variance
116
            dof += 1
117
        else:
118
            print('WARNING zero probability in goodness-of-fit test')
119
            if c > 0:
120
                return float('inf')
121
122
    if not truncated:
123
        dof -= 1
124
125
    survival = scipy.stats.chi2.sf(chi_squared, dof)
126
    return survival
127
128
129
def unif01_goodness_of_fit(samples, plot=False):
130
    """
131
    Bin uniformly distributed samples and apply Pearson's chi^2 test.
132
    """
133
    samples = numpy.array(samples, dtype=float)
134
    assert samples.min() >= 0.0
135
    assert samples.max() <= 1.0
136
    bin_count = int(round(len(samples) ** 0.333))
137
    assert bin_count >= 7, 'WARNING imprecise test, use more samples'
138
    probs = numpy.ones(bin_count, dtype=numpy.float) / bin_count
139
    counts = numpy.zeros(bin_count, dtype=numpy.int)
140
    for sample in samples:
141
        counts[min(bin_count - 1, int(bin_count * sample))] += 1
142
    return multinomial_goodness_of_fit(probs, counts, len(samples), plot=plot)
143
144
145
def exp_goodness_of_fit(
146
        samples,
147
        plot=False,
148
        normalized=True,
149
        return_dict=False):
150
    """
151
    Transform exponentially distribued samples to unif01 distribution
152
    and assess goodness of fit via binned Pearson's chi^2 test.
153
154
    Inputs:
155
        samples - a list of real-valued samples from a candidate distribution
156
    """
157
    result = {}
158
    if not normalized:
159
        result['norm'] = numpy.mean(samples)
160
        samples /= result['norm']
161
    unif01_samples = numpy.exp(-samples)
162
    result['gof'] = unif01_goodness_of_fit(unif01_samples, plot=plot)
163
    return result if return_dict else result['gof']
164
165
166
def density_goodness_of_fit(
167
        samples,
168
        probs,
169
        plot=False,
170
        normalized=True,
171
        return_dict=False):
172
    """
173
    Transform arbitrary continuous samples to unif01 distribution
174
    and assess goodness of fit via binned Pearson's chi^2 test.
175
176
    Inputs:
177
        samples - a list of real-valued samples from a distribution
178
        probs - a list of probability densities evaluated at those samples
179
    """
180
    assert len(samples) == len(probs)
181
    assert len(samples) > 100, 'WARNING imprecision; use more samples'
182
    pairs = list(zip(samples, probs))
183
    pairs.sort()
184
    samples = numpy.array([x for x, p in pairs])
185
    probs = numpy.array([p for x, p in pairs])
186
    density = len(samples) * numpy.sqrt(probs[1:] * probs[:-1])
187
    gaps = samples[1:] - samples[:-1]
188
    exp_samples = density * gaps
189
    return exp_goodness_of_fit(exp_samples, plot, normalized, return_dict)
190
191
192
def volume_of_sphere(dim, radius):
193
    assert isinstance(dim, INTEGRAL_TYPES)
194
    return radius ** dim * pi ** (0.5 * dim) / gamma(0.5 * dim + 1)
195
196
197
def get_nearest_neighbor_distances(samples):
198
    from sklearn.neighbors import NearestNeighbors
199
    if not hasattr(samples[0], '__iter__'):
200
        samples = numpy.array([samples]).T
201
    neighbors = NearestNeighbors(n_neighbors=2).fit(samples)
202
    distances, indices = neighbors.kneighbors(samples)
203
    return distances[:, 1]
204
205
206
def vector_density_goodness_of_fit(
207
        samples,
208
        probs,
209
        plot=False,
210
        normalized=True,
211
        return_dict=False):
212
    """
213
    Transform arbitrary multivariate continuous samples
214
    to unif01 distribution via nearest neighbor distribution [1,2,3]
215
    and assess goodness of fit via binned Pearson's chi^2 test.
216
217
    [1] http://projecteuclid.org/download/pdf_1/euclid.aop/1176993668
218
    [2] http://arxiv.org/pdf/1006.3019v2.pdf
219
    [3] http://en.wikipedia.org/wiki/Nearest_neighbour_distribution
220
221
    Inputs:
222
        samples - a list of real-vector-valued samples from a distribution
223
        probs - a list of probability densities evaluated at those samples
224
    """
225
    assert len(samples)
226
    assert len(samples) == len(probs)
227
    dim = get_dim(samples[0])
228
    assert dim
229
    assert len(samples) > 1000 * dim, 'WARNING imprecision; use more samples'
230
    radii = get_nearest_neighbor_distances(samples)
231
    density = len(samples) * numpy.array(probs)
232
    volume = volume_of_sphere(dim, radii)
233
    exp_samples = density * volume
234
    return exp_goodness_of_fit(exp_samples, plot, normalized, return_dict)
235
236
237
def trivial_density_goodness_of_fit(
238
        samples,
239
        probs,
240
        plot=False,
241
        normalized=True,
242
        return_dict=False):
243
    assert len(samples)
244
    assert all(sample == samples[0] for sample in samples)
245
    result = {'gof': 1.0}
246
    if not normalized:
247
        result['norm'] = probs[0]
248
    if return_dict:
249
        return result
250
    else:
251
        return result['gof']
252
253
254
def auto_density_goodness_of_fit(
255
        samples,
256
        probs,
257
        plot=False,
258
        normalized=True,
259
        return_dict=False):
260
    """
261
    Dispatch on sample dimention and delegate to one of:
262
    - density_goodness_of_fit
263
    - vector_density_goodness_of_fit
264
    - trivial_density_goodness_of_fit
265
    """
266
    assert len(samples)
267
    dim = get_dim(samples[0])
268
    if dim == 0:
269
        fun = trivial_density_goodness_of_fit
270
    elif dim == 1:
271
        fun = density_goodness_of_fit
272
        if hasattr(samples[0], '__len__'):
273
            samples = [sample[0] for sample in samples]
274
    else:
275
        fun = vector_density_goodness_of_fit
276
    return fun(samples, probs, plot, normalized, return_dict)
277
278
279
def discrete_goodness_of_fit(
280
        samples,
281
        probs_dict,
282
        truncate_beyond=8,
283
        plot=False,
284
        normalized=True):
285
    """
286
    Transform arbitrary discrete data to multinomial
287
    and assess goodness of fit via Pearson's chi^2 test.
288
    """
289
    if not normalized:
290
        norm = sum(probs_dict.values())
291
        probs_dict = {i: p / norm for i, p in probs_dict.items()}
292
    counts = defaultdict(lambda: 0)
293
    for sample in samples:
294
        assert sample in probs_dict
295
        counts[sample] += 1
296
    items = [(prob, counts.get(i, 0)) for i, prob in probs_dict.items()]
297
    items.sort(reverse=True)
298
    truncated = (truncate_beyond and truncate_beyond < len(items))
299
    if truncated:
300
        items = items[:truncate_beyond]
301
    probs = [prob for prob, count in items]
302
    counts = [count for prob, count in items]
303
    assert sum(counts) > 100, 'WARNING imprecision; use more samples'
304
    return multinomial_goodness_of_fit(
305
        probs,
306
        counts,
307
        len(samples),
308
        truncated=truncated,
309
        plot=plot)
310
311
312
def split_discrete_continuous(data):
313
    """
314
    Convert arbitrary data to a pair `(discrete, continuous)`
315
    where `discrete` is hashable and `continuous` is a list of floats.
316
    """
317
    if isinstance(data, DISCRETE_TYPES):
318
        return data, []
319
    elif isinstance(data, CONTINUOUS_TYPES):
320
        return None, [data]
321
    elif isinstance(data, (tuple, list)):
322
        discrete = []
323
        continuous = []
324
        for part in data:
325
            d, c = split_discrete_continuous(part)
326
            discrete.append(d)
327
            continuous += c
328
        return tuple(discrete), continuous
329
    elif isinstance(data, numpy.ndarray):
330
        assert data.dtype in [numpy.float64, numpy.float32]
331
        return (None,) * len(data), list(map(float, data))
332
    else:
333
        raise TypeError(
334
            'split_discrete_continuous does not accept {} of type {}'.format(
335
                repr(data), str(type(data))))
336
337
338
def mixed_density_goodness_of_fit(samples, probs, plot=False, normalized=True):
339
    """
340
    Test general mixed discrete+continuous datatypes by
341
    (1) testing the continuous part conditioned on each discrete value
342
    (2) testing the discrete part marginalizing over the continuous part
343
    (3) testing the estimated total probability (if normalized = True)
344
345
    Inputs:
346
        samples - a list of plain-old-data samples from a distribution
347
        probs - a list of probability densities evaluated at those samples
348
    """
349
    assert len(samples)
350
    discrete_samples = []
351
    strata = defaultdict(lambda: ([], []))
352
    for sample, prob in zip(samples, probs):
353
        d, c = split_discrete_continuous(sample)
354
        discrete_samples.append(d)
355
        samples, probs = strata[d]
356
        samples.append(c)
357
        probs.append(prob)
358
359
    # Continuous part
360
    gofs = []
361
    discrete_probs = {}
362
    for key, (samples, probs) in strata.items():
363
        result = auto_density_goodness_of_fit(
364
            samples,
365
            probs,
366
            plot=plot,
367
            normalized=False,
368
            return_dict=True)
369
        gofs.append(result['gof'])
370
        discrete_probs[key] = result['norm']
371
372
    # Discrete part
373
    if len(strata) > 1:
374
        gofs.append(discrete_goodness_of_fit(
375
            discrete_samples,
376
            discrete_probs,
377
            plot=plot,
378
            normalized=False))
379
380
    # Normalization
381
    if normalized:
382
        norm = sum(discrete_probs.values())
383
        discrete_counts = [len(samples) for samples, _ in strata.values()]
384
        norm_variance = sum(1.0 / count for count in discrete_counts)
385
        dof = len(discrete_counts)
386
        chi_squared = (1 - norm) ** 2 / norm_variance
387
        gofs.append(scipy.stats.chi2.sf(chi_squared, dof))
388
        if plot:
389
            print('norm = {:.4g} +- {:.4g}'.format(norm, norm_variance ** 0.5))
390
            print('     = {}'.format(
391
                ' + '.join(map('{:.4g}'.format, discrete_probs.values()))))
392
393
    return min(gofs)
394