GitHub Access Token became invalid

It seems like the GitHub access token used for retrieving details about this repository from GitHub became invalid. This might prevent certain types of inspections from being run (in particular, everything related to pull requests).
Please ask an admin of your repository to re-new the access token on this website.
Completed
Pull Request — master (#14)
by
unknown
59s
created

get_nearest_neighbor_distances()   B

Complexity

Conditions 5

Size

Total Lines 14

Duplication

Lines 0
Ratio 0 %

Importance

Changes 1
Bugs 0 Features 0
Metric Value
cc 5
c 1
b 0
f 0
dl 0
loc 14
rs 8.5454
1
# Copyright (c) 2014, Salesforce.com, Inc.  All rights reserved.
2
# Copyright (c) 2015, Gamelan Labs, Inc.
3
# Copyright (c) 2016, Google, Inc.
4
#
5
# Redistribution and use in source and binary forms, with or without
6
# modification, are permitted provided that the following conditions
7
# are met:
8
#
9
# - Redistributions of source code must retain the above copyright
10
#   notice, this list of conditions and the following disclaimer.
11
# - Redistributions in binary form must reproduce the above copyright
12
#   notice, this list of conditions and the following disclaimer in the
13
#   documentation and/or other materials provided with the distribution.
14
# - Neither the name of Salesforce.com nor the names of its contributors
15
#   may be used to endorse or promote products derived from this
16
#   software without specific prior written permission.
17
#
18
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
22
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
25
# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
26
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
27
# TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
28
# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30
from __future__ import division
31
from collections import defaultdict
32
from math import gamma
33
try:
34
    from itertools import izip as zip
35
except ImportError:
36
    pass
37
import math
38
import random
39
import sys
40
41
import kdtree
42
import numpy
43
import numpy.random
44
from numpy import pi
45
46
from .utils import chi2sf
47
48
NoneType = type(None)
49
50
#: Data types for integral random variables.
51
#:
52
#: For Python 2.7, this tuple also includes `long`.
53
INTEGRAL_TYPES = (int, )
54
55
#: Data types for continuous random variables.
56
CONTINUOUS_TYPES = (float, numpy.float32, numpy.float64)
57
58
#: Data types for discrete random variables.
59
#:
60
#: For Python 2.7, this tuple also includes `long` and `basestring`.
61
DISCRETE_TYPES = (NoneType, bool, int, str, numpy.int32, numpy.int64)
62
63
if sys.version_info < (3, ):
64
    # `str` is a subclass of `basestring`, so this is doing a little
65
    # more work than is necessary, but it should not cause a problem.
66
    DISCRETE_TYPES += (long, basestring)  # noqa
67
    INTEGRAL_TYPES += (long, )  # noqa
68
69
70
def seed_all(seed):
71
    random.seed(seed)
72
    numpy.random.seed(seed)
73
74
75
def get_dim(thing):
76
    if hasattr(thing, '__len__'):
77
        return len(thing)
78
    else:
79
        return 1
80
81
82
def print_histogram(probs, counts):
83
    WIDTH = 60.0
84
    max_count = max(counts)
85
    print('{: >8} {: >8}'.format('Prob', 'Count'))
86
    for prob, count in sorted(zip(probs, counts), reverse=True):
87
        width = int(round(WIDTH * count / max_count))
88
        print('{: >8.3f} {: >8d} {}'.format(prob, count, '-' * width))
89
90
91
def multinomial_goodness_of_fit(
92
        probs,
93
        counts,
94
        total_count,
95
        truncated=False,
96
        plot=False):
97
    """
98
    Pearson's chi^2 test, on possibly truncated data.
99
    http://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
100
101
    Returns:
102
        p-value of truncated multinomial sample.
103
    """
104
    assert len(probs) == len(counts)
105
    assert truncated or total_count == sum(counts)
106
    chi_squared = 0
107
    dof = 0
108
    if plot:
109
        print_histogram(probs, counts)
110
    for p, c in zip(probs, counts):
111
        if p == 1:
112
            return 1 if c == total_count else 0
113
        assert p < 1, 'bad probability: %g' % p
114
        if p > 0:
115
            mean = total_count * p
116
            variance = total_count * p * (1 - p)
117
            assert variance > 1,\
118
                'WARNING goodness of fit is inaccurate; use more samples'
119
            chi_squared += (c - mean) ** 2 / variance
120
            dof += 1
121
        else:
122
            print('WARNING zero probability in goodness-of-fit test')
123
            if c > 0:
124
                return float('inf')
125
126
    if not truncated:
127
        dof -= 1
128
129
    survival = chi2sf(chi_squared, dof)
130
    return survival
131
132
133
def unif01_goodness_of_fit(samples, plot=False):
134
    """
135
    Bin uniformly distributed samples and apply Pearson's chi^2 test.
136
    """
137
    samples = numpy.array(samples, dtype=float)
138
    assert samples.min() >= 0.0
139
    assert samples.max() <= 1.0
140
    bin_count = int(round(len(samples) ** 0.333))
141
    assert bin_count >= 7, 'WARNING imprecise test, use more samples'
142
    probs = numpy.ones(bin_count, dtype=numpy.float) / bin_count
143
    counts = numpy.zeros(bin_count, dtype=numpy.int)
144
    for sample in samples:
145
        counts[min(bin_count - 1, int(bin_count * sample))] += 1
146
    return multinomial_goodness_of_fit(probs, counts, len(samples), plot=plot)
147
148
149
def exp_goodness_of_fit(
150
        samples,
151
        plot=False,
152
        normalized=True,
153
        return_dict=False):
154
    """
155
    Transform exponentially distribued samples to unif01 distribution
156
    and assess goodness of fit via binned Pearson's chi^2 test.
157
158
    Inputs:
159
        samples - a list of real-valued samples from a candidate distribution
160
    """
161
    result = {}
162
    if not normalized:
163
        result['norm'] = numpy.mean(samples)
164
        samples /= result['norm']
165
    unif01_samples = numpy.exp(-samples)
166
    result['gof'] = unif01_goodness_of_fit(unif01_samples, plot=plot)
167
    return result if return_dict else result['gof']
168
169
170
def density_goodness_of_fit(
171
        samples,
172
        probs,
173
        plot=False,
174
        normalized=True,
175
        return_dict=False):
176
    """
177
    Transform arbitrary continuous samples to unif01 distribution
178
    and assess goodness of fit via binned Pearson's chi^2 test.
179
180
    Inputs:
181
        samples - a list of real-valued samples from a distribution
182
        probs - a list of probability densities evaluated at those samples
183
    """
184
    assert len(samples) == len(probs)
185
    assert len(samples) > 100, 'WARNING imprecision; use more samples'
186
    pairs = list(zip(samples, probs))
187
    pairs.sort()
188
    samples = numpy.array([x for x, p in pairs])
189
    probs = numpy.array([p for x, p in pairs])
190
    density = len(samples) * numpy.sqrt(probs[1:] * probs[:-1])
191
    gaps = samples[1:] - samples[:-1]
192
    exp_samples = density * gaps
193
    return exp_goodness_of_fit(exp_samples, plot, normalized, return_dict)
194
195
196
def volume_of_sphere(dim, radius):
197
    """Compute the radius or radii of a hypersphere or hyperspheres.
198
199
    This method returns the volume of a hypersphere of dimension `dim`
200
    and the given `radius`. If `radius` is an iterable, it is assumed to
201
    be an iterable of radii, and this function returns a list of volumes
202
    of hyperspheres, each of whose radius is the corresponding element
203
    of `radius`.
204
205
    """
206
    assert isinstance(dim, INTEGRAL_TYPES)
207
    denom = gamma(0.5 * dim + 1)
208
    numer_scale = pi ** (0.5 * dim)
209
    if hasattr(radius, '__iter__'):
210
        return [(r ** dim) * numer_scale / denom for r in radius]
211
    else:
212
        return (radius ** dim) * numer_scale / denom
213
214
215
def get_nearest_neighbor_distances(samples):
216
    if not hasattr(samples[0], '__iter__'):
217
        samples = [(x, ) for x in samples]
218
    if isinstance(samples, numpy.ndarray):
219
        samples = samples.tolist()
220
    tree = kdtree.create(samples)
221
    # `tree.search_knn(point, k=2)` returns a list of the two nearest
222
    # neighbors. The first neighbor is always the point itself, so we
223
    # get the second neighbor (that's the first subscript [1]). It
224
    # actually returns a pair comprising the node in the kdtree and the
225
    # *squared* distance, so we just extract the squared distance
226
    # (that's the second subscript [1]). Finally, we un-square the
227
    # squared distance.
228
    return [math.sqrt(tree.search_knn(point, k=2)[1][1]) for point in samples]
229
230
231
def vector_density_goodness_of_fit(
232
        samples,
233
        probs,
234
        plot=False,
235
        normalized=True,
236
        return_dict=False):
237
    """
238
    Transform arbitrary multivariate continuous samples
239
    to unif01 distribution via nearest neighbor distribution [1,2,3]
240
    and assess goodness of fit via binned Pearson's chi^2 test.
241
242
    [1] http://projecteuclid.org/download/pdf_1/euclid.aop/1176993668
243
    [2] http://arxiv.org/pdf/1006.3019v2.pdf
244
    [3] http://en.wikipedia.org/wiki/Nearest_neighbour_distribution
245
246
    Inputs:
247
        samples - a list of real-vector-valued samples from a distribution
248
        probs - a list of probability densities evaluated at those samples
249
    """
250
    assert len(samples)
251
    assert len(samples) == len(probs)
252
    dim = get_dim(samples[0])
253
    assert dim
254
    assert len(samples) > 1000 * dim, 'WARNING imprecision; use more samples'
255
    radii = get_nearest_neighbor_distances(samples)
256
    density = len(samples) * numpy.array(probs)
257
    volume = volume_of_sphere(dim, radii)
258
    exp_samples = density * volume
259
    return exp_goodness_of_fit(exp_samples, plot, normalized, return_dict)
260
261
262
def trivial_density_goodness_of_fit(
263
        samples,
264
        probs,
265
        plot=False,
266
        normalized=True,
267
        return_dict=False):
268
    assert len(samples)
269
    assert all(sample == samples[0] for sample in samples)
270
    result = {'gof': 1.0}
271
    if not normalized:
272
        result['norm'] = probs[0]
273
    if return_dict:
274
        return result
275
    else:
276
        return result['gof']
277
278
279
def auto_density_goodness_of_fit(
280
        samples,
281
        probs,
282
        plot=False,
283
        normalized=True,
284
        return_dict=False):
285
    """
286
    Dispatch on sample dimention and delegate to one of:
287
    - density_goodness_of_fit
288
    - vector_density_goodness_of_fit
289
    - trivial_density_goodness_of_fit
290
    """
291
    assert len(samples)
292
    dim = get_dim(samples[0])
293
    if dim == 0:
294
        fun = trivial_density_goodness_of_fit
295
    elif dim == 1:
296
        fun = density_goodness_of_fit
297
        if hasattr(samples[0], '__len__'):
298
            samples = [sample[0] for sample in samples]
299
    else:
300
        fun = vector_density_goodness_of_fit
301
    return fun(samples, probs, plot, normalized, return_dict)
302
303
304
def discrete_goodness_of_fit(
305
        samples,
306
        probs_dict,
307
        truncate_beyond=8,
308
        plot=False,
309
        normalized=True):
310
    """
311
    Transform arbitrary discrete data to multinomial
312
    and assess goodness of fit via Pearson's chi^2 test.
313
    """
314
    if not normalized:
315
        norm = sum(probs_dict.values())
316
        probs_dict = {i: p / norm for i, p in probs_dict.items()}
317
    counts = defaultdict(lambda: 0)
318
    for sample in samples:
319
        assert sample in probs_dict
320
        counts[sample] += 1
321
    items = [(prob, counts.get(i, 0)) for i, prob in probs_dict.items()]
322
    items.sort(reverse=True)
323
    truncated = (truncate_beyond and truncate_beyond < len(items))
324
    if truncated:
325
        items = items[:truncate_beyond]
326
    probs = [prob for prob, count in items]
327
    counts = [count for prob, count in items]
328
    assert sum(counts) > 100, 'WARNING imprecision; use more samples'
329
    return multinomial_goodness_of_fit(
330
        probs,
331
        counts,
332
        len(samples),
333
        truncated=truncated,
334
        plot=plot)
335
336
337
def split_discrete_continuous(data):
338
    """
339
    Convert arbitrary data to a pair `(discrete, continuous)`
340
    where `discrete` is hashable and `continuous` is a list of floats.
341
    """
342
    if isinstance(data, DISCRETE_TYPES):
343
        return data, []
344
    elif isinstance(data, CONTINUOUS_TYPES):
345
        return None, [data]
346
    elif isinstance(data, (tuple, list)):
347
        discrete = []
348
        continuous = []
349
        for part in data:
350
            d, c = split_discrete_continuous(part)
351
            discrete.append(d)
352
            continuous += c
353
        return tuple(discrete), continuous
354
    elif isinstance(data, numpy.ndarray):
355
        assert data.dtype in [numpy.float64, numpy.float32]
356
        return (None,) * len(data), list(map(float, data))
357
    else:
358
        raise TypeError(
359
            'split_discrete_continuous does not accept {} of type {}'.format(
360
                repr(data), str(type(data))))
361
362
363
def mixed_density_goodness_of_fit(samples, probs, plot=False, normalized=True):
364
    """
365
    Test general mixed discrete+continuous datatypes by
366
    (1) testing the continuous part conditioned on each discrete value
367
    (2) testing the discrete part marginalizing over the continuous part
368
    (3) testing the estimated total probability (if normalized = True)
369
370
    Inputs:
371
        samples - a list of plain-old-data samples from a distribution
372
        probs - a list of probability densities evaluated at those samples
373
    """
374
    assert len(samples)
375
    discrete_samples = []
376
    strata = defaultdict(lambda: ([], []))
377
    for sample, prob in zip(samples, probs):
378
        d, c = split_discrete_continuous(sample)
379
        discrete_samples.append(d)
380
        samples, probs = strata[d]
381
        samples.append(c)
382
        probs.append(prob)
383
384
    # Continuous part
385
    gofs = []
386
    discrete_probs = {}
387
    for key, (samples, probs) in strata.items():
388
        result = auto_density_goodness_of_fit(
389
            samples,
390
            probs,
391
            plot=plot,
392
            normalized=False,
393
            return_dict=True)
394
        gofs.append(result['gof'])
395
        discrete_probs[key] = result['norm']
396
397
    # Discrete part
398
    if len(strata) > 1:
399
        gofs.append(discrete_goodness_of_fit(
400
            discrete_samples,
401
            discrete_probs,
402
            plot=plot,
403
            normalized=False))
404
405
    # Normalization
406
    if normalized:
407
        norm = sum(discrete_probs.values())
408
        discrete_counts = [len(samples) for samples, _ in strata.values()]
409
        norm_variance = sum(1.0 / count for count in discrete_counts)
410
        dof = len(discrete_counts)
411
        chi_squared = (1 - norm) ** 2 / norm_variance
412
        gofs.append(chi2sf(chi_squared, dof))
413
        if plot:
414
            print('norm = {:.4g} +- {:.4g}'.format(norm, norm_variance ** 0.5))
415
            print('     = {}'.format(
416
                ' + '.join(map('{:.4g}'.format, discrete_probs.values()))))
417
418
    return min(gofs)
419