get_nearest_neighbor_distances() - Code Metrics - Inspection of "Uses KDTree from python-kdtree package, not SciPy" - posterior/goftests - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Pull Request — master (#14)

unknown

created 2017-03-31 04:11 UTC

get_nearest_neighbor_distances() B

↳ Parent: Project

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Importance

Changes	1
Bugs	0	Features	0

Metric	Value
cc	5
c	1
b	0
f	0
dl	0
loc	14
rs	8.5454

# Copyright (c) 2014, Salesforce.com, Inc.  All rights reserved.
# Copyright (c) 2015, Gamelan Labs, Inc.
# Copyright (c) 2016, Google, Inc.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# - Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the distribution.
# - Neither the name of Salesforce.com nor the names of its contributors
#   may be used to endorse or promote products derived from this
#   software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
# TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from __future__ import division
from collections import defaultdict
from math import gamma
try:
    from itertools import izip as zip
except ImportError:
    pass
import math
import random
import sys

import kdtree
import numpy
import numpy.random
from numpy import pi

from .utils import chi2sf

NoneType = type(None)

#: Data types for integral random variables.
#:
#: For Python 2.7, this tuple also includes `long`.
INTEGRAL_TYPES = (int, )

#: Data types for continuous random variables.
CONTINUOUS_TYPES = (float, numpy.float32, numpy.float64)

#: Data types for discrete random variables.
#:
#: For Python 2.7, this tuple also includes `long` and `basestring`.
DISCRETE_TYPES = (NoneType, bool, int, str, numpy.int32, numpy.int64)

if sys.version_info < (3, ):
    # `str` is a subclass of `basestring`, so this is doing a little
    # more work than is necessary, but it should not cause a problem.
    DISCRETE_TYPES += (long, basestring)  # noqa
    INTEGRAL_TYPES += (long, )  # noqa


def seed_all(seed):
    random.seed(seed)
    numpy.random.seed(seed)


def get_dim(thing):
    if hasattr(thing, '__len__'):
        return len(thing)
    else:
        return 1


def print_histogram(probs, counts):
    WIDTH = 60.0
    max_count = max(counts)
    print('{: >8} {: >8}'.format('Prob', 'Count'))
    for prob, count in sorted(zip(probs, counts), reverse=True):
        width = int(round(WIDTH * count / max_count))
        print('{: >8.3f} {: >8d} {}'.format(prob, count, '-' * width))


def multinomial_goodness_of_fit(
        probs,
        counts,
        total_count,
        truncated=False,
        plot=False):
    """
    Pearson's chi^2 test, on possibly truncated data.
    http://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test

    Returns:
        p-value of truncated multinomial sample.
    """
    assert len(probs) == len(counts)
    assert truncated or total_count == sum(counts)
    chi_squared = 0
    dof = 0
    if plot:
        print_histogram(probs, counts)
    for p, c in zip(probs, counts):
        if p == 1:
            return 1 if c == total_count else 0
        assert p < 1, 'bad probability: %g' % p
        if p > 0:
            mean = total_count * p
            variance = total_count * p * (1 - p)
            assert variance > 1,\
                'WARNING goodness of fit is inaccurate; use more samples'
            chi_squared += (c - mean) ** 2 / variance
            dof += 1
        else:
            print('WARNING zero probability in goodness-of-fit test')
            if c > 0:
                return float('inf')

    if not truncated:
        dof -= 1

    survival = chi2sf(chi_squared, dof)
    return survival


def unif01_goodness_of_fit(samples, plot=False):
    """
    Bin uniformly distributed samples and apply Pearson's chi^2 test.
    """
    samples = numpy.array(samples, dtype=float)
    assert samples.min() >= 0.0
    assert samples.max() <= 1.0
    bin_count = int(round(len(samples) ** 0.333))
    assert bin_count >= 7, 'WARNING imprecise test, use more samples'
    probs = numpy.ones(bin_count, dtype=numpy.float) / bin_count
    counts = numpy.zeros(bin_count, dtype=numpy.int)
    for sample in samples:
        counts[min(bin_count - 1, int(bin_count * sample))] += 1
    return multinomial_goodness_of_fit(probs, counts, len(samples), plot=plot)


def exp_goodness_of_fit(
        samples,
        plot=False,
        normalized=True,
        return_dict=False):
    """
    Transform exponentially distribued samples to unif01 distribution
    and assess goodness of fit via binned Pearson's chi^2 test.

    Inputs:
        samples - a list of real-valued samples from a candidate distribution
    """
    result = {}
    if not normalized:
        result['norm'] = numpy.mean(samples)
        samples /= result['norm']
    unif01_samples = numpy.exp(-samples)
    result['gof'] = unif01_goodness_of_fit(unif01_samples, plot=plot)
    return result if return_dict else result['gof']


def density_goodness_of_fit(
        samples,
        probs,
        plot=False,
        normalized=True,
        return_dict=False):
    """
    Transform arbitrary continuous samples to unif01 distribution
    and assess goodness of fit via binned Pearson's chi^2 test.

    Inputs:
        samples - a list of real-valued samples from a distribution
        probs - a list of probability densities evaluated at those samples
    """
    assert len(samples) == len(probs)
    assert len(samples) > 100, 'WARNING imprecision; use more samples'
    pairs = list(zip(samples, probs))
    pairs.sort()
    samples = numpy.array([x for x, p in pairs])
    probs = numpy.array([p for x, p in pairs])
    density = len(samples) * numpy.sqrt(probs[1:] * probs[:-1])
    gaps = samples[1:] - samples[:-1]
    exp_samples = density * gaps
    return exp_goodness_of_fit(exp_samples, plot, normalized, return_dict)


def volume_of_sphere(dim, radius):
    """Compute the radius or radii of a hypersphere or hyperspheres.

    This method returns the volume of a hypersphere of dimension `dim`
    and the given `radius`. If `radius` is an iterable, it is assumed to
    be an iterable of radii, and this function returns a list of volumes
    of hyperspheres, each of whose radius is the corresponding element
    of `radius`.

    """
    assert isinstance(dim, INTEGRAL_TYPES)
    denom = gamma(0.5 * dim + 1)
    numer_scale = pi ** (0.5 * dim)
    if hasattr(radius, '__iter__'):
        return [(r ** dim) * numer_scale / denom for r in radius]
    else:
        return (radius ** dim) * numer_scale / denom


def get_nearest_neighbor_distances(samples):
    if not hasattr(samples[0], '__iter__'):
        samples = [(x, ) for x in samples]
    if isinstance(samples, numpy.ndarray):
        samples = samples.tolist()
    tree = kdtree.create(samples)
    # `tree.search_knn(point, k=2)` returns a list of the two nearest
    # neighbors. The first neighbor is always the point itself, so we
    # get the second neighbor (that's the first subscript [1]). It
    # actually returns a pair comprising the node in the kdtree and the
    # *squared* distance, so we just extract the squared distance
    # (that's the second subscript [1]). Finally, we un-square the
    # squared distance.
    return [math.sqrt(tree.search_knn(point, k=2)[1][1]) for point in samples]


def vector_density_goodness_of_fit(
        samples,
        probs,
        plot=False,
        normalized=True,
        return_dict=False):
    """
    Transform arbitrary multivariate continuous samples
    to unif01 distribution via nearest neighbor distribution [1,2,3]
    and assess goodness of fit via binned Pearson's chi^2 test.

    [1] http://projecteuclid.org/download/pdf_1/euclid.aop/1176993668
    [2] http://arxiv.org/pdf/1006.3019v2.pdf
    [3] http://en.wikipedia.org/wiki/Nearest_neighbour_distribution

    Inputs:
        samples - a list of real-vector-valued samples from a distribution
        probs - a list of probability densities evaluated at those samples
    """
    assert len(samples)
    assert len(samples) == len(probs)
    dim = get_dim(samples[0])
    assert dim
    assert len(samples) > 1000 * dim, 'WARNING imprecision; use more samples'
    radii = get_nearest_neighbor_distances(samples)
    density = len(samples) * numpy.array(probs)
    volume = volume_of_sphere(dim, radii)
    exp_samples = density * volume
    return exp_goodness_of_fit(exp_samples, plot, normalized, return_dict)


def trivial_density_goodness_of_fit(
        samples,
        probs,
        plot=False,
        normalized=True,
        return_dict=False):
    assert len(samples)
    assert all(sample == samples[0] for sample in samples)
    result = {'gof': 1.0}
    if not normalized:
        result['norm'] = probs[0]
    if return_dict:
        return result
    else:
        return result['gof']


def auto_density_goodness_of_fit(
        samples,
        probs,
        plot=False,
        normalized=True,
        return_dict=False):
    """
    Dispatch on sample dimention and delegate to one of:
    - density_goodness_of_fit
    - vector_density_goodness_of_fit
    - trivial_density_goodness_of_fit
    """
    assert len(samples)
    dim = get_dim(samples[0])
    if dim == 0:
        fun = trivial_density_goodness_of_fit
    elif dim == 1:
        fun = density_goodness_of_fit
        if hasattr(samples[0], '__len__'):
            samples = [sample[0] for sample in samples]
    else:
        fun = vector_density_goodness_of_fit
    return fun(samples, probs, plot, normalized, return_dict)


def discrete_goodness_of_fit(
        samples,
        probs_dict,
        truncate_beyond=8,
        plot=False,
        normalized=True):
    """
    Transform arbitrary discrete data to multinomial
    and assess goodness of fit via Pearson's chi^2 test.
    """
    if not normalized:
        norm = sum(probs_dict.values())
        probs_dict = {i: p / norm for i, p in probs_dict.items()}
    counts = defaultdict(lambda: 0)
    for sample in samples:
        assert sample in probs_dict
        counts[sample] += 1
    items = [(prob, counts.get(i, 0)) for i, prob in probs_dict.items()]
    items.sort(reverse=True)
    truncated = (truncate_beyond and truncate_beyond < len(items))
    if truncated:
        items = items[:truncate_beyond]
    probs = [prob for prob, count in items]
    counts = [count for prob, count in items]
    assert sum(counts) > 100, 'WARNING imprecision; use more samples'
    return multinomial_goodness_of_fit(
        probs,
        counts,
        len(samples),
        truncated=truncated,
        plot=plot)


def split_discrete_continuous(data):
    """
    Convert arbitrary data to a pair `(discrete, continuous)`
    where `discrete` is hashable and `continuous` is a list of floats.
    """
    if isinstance(data, DISCRETE_TYPES):
        return data, []
    elif isinstance(data, CONTINUOUS_TYPES):
        return None, [data]
    elif isinstance(data, (tuple, list)):
        discrete = []
        continuous = []
        for part in data:
            d, c = split_discrete_continuous(part)
            discrete.append(d)
            continuous += c
        return tuple(discrete), continuous
    elif isinstance(data, numpy.ndarray):
        assert data.dtype in [numpy.float64, numpy.float32]
        return (None,) * len(data), list(map(float, data))
    else:
        raise TypeError(
            'split_discrete_continuous does not accept {} of type {}'.format(
                repr(data), str(type(data))))


def mixed_density_goodness_of_fit(samples, probs, plot=False, normalized=True):
    """
    Test general mixed discrete+continuous datatypes by
    (1) testing the continuous part conditioned on each discrete value
    (2) testing the discrete part marginalizing over the continuous part
    (3) testing the estimated total probability (if normalized = True)

    Inputs:
        samples - a list of plain-old-data samples from a distribution
        probs - a list of probability densities evaluated at those samples
    """
    assert len(samples)
    discrete_samples = []
    strata = defaultdict(lambda: ([], []))
    for sample, prob in zip(samples, probs):
        d, c = split_discrete_continuous(sample)
        discrete_samples.append(d)
        samples, probs = strata[d]
        samples.append(c)
        probs.append(prob)

    # Continuous part
    gofs = []
    discrete_probs = {}
    for key, (samples, probs) in strata.items():
        result = auto_density_goodness_of_fit(
            samples,
            probs,
            plot=plot,
            normalized=False,
            return_dict=True)
        gofs.append(result['gof'])
        discrete_probs[key] = result['norm']

    # Discrete part
    if len(strata) > 1:
        gofs.append(discrete_goodness_of_fit(
            discrete_samples,
            discrete_probs,
            plot=plot,
            normalized=False))

    # Normalization
    if normalized:
        norm = sum(discrete_probs.values())
        discrete_counts = [len(samples) for samples, _ in strata.values()]
        norm_variance = sum(1.0 / count for count in discrete_counts)
        dof = len(discrete_counts)
        chi_squared = (1 - norm) ** 2 / norm_variance
        gofs.append(chi2sf(chi_squared, dof))
        if plot:
            print('norm = {:.4g} +- {:.4g}'.format(norm, norm_variance ** 0.5))
            print('     = {}'.format(
                ' + '.join(map('{:.4g}'.format, discrete_probs.values()))))

    return min(gofs)


1			# Copyright (c) 2014, Salesforce.com, Inc. All rights reserved.
2			# Copyright (c) 2015, Gamelan Labs, Inc.
3			# Copyright (c) 2016, Google, Inc.
4			#
5			# Redistribution and use in source and binary forms, with or without
6			# modification, are permitted provided that the following conditions
7			# are met:
8			#
9			# - Redistributions of source code must retain the above copyright
10			# notice, this list of conditions and the following disclaimer.
11			# - Redistributions in binary form must reproduce the above copyright
12			# notice, this list of conditions and the following disclaimer in the
13			# documentation and/or other materials provided with the distribution.
14			# - Neither the name of Salesforce.com nor the names of its contributors
15			# may be used to endorse or promote products derived from this
16			# software without specific prior written permission.
17			#
18			# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19			# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20			# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
21			# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
22			# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
23			# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
24			# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
25			# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
26			# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
27			# TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
28			# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30			from __future__ import division
31			from collections import defaultdict
32			from math import gamma
33			try:
34			from itertools import izip as zip
35			except ImportError:
36			pass
37			import math
38			import random
39			import sys
40
41			import kdtree
42			import numpy
43			import numpy.random
44			from numpy import pi
45
46			from .utils import chi2sf
47
48			NoneType = type(None)
49
50			#: Data types for integral random variables.
51			#:
52			#: For Python 2.7, this tuple also includes `long`.
53			INTEGRAL_TYPES = (int, )
54
55			#: Data types for continuous random variables.
56			CONTINUOUS_TYPES = (float, numpy.float32, numpy.float64)
57
58			#: Data types for discrete random variables.
59			#:
60			#: For Python 2.7, this tuple also includes `long` and `basestring`.
61			DISCRETE_TYPES = (NoneType, bool, int, str, numpy.int32, numpy.int64)
62
63			if sys.version_info < (3, ):
64			# `str` is a subclass of `basestring`, so this is doing a little
65			# more work than is necessary, but it should not cause a problem.
66			DISCRETE_TYPES += (long, basestring) # noqa
67			INTEGRAL_TYPES += (long, ) # noqa
68
69
70			def seed_all(seed):
71			random.seed(seed)
72			numpy.random.seed(seed)
73
74
75			def get_dim(thing):
76			if hasattr(thing, '__len__'):
77			return len(thing)
78			else:
79			return 1
80
81
82			def print_histogram(probs, counts):
83			WIDTH = 60.0
84			max_count = max(counts)
85			print('{: >8} {: >8}'.format('Prob', 'Count'))
86			for prob, count in sorted(zip(probs, counts), reverse=True):
87			width = int(round(WIDTH * count / max_count))
88			print('{: >8.3f} {: >8d} {}'.format(prob, count, '-' * width))
89
90
91			def multinomial_goodness_of_fit(
92			probs,
93			counts,
94			total_count,
95			truncated=False,
96			plot=False):
97			"""
98			Pearson's chi^2 test, on possibly truncated data.
99			http://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
100
101			Returns:
102			p-value of truncated multinomial sample.
103			"""
104			assert len(probs) == len(counts)
105			assert truncated or total_count == sum(counts)
106			chi_squared = 0
107			dof = 0
108			if plot:
109			print_histogram(probs, counts)
110			for p, c in zip(probs, counts):
111			if p == 1:
112			return 1 if c == total_count else 0
113			assert p < 1, 'bad probability: %g' % p
114			if p > 0:
115			mean = total_count * p
116			variance = total_count * p * (1 - p)
117			assert variance > 1,\
118			'WARNING goodness of fit is inaccurate; use more samples'
119			chi_squared += (c - mean) ** 2 / variance
120			dof += 1
121			else:
122			print('WARNING zero probability in goodness-of-fit test')
123			if c > 0:
124			return float('inf')
125
126			if not truncated:
127			dof -= 1
128
129			survival = chi2sf(chi_squared, dof)
130			return survival
131
132
133			def unif01_goodness_of_fit(samples, plot=False):
134			"""
135			Bin uniformly distributed samples and apply Pearson's chi^2 test.
136			"""
137			samples = numpy.array(samples, dtype=float)
138			assert samples.min() >= 0.0
139			assert samples.max() <= 1.0
140			bin_count = int(round(len(samples) ** 0.333))
141			assert bin_count >= 7, 'WARNING imprecise test, use more samples'
142			probs = numpy.ones(bin_count, dtype=numpy.float) / bin_count
143			counts = numpy.zeros(bin_count, dtype=numpy.int)
144			for sample in samples:
145			counts[min(bin_count - 1, int(bin_count * sample))] += 1
146			return multinomial_goodness_of_fit(probs, counts, len(samples), plot=plot)
147
148
149			def exp_goodness_of_fit(
150			samples,
151			plot=False,
152			normalized=True,
153			return_dict=False):
154			"""
155			Transform exponentially distribued samples to unif01 distribution
156			and assess goodness of fit via binned Pearson's chi^2 test.
157
158			Inputs:
159			samples - a list of real-valued samples from a candidate distribution
160			"""
161			result = {}
162			if not normalized:
163			result['norm'] = numpy.mean(samples)
164			samples /= result['norm']
165			unif01_samples = numpy.exp(-samples)
166			result['gof'] = unif01_goodness_of_fit(unif01_samples, plot=plot)
167			return result if return_dict else result['gof']
168
169
170			def density_goodness_of_fit(
171			samples,
172			probs,
173			plot=False,
174			normalized=True,
175			return_dict=False):
176			"""
177			Transform arbitrary continuous samples to unif01 distribution
178			and assess goodness of fit via binned Pearson's chi^2 test.
179
180			Inputs:
181			samples - a list of real-valued samples from a distribution
182			probs - a list of probability densities evaluated at those samples
183			"""
184			assert len(samples) == len(probs)
185			assert len(samples) > 100, 'WARNING imprecision; use more samples'
186			pairs = list(zip(samples, probs))
187			pairs.sort()
188			samples = numpy.array([x for x, p in pairs])
189			probs = numpy.array([p for x, p in pairs])
190			density = len(samples) * numpy.sqrt(probs[1:] * probs[:-1])
191			gaps = samples[1:] - samples[:-1]
192			exp_samples = density * gaps
193			return exp_goodness_of_fit(exp_samples, plot, normalized, return_dict)
194
195
196			def volume_of_sphere(dim, radius):
197			"""Compute the radius or radii of a hypersphere or hyperspheres.
198
199			This method returns the volume of a hypersphere of dimension `dim`
200			and the given `radius`. If `radius` is an iterable, it is assumed to
201			be an iterable of radii, and this function returns a list of volumes
202			of hyperspheres, each of whose radius is the corresponding element
203			of `radius`.
204
205			"""
206			assert isinstance(dim, INTEGRAL_TYPES)
207			denom = gamma(0.5 * dim + 1)
208			numer_scale = pi ** (0.5 * dim)
209			if hasattr(radius, '__iter__'):
210			return [(r ** dim) * numer_scale / denom for r in radius]
211			else:
212			return (radius ** dim) * numer_scale / denom
213
214
215			def get_nearest_neighbor_distances(samples):
216			if not hasattr(samples[0], '__iter__'):
217			samples = [(x, ) for x in samples]
218			if isinstance(samples, numpy.ndarray):
219			samples = samples.tolist()
220			tree = kdtree.create(samples)
221			# `tree.search_knn(point, k=2)` returns a list of the two nearest
222			# neighbors. The first neighbor is always the point itself, so we
223			# get the second neighbor (that's the first subscript [1]). It
224			# actually returns a pair comprising the node in the kdtree and the
225			# squared distance, so we just extract the squared distance
226			# (that's the second subscript [1]). Finally, we un-square the
227			# squared distance.
228			return [math.sqrt(tree.search_knn(point, k=2)[1][1]) for point in samples]
229
230
231			def vector_density_goodness_of_fit(
232			samples,
233			probs,
234			plot=False,
235			normalized=True,
236			return_dict=False):
237			"""
238			Transform arbitrary multivariate continuous samples
239			to unif01 distribution via nearest neighbor distribution [1,2,3]
240			and assess goodness of fit via binned Pearson's chi^2 test.
241
242			[1] http://projecteuclid.org/download/pdf_1/euclid.aop/1176993668
243			[2] http://arxiv.org/pdf/1006.3019v2.pdf
244			[3] http://en.wikipedia.org/wiki/Nearest_neighbour_distribution
245
246			Inputs:
247			samples - a list of real-vector-valued samples from a distribution
248			probs - a list of probability densities evaluated at those samples
249			"""
250			assert len(samples)
251			assert len(samples) == len(probs)
252			dim = get_dim(samples[0])
253			assert dim
254			assert len(samples) > 1000 * dim, 'WARNING imprecision; use more samples'
255			radii = get_nearest_neighbor_distances(samples)
256			density = len(samples) * numpy.array(probs)
257			volume = volume_of_sphere(dim, radii)
258			exp_samples = density * volume
259			return exp_goodness_of_fit(exp_samples, plot, normalized, return_dict)
260
261
262			def trivial_density_goodness_of_fit(
263			samples,
264			probs,
265			plot=False,
266			normalized=True,
267			return_dict=False):
268			assert len(samples)
269			assert all(sample == samples[0] for sample in samples)
270			result = {'gof': 1.0}
271			if not normalized:
272			result['norm'] = probs[0]
273			if return_dict:
274			return result
275			else:
276			return result['gof']
277
278
279			def auto_density_goodness_of_fit(
280			samples,
281			probs,
282			plot=False,
283			normalized=True,
284			return_dict=False):
285			"""
286			Dispatch on sample dimention and delegate to one of:
287			- density_goodness_of_fit
288			- vector_density_goodness_of_fit
289			- trivial_density_goodness_of_fit
290			"""
291			assert len(samples)
292			dim = get_dim(samples[0])
293			if dim == 0:
294			fun = trivial_density_goodness_of_fit
295			elif dim == 1:
296			fun = density_goodness_of_fit
297			if hasattr(samples[0], '__len__'):
298			samples = [sample[0] for sample in samples]
299			else:
300			fun = vector_density_goodness_of_fit
301			return fun(samples, probs, plot, normalized, return_dict)
302
303
304			def discrete_goodness_of_fit(
305			samples,
306			probs_dict,
307			truncate_beyond=8,
308			plot=False,
309			normalized=True):
310			"""
311			Transform arbitrary discrete data to multinomial
312			and assess goodness of fit via Pearson's chi^2 test.
313			"""
314			if not normalized:
315			norm = sum(probs_dict.values())
316			probs_dict = {i: p / norm for i, p in probs_dict.items()}
317			counts = defaultdict(lambda: 0)
318			for sample in samples:
319			assert sample in probs_dict
320			counts[sample] += 1
321			items = [(prob, counts.get(i, 0)) for i, prob in probs_dict.items()]
322			items.sort(reverse=True)
323			truncated = (truncate_beyond and truncate_beyond < len(items))
324			if truncated:
325			items = items[:truncate_beyond]
326			probs = [prob for prob, count in items]
327			counts = [count for prob, count in items]
328			assert sum(counts) > 100, 'WARNING imprecision; use more samples'
329			return multinomial_goodness_of_fit(
330			probs,
331			counts,
332			len(samples),
333			truncated=truncated,
334			plot=plot)
335
336
337			def split_discrete_continuous(data):
338			"""
339			Convert arbitrary data to a pair `(discrete, continuous)`
340			where `discrete` is hashable and `continuous` is a list of floats.
341			"""
342			if isinstance(data, DISCRETE_TYPES):
343			return data, []
344			elif isinstance(data, CONTINUOUS_TYPES):
345			return None, [data]
346			elif isinstance(data, (tuple, list)):
347			discrete = []
348			continuous = []
349			for part in data:
350			d, c = split_discrete_continuous(part)
351			discrete.append(d)
352			continuous += c
353			return tuple(discrete), continuous
354			elif isinstance(data, numpy.ndarray):
355			assert data.dtype in [numpy.float64, numpy.float32]
356			return (None,) * len(data), list(map(float, data))
357			else:
358			raise TypeError(
359			'split_discrete_continuous does not accept {} of type {}'.format(
360			repr(data), str(type(data))))
361
362
363			def mixed_density_goodness_of_fit(samples, probs, plot=False, normalized=True):
364			"""
365			Test general mixed discrete+continuous datatypes by
366			(1) testing the continuous part conditioned on each discrete value
367			(2) testing the discrete part marginalizing over the continuous part
368			(3) testing the estimated total probability (if normalized = True)
369
370			Inputs:
371			samples - a list of plain-old-data samples from a distribution
372			probs - a list of probability densities evaluated at those samples
373			"""
374			assert len(samples)
375			discrete_samples = []
376			strata = defaultdict(lambda: ([], []))
377			for sample, prob in zip(samples, probs):
378			d, c = split_discrete_continuous(sample)
379			discrete_samples.append(d)
380			samples, probs = strata[d]
381			samples.append(c)
382			probs.append(prob)
383
384			# Continuous part
385			gofs = []
386			discrete_probs = {}
387			for key, (samples, probs) in strata.items():
388			result = auto_density_goodness_of_fit(
389			samples,
390			probs,
391			plot=plot,
392			normalized=False,
393			return_dict=True)
394			gofs.append(result['gof'])
395			discrete_probs[key] = result['norm']
396
397			# Discrete part
398			if len(strata) > 1:
399			gofs.append(discrete_goodness_of_fit(
400			discrete_samples,
401			discrete_probs,
402			plot=plot,
403			normalized=False))
404
405			# Normalization
406			if normalized:
407			norm = sum(discrete_probs.values())
408			discrete_counts = [len(samples) for samples, _ in strata.values()]
409			norm_variance = sum(1.0 / count for count in discrete_counts)
410			dof = len(discrete_counts)
411			chi_squared = (1 - norm) ** 2 / norm_variance
412			gofs.append(chi2sf(chi_squared, dof))
413			if plot:
414			print('norm = {:.4g} +- {:.4g}'.format(norm, norm_variance ** 0.5))
415			print(' = {}'.format(
416			' + '.join(map('{:.4g}'.format, discrete_probs.values()))))
417
418			return min(gofs)
419

posterior / goftests

GitHub Access Token became invalid

Pull Request — master (#14)

get_nearest_neighbor_distances() B

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like