cdf_to_pdf() - Code Metrics - Inspection of "Bump to version 0.2.1" - posterior/goftests - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 668ca3...a3efe1 )

by Fritz

created 2016-06-08 03:21 UTC

cdf_to_pdf() A

↳ Parent: Project

Complexity

Conditions

Size

Total Lines

Duplication

Lines	0
Ratio	0 %

Importance

Changes	1
Bugs	0	Features	0

Metric	Value
cc	2
c	1
b	0
f	0
dl	0
loc	6
rs	9.4285

# Copyright (c) 2014, Salesforce.com, Inc.  All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# - Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the distribution.
# - Neither the name of Salesforce.com nor the names of its contributors
#   may be used to endorse or promote products derived from this
#   software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
# TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from itertools import izip
import numpy
from matplotlib import pyplot
from sklearn.neighbors import NearestNeighbors
from goftests import volume_of_sphere
import parsable


def get_dim(value):
    '''
    Return the dimension of a sample value.

    Scalars (Python floats and ints, and numpy scalar types) have
    dimension 1. Any other value is treated as a sized container and its
    length is returned.
    '''
    # Integers and numpy scalars have no len(), so they must be caught
    # here alongside plain floats.
    if isinstance(value, (float, int, numpy.number)):
        return 1
    return len(value)


def get_samples(model, EXAMPLE, sample_count):
    '''
    Draw values from the group described by EXAMPLE and score each one.

    The shared state is built with model.Shared.from_dict from
    EXAMPLE['shared'], and the group with model.Group.from_values from
    EXAMPLE['values']. Each drawn value is scored immediately after it is
    sampled.

    Returns a pair of numpy arrays (samples, probs), each with
    sample_count entries, where probs[i] is group.score_value for
    samples[i].
    '''
    shared = model.Shared.from_dict(EXAMPLE['shared'])
    group = model.Group.from_values(shared, EXAMPLE['values'])

    # NOTE: an earlier variant drew values through model.Sampler
    # (sampler.init(shared, group) followed by sampler.eval(shared)); the
    # original author marked it as seemingly broken, so values are drawn
    # directly from the group instead.
    drawn = []
    scores = []
    for _ in xrange(sample_count):
        drawn.append(group.sample_value(shared))
        scores.append(group.score_value(shared, drawn[-1]))

    return numpy.array(drawn), numpy.array(scores)


def get_edge_stats(samples, probs):
    '''
    Pair each sample's nearest neighbor distance with its probability.

    Scalar samples (first element has no __iter__) are reshaped into a
    single-column array before fitting.

    Returns a dict with:
        'lengths': column 1 of the 2-nearest-neighbor distance matrix,
        'probs': the probs argument, passed through unchanged.
    '''
    is_scalar_data = not hasattr(samples[0], '__iter__')
    if is_scalar_data:
        # N scalars -> (1, N) -> transpose to N one-dimensional points.
        samples = numpy.array([samples]).T

    knn = NearestNeighbors(n_neighbors=2).fit(samples)
    # Two neighbors are requested and only the second column is kept;
    # presumably column 0 is each point matched to itself -- confirm
    # against the sklearn kneighbors documentation.
    distances, _ = knn.kneighbors(samples)
    return {'lengths': distances[:, 1], 'probs': probs}


@parsable.command
def plot_edges(sample_count=1000, seed=0):
    '''
    Plot edges of niw examples.

    One row of two panels is drawn per entry of niw.EXAMPLES. The left
    panel scatters edge probs against log nearest neighbor edge length;
    the right panel scatters edge probs against the statistic
    exp((prob - log(length)) / dim).

    Arguments:
        sample_count: number of values sampled per example.
        seed: value passed to seed_all before sampling.
    '''
    # NOTE(review): seed_all and niw are not defined or imported in the
    # visible part of this file -- confirm they are provided elsewhere.
    seed_all(seed)
    model = niw

    # squeeze=False keeps axes two-dimensional even when there is a single
    # example, so every row always unpacks into (left, right).
    fig, axes = pyplot.subplots(
        len(model.EXAMPLES),
        2,
        sharey='row',
        squeeze=False,
        figsize=(8, 12))

    for EXAMPLE, (ax1, ax2) in izip(model.EXAMPLES, axes):
        dim = get_dim(EXAMPLE['shared']['mu'])
        samples, probs = get_samples(model, EXAMPLE, sample_count)
        edges = get_edge_stats(samples, probs)

        edge_lengths = numpy.log(edges['lengths'])
        edge_probs = edges['probs']
        # Both operands are numpy arrays (see get_samples and numpy.log),
        # so the statistic is computed in one vectorized expression.
        edge_stats = numpy.exp((edge_probs - edge_lengths) / dim)

        ax1.set_title('NIW, dim = {}'.format(dim))
        ax1.scatter(edge_lengths, edge_probs, lw=0, alpha=0.5)
        ax1.set_ylabel('log(edge prob)')

        ax2.scatter(edge_stats, edge_probs, lw=0, alpha=0.5)
        ax2.yaxis.set_label_position('right')

    # Only the bottom row carries these labels.
    bottom_left, bottom_right = axes[-1]
    bottom_left.set_xlabel('log(edge length)')
    bottom_right.set_ylabel('statistic')
    fig.tight_layout()
    fig.subplots_adjust(wspace=0)
    pyplot.show()


def cdf_to_pdf(Y, X, bandwidth=0.1):
    '''
    Turn sampled cdf values into a smoothed finite-difference estimate.

    Arguments:
        Y: sequence of cdf values (list or numpy array).
        X: sequence of positions, same length as Y.
        bandwidth: fraction of len(Y) used as the differencing lag; the
            lag is rounded to an integer and is never less than 1.

    Returns a pair of numpy arrays (Y_diff, X_mid), each shorter than the
    input by the lag:
        Y_diff[i] = (Y[i + lag] - Y[i]) / lag
        X_mid[i] = (X[i + lag] + X[i]) / 2
    Note that Y_diff is a difference per index step, not per unit of X;
    callers rescale it as needed.
    '''
    assert len(Y) == len(X)
    # Accept plain sequences as well as arrays; slicing arithmetic below
    # needs numpy semantics.
    values = numpy.asarray(Y)
    positions = numpy.asarray(X)
    shift = max(1, int(round(len(values) * bandwidth)))
    diffs = (1.0 / shift) * (values[shift:] - values[:-shift])
    midpoints = 0.5 * (positions[shift:] + positions[:-shift])
    return diffs, midpoints


def plot_cdfs(examples):
    '''
    Plot test statistic cdfs based on the Nearest Neighbor distribution.

    Arguments:
        examples: iterable of mappings, each with
            'samples': sequence of sample points,
            'probs': one value per sample, scaled by the sample count
                and used as the intensity at that sample,
            'name': optional legend label (assumed key -- confirm
                against callers); defaults to 'example <index>'.

    For each example the statistic
        1 - exp(-intensity * volume_of_sphere(dim, radius))
    is computed per sample, where radius is the nearest neighbor
    distance. The sorted statistics are plotted against evenly spaced
    quantiles in the top panel (CDF), and their smoothed finite
    difference in the bottom panel (PDF). The reported error is
    2 * mean(statistic) - 1; an example is labelled PASS when
    abs(error) < 0.05 and FAIL otherwise.

    This function draws no samples itself; it only uses those supplied
    in examples.
    '''
    fig, (ax1, ax2) = pyplot.subplots(2, 1, sharex=True, figsize=(8, 10))
    # Dashed reference curves: the diagonal on the CDF panel and the
    # constant 1 on the PDF panel.
    ax1.plot([0, 1], [0, 1], 'k--')
    ax2.plot([0, 1], [1, 1], 'k--')

    for index, example in enumerate(examples):
        sample_count = len(example['samples'])
        dim = get_dim(example['samples'][0])
        if 'name' in example:
            name = example['name']
        else:
            name = 'example {}'.format(index)

        edges = get_edge_stats(example['samples'], example['probs'])
        radii = edges['lengths']
        intensities = sample_count * numpy.array(edges['probs'])

        cdf = numpy.array([
            1 - numpy.exp(-intensity * volume_of_sphere(dim, radius))
            for intensity, radius in izip(intensities, radii)
        ])
        cdf.sort()
        # Midpoints of sample_count equal-width bins on [0, 1].
        X = numpy.arange(0.5 / sample_count, 1, 1.0 / sample_count)

        pdf, Xp = cdf_to_pdf(cdf, X)
        # cdf_to_pdf differences per index step; X advances by
        # 1 / sample_count per step, so rescale to a slope in X.
        pdf *= sample_count

        error = 2 * (sum(cdf) / sample_count) - 1
        if abs(error) < 0.05:
            status = 'PASS'
            linestyle = '-'
        else:
            status = 'FAIL'
            linestyle = '--'
        label = '{} {}({}) error = {:.3g}'.format(status, name, dim, error)
        ax1.plot(X, cdf, linestyle=linestyle, label=label)
        ax2.plot(Xp, pdf, linestyle=linestyle, label=label)

    ax1.set_title('GOF of Nearest Neighbor Statistic')
    ax1.legend(loc='best', prop={'size': 10}, fancybox=True, framealpha=0.5)
    ax1.set_ylabel('CDF')
    ax2.set_ylabel('PDF')
    pyplot.tight_layout()
    fig.subplots_adjust(hspace=0)
    pyplot.show()


def neighbor_scatter(samples, probs, title='nearest neighbor'):
    '''
    Scatter a 2d dataset, coloring each datapoint by its nearest neighbor
    statistic.

    Arguments:
        samples: nonempty sequence of 2d points.
        probs: one value per sample, scaled by the sample count and used
            as the intensity at that sample.
        title: prefix of the figure title; the error
            2 * mean(statistic) - 1 is appended to it.

    Asserts that samples is nonempty and that the first sample has
    exactly two coordinates.
    '''
    sample_count = len(samples)
    assert sample_count
    dim = len(samples[0])
    assert dim == 2, dim

    pyplot.figure()

    edges = get_edge_stats(samples, probs)
    intensities = sample_count * numpy.array(edges['probs'])

    # Per-point statistic: 1 - exp(-intensity * volume of the ball whose
    # radius is the nearest neighbor distance).
    statistic = []
    for intensity, radius in izip(intensities, edges['lengths']):
        ball_volume = volume_of_sphere(dim, radius)
        statistic.append(1 - numpy.exp(-intensity * ball_volume))
    statistic = numpy.array(statistic)
    error = 2 * (sum(statistic) / sample_count) - 1

    first_coords = [point[0] for point in samples]
    second_coords = [point[1] for point in samples]

    pyplot.title('{} error = {:0.3g}'.format(title, error))
    pyplot.scatter(
        first_coords,
        second_coords,
        50,
        alpha=0.5,
        c=statistic,
        cmap=pyplot.get_cmap('bwr'))
    pyplot.axis('equal')

    pyplot.tight_layout()
    pyplot.show()


1			# Copyright (c) 2014, Salesforce.com, Inc. All rights reserved.
2			#
3			# Redistribution and use in source and binary forms, with or without
4			# modification, are permitted provided that the following conditions
5			# are met:
6			#
7			# - Redistributions of source code must retain the above copyright
8			# notice, this list of conditions and the following disclaimer.
9			# - Redistributions in binary form must reproduce the above copyright
10			# notice, this list of conditions and the following disclaimer in the
11			# documentation and/or other materials provided with the distribution.
12			# - Neither the name of Salesforce.com nor the names of its contributors
13			# may be used to endorse or promote products derived from this
14			# software without specific prior written permission.
15			#
16			# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17			# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18			# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
19			# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
20			# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
21			# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
22			# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
23			# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24			# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
25			# TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
26			# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28			from itertools import izip
29			import numpy
30			from matplotlib import pyplot
31			from sklearn.neighbors import NearestNeighbors
32			from goftests import volume_of_sphere
33			import parsable
34
35
36			def get_dim(value):
37			if isinstance(value, float):
38			return 1
39			else:
40			return len(value)
41
42
43			def get_samples(model, EXAMPLE, sample_count):
44			shared = model.Shared.from_dict(EXAMPLE['shared'])
45			values = EXAMPLE['values']
46			group = model.Group.from_values(shared, values)
47
48			# This version seems to be broken
49			# sampler = model.Sampler()
50			# sampler.init(shared, group)
51			# ...
52			# for _ in xrange(sample_count):
53			# value = sampler.eval(shared)
54
55			samples = []
56			probs = []
57			for _ in xrange(sample_count):
58			value = group.sample_value(shared)
59			samples.append(value)
60			score = group.score_value(shared, value)
61			probs.append(score)
62
63			return numpy.array(samples), numpy.array(probs)
64
65
66			def get_edge_stats(samples, probs):
67			if not hasattr(samples[0], '__iter__'):
68			samples = numpy.array([samples]).T
69			neighbors = NearestNeighbors(n_neighbors=2).fit(samples)
70			distances, indices = neighbors.kneighbors(samples)
71			return {'lengths': distances[:, 1], 'probs': probs}
72
73
74			@parsable.command
75			def plot_edges(sample_count=1000, seed=0):
76			'''
77			Plot edges of niw examples.
78			'''
79			seed_all(seed)
80			fig, axes = pyplot.subplots(
81			len(niw.EXAMPLES),
82			2,
83			sharey='row',
84			figsize=(8, 12))
85
86			model = niw
87			for EXAMPLE, (ax1, ax2) in izip(model.EXAMPLES, axes):
88			dim = get_dim(EXAMPLE['shared']['mu'])
89			samples, probs = get_samples(model, EXAMPLE, sample_count)
90			edges = get_edge_stats(samples, probs)
91
92			edge_lengths = numpy.log(edges['lengths'])
93			edge_probs = edges['probs']
94			edge_stats = [
95			numpy.exp((s - d) / dim)
96			for d, s in izip(edge_lengths, edge_probs)
97			]
98
99			ax1.set_title('NIW, dim = {}'.format(dim))
100			ax1.scatter(edge_lengths, edge_probs, lw=0, alpha=0.5)
101			ax1.set_ylabel('log(edge prob)')
102
103			ax2.scatter(edge_stats, edge_probs, lw=0, alpha=0.5)
104			ax2.yaxis.set_label_position('right')
105
106			ax1.set_xlabel('log(edge length)')
107			ax2.set_ylabel('statistic')
108			fig.tight_layout()
109			fig.subplots_adjust(wspace=0)
110			pyplot.show()
111
112
113			def cdf_to_pdf(Y, X, bandwidth=0.1):
114			assert len(Y) == len(X)
115			shift = max(1, int(round(len(Y) * bandwidth)))
116			Y = (1.0 / shift) * (Y[shift:] - Y[:-shift])
117			X = 0.5 * (X[shift:] + X[:-shift])
118			return Y, X
119
120
121			def plot_cdfs(examples):
122			'''
123			Plot test statistic cdfs based on the Nearest Neighbor distribution.
124			'''
125			seed_all(seed)
126
127			fig, (ax1, ax2) = pyplot.subplots(2, 1, sharex=True, figsize=(8, 10))
128			ax1.plot([0, 1], [0, 1], 'k--')
129			ax2.plot([0, 1], [1, 1], 'k--')
130
131			for example in model.examples:
132			sample_count = len(example['samples'])
133			dim = get_dim(example['samples'][0])
134			samples, probs = get_samples(model, EXAMPLE, sample_count)
135			edges = get_edge_stats(example['samples'], example['probs'])
136			radii = edges['lengths']
137			intensities = sample_count * numpy.array(edges['probs'])
138
139			cdf = numpy.array([
140			1 - numpy.exp(-intensity * volume_of_sphere(dim, radius))
141			for intensity, radius in izip(intensities, radii)
142			])
143			cdf.sort()
144			X = numpy.arange(0.5 / sample_count, 1, 1.0 / sample_count)
145
146			pdf, Xp = cdf_to_pdf(cdf, X)
147			pdf *= sample_count
148
149			error = 2 * (sum(cdf) / sample_count) - 1
150			if abs(error) < 0.05:
151			status = 'PASS'
152			linestyle = '-'
153			else:
154			status = 'FAIL'
155			linestyle = '--'
156			label = '{} {}({}) error = {:.3g}'.format(status, name, dim, error)
157			ax1.plot(X, cdf, linestyle=linestyle, label=label)
158			ax2.plot(Xp, pdf, linestyle=linestyle, label=label)
159
160			ax1.set_title('GOF of Nearest Neighbor Statistic')
161			ax1.legend(loc='best', prop={'size': 10}, fancybox=True, framealpha=0.5)
162			ax1.set_ylabel('CDF')
163			ax2.set_ylabel('PDF')
164			pyplot.tight_layout()
165			fig.subplots_adjust(hspace=0)
166			pyplot.show()
167
168
169			def neighbor_scatter(samples, probs, title='nearest neighbor'):
170			'''
171			Plot nearest neighbor statistic cdf for all datatpoints in a 2d dataset.
172			'''
173			sample_count = len(samples)
174			assert sample_count
175			dim = len(samples[0])
176			assert dim == 2, dim
177
178			pyplot.figure()
179			cmap = pyplot.get_cmap('bwr')
180
181			edges = get_edge_stats(samples, probs)
182			radii = edges['lengths']
183			intensities = sample_count * numpy.array(edges['probs'])
184
185			cdf = numpy.array([
186			1 - numpy.exp(-intensity * volume_of_sphere(dim, radius))
187			for intensity, radius in izip(intensities, radii)
188			])
189			error = 2 * (sum(cdf) / sample_count) - 1
190
191			X = [value[0] for value in samples]
192			Y = [value[1] for value in samples]
193			colors = cdf
194
195			pyplot.title('{} error = {:0.3g}'.format(title, error))
196			pyplot.scatter(X, Y, 50, alpha=0.5, c=colors, cmap=cmap)
197			pyplot.axis('equal')
198
199			pyplot.tight_layout()
200			pyplot.show()
201

posterior / goftests

GitHub Access Token became invalid

Push — master ( 668ca3...a3efe1 )

cdf_to_pdf() A

Complexity

Size

Duplication

Importance

Duplication Side-by-Side

Filter issues like