|
1
|
|
|
# Copyright (c) 2014, Salesforce.com, Inc. All rights reserved. |
|
2
|
|
|
# Copyright (c) 2015, Gamelan Labs, Inc. |
|
3
|
|
|
# Copyright (c) 2016, Google, Inc. |
|
4
|
|
|
# |
|
5
|
|
|
# Redistribution and use in source and binary forms, with or without |
|
6
|
|
|
# modification, are permitted provided that the following conditions |
|
7
|
|
|
# are met: |
|
8
|
|
|
# |
|
9
|
|
|
# - Redistributions of source code must retain the above copyright |
|
10
|
|
|
# notice, this list of conditions and the following disclaimer. |
|
11
|
|
|
# - Redistributions in binary form must reproduce the above copyright |
|
12
|
|
|
# notice, this list of conditions and the following disclaimer in the |
|
13
|
|
|
# documentation and/or other materials provided with the distribution. |
|
14
|
|
|
# - Neither the name of Salesforce.com nor the names of its contributors |
|
15
|
|
|
# may be used to endorse or promote products derived from this |
|
16
|
|
|
# software without specific prior written permission. |
|
17
|
|
|
# |
|
18
|
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
|
19
|
|
|
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
|
20
|
|
|
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS |
|
21
|
|
|
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE |
|
22
|
|
|
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
|
23
|
|
|
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, |
|
24
|
|
|
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS |
|
25
|
|
|
# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
|
26
|
|
|
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR |
|
27
|
|
|
# TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE |
|
28
|
|
|
# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|
29
|
|
|
from __future__ import division |
|
30
|
|
|
try: |
|
31
|
|
|
from itertools import izip as zip |
|
32
|
|
|
except ImportError: |
|
33
|
|
|
pass |
|
34
|
|
|
import numpy |
|
35
|
|
|
import scipy.stats |
|
36
|
|
|
from numpy import pi |
|
37
|
|
|
from numpy.testing import rand |
|
38
|
|
|
from nose import SkipTest |
|
39
|
|
|
from nose.tools import assert_almost_equal |
|
40
|
|
|
from nose.tools import assert_equal |
|
41
|
|
|
from nose.tools import assert_greater |
|
42
|
|
|
from nose.tools import assert_less |
|
43
|
|
|
from goftests import seed_all |
|
44
|
|
|
from goftests import get_dim |
|
45
|
|
|
from goftests import multinomial_goodness_of_fit |
|
46
|
|
|
from goftests import discrete_goodness_of_fit |
|
47
|
|
|
from goftests import auto_density_goodness_of_fit |
|
48
|
|
|
from goftests import mixed_density_goodness_of_fit |
|
49
|
|
|
from goftests import split_discrete_continuous |
|
50
|
|
|
from goftests import volume_of_sphere |
|
51
|
|
|
|
|
52
|
|
|
# Sample-size knobs: the scipy.stats tests in this file draw
# NUM_BASE_SAMPLES + NUM_SAMPLES_SCALE * dim samples per parameter set.
NUM_BASE_SAMPLES = 200
NUM_SAMPLES_SCALE = 1000

# Threshold that goodness-of-fit values are compared against by the
# assert_greater / assert_less checks in this file.
TEST_FAILURE_RATE = 0.0005
|
57
|
|
|
|
|
58
|
|
|
|
|
59
|
|
|
def test_multinomial_goodness_of_fit():
    """Yield one multinomial goodness-of-fit check per dimension 2..19."""
    min_dim, max_dim = 2, 19
    for dim in range(min_dim, max_dim + 1):
        yield _test_multinomial_goodness_of_fit, dim
|
62
|
|
|
|
|
63
|
|
|
|
|
64
|
|
|
def _test_multinomial_goodness_of_fit(dim):
    """Check multinomial_goodness_of_fit on matching and mismatched counts.

    A probability vector of length ``dim`` is drawn from a flat Dirichlet.
    Counts sampled from that vector must score above TEST_FAILURE_RATE,
    while counts sampled from a uniform vector must score below it.
    """
    seed_all(0)
    total = 100000
    true_probs = numpy.random.dirichlet([1] * dim)

    # Counts drawn from the true probabilities: expected to pass.
    matching = numpy.random.multinomial(total, true_probs)
    assert_greater(
        multinomial_goodness_of_fit(true_probs, matching, total),
        TEST_FAILURE_RATE)

    # Counts drawn from a uniform distribution: expected to be rejected.
    uniform_probs = [1. / dim] * dim
    mismatched = numpy.random.multinomial(total, uniform_probs)
    assert_less(
        multinomial_goodness_of_fit(true_probs, mismatched, total),
        TEST_FAILURE_RATE)
|
76
|
|
|
|
|
77
|
|
|
|
|
78
|
|
|
def test_volume_of_sphere():
    """Compare volume_of_sphere with closed forms in dimensions 1, 2 and 3."""
    for radius in (0.1, 1.0, 10.0):
        # (dimension, closed-form volume) pairs, checked in this order.
        expected_by_dim = (
            (1, 2.0 * radius),
            (2, pi * radius ** 2),
            (3, 4 / 3.0 * pi * radius ** 3),
        )
        for dim, expected in expected_by_dim:
            assert_almost_equal(volume_of_sphere(dim, radius), expected)
|
83
|
|
|
|
|
84
|
|
|
|
|
85
|
|
|
# Fixtures for split_discrete_continuous: each entry gives a 'mixed' input
# together with the 'discrete' and 'continuous' parts expected back.
split_examples = [
    # Scalars.
    {'mixed': False, 'discrete': False, 'continuous': []},
    {'mixed': 0, 'discrete': 0, 'continuous': []},
    {'mixed': 'abc', 'discrete': 'abc', 'continuous': []},
    {'mixed': 0.0, 'discrete': None, 'continuous': [0.0]},
    # Empty containers.
    {'mixed': (), 'discrete': (), 'continuous': []},
    {'mixed': [], 'discrete': (), 'continuous': []},
    # Single-element containers.
    {'mixed': (0,), 'discrete': (0,), 'continuous': []},
    {'mixed': [0], 'discrete': (0,), 'continuous': []},
    {'mixed': (0.0,), 'discrete': (None,), 'continuous': [0.0]},
    {'mixed': [0.0], 'discrete': (None,), 'continuous': [0.0]},
    # Nested structure mixing several value types.
    {
        'mixed': [True, 1, 'xyz', 3.14, [None, (), ([2.71],)]],
        'discrete': (True, 1, 'xyz', None, (None, (), ((None,),))),
        'continuous': [3.14, 2.71],
    },
    # numpy array input.
    {
        'mixed': numpy.zeros(3),
        'discrete': (None, None, None),
        'continuous': [0.0, 0.0, 0.0],
    },
]
|
107
|
|
|
|
|
108
|
|
|
|
|
109
|
|
|
def split_example(i):
    """Run split_discrete_continuous on the i-th entry of split_examples.

    The discrete part is compared with assert_equal and the continuous
    part with assert_almost_equal.
    """
    case = split_examples[i]
    actual_discrete, actual_continuous = split_discrete_continuous(
        case['mixed'])
    expected_discrete = case['discrete']
    expected_continuous = case['continuous']
    assert_equal(actual_discrete, expected_discrete)
    assert_almost_equal(actual_continuous, expected_continuous)
|
114
|
|
|
|
|
115
|
|
|
|
|
116
|
|
|
def test_split_continuous_discrete():
    """Yield one split_example check per entry of split_examples."""
    for index, _ in enumerate(split_examples):
        yield split_example, index
|
119
|
|
|
|
|
120
|
|
|
|
|
121
|
|
|
# seed_all(0) runs before the table is built because several entries below
# draw their parameters from rand(); presumably this keeps those draws
# reproducible. Dict values are evaluated in source order, so the entry
# order also fixes the order of the rand() calls.
seed_all(0)

# Parameter tuples to test per scipy.stats distribution name. Each value is
# a list of argument tuples that are unpacked into the distribution's
# rvs/pdf/pmf calls.
default_params = {
    'bernoulli': [(0.2,)],
    'beta': [(0.5, 0.5), (0.5, 1.5), (0.5, 2.5)],
    'binom': [(40, 0.4)],
    'dirichlet': [
        ([2.0, 2.5],),
        ([2.0, 2.5, 3.0],),
        ([2.0, 2.5, 3.0, 3.5],),
    ],
    'erlang': [(7,)],
    'dlaplace': [(0.8,)],
    'frechet': [tuple(2 * rand(1)) + (0,) + tuple(2 * rand(2))],
    'geom': [(0.1,)],
    'hypergeom': [(40, 14, 24)],
    'logser': [(0.9,)],
    'multivariate_normal': [
        (numpy.ones(1), numpy.eye(1)),
        (numpy.ones(2), numpy.eye(2)),
        (numpy.ones(3), numpy.eye(3)),
    ],
    'nbinom': [(40, 0.4)],
    'ncf': [(27, 27, 0.415784417992)],
    'planck': [(0.51,)],
    'poisson': [(20,)],
    'reciprocal': [tuple(numpy.array([0, 1]) + rand(1)[0])],
    'triang': [tuple(rand(1))],
    'truncnorm': [(0.1, 2.0)],
    'vonmises': [tuple(1.0 + rand(1))],
    'wrapcauchy': [(0.5,)],
    'zipf': [(1.2,)],
}
|
157
|
|
|
|
|
158
|
|
|
# scipy.stats names for which _test_scipy_stats raises SkipTest instead of
# running; the trailing notes record the reason where one is known.
known_failures = {
    'alpha',
    'boltzmann',
    'gausshyper',  # very slow
    'ksone',  # ???
    'levy_stable',  # ???
    'randint',  # too sparse
    'rv_continuous',  # abstract
    'rv_discrete',  # abstract
    'zipf',  # bug?
    'invwishart',  # matrix
    'wishart',  # matrix
    'matrix_normal',  # matrix
}
|
172
|
|
|
|
|
173
|
|
|
|
|
174
|
|
|
def transform_dirichlet(ps):
    """Return ``ps`` without its last coordinate.

    ``ps`` must have at least two entries; shorter input fails the
    assertion.
    """
    assert len(ps) > 1
    # NOTE: an earlier variant, kept here for reference, also shifted the
    # remaining coordinates:
    #   ps[:-1] - ps[-1] * (dim ** 0.5 - 1.0) / (dim - 1.0)
    return ps[:-1]
|
179
|
|
|
|
|
180
|
|
|
|
|
181
|
|
|
# Per-distribution functions that _test_scipy_stats applies to every sample
# before the goodness-of-fit checks; names without an entry are used as-is.
transforms = dict(dirichlet=transform_dirichlet)
|
184
|
|
|
|
|
185
|
|
|
|
|
186
|
|
|
def _test_scipy_stats(name):
    """Run goodness-of-fit checks on samples from ``scipy.stats.<name>``.

    Names listed in known_failures are skipped. Parameters come from
    default_params when present, otherwise from ``1.0 + rand(numargs)``.
    For each parameter tuple, samples are drawn and scored with the
    discrete test (when the distribution has a ``pmf``) or the automatic
    density test, and then with the mixed density test. Each score must
    exceed TEST_FAILURE_RATE.
    """
    if name in known_failures:
        raise SkipTest('known failure')
    dist = getattr(scipy.stats, name)
    if name in default_params:
        params = default_params[name]
    else:
        params = [tuple(1.0 + rand(dist.numargs))]

    for param in params:
        print('param = {}'.format(param))

        # Draw a small probe first to learn the sample dimension, which
        # sets how many samples are drawn for the actual test.
        probe = dist.rvs(*param, size=2)[0]
        sample_count = NUM_BASE_SAMPLES + NUM_SAMPLES_SCALE * get_dim(probe)
        samples = list(dist.rvs(*param, size=sample_count))

        transform = transforms.get(name)
        if transform is None:
            transformed = samples
        else:
            transformed = [transform(sample) for sample in samples]

        # Probabilities are always evaluated at the untransformed samples.
        if hasattr(dist, 'pmf'):
            probs = [dist.pmf(sample, *param) for sample in samples]
            gof = discrete_goodness_of_fit(
                transformed, dict(zip(samples, probs)), plot=True)
        else:
            probs = [dist.pdf(sample, *param) for sample in samples]
            gof = auto_density_goodness_of_fit(transformed, probs, plot=True)
        assert_greater(gof, TEST_FAILURE_RATE)

        mixed_gof = mixed_density_goodness_of_fit(
            transformed, probs, plot=True)
        assert_greater(mixed_gof, TEST_FAILURE_RATE)
|
215
|
|
|
|
|
216
|
|
|
|
|
217
|
|
|
def test_scipy_stats():
    """Yield a _test_scipy_stats check for each scipy.stats attribute
    that has an ``rvs`` attribute."""
    seed_all(0)
    for name in dir(scipy.stats):
        candidate = getattr(scipy.stats, name)
        if not hasattr(candidate, 'rvs'):
            continue
        yield _test_scipy_stats, name
|
222
|
|
|
|