|
1
|
|
|
# Copyright (c) 2014, Salesforce.com, Inc. All rights reserved. |
|
2
|
|
|
# Copyright (c) 2015, Gamelan Labs, Inc. |
|
3
|
|
|
# Copyright (c) 2016, Google, Inc. |
|
4
|
|
|
# |
|
5
|
|
|
# Redistribution and use in source and binary forms, with or without |
|
6
|
|
|
# modification, are permitted provided that the following conditions |
|
7
|
|
|
# are met: |
|
8
|
|
|
# |
|
9
|
|
|
# - Redistributions of source code must retain the above copyright |
|
10
|
|
|
# notice, this list of conditions and the following disclaimer. |
|
11
|
|
|
# - Redistributions in binary form must reproduce the above copyright |
|
12
|
|
|
# notice, this list of conditions and the following disclaimer in the |
|
13
|
|
|
# documentation and/or other materials provided with the distribution. |
|
14
|
|
|
# - Neither the name of Salesforce.com nor the names of its contributors |
|
15
|
|
|
# may be used to endorse or promote products derived from this |
|
16
|
|
|
# software without specific prior written permission. |
|
17
|
|
|
# |
|
18
|
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
|
19
|
|
|
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
|
20
|
|
|
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS |
|
21
|
|
|
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE |
|
22
|
|
|
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
|
23
|
|
|
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, |
|
24
|
|
|
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS |
|
25
|
|
|
# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
|
26
|
|
|
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR |
|
27
|
|
|
# TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE |
|
28
|
|
|
# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|
29
|
|
|
|
|
30
|
|
|
from __future__ import division |
|
31
|
|
|
from collections import defaultdict |
|
32
|
|
|
from math import gamma |
|
33
|
|
|
try: |
|
34
|
|
|
from itertools import izip as zip |
|
35
|
|
|
except ImportError: |
|
36
|
|
|
pass |
|
37
|
|
|
import random |
|
38
|
|
|
import sys |
|
39
|
|
|
|
|
40
|
|
|
import numpy |
|
41
|
|
|
import numpy.random |
|
42
|
|
|
from numpy import pi |
|
43
|
|
|
from scipy.spatial import cKDTree |
|
44
|
|
|
|
|
45
|
|
|
from .utils import chi2sf |
|
46
|
|
|
|
|
47
|
|
|
# Alias for the type of `None`, used below to let `None` act as a
# discrete value in split_discrete_continuous.
NoneType = type(None)

#: Data types for integral random variables.
#:
#: For Python 2.7, this tuple also includes `long`.
INTEGRAL_TYPES = (int, )

#: Data types for continuous random variables.
CONTINUOUS_TYPES = (float, numpy.float32, numpy.float64)

#: Data types for discrete random variables.
#:
#: For Python 2.7, this tuple also includes `long` and `basestring`.
DISCRETE_TYPES = (NoneType, bool, int, str, numpy.int32, numpy.int64)

# Python 2 compatibility: extend the type tuples with the legacy
# `long` and `basestring` types, which do not exist on Python 3.
if sys.version_info < (3, ):
    # `str` is a subclass of `basestring`, so this is doing a little
    # more work than is necessary, but it should not cause a problem.
    DISCRETE_TYPES += (long, basestring)  # noqa
    INTEGRAL_TYPES += (long, )  # noqa
|
67
|
|
|
|
|
68
|
|
|
|
|
69
|
|
|
def seed_all(seed):
    """Seed both the stdlib and numpy pseudo-random generators."""
    for seeder in (random.seed, numpy.random.seed):
        seeder(seed)
|
72
|
|
|
|
|
73
|
|
|
|
|
74
|
|
|
def get_dim(thing):
    """Return the length of a sized object, or 1 for scalars."""
    return len(thing) if hasattr(thing, '__len__') else 1
|
79
|
|
|
|
|
80
|
|
|
|
|
81
|
|
|
def print_histogram(probs, counts):
    """Print a text histogram of (prob, count) rows, largest prob first.

    Bar lengths are scaled so the largest count spans 60 characters.
    """
    bar_scale = 60.0
    tallest = max(counts)
    print('{: >8} {: >8}'.format('Prob', 'Count'))
    rows = sorted(zip(probs, counts), reverse=True)
    for prob, count in rows:
        bar = '-' * int(round(bar_scale * count / tallest))
        print('{: >8.3f} {: >8d} {}'.format(prob, count, bar))
|
88
|
|
|
|
|
89
|
|
|
|
|
90
|
|
|
def multinomial_goodness_of_fit(
        probs,
        counts,
        total_count,
        truncated=False,
        plot=False):
    """
    Pearson's chi^2 test, on possibly truncated data.
    http://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test

    Inputs:
        probs - expected probability of each bin
        counts - observed count in each bin
        total_count - total number of samples drawn
        truncated - whether (probs, counts) cover only a subset of bins,
            in which case counts need not sum to total_count and no
            degree of freedom is subtracted for the sum constraint
        plot - whether to print a histogram of probs vs counts

    Returns:
        p-value of truncated multinomial sample.
    """
    assert len(probs) == len(counts)
    # Unless bins were truncated away, the counts must account for
    # every sample.
    assert truncated or total_count == sum(counts)
    chi_squared = 0
    dof = 0
    if plot:
        print_histogram(probs, counts)
    for p, c in zip(probs, counts):
        if p == 1:
            # Degenerate distribution: the fit is perfect iff all
            # samples landed in this single bin.
            return 1 if c == total_count else 0
        assert p < 1, 'bad probability: %g' % p
        if p > 0:
            # Normal approximation of the binomial count in this bin.
            mean = total_count * p
            variance = total_count * p * (1 - p)
            # The approximation is poor when the variance is tiny.
            assert variance > 1,\
                'WARNING goodness of fit is inaccurate; use more samples'
            chi_squared += (c - mean) ** 2 / variance
            dof += 1
        else:
            print('WARNING zero probability in goodness-of-fit test')
            if c > 0:
                # A sample was observed in a zero-probability bin:
                # infinitely bad fit.
                return float('inf')

    # The sum-to-total_count constraint removes one degree of freedom,
    # but only when no bins were truncated away.
    if not truncated:
        dof -= 1

    # Survival function of the chi^2 distribution = p-value.
    survival = chi2sf(chi_squared, dof)
    return survival
|
130
|
|
|
|
|
131
|
|
|
|
|
132
|
|
|
def unif01_goodness_of_fit(samples, plot=False):
    """
    Bin uniformly distributed samples and apply Pearson's chi^2 test.

    Inputs:
        samples - a list of floats, each in the closed interval [0, 1]
        plot - whether to print a histogram of the binned counts

    Returns:
        p-value of the binned multinomial test.
    """
    samples = numpy.array(samples, dtype=float)
    assert samples.min() >= 0.0
    assert samples.max() <= 1.0
    # Use ~cbrt(n) equiprobable bins so each bin gets many samples.
    bin_count = int(round(len(samples) ** 0.333))
    assert bin_count >= 7, 'WARNING imprecise test, use more samples'
    # `numpy.float` and `numpy.int` were deprecated in NumPy 1.20 and
    # removed in 1.24; the builtin types are exactly what they aliased.
    probs = numpy.ones(bin_count, dtype=float) / bin_count
    counts = numpy.zeros(bin_count, dtype=int)
    for sample in samples:
        # min() clamps sample == 1.0 into the last bin.
        counts[min(bin_count - 1, int(bin_count * sample))] += 1
    return multinomial_goodness_of_fit(probs, counts, len(samples), plot=plot)
|
146
|
|
|
|
|
147
|
|
|
|
|
148
|
|
|
def exp_goodness_of_fit(
        samples,
        plot=False,
        normalized=True,
        return_dict=False):
    """
    Transform exponentially distributed samples to unif01 distribution
    and assess goodness of fit via binned Pearson's chi^2 test.

    Inputs:
        samples - a list of real-valued samples from a candidate distribution
        plot - whether to print a histogram of the binned counts
        normalized - if False, first rescale samples to unit mean and
            record the estimated scale in result['norm']
        return_dict - if True return the result dict, else just the p-value

    Returns:
        p-value, or a dict with key 'gof' (plus 'norm' when normalized
        is False) if return_dict is True.
    """
    # Work on a float array copy: the previous in-place division
    # (`samples /= norm`) raised TypeError for plain lists, which the
    # docstring promises to accept, and silently mutated the caller's
    # array when `samples` was a numpy array.
    samples = numpy.array(samples, dtype=float)
    result = {}
    if not normalized:
        result['norm'] = numpy.mean(samples)
        samples = samples / result['norm']
    # If X ~ Exp(1) then exp(-X) ~ Uniform(0, 1).
    unif01_samples = numpy.exp(-samples)
    result['gof'] = unif01_goodness_of_fit(unif01_samples, plot=plot)
    return result if return_dict else result['gof']
|
167
|
|
|
|
|
168
|
|
|
|
|
169
|
|
|
def density_goodness_of_fit(
        samples,
        probs,
        plot=False,
        normalized=True,
        return_dict=False):
    """
    Transform arbitrary continuous samples to unif01 distribution
    and assess goodness of fit via binned Pearson's chi^2 test.

    Inputs:
        samples - a list of real-valued samples from a distribution
        probs - a list of probability densities evaluated at those samples
    """
    assert len(samples) == len(probs)
    assert len(samples) > 100, 'WARNING imprecision; use more samples'
    ordered = sorted(zip(samples, probs))
    samples = numpy.array([pair[0] for pair in ordered])
    probs = numpy.array([pair[1] for pair in ordered])
    # Geometric mean of the densities at adjacent samples, scaled by n,
    # turns each inter-sample gap into an approximately Exp(1) variate.
    density = len(samples) * numpy.sqrt(probs[1:] * probs[:-1])
    gaps = samples[1:] - samples[:-1]
    exp_samples = density * gaps
    return exp_goodness_of_fit(exp_samples, plot, normalized, return_dict)
|
193
|
|
|
|
|
194
|
|
|
|
|
195
|
|
|
def volume_of_sphere(dim, radius):
    """Return the volume of a `dim`-dimensional ball of the given radius."""
    assert isinstance(dim, INTEGRAL_TYPES)
    half_dim = 0.5 * dim
    return radius ** dim * pi ** half_dim / gamma(half_dim + 1)
|
198
|
|
|
|
|
199
|
|
|
|
|
200
|
|
|
def get_nearest_neighbor_distances(samples):
    """Return, for each sample, the distance to its nearest other sample."""
    if not hasattr(samples[0], '__iter__'):
        # Promote a flat list of scalars to a column of 1-vectors.
        samples = numpy.array([samples]).T
    tree = cKDTree(samples)
    # With k=2 the closest hit is the query point itself at distance 0,
    # so column 1 holds the true nearest-neighbor distance.
    distances = tree.query(samples, k=2)[0]
    return distances[:, 1]
|
205
|
|
|
|
|
206
|
|
|
|
|
207
|
|
|
def vector_density_goodness_of_fit(
        samples,
        probs,
        plot=False,
        normalized=True,
        return_dict=False):
    """
    Transform arbitrary multivariate continuous samples
    to unif01 distribution via nearest neighbor distribution [1,2,3]
    and assess goodness of fit via binned Pearson's chi^2 test.

    [1] http://projecteuclid.org/download/pdf_1/euclid.aop/1176993668
    [2] http://arxiv.org/pdf/1006.3019v2.pdf
    [3] http://en.wikipedia.org/wiki/Nearest_neighbour_distribution

    Inputs:
        samples - a list of real-vector-valued samples from a distribution
        probs - a list of probability densities evaluated at those samples
    """
    sample_count = len(samples)
    assert sample_count
    assert sample_count == len(probs)
    dim = get_dim(samples[0])
    assert dim
    assert sample_count > 1000 * dim, 'WARNING imprecision; use more samples'
    # Estimated probability mass inside each nearest-neighbor ball is
    # approximately Exp(1)-distributed under the candidate density.
    radii = get_nearest_neighbor_distances(samples)
    density = sample_count * numpy.array(probs)
    ball_volumes = volume_of_sphere(dim, radii)
    exp_samples = density * ball_volumes
    return exp_goodness_of_fit(exp_samples, plot, normalized, return_dict)
|
236
|
|
|
|
|
237
|
|
|
|
|
238
|
|
|
def trivial_density_goodness_of_fit(
        samples,
        probs,
        plot=False,
        normalized=True,
        return_dict=False):
    """Goodness of fit for zero-dimensional (constant) samples.

    All samples must be identical, so the fit is trivially perfect
    and the p-value is 1.0.
    """
    assert len(samples)
    assert all(sample == samples[0] for sample in samples)
    result = {'gof': 1.0}
    if not normalized:
        result['norm'] = probs[0]
    return result if return_dict else result['gof']
|
253
|
|
|
|
|
254
|
|
|
|
|
255
|
|
|
def auto_density_goodness_of_fit(
        samples,
        probs,
        plot=False,
        normalized=True,
        return_dict=False):
    """
    Dispatch on sample dimension and delegate to one of:
    - density_goodness_of_fit
    - vector_density_goodness_of_fit
    - trivial_density_goodness_of_fit
    """
    assert len(samples)
    dim = get_dim(samples[0])
    if dim == 0:
        gof_test = trivial_density_goodness_of_fit
    elif dim == 1:
        gof_test = density_goodness_of_fit
        # Unwrap length-1 vectors to plain scalars for the 1d test.
        if hasattr(samples[0], '__len__'):
            samples = [sample[0] for sample in samples]
    else:
        gof_test = vector_density_goodness_of_fit
    return gof_test(samples, probs, plot, normalized, return_dict)
|
278
|
|
|
|
|
279
|
|
|
|
|
280
|
|
|
def discrete_goodness_of_fit(
        samples,
        probs_dict,
        truncate_beyond=8,
        plot=False,
        normalized=True):
    """
    Transform arbitrary discrete data to multinomial
    and assess goodness of fit via Pearson's chi^2 test.
    """
    if not normalized:
        total = sum(probs_dict.values())
        probs_dict = {key: prob / total for key, prob in probs_dict.items()}
    counts = defaultdict(int)
    for sample in samples:
        assert sample in probs_dict
        counts[sample] += 1
    # Pair each expected probability with its observed count,
    # most probable bins first.
    items = sorted(
        ((prob, counts.get(key, 0)) for key, prob in probs_dict.items()),
        reverse=True)
    # Keep only the most probable bins if there are too many.
    truncated = bool(truncate_beyond and truncate_beyond < len(items))
    if truncated:
        items = items[:truncate_beyond]
    probs = [prob for prob, _ in items]
    counts = [count for _, count in items]
    assert sum(counts) > 100, 'WARNING imprecision; use more samples'
    return multinomial_goodness_of_fit(
        probs,
        counts,
        len(samples),
        truncated=truncated,
        plot=plot)
|
311
|
|
|
|
|
312
|
|
|
|
|
313
|
|
|
def split_discrete_continuous(data):
    """
    Convert arbitrary data to a pair `(discrete, continuous)`
    where `discrete` is hashable and `continuous` is a list of floats.
    """
    if isinstance(data, DISCRETE_TYPES):
        return data, []
    if isinstance(data, CONTINUOUS_TYPES):
        return None, [data]
    if isinstance(data, (tuple, list)):
        # Recurse into each element, keeping discrete parts positional
        # and flattening all continuous parts into one list.
        discrete_parts = []
        continuous_parts = []
        for item in data:
            d, c = split_discrete_continuous(item)
            discrete_parts.append(d)
            continuous_parts.extend(c)
        return tuple(discrete_parts), continuous_parts
    if isinstance(data, numpy.ndarray):
        assert data.dtype in [numpy.float64, numpy.float32]
        # Float arrays are purely continuous; placeholders keep the
        # discrete part the same length as the array.
        return (None,) * len(data), [float(x) for x in data]
    raise TypeError(
        'split_discrete_continuous does not accept {} of type {}'.format(
            repr(data), str(type(data))))
|
337
|
|
|
|
|
338
|
|
|
|
|
339
|
|
|
def mixed_density_goodness_of_fit(samples, probs, plot=False, normalized=True):
    """
    Test general mixed discrete+continuous datatypes by
    (1) testing the continuous part conditioned on each discrete value
    (2) testing the discrete part marginalizing over the continuous part
    (3) testing the estimated total probability (if normalized = True)

    Inputs:
        samples - a list of plain-old-data samples from a distribution
        probs - a list of probability densities evaluated at those samples

    Returns:
        the minimum p-value over all component tests.
    """
    assert len(samples)
    discrete_samples = []
    # Map each discrete value to a pair (continuous samples, probs).
    strata = defaultdict(lambda: ([], []))
    for sample, prob in zip(samples, probs):
        d, c = split_discrete_continuous(sample)
        discrete_samples.append(d)
        # NOTE(review): this rebinds the outer `samples`/`probs` names to
        # the per-stratum lists; it is safe only because the zip iterator
        # above was created before the loop, but it is easy to misread.
        samples, probs = strata[d]
        samples.append(c)
        probs.append(prob)

    # Continuous part
    gofs = []
    discrete_probs = {}
    for key, (samples, probs) in strata.items():
        # normalized=False so the per-stratum test also returns the
        # estimated marginal probability of this discrete value.
        result = auto_density_goodness_of_fit(
            samples,
            probs,
            plot=plot,
            normalized=False,
            return_dict=True)
        gofs.append(result['gof'])
        discrete_probs[key] = result['norm']

    # Discrete part
    if len(strata) > 1:
        gofs.append(discrete_goodness_of_fit(
            discrete_samples,
            discrete_probs,
            plot=plot,
            normalized=False))

    # Normalization
    if normalized:
        # The estimated marginals should sum to 1; test the deviation
        # against a chi^2 with one dof per stratum.
        norm = sum(discrete_probs.values())
        discrete_counts = [len(samples) for samples, _ in strata.values()]
        norm_variance = sum(1.0 / count for count in discrete_counts)
        dof = len(discrete_counts)
        chi_squared = (1 - norm) ** 2 / norm_variance
        gofs.append(chi2sf(chi_squared, dof))
        if plot:
            print('norm = {:.4g} +- {:.4g}'.format(norm, norm_variance ** 0.5))
            print(' = {}'.format(
                ' + '.join(map('{:.4g}'.format, discrete_probs.values()))))

    # The overall fit is only as good as its worst component.
    return min(gofs)
|
395
|
|
|
|