# Copyright (c) 2014, Salesforce.com, Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# - Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
# - Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the distribution.
# - Neither the name of Salesforce.com nor the names of its contributors
#   may be used to endorse or promote products derived from this
#   software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
# OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
# TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
# USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

'''
A parameter-free clustering prior based on partition entropy.

The Dirichlet prior is often used in nonparametric mixture models to model
the partitioning of observations into clusters.
This module implements a parameter-free alternative to the Dirichlet prior
that preserves exchangeability, preserves the asymptotic convergence rate of
density estimation, and has an elegant interpretation as a
minimum-description-length prior.

Motivation
----------

In conjugate mixture models, samples from the posterior are sufficiently
represented by a partitioning of observations into clusters, say as an
assignment vector X with cluster labels X_i for each observation i.
In our ~10^6-observation production system, we found the data size of this
assignment vector to be a limiting factor in query latency, even after
lossless compression. To address this problem, we tried to incorporate the
Shannon entropy of this assignment vector directly into the prior, as
an information criterion. Surprisingly, the resulting low-entropy prior
enjoys a number of properties:

- The low-entropy prior enjoys similar asymptotic convergence as the
  Dirichlet prior.

- The probability of a clustering is elegant and easy to evaluate
  (up to an unknown normalizing constant).

- The resulting distribution resembles a CRP distribution with parameter
  alpha = exp(-1), but slightly avoids small clusters.

- MAP estimates are minimum-description-length, as measured by assignment
  vector complexity.

- The low-entropy prior is parameter free, unlike the CRP, Pitman-Yor, or
  Mixture of Finite Mixtures models.

A difficulty is that the prior depends on dataset size, and is hence not a
proper nonparametric generative model.
'''

import os
from collections import defaultdict
import numpy
from numpy import log, exp
import math
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot, font_manager
import parsable
from distributions.lp.special import fast_log
from distributions.io.stream import json_stream_load, json_stream_dump
parsable = parsable.Parsable()

assert exp  # pacify pyflakes


DEFAULT_MAX_SIZE = 47

ROOT = os.path.dirname(os.path.abspath(__file__))
TEMP = os.path.join(ROOT, 'clustering.data')
if not os.path.exists(TEMP):
    os.makedirs(TEMP)

CACHE = {}


def savefig(stem):
    for extension in ['png', 'pdf']:
        name = '{}/{}.{}'.format(TEMP, stem, extension)
        print 'saving', name
        pyplot.tight_layout()
        pyplot.savefig(name)


def get_larger_counts(small_counts):
    large_counts = defaultdict(lambda: 0.0)
    for small_shape, count in small_counts.iteritems():

        # create a new partition
        large_shape = (1,) + small_shape
        large_counts[large_shape] += count

        # add to each existing partition
        for i in xrange(len(small_shape)):
            large_shape = list(small_shape)
            large_shape[i] += 1
            large_shape.sort()
            large_shape = tuple(large_shape)
            large_counts[large_shape] += count

    return dict(large_counts)


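# Illustrative sanity check, not part of the original derivation: the recursion
# above counts set partitions by shape, so the per-size totals are the Bell
# numbers 1, 1, 2, 5, 15, 52, ... For example, size 3 yields
# {(1, 1, 1): 1, (1, 2): 3, (3,): 1}. The helper name below is hypothetical.
def example_check_bell_numbers(max_size=5):
    bell = [1, 1, 2, 5, 15, 52]
    counts = {(): 1.0}
    for size in range(1, max_size + 1):
        counts = get_larger_counts(counts)
        assert sum(counts.itervalues()) == bell[size]
    return counts

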
def get_smaller_probs(small_counts, large_counts, large_probs):
    '''
    Marginalize shape probabilities from sample size n + 1 down to size n,
    using shape counts at both sizes to convert between per-shape and
    per-partition probabilities.
    '''
    assert len(large_counts) == len(large_probs)
    small_probs = {}
    for small_shape, count in small_counts.iteritems():
        prob = 0.0

        # create a new partition
        large_shape = (1,) + small_shape
        prob += large_probs[large_shape] / large_counts[large_shape]

        # add to each existing partition
        for i in xrange(len(small_shape)):
            large_shape = list(small_shape)
            large_shape[i] += 1
            large_shape.sort()
            large_shape = tuple(large_shape)
            prob += large_probs[large_shape] / large_counts[large_shape]

        small_probs[small_shape] = count * prob
    return small_probs


def get_counts(size):
    '''
    Count partition shapes of a given sample size.

    Inputs:
        size = sample_size
    Returns:
        dict : shape -> count
    '''
    assert 0 <= size
    cache_file = '{}/counts.{}.json.bz2'.format(TEMP, size)
    if cache_file not in CACHE:
        if os.path.exists(cache_file):
            flat = json_stream_load(cache_file)
            large = {tuple(key): val for key, val in flat}
        else:
            if size == 0:
                large = {(): 1.0}
            else:
                small = get_counts(size - 1)
                large = get_larger_counts(small)
            print 'caching', cache_file
            json_stream_dump(large.iteritems(), cache_file)
        CACHE[cache_file] = large
    return CACHE[cache_file]


def enum_counts(max_size):
    return [get_counts(size) for size in range(1 + max_size)]


def get_log_z(shape):
    return sum(n * math.log(n) for n in shape)


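# Illustrative note, not part of the original derivation: get_log_z(shape) is
# the unnormalized log probability of any single partition whose cluster sizes
# are shape = (n_1, ..., n_k). With N = sum(shape) and H the Shannon entropy of
# the empirical cluster-size distribution, it equals N * log(N) - N * H, which
# is why this is called a low-entropy prior. The helper name is hypothetical.
def example_entropy_identity(shape=(3, 2, 1)):
    N = float(sum(shape))
    H = -sum(n / N * math.log(n / N) for n in shape)
    assert abs(get_log_z(shape) - (N * math.log(N) - N * H)) < 1e-8
    return H

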
def get_log_Z(counts):
    return numpy.logaddexp.reduce([
        get_log_z(shape) + math.log(count)
        for shape, count in counts.iteritems()
    ])


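# Worked example, illustrative only: for three observations the shape counts
# are {(1, 1, 1): 1, (1, 2): 3, (3,): 1}, so the partition function is
# Z = 1 * 1 + 3 * exp(2 * log(2)) + 1 * exp(3 * log(3)) = 1 + 12 + 27 = 40.
# The helper name is hypothetical.
def example_log_Z_size_three():
    log_Z = get_log_Z(get_counts(3))
    assert abs(log_Z - math.log(40.0)) < 1e-8
    return log_Z

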
def get_probs(size):
    '''
    Compute the distribution over partition shapes of `size` observations
    under the low-entropy prior.
    '''
    counts = get_counts(size).copy()
    for shape, count in counts.iteritems():
        counts[shape] = get_log_z(shape) + math.log(count)
    log_Z = numpy.logaddexp.reduce(counts.values())
    for shape, log_z in counts.iteritems():
        counts[shape] = math.exp(log_z - log_Z)
    return counts


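# Worked example, illustrative only: for two observations the shape
# distribution is {(1, 1): 1/5, (2,): 4/5}, since the single two-point cluster
# gets weight exp(2 * log(2)) = 4 versus 1 for two singletons. The helper name
# is hypothetical.
def example_probs_size_two():
    probs = get_probs(2)
    assert abs(probs[(1, 1)] - 0.2) < 1e-8
    assert abs(probs[(2,)] - 0.8) < 1e-8
    return probs

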
def get_subprobs(size, max_size):
    '''
    Compute probabilities of shapes of partial assignment vectors.

    Inputs:
        size = sample_size
        max_size = dataset_size
    Returns:
        dict : shape -> prob
    '''
    assert 0 <= size
    assert size <= max_size
    cache_file = '{}/subprobs.{}.{}.json.bz2'.format(TEMP, size, max_size)
    if cache_file not in CACHE:
        if os.path.exists(cache_file):
            flat = json_stream_load(cache_file)
            small_probs = {tuple(key): val for key, val in flat}
        else:
            if size == max_size:
                small_probs = get_probs(size)
            else:
                small_counts = get_counts(size)
                large_counts = get_counts(size + 1)
                large_probs = get_subprobs(size + 1, max_size)
                small_probs = get_smaller_probs(
                    small_counts,
                    large_counts,
                    large_probs)
            print 'caching', cache_file
            json_stream_dump(small_probs.iteritems(), cache_file)
        CACHE[cache_file] = small_probs
    return CACHE[cache_file]


def enum_probs(max_size):
    return [get_probs(size) for size in range(max_size + 1)]


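# Illustrative sanity check, not part of the original derivation: marginalizing
# from the full dataset down to a subsample preserves total probability, so
# every get_subprobs(...) table should sum to 1. The helper name is hypothetical.
def example_check_subprobs_normalized(size=3, max_size=6):
    total = sum(get_subprobs(size, max_size).itervalues())
    assert abs(total - 1.0) < 1e-6
    return total

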
@parsable.command
def priors(N=100):
    '''
    Plots different partition priors.
    '''
    X = numpy.array(range(1, N + 1))

    def plot(Y, *args, **kwargs):
        Y = numpy.array(Y)
        Y -= numpy.logaddexp.reduce(Y)
        pyplot.plot(X, Y, *args, **kwargs)

    def crp(alpha):
        assert 0 < alpha
        prob = numpy.zeros(len(X))
        prob[1:] = log(X[1:] - 1)
        prob[0] = log(alpha)
        return prob

    def n_log_n(n):
        return n * log(n)

    def entropy():
        prob = numpy.zeros(len(X))
        prob[1:] = n_log_n(X[1:]) - n_log_n(X[1:] - 1)
        return prob

    def plot_crp(alpha):
        plot(crp(eval(alpha)), label='CRP({})'.format(alpha))

    def plot_entropy():
        plot(entropy(), 'k--', linewidth=2, label='low-entropy')

    pyplot.figure(figsize=(8, 4))

    plot_entropy()
    plot_crp('0.01')
    plot_crp('0.1')
    plot_crp('exp(-1)')
    plot_crp('1.0')
    plot_crp('10.0')

    pyplot.title('Posterior Predictive Curves of Clustering Priors')
    pyplot.xlabel('category size')
    pyplot.ylabel('log(probability)')
    pyplot.xscale('log')
    pyplot.legend(loc='best')

    savefig('priors')


def get_pairwise(counts):
    # every shape in counts partitions the same number of points
    size = sum(iter(counts).next())
    paired = 0.0
    for shape, prob in counts.iteritems():
        paired += prob * sum(n * (n - 1) for n in shape) / (size * (size - 1))
    return paired


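# Worked example, illustrative only: with two observations the low-entropy
# prior weights the one-cluster partition by exp(2 * log(2)) = 4 and the
# two-singleton partition by 1, so two random points share a cluster with
# probability 4 / 5. The helper name is hypothetical.
def example_pairwise_two_points():
    prob = get_pairwise(get_probs(2))
    assert abs(prob - 0.8) < 1e-8
    return prob

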
@parsable.command
def pairwise(max_size=DEFAULT_MAX_SIZE):
    '''
    Plot the probability that two points lie in the same cluster,
    as a function of dataset size.
    '''
    all_probs = enum_probs(max_size)
    sizes = range(2, len(all_probs))
    probs = [get_pairwise(all_probs[i]) for i in sizes]

    pyplot.figure()
    pyplot.plot(sizes, probs, marker='.')
    pyplot.title('\n'.join([
        'Cohabitation probability depends on dataset size',
        '(unlike the CRP or PYP)'
    ]))
    pyplot.xlabel('# objects')
    pyplot.ylabel('P[two random objects in same cluster]')
    pyplot.xscale('log')
    # pyplot.yscale('log')
    pyplot.ylim(0, 1)
    savefig('pairwise')


def get_color_range(size):
    scale = 1.0 / (size - 1.0)
    return [
        (scale * t, 0.5, scale * (size - t - 1))
        for t in range(size)
    ]


def approximate_postpred_correction(subsample_size, dataset_size):
    t = numpy.log(1.0 * dataset_size / subsample_size)
    return t * (0.45 - 0.1 / subsample_size - 0.1 / dataset_size)


def ad_hoc_size_factor(subsample_size, dataset_size):
    return numpy.exp(
        approximate_postpred_correction(subsample_size, dataset_size))


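# Quick property check, illustrative only: when the subsample is the whole
# dataset there is nothing to correct, so the correction is 0 and the ad hoc
# factor is exactly 1. The helper name is hypothetical.
def example_no_correction_for_full_dataset(size=10):
    assert abs(approximate_postpred_correction(size, size)) < 1e-12
    assert abs(ad_hoc_size_factor(size, size) - 1.0) < 1e-12
    return ad_hoc_size_factor(size, size)

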
@parsable.command
def postpred(subsample_size=10):
    '''
    Plot posterior predictive probability and approximations,
    fixing subsample size and varying cluster size and dataset size.
    '''
    size = subsample_size
    max_sizes = [size] + [2, 3, 5, 8, 10, 15, 20, 30, 40, 50]
    max_sizes = sorted(set(s for s in max_sizes if s >= size))
    colors = get_color_range(len(max_sizes))
    pyplot.figure(figsize=(12, 8))
    Y_max = 0

    large_counts = get_counts(size)
    for max_size, color in zip(max_sizes, colors):
        large_probs = get_subprobs(size, max_size)
        small_probs = get_subprobs(size - 1, max_size)

        def plot(X, Y, **kwargs):
            pyplot.scatter(
                X, Y,
                color=color,
                edgecolors='none',
                **kwargs)

        plot([], [], label='max_size = {}'.format(max_size))

        max_small_prob = max(small_probs.itervalues())
        for small_shape, small_prob in small_probs.iteritems():
            X = []
            Y = []

            # create a new partition
            n = 1
            large_shape = (1,) + small_shape
            prob = large_probs[large_shape] / large_counts[large_shape]
            X.append(n)
            Y.append(prob)
            singleton_prob = prob

            # add to each existing partition
            for i in range(len(small_shape)):
                n = small_shape[i] + 1
                large_shape = list(small_shape)
                large_shape[i] += 1
                large_shape.sort()
                large_shape = tuple(large_shape)
                prob = large_probs[large_shape] / large_counts[large_shape]
                X.append(n)
                Y.append(prob)

            X = numpy.array(X)
            Y = numpy.array(Y)
            Y /= singleton_prob
            alpha = small_prob / max_small_prob
            plot(X, Y, alpha=alpha)
            Y_max = max(Y_max, max(Y))

    X = numpy.array(range(1, size + 1))

    # entropy
    entropy = numpy.array([
        x * (x / (x - 1.0)) ** (x - 1.0) if x > 1 else 1
        for x in X
    ])
    Y = entropy / entropy.min()
    pyplot.plot(X, Y, 'k--', label='entropy', linewidth=2)

    # CRP
    alpha = math.exp(-1)
    Y = numpy.array([x - 1 if x > 1 else alpha for x in X])
    Y /= Y.min()
    pyplot.plot(X, Y, 'g-', label='CRP(exp(-1))'.format(alpha))

    # ad hoc
    factors = ad_hoc_size_factor(size, numpy.array(max_sizes))
    for factor in factors:
        Y = entropy.copy()
        Y[0] *= factor
        Y /= Y.min()
        pyplot.plot(X, Y, 'r--')
    pyplot.plot([], [], 'r--', label='ad hoc')

    pyplot.yscale('log')
    pyplot.xscale('log')
    pyplot.title(
        'Adding 1 point to subsample of {} points out of {} total'.format(
            size, max_size))
    pyplot.xlabel('cluster size')
    pyplot.ylabel('posterior predictive probability')
    pyplot.xlim(1, size * 1.01)
    pyplot.ylim(1, Y_max * 1.01)
    pyplot.legend(
        prop=font_manager.FontProperties(size=10),
        loc='upper left')
    savefig('postpred')


def true_postpred_correction(subsample_size, dataset_size):
    '''
    Compute the true postpred correction that
    approximate_postpred_correction(...) estimates.
    '''
    large_counts = get_counts(subsample_size)
    large_probs = get_subprobs(subsample_size, dataset_size)
    small_probs = get_subprobs(subsample_size - 1, dataset_size)

    numer = 0
    denom = 0

    for small_shape, small_prob in small_probs.iteritems():
        probs = []

        # create a new partition
        n = 1
        large_shape = (1,) + small_shape
        prob = large_probs[large_shape] / large_counts[large_shape]
        probs.append((n, prob))

        # add to each existing partition
        for i in range(len(small_shape)):
            n = small_shape[i] + 1
            large_shape = list(small_shape)
            large_shape[i] += 1
            large_shape.sort()
            large_shape = tuple(large_shape)
            prob = large_probs[large_shape] / large_counts[large_shape]
            probs.append((n, prob))

        total = sum(prob for _, prob in probs)
        singleton_prob = probs[0][1]
        for n, prob in probs[1:]:
            weight = small_prob * prob / total
            baseline = -math.log(n * (n / (n - 1.0)) ** (n - 1.0))
            correction = math.log(singleton_prob / prob) - baseline
            numer += weight * correction
            denom += weight

    return numer / denom if denom > 0 else 1.0


@parsable.command
def dataprob(subsample_size=10, dataset_size=50):
    '''
    Plot data prob approximation.

    This tests the accuracy of LowEntropy.score_counts(...).
    '''
    true_probs = get_subprobs(subsample_size, dataset_size)
    naive_probs = get_probs(subsample_size)
    shapes = true_probs.keys()

    # apply ad hoc size factor
    approx_probs = naive_probs.copy()
    factor = ad_hoc_size_factor(subsample_size, dataset_size)
    print 'factor =', factor
    for shape in shapes:
        approx_probs[shape] *= factor ** (len(shape) - 2)

    X = numpy.array([true_probs[shape] for shape in shapes])
    Y0 = numpy.array([naive_probs[shape] for shape in shapes])
    Y1 = numpy.array([approx_probs[shape] for shape in shapes])

    pyplot.figure()
    pyplot.scatter(X, Y0, color='blue', edgecolors='none', label='naive')
    pyplot.scatter(X, Y1, color='red', edgecolors='none', label='approx')
    pyplot.xlabel('true probability')
    pyplot.ylabel('approximation')
    LB = min(X.min(), Y0.min(), Y1.min())
    UB = max(X.max(), Y0.max(), Y1.max())
    pyplot.xlim(LB, UB)
    pyplot.ylim(LB, UB)
    pyplot.plot([LB, UB], [LB, UB], 'k--')
    pyplot.xscale('log')
    pyplot.yscale('log')
    pyplot.title('\n'.join([
        'Approximate data probability',
        'subsample_size = {}, dataset_size = {}'.format(
            subsample_size,
            dataset_size),
    ]))
    pyplot.legend(
        prop=font_manager.FontProperties(size=10),
        loc='lower right')
    savefig('dataprob')


def true_dataprob_correction(subsample_size, dataset_size):
    '''
    Compute true normalization correction.
    '''
    naive_probs = get_probs(subsample_size)
    factor = ad_hoc_size_factor(subsample_size, dataset_size)
    Z = sum(
        prob * factor ** (len(shape) - 1)
        for shape, prob in naive_probs.iteritems()
    )
    return -math.log(Z)


def approximate_dataprob_correction(subsample_size, dataset_size):
    n = math.log(subsample_size)
    N = math.log(dataset_size)
    return 0.061 * n * (n - N) * (n + N) ** 0.75


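# Quick property check, illustrative only: when the subsample is the whole
# dataset the ad hoc size factor is 1, so both the true and approximate
# normalization corrections vanish. The helper name is hypothetical.
def example_no_dataprob_correction_for_full_dataset(size=5):
    assert abs(true_dataprob_correction(size, size)) < 1e-6
    assert abs(approximate_dataprob_correction(size, size)) < 1e-12
    return true_dataprob_correction(size, size)

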
@parsable.command
def normalization(max_size=DEFAULT_MAX_SIZE):
    '''
    Plot approximation to partition function of low-entropy clustering
    distribution for various set sizes.
    '''
    pyplot.figure()

    all_counts = enum_counts(max_size)
    sizes = numpy.array(range(1, 1 + max_size))
    log_Z = numpy.array([
        get_log_Z(all_counts[size]) for size in sizes
    ])
    log_z_max = numpy.array([get_log_z([size]) for size in sizes])

    coeffs = [
        (log_Z[i] / log_z_max[i] - 1.0) * sizes[i] ** 0.75
        for i in [1, -1]
    ]
    coeffs += [0.27, 0.275, 0.28]
    for coeff in coeffs:
        print coeff
        approx = log_z_max * (1 + coeff * sizes ** -0.75)
        X = sizes ** -0.75
        Y = (log_Z - approx) / log_Z
        pyplot.plot(X, Y, marker='.', label='coeff = {}'.format(coeff))
    pyplot.xlim(0, 1)
    pyplot.xlabel('size ** -0.75')
    pyplot.ylabel('approx error')
    pyplot.title(
        'log(Z) ~ log(z_max) * (1 + coeff * size ** -0.75)')
    pyplot.legend(loc='best')
    savefig('normalization')


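# Illustrative sketch, hypothetical helper: evaluate the relative error of the
# closed-form approximation log(Z) ~ n * log(n) * (1 + coeff * n ** -0.75)
# used in normalization() and code(), for a single sample size and coefficient.
def example_normalization_error(n=20, coeff=0.27):
    exact = get_log_Z(get_counts(n))
    approx = n * math.log(n) * (1.0 + coeff * n ** -0.75)
    return (exact - approx) / exact

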
@parsable.command
def approximations(max_size=DEFAULT_MAX_SIZE):
    '''
    Plot both main approximations for many (subsample, dataset) sizes:
    (1) normalization constant, and
    (2) postpred size factor
    '''
    sizes = [1, 2, 3, 4, 5, 7, 10, 15, 20, 30, 40, 50, 60]
    sizes = [size for size in sizes if size <= max_size]
    keys = [(x, y) for x in sizes for y in sizes if x <= y]

    truth1 = {}
    truth2 = {}
    approx1 = {}
    approx2 = {}
    for key in keys:
        size, max_size = key

        # postpred correction
        if size > 1:
            truth1[key] = true_postpred_correction(size, max_size)
            approx1[key] = approximate_postpred_correction(size, max_size)

        # normalization correction
        truth2[key] = true_dataprob_correction(size, max_size)
        approx2[key] = approximate_dataprob_correction(size, max_size)

    fig, (ax1, ax2) = pyplot.subplots(2, 1, sharex=True, figsize=(12, 8))
    ax1.set_title('Approximation accuracies of postpred and dataprob')
    ax2.set_ylabel('log(Z correction)')
    ax1.set_ylabel('log(singleton postpred correction)')
    ax2.set_xlabel('subsample size')
    ax1.set_xlim(min(sizes) * 0.95, max(sizes) * 1.05)
    ax1.set_xscale('log')
    ax2.set_xscale('log')

    def plot(ax, X, y, values, *args, **kwargs):
        Y = [values[x, y] for x in X if (x, y) in values]
        X = [x for x in X if (x, y) in values]
        ax.plot(X, Y, *args, alpha=0.5, marker='.', **kwargs)

    for max_size in sizes:
        X = [n for n in sizes if n <= max_size]
        plot(ax1, X, max_size, truth1, 'k-')
        plot(ax1, X, max_size, approx1, 'r-')
        plot(ax2, X, max_size, truth2, 'k-')
        plot(ax2, X, max_size, approx2, 'r-')

    plot(ax1, [], None, {}, 'r-', label='approximation')
    plot(ax1, [], None, {}, 'k-', label='truth')
    ax1.legend(loc='upper right')

    savefig('approximations')


@parsable.command
def fastlog():
    '''
    Plot accuracy of fastlog term in cluster_add_score.
    '''
    X = numpy.array([2.0 ** i for i in range(20 + 1)])
    Y0 = numpy.array([x * math.log(1. + 1. / x) for x in X])
    Y1 = numpy.array([x * fast_log(1. + 1. / x) for x in X])
    Y2 = numpy.array([1.0 for x in X])

    fig, (ax1, ax2) = pyplot.subplots(2, 1, sharex=True)

    ax1.plot(X, Y0, 'ko', label='math.log')
    ax1.plot(X, Y1, 'r-', label='lp.special.fast_log')
    ax1.plot(X, Y2, 'b-', label='asymptote')
    ax2.plot(X, numpy.abs(Y1 - Y0), 'r-', label='lp.special.fast_log')
    ax2.plot(X, numpy.abs(Y2 - Y0), 'b-', label='asymptote')

    ax1.set_title('lp.special.fast_log approximation')
    ax1.set_ylabel('n log(1 + 1 / n)')
    ax2.set_ylabel('approximation error')
    ax2.set_xlabel('n')
    ax1.set_xscale('log')
    ax2.set_xscale('log')
    ax2.set_yscale('log')
    ax1.legend(loc='best')
    ax2.legend(loc='best')
    savefig('fastlog')


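# Illustrative note, hypothetical helper: the quantity plotted above tends to 1,
# since n * log(1 + 1/n) = 1 - 1/(2n) + O(1/n**2); that limit is the asymptote
# drawn in blue.
def example_fastlog_asymptote(n=1e6):
    return n * math.log(1.0 + 1.0 / n)

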
def number_table(numbers, width=5):
    lines = []
    line = ''
    for i, number in enumerate(numbers):
        if i % width == 0:
            if line:
                lines.append(line + ',')
            line = '    %0.8f' % number
        else:
            line += ', %0.8f' % number
    if line:
        lines.append(line)
    return '\n'.join(lines)


@parsable.command
def code(max_size=DEFAULT_MAX_SIZE):
    '''
    Generate C++ code for clustering partition function.
    '''
    all_counts = enum_counts(max_size)
    sizes = range(1 + max_size)
    log_Z = [
        get_log_Z(all_counts[size]) if size else 0
        for size in sizes
    ]
    size = sizes[-1]
    coeff = (log_Z[-1] / get_log_z([size]) - 1.0) * size ** 0.75

    print '# Insert this in src/clustering.cc:'
    lines = [
        '// this code was generated by derivations/clustering.py',
        'static const float log_partition_function_table[%d] =' %
        (max_size + 1),
        '{',
        number_table(log_Z),
        '};',
        '',
        '// this code was generated by derivations/clustering.py',
        'template<class count_t>',
        'float Clustering<count_t>::LowEntropy::log_partition_function (',
        '        count_t sample_size) const',
        '{',
        '    // TODO incorporate dataset_size for higher accuracy',
        '    count_t n = sample_size;',
        '    if (n < %d) {' % (max_size + 1),
        '        return log_partition_function_table[n];',
        '    } else {',
        '        float coeff = %0.8ff;' % coeff,
        '        float log_z_max = n * fast_log(n);',
        '        return log_z_max * (1.f + coeff * powf(n, -0.75f));',
        '    }',
        '}',
    ]
    print '\n'.join(lines)
    print

    print '# Insert this in distributions/dbg/clustering.py:'
    lines = [
        '# this code was generated by derivations/clustering.py',
        'log_partition_function_table = [',
        number_table(log_Z),
        ']',
        '',
        '',
        '# this code was generated by derivations/clustering.py',
        'def log_partition_function(sample_size):',
        '    # TODO incorporate dataset_size for higher accuracy',
        '    n = sample_size',
        '    if n < %d:' % (max_size + 1),
        '        return LowEntropy.log_partition_function_table[n]',
        '    else:',
        '        coeff = %0.8f' % coeff,
        '        log_z_max = n * log(n)',
        '        return log_z_max * (1.0 + coeff * n ** -0.75)',
    ]
    print '\n'.join(lines)
    print


@parsable.command
def plots():
    '''
    Generate all plots.
    '''
    priors()
    pairwise()
    postpred()
    dataprob()
    approximations()
    normalization()
    fastlog()


if __name__ == '__main__':
    parsable.dispatch()