# -*- coding: utf-8 -*-
# vim:fileencoding=utf-8
#
# Copyright (c) 2017-2018 Stefan Bender
#
# This module is part of sciapy.
# sciapy is free software: you can redistribute it or modify
# it under the terms of the GNU General Public License as published
# by the Free Software Foundation, version 2.
# See accompanying LICENSE file or http://www.gnu.org/licenses/gpl-2.0.html.
"""SCIAMACHY data regression MCMC sampler

Markov Chain Monte Carlo (MCMC) routines to sample
the posterior probability of SCIAMACHY data regression fits.
Uses the :class:`emcee.EnsembleSampler` [#]_ to do the real work.

.. [#] https://emcee.readthedocs.io
"""

import logging
from multiprocessing import Pool

import numpy as np
from scipy.optimize._differentialevolution import DifferentialEvolutionSolver
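# NOTE: ``_differentialevolution`` is a private scipy module; it is imported
# only to reuse its latin hypercube population initialization (see below),
# and its location may change between scipy versions.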
from scipy.special import logsumexp

import celerite
import george
import emcee
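# NOTE: the sampling code below uses the emcee 2.x-era interface
# (``rstate0=`` keyword and the ``flatlnprobability`` attribute);
# newer emcee versions may name these differently.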

try:
    from tqdm import tqdm
    have_tqdm = True
except ImportError:
    have_tqdm = False


__all__ = ["mcmc_sample_model"]


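# The (tempered) log-posterior used as the sampling target:
#     log p_beta(theta | y) = beta * (log L(y | theta) + log prior(theta)).
# The raw log-likelihood is returned alongside it as an emcee "blob" so that
# it can be recovered from the chain later on.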
def _lpost(p, model, y=None, beta=1.):
    model.set_parameter_vector(p)
    lprior = model.log_prior()
    if not np.isfinite(lprior):
        return (-np.inf, [np.nan])
    log_likelihood = model.log_likelihood(y, quiet=True)
    return (beta * (log_likelihood + lprior), [log_likelihood])


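# Helper that iterates ``sampler.sample()`` with optional progress reporting
# (via tqdm if available) and returns the last (position, log-probability,
# random state, blobs) tuple of the run.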
def _sample_mcmc(sampler, nsamples, p0, rst0,
        show_progress, progress_mod, debug=False):
    smpl = sampler.sample(p0, rstate0=rst0, iterations=nsamples)
    if have_tqdm and show_progress:
        smpl = tqdm(smpl, total=nsamples, disable=None)
    for i, result in enumerate(smpl):
        if show_progress and (i + 1) % progress_mod == 0:
            if not have_tqdm and not debug:
                logging.info("%5.1f%%", 100 * (float(i + 1) / nsamples))
            if debug:
                _pos, _logp, _, _ = result
                logging.debug("%5.1f%% lnpmax: %s, p(lnpmax): %s",
                        100 * (float(i + 1) / nsamples),
                        np.max(_logp), _pos[np.argmax(_logp)])
    return result


def mcmc_sample_model(model, y, beta=1.,
        nwalkers=100, nburnin=200, nprod=800,
        nthreads=1, optimized=False,
        bounds=None,
        return_logpost=False,
        show_progress=False, progress_mod=10):
    """Markov Chain Monte Carlo sampling interface

    MCMC sampling interface to sample posterior probabilities using the
    :class:`emcee.EnsembleSampler` [#]_.

    .. [#] https://emcee.readthedocs.io

    Arguments
    ---------
    model : celerite.GP, george.GP, or sciapy.regress.RegressionModel instance
        The model to draw posterior samples from. It should provide either
        `log_likelihood()` and `log_prior()` functions or be directly callable
        via `__call__()`.
    y : (N,) array_like
        The data to condition the probabilities on.
    beta : float, optional
        Tempering factor for the probability, default: 1.
    nwalkers : int, optional
        The number of MCMC walkers (default: 100). If this number is smaller
        than 4 times the number of parameters, it is multiplied by the number
        of parameters. Otherwise it specifies the number of walkers directly.
    nburnin : int, optional
        The number of burn-in samples to draw, default: 200.
    nprod : int, optional
        The number of production samples to draw, default: 800.
    nthreads : int, optional
        The number of threads to use with a `multiprocessing.Pool`,
        used as `pool` for `emcee.EnsembleSampler`. Default: 1.
    optimized : bool, optional
        Indicate whether the (starting) position was determined with an
        optimization algorithm beforehand. If `False` (the default), a
        pre-burn-in run optimizes the starting position. Sampling continues
        from there with the normal burn-in and production runs.
        In that case, latin hypercube sampling is used to distribute the walker
        starting positions equally in parameter space.
    bounds : iterable, optional
        The parameter bounds as a list of (min, max) entries.
        Default: None
    return_logpost : bool, optional
        Indicate whether or not to return the sampled log probabilities as well.
        Default: False
    show_progress : bool, optional
        Print the percentage of samples every `progress_mod` samples.
        Default: False
    progress_mod : int, optional
        Interval in samples to print the percentage of samples.
        Default: 10

    Returns
    -------
    samples or (samples, logpost) : array_like or tuple
        (nwalkers * nprod, ndim) array of the sampled parameters from the
        production run if return_logpost is `False`.
        A tuple of an (nwalkers * nprod, ndim) array (the same as above)
        and an (nwalkers * nprod,) array as second entry containing the
        log posterior probabilities if return_logpost is `True`.
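
    Example
    -------
    A minimal, hypothetical usage sketch; ``gp`` is assumed to be an already
    set-up :class:`celerite.GP` (or :class:`george.GP`) prepared for the data
    ``y``, e.g. via ``gp.compute(...)``::

        samples, lnp = mcmc_sample_model(gp, y,
                nwalkers=40, nburnin=200, nprod=800,
                return_logpost=True)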
    """
    v = model.get_parameter_vector()
    ndim = len(v)
    if nwalkers < 4 * ndim:
        nwalkers *= ndim
    logging.info("MCMC parameters: %s walkers, %s burn-in samples, "
            "%s production samples using %s threads.",
            nwalkers, nburnin, nprod, nthreads)

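    # celerite and george GP models are wrapped in the tempered
    # log-posterior `_lpost`; any other model is assumed to be directly
    # callable as the log-probability, with `beta` passed as its argument.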
    if isinstance(model, celerite.GP) or isinstance(model, george.GP):
        mod_func = _lpost
        mod_args = (model, y, beta)
    else:
        mod_func = model
        mod_args = (beta,)

    # Initialize the walkers.
    if not optimized:
        # scipy.optimize's DifferentialEvolutionSolver uses
        # latin hypercube sampling as starting positions.
        # We just use their initialization to avoid duplicating code.
        if bounds is None:
            bounds = model.get_parameter_bounds()
        de_solver = DifferentialEvolutionSolver(_lpost,
                bounds=bounds,
                popsize=nwalkers // ndim)
        # The initial population should reflect latin hypercube sampling
        p0 = de_solver.population
        # fill up to full size in case the number of walkers is not a
        # multiple of the number of parameters
        missing = nwalkers - p0.shape[0]
        p0 = np.vstack([p0] +
                [v + 1e-2 * np.random.randn(ndim) for _ in range(missing)])
    else:
        p0 = np.array([v + 1e-2 * np.random.randn(ndim) for _ in range(nwalkers)])

    # set up the sampling pool
    if nthreads > 1:
        pool = Pool(processes=nthreads)
    else:
        pool = None
    sampler = emcee.EnsembleSampler(nwalkers, ndim, mod_func, args=mod_args,
            pool=pool)

    rst0 = np.random.get_state()

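    # Pre-burn-in "fit" run: explore from the latin hypercube starting
    # positions and then re-initialize all walkers in a small ball around
    # the best (maximum log-posterior) position found so far.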
    if not optimized:
        logging.info("Running MCMC fit (%s samples)", nburnin)
        p0, lnp0, rst0, _ = _sample_mcmc(sampler, nburnin, p0, rst0,
                show_progress, progress_mod, debug=True)
        logging.info("MCMC fit finished.")

        p = p0[np.argmax(lnp0)]
        logging.info("Fit max logpost: %s, params: %s, exp(params): %s",
                np.max(lnp0), p, np.exp(p))
        model.set_parameter_vector(p)
        logging.debug("params: %s", model.get_parameter_dict())
        logging.debug("log_likelihood: %s", model.log_likelihood(y))
        p0 = [p + 1e-4 * np.random.randn(ndim) for _ in range(nwalkers)]
        sampler.reset()

    logging.info("Running burn-in (%s samples)", nburnin)
    p0, lnp0, rst0, _ = _sample_mcmc(sampler, nburnin, p0, rst0,
            show_progress, progress_mod)
    logging.info("Burn-in finished.")

    p = p0[np.argmax(lnp0)]
    logging.info("burn-in max logpost: %s, params: %s, exp(params): %s",
            np.max(lnp0), p, np.exp(p))
    model.set_parameter_vector(p)
    logging.debug("params: %s", model.get_parameter_dict())
    logging.debug("log_likelihood: %s", model.log_likelihood(y))
    sampler.reset()

    logging.info("Running production chain (%s samples)", nprod)
    _sample_mcmc(sampler, nprod, p0, rst0, show_progress, progress_mod)
    logging.info("Production run finished.")

    samples = sampler.flatchain
    lnp = sampler.flatlnprobability
    # the first column of the blobs holds the log likelihoods
    lnlh = np.array(sampler.blobs)[..., 0].ravel().astype(float)
    post_expect_loglh = np.nanmean(lnlh)
    logging.info("total samples: %s", samples.shape)

    samplmean = np.mean(samples, axis=0)
    logging.info("mean: %s, exp(mean): %s, sqrt(exp(mean)): %s",
            samplmean, np.exp(samplmean), np.sqrt(np.exp(samplmean)))

    samplmedian = np.median(samples, axis=0)
    logging.info("median: %s, exp(median): %s, sqrt(exp(median)): %s",
            samplmedian, np.exp(samplmedian), np.sqrt(np.exp(samplmedian)))

    logging.info("max logpost: %s, params: %s, exp(params): %s",
            np.max(lnp), samples[np.argmax(lnp)],
            np.exp(samples[np.argmax(lnp)]))

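    # Information criteria from the maximum-posterior sample, with k = ndim
    # parameters and N = len(y) data points:
    #     AIC = 2 k - 2 max(lnp),    BIC = k ln(N) - 2 max(lnp),
    # here using the maximum log-posterior in place of the maximum
    # log-likelihood ("BIC lh" further below uses the likelihood instead).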
    logging.info("AIC: %s", 2 * ndim - 2 * np.max(lnp))
    logging.info("BIC: %s", np.log(len(y)) * ndim - 2 * np.max(lnp))
    logging.info("poor man's evidence 1 sum: %s, mean: %s",
            np.sum(np.exp(lnp)), np.mean(np.exp(lnp)))
    logging.info("poor man's evidence 2 max: %s, std: %s",
            np.max(np.exp(lnp)), np.std(np.exp(lnp)))
    logging.info("poor man's evidence 3: %s",
            np.max(np.exp(lnp)) / np.std(np.exp(lnp)))
    logging.info("poor man's evidence 4 sum: %s",
            logsumexp(lnp, b=1. / lnp.shape[0], axis=0))

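    # DIC-style diagnostics, evaluated below at the posterior mode, mean, and
    # median: with a deviance D = -2 log p, the effective number of parameters
    # is p_D = D_bar - D(theta_hat) and DIC = D_bar + p_D = 2 D_bar - D(theta_hat).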
    # mode
    model.set_parameter_vector(samples[np.argmax(lnp)])
    log_lh = model.log_likelihood(y)
    # Use the likelihood instead of the posterior
    # https://doi.org/10.3847/1538-3881/aa9332
    logging.info("BIC lh: %s", np.log(len(y)) * ndim - 2 * log_lh)
    # DIC
    sample_deviance = -2 * np.max(lnp)
    deviance_at_sample = -2 * (model.log_prior() + log_lh)
    pd = sample_deviance - deviance_at_sample
    dic = 2 * sample_deviance - deviance_at_sample
    logging.info("max logpost log_lh: %s, AIC: %s, DIC: %s, pd: %s",
            model.log_likelihood(y), 2 * ndim - 2 * log_lh, dic, pd)
    # mean
    model.set_parameter_vector(samplmean)
    log_lh = model.log_likelihood(y)
    log_lh_mean = log_lh
    # DIC
    sample_deviance = -2 * np.nanmean(lnp)
    deviance_at_sample = -2 * (model.log_prior() + log_lh)
    pd = sample_deviance - deviance_at_sample
    dic = 2 * sample_deviance - deviance_at_sample
    logging.info("mean log_lh: %s, AIC: %s, DIC: %s, pd: %s",
            model.log_likelihood(y), 2 * ndim - 2 * log_lh, dic, pd)
    # median
    model.set_parameter_vector(samplmedian)
    log_lh = model.log_likelihood(y)
    # DIC
    sample_deviance = -2 * np.nanmedian(lnp)
    deviance_at_sample = -2 * (model.log_prior() + log_lh)
    dic = 2 * sample_deviance - deviance_at_sample
    pd = sample_deviance - deviance_at_sample
    logging.info("median log_lh: %s, AIC: %s, DIC: %s, pd: %s",
            model.log_likelihood(y), 2 * ndim - 2 * log_lh, dic, pd)
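    # The posterior expectation of the log-likelihood, E[log L], is estimated
    # from the blobs collected above (post_expect_loglh); the criteria below
    # use p_D = 2 * (log L(theta_mean) - E[log L]).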
    # (4)--(6) in Ando2011 doi:10.1080/01966324.2011.10737798
    pd_ando = 2 * (log_lh_mean - post_expect_loglh)
    ic5 = - 2 * post_expect_loglh + 2 * pd_ando
    ic6 = - 2 * post_expect_loglh + 2 * ndim
    logging.info("Ando2011: pd: %s, IC(5): %s, IC(6): %s",
            pd_ando, ic5, ic6)

    if return_logpost:
        return samples, lnp
    return samples