sciapy.regress.mcmc._sample_mcmc()   (rating: C)

Complexity
	Conditions	9

Size
	Total Lines	15
	Code Lines	15

Duplication
	Lines	0
	Ratio	0 %

Code Coverage
	Tests	6
	CRAP Score	19.125

Importance
	Changes	0
Metric                              Value
cc    (cyclomatic complexity)       9
eloc  (executable lines of code)    15
nop   (number of parameters)        7
dl    (duplicated lines)            0
loc   (lines of code)               15
ccs                                 6
cts                                 12
cp    (coverage)                    0.5
crap  (CRAP score)                  19.125
rs                                  6.6666
c                                   0
b                                   0
f                                   0
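
The reported CRAP score is consistent with the usual CRAP formula, crap = cc² · (1 − cp)³ + cc, applied to the complexity and coverage values above. A quick check, assuming cp is the coverage fraction:

# Quick check of the CRAP score from the metrics above (assumes the standard
# CRAP formula: complexity**2 * (1 - coverage)**3 + complexity).
cc = 9       # cyclomatic complexity ("Conditions")
cp = 0.5     # coverage fraction as reported
crap = cc ** 2 * (1 - cp) ** 3 + cc
print(crap)  # 81 * 0.125 + 9 = 19.125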

# -*- coding: utf-8 -*-
# vim:fileencoding=utf-8
#
# Copyright (c) 2017-2018 Stefan Bender
#
# This module is part of sciapy.
# sciapy is free software: you can redistribute it or modify
# it under the terms of the GNU General Public License as published
# by the Free Software Foundation, version 2.
# See accompanying LICENSE file or http://www.gnu.org/licenses/gpl-2.0.html.
"""SCIAMACHY data regression MCMC sampler

Markov Chain Monte Carlo (MCMC) routines to sample
the posterior probability of SCIAMACHY data regression fits.
Uses the :class:`emcee.EnsembleSampler` [#]_ to do the real work.

.. [#] https://emcee.readthedocs.io
"""
import logging
from multiprocessing import Pool

import numpy as np
from scipy.optimize._differentialevolution import DifferentialEvolutionSolver
from scipy.special import logsumexp

import celerite
import george
import emcee

try:
	from tqdm import tqdm
	have_tqdm = True
except ImportError:
	have_tqdm = False


__all__ = ["mcmc_sample_model"]


def _lpost(p, model, y=None, beta=1.):
	# Tempered log-posterior; the log-likelihood is returned as an emcee "blob".
	model.set_parameter_vector(p)
	lprior = model.log_prior()
	if not np.isfinite(lprior):
		return (-np.inf, [np.nan])
	log_likelihood = model.log_likelihood(y, quiet=True)
	return (beta * (log_likelihood + lprior), [log_likelihood])


def _sample_mcmc(sampler, nsamples, p0, rst0,
		show_progress, progress_mod, debug=False):
	# Iterate the sampler, reporting progress via tqdm if available,
	# otherwise via the logging module every `progress_mod` samples.
	smpl = sampler.sample(p0, rstate0=rst0, iterations=nsamples)
	if have_tqdm and show_progress:
		smpl = tqdm(smpl, total=nsamples, disable=None)
	for i, result in enumerate(smpl):
		if show_progress and (i + 1) % progress_mod == 0:
			if not have_tqdm and not debug:
				logging.info("%5.1f%%", 100 * (float(i + 1) / nsamples))
			if debug:
				_pos, _logp, _, _ = result
				logging.debug("%5.1f%% lnpmax: %s, p(lnpmax): %s",
					100 * (float(i + 1) / nsamples),
					np.max(_logp), _pos[np.argmax(_logp)])
	return result
Issue flagged by the analysis: the variable `result` does not seem to be defined in case the for loop over the drawn samples is never entered (e.g. for `nsamples` < 1). Are you sure this can never be the case?
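
Not a fix from the sciapy sources, but a minimal sketch of how such a loop can be guarded; `_sample_guarded` is a hypothetical stand-in that only demonstrates the pattern of defining the loop variable up front and failing loudly when the iterable is empty:

def _sample_guarded(samples):
	# Hypothetical helper, not part of sciapy: define `result` before the
	# loop so it exists even when `samples` yields nothing.
	result = None
	for i, result in enumerate(samples):
		pass  # per-iteration progress reporting would go here
	if result is None:
		raise ValueError("need at least one sample (nsamples >= 1)")
	return result

print(_sample_guarded(iter([1, 2, 3])))  # -> 3
# _sample_guarded(iter([]))              # raises ValueError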


def mcmc_sample_model(model, y, beta=1.,
		nwalkers=100, nburnin=200, nprod=800,
		nthreads=1, optimized=False,
		bounds=None,
		return_logpost=False,
		show_progress=False, progress_mod=10):
	"""Markov Chain Monte Carlo sampling interface

	MCMC sampling interface to sample posterior probabilities using the
	:class:`emcee.EnsembleSampler` [#]_.

	.. [#] https://emcee.readthedocs.io

	Arguments
	---------
	model : celerite.GP, george.GP, or sciapy.regress.RegressionModel instance
		The model to draw posterior samples from. It should provide either
		`log_likelihood()` and `log_prior()` functions or be directly callable
		via `__call__()`.
	y : (N,) array_like
		The data to condition the probabilities on.
	beta : float, optional
		Tempering factor for the probability, default: 1.
	nwalkers : int, optional
		The number of MCMC walkers (default: 100). If this number is smaller
		than 4 times the number of parameters, it is multiplied by the number
		of parameters. Otherwise it specifies the number of walkers directly.
	nburnin : int, optional
		The number of burn-in samples to draw, default: 200.
	nprod : int, optional
		The number of production samples to draw, default: 800.
	nthreads : int, optional
		The number of threads to use with a `multiprocessing.Pool`,
		used as `pool` for `emcee.EnsembleSampler`. Default: 1.
	optimized : bool, optional
		Indicate whether the actual (starting) position was determined with an
		optimization algorithm beforehand. If `False` (the default), a
		pre-burn-in run optimizes the starting position. Sampling continues
		from there with the normal burn-in and production runs.
		In that case, latin hypercube sampling is used to distribute the walker
		starting positions equally in parameter space.
	bounds : iterable, optional
		The parameter bounds as a list of (min, max) entries.
		Default: None
	return_logpost : bool, optional
		Indicate whether or not to return the sampled log probabilities as well.
		Default: False
	show_progress : bool, optional
		Print the percentage of samples every `progress_mod` samples.
		Default: False
	progress_mod : int, optional
		Interval in samples to print the percentage of samples.
		Default: 10

	Returns
	-------
	samples or (samples, logpost) : array_like or tuple
		(nwalkers * nprod, ndim) array of the sampled parameters from the
		production run if return_logpost is `False`.
		A tuple of an (nwalkers * nprod, ndim) array (the same as above)
		and an (nwalkers * nprod,) array with the second entry containing the
		log posterior probabilities if return_logpost is `True`.
	"""
	v = model.get_parameter_vector()
	ndim = len(v)
	if nwalkers < 4 * ndim:
		nwalkers *= ndim
	logging.info("MCMC parameters: %s walkers, %s burn-in samples, "
				"%s production samples using %s threads.",
				nwalkers, nburnin, nprod, nthreads)

	if isinstance(model, celerite.GP) or isinstance(model, george.GP):
		mod_func = _lpost
		mod_args = (model, y, beta)
	else:
		mod_func = model
		mod_args = (beta,)

	# Initialize the walkers.
	if not optimized:
		# scipy.optimize's DifferentialEvolutionSolver uses
		# latin hypercube sampling as starting positions.
		# We just use their initialization to avoid duplicating code.
		if bounds is None:
			bounds = model.get_parameter_bounds()
		de_solver = DifferentialEvolutionSolver(_lpost,
					bounds=bounds,
					popsize=nwalkers // ndim)
		# The initial population should reflect latin hypercube sampling
		p0 = de_solver.population
		# fill up to full size in case the number of walkers is not a
		# multiple of the number of parameters
		missing = nwalkers - p0.shape[0]
		p0 = np.vstack([p0] +
			[v + 1e-2 * np.random.randn(ndim) for _ in range(missing)])
	else:
		p0 = np.array([v + 1e-2 * np.random.randn(ndim) for _ in range(nwalkers)])

	# set up the sampling pool
	if nthreads > 1:
		pool = Pool(processes=nthreads)
	else:
		pool = None
	sampler = emcee.EnsembleSampler(nwalkers, ndim, mod_func, args=mod_args,
			pool=pool)

	rst0 = np.random.get_state()

	if not optimized:
		logging.info("Running MCMC fit (%s samples)", nburnin)
		p0, lnp0, rst0, _ = _sample_mcmc(sampler, nburnin, p0, rst0,
				show_progress, progress_mod, debug=True)
		logging.info("MCMC fit finished.")

		p = p0[np.argmax(lnp0)]
		logging.info("Fit max logpost: %s, params: %s, exp(params): %s",
					np.max(lnp0), p, np.exp(p))
		model.set_parameter_vector(p)
		logging.debug("params: %s", model.get_parameter_dict())
		logging.debug("log_likelihood: %s", model.log_likelihood(y))
		p0 = [p + 1e-4 * np.random.randn(ndim) for _ in range(nwalkers)]
		sampler.reset()

	logging.info("Running burn-in (%s samples)", nburnin)
	p0, lnp0, rst0, _ = _sample_mcmc(sampler, nburnin, p0, rst0,
			show_progress, progress_mod)
	logging.info("Burn-in finished.")

	p = p0[np.argmax(lnp0)]
	logging.info("burn-in max logpost: %s, params: %s, exp(params): %s",
				np.max(lnp0), p, np.exp(p))
	model.set_parameter_vector(p)
	logging.debug("params: %s", model.get_parameter_dict())
	logging.debug("log_likelihood: %s", model.log_likelihood(y))
	sampler.reset()

	logging.info("Running production chain (%s samples)", nprod)
	_sample_mcmc(sampler, nprod, p0, rst0, show_progress, progress_mod)
	logging.info("Production run finished.")

	samples = sampler.flatchain
	lnp = sampler.flatlnprobability
	# first column in the blobs are the log likelihoods
	lnlh = np.array(sampler.blobs)[..., 0].ravel().astype(float)
	post_expect_loglh = np.nanmean(np.array(lnlh))
	logging.info("total samples: %s", samples.shape)

	samplmean = np.mean(samples, axis=0)
	logging.info("mean: %s, exp(mean): %s, sqrt(exp(mean)): %s",
			samplmean, np.exp(samplmean), np.sqrt(np.exp(samplmean)))

	samplmedian = np.median(samples, axis=0)
	logging.info("median: %s, exp(median): %s, sqrt(exp(median)): %s",
			samplmedian, np.exp(samplmedian), np.sqrt(np.exp(samplmedian)))

	logging.info("max logpost: %s, params: %s, exp(params): %s",
			np.max(lnp), samples[np.argmax(lnp)],
			np.exp(samples[np.argmax(lnp)]))

	logging.info("AIC: %s", 2 * ndim - 2 * np.max(lnp))
	logging.info("BIC: %s", np.log(len(y)) * ndim - 2 * np.max(lnp))
	logging.info("poor man's evidence 1 sum: %s, mean: %s",
			np.sum(np.exp(lnp)), np.mean(np.exp(lnp)))
	logging.info("poor man's evidence 2 max: %s, std: %s",
			np.max(np.exp(lnp)), np.std(np.exp(lnp)))
	logging.info("poor man's evidence 3: %s",
			np.max(np.exp(lnp)) / np.std(np.exp(lnp)))
	logging.info("poor man's evidence 4 sum: %s",
			logsumexp(lnp, b=1. / lnp.shape[0], axis=0))

	# mode
	model.set_parameter_vector(samples[np.argmax(lnp)])
	log_lh = model.log_likelihood(y)
	# Use the likelihood instead of the posterior
	# https://doi.org/10.3847/1538-3881/aa9332
	logging.info("BIC lh: %s", np.log(len(y)) * ndim - 2 * log_lh)
	# DIC
	sample_deviance = -2 * np.max(lnp)
	deviance_at_sample = -2 * (model.log_prior() + log_lh)
	pd = sample_deviance - deviance_at_sample
	dic = 2 * sample_deviance - deviance_at_sample
	logging.info("max logpost log_lh: %s, AIC: %s, DIC: %s, pd: %s",
			model.log_likelihood(y), 2 * ndim - 2 * log_lh, dic, pd)
	# mean
	model.set_parameter_vector(samplmean)
	log_lh = model.log_likelihood(y)
	log_lh_mean = log_lh
	# DIC
	sample_deviance = -2 * np.nanmean(lnp)
	deviance_at_sample = -2 * (model.log_prior() + log_lh)
	pd = sample_deviance - deviance_at_sample
	dic = 2 * sample_deviance - deviance_at_sample
	logging.info("mean log_lh: %s, AIC: %s, DIC: %s, pd: %s",
			model.log_likelihood(y), 2 * ndim - 2 * log_lh, dic, pd)
	# median
	model.set_parameter_vector(samplmedian)
	log_lh = model.log_likelihood(y)
	# DIC
	sample_deviance = -2 * np.nanmedian(lnp)
	deviance_at_sample = -2 * (model.log_prior() + log_lh)
	dic = 2 * sample_deviance - deviance_at_sample
	pd = sample_deviance - deviance_at_sample
	logging.info("median log_lh: %s, AIC: %s, DIC: %s, pd: %s",
			model.log_likelihood(y), 2 * ndim - 2 * log_lh, dic, pd)
	# (4)--(6) in Ando2011 doi:10.1080/01966324.2011.10737798
	pd_ando = 2 * (log_lh_mean - post_expect_loglh)
	ic5 = - 2 * post_expect_loglh + 2 * pd_ando
	ic6 = - 2 * post_expect_loglh + 2 * ndim
	logging.info("Ando2011: pd: %s, IC(5): %s, IC(6): %s",
			pd_ando, ic5, ic6)

	if return_logpost:
		return samples, lnp
	return samples
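
A hedged usage sketch, not taken from the sciapy sources: it conditions a simple celerite GP on synthetic data and draws posterior samples with mcmc_sample_model. The kernel, bounds, noise level, and sample counts are illustrative assumptions, and the run assumes an emcee version compatible with the rstate0/blobs interface used above (emcee 2.x).

import numpy as np
import celerite
from celerite import terms

from sciapy.regress.mcmc import mcmc_sample_model

# Synthetic data set (illustrative only).
np.random.seed(42)
t = np.sort(np.random.uniform(0., 10., 50))
yerr = 0.1
y = np.sin(t) + yerr * np.random.randn(len(t))

# A simple GP model; the kernel choice and bounds are made up for this sketch.
kernel = terms.Matern32Term(log_sigma=0., log_rho=0.,
		bounds={"log_sigma": (-5., 5.), "log_rho": (-5., 5.)})
gp = celerite.GP(kernel)
gp.compute(t, yerr)

# Draw posterior samples; small numbers keep the example fast.
samples, lnp = mcmc_sample_model(gp, y,
		nwalkers=16, nburnin=50, nprod=100,
		return_logpost=True, show_progress=True)
print(samples.shape, lnp.shape)  # (16 * 100, 2) and (16 * 100,)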