Build passed: push to master (65b079...f515c9) by Stefan, created 05:52

sciapy.regress.mcmc._sample_mcmc()   C

Complexity:    Conditions 9
Size:          Total Lines 15, Code Lines 15
Duplication:   Lines 15, Ratio 100 %
Code Coverage: Tests 6, CRAP Score 19.125
Importance:    Changes 0
Metric   Value
cc       9
eloc     15
nop      7
dl       15
loc      15
ccs      6
cts      12
cp       0.5
crap     19.125
rs       6.6666
c        0
b        0
f        0
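
The CRAP Score above is consistent with the usual CRAP definition, crap = cc**2 * (1 - cp)**3 + cc, applied to the complexity and coverage values from the table (a quick check in Python, assuming cp is the covered fraction of the function):

cc, cp = 9, 0.5
crap = cc ** 2 * (1.0 - cp) ** 3 + cc  # 81 * 0.125 + 9
print(crap)  # 19.125, matching the reported CRAP Score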
# -*- coding: utf-8 -*-
# vim:fileencoding=utf-8
#
# Copyright (c) 2017-2018 Stefan Bender
#
# This module is part of sciapy.
# sciapy is free software: you can redistribute it or modify
# it under the terms of the GNU General Public License as published
# by the Free Software Foundation, version 2.
# See accompanying LICENSE file or http://www.gnu.org/licenses/gpl-2.0.html.
"""SCIAMACHY data regression MCMC sampler

Markov Chain Monte Carlo (MCMC) routines to sample
the posterior probability of SCIAMACHY data regression fits.
Uses the :class:`emcee.EnsembleSampler` [#]_ to do the real work.

.. [#] https://emcee.readthedocs.io
"""

import logging
from multiprocessing import Pool

import numpy as np
from scipy.optimize._differentialevolution import DifferentialEvolutionSolver
from scipy.special import logsumexp

import celerite
import george
import emcee

try:
	from tqdm import tqdm
	have_tqdm = True
except ImportError:
	have_tqdm = False


__all__ = ["mcmc_sample_model"]


def _lpost(p, model, y=None, beta=1.):
	# Tempered log-posterior: beta * (log-likelihood + log-prior).
	# The second element (the emcee "blob") carries the plain log-likelihood.
	model.set_parameter_vector(p)
	lprior = model.log_prior()
	if not np.isfinite(lprior):
		return (-np.inf, [np.nan])
	log_likelihood = model.log_likelihood(y, quiet=True)
	return (beta * (log_likelihood + lprior), [log_likelihood])


Issue (duplication): this code seems to be duplicated in the project.

def _sample_mcmc(sampler, nsamples, p0, rst0,
		show_progress, progress_mod, debug=False):
	smpl = sampler.sample(p0, rstate0=rst0, iterations=nsamples)
	if have_tqdm and show_progress:
		smpl = tqdm(smpl, total=nsamples, disable=None)
	for i, result in enumerate(smpl):
		if show_progress and (i + 1) % progress_mod == 0:
			if not have_tqdm and not debug:
				logging.info("%5.1f%%", 100 * (float(i + 1) / nsamples))
			if debug:
				_pos, _logp, _, _ = result
				logging.debug("%5.1f%% lnpmax: %s, p(lnpmax): %s",
					100 * (float(i + 1) / nsamples),
					np.max(_logp), _pos[np.argmax(_logp)])
	return result

Issue: the variable `result` may be undefined if the for loop above is never entered (nsamples == 0).


Issue (duplication): this code seems to be duplicated in the project.

def mcmc_sample_model(model, y, beta=1.,
		nwalkers=100, nburnin=200, nprod=800,
		nthreads=1, optimized=False,
		bounds=None,
		return_logpost=False,
		show_progress=False, progress_mod=10):
	"""Markov Chain Monte Carlo sampling interface

	MCMC sampling interface to sample posterior probabilities using the
	:class:`emcee.EnsembleSampler` [#]_.

	.. [#] https://emcee.readthedocs.io

	Arguments
	---------
	model : celerite.GP, george.GP, or sciapy.regress.RegressionModel instance
		The model to draw posterior samples from. It should either provide
		`log_likelihood()` and `log_prior()` methods or be directly callable
		via `__call__()`.
	y : (N,) array_like
		The data to condition the probabilities on.
	beta : float, optional
		Tempering factor for the probability, default: 1.
	nwalkers : int, optional
		The number of MCMC walkers (default: 100). If this number is smaller
		than 4 times the number of parameters, it is multiplied by the number
		of parameters. Otherwise it specifies the number of walkers directly.
	nburnin : int, optional
		The number of burn-in samples to draw, default: 200.
	nprod : int, optional
		The number of production samples to draw, default: 800.
	nthreads : int, optional
		The number of threads to use with a `multiprocessing.Pool`,
		used as `pool` for `emcee.EnsembleSampler`. Default: 1.
	optimized : bool, optional
		Indicate whether the (starting) position was already determined by an
		optimization algorithm beforehand. If `False` (the default), a
		pre-burn-in run optimizes the starting position; sampling then
		continues from there with the normal burn-in and production runs.
		In that case, latin hypercube sampling is used to distribute the walker
		starting positions evenly in parameter space.
	bounds : iterable, optional
		The parameter bounds as a list of (min, max) entries.
		Default: None
	return_logpost : bool, optional
		Indicate whether or not to return the sampled log probabilities as well.
		Default: False
	show_progress : bool, optional
		Print the percentage of samples every `progress_mod` samples.
		Default: False
	progress_mod : int, optional
		Interval in samples to print the percentage of samples.
		Default: 10

	Returns
	-------
	samples or (samples, logpost) : array_like or tuple
		(nwalkers * nprod, ndim) array of the sampled parameters from the
		production run if return_logpost is `False`.
		A tuple of an (nwalkers * nprod, ndim) array (the same as above)
		and an (nwalkers * nprod,) array of the corresponding log posterior
		probabilities if return_logpost is `True`.
	"""
	v = model.get_parameter_vector()
	ndim = len(v)
	if nwalkers < 4 * ndim:
		nwalkers *= ndim
	logging.info("MCMC parameters: %s walkers, %s burn-in samples, "
				"%s production samples using %s threads.",
				nwalkers, nburnin, nprod, nthreads)

	if isinstance(model, celerite.GP) or isinstance(model, george.GP):
		mod_func = _lpost
		mod_args = (model, y, beta)
	else:
		mod_func = model
		mod_args = (beta,)

	# Initialize the walkers.
	if not optimized:
		# scipy.optimize's DifferentialEvolutionSolver uses
		# latin hypercube sampling as starting positions.
		# We just use their initialization to avoid duplicating code.
		if bounds is None:
			bounds = model.get_parameter_bounds()
		de_solver = DifferentialEvolutionSolver(_lpost,
					bounds=bounds,
					popsize=nwalkers // ndim)
		# The initial population should reflect latin hypercube sampling
		p0 = de_solver.population
		# fill up to full size in case the number of walkers is not a
		# multiple of the number of parameters
		missing = nwalkers - p0.shape[0]
		p0 = np.vstack([p0] +
			[v + 1e-2 * np.random.randn(ndim) for _ in range(missing)])
	else:
		p0 = np.array([v + 1e-2 * np.random.randn(ndim) for _ in range(nwalkers)])

	# set up the sampling pool
	pool = Pool(processes=nthreads)
	sampler = emcee.EnsembleSampler(nwalkers, ndim, mod_func, args=mod_args,
			pool=pool)

	rst0 = np.random.get_state()

	if not optimized:
		logging.info("Running MCMC fit (%s samples)", nburnin)
		p0, lnp0, rst0, _ = _sample_mcmc(sampler, nburnin, p0, rst0,
				show_progress, progress_mod, debug=True)
		logging.info("MCMC fit finished.")

		p = p0[np.argmax(lnp0)]
		logging.info("Fit max logpost: %s, params: %s, exp(params): %s",
					np.max(lnp0), p, np.exp(p))
		model.set_parameter_vector(p)
		logging.debug("params: %s", model.get_parameter_dict())
		logging.debug("log_likelihood: %s", model.log_likelihood(y))
		p0 = [p + 1e-4 * np.random.randn(ndim) for _ in range(nwalkers)]
		sampler.reset()

	logging.info("Running burn-in (%s samples)", nburnin)
	p0, lnp0, rst0, _ = _sample_mcmc(sampler, nburnin, p0, rst0,
			show_progress, progress_mod)
	logging.info("Burn-in finished.")

	p = p0[np.argmax(lnp0)]
	logging.info("burn-in max logpost: %s, params: %s, exp(params): %s",
				np.max(lnp0), p, np.exp(p))
	model.set_parameter_vector(p)
	logging.debug("params: %s", model.get_parameter_dict())
	logging.debug("log_likelihood: %s", model.log_likelihood(y))
	sampler.reset()

	logging.info("Running production chain (%s samples)", nprod)
	_sample_mcmc(sampler, nprod, p0, rst0, show_progress, progress_mod)
	logging.info("Production run finished.")

	samples = sampler.flatchain
	lnp = sampler.flatlnprobability
	# first column in the blobs are the log likelihoods
	lnlh = np.array(sampler.blobs)[..., 0].ravel().astype(float)
	post_expect_loglh = np.nanmean(np.array(lnlh))
	logging.info("total samples: %s", samples.shape)

	samplmean = np.mean(samples, axis=0)
	logging.info("mean: %s, exp(mean): %s, sqrt(exp(mean)): %s",
			samplmean, np.exp(samplmean), np.sqrt(np.exp(samplmean)))

	samplmedian = np.median(samples, axis=0)
	logging.info("median: %s, exp(median): %s, sqrt(exp(median)): %s",
			samplmedian, np.exp(samplmedian), np.sqrt(np.exp(samplmedian)))

	logging.info("max logpost: %s, params: %s, exp(params): %s",
			np.max(lnp), samples[np.argmax(lnp)],
			np.exp(samples[np.argmax(lnp)]))

	logging.info("AIC: %s", 2 * ndim - 2 * np.max(lnp))
	logging.info("BIC: %s", np.log(len(y)) * ndim - 2 * np.max(lnp))
	logging.info("poor man's evidence 1 sum: %s, mean: %s",
			np.sum(np.exp(lnp)), np.mean(np.exp(lnp)))
	logging.info("poor man's evidence 2 max: %s, std: %s",
			np.max(np.exp(lnp)), np.std(np.exp(lnp)))
	logging.info("poor man's evidence 3: %s",
			np.max(np.exp(lnp)) / np.std(np.exp(lnp)))
	logging.info("poor man's evidence 4 sum: %s",
			logsumexp(lnp, b=1. / lnp.shape[0], axis=0))

	# mode
	model.set_parameter_vector(samples[np.argmax(lnp)])
	log_lh = model.log_likelihood(y)
	# Use the likelihood instead of the posterior
	# https://doi.org/10.3847/1538-3881/aa9332
	logging.info("BIC lh: %s", np.log(len(y)) * ndim - 2 * log_lh)
	# DIC
	sample_deviance = -2 * np.max(lnp)
	deviance_at_sample = -2 * (model.log_prior() + log_lh)
	pd = sample_deviance - deviance_at_sample
	dic = 2 * sample_deviance - deviance_at_sample
	logging.info("max logpost log_lh: %s, AIC: %s, DIC: %s, pd: %s",
			model.log_likelihood(y), 2 * ndim - 2 * log_lh, dic, pd)
	# mean
	model.set_parameter_vector(samplmean)
	log_lh = model.log_likelihood(y)
	log_lh_mean = log_lh
	# DIC
	sample_deviance = -2 * np.nanmean(lnp)
	deviance_at_sample = -2 * (model.log_prior() + log_lh)
	pd = sample_deviance - deviance_at_sample
	dic = 2 * sample_deviance - deviance_at_sample
	logging.info("mean log_lh: %s, AIC: %s, DIC: %s, pd: %s",
			model.log_likelihood(y), 2 * ndim - 2 * log_lh, dic, pd)
	# median
	model.set_parameter_vector(samplmedian)
	log_lh = model.log_likelihood(y)
	# DIC
	sample_deviance = -2 * np.nanmedian(lnp)
	deviance_at_sample = -2 * (model.log_prior() + log_lh)
	dic = 2 * sample_deviance - deviance_at_sample
	pd = sample_deviance - deviance_at_sample
	logging.info("median log_lh: %s, AIC: %s, DIC: %s, pd: %s",
			model.log_likelihood(y), 2 * ndim - 2 * log_lh, dic, pd)
	# (4)--(6) in Ando2011 doi:10.1080/01966324.2011.10737798
	pd_ando = 2 * (log_lh_mean - post_expect_loglh)
	ic5 = - 2 * post_expect_loglh + 2 * pd_ando
	ic6 = - 2 * post_expect_loglh + 2 * ndim
	logging.info("Ando2011: pd: %s, IC(5): %s, IC(6): %s",
			pd_ando, ic5, ic6)

	if return_logpost:
		return samples, lnp
	return samples
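
A minimal usage sketch (not part of the module): conditioning a celerite.GP on toy data and drawing posterior samples with mcmc_sample_model(). The kernel, data, and keyword values are illustrative assumptions, not taken from sciapy itself, and the sketch assumes the emcee/celerite versions the module was written against.

import numpy as np
import celerite
from celerite import terms

from sciapy.regress.mcmc import mcmc_sample_model

if __name__ == "__main__":
	# Toy data: a noisy sine curve on an irregular time grid.
	t = np.sort(np.random.uniform(0., 10., 50))
	y = np.sin(t) + 0.1 * np.random.randn(t.size)

	# Simple exponential kernel with (hypothetical) broad log-parameter bounds.
	kernel = terms.RealTerm(log_a=0., log_c=0.,
			bounds={"log_a": (-10., 10.), "log_c": (-10., 10.)})
	gp = celerite.GP(kernel)
	gp.compute(t, yerr=0.1)

	# With optimized=False (the default), the walkers start from a latin
	# hypercube sample within the kernel bounds and a pre-burn-in run is done.
	# 16 walkers >= 4 * ndim (2 kernel parameters), so the number is kept as-is.
	samples, lnp = mcmc_sample_model(gp, y, beta=1.,
			nwalkers=16, nburnin=100, nprod=200,
			return_logpost=True, show_progress=True)
	print(samples.shape, lnp.shape)  # (16 * 200, 2) and (16 * 200,)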