Passed
Branch master (9c78f3), created by Stefan (06:59)

sciapy.regress.mcmc._lpost() (rated A)

Complexity

Conditions 2

Size

Total Lines 7
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Code Coverage

Tests 1
CRAP Score 4.5185

Importance

Changes 0
Metric Value
cc 2
eloc 7
nop 4
dl 0
loc 7
ccs 1
cts 7
cp 0.1429
crap 4.5185
rs 10
c 0
b 0
f 0
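
For context, the CRAP score above combines the cyclomatic complexity ("cc") of `_lpost` with its covered fraction ("cp"). A quick sanity check, assuming the conventional CRAP formula (the exact variant used by the report is an assumption):

# Conventional CRAP formula (assumed): CRAP = cc**2 * (1 - cov)**3 + cc
cc = 2        # cyclomatic complexity ("cc" above)
cov = 0.1429  # covered fraction ("cp" above)
print(round(cc ** 2 * (1 - cov) ** 3 + cc, 4))  # ~4.5186, close to the reported 4.5185

The source of the sciapy.regress.mcmc module as covered by this report follows.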
# -*- coding: utf-8 -*-
# vim:fileencoding=utf-8
#
# Copyright (c) 2017-2018 Stefan Bender
#
# This module is part of sciapy.
# sciapy is free software: you can redistribute it or modify
# it under the terms of the GNU General Public License as published
# by the Free Software Foundation, version 2.
# See accompanying LICENSE file or http://www.gnu.org/licenses/gpl-2.0.html.
"""SCIAMACHY data regression MCMC sampler

Markov Chain Monte Carlo (MCMC) routines to sample
the posterior probability of SCIAMACHY data regression fits.
Uses the :class:`emcee.EnsembleSampler` [#]_ to do the real work.

.. [#] https://emcee.readthedocs.io
"""

import logging
from multiprocessing import Pool

import numpy as np
from scipy.optimize._differentialevolution import DifferentialEvolutionSolver
from scipy.special import logsumexp

import celerite
import george
import emcee

try:
	from tqdm import tqdm
	have_tqdm = True
except ImportError:
	have_tqdm = False


__all__ = ["mcmc_sample_model"]


def _lpost(p, model, y=None, beta=1.):
	# Tempered log posterior of `model` conditioned on `y`: returns
	# beta * (log likelihood + log prior) together with the log likelihood
	# as an emcee "blob".
	model.set_parameter_vector(p)
	lprior = model.log_prior()
	if not np.isfinite(lprior):
		return (-np.inf, np.nan)
	log_likelihood = model.log_likelihood(y, quiet=True)
	return (beta * (log_likelihood + lprior), [log_likelihood])


# [issue] Duplication: this code seems to be duplicated in your project.
def _sample_mcmc(sampler, nsamples, p0, rst0,
		show_progress, progress_mod, debug=False):
	smpl = sampler.sample(p0, rstate0=rst0, iterations=nsamples)
	if have_tqdm:
		smpl = tqdm(smpl, total=nsamples, disable=None)
	for i, result in enumerate(smpl):
		if show_progress and (i + 1) % progress_mod == 0:
			if not have_tqdm and not debug:
				logging.info("%5.1f%%", 100 * (float(i + 1) / nsamples))
			if debug:
				_pos, _logp, _, _ = result
				logging.debug("%5.1f%% lnpmax: %s, p(lnpmax): %s",
					100 * (float(i + 1) / nsamples),
					np.max(_logp), _pos[np.argmax(_logp)])
	# [issue] The variable `result` does not seem to be defined in case the
	# for loop above is not entered. Are you sure this can never be the case?
	return result

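# Note (assumption): the tuple unpacking in `_sample_mcmc` above, and of its
# return value in `mcmc_sample_model` below, follows the emcee 2.x interface,
# where `EnsembleSampler.sample()` yields (pos, lnprob, rstate, blobs) tuples
# when the log-probability function returns blobs.
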
# [issue] Duplication: this code seems to be duplicated in your project.
def mcmc_sample_model(model, y, beta=1.,
		nwalkers=100, nburnin=200, nprod=800,
		nthreads=1, optimized=False,
		bounds=None,
		return_logpost=False,
		show_progress=False, progress_mod=10):
	"""Markov Chain Monte Carlo sampling interface
74
75
	MCMC sampling interface to sample posterior probabilities using the
76
	:class:`emcee.EnsembleSampler` [#]_.
77
78
	.. [#] https://emcee.readthedocs.io
79
80
	Arguments
81
	---------
82
	model : celerite.GP, george.GP, or sciapy.regress.RegressionModel instance
83
		The model to draw posterior samples from. It should provide either
84
		`log_likelihood()` and `log_prior()` functions or be directly callable
85
		via `__call__()`.
86
	y : (N,) array_like
87
		The data to condition the probabilities on.
88
	beta : float, optional
89
		Tempering factor for the probability, default: 1.
90
	nwalkers : int, optional
91
		The number of MCMC walkers (default: 100). If this number is smaller
92
		than 4 times the number of parameters, it is multiplied by the number
93
		of parameters. Otherwise it specifies the number of parameters directly.
94
	nburnin : int, optional
95
		The number of burn-in samples to draw, default: 200.
96
	nprod : int, optional
97
		The number of production samples to draw, default: 800.
98
	nthreads : int, optional
99
		The number of threads to use with a `multiprocessing.Pool`,
100
		used as `pool` for `emcee.EnsembleSampler`. Default: 1.
101
	optimized : bool, optional
102
		Indicate whether the actual (starting) position was determined with an
103
		optimization algorithm beforehand. If `False` (the default), a
104
		pre-burn-in run optimizes the starting position. Sampling continues
105
		from there with the normal burn-in and production runs.
106
		In that case, latin hypercube sampling is used to distribute the walker
107
		starting positions equally in parameter space.
108
	bounds : iterable, optional
109
		The parameter bounds as a list of (min, max) entries.
110
		Default: None
111
	return_logpost : bool, optional
112
		Indicate whether or not to  return the sampled log probabilities as well.
113
		Default: False
114
	show_progress : bool, optional
115
		Print the percentage of samples every `progress_mod` samples.
116
		Default: False
117
	progress_mod : int, optional
118
		Interval in samples to print the percentage of samples.
119
		Default: 10
120
121
	Returns
122
	-------
123
	samples or (samples, logpost) : array_like or tuple
124
		(nwalkers * nprod, ndim) array of the sampled parameters from the
125
		production run if return_logpost is `False`.
126
		A tuple of an (nwalkers * nprod, ndim) array (the same as above)
127
		and an (nwalkers,) array with the second entry containing the
128
		log posterior probabilities if return_logpost is `True`.
129
	"""
	v = model.get_parameter_vector()
	ndim = len(v)
	if nwalkers < 4 * ndim:
		nwalkers *= ndim
	logging.info("MCMC parameters: %s walkers, %s burn-in samples, "
				"%s production samples using %s threads.",
				nwalkers, nburnin, nprod, nthreads)

	if isinstance(model, celerite.GP) or isinstance(model, george.GP):
		mod_func = _lpost
		mod_args = (model, y, beta)
	else:
		mod_func = model
		mod_args = (beta,)

	# Initialize the walkers.
	if not optimized:
		# scipy.optimize's DifferentialEvolutionSolver uses
		# latin hypercube sampling as starting positions.
		# We just use their initialization to avoid duplicating code.
		if bounds is None:
			bounds = model.get_parameter_bounds()
		de_solver = DifferentialEvolutionSolver(_lpost,
					bounds=bounds,
					popsize=nwalkers // ndim)
		# The initial population should reflect latin hypercube sampling
		p0 = de_solver.population
	else:
		p0 = np.array([v + 1e-2 * np.random.randn(ndim) for _ in range(nwalkers)])

	# set up the sampling pool
	pool = Pool(processes=nthreads)
	sampler = emcee.EnsembleSampler(nwalkers, ndim, mod_func, args=mod_args,
			pool=pool)

	rst0 = np.random.get_state()

	if not optimized:
		logging.info("Running MCMC fit (%s samples)", nburnin)
		p0, lnp0, rst0, _ = _sample_mcmc(sampler, nburnin, p0, rst0,
				show_progress, progress_mod, debug=True)
		logging.info("MCMC fit finished.")

		p = p0[np.argmax(lnp0)]
		logging.info("Fit max logpost: %s, params: %s, exp(params): %s",
					np.max(lnp0), p, np.exp(p))
		model.set_parameter_vector(p)
		logging.debug("params: %s", model.get_parameter_dict())
		logging.debug("log_likelihood: %s", model.log_likelihood(y))
		p0 = [p + 1e-4 * np.random.randn(ndim) for _ in range(nwalkers)]
		sampler.reset()

	logging.info("Running burn-in (%s samples)", nburnin)
	p0, lnp0, rst0, _ = _sample_mcmc(sampler, nburnin, p0, rst0,
			show_progress, progress_mod)
	logging.info("Burn-in finished.")

	p = p0[np.argmax(lnp0)]
	logging.info("burn-in max logpost: %s, params: %s, exp(params): %s",
				np.max(lnp0), p, np.exp(p))
	model.set_parameter_vector(p)
	logging.debug("params: %s", model.get_parameter_dict())
	logging.debug("log_likelihood: %s", model.log_likelihood(y))
	sampler.reset()

	logging.info("Running production chain (%s samples)", nprod)
	_sample_mcmc(sampler, nprod, p0, rst0, show_progress, progress_mod)
	logging.info("Production run finished.")

	samples = sampler.flatchain
	lnp = sampler.flatlnprobability
	# the first column of the blobs contains the log likelihoods
	lnlh = np.array(sampler.blobs)[..., 0].ravel().astype(float)
	post_expect_loglh = np.nanmean(np.array(lnlh))
	logging.info("total samples: %s", samples.shape)

	samplmean = np.mean(samples, axis=0)
	logging.info("mean: %s, exp(mean): %s, sqrt(exp(mean)): %s",
			samplmean, np.exp(samplmean), np.sqrt(np.exp(samplmean)))

	samplmedian = np.median(samples, axis=0)
	logging.info("median: %s, exp(median): %s, sqrt(exp(median)): %s",
			samplmedian, np.exp(samplmedian), np.sqrt(np.exp(samplmedian)))

	logging.info("max logpost: %s, params: %s, exp(params): %s",
			np.max(lnp), samples[np.argmax(lnp)],
			np.exp(samples[np.argmax(lnp)]))

	logging.info("AIC: %s", 2 * ndim - 2 * np.max(lnp))
	logging.info("BIC: %s", np.log(len(y)) * ndim - 2 * np.max(lnp))
	logging.info("poor man's evidence 1 sum: %s, mean: %s",
			np.sum(np.exp(lnp)), np.mean(np.exp(lnp)))
	logging.info("poor man's evidence 2 max: %s, std: %s",
			np.max(np.exp(lnp)), np.std(np.exp(lnp)))
	logging.info("poor man's evidence 3: %s",
			np.max(np.exp(lnp)) / np.std(np.exp(lnp)))
	logging.info("poor man's evidence 4 sum: %s",
			logsumexp(lnp, b=1. / lnp.shape[0], axis=0))

	# at the posterior mode
	model.set_parameter_vector(samples[np.argmax(lnp)])
	log_lh = model.log_likelihood(y)
	# Use the likelihood instead of the posterior
	# https://doi.org/10.3847/1538-3881/aa9332
	logging.info("BIC lh: %s", np.log(len(y)) * ndim - 2 * log_lh)
	# DIC
	sample_deviance = -2 * np.max(lnp)
	deviance_at_sample = -2 * (model.log_prior() + log_lh)
	pd = sample_deviance - deviance_at_sample
	dic = 2 * sample_deviance - deviance_at_sample
	logging.info("max logpost log_lh: %s, AIC: %s, DIC: %s, pd: %s",
			model.log_likelihood(y), 2 * ndim - 2 * log_lh, dic, pd)
	# at the posterior mean
	model.set_parameter_vector(samplmean)
	log_lh = model.log_likelihood(y)
	log_lh_mean = log_lh
	# DIC
	sample_deviance = -2 * np.nanmean(lnp)
	deviance_at_sample = -2 * (model.log_prior() + log_lh)
	pd = sample_deviance - deviance_at_sample
	dic = 2 * sample_deviance - deviance_at_sample
	logging.info("mean log_lh: %s, AIC: %s, DIC: %s, pd: %s",
			model.log_likelihood(y), 2 * ndim - 2 * log_lh, dic, pd)
	# at the posterior median
	model.set_parameter_vector(samplmedian)
	log_lh = model.log_likelihood(y)
	# DIC
	sample_deviance = -2 * np.nanmedian(lnp)
	deviance_at_sample = -2 * (model.log_prior() + log_lh)
	dic = 2 * sample_deviance - deviance_at_sample
	pd = sample_deviance - deviance_at_sample
	logging.info("median log_lh: %s, AIC: %s, DIC: %s, pd: %s",
			model.log_likelihood(y), 2 * ndim - 2 * log_lh, dic, pd)
	# Eqs. (4)--(6) in Ando2011, doi:10.1080/01966324.2011.10737798
	pd_ando = 2 * (log_lh_mean - post_expect_loglh)
	ic5 = - 2 * post_expect_loglh + 2 * pd_ando
	ic6 = - 2 * post_expect_loglh + 2 * ndim
	logging.info("Ando2011: pd: %s, IC(5): %s, IC(6): %s",
			pd_ando, ic5, ic6)

	if return_logpost:
		return samples, lnp
	return samples
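
For orientation, a minimal usage sketch of `mcmc_sample_model` (not part of the module above): it assumes a `celerite.GP` model conditioned on the data locations via `gp.compute()`, and the variable names (`times`, `data`, `gp`), kernel choice, and parameter bounds are illustrative only.

# Hypothetical usage sketch; the data, kernel, and bounds are made up for
# illustration.
import numpy as np
import celerite
from celerite import terms

from sciapy.regress.mcmc import mcmc_sample_model

times = np.linspace(0., 10., 100)
data = np.sin(times) + 0.1 * np.random.randn(times.size)

kernel = terms.Matern32Term(log_sigma=0., log_rho=0.,
		bounds={"log_sigma": (-5., 5.), "log_rho": (-5., 5.)})
gp = celerite.GP(kernel)
gp.compute(times, 0.1)  # condition the GP on the sample times and errors

# Draw posterior samples of the kernel (hyper)parameters.
samples, lnp = mcmc_sample_model(gp, data,
		nwalkers=50, nburnin=100, nprod=400,
		return_logpost=True, show_progress=True)
print(samples.shape, lnp.shape)  # (50 * 400, 2) and (50 * 400,)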