1
|
|
|
# -*- coding: utf-8 -*- |
2
|
|
|
# vim:fileencoding=utf-8 |
3
|
|
|
# |
4
|
|
|
# Copyright (c) 2017-2018 Stefan Bender |
5
|
|
|
# |
6
|
|
|
# This module is part of sciapy. |
7
|
|
|
# sciapy is free software: you can redistribute it or modify |
8
|
|
|
# it under the terms of the GNU General Public License as published |
9
|
|
|
# by the Free Software Foundation, version 2. |
10
|
|
|
# See accompanying LICENSE file or http://www.gnu.org/licenses/gpl-2.0.html. |
11
|
1 |
|
"""SCIAMACHY data regression MCMC sampler |
12
|
|
|
|
13
|
|
|
Markov Chain Monte Carlo (MCMC) routines to sample |
14
|
|
|
the posterior probability of SCIAMACHY data regression fits. |
15
|
|
|
Uses the `emcee.EnsembleSampler` [1] do do the real work. |
16
|
|
|
|
17
|
|
|
[1](https://emcee.readthedocs.io) |
18
|
|
|
""" |
19
|
|
|
|
20
|
1 |
|
import logging |
21
|
1 |
|
from multiprocessing import Pool |
22
|
|
|
|
23
|
1 |
|
import numpy as np |
24
|
1 |
|
from scipy.optimize._differentialevolution import DifferentialEvolutionSolver |
25
|
|
|
|
26
|
1 |
|
import celerite |
27
|
1 |
|
import george |
28
|
1 |
|
import emcee |
29
|
|
|
|
30
|
1 |
|
__all__ = ["mcmc_sample_model"] |
31
|
|
|
|
32
|
|
|
|
33
|
1 |
|
def _lpost(p, model, y=None, beta=1.): |
|
|
|
|
34
|
|
|
model.set_parameter_vector(p) |
35
|
|
|
lprior = model.log_prior() |
36
|
|
|
if not np.isfinite(lprior): |
37
|
|
|
return -np.inf |
38
|
|
|
return beta * (model.log_likelihood(y, quiet=True) + lprior) |
39
|
|
|
|
40
|
|
|
|
41
|
1 |
View Code Duplication |
def _sample_mcmc(sampler, nsamples, p0, rst0, |
|
|
|
|
42
|
|
|
show_progress, progress_mod, debug=False): |
43
|
|
|
for i, result in enumerate( |
44
|
|
|
sampler.sample(p0, rstate0=rst0, iterations=nsamples)): |
45
|
|
|
if show_progress and (i + 1) % progress_mod == 0: |
46
|
|
|
logging.info("%5.1f%%", 100 * (float(i + 1) / nsamples)) |
47
|
|
|
if debug: |
48
|
|
|
_pos, _logp, _ = result |
49
|
|
|
logging.debug("lnpmax: %s, p(lnpmax): %s", |
50
|
|
|
np.max(_logp), _pos[np.argmax(_logp)]) |
51
|
|
|
return result |
|
|
|
|
52
|
|
|
|
53
|
|
|
|
54
|
1 |
View Code Duplication |
def mcmc_sample_model(model, y, beta=1., |
|
|
|
|
55
|
|
|
nwalkers=100, nburnin=200, nprod=800, |
56
|
|
|
nthreads=1, optimized=False, |
57
|
|
|
bounds=None, |
58
|
|
|
return_logpost=False, |
59
|
|
|
show_progress=False, progress_mod=10): |
60
|
|
|
"""Markov Chain Monte Carlo sampling interface |
61
|
|
|
|
62
|
|
|
MCMC sampling interface to sample posterior probabilities using the |
63
|
|
|
`emcee.EnsembleSampler`[1]. |
64
|
|
|
|
65
|
|
|
[1](https://emcee.readthedocs.io) |
66
|
|
|
|
67
|
|
|
Arguments |
68
|
|
|
--------- |
69
|
|
|
model : celerite.GP, george.GP or sciapy.regress.RegressionModel instance |
70
|
|
|
The model to draw posterior samples from. It should provide either |
71
|
|
|
`log_likelihood()` and `log_prior()` functions be directly callable |
72
|
|
|
via `__call__()`. |
73
|
|
|
y : (N,) array_like |
74
|
|
|
The data to condition the probabilities on. |
75
|
|
|
beta : float, optional |
76
|
|
|
Tempering factor for the probability, default: 1. |
77
|
|
|
nwalkers : int, optional |
78
|
|
|
The number of MCMC walkers (default: 100). If this number is smaller |
79
|
|
|
than 4 times the number of parameters, it is multiplied by the number |
80
|
|
|
of parameters. Otherwise it specifies the number of parameters directly. |
81
|
|
|
nburnin : int, optional |
82
|
|
|
The number of burn-in samples to draw, default: 200. |
83
|
|
|
nprod : int, optional |
84
|
|
|
The number of production samples to draw, default: 800. |
85
|
|
|
nthreads : int, optional |
86
|
|
|
The number of threads to use with a `multiprocessing.Pool`, |
87
|
|
|
used as `pool` for `emcee.EnsembleSampler`. Default: 1. |
88
|
|
|
optimized : bool, optional |
89
|
|
|
Indicate whether the actual (starting) position was determined with an |
90
|
|
|
optimization algorithm beforehand. If `False` (the default), a |
91
|
|
|
pre-burn-in run optimizes the starting position. Sampling continues |
92
|
|
|
from there with the normal burn-in and production runs. |
93
|
|
|
In that case, latin hypercube sampling is used to distribute the walker |
94
|
|
|
starting positions equally in parameter space. |
95
|
|
|
bounds : iterable, optional |
96
|
|
|
The parameter bounds as a list of (min, max) entries. |
97
|
|
|
Default: None |
98
|
|
|
return_logpost : bool, optional |
99
|
|
|
Indicate whether or not to return the sampled log probabilities as well. |
100
|
|
|
Default: False |
101
|
|
|
show_progress : bool, optional |
102
|
|
|
Print the percentage of samples every `progress_mod` samples. |
103
|
|
|
Default: False |
104
|
|
|
progress_mod : int, optional |
105
|
|
|
Interval in samples to print the percentage of samples. |
106
|
|
|
Default: 10 |
107
|
|
|
|
108
|
|
|
Returns |
109
|
|
|
------- |
110
|
|
|
samples or (samples, logpost) : array_like or tuple |
111
|
|
|
(nwalkers * nprod, ndim) array of the sampled parameters from the |
112
|
|
|
production run if return_logpost is `False`. |
113
|
|
|
A tuple of an (nwalkers * nprod, ndim) array (the same as above) |
114
|
|
|
and an (nwalkers,) array with the second entry containing the |
115
|
|
|
log posterior probabilities if return_logpost is `True`. |
116
|
|
|
""" |
117
|
|
|
v = model.get_parameter_vector() |
|
|
|
|
118
|
|
|
ndim = len(v) |
119
|
|
|
if nwalkers < 4 * ndim: |
120
|
|
|
nwalkers *= ndim |
121
|
|
|
logging.info("MCMC parameters: %s walkers, %s burn-in samples, " |
122
|
|
|
"%s production samples using %s threads.", |
123
|
|
|
nwalkers, nburnin, nprod, nthreads) |
124
|
|
|
|
125
|
|
|
if isinstance(model, celerite.GP) or isinstance(model, george.GP): |
|
|
|
|
126
|
|
|
mod_func = _lpost |
127
|
|
|
mod_args = (model, y, beta) |
128
|
|
|
else: |
129
|
|
|
mod_func = model |
130
|
|
|
mod_args = (beta,) |
131
|
|
|
|
132
|
|
|
# Initialize the walkers. |
133
|
|
|
if not optimized: |
134
|
|
|
# scipy.optimize's DifferentialEvolutionSolver uses |
135
|
|
|
# latin hypercube sampling as starting positions. |
136
|
|
|
# We just use their initialization to avoid duplicating code. |
137
|
|
|
if bounds is None: |
138
|
|
|
bounds = model.get_parameter_bounds() |
139
|
|
|
de_solver = DifferentialEvolutionSolver(_lpost, |
140
|
|
|
bounds=bounds, |
141
|
|
|
popsize=nwalkers // ndim) |
142
|
|
|
# The initial population should reflect latin hypercube sampling |
143
|
|
|
p0 = de_solver.population |
|
|
|
|
144
|
|
|
else: |
145
|
|
|
p0 = np.array([v + 1e-2 * np.random.randn(ndim) for _ in range(nwalkers)]) |
|
|
|
|
146
|
|
|
|
147
|
|
|
# set up the sampling pool |
148
|
|
|
pool = Pool(processes=nthreads) |
149
|
|
|
sampler = emcee.EnsembleSampler(nwalkers, ndim, mod_func, args=mod_args, |
150
|
|
|
pool=pool) |
151
|
|
|
|
152
|
|
|
logging.info("Running burn-in ({0} samples)".format(nburnin)) |
|
|
|
|
153
|
|
|
p0, lnp0, rst0 = _sample_mcmc(sampler, nburnin, p0, None, |
|
|
|
|
154
|
|
|
show_progress, progress_mod, debug=True) |
155
|
|
|
logging.info("Burn-in finished.") |
156
|
|
|
|
157
|
|
|
p = p0[np.argmax(lnp0)] |
|
|
|
|
158
|
|
|
logging.info("burn-in max logpost: %s, params: %s, exp(params): %s", |
159
|
|
|
np.max(lnp0), p, np.exp(p)) |
160
|
|
|
model.set_parameter_vector(p) |
161
|
|
|
logging.debug("params: %s", model.get_parameter_dict()) |
162
|
|
|
logging.debug("log_likelihood: %s", model.log_likelihood(y)) |
163
|
|
|
sampler.reset() |
164
|
|
|
|
165
|
|
|
if not optimized: |
166
|
|
|
p0 = [p + 1e-4 * np.random.randn(ndim) for _ in range(nwalkers)] |
|
|
|
|
167
|
|
|
logging.info("Running second burn-in ({0} samples)".format(nburnin)) |
|
|
|
|
168
|
|
|
p0, lnp0, rst0 = _sample_mcmc(sampler, nburnin, p0, rst0, |
|
|
|
|
169
|
|
|
show_progress, progress_mod) |
170
|
|
|
sampler.reset() |
171
|
|
|
logging.info("Second burn-in finished.") |
172
|
|
|
|
173
|
|
|
logging.info("Running production chain ({0} samples)".format(nprod)) |
|
|
|
|
174
|
|
|
_sample_mcmc(sampler, nprod, p0, rst0, show_progress, progress_mod) |
175
|
|
|
logging.info("Production run finished.") |
176
|
|
|
|
177
|
|
|
samples = sampler.flatchain |
178
|
|
|
lnp = sampler.flatlnprobability |
179
|
|
|
logging.info("total samples: %s", samples.shape) |
180
|
|
|
|
181
|
|
|
samplmean = np.mean(samples, axis=0) |
182
|
|
|
logging.info("mean: %s, exp(mean): %s, sqrt(exp(mean)): %s", |
183
|
|
|
samplmean, np.exp(samplmean), np.sqrt(np.exp(samplmean))) |
184
|
|
|
|
185
|
|
|
samplmedian = np.median(samples, axis=0) |
186
|
|
|
logging.info("median: %s, exp(median): %s, sqrt(exp(median)): %s", |
187
|
|
|
samplmedian, np.exp(samplmedian), np.sqrt(np.exp(samplmedian))) |
188
|
|
|
|
189
|
|
|
logging.info("max logpost: %s, params: %s, exp(params): %s", |
190
|
|
|
np.max(lnp), samples[np.argmax(lnp)], |
191
|
|
|
np.exp(samples[np.argmax(lnp)])) |
192
|
|
|
|
193
|
|
|
logging.info("AIC: %s", 2 * ndim - 2 * np.max(lnp)) |
194
|
|
|
logging.info("BIC: %s", np.log(len(y)) * ndim - 2 * np.max(lnp)) |
195
|
|
|
logging.info("poor man's evidence 1 sum: %s, mean: %s", np.sum(np.exp(lnp)), np.mean(np.exp(lnp))) |
196
|
|
|
logging.info("poor man's evidence 2 max: %s, std: %s", np.max(np.exp(lnp)), np.std(np.exp(lnp))) |
197
|
|
|
logging.info("poor man's evidence 3: %s", np.max(np.exp(lnp)) / np.std(np.exp(lnp))) |
198
|
|
|
|
199
|
|
|
if return_logpost: |
200
|
|
|
return samples, lnp |
201
|
|
|
return samples |
202
|
|
|
|
This check looks for invalid names for a range of different identifiers.
You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements.
If your project includes a Pylint configuration file, the settings contained in that file take precedence.
To find out more about Pylint, please refer to their site.