sciapy.regress.mcmc._sample_mcmc() - Code Metrics - Inspection of "regress: Fix variable in Lyman-alpha loader" - st-bender/sciapy - Measure and Improve Code Quality continuously with Scrutinizer

Passed

Push — master ( 5f801b...822a99 )

by Stefan

created 2018-08-26 18:29 UTC

sciapy.regress.mcmc._sample_mcmc() A

↳ Parent: sciapy.regress.mcmc

Complexity

Conditions

Size

Total Lines	11
Code Lines	11

Duplication

Lines	11
Ratio	100 %

Code Coverage

Tests	1
CRAP Score	21.748

Importance

Changes

Metric	Value
cc	5
eloc	11
nop	7
dl	11
loc	11
ccs	1
cts	8
cp	0.125
crap	21.748
rs	9.3333
c	0
b	0
f	0

# -*- coding: utf-8 -*-
# vim:fileencoding=utf-8
#
# Copyright (c) 2017-2018 Stefan Bender
#
# This module is part of sciapy.
# sciapy is free software: you can redistribute it or modify
# it under the terms of the GNU General Public License as published
# by the Free Software Foundation, version 2.
# See accompanying LICENSE file or http://www.gnu.org/licenses/gpl-2.0.html.
"""SCIAMACHY data regression MCMC sampler

Markov Chain Monte Carlo (MCMC) routines to sample
the posterior probability of SCIAMACHY data regression fits.
Uses the `emcee.EnsembleSampler` [1] do do the real work.

[1](https://emcee.readthedocs.io)
"""

import logging
from multiprocessing import Pool

import numpy as np
from scipy.optimize._differentialevolution import DifferentialEvolutionSolver

import celerite
import george
import emcee

__all__ = ["mcmc_sample_model"]


def _lpost(p, model, y=None, beta=1.):

	model.set_parameter_vector(p)
	lprior = model.log_prior()
	if not np.isfinite(lprior):
		return -np.inf
	return beta * (model.log_likelihood(y, quiet=True) + lprior)


def _sample_mcmc(sampler, nsamples, p0, rst0,

		show_progress, progress_mod, debug=False):
	for i, result in enumerate(
			sampler.sample(p0, rstate0=rst0, iterations=nsamples)):
		if show_progress and (i + 1) % progress_mod == 0:
			logging.info("%5.1f%%", 100 * (float(i + 1) / nsamples))
			if debug:
				_pos, _logp, _ = result
				logging.debug("lnpmax: %s, p(lnpmax): %s",
					np.max(_logp), _pos[np.argmax(_logp)])
	return result



def mcmc_sample_model(model, y, beta=1.,

		nwalkers=100, nburnin=200, nprod=800,
		nthreads=1, optimized=False,
		bounds=None,
		return_logpost=False,
		show_progress=False, progress_mod=10):
	"""Markov Chain Monte Carlo sampling interface

	MCMC sampling interface to sample posterior probabilities using the
	`emcee.EnsembleSampler`[1].

	[1](https://emcee.readthedocs.io)

	Arguments
	---------
	model : celerite.GP, george.GP or sciapy.regress.RegressionModel instance
		The model to draw posterior samples from. It should provide either
		`log_likelihood()` and `log_prior()` functions be directly callable
		via `__call__()`.
	y : (N,) array_like
		The data to condition the probabilities on.
	beta : float, optional
		Tempering factor for the probability, default: 1.
	nwalkers : int, optional
		The number of MCMC walkers (default: 100). If this number is smaller
		than 4 times the number of parameters, it is multiplied by the number
		of parameters. Otherwise it specifies the number of parameters directly.
	nburnin : int, optional
		The number of burn-in samples to draw, default: 200.
	nprod : int, optional
		The number of production samples to draw, default: 800.
	nthreads : int, optional
		The number of threads to use with a `multiprocessing.Pool`,
		used as `pool` for `emcee.EnsembleSampler`. Default: 1.
	optimized : bool, optional
		Indicate whether the actual (starting) position was determined with an
		optimization algorithm beforehand. If `False` (the default), a
		pre-burn-in run optimizes the starting position. Sampling continues
		from there with the normal burn-in and production runs.
		In that case, latin hypercube sampling is used to distribute the walker
		starting positions equally in parameter space.
	bounds : iterable, optional
		The parameter bounds as a list of (min, max) entries.
		Default: None
	return_logpost : bool, optional
		Indicate whether or not to  return the sampled log probabilities as well.
		Default: False
	show_progress : bool, optional
		Print the percentage of samples every `progress_mod` samples.
		Default: False
	progress_mod : int, optional
		Interval in samples to print the percentage of samples.
		Default: 10

	Returns
	-------
	samples or (samples, logpost) : array_like or tuple
		(nwalkers * nprod, ndim) array of the sampled parameters from the
		production run if return_logpost is `False`.
		A tuple of an (nwalkers * nprod, ndim) array (the same as above)
		and an (nwalkers,) array with the second entry containing the
		log posterior probabilities if return_logpost is `True`.
	"""
	v = model.get_parameter_vector()

	ndim = len(v)
	if nwalkers < 4 * ndim:
		nwalkers *= ndim
	logging.info("MCMC parameters: %s walkers, %s burn-in samples, "
				"%s production samples using %s threads.",
				nwalkers, nburnin, nprod, nthreads)

	if isinstance(model, celerite.GP) or isinstance(model, george.GP):

		mod_func = _lpost
		mod_args = (model, y, beta)
	else:
		mod_func = model
		mod_args = (beta,)

	# Initialize the walkers.
	if not optimized:
		# scipy.optimize's DifferentialEvolutionSolver uses
		# latin hypercube sampling as starting positions.
		# We just use their initialization to avoid duplicating code.
		if bounds is None:
			bounds = model.get_parameter_bounds()
		de_solver = DifferentialEvolutionSolver(_lpost,
					bounds=bounds,
					popsize=nwalkers // ndim)
		# The initial population should reflect latin hypercube sampling
		p0 = de_solver.population

	else:
		p0 = np.array([v + 1e-2 * np.random.randn(ndim) for _ in range(nwalkers)])


	# set up the sampling pool
	pool = Pool(processes=nthreads)
	sampler = emcee.EnsembleSampler(nwalkers, ndim, mod_func, args=mod_args,
			pool=pool)

	logging.info("Running burn-in ({0} samples)".format(nburnin))

	p0, lnp0, rst0 = _sample_mcmc(sampler, nburnin, p0, None,

			show_progress, progress_mod, debug=True)
	logging.info("Burn-in finished.")

	p = p0[np.argmax(lnp0)]

	logging.info("burn-in max logpost: %s, params: %s, exp(params): %s",
				np.max(lnp0), p, np.exp(p))
	model.set_parameter_vector(p)
	logging.debug("params: %s", model.get_parameter_dict())
	logging.debug("log_likelihood: %s", model.log_likelihood(y))
	sampler.reset()

	if not optimized:
		p0 = [p + 1e-4 * np.random.randn(ndim) for _ in range(nwalkers)]

		logging.info("Running second burn-in ({0} samples)".format(nburnin))

		p0, lnp0, rst0 = _sample_mcmc(sampler, nburnin, p0, rst0,

				show_progress, progress_mod)
		sampler.reset()
		logging.info("Second burn-in finished.")

	logging.info("Running production chain ({0} samples)".format(nprod))

	_sample_mcmc(sampler, nprod, p0, rst0, show_progress, progress_mod)
	logging.info("Production run finished.")

	samples = sampler.flatchain
	lnp = sampler.flatlnprobability
	logging.info("total samples: %s", samples.shape)

	samplmean = np.mean(samples, axis=0)
	logging.info("mean: %s, exp(mean): %s, sqrt(exp(mean)): %s",
			samplmean, np.exp(samplmean), np.sqrt(np.exp(samplmean)))

	samplmedian = np.median(samples, axis=0)
	logging.info("median: %s, exp(median): %s, sqrt(exp(median)): %s",
			samplmedian, np.exp(samplmedian), np.sqrt(np.exp(samplmedian)))

	logging.info("max logpost: %s, params: %s, exp(params): %s",
			np.max(lnp), samples[np.argmax(lnp)],
			np.exp(samples[np.argmax(lnp)]))

	logging.info("AIC: %s", 2 * ndim - 2 * np.max(lnp))
	logging.info("BIC: %s", np.log(len(y)) * ndim - 2 * np.max(lnp))
	logging.info("poor man's evidence 1 sum: %s, mean: %s", np.sum(np.exp(lnp)), np.mean(np.exp(lnp)))
	logging.info("poor man's evidence 2 max: %s, std: %s", np.max(np.exp(lnp)), np.std(np.exp(lnp)))
	logging.info("poor man's evidence 3: %s", np.max(np.exp(lnp)) / np.std(np.exp(lnp)))

	if return_logpost:
		return samples, lnp
	return samples


1			# -- coding: utf-8 --
2			# vim:fileencoding=utf-8
3			#
4			# Copyright (c) 2017-2018 Stefan Bender
5			#
6			# This module is part of sciapy.
7			# sciapy is free software: you can redistribute it or modify
8			# it under the terms of the GNU General Public License as published
9			# by the Free Software Foundation, version 2.
10			# See accompanying LICENSE file or http://www.gnu.org/licenses/gpl-2.0.html.
11	1		"""SCIAMACHY data regression MCMC sampler
12
13			Markov Chain Monte Carlo (MCMC) routines to sample
14			the posterior probability of SCIAMACHY data regression fits.
15			Uses the `emcee.EnsembleSampler` [1] do do the real work.
16
17			[1](https://emcee.readthedocs.io)
18			"""
19
20	1		import logging
21	1		from multiprocessing import Pool
22
23	1		import numpy as np
24	1		from scipy.optimize._differentialevolution import DifferentialEvolutionSolver
25
26	1		import celerite
27	1		import george
28	1		import emcee
29
30	1		__all__ = ["mcmc_sample_model"]
31
32
33	1		def _lpost(p, model, y=None, beta=1.):
			0 ignored issues – show Coding Style Naming introduced 2018-07-02 09:55 UTC by Report Bug Copy Issue Report The name `p` does not conform to the argument naming conventions (`(([a-z][a-z0-9_]{2,30})\|(_[a-z0-9_]))$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history... Coding Style Naming introduced 2018-07-02 09:55 UTC by Report Bug Copy Issue Report The name `y` does not conform to the argument naming conventions (`(([a-z][a-z0-9_]{2,30})\|(_[a-z0-9_]))$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
34			model.set_parameter_vector(p)
35			lprior = model.log_prior()
36			if not np.isfinite(lprior):
37			return -np.inf
38			return beta * (model.log_likelihood(y, quiet=True) + lprior)
39
40
41	1	View Code Duplication	def _sample_mcmc(sampler, nsamples, p0, rst0,
			0 ignored issues – show Duplication introduced 2018-08-26 18:34 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history... Coding Style Naming introduced 2018-08-26 18:34 UTC by Report Bug Copy Issue Report The name `p0` does not conform to the argument naming conventions (`(([a-z][a-z0-9_]{2,30})\|(_[a-z0-9_]*))$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history... best-practice introduced 2018-08-26 18:34 UTC by Report Bug Copy Issue Report Too many arguments (7/5) Loading history...
42			show_progress, progress_mod, debug=False):
43			for i, result in enumerate(
44			sampler.sample(p0, rstate0=rst0, iterations=nsamples)):
45			if show_progress and (i + 1) % progress_mod == 0:
46			logging.info("%5.1f%%", 100 * (float(i + 1) / nsamples))
47			if debug:
48			_pos, _logp, _ = result
49			logging.debug("lnpmax: %s, p(lnpmax): %s",
50			np.max(_logp), _pos[np.argmax(_logp)])
51			return result
			0 ignored issues – show introduced 2018-08-26 18:34 UTC by Report Bug Copy Issue Report The variable `result` does not seem to be defined in case the `for` loop on line `43` is not entered. Are you sure this can never be the case? Loading history...
52
53
54	1	View Code Duplication	def mcmc_sample_model(model, y, beta=1.,
			0 ignored issues – show Duplication introduced 2018-07-02 09:55 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history... Coding Style Naming introduced 2018-07-02 09:55 UTC by Report Bug Copy Issue Report The name `y` does not conform to the argument naming conventions (`(([a-z][a-z0-9_]{2,30})\|(_[a-z0-9_]*))$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history... best-practice introduced 2018-07-02 09:55 UTC by Report Bug Copy Issue Report Too many arguments (12/5) Loading history... Comprehensibility introduced 2018-07-02 09:55 UTC by Report Bug Copy Issue Report This function exceeds the maximum number of variables (27/15). Loading history...
55			nwalkers=100, nburnin=200, nprod=800,
56			nthreads=1, optimized=False,
57			bounds=None,
58			return_logpost=False,
59			show_progress=False, progress_mod=10):
60			"""Markov Chain Monte Carlo sampling interface
61
62			MCMC sampling interface to sample posterior probabilities using the
63			`emcee.EnsembleSampler`[1].
64
65			[1](https://emcee.readthedocs.io)
66
67			Arguments
68			---------
69			model : celerite.GP, george.GP or sciapy.regress.RegressionModel instance
70			The model to draw posterior samples from. It should provide either
71			`log_likelihood()` and `log_prior()` functions be directly callable
72			via `__call__()`.
73			y : (N,) array_like
74			The data to condition the probabilities on.
75			beta : float, optional
76			Tempering factor for the probability, default: 1.
77			nwalkers : int, optional
78			The number of MCMC walkers (default: 100). If this number is smaller
79			than 4 times the number of parameters, it is multiplied by the number
80			of parameters. Otherwise it specifies the number of parameters directly.
81			nburnin : int, optional
82			The number of burn-in samples to draw, default: 200.
83			nprod : int, optional
84			The number of production samples to draw, default: 800.
85			nthreads : int, optional
86			The number of threads to use with a `multiprocessing.Pool`,
87			used as `pool` for `emcee.EnsembleSampler`. Default: 1.
88			optimized : bool, optional
89			Indicate whether the actual (starting) position was determined with an
90			optimization algorithm beforehand. If `False` (the default), a
91			pre-burn-in run optimizes the starting position. Sampling continues
92			from there with the normal burn-in and production runs.
93			In that case, latin hypercube sampling is used to distribute the walker
94			starting positions equally in parameter space.
95			bounds : iterable, optional
96			The parameter bounds as a list of (min, max) entries.
97			Default: None
98			return_logpost : bool, optional
99			Indicate whether or not to return the sampled log probabilities as well.
100			Default: False
101			show_progress : bool, optional
102			Print the percentage of samples every `progress_mod` samples.
103			Default: False
104			progress_mod : int, optional
105			Interval in samples to print the percentage of samples.
106			Default: 10
107
108			Returns
109			-------
110			samples or (samples, logpost) : array_like or tuple
111			(nwalkers * nprod, ndim) array of the sampled parameters from the
112			production run if return_logpost is `False`.
113			A tuple of an (nwalkers * nprod, ndim) array (the same as above)
114			and an (nwalkers,) array with the second entry containing the
115			log posterior probabilities if return_logpost is `True`.
116			"""
117			v = model.get_parameter_vector()
			0 ignored issues – show Coding Style Naming introduced 2018-07-02 09:55 UTC by Report Bug Copy Issue Report The name `v` does not conform to the variable naming conventions (`(([a-z][a-z0-9_]{2,30})\|(_[a-z0-9_]*))$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
118			ndim = len(v)
119			if nwalkers < 4 * ndim:
120			nwalkers *= ndim
121			logging.info("MCMC parameters: %s walkers, %s burn-in samples, "
122			"%s production samples using %s threads.",
123			nwalkers, nburnin, nprod, nthreads)
124
125			if isinstance(model, celerite.GP) or isinstance(model, george.GP):
			0 ignored issues – show Unused Code introduced 2018-07-02 09:55 UTC by Report Bug Copy Issue Report Consider merging these isinstance calls to isinstance(model, (celerite.GP, george.GP)) Loading history...
126			mod_func = _lpost
127			mod_args = (model, y, beta)
128			else:
129			mod_func = model
130			mod_args = (beta,)
131
132			# Initialize the walkers.
133			if not optimized:
134			# scipy.optimize's DifferentialEvolutionSolver uses
135			# latin hypercube sampling as starting positions.
136			# We just use their initialization to avoid duplicating code.
137			if bounds is None:
138			bounds = model.get_parameter_bounds()
139			de_solver = DifferentialEvolutionSolver(_lpost,
140			bounds=bounds,
141			popsize=nwalkers // ndim)
142			# The initial population should reflect latin hypercube sampling
143			p0 = de_solver.population
			0 ignored issues – show Coding Style Naming introduced 2018-07-02 09:55 UTC by Report Bug Copy Issue Report The name `p0` does not conform to the variable naming conventions (`(([a-z][a-z0-9_]{2,30})\|(_[a-z0-9_]*))$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
144			else:
145			p0 = np.array([v + 1e-2 * np.random.randn(ndim) for _ in range(nwalkers)])
			0 ignored issues – show Coding Style Naming introduced 2018-07-02 09:55 UTC by Report Bug Copy Issue Report The name `p0` does not conform to the variable naming conventions (`(([a-z][a-z0-9_]{2,30})\|(_[a-z0-9_]*))$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
146
147			# set up the sampling pool
148			pool = Pool(processes=nthreads)
149			sampler = emcee.EnsembleSampler(nwalkers, ndim, mod_func, args=mod_args,
150			pool=pool)
151
152			logging.info("Running burn-in ({0} samples)".format(nburnin))
			0 ignored issues – show introduced 2018-07-02 09:55 UTC by Report Bug Copy Issue Report Use formatting in logging functions and pass the parameters as arguments Loading history...
153			p0, lnp0, rst0 = _sample_mcmc(sampler, nburnin, p0, None,
			0 ignored issues – show Coding Style Naming introduced 2018-08-26 18:34 UTC by Report Bug Copy Issue Report The name `p0` does not conform to the variable naming conventions (`(([a-z][a-z0-9_]{2,30})\|(_[a-z0-9_]*))$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
154			show_progress, progress_mod, debug=True)
155			logging.info("Burn-in finished.")
156
157			p = p0[np.argmax(lnp0)]
			0 ignored issues – show Coding Style Naming introduced 2018-07-02 09:55 UTC by Report Bug Copy Issue Report The name `p` does not conform to the variable naming conventions (`(([a-z][a-z0-9_]{2,30})\|(_[a-z0-9_]*))$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
158			logging.info("burn-in max logpost: %s, params: %s, exp(params): %s",
159			np.max(lnp0), p, np.exp(p))
160			model.set_parameter_vector(p)
161			logging.debug("params: %s", model.get_parameter_dict())
162			logging.debug("log_likelihood: %s", model.log_likelihood(y))
163			sampler.reset()
164
165			if not optimized:
166			p0 = [p + 1e-4 * np.random.randn(ndim) for _ in range(nwalkers)]
			0 ignored issues – show Coding Style Naming introduced 2018-07-02 09:55 UTC by Report Bug Copy Issue Report The name `p0` does not conform to the variable naming conventions (`(([a-z][a-z0-9_]{2,30})\|(_[a-z0-9_]*))$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
167			logging.info("Running second burn-in ({0} samples)".format(nburnin))
			0 ignored issues – show introduced 2018-07-02 09:55 UTC by Report Bug Copy Issue Report Use formatting in logging functions and pass the parameters as arguments Loading history...
168			p0, lnp0, rst0 = _sample_mcmc(sampler, nburnin, p0, rst0,
			0 ignored issues – show Coding Style Naming introduced 2018-08-26 18:34 UTC by Report Bug Copy Issue Report The name `p0` does not conform to the variable naming conventions (`(([a-z][a-z0-9_]{2,30})\|(_[a-z0-9_]*))$`). This check looks for invalid names for a range of different identifiers. You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements. If your project includes a Pylint configuration file, the settings contained in that file take precedence. To find out more about Pylint, please refer to their site. Loading history...
169			show_progress, progress_mod)
170			sampler.reset()
171			logging.info("Second burn-in finished.")
172
173			logging.info("Running production chain ({0} samples)".format(nprod))
			0 ignored issues – show introduced 2018-07-02 09:55 UTC by Report Bug Copy Issue Report Use formatting in logging functions and pass the parameters as arguments Loading history...
174			_sample_mcmc(sampler, nprod, p0, rst0, show_progress, progress_mod)
175			logging.info("Production run finished.")
176
177			samples = sampler.flatchain
178			lnp = sampler.flatlnprobability
179			logging.info("total samples: %s", samples.shape)
180
181			samplmean = np.mean(samples, axis=0)
182			logging.info("mean: %s, exp(mean): %s, sqrt(exp(mean)): %s",
183			samplmean, np.exp(samplmean), np.sqrt(np.exp(samplmean)))
184
185			samplmedian = np.median(samples, axis=0)
186			logging.info("median: %s, exp(median): %s, sqrt(exp(median)): %s",
187			samplmedian, np.exp(samplmedian), np.sqrt(np.exp(samplmedian)))
188
189			logging.info("max logpost: %s, params: %s, exp(params): %s",
190			np.max(lnp), samples[np.argmax(lnp)],
191			np.exp(samples[np.argmax(lnp)]))
192
193			logging.info("AIC: %s", 2 * ndim - 2 * np.max(lnp))
194			logging.info("BIC: %s", np.log(len(y)) * ndim - 2 * np.max(lnp))
195			logging.info("poor man's evidence 1 sum: %s, mean: %s", np.sum(np.exp(lnp)), np.mean(np.exp(lnp)))
196			logging.info("poor man's evidence 2 max: %s, std: %s", np.max(np.exp(lnp)), np.std(np.exp(lnp)))
197			logging.info("poor man's evidence 3: %s", np.max(np.exp(lnp)) / np.std(np.exp(lnp)))
198
199			if return_logpost:
200			return samples, lnp
201			return samples
202

st-bender / sciapy

Push — master ( 5f801b...822a99 )

sciapy.regress.mcmc._sample_mcmc() A

Complexity

Size

Duplication

Code Coverage

Importance

Duplication Side-by-Side

Filter issues like