Passed
Pull Request — main (#165)
created by Chaitanya, 01:21

asgardpy.stats.stats.fetch_pivot_energy()   A

Complexity
  Conditions: 3

Size
  Total Lines: 36
  Code Lines: 13

Duplication
  Lines: 0
  Ratio: 0 %

Importance
  Changes: 0
Metric   Value
cc       3
eloc     13
nop      1
dl       0
loc      36
rs       9.75
c        0
b        0
f        0
"""
Module for performing some statistical functions.
"""

import numpy as np
from gammapy.stats.fit_statistics import cash, wstat
from scipy.stats import chi2, norm

__all__ = [
    "check_model_preference_aic",
    "check_model_preference_lrt",
    "get_chi2_sig_pval",
    "get_goodness_of_fit_stats",
    "get_ts_target",
]

def get_chi2_sig_pval(test_stat, ndof):
    """
    Using the test statistic value from fitting a model to data, along with the
    total number of degrees of freedom, evaluate the significance of the fit in
    Gaussian sigmas, along with the one-tailed p-value of the fit statistic.

    In Gammapy, the cash statistic is used for 3D analysis, while the wstat
    statistic is used for 1D analysis. Check the documentation for more details:
    https://docs.gammapy.org/1.2/user-guide/stats/index.html

    Parameters
    ----------
    test_stat: float
        The test statistic (-2 ln L) value of the fit.
    ndof: int
        Total number of degrees of freedom.

    Returns
    -------
    chi2_sig: float
        Significance of the fit, converted from the chi-squared p-value into
        Gaussian sigmas.
    pval: float
        p-value of the model fit.
    """
    pval = chi2.sf(test_stat, ndof)
    chi2_sig = norm.isf(pval / 2)

    return chi2_sig, pval

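As a quick check of how chi2.sf and norm.isf combine here, the following is a minimal usage sketch; the test-statistic and DoF values are made up for illustration only.

# Minimal usage sketch of get_chi2_sig_pval; the numbers are made up for illustration.
from asgardpy.stats.stats import get_chi2_sig_pval

test_stat = 25.0  # hypothetical fit statistic (-2 ln L) difference
ndof = 10         # hypothetical total degrees of freedom

sig, pval = get_chi2_sig_pval(test_stat, ndof)
print(f"p-value = {pval:.3e}, significance = {sig:.2f} sigma")
# For these inputs, chi2.sf gives p ~ 5.3e-3, i.e. roughly a 2.8 sigma significance.
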
def check_model_preference_lrt(test_stat_1, test_stat_2, ndof_1, ndof_2):
    """
    Log-likelihood ratio test (LRT), checking the preference of a "nested"
    spectral model (model 2) over a primary model (model 1).

    Parameters
    ----------
    test_stat_1: float
        The test statistic (-2 ln L) of the fit result of the primary spectral model.
    test_stat_2: float
        The test statistic (-2 ln L) of the fit result of the nested spectral model.
    ndof_1: int
        Number of degrees of freedom of the primary model.
    ndof_2: int
        Number of degrees of freedom of the nested model.

    Returns
    -------
    p_value: float
        p-value of the likelihood ratio.
    gaussian_sigmas: float
        Significance of the likelihood ratio, expressed in Gaussian sigmas.
    n_dof: int
        Difference in the number of degrees of freedom (free parameters)
        between the primary and nested models.
    """
    n_dof = ndof_1 - ndof_2

    if n_dof < 1:
        print(f"DoF of the primary model ({ndof_1}) is not larger than that of the nested model ({ndof_2})")

        return np.nan, np.nan, n_dof

    gaussian_sigmas, p_value = get_chi2_sig_pval(test_stat_1 - test_stat_2, n_dof)

    return p_value, gaussian_sigmas, n_dof

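For example, to test whether an additional spectral parameter is preferred, the two fit results would be compared as in this sketch; all fit values below are hypothetical.

# LRT usage sketch; all fit values are hypothetical.
from asgardpy.stats.stats import check_model_preference_lrt

# Assumed fit outputs for a primary model (1) and a "nested" model (2) with one
# extra free parameter, hence one fewer degree of freedom.
ts_1, ndof_1 = 112.0, 40
ts_2, ndof_2 = 101.0, 39

p_value, sigmas, n_dof = check_model_preference_lrt(ts_1, ts_2, ndof_1, ndof_2)
print(f"Delta TS = {ts_1 - ts_2:.1f} over {n_dof} DoF -> {sigmas:.2f} sigma (p = {p_value:.3e})")
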
def check_model_preference_aic(list_stat, list_dof):
    """
    Akaike Information Criterion (AIC) preference over a list of stat and DoF
    (degrees of freedom) values, to get the relative likelihood of a given list
    of best-fit models.

    Parameters
    ----------
    list_stat: list
        List of stat or -2 ln L values, for a list of models.
    list_dof: list
        List of degrees of freedom, or numbers of free parameters, for a list of models.

    Returns
    -------
    list_rel_p: list
        List of relative likelihood probabilities, for a list of models.
    """
    # AIC = -2 ln L + 2 k, with k free parameters (here taken as the DoF value)
    list_aic_stat = []
    for stat, dof in zip(list_stat, list_dof, strict=True):
        aic_stat = stat + 2 * dof
        list_aic_stat.append(aic_stat)
    list_aic_stat = np.array(list_aic_stat)

    aic_stat_min = np.min(list_aic_stat)

    # Relative likelihood of each model with respect to the minimum-AIC model
    list_b_stat = []
    for aic in list_aic_stat:
        b_stat = np.exp((aic_stat_min - aic) / 2)
        list_b_stat.append(b_stat)
    list_b_stat = np.array(list_b_stat)

    # Normalize to get relative probabilities (Akaike weights)
    list_rel_p = []
    for b_stat in list_b_stat:
        rel_p = b_stat / np.sum(list_b_stat)
        list_rel_p.append(rel_p)
    list_rel_p = np.array(list_rel_p)

    return list_rel_p

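For more than two (not necessarily nested) models, the returned Akaike weights can be compared directly; a minimal sketch with made-up fit values:

# AIC-preference usage sketch; the stat and DoF values are made up.
from asgardpy.stats.stats import check_model_preference_aic

list_stat = [120.0, 112.0, 111.5]  # hypothetical -2 ln L values for three models
list_dof = [2, 3, 4]               # hypothetical free-parameter counts

rel_p = check_model_preference_aic(list_stat, list_dof)
for i, p in enumerate(rel_p):
    print(f"Model {i}: relative likelihood = {p:.3f}")
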
def get_goodness_of_fit_stats(datasets, instrument_spectral_info):
    """
    Evaluate the goodness-of-fit statistics of the model fit to the dataset.

    We first use the get_ts_target function to get the total test statistic for
    the (observed) best fit of the model to the data, and for the (expected)
    perfect fit of model and data (model = data), in the given target source
    region/pixel.

    We then evaluate the total number of degrees of freedom of the fit as the
    difference between the number of relevant energy bins used in the evaluation
    and the number of free model parameters.

    The fit statistic difference is used as the test statistic value for the
    get_chi2_sig_pval function, along with the total number of degrees of freedom,
    to get the final goodness-of-fit statistics.

    The fit statistics information is updated in the dict object provided and
    a logging message is returned.

    Parameters
    ----------
    datasets: `gammapy.datasets.Datasets`
        List of Datasets object, which can contain 3D and/or 1D datasets.
    instrument_spectral_info: dict
        Dict of information for storing relevant fit stats.

    Returns
    -------
    instrument_spectral_info: dict
        Filled dict of information with relevant fit statistics.
    stat_message: str
        String for logging the fit statistics.
    """
    stat_best_fit, stat_max_fit = get_ts_target(datasets)

    instrument_spectral_info["max_fit_stat"] = stat_max_fit
    instrument_spectral_info["best_fit_stat"] = stat_best_fit
    ndof = instrument_spectral_info["DoF"]
    stat_diff_gof = stat_best_fit - stat_max_fit

    fit_chi2_sig, fit_pval = get_chi2_sig_pval(stat_diff_gof, ndof)

    instrument_spectral_info["fit_chi2_sig"] = fit_chi2_sig
    instrument_spectral_info["fit_pval"] = fit_pval

    stat_message = "The Chi2/DoF value of the goodness of fit is "
    stat_message += f"{stat_diff_gof:.2f}/{ndof}\nand the p-value is {fit_pval:.3e} "
    stat_message += f"and in significance {fit_chi2_sig:.2f} sigmas"
    stat_message += f"\nwith best fit TS (Observed) as {stat_best_fit:.3f} "
    stat_message += f"and max fit TS (Expected) as {stat_max_fit:.3f}"

    return instrument_spectral_info, stat_message

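The dict passed in is expected to already carry the total "DoF" entry (relevant energy bins minus free model parameters). The following sketch mirrors the bookkeeping done in this function, with hypothetical test-statistic totals standing in for the get_ts_target(datasets) call:

# Sketch of the bookkeeping done in get_goodness_of_fit_stats, with hypothetical
# totals standing in for the get_ts_target(datasets) call.
from asgardpy.stats.stats import get_chi2_sig_pval

stat_best_fit, stat_max_fit = 310.4, 285.1  # assumed observed / expected TS totals
info = {"DoF": 25}  # assumed: relevant energy bins minus free model parameters

stat_diff_gof = stat_best_fit - stat_max_fit
info["fit_chi2_sig"], info["fit_pval"] = get_chi2_sig_pval(stat_diff_gof, info["DoF"])
print(f"Chi2/DoF = {stat_diff_gof:.2f}/{info['DoF']}, p-value = {info['fit_pval']:.3e}")
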
def get_ts_target(datasets):
    """
    From a given list of DL4 datasets, with associated models assumed, estimate
    the total test statistic values, in the given target source region/pixel,
    for the (observed) best fit of the model to the data, and for the (expected)
    perfect fit of model and data (model = data).

    For consistency in the evaluation of the statistic values, we will use the
    basic fit statistic functions in Gammapy for Poisson data:

    * `cash <https://docs.gammapy.org/1.2/api/gammapy.stats.cash.html>`_

    * `wstat <https://docs.gammapy.org/1.2/api/gammapy.stats.wstat.html>`_

    For the different types of statistics used in Gammapy for 3D/1D datasets,
    and for our use case of getting the best fit and the perfect fit, we will pass
    the appropriate values, by adapting to the following methods,

    * Best Fit (Observed):

        * `Cash stat_array <https://docs.gammapy.org/1.2/api/gammapy.datasets.MapDataset.html#gammapy.datasets.MapDataset.stat_array # noqa>`_

        * `Wstat stat_array <https://docs.gammapy.org/1.2/api/gammapy.datasets.MapDatasetOnOff.html#gammapy.datasets.MapDatasetOnOff.stat_array # noqa>`_

    * Perfect Fit (Expected):

        * `Cash stat_max <https://docs.gammapy.org/1.2/api/gammapy.stats.CashCountsStatistic.html#gammapy.stats.CashCountsStatistic.stat_max # noqa>`_

        * `Wstat stat_max <https://docs.gammapy.org/1.2/api/gammapy.stats.WStatCountsStatistic.html#gammapy.stats.WStatCountsStatistic.stat_max # noqa>`_

    Parameters
    ----------
    datasets: `gammapy.datasets.Datasets`
        List of Datasets object, which can contain 3D and/or 1D datasets.

    Returns
    -------
    stat_best_fit: float
        Total test statistic of the best fit of the model to data, summed
        over all energy bins.
    stat_max_fit: float
        Total test statistic of the perfect fit of the model to data, summed
        over all energy bins.
    """  # noqa
    stat_best_fit = 0
    stat_max_fit = 0

    for data in datasets:
        if data.stat_type != "chi2":
            # Assuming that the Counts Map is created with the target source as its center
            region = data.counts.geom.center_skydir

            if data.stat_type == "cash":
                counts_on = (data.counts.copy() * data.mask).get_spectrum(region).data
                mu_on = (data.npred() * data.mask).get_spectrum(region).data

                stat_best_fit += np.nansum(cash(n_on=counts_on, mu_on=mu_on).ravel())
                stat_max_fit += np.nansum(cash(n_on=counts_on, mu_on=counts_on).ravel())

            elif data.stat_type == "wstat":
                counts_on = (data.counts.copy() * data.mask).get_spectrum(region).data
                counts_off = np.nan_to_num((data.counts_off * data.mask).get_spectrum(region)).data

                # alpha is normally evaluated from the acceptance ratios, and the
                # background from the given alpha and counts_off. For alpha to have
                # the same shape (in the target region), it is re-evaluated here
                # from the background and counts_off spectra.
                bkg = np.nan_to_num((data.background * data.mask).get_spectrum(region))

                with np.errstate(invalid="ignore", divide="ignore"):
                    alpha = bkg / counts_off
                mu_signal = np.nan_to_num((data.npred_signal() * data.mask).get_spectrum(region)).data
                max_pred = counts_on - bkg

                stat_best_fit += np.nansum(wstat(n_on=counts_on, n_off=counts_off, alpha=alpha, mu_sig=mu_signal))
                stat_max_fit += np.nansum(wstat(n_on=counts_on, n_off=counts_off, alpha=alpha, mu_sig=max_pred))
        else:
            # For FluxPointsDataset
            stat_best_fit += np.nansum(data.stat_array())
            stat_max_fit += len(data.data.dnde.data)

    return stat_best_fit, stat_max_fit
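As a standalone illustration of the best-fit versus perfect-fit (model = data) totals computed above, the same Gammapy cash statistic can be evaluated directly on plain arrays; all counts below are made up:

# Standalone sketch of the "best fit" vs "perfect fit" (model = data) totals,
# using the Gammapy cash statistic on made-up count arrays.
import numpy as np
from gammapy.stats.fit_statistics import cash

counts_on = np.array([12.0, 30.0, 7.0, 3.0])  # hypothetical ON counts per energy bin
mu_on = np.array([10.5, 33.0, 6.2, 2.4])      # hypothetical best-fit predicted counts

stat_best_fit = np.nansum(cash(n_on=counts_on, mu_on=mu_on))
stat_max_fit = np.nansum(cash(n_on=counts_on, mu_on=counts_on))  # perfect fit: model = data
# The difference (best fit minus perfect fit) is the goodness-of-fit test statistic used above.
print(f"Cash: best fit = {stat_best_fit:.2f}, perfect fit = {stat_max_fit:.2f}")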