savu.plugins.stats.statistics - Code Metrics - Inspection of "Statistics" - DiamondLightSource/Savu - Measure and Improve Code Quality continuously with Scrutinizer

Test Failed

Pull Request — master (#878)

by Daniil

created 2022-03-08 17:29 UTC

savu.plugins.stats.statistics F

↳ Parent: Project

Complexity

Total Complexity

122

Size/Duplication

Total Lines	502
Duplicated Lines	0 %

Importance

Changes

Metric	Value
eloc	373
dl	0
loc	502
rs	2
c	0
b	0
f	0
wmc	122

29 Methods

Rating	Name	Size	Complexity
B	Statistics.setup()	18	6
A	Statistics._set_loop_stats()	9	1
C	Statistics._write_stats_to_file()	31	11
B	Statistics.get_stats_from_dataset()	29	7
A	Statistics._delete_stats_metadata()	3	1
B	Statistics._set_pattern_info()	20	8
A	Statistics._get_unpadded_slice_list()	15	3
A	Statistics.get_stats_from_name()	19	2
A	Statistics.set_stats_residuals()	5	1
A	Statistics.rmsd_from_rss()	2	1
A	Statistics.calc_rmsd()	8	2
A	Statistics.calc_rss()	11	2
C	Statistics.get_stats()	41	11
A	Statistics._array_to_dict()	5	2
A	Statistics._combine_mpi_stats()	8	3
A	Statistics.calc_volume_stats()	14	2
B	Statistics.write_slice_stats_to_file()	26	6
A	Statistics._setup_iterative()	7	3
A	Statistics._count()	3	1
A	Statistics._post_chain()	5	2
A	Statistics.calc_stats_residuals()	5	2
B	Statistics._unpad_slice()	20	6
A	Statistics._link_stats_to_datasets()	17	5
A	Statistics.calc_slice_stats()	21	5
A	Statistics._de_list()	7	3
A	Statistics.__init__()	7	1
A	Statistics.set_slice_stats()	8	4
D	Statistics.set_volume_stats()	48	13
B	Statistics._setup_class()	31	8

How to fix Complexity

"""
.. module:: statistics
   :platform: Unix
   :synopsis: Contains and processes statistics information for each plugin.

.. moduleauthor::Jacob Williamson <[email protected]>

"""

from savu.plugins.savers.utils.hdf5_utils import Hdf5Utils
from savu.plugins.stats.stats_utils import StatsUtils
from savu.core.iterate_plugin_group_utils import check_if_in_iterative_loop

import h5py as h5
import numpy as np
import os
from mpi4py import MPI


class Statistics(object):
    """Collects per-slice statistics for a plugin and combines them into volume-wide statistics.

    State shared by the whole plugin chain (plugin counter, stats per plugin number, output
    path, ...) is stored in class attributes that ``_setup_class`` creates the first time
    ``setup`` is called.
    """
    # Data pattern names; _set_pattern_info only enables stats when an output dataset has one of these.
    _pattern_list = ["SINOGRAM", "PROJECTION", "TANGENTOGRAM", "VOLUME_YZ", "VOLUME_XZ", "VOLUME_XY", "VOLUME_3D", "4D_SCAN", "SINOMOVIE"]
    # Names of plugins for which setup() switches stats calculation off.
    _no_stats_plugins = ["BasicOperations", "Mipmap"]
    # Names given, in order, to the entries of a volume stats array (see _array_to_dict).
    _key_list = ["max", "min", "mean", "mean_std_dev", "median_std_dev", "NRMSD"]
    #_savers = ["Hdf5Saver", "ImageSaver", "MrcSaver", "TiffSaver", "XrfSaver"]
    # Set to True by _setup_class so that the class-wide setup only runs once.
    _has_setup = False


    def __init__(self):
        self.calc_stats = True
        self.stats = {'max': [], 'min': [], 'mean': [], 'std_dev': [], 'RSS': [], 'data_points': []}
        self.stats_before_processing = {'max': [], 'min': [], 'mean': [], 'std_dev': []}
        self.residuals = {'max': [], 'min': [], 'mean': [], 'std_dev': []}
        self._repeat_count = 0
        self.p_num = None

    def setup(self, plugin_self, pattern=None):
        """Prepares this statistics object for one plugin.

        :param plugin_self: The plugin instance; its ``name`` and ``exp`` attributes are read here.
        :param pattern: Data pattern to use. When not given (or falsy) the pattern is worked out
            by ``_set_pattern_info``, which may also switch stats calculation off.
        """
        if not Statistics._has_setup:
            self._setup_class(plugin_self.exp)
        name = plugin_self.name
        self.plugin_name = name
        if name in Statistics._no_stats_plugins:
            self.calc_stats = False
        if self.calc_stats:
            self.plugin = plugin_self
            self._pad_dims = []
            self._already_called = False
            self.p_num = Statistics.count
            if not pattern:
                self._set_pattern_info()
            else:
                self.pattern = pattern
            # _set_pattern_info may have disabled stats again, so re-check before flagging.
            if self.calc_stats:
                Statistics._any_stats = True
        self._setup_iterative()

    def _setup_iterative(self):
        """Records the iterative loop group (if any) and the index of that loop's stats entry."""
        group = check_if_in_iterative_loop(Statistics.exp)
        self._iterative_group = group
        if not group:
            return
        if group.start_index == Statistics.count:
            # The current plugin number is the loop's start index: open a new loop_stats entry.
            Statistics._loop_counter += 1
            Statistics.loop_stats.append({"NRMSD": np.array([])})
        self.l_num = Statistics._loop_counter - 1

    @classmethod
    def _setup_class(cls, exp):
        """Sets up the statistics class for the whole plugin chain (only called once).

        :param exp: The experiment object. Its meta data supplies the optional "stats" switch,
            the plugin list and the output path ('out_path').
        """
        # Stats are on unless the experiment explicitly says "off". A missing or unrecognised
        # value therefore still leaves _stats_flag defined, which _post_chain relies on.
        try:
            stats_setting = exp.meta_data.get("stats")
        except KeyError:
            stats_setting = "on"
        cls._stats_flag = stats_setting != "off"
        cls._any_stats = False
        # NOTE(review): the counter starts at 2 - presumably plugin 1 is the loader; confirm.
        cls.count = 2
        cls.loop_stats = []
        cls.exp = exp
        cls.n_plugins = len(exp.meta_data.plugin_list.plugin_list)
        # One (initially empty) stats array per plugin number.
        cls.global_stats = {i: np.array([]) for i in range(1, cls.n_plugins + 1)}
        cls.global_residuals = {}
        cls.plugin_numbers = {}
        cls.plugin_names = {}
        cls._loop_counter = 0
        cls._RMSD = True
        # os.path.join copes with a trailing separator and with an empty out_path.
        cls.path = os.path.join(exp.meta_data['out_path'], "stats")
        if MPI.COMM_WORLD.rank == 0:
            # Only rank 0 creates the folder; exist_ok avoids a check-then-create race.
            os.makedirs(cls.path, exist_ok=True)
        cls._has_setup = True

    def get_stats(self, p_num=None, stat=None, instance=-1):
        """Returns stats associated with a certain plugin, given the plugin number (its place in the process list).

        :param p_num: Plugin  number of the plugin whose associated stats are being fetched.
            If p_num <= 0, it is relative to the plugin number of the current plugin being run.
            E.g current plugin number = 5, p_num = -2 --> will return stats of the third plugin.
            By default will gather stats for the current plugin.
        :param stat: Specify the stat parameter you want to fetch, i.e 'max', 'mean', 'median_std_dev'.
            If left blank will return the whole dictionary of stats:
            {'max': , 'min': , 'mean': , 'mean_std_dev': , 'median_std_dev': , 'NRMSD' }
        :param instance: In cases where there are multiple set of stats associated with a plugin
            due to loops or multi-parameters, specify which set you want to retrieve, i.e 3 to retrieve the
            stats associated with the third run of a plugin. Pass 'all' to get a list of all sets.
            By default will retrieve the most recent set.
        """
        if p_num is None:
            p_num = self.p_num
        if p_num <= 0:
            try:
                p_num = self.p_num + p_num
            except TypeError:
                p_num = Statistics.count + p_num
        if Statistics.global_stats[p_num].ndim == 1 and instance in (None, 0, 1, -1, "all"):
            stats_array = Statistics.global_stats[p_num]
        else:
            if instance == "all":
                stats_list = [self.get_stats(p_num, stat=stat, instance=1)]
                n = 2
                if Statistics.global_stats[p_num].ndim != 1:
                    while n <= len(Statistics.global_stats[p_num]):
                        stats_list.append(self.get_stats(p_num, stat=stat, instance=n))
                        n += 1
                return stats_list
            if instance > 0:
                instance -= 1
            stats_array = Statistics.global_stats[p_num][instance]
        stats_dict = self._array_to_dict(stats_array)
        if stat is not None:
            return stats_dict[stat]
        else:
            return stats_dict

    def get_stats_from_name(self, plugin_name, n=None, stat=None, instance=-1):
        """Returns stats associated with a certain plugin.

        :param plugin_name: name of the plugin whose associated stats are being fetched.
        :param n: In a case where there are multiple instances of **plugin_name** in the process list,
            specify the nth instance. Not specifying will select the first (or only) instance.
        :param stat: Specify the stat parameter you want to fetch, i.e 'max', 'mean', 'median_std_dev'.
            If left blank will return the whole dictionary of stats:
            {'max': , 'min': , 'mean': , 'mean_std_dev': , 'median_std_dev': , 'NRMSD' }
        :param instance: In cases where there are multiple set of stats associated with a plugin
            due to iterative loops or multi-parameters, specify which set you want to retrieve, i.e 3 to retrieve the
            stats associated with the third run of a plugin. Pass 'all' to get a list of all sets.
            By default will retrieve the most recent set.
        """
        name = plugin_name
        if n not in (None, 0, 1):
            name = name + str(n)
        p_num = Statistics.plugin_numbers[name]
        return self.get_stats(p_num, stat, instance)

    def get_stats_from_dataset(self, dataset, stat=None, instance=-1):
        """Returns stats associated with a dataset.

        :param dataset: The dataset whose associated stats are being fetched.
        :param stat: Specify the stat parameter you want to fetch, i.e 'max', 'mean', 'median_std_dev'.
            If left blank will return the whole dictionary of stats:
            {'max': , 'min': , 'mean': , 'mean_std_dev': , 'median_std_dev': , 'NRMSD'}
        :param instance: In cases where there are multiple set of stats associated with a dataset
            due to iterative loops or multi-parameters, specify which set you want to retrieve, i.e 3 to retrieve the
            stats associated with the third run of a plugin. Pass 'all' to get a list of all sets.
            By default will retrieve the most recent set.
        """
        stats_list = [dataset.meta_data.get("stats")]
        n = 2
        while ("stats" + str(n)) in list(dataset.meta_data.get_dictionary().keys()):
            stats_list.append(dataset.meta_data.get("stats" + str(n)))
            n += 1
        if stat:
            for i in range(len(stats_list)):
                stats_list[i] = stats_list[i][stat]
        if instance in (None, 0, 1):
            stats = stats_list[0]
        elif instance == "all":
            stats = stats_list
        else:
            if instance >= 2:
                instance -= 1
            stats = stats_list[instance]
        return stats

    def set_slice_stats(self, my_slice, base_slice=None, pad=True):
        slice_stats_after = self.calc_slice_stats(my_slice, base_slice, pad=pad)
        if base_slice:
            slice_stats_before = self.calc_slice_stats(base_slice, pad=pad)
            for key in list(self.stats_before_processing.keys()):
                self.stats_before_processing[key].append(slice_stats_before[key])
        for key in list(self.stats.keys()):
            self.stats[key].append(slice_stats_after[key])

    def calc_slice_stats(self, my_slice, base_slice=None, pad=True):
        """Calculates and returns slice stats for the current slice.

        :param my_slice: The slice whose stats are being calculated.
        :param base_slice: Provide a base slice to calculate residuals from, to calculate RMSD.
        """
        if my_slice is not None:
            my_slice = self._de_list(my_slice)
            if pad:
                my_slice = self._unpad_slice(my_slice)
            slice_stats = {'max': np.amax(my_slice).astype('float64'), 'min': np.amin(my_slice).astype('float64'),
                           'mean': np.mean(my_slice), 'std_dev': np.std(my_slice), 'data_points': my_slice.size}
            if base_slice is not None and self._RMSD:
                base_slice = self._de_list(base_slice)
                base_slice = self._unpad_slice(base_slice)
                rss = self.calc_rss(my_slice, base_slice)
            else:
                rss = None
            slice_stats['RSS'] = rss
            return slice_stats
        return None

    def calc_rss(self, array1, array2):  # residual sum of squares # very slow needs looking at
        if array1.shape == array2.shape:
            residuals = np.subtract(array1, array2)
            rss = 0
            #for value in (np.nditer(residuals)):
            #    rss += value**2
            rss = np.sum(value for value in np.nditer(residuals))
        else:
            #print("Warning: cannot calculate RSS, arrays different sizes.")
            rss = None
        return rss

    def rmsd_from_rss(self, rss, n):
        return np.sqrt(rss/n)

    def calc_rmsd(self, array1, array2):
        if array1.shape == array2.shape:
            rss = self.calc_rss(array1, array2)
            rmsd = self.rmsd_from_rss(rss, array1.size)
        else:
            print("Warning: cannot calculate RMSD, arrays different sizes.")  # need to make this an actual warning
            rmsd = None
        return rmsd

    def calc_stats_residuals(self, stats_before, stats_after):
        residuals = {'max': None, 'min': None, 'mean': None, 'std_dev': None}
        for key in list(residuals.keys()):
            residuals[key] = stats_after[key] - stats_before[key]
        return residuals

    def set_stats_residuals(self, residuals):
        self.residuals['max'].append(residuals['max'])
        self.residuals['min'].append(residuals['min'])
        self.residuals['mean'].append(residuals['mean'])
        self.residuals['std_dev'].append(residuals['std_dev'])

    def calc_volume_stats(self, slice_stats):
        volume_stats = np.array([max(slice_stats['max']), min(slice_stats['min']), np.mean(slice_stats['mean']),
                                np.mean(slice_stats['std_dev']), np.median(slice_stats['std_dev'])])
        if None not in slice_stats['RSS']:
            total_rss = sum(slice_stats['RSS'])
            n = sum(slice_stats['data_points'])
            RMSD = self.rmsd_from_rss(total_rss, n)
            the_range = volume_stats[0] - volume_stats[1]
            NRMSD = RMSD / the_range  # normalised RMSD (dividing by the range)
            volume_stats = np.append(volume_stats, NRMSD)
        else:
            #volume_stats = np.append(volume_stats, None)
            pass
        return volume_stats

    def _set_loop_stats(self):
        """Appends the NRMSD between the two iterating datasets to this loop's stats entry."""
        # NEED TO CHANGE THIS - MUST USE SLICES
        group = self._iterative_group
        iterating = group._ip_data_dict["iterating"]
        data_obj1 = list(iterating.keys())[0]
        data_obj2 = iterating[data_obj1]
        rmsd = self.calc_rmsd(data_obj1.data, data_obj2.data)
        # Normalise by the data range (max - min) recorded for the current iteration.
        stat_max = self.get_stats(self.p_num, stat="max", instance=group._ip_iteration)
        stat_min = self.get_stats(self.p_num, stat="min", instance=group._ip_iteration)
        nrmsd = rmsd / (stat_max - stat_min)
        loop_entry = Statistics.loop_stats[self.l_num]
        loop_entry["NRMSD"] = np.append(loop_entry["NRMSD"], nrmsd)

    def set_volume_stats(self):
        """Calculates volume-wide statistics from slice stats, and updates class-wide arrays with these values.

        Registers the plugin's name and number, links the volume stats with the output dataset
        and writes the volume stats to file.
        """
        combined_stats = self._combine_mpi_stats(self.stats)
        if not self.p_num:
            self.p_num = Statistics.count
        p_num = self.p_num
        group = self._iterative_group
        name = self.plugin_name
        # An unused name ("<name>", "<name>2", ...) is only searched for outside a loop or on
        # a loop's first iteration; otherwise the plain plugin name is used.
        if not group or group._ip_iteration == 0:
            suffix = 2
            while name in Statistics.plugin_numbers:
                name = self.plugin_name + str(suffix)
                suffix += 1

        if p_num not in Statistics.plugin_names:
            Statistics.plugin_names[p_num] = name
        Statistics.plugin_numbers[name] = p_num
        if len(self.stats['max']) != 0:
            stats_array = self.calc_volume_stats(combined_stats)
            Statistics.global_residuals[p_num] = {}

            # The first set is stored as a 1-D array; later sets are stacked as extra rows.
            if len(Statistics.global_stats[p_num]) == 0:
                Statistics.global_stats[p_num] = stats_array
            else:
                Statistics.global_stats[p_num] = np.vstack([Statistics.global_stats[p_num], stats_array])

            self._link_stats_to_datasets(self._array_to_dict(stats_array), group)

        if group:
            if group.end_index == p_num and group._ip_iteration != 0:
                # Loop stats are currently switched off.
                #self._set_loop_stats()
                pass

        self._write_stats_to_file(p_num)
        self._already_called = True
        self._repeat_count += 1
        if group:
            # Start with empty slice stats for the next iteration of the loop.
            self.stats = {'max': [], 'min': [], 'mean': [], 'std_dev': [], 'RSS': [], 'data_points': []}



    def _combine_mpi_stats(self, slice_stats):
        """Gathers the slice stats of every MPI process and concatenates them key by key.

        :param slice_stats: This process's dictionary of per-slice lists.
        :returns: Dictionary whose lists hold the entries of all processes, in gather order.
        """
        combined_stats = {'max': [], 'min': [], 'mean': [], 'std_dev': [], 'RSS': [], 'data_points': []}
        for process_stats in MPI.COMM_WORLD.allgather(slice_stats):
            for key, values in process_stats.items():
                combined_stats[key].extend(values)
        return combined_stats

    def _array_to_dict(self, stats_array):
        stats_dict = {}
        for i, value in enumerate(stats_array):
            stats_dict[Statistics._key_list[i]] = value
        return stats_dict

    def _set_pattern_info(self):
        """Gathers information about the pattern of the data in the current plugin."""
        out_datasets = self.plugin.get_out_datasets()
        try:
            self.pattern = self.plugin.parameters['pattern']
            if self.pattern == None:
                raise KeyError
        except KeyError:
            if not out_datasets:
                self.pattern = None
            else:
                patterns = out_datasets[0].get_data_patterns()
                for pattern in patterns:
                    if 1 in patterns.get(pattern)["slice_dims"]:
                        self.pattern = pattern
                        break
        self.calc_stats = False
        for dataset in out_datasets:
            if bool(set(Statistics._pattern_list) & set(dataset.data_info.get("data_patterns"))):
                self.calc_stats = True

    def _link_stats_to_datasets(self, stats_dict, iterative=False):
        """Links the volume wide statistics to the output dataset(s)"""
        out_dataset = self.plugin.get_out_datasets()[0]
        my_dataset = out_dataset
        if iterative:
            if "itr_clone" in out_dataset.group_name:
                my_dataset = list(iterative._ip_data_dict["iterating"].keys())[0]
        n_datasets = self.plugin.nOutput_datasets()

        i = 2
        group_name = "stats"
        #out_dataset.data_info.set([group_name], stats)
        while group_name in list(my_dataset.meta_data.get_dictionary().keys()):
            group_name = f"stats{i}"
            i += 1
        for key in list(stats_dict.keys()):
            my_dataset.meta_data.set([group_name, key], stats_dict[key])

    def _delete_stats_metadata(self, plugin):
        out_dataset = plugin.get_out_datasets()[0]
        out_dataset.meta_data.delete("stats")

    def _write_stats_to_file(self, p_num=None, plugin_name=None):
        """Writes the volume stats of a plugin (and loop stats, if relevant) to stats.h5.

        :param p_num: Plugin number whose stats are written; defaults to ``self.p_num``.
        :param plugin_name: Name stored with the stats; defaults to the name registered for
            ``p_num`` in ``plugin_names``.
        """
        if p_num is None:
            p_num = self.p_num
        if plugin_name is None:
            plugin_name = self.plugin_names[p_num]
        path = Statistics.path
        filename = f"{path}/stats.h5"
        stats = self.global_stats[p_num]
        self.hdf5 = Hdf5Utils(self.exp)
        # The file is opened with the MPI-IO driver on COMM_WORLD.
        with h5.File(filename, "a", driver="mpio", comm=MPI.COMM_WORLD) as h5file:
            group = h5file.require_group("stats")
            # Shape (0,) is the empty placeholder created by _setup_class: nothing to write.
            if stats.shape != (0,):
                # Replace any dataset previously written for this plugin number.
                if str(p_num) in list(group.keys()):
                    del group[str(p_num)]
                dataset = group.create_dataset(str(p_num), shape=stats.shape, dtype=stats.dtype)
                dataset[::] = stats[::]
                dataset.attrs.create("plugin_name", plugin_name)
                dataset.attrs.create("pattern", self.pattern)
            if self._iterative_group:
                l_stats = Statistics.loop_stats[self.l_num]
                group1 = h5file.require_group("iterative")
                # Loop stats are only written for the loop's end plugin, on the iteration
                # numbered _ip_fixed_iterations - 1.
                if self._iterative_group._ip_iteration == self._iterative_group._ip_fixed_iterations - 1\
                        and self.p_num == self._iterative_group.end_index:
                    dataset1 = group1.create_dataset(str(self.l_num), shape=l_stats["NRMSD"].shape, dtype=l_stats["NRMSD"].dtype)
                    dataset1[::] = l_stats["NRMSD"][::]
                    # Names of the plugins inside the loop that have a registered name.
                    loop_plugins = []
                    for i in range(self._iterative_group.start_index, self._iterative_group.end_index + 1):
                        if i in list(self.plugin_names.keys()):
                            loop_plugins.append(self.plugin_names[i])
                    dataset1.attrs.create("loop_plugins", loop_plugins)
                    # NOTE(review): this sets the attribute on ``dataset`` (the per-plugin stats
                    # dataset), which is unbound when ``stats`` is empty; ``dataset1`` may have
                    # been intended - confirm against the code that reads "n_loop_plugins".
                    dataset.attrs.create("n_loop_plugins", len(loop_plugins))


    def write_slice_stats_to_file(self, slice_stats=None, p_num=None):
        """Writes slice statistics to a h5 file. Placed in the stats folder in the output directory.

        :param slice_stats: Dictionary of per-slice lists to write; defaults to ``self.stats``.
        :param p_num: Plugin number used in the file name; when not given the class-wide
            counter and this object's plugin name are used.
        """
        if not slice_stats:
            slice_stats = self.stats
        if not p_num:
            p_num = self.count
            plugin_name = self.plugin_name
        else:
            plugin_name = self.plugin_names[p_num]
        # Gather the slice stats of all MPI processes before writing.
        combined_stats = self._combine_mpi_stats(slice_stats)
        slice_stats_arrays = {}
        datasets = {}
        path = Statistics.path
        filename = f"{path}/stats_p{p_num}_{plugin_name}.h5"
        self.hdf5 = Hdf5Utils(self.plugin.exp)
        with h5.File(filename, "a", driver="mpio", comm=MPI.COMM_WORLD) as h5file:
            # Use the first group name not yet in the file: "/stats", "/stats2", "/stats3", ...
            i = 2
            group_name = "/stats"
            while group_name in h5file:
                group_name = f"/stats{i}"
                i += 1
            group = h5file.create_group(group_name, track_order=None)
            # One 1-D dataset per stat, holding one value per slice.
            for key in list(combined_stats.keys()):
                slice_stats_arrays[key] = np.array(combined_stats[key])
                datasets[key] = self.hdf5.create_dataset_nofill(group, key, (len(slice_stats_arrays[key]),), slice_stats_arrays[key].dtype)
                datasets[key][::] = slice_stats_arrays[key]

    def _unpad_slice(self, slice1):
        """If data is padded in the slice dimension, removes this pad."""
        out_datasets = self.plugin.get_out_datasets()
        if len(out_datasets) == 1:
            out_dataset = out_datasets[0]
        else:
            for dataset in out_datasets:
                if self.pattern in list(dataset.data_info.get(["data_patterns"]).keys()):
                    out_dataset = dataset
                    break
        slice_dims = out_dataset.get_slice_dimensions()

        if self.plugin.pcount == 0:
            self._slice_list, self._pad = self._get_unpadded_slice_list(slice1, slice_dims)
        if self._pad:
            #for slice_dim in slice_dims:
            slice_dim = slice_dims[0]
            temp_slice = np.swapaxes(slice1, 0, slice_dim)
            temp_slice = temp_slice[self._slice_list[slice_dim]]
            slice1 = np.swapaxes(temp_slice, 0, slice_dim)
        return slice1

    def _get_unpadded_slice_list(self, slice1, slice_dims):
        """Creates slice object(s) to un-pad slices in the slice dimension(s)."""
        slice_list = list(self.plugin.slice_list[0])
        pad = False
        if len(slice_list) == len(slice1.shape):
            #for i in slice_dims:
            i = slice_dims[0]
            slice_width = self.plugin.slice_list[0][i].stop - self.plugin.slice_list[0][i].start
            if slice_width != slice1.shape[i]:
                pad = True
                pad_width = (slice1.shape[i] - slice_width) // 2  # Assuming symmetrical padding
                slice_list[i] = slice(pad_width, pad_width + 1, 1)
            return tuple(slice_list), pad
        else:
            return self.plugin.slice_list[0], pad

    def _de_list(self, slice1):
        """If the slice is in a list, remove it from that list."""
        if type(slice1) == list:
            if len(slice1) != 0:
                slice1 = slice1[0]
                slice1 = self._de_list(slice1)
        return slice1


    @classmethod
    def _count(cls):
        cls.count += 1

    @classmethod
    def _post_chain(cls):
        if cls._any_stats & cls._stats_flag:
            stats_utils = StatsUtils()
            stats_utils.generate_figures(f"{cls.path}/stats.h5", cls.path)


1			"""
2			.. module:: statistics
3			:platform: Unix
4			:synopsis: Contains and processes statistics information for each plugin.
5
6			.. moduleauthor::Jacob Williamson <[email protected]>
7
8			"""
9
10			from savu.plugins.savers.utils.hdf5_utils import Hdf5Utils
11			from savu.plugins.stats.stats_utils import StatsUtils
12			from savu.core.iterate_plugin_group_utils import check_if_in_iterative_loop
13
14			import h5py as h5
15			import numpy as np
16			import os
17			from mpi4py import MPI
18
19
20			class Statistics(object):
21			_pattern_list = ["SINOGRAM", "PROJECTION", "TANGENTOGRAM", "VOLUME_YZ", "VOLUME_XZ", "VOLUME_XY", "VOLUME_3D", "4D_SCAN", "SINOMOVIE"]
22			_no_stats_plugins = ["BasicOperations", "Mipmap"]
23			_key_list = ["max", "min", "mean", "mean_std_dev", "median_std_dev", "NRMSD"]
24			#_savers = ["Hdf5Saver", "ImageSaver", "MrcSaver", "TiffSaver", "XrfSaver"]
25			_has_setup = False
26
27
28			def __init__(self):
29			self.calc_stats = True
30			self.stats = {'max': [], 'min': [], 'mean': [], 'std_dev': [], 'RSS': [], 'data_points': []}
31			self.stats_before_processing = {'max': [], 'min': [], 'mean': [], 'std_dev': []}
32			self.residuals = {'max': [], 'min': [], 'mean': [], 'std_dev': []}
33			self._repeat_count = 0
34			self.p_num = None
35
36			def setup(self, plugin_self, pattern=None):
37			if not Statistics._has_setup:
38			self._setup_class(plugin_self.exp)
39			self.plugin_name = plugin_self.name
40			if plugin_self.name in Statistics._no_stats_plugins:
41			self.calc_stats = False
42			if self.calc_stats:
43			self.plugin = plugin_self
44			self._pad_dims = []
45			self._already_called = False
46			self.p_num = Statistics.count
47			if pattern:
48			self.pattern = pattern
49			else:
50			self._set_pattern_info()
51			if self.calc_stats:
52			Statistics._any_stats = True
53			self._setup_iterative()
54
55			def _setup_iterative(self):
56			self._iterative_group = check_if_in_iterative_loop(Statistics.exp)
57			if self._iterative_group:
58			if self._iterative_group.start_index == Statistics.count:
59			Statistics._loop_counter += 1
60			Statistics.loop_stats.append({"NRMSD": np.array([])})
61			self.l_num = Statistics._loop_counter - 1
62
63			@classmethod
64			def _setup_class(cls, exp):
65			"""Sets up the statistics class for the whole plugin chain (only called once)"""
66			try:
67			if exp.meta_data.get("stats") == "on":
68			cls._stats_flag = True
69			elif exp.meta_data.get("stats") == "off":
70			cls._stats_flag = False
71			except KeyError:
72			cls._stats_flag = True
73			cls._any_stats = False
74			cls.count = 2
75			cls.global_stats = {}
76			cls.loop_stats = []
77			cls.exp = exp
78			cls.n_plugins = len(exp.meta_data.plugin_list.plugin_list)
79			for i in range(1, cls.n_plugins + 1):
80			cls.global_stats[i] = np.array([])
81			cls.global_residuals = {}
82			cls.plugin_numbers = {}
83			cls.plugin_names = {}
84			cls._loop_counter = 0
85			cls._RMSD = True
86			cls.path = exp.meta_data['out_path']
87			if cls.path[-1] == '/':
88			cls.path = cls.path[0:-1]
89			cls.path = f"{cls.path}/stats"
90			if MPI.COMM_WORLD.rank == 0:
91			if not os.path.exists(cls.path):
92			os.mkdir(cls.path)
93			cls._has_setup = True
94
95			def get_stats(self, p_num=None, stat=None, instance=-1):
96			"""Returns stats associated with a certain plugin, given the plugin number (its place in the process list).
97
98			:param p_num: Plugin number of the plugin whose associated stats are being fetched.
99			If p_num <= 0, it is relative to the plugin number of the current plugin being run.
100			E.g current plugin number = 5, p_num = -2 --> will return stats of the third plugin.
101			By default will gather stats for the current plugin.
102			:param stat: Specify the stat parameter you want to fetch, i.e 'max', 'mean', 'median_std_dev'.
103			If left blank will return the whole dictionary of stats:
104			{'max': , 'min': , 'mean': , 'mean_std_dev': , 'median_std_dev': , 'NRMSD' }
105			:param instance: In cases where there are multiple set of stats associated with a plugin
106			due to loops or multi-parameters, specify which set you want to retrieve, i.e 3 to retrieve the
107			stats associated with the third run of a plugin. Pass 'all' to get a list of all sets.
108			By default will retrieve the most recent set.
109			"""
110			if p_num is None:
111			p_num = self.p_num
112			if p_num <= 0:
113			try:
114			p_num = self.p_num + p_num
115			except TypeError:
116			p_num = Statistics.count + p_num
117			if Statistics.global_stats[p_num].ndim == 1 and instance in (None, 0, 1, -1, "all"):
118			stats_array = Statistics.global_stats[p_num]
119			else:
120			if instance == "all":
121			stats_list = [self.get_stats(p_num, stat=stat, instance=1)]
122			n = 2
123			if Statistics.global_stats[p_num].ndim != 1:
124			while n <= len(Statistics.global_stats[p_num]):
125			stats_list.append(self.get_stats(p_num, stat=stat, instance=n))
126			n += 1
127			return stats_list
128			if instance > 0:
129			instance -= 1
130			stats_array = Statistics.global_stats[p_num][instance]
131			stats_dict = self._array_to_dict(stats_array)
132			if stat is not None:
133			return stats_dict[stat]
134			else:
135			return stats_dict
136
137			def get_stats_from_name(self, plugin_name, n=None, stat=None, instance=-1):
138			"""Returns stats associated with a certain plugin.
139
140			:param plugin_name: name of the plugin whose associated stats are being fetched.
141			:param n: In a case where there are multiple instances of plugin_name in the process list,
142			specify the nth instance. Not specifying will select the first (or only) instance.
143			:param stat: Specify the stat parameter you want to fetch, i.e 'max', 'mean', 'median_std_dev'.
144			If left blank will return the whole dictionary of stats:
145			{'max': , 'min': , 'mean': , 'mean_std_dev': , 'median_std_dev': , 'NRMSD' }
146			:param instance: In cases where there are multiple set of stats associated with a plugin
147			due to iterative loops or multi-parameters, specify which set you want to retrieve, i.e 3 to retrieve the
148			stats associated with the third run of a plugin. Pass 'all' to get a list of all sets.
149			By default will retrieve the most recent set.
150			"""
151			name = plugin_name
152			if n not in (None, 0, 1):
153			name = name + str(n)
154			p_num = Statistics.plugin_numbers[name]
155			return self.get_stats(p_num, stat, instance)
156
def get_stats_from_dataset(self, dataset, stat=None, instance=-1):
    """Return the statistics stored in the metadata of a dataset.

    Stats sets are read from the metadata entries "stats", "stats2",
    "stats3", ... in that order, stopping at the first missing entry.

    :param dataset: The dataset whose associated stats are being fetched.
    :param stat: Name of a single stat to fetch, i.e 'max', 'mean',
        'median_std_dev'. If left blank the whole dictionary of stats is
        returned.
    :param instance: Which set of stats to retrieve when there are several
        (iterative loops or multi-parameters). None, 0 and 1 all select the
        first set, an integer of 2 or more selects that set counting from
        one, a negative integer counts back from the most recent set, and
        'all' returns a list of every set. Defaults to the most recent set.
    :returns: One stats set (or one value when ``stat`` is given), or a
        list of them when ``instance`` is 'all'.
    """
    meta_data = dataset.meta_data
    collected = [meta_data.get("stats")]
    suffix = 2
    while f"stats{suffix}" in meta_data.get_dictionary().keys():
        collected.append(meta_data.get(f"stats{suffix}"))
        suffix += 1
    if stat:
        collected = [entry[stat] for entry in collected]

    if instance in (None, 0, 1):
        return collected[0]
    if instance == "all":
        return collected
    # Instances of 2 and above are one-based; anything else is used as-is.
    index = instance - 1 if instance >= 2 else instance
    return collected[index]
186
def set_slice_stats(self, my_slice, base_slice=None, pad=True):
    """Calculate the stats of a slice and append them to the running lists.

    :param my_slice: The slice whose stats are appended to ``self.stats``.
    :param base_slice: Optional slice from before processing. When truthy,
        its stats are appended to ``self.stats_before_processing``.
    :param pad: Forwarded to ``calc_slice_stats``.
    """
    after = self.calc_slice_stats(my_slice, base_slice, pad=pad)
    if base_slice:
        before = self.calc_slice_stats(base_slice, pad=pad)
        for name, values in self.stats_before_processing.items():
            values.append(before[name])
    for name, values in self.stats.items():
        values.append(after[name])
195
def calc_slice_stats(self, my_slice, base_slice=None, pad=True):
    """Calculates and returns slice stats for the current slice.

    :param my_slice: The slice whose stats are being calculated.
    :param base_slice: Provide a base slice to calculate residuals from, to
        calculate RMSD. Only used when ``self._RMSD`` is truthy.
    :param pad: When truthy, ``my_slice`` is passed through
        ``_unpad_slice`` first. The base slice is always passed through it.
    :returns: A dictionary with the keys 'max', 'min', 'mean', 'std_dev',
        'data_points' and 'RSS' (None when no residuals were calculated),
        or None when ``my_slice`` is None.
    """
    if my_slice is None:
        return None

    data = self._de_list(my_slice)
    if pad:
        data = self._unpad_slice(data)
    summary = {
        'max': np.amax(data).astype('float64'),
        'min': np.amin(data).astype('float64'),
        'mean': np.mean(data),
        'std_dev': np.std(data),
        'data_points': data.size,
    }

    rss = None
    if base_slice is not None and self._RMSD:
        reference = self._unpad_slice(self._de_list(base_slice))
        rss = self.calc_rss(data, reference)
    summary['RSS'] = rss
    return summary
217
def calc_rss(self, array1, array2):
    """Return the residual sum of squares (RSS) between two arrays.

    The residuals are computed in float64 so that unsigned integer inputs
    do not wrap around on subtraction, and the squaring and summation are
    vectorised instead of iterating over every element in Python.

    :param array1: First array.
    :param array2: Second array, must have the same shape as ``array1``.
    :returns: The sum of the squared element-wise differences, or None when
        the two arrays have different shapes.
    """
    if array1.shape != array2.shape:
        return None
    residuals = np.subtract(array1, array2, dtype=np.float64)
    return np.sum(np.square(residuals))
229
def rmsd_from_rss(self, rss, n):
    """Return the root-mean-square deviation for a residual sum of squares.

    :param rss: Residual sum of squares.
    :param n: Number of data points the RSS was accumulated over.
    :returns: The square root of ``rss / n``.
    """
    mean_square = rss / n
    return np.sqrt(mean_square)
232
def calc_rmsd(self, array1, array2):
    """Return the root-mean-square deviation between two arrays.

    :param array1: First array.
    :param array2: Second array.
    :returns: The RMSD, or None (after printing a warning) when the two
        arrays have different shapes.
    """
    if array1.shape != array2.shape:
        print("Warning: cannot calculate RMSD, arrays different sizes.")  # need to make this an actual warning
        return None
    residual_sum = self.calc_rss(array1, array2)
    return self.rmsd_from_rss(residual_sum, array1.size)
241
def calc_stats_residuals(self, stats_before, stats_after):
    """Return the change in each basic stat between two stats dictionaries.

    :param stats_before: Stats from before processing.
    :param stats_after: Stats from after processing.
    :returns: A dictionary holding ``after - before`` for the keys 'max',
        'min', 'mean' and 'std_dev'.
    """
    return {
        key: stats_after[key] - stats_before[key]
        for key in ('max', 'min', 'mean', 'std_dev')
    }
247
def set_stats_residuals(self, residuals):
    """Append one set of residuals to the lists held in ``self.residuals``.

    :param residuals: Dictionary with the keys 'max', 'min', 'mean' and
        'std_dev'.
    """
    for key in ('max', 'min', 'mean', 'std_dev'):
        self.residuals[key].append(residuals[key])
253
def calc_volume_stats(self, slice_stats):
    """Reduce lists of per-slice stats to an array of volume-wide stats.

    :param slice_stats: Dictionary of lists with the keys 'max', 'min',
        'mean', 'std_dev', 'RSS' and 'data_points'.
    :returns: A numpy array holding, in order, the maximum, the minimum,
        the mean of the slice means, the mean of the slice standard
        deviations and the median of the slice standard deviations. When no
        entry of 'RSS' is None, the normalised RMSD (RMSD divided by the
        range max - min) is appended as a sixth value.
    """
    std_devs = slice_stats['std_dev']
    volume_stats = np.array([
        max(slice_stats['max']),
        min(slice_stats['min']),
        np.mean(slice_stats['mean']),
        np.mean(std_devs),
        np.median(std_devs),
    ])

    rss_values = slice_stats['RSS']
    if None in rss_values:
        return volume_stats

    rmsd = self.rmsd_from_rss(sum(rss_values), sum(slice_stats['data_points']))
    value_range = volume_stats[0] - volume_stats[1]
    # normalised RMSD (dividing by the range)
    return np.append(volume_stats, rmsd / value_range)
268
def _set_loop_stats(self):
    """Append the normalised RMSD of the current iteration to the loop stats.

    The RMSD is calculated between the first key of the iterative group's
    "iterating" dictionary and the object it maps to, and is divided by the
    max - min range reported by ``get_stats`` for the current iteration.
    """
    # NEED TO CHANGE THIS - MUST USE SLICES
    loop = self._iterative_group
    iterating = loop._ip_data_dict["iterating"]
    first_obj = list(iterating.keys())[0]
    paired_obj = iterating[first_obj]
    rmsd = self.calc_rmsd(first_obj.data, paired_obj.data)
    maximum = self.get_stats(self.p_num, stat="max", instance=loop._ip_iteration)
    minimum = self.get_stats(self.p_num, stat="min", instance=loop._ip_iteration)
    nrmsd = rmsd / (maximum - minimum)
    loop_entry = Statistics.loop_stats[self.l_num]
    loop_entry["NRMSD"] = np.append(loop_entry["NRMSD"], nrmsd)
278
279			def set_volume_stats(self):
280			"""Calculates volume-wide statistics from slice stats, and updates class-wide arrays with these values.
281			Links volume stats with the output dataset and writes slice stats to file.
282			"""
			# NOTE(review): indentation was lost in this listing, so the nesting described in the
			# comments below is inferred from the statements themselves - confirm against the repository.
283			stats = self.stats
			# Merge the slice stats of this process with those of the other processes.
284			combined_stats = self._combine_mpi_stats(stats)
			# Fall back to the class-wide counter when no plugin number has been set yet.
285			if not self.p_num:
286			self.p_num = Statistics.count
287			p_num = self.p_num
288			name = self.plugin_name
			# Look for a name that is not yet a key of Statistics.plugin_numbers by appending 2, 3, ...
			# Inside an iterative group this search only happens on iteration 0.
289			i = 2
290			if not self._iterative_group:
291			while name in list(Statistics.plugin_numbers.keys()):
292			name = self.plugin_name + str(i)
293			i += 1
294			elif self._iterative_group._ip_iteration == 0:
295			while name in list(Statistics.plugin_numbers.keys()):
296			name = self.plugin_name + str(i)
297			i += 1
298
			# Register the number <-> name mapping in the class-wide dictionaries.
299			if p_num not in list(Statistics.plugin_names.keys()):
300			Statistics.plugin_names[p_num] = name
301			Statistics.plugin_numbers[name] = p_num
			# Volume stats are only calculated when this process has recorded slice stats.
302			if len(self.stats['max']) != 0:
303			stats_array = self.calc_volume_stats(combined_stats)
304			Statistics.global_residuals[p_num] = {}
305			#before_processing = self.calc_volume_stats(self.stats_before_processing)
306			#for key in list(before_processing.keys()):
307			# Statistics.global_residuals[p_num][key] = Statistics.global_stats[p_num][key] - before_processing[key]
308
			# The first set of stats for a plugin is stored as-is; later sets are stacked below it as new rows.
309			if len(Statistics.global_stats[p_num]) == 0:
310			Statistics.global_stats[p_num] = stats_array
311			else:
312			Statistics.global_stats[p_num] = np.vstack([Statistics.global_stats[p_num], stats_array])
313
314			stats_dict = self._array_to_dict(stats_array)
315			self._link_stats_to_datasets(stats_dict, self._iterative_group)
316
			# Loop stats are currently disabled: the call is commented out and this branch does nothing.
317			if self._iterative_group:
318			if self._iterative_group.end_index == p_num and self._iterative_group._ip_iteration != 0:
319			#self._set_loop_stats()
320			pass
321
322			self._write_stats_to_file(p_num)
323			self._already_called = True
324			self._repeat_count += 1
			# Inside an iterative group the slice stats lists are replaced with empty ones.
325			if self._iterative_group:
326			self.stats = {'max': [], 'min': [], 'mean': [], 'std_dev': [], 'RSS': [], 'data_points': []}
327
328
329
def _combine_mpi_stats(self, slice_stats):
    """Gather the slice stats of every MPI process into one dictionary.

    :param slice_stats: Dictionary of lists held by the calling process.
    :returns: A dictionary with the keys 'max', 'min', 'mean', 'std_dev',
        'RSS' and 'data_points', each holding the concatenation of the
        lists gathered from all processes of ``MPI.COMM_WORLD``.
    """
    gathered = MPI.COMM_WORLD.allgather(slice_stats)
    merged = {'max': [], 'min': [], 'mean': [], 'std_dev': [], 'RSS': [], 'data_points': []}
    for rank_stats in gathered:
        for key, values in rank_stats.items():
            merged[key] += values
    return merged
338
def _array_to_dict(self, stats_array):
    """Convert an array of stats into a dictionary.

    :param stats_array: Stats values, ordered like ``Statistics._key_list``.
    :returns: A dictionary mapping the key at each position of
        ``Statistics._key_list`` to the value at that position.
    """
    return {
        Statistics._key_list[position]: value
        for position, value in enumerate(stats_array)
    }
344
def _set_pattern_info(self):
    """Gathers information about the pattern of the data in the current plugin.

    Sets ``self.pattern`` to the plugin's 'pattern' parameter. When that
    parameter is missing or None, the first data pattern of the first
    output dataset that has dimension 1 among its slice dimensions is used
    instead. ``self.pattern`` is always assigned, and is None when no
    pattern could be determined.

    Sets ``self.calc_stats`` to True when at least one output dataset has a
    data pattern that appears in ``Statistics._pattern_list``.
    """
    out_datasets = self.plugin.get_out_datasets()
    try:
        pattern = self.plugin.parameters['pattern']
    except KeyError:
        pattern = None

    if pattern is None and out_datasets:
        patterns = out_datasets[0].get_data_patterns()
        pattern = next(
            (name for name in patterns
             if 1 in patterns.get(name)["slice_dims"]),
            None,
        )
    self.pattern = pattern

    self.calc_stats = any(
        set(Statistics._pattern_list) & set(dataset.data_info.get("data_patterns"))
        for dataset in out_datasets
    )
365
def _link_stats_to_datasets(self, stats_dict, iterative=False):
    """Links the volume wide statistics to the output dataset(s)

    The stats are written to the metadata of the first output dataset of
    the plugin under the first unused group name of "stats", "stats2",
    "stats3", ...

    :param stats_dict: Dictionary of volume-wide stats to store.
    :param iterative: Falsy, or the iterative plugin group. When given and
        the output dataset's group name contains "itr_clone", the stats
        are stored on the first key of the group's "iterating" dictionary
        instead.
    """
    out_dataset = self.plugin.get_out_datasets()[0]
    target = out_dataset
    if iterative and "itr_clone" in out_dataset.group_name:
        target = list(iterative._ip_data_dict["iterating"].keys())[0]
    n_datasets = self.plugin.nOutput_datasets()

    group_name = "stats"
    suffix = 2
    #out_dataset.data_info.set([group_name], stats)
    while group_name in target.meta_data.get_dictionary().keys():
        group_name = f"stats{suffix}"
        suffix += 1
    for key, value in stats_dict.items():
        target.meta_data.set([group_name, key], value)
383
def _delete_stats_metadata(self, plugin):
    """Delete the "stats" metadata entry of a plugin's first output dataset.

    :param plugin: The plugin whose first output dataset is cleaned up.
    """
    plugin.get_out_datasets()[0].meta_data.delete("stats")
387
def _write_stats_to_file(self, p_num=None, plugin_name=None):
    """Write the volume-wide stats of a plugin to ``stats.h5``.

    The file lives in ``Statistics.path`` and is opened in append mode with
    the "mpio" driver on ``MPI.COMM_WORLD``. Any existing dataset for the
    plugin number is replaced. On the last fixed iteration of an iterative
    group, at the group's end index, the loop NRMSD values are also written
    to the "iterative" group of the file.

    :param p_num: Plugin number to write. Defaults to ``self.p_num``.
    :param plugin_name: Name stored as the "plugin_name" attribute.
        Defaults to ``self.plugin_names[p_num]``.
    """
    if p_num is None:
        p_num = self.p_num
    if plugin_name is None:
        plugin_name = self.plugin_names[p_num]
    filename = f"{Statistics.path}/stats.h5"
    stats = self.global_stats[p_num]
    self.hdf5 = Hdf5Utils(self.exp)
    with h5.File(filename, "a", driver="mpio", comm=MPI.COMM_WORLD) as h5file:
        group = h5file.require_group("stats")
        # Stays None when there are no stats to write, so later code can
        # tell whether a dataset exists to attach attributes to.
        dataset = None
        if stats.shape != (0,):
            dataset_name = str(p_num)
            if dataset_name in group:
                del group[dataset_name]
            dataset = group.create_dataset(dataset_name, shape=stats.shape, dtype=stats.dtype)
            dataset[::] = stats[::]
            dataset.attrs.create("plugin_name", plugin_name)
            dataset.attrs.create("pattern", self.pattern)

        loop = self._iterative_group
        if loop:
            l_stats = Statistics.loop_stats[self.l_num]
            group1 = h5file.require_group("iterative")
            final_pass = (loop._ip_iteration == loop._ip_fixed_iterations - 1
                          and self.p_num == loop.end_index)
            if final_pass:
                nrmsd = l_stats["NRMSD"]
                dataset1 = group1.create_dataset(str(self.l_num), shape=nrmsd.shape, dtype=nrmsd.dtype)
                dataset1[::] = nrmsd[::]
                loop_plugins = [
                    self.plugin_names[i]
                    for i in range(loop.start_index, loop.end_index + 1)
                    if i in self.plugin_names
                ]
                dataset1.attrs.create("loop_plugins", loop_plugins)
                # Previously raised a NameError here when no stats dataset
                # had been created for this plugin (empty stats array).
                if dataset is not None:
                    dataset.attrs.create("n_loop_plugins", len(loop_plugins))
			0 ignored issues – show introduced 2022-02-10 17:04 UTC by Report Bug Copy Issue Report The variable `dataset` does not seem to be defined in case `stats.shape != TupleNode` on line `399` is `False`. Are you sure this can never be the case? Loading history...
419
def write_slice_stats_to_file(self, slice_stats=None, p_num=None):
    """Writes slice statistics to a h5 file. Placed in the stats folder in the output directory.

    The stats gathered from all MPI processes are written, one dataset per
    key, to the first unused group of "/stats", "/stats2", "/stats3", ...
    in the file ``stats_p<p_num>_<plugin_name>.h5``.

    :param slice_stats: Dictionary of per-slice stats lists. Defaults to
        ``self.stats`` when falsy.
    :param p_num: Plugin number. When falsy, ``self.count`` and
        ``self.plugin_name`` are used; otherwise the name is looked up in
        ``self.plugin_names``.
    """
    if not slice_stats:
        slice_stats = self.stats
    if p_num:
        plugin_name = self.plugin_names[p_num]
    else:
        p_num = self.count
        plugin_name = self.plugin_name
    combined_stats = self._combine_mpi_stats(slice_stats)
    filename = f"{Statistics.path}/stats_p{p_num}_{plugin_name}.h5"
    self.hdf5 = Hdf5Utils(self.plugin.exp)
    with h5.File(filename, "a", driver="mpio", comm=MPI.COMM_WORLD) as h5file:
        group_name = "/stats"
        suffix = 2
        while group_name in h5file:
            group_name = f"/stats{suffix}"
            suffix += 1
        group = h5file.create_group(group_name, track_order=None)
        for key, values in combined_stats.items():
            as_array = np.array(values)
            dataset = self.hdf5.create_dataset_nofill(group, key, (len(as_array),), as_array.dtype)
            dataset[::] = as_array
446
def _unpad_slice(self, slice1):
    """If data is padded in the slice dimension, removes this pad.

    The slice dimensions are taken from the plugin's only output dataset,
    or, when there are several, from the first one whose data patterns
    include ``self.pattern``. The un-padding slice list is calculated on
    the plugin's first call (``pcount == 0``) and reused afterwards.

    :param slice1: The (possibly padded) slice.
    :returns: The slice with the pad removed from its first slice
        dimension. The slice is returned unchanged when it is not padded or
        when no suitable output dataset is found.
    """
    out_datasets = self.plugin.get_out_datasets()
    if len(out_datasets) == 1:
        out_dataset = out_datasets[0]
    else:
        out_dataset = next(
            (dataset for dataset in out_datasets
             if self.pattern in dataset.data_info.get(["data_patterns"]).keys()),
            None,
        )
        if out_dataset is None:
            # Flagged by static analysis: `out_dataset` was not defined for
            # all execution paths. Without a matching dataset the slice
            # dimensions are unknown, so leave the slice untouched.
            return slice1

    slice_dims = out_dataset.get_slice_dimensions()
    if self.plugin.pcount == 0:
        self._slice_list, self._pad = self._get_unpadded_slice_list(slice1, slice_dims)
    if self._pad:
        #for slice_dim in slice_dims:
        slice_dim = slice_dims[0]
        # Bring the slice dimension to the front, cut out the pad, and
        # move the axis back to where it was.
        temp_slice = np.swapaxes(slice1, 0, slice_dim)
        temp_slice = temp_slice[self._slice_list[slice_dim]]
        slice1 = np.swapaxes(temp_slice, 0, slice_dim)
    return slice1
467
def _get_unpadded_slice_list(self, slice1, slice_dims):
    """Creates slice object(s) to un-pad slices in the slice dimension(s).

    Only the first slice dimension is considered.

    :param slice1: The (possibly padded) slice.
    :param slice_dims: The slice dimensions of the data.
    :returns: A tuple ``(slice_list, pad)``. ``pad`` is True when the
        length of ``slice1`` along the slice dimension differs from the
        width of the plugin's first slice list in that dimension. In that
        case the entry for that dimension selects a single element, offset
        by the pad width. When the slice list and ``slice1`` have a
        different number of dimensions, the plugin's slice list is
        returned unchanged with ``pad`` set to False.
    """
    frame_slices = self.plugin.slice_list[0]
    unpadded = list(frame_slices)
    if len(unpadded) != len(slice1.shape):
        return frame_slices, False

    #for i in slice_dims:
    dim = slice_dims[0]
    expected_width = frame_slices[dim].stop - frame_slices[dim].start
    actual_width = slice1.shape[dim]
    padded = expected_width != actual_width
    if padded:
        offset = (actual_width - expected_width) // 2  # Assuming symmetrical padding
        # NOTE(review): selects one element, which presumably relies on the
        # un-padded width being 1 - confirm.
        unpadded[dim] = slice(offset, offset + 1, 1)
    return tuple(unpadded), padded
483
def _de_list(self, slice1):
    """If the slice is in a list, remove it from that list.

    Nested lists are unwrapped repeatedly by taking their first element.
    An empty list is returned as it is.

    :param slice1: The slice, possibly wrapped in one or more lists.
    :returns: The first object found that is not a non-empty list.
    """
    while type(slice1) is list and len(slice1) != 0:
        slice1 = slice1[0]
    return slice1
491
492
@classmethod
def _count(cls):
    """Increase the class-wide ``count`` attribute by one."""
    cls.count = cls.count + 1
496
@classmethod
def _post_chain(cls):
    """Generate figures from ``stats.h5`` once the plugin chain has run.

    Nothing is done unless ``cls._any_stats & cls._stats_flag`` is truthy.
    """
    if not (cls._any_stats & cls._stats_flag):
        return
    figure_maker = StatsUtils()
    figure_maker.generate_figures(f"{cls.path}/stats.h5", cls.path)
502

DiamondLightSource / Savu

Pull Request — master (#878)

savu.plugins.stats.statistics F

Complexity

Size/Duplication

Importance

29 Methods

How to fix Complexity

Complexity

Duplication Side-by-Side

Filter issues like