Test Failed
Pull Request — master (#878)
by
unknown
04:38
created

Statistics.calc_stats_residuals()   A

Complexity

Conditions 2

Size

Total Lines 5
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
eloc 5
nop 3
dl 0
loc 5
rs 10
c 0
b 0
f 0
1
"""
2
.. module:: statistics
3
   :platform: Unix
4
   :synopsis: Contains and processes statistics information for each plugin.
5
6
.. moduleauthor::Jacob Williamson <[email protected]>
7
8
"""
9
10
from savu.plugins.savers.utils.hdf5_utils import Hdf5Utils
11
from savu.plugins.stats.stats_utils import StatsUtils
12
from savu.core.iterate_plugin_group_utils import check_if_in_iterative_loop
13
14
import h5py as h5
15
import numpy as np
16
import os
17
from mpi4py import MPI
18
19
20
class Statistics(object):
21
    _pattern_list = ["SINOGRAM", "PROJECTION", "TANGENTOGRAM", "VOLUME_YZ", "VOLUME_XZ", "VOLUME_XY", "VOLUME_3D", "4D_SCAN", "SINOMOVIE"]
22
    _no_stats_plugins = ["BasicOperations", "Mipmap"]
23
    _key_list = ["max", "min", "mean", "mean_std_dev", "median_std_dev", "NRMSD"]
24
    #_savers = ["Hdf5Saver", "ImageSaver", "MrcSaver", "TiffSaver", "XrfSaver"]
25
    _has_setup = False
26
27
28
    def __init__(self):
29
        self.calc_stats = True
30
        self.stats = {'max': [], 'min': [], 'mean': [], 'std_dev': [], 'RSS': [], 'data_points': []}
31
        self.stats_before_processing = {'max': [], 'min': [], 'mean': [], 'std_dev': []}
32
        self.residuals = {'max': [], 'min': [], 'mean': [], 'std_dev': []}
33
        self._repeat_count = 0
34
        self.p_num = None
35
        self.GPU = False
36
37
    def setup(self, plugin_self, pattern=None):
        """Prepares this statistics object for the plugin it belongs to.

        :param plugin_self: The plugin instance these statistics are attached to.
        :param pattern: Optional pattern name. When not given, the pattern is worked out
            by _set_pattern_info().
        """
        if not Statistics._has_setup:
            # Class-wide state is initialised by whichever plugin gets here first.
            self._setup_class(plugin_self.exp)
        self.plugin_name = plugin_self.name
        if plugin_self.name in Statistics._no_stats_plugins:
            self.calc_stats = False
        if self.calc_stats:
            self.plugin = plugin_self
            self._pad_dims = []
            self._already_called = False
            self.p_num = Statistics.count
            if pattern:
                self.pattern = pattern
            else:
                self._set_pattern_info()
            # _set_pattern_info() can switch calc_stats off again, hence the re-check.
            if self.calc_stats:
                Statistics._any_stats = True
        self._setup_iterative()
55
56
    def _setup_iterative(self):
        """Stores the iterative group of the current plugin (if any) and its loop number."""
        group = check_if_in_iterative_loop(Statistics.exp)
        self._iterative_group = group
        if not group:
            return
        if group.start_index == Statistics.count:
            # The loop starts at the current plugin count: open a new set of loop stats.
            Statistics._loop_counter += 1
            Statistics.loop_stats.append({"NRMSD": np.array([])})
        self.l_num = Statistics._loop_counter - 1
63
64
    @classmethod
    def _setup_class(cls, exp):
        """Sets up the statistics class for the whole plugin chain (only called once).

        :param exp: The experiment object. Its meta_data supplies the "stats" switch,
            the plugin list and the "out_path" output directory.
        """
        try:
            # Only an explicit "off" disables statistics. Any other value (including
            # unexpected ones) enables them, so the flag is always defined.
            cls._stats_flag = exp.meta_data.get("stats") != "off"
        except KeyError:
            cls._stats_flag = True
        cls._any_stats = False
        cls.count = 2
        cls.global_stats = {}
        cls.loop_stats = []
        cls.exp = exp
        cls.n_plugins = len(exp.meta_data.plugin_list.plugin_list)
        # One (initially empty) stats array per plugin number.
        for i in range(1, cls.n_plugins + 1):
            cls.global_stats[i] = np.array([])
        cls.global_residuals = {}
        cls.plugin_numbers = {}
        cls.plugin_names = {}
        cls._loop_counter = 0
        cls._RMSD = True
        cls.path = exp.meta_data['out_path']
        if cls.path[-1] == '/':
            cls.path = cls.path[0:-1]
        cls.path = f"{cls.path}/stats"
        if MPI.COMM_WORLD.rank == 0:
            # exist_ok avoids the check-then-create race of exists() + mkdir().
            os.makedirs(cls.path, exist_ok=True)
        cls._has_setup = True
95
96
    def get_stats(self, p_num=None, stat=None, instance=-1):
97
        """Returns stats associated with a certain plugin, given the plugin number (its place in the process list).
98
99
        :param p_num: Plugin  number of the plugin whose associated stats are being fetched.
100
            If p_num <= 0, it is relative to the plugin number of the current plugin being run.
101
            E.g current plugin number = 5, p_num = -2 --> will return stats of the third plugin.
102
            By default will gather stats for the current plugin.
103
        :param stat: Specify the stat parameter you want to fetch, i.e 'max', 'mean', 'median_std_dev'.
104
            If left blank will return the whole dictionary of stats:
105
            {'max': , 'min': , 'mean': , 'mean_std_dev': , 'median_std_dev': , 'NRMSD' }
106
        :param instance: In cases where there are multiple set of stats associated with a plugin
107
            due to loops or multi-parameters, specify which set you want to retrieve, i.e 3 to retrieve the
108
            stats associated with the third run of a plugin. Pass 'all' to get a list of all sets.
109
            By default will retrieve the most recent set.
110
        """
111
        if p_num is None:
112
            p_num = self.p_num
113
        if p_num <= 0:
114
            try:
115
                p_num = self.p_num + p_num
116
            except TypeError:
117
                p_num = Statistics.count + p_num
118
        if Statistics.global_stats[p_num].ndim == 1 and instance in (None, 0, 1, -1, "all"):
119
            stats_array = Statistics.global_stats[p_num]
120
        else:
121
            if instance == "all":
122
                stats_list = [self.get_stats(p_num, stat=stat, instance=1)]
123
                n = 2
124
                if Statistics.global_stats[p_num].ndim != 1:
125
                    while n <= len(Statistics.global_stats[p_num]):
126
                        stats_list.append(self.get_stats(p_num, stat=stat, instance=n))
127
                        n += 1
128
                return stats_list
129
            if instance > 0:
130
                instance -= 1
131
            stats_array = Statistics.global_stats[p_num][instance]
132
        stats_dict = self._array_to_dict(stats_array)
133
        if stat is not None:
134
            return stats_dict[stat]
135
        else:
136
            return stats_dict
137
138
    def get_stats_from_name(self, plugin_name, n=None, stat=None, instance=-1):
139
        """Returns stats associated with a certain plugin.
140
141
        :param plugin_name: name of the plugin whose associated stats are being fetched.
142
        :param n: In a case where there are multiple instances of **plugin_name** in the process list,
143
            specify the nth instance. Not specifying will select the first (or only) instance.
144
        :param stat: Specify the stat parameter you want to fetch, i.e 'max', 'mean', 'median_std_dev'.
145
            If left blank will return the whole dictionary of stats:
146
            {'max': , 'min': , 'mean': , 'mean_std_dev': , 'median_std_dev': , 'NRMSD' }
147
        :param instance: In cases where there are multiple set of stats associated with a plugin
148
            due to iterative loops or multi-parameters, specify which set you want to retrieve, i.e 3 to retrieve the
149
            stats associated with the third run of a plugin. Pass 'all' to get a list of all sets.
150
            By default will retrieve the most recent set.
151
        """
152
        name = plugin_name
153
        if n not in (None, 0, 1):
154
            name = name + str(n)
155
        p_num = Statistics.plugin_numbers[name]
156
        return self.get_stats(p_num, stat, instance)
157
158
    def get_stats_from_dataset(self, dataset, stat=None, instance=-1):
        """Returns stats associated with a dataset.

        :param dataset: The dataset whose associated stats are being fetched.
        :param stat: Specify the stat parameter you want to fetch, i.e 'max', 'mean', 'median_std_dev'.
            If left blank will return the whole dictionary of stats:
            {'max': , 'min': , 'mean': , 'mean_std_dev': , 'median_std_dev': , 'NRMSD'}
        :param instance: In cases where there are multiple sets of stats associated with a dataset
            due to iterative loops or multi-parameters, specify which set you want to retrieve, i.e 3 to
            retrieve the stats associated with the third run of a plugin. Pass 'all' to get a list of all sets.
            By default will retrieve the most recent set.
        """
        meta = dataset.meta_data
        # Sets are stored under "stats", "stats2", "stats3", ... in the dataset metadata.
        collected = [meta.get("stats")]
        suffix = 2
        while f"stats{suffix}" in meta.get_dictionary().keys():
            collected.append(meta.get(f"stats{suffix}"))
            suffix += 1
        if stat:
            collected = [entry[stat] for entry in collected]
        if instance == "all":
            return collected
        if instance in (None, 0, 1):
            return collected[0]
        # Instances from 2 upwards are 1-based; negative ones index from the end.
        index = instance - 1 if instance >= 2 else instance
        return collected[index]
187
188
    def set_slice_stats(self, my_slice, base_slice=None, pad=True):
189
        slice_stats_after = self.calc_slice_stats(my_slice, base_slice, pad=pad)
190
        if base_slice:
191
            slice_stats_before = self.calc_slice_stats(base_slice, pad=pad)
192
            for key in list(self.stats_before_processing.keys()):
193
                self.stats_before_processing[key].append(slice_stats_before[key])
194
        for key in list(self.stats.keys()):
195
            self.stats[key].append(slice_stats_after[key])
196
197
    def calc_slice_stats(self, my_slice, base_slice=None, pad=True):
198
        """Calculates and returns slice stats for the current slice.
199
200
        :param my_slice: The slice whose stats are being calculated.
201
        :param base_slice: Provide a base slice to calculate residuals from, to calculate RMSD.
202
        """
203
        if my_slice is not None:
204
            my_slice = self._de_list(my_slice)
205
            if pad:
206
                my_slice = self._unpad_slice(my_slice)
207
            slice_stats = {'max': np.amax(my_slice).astype('float64'), 'min': np.amin(my_slice).astype('float64'),
208
                           'mean': np.mean(my_slice), 'std_dev': np.std(my_slice), 'data_points': my_slice.size}
209
            if base_slice is not None and self._RMSD:
210
                base_slice = self._de_list(base_slice)
211
                base_slice = self._unpad_slice(base_slice)
212
                rss = self.calc_rss(my_slice, base_slice)
213
            else:
214
                rss = None
215
            slice_stats['RSS'] = rss
216
            return slice_stats
217
        return None
218
219
    def calc_rss(self, array1, array2):
        """Returns the residual sum of squares (RSS) between two arrays.

        :param array1: First array.
        :param array2: Second array; must have the same shape as array1.
        :returns: Sum of the squared element-wise differences, or None when the
            shapes differ.
        """
        if array1.shape != array2.shape:
            return None
        # Subtracting in float64 avoids wrap-around for unsigned integer inputs
        # and overflow when the residuals are squared.
        residuals = np.subtract(array1, array2, dtype=np.float64)
        # Vectorised sum of squares (replaces the slow element-wise iteration).
        return np.sum(np.square(residuals))
230
231
    def rmsd_from_rss(self, rss, n):
        """Converts a residual sum of squares into a root-mean-square deviation.

        :param rss: Residual sum of squares.
        :param n: Number of data points the RSS was summed over.
        """
        mean_square = rss / n
        return np.sqrt(mean_square)
233
234
    def calc_rmsd(self, array1, array2):
235
        if array1.shape == array2.shape:
236
            rss = self.calc_rss(array1, array2)
237
            rmsd = self.rmsd_from_rss(rss, array1.size)
238
        else:
239
            print("Warning: cannot calculate RMSD, arrays different sizes.")  # need to make this an actual warning
240
            rmsd = None
241
        return rmsd
242
243
    def calc_stats_residuals(self, stats_before, stats_after):
        """Returns the change (after minus before) in 'max', 'min', 'mean' and 'std_dev'.

        :param stats_before: Stats dictionary from before processing.
        :param stats_after: Stats dictionary from after processing.
        """
        return {key: stats_after[key] - stats_before[key]
                for key in ('max', 'min', 'mean', 'std_dev')}
248
249
    def set_stats_residuals(self, residuals):
250
        self.residuals['max'].append(residuals['max'])
251
        self.residuals['min'].append(residuals['min'])
252
        self.residuals['mean'].append(residuals['mean'])
253
        self.residuals['std_dev'].append(residuals['std_dev'])
254
255
    def calc_volume_stats(self, slice_stats):
256
        volume_stats = np.array([max(slice_stats['max']), min(slice_stats['min']), np.mean(slice_stats['mean']),
257
                                np.mean(slice_stats['std_dev']), np.median(slice_stats['std_dev'])])
258
        if None not in slice_stats['RSS']:
259
            total_rss = sum(slice_stats['RSS'])
260
            n = sum(slice_stats['data_points'])
261
            RMSD = self.rmsd_from_rss(total_rss, n)
262
            the_range = volume_stats[0] - volume_stats[1]
263
            NRMSD = RMSD / the_range  # normalised RMSD (dividing by the range)
264
            volume_stats = np.append(volume_stats, NRMSD)
265
        else:
266
            #volume_stats = np.append(volume_stats, None)
267
            pass
268
        return volume_stats
269
270
    def _set_loop_stats(self):
271
        # NEED TO CHANGE THIS - MUST USE SLICES
272
        data_obj1 = list(self._iterative_group._ip_data_dict["iterating"].keys())[0]
273
        data_obj2 = self._iterative_group._ip_data_dict["iterating"][data_obj1]
274
        RMSD = self.calc_rmsd(data_obj1.data, data_obj2.data)
275
        the_range = self.get_stats(self.p_num, stat="max", instance=self._iterative_group._ip_iteration) -\
276
                self.get_stats(self.p_num, stat="min", instance=self._iterative_group._ip_iteration)
277
        NRMSD = RMSD/the_range
278
        Statistics.loop_stats[self.l_num]["NRMSD"] = np.append(Statistics.loop_stats[self.l_num]["NRMSD"], NRMSD)
279
280
    def set_volume_stats(self):
        """Calculates volume-wide statistics from slice stats, and updates class-wide arrays with these values.
        Links volume stats with the output dataset and writes the volume stats to file.
        """
        comm = self.plugin.new_comm if self.GPU else MPI.COMM_WORLD
        combined_stats = self._combine_mpi_stats(self.stats, comm=comm)
        if not self.p_num:
            self.p_num = Statistics.count
        p_num = self.p_num
        loop = self._iterative_group

        # Pick a name not yet registered (name, name2, name3, ...). Inside an iterative
        # loop this is only done on the first iteration.
        name = self.plugin_name
        if not loop or loop._ip_iteration == 0:
            suffix = 2
            while name in Statistics.plugin_numbers:
                name = self.plugin_name + str(suffix)
                suffix += 1

        if p_num not in Statistics.plugin_names:
            Statistics.plugin_names[p_num] = name
        Statistics.plugin_numbers[name] = p_num

        if len(self.stats['max']) != 0:
            stats_array = self.calc_volume_stats(combined_stats)
            Statistics.global_residuals[p_num] = {}
            previous = Statistics.global_stats[p_num]
            if len(previous) == 0:
                Statistics.global_stats[p_num] = stats_array
            else:
                # Later runs of the same plugin are stacked as extra rows.
                Statistics.global_stats[p_num] = np.vstack([previous, stats_array])
            self._link_stats_to_datasets(self._array_to_dict(stats_array), loop)

        # NOTE: updating the loop stats (_set_loop_stats) at the end of an iteration
        # is currently disabled.
        self._write_stats_to_file(p_num, comm=comm)
        self._already_called = True
        self._repeat_count += 1
        if loop:
            # Start from empty slice stats for the next iteration.
            self.stats = {'max': [], 'min': [], 'mean': [], 'std_dev': [], 'RSS': [], 'data_points': []}
331
332
333
    def _combine_mpi_stats(self, slice_stats, comm=MPI.COMM_WORLD):
334
        combined_stats_list = comm.allgather(slice_stats)
335
        combined_stats = {'max': [], 'min': [], 'mean': [], 'std_dev': [], 'RSS': [], 'data_points': []}
336
        for single_stats in combined_stats_list:
337
            for key in list(single_stats.keys()):
338
                combined_stats[key] += single_stats[key]
339
        return combined_stats
340
341
    def _array_to_dict(self, stats_array):
342
        stats_dict = {}
343
        for i, value in enumerate(stats_array):
344
            stats_dict[Statistics._key_list[i]] = value
345
        return stats_dict
346
347
    def _set_pattern_info(self):
348
        """Gathers information about the pattern of the data in the current plugin."""
349
        out_datasets = self.plugin.get_out_datasets()
350
        try:
351
            self.pattern = self.plugin.parameters['pattern']
352
            if self.pattern == None:
353
                raise KeyError
354
        except KeyError:
355
            if not out_datasets:
356
                self.pattern = None
357
            else:
358
                patterns = out_datasets[0].get_data_patterns()
359
                for pattern in patterns:
360
                    if 1 in patterns.get(pattern)["slice_dims"]:
361
                        self.pattern = pattern
362
                        break
363
        self.calc_stats = False
364
        for dataset in out_datasets:
365
            if bool(set(Statistics._pattern_list) & set(dataset.data_info.get("data_patterns"))):
366
                self.calc_stats = True
367
368
    def _link_stats_to_datasets(self, stats_dict, iterative=False):
369
        """Links the volume wide statistics to the output dataset(s)"""
370
        out_dataset = self.plugin.get_out_datasets()[0]
371
        my_dataset = out_dataset
372
        if iterative:
373
            if "itr_clone" in out_dataset.group_name:
374
                my_dataset = list(iterative._ip_data_dict["iterating"].keys())[0]
375
        n_datasets = self.plugin.nOutput_datasets()
376
377
        i = 2
378
        group_name = "stats"
379
        #out_dataset.data_info.set([group_name], stats)
380
        while group_name in list(my_dataset.meta_data.get_dictionary().keys()):
381
            group_name = f"stats{i}"
382
            i += 1
383
        for key in list(stats_dict.keys()):
384
            my_dataset.meta_data.set([group_name, key], stats_dict[key])
385
386
    def _delete_stats_metadata(self, plugin):
387
        out_dataset = plugin.get_out_datasets()[0]
388
        out_dataset.meta_data.delete("stats")
389
390
    def _write_stats_to_file(self, p_num=None, plugin_name=None, comm=MPI.COMM_WORLD):
        """Writes the volume stats of a plugin to stats.h5 in the stats folder.

        When the plugin is the last one of an iterative group and the final fixed
        iteration has been reached, the loop stats (NRMSD) are written as well.

        :param p_num: Plugin number; defaults to the plugin number of this object.
        :param plugin_name: Name stored with the stats; defaults to the name registered
            for p_num.
        :param comm: Communicator used to open the file.
        """
        if p_num is None:
            p_num = self.p_num
        if plugin_name is None:
            plugin_name = self.plugin_names[p_num]
        filename = f"{Statistics.path}/stats.h5"
        stats = self.global_stats[p_num]
        self.hdf5 = Hdf5Utils(self.exp)
        with h5.File(filename, "a", driver="mpio", comm=comm) as h5file:
            group = h5file.require_group("stats")
            # Stays None when there are no stats to write for this plugin.
            dataset = None
            if stats.shape != (0,):
                # Replace any earlier dataset written for this plugin number.
                if str(p_num) in list(group.keys()):
                    del group[str(p_num)]
                dataset = group.create_dataset(str(p_num), shape=stats.shape, dtype=stats.dtype)
                dataset[::] = stats[::]
                dataset.attrs.create("plugin_name", plugin_name)
                dataset.attrs.create("pattern", self.pattern)
            if self._iterative_group:
                l_stats = Statistics.loop_stats[self.l_num]
                group1 = h5file.require_group("iterative")
                final_iteration = \
                    self._iterative_group._ip_iteration == self._iterative_group._ip_fixed_iterations - 1
                if final_iteration and self.p_num == self._iterative_group.end_index:
                    dataset1 = group1.create_dataset(str(self.l_num), shape=l_stats["NRMSD"].shape,
                                                     dtype=l_stats["NRMSD"].dtype)
                    dataset1[::] = l_stats["NRMSD"][::]
                    loop_plugins = []
                    for i in range(self._iterative_group.start_index, self._iterative_group.end_index + 1):
                        if i in list(self.plugin_names.keys()):
                            loop_plugins.append(self.plugin_names[i])
                    dataset1.attrs.create("loop_plugins", loop_plugins)
                    if dataset is not None:
                        # The plugin's stats dataset only exists when stats were written
                        # above; without this guard the name would be unbound here.
                        dataset.attrs.create("n_loop_plugins", len(loop_plugins))
0 ignored issues
show
introduced by
The variable `dataset` does not seem to be defined in case `stats.shape != (0,)` on line 401 is False. Are you sure this can never be the case?
Loading history...
421
422
    def write_slice_stats_to_file(self, slice_stats=None, p_num=None, comm=MPI.COMM_WORLD):
        """Writes slice statistics to a h5 file. Placed in the stats folder in the output directory.

        :param slice_stats: Dictionary of per-slice lists; defaults to the slice stats of this object.
        :param p_num: Plugin number used in the file name; defaults to the class-wide plugin count,
            in which case the name of this object's plugin is used.
        :param comm: Communicator used both to gather the slice stats and to open the file.
        """
        if not slice_stats:
            slice_stats = self.stats
        if not p_num:
            p_num = self.count
            plugin_name = self.plugin_name
        else:
            plugin_name = self.plugin_names[p_num]
        # Gather over the same communicator that opens the file, so that a non-default
        # comm is not mixed with a gather over MPI.COMM_WORLD.
        combined_stats = self._combine_mpi_stats(slice_stats, comm=comm)
        slice_stats_arrays = {}
        datasets = {}
        filename = f"{Statistics.path}/stats_p{p_num}_{plugin_name}.h5"
        self.hdf5 = Hdf5Utils(self.plugin.exp)
        with h5.File(filename, "a", driver="mpio", comm=comm) as h5file:
            # Use the first free group name: "/stats", "/stats2", "/stats3", ...
            i = 2
            group_name = "/stats"
            while group_name in h5file:
                group_name = f"/stats{i}"
                i += 1
            group = h5file.create_group(group_name, track_order=None)
            for key in combined_stats:
                slice_stats_arrays[key] = np.array(combined_stats[key])
                datasets[key] = self.hdf5.create_dataset_nofill(
                    group, key, (len(slice_stats_arrays[key]),), slice_stats_arrays[key].dtype)
                datasets[key][::] = slice_stats_arrays[key]
448
449
    def _unpad_slice(self, slice1):
450
        """If data is padded in the slice dimension, removes this pad."""
451
        out_datasets = self.plugin.get_out_datasets()
452
        if len(out_datasets) == 1:
453
            out_dataset = out_datasets[0]
454
        else:
455
            for dataset in out_datasets:
456
                if self.pattern in list(dataset.data_info.get(["data_patterns"]).keys()):
457
                    out_dataset = dataset
458
                    break
459
        slice_dims = out_dataset.get_slice_dimensions()
0 ignored issues
show
introduced by
The variable out_dataset does not seem to be defined for all execution paths.
Loading history...
460
        if self.plugin.pcount == 0:
461
            self._slice_list, self._pad = self._get_unpadded_slice_list(slice1, slice_dims)
462
        if self._pad:
463
            #for slice_dim in slice_dims:
464
            slice_dim = slice_dims[0]
465
            temp_slice = np.swapaxes(slice1, 0, slice_dim)
466
            temp_slice = temp_slice[self._slice_list[slice_dim]]
467
            slice1 = np.swapaxes(temp_slice, 0, slice_dim)
468
        return slice1
469
470
    def _get_unpadded_slice_list(self, slice1, slice_dims):
471
        """Creates slice object(s) to un-pad slices in the slice dimension(s)."""
472
        slice_list = list(self.plugin.slice_list[0])
473
        pad = False
474
        if len(slice_list) == len(slice1.shape):
475
            #for i in slice_dims:
476
            i = slice_dims[0]
477
            slice_width = self.plugin.slice_list[0][i].stop - self.plugin.slice_list[0][i].start
478
            if slice_width != slice1.shape[i]:
479
                pad = True
480
                pad_width = (slice1.shape[i] - slice_width) // 2  # Assuming symmetrical padding
481
                slice_list[i] = slice(pad_width, pad_width + 1, 1)
482
            return tuple(slice_list), pad
483
        else:
484
            return self.plugin.slice_list[0], pad
485
486
    def _de_list(self, slice1):
487
        """If the slice is in a list, remove it from that list."""
488
        if type(slice1) == list:
489
            if len(slice1) != 0:
490
                slice1 = slice1[0]
491
                slice1 = self._de_list(slice1)
492
        return slice1
493
494
495
    @classmethod
496
    def _count(cls):
497
        cls.count += 1
498
499
    @classmethod
500
    def _post_chain(cls):
501
        if cls._any_stats & cls._stats_flag:
502
            stats_utils = StatsUtils()
503
            stats_utils.generate_figures(f"{cls.path}/stats.h5", cls.path)
504