Completed
Push — master ( 9e6483...4de886 )
by Mubdi
01:41
created

knowyourdata.kyd.KYD.display_basic_stats_new()   B

Complexity

Conditions 5

Size

Total Lines 65
Code Lines 44

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 5
eloc 44
nop 1
dl 0
loc 65
rs 8.2662
c 0
b 0
f 0

How to fix   Long Method   

Long Method

Small methods make your code easier to understand, in particular if combined with a good name. Besides, if your method is small, finding a good name is usually much easier.

For example, if you find yourself adding comments to a method's body, this is usually a good sign to extract the commented part to a new method, and use the comment as a starting point when coming up with a good name for this new method.

Commonly applied refactorings include Extract Method: moving a coherent part of the body into its own well-named method.

1
"""
2
KnowYourData
3
============
4
5
A rapid and lightweight module to describe the statistics and structure of
6
data arrays for interactive use.
7
8
The simplest use case is displaying the statistics of a numpy array 'x':
9
10
    >>> from knowyourdata import kyd
11
    >>> kyd(x)
12
13
"""
14
15
import sys
16
import numpy as np
17
18
19
class KYD(object):
    """The Central Class for KYD.

    On construction the input is converted to a numpy array (if it is not
    one already), its finiteness flags, structure and basic statistics are
    computed and stored as instance attributes, and the references to the
    array itself are then dropped via ``clear_memory``.  The ``display*``
    methods only read those stored attributes.
    """

    # Variable for Data Vector
    data = None

    # Initial Flags
    f_allfinite = False
    f_allnonfinite = False
    f_hasnan = False
    f_hasinf = False

    # Initialized Numbers
    num_nan = 0
    num_inf = 0

    # Display Settings
    col_width = 10
    precision = 4

    def __init__(self, data):
        """Analyse ``data`` and keep only the derived summary values.

        data -- a numpy array, or anything ``np.array`` can convert to one
        """
        super(KYD, self).__init__()

        # Ensuring that the array is a numpy array
        if not isinstance(data, np.ndarray):
            data = np.array(data)

        self.data = data

        self.check_finite()
        self.check_struct()
        self.get_basic_stats()
        self.clear_memory()

    def check_finite(self):
        """Checking to see if all elements are finite and setting flags.

        Sets ``filt_data`` to the finite elements of ``data`` and updates
        ``f_allfinite``, ``f_allnonfinite``, ``f_hasnan``, ``f_hasinf``,
        ``num_nan`` and ``num_inf``.  Flags are only ever switched on here;
        the class-level defaults cover the "off" state.
        """
        finite_mask = np.isfinite(self.data)

        if np.all(finite_mask):
            self.filt_data = self.data
            self.f_allfinite = True
            return

        # Boolean-mask indexing selects the finite elements as a flat array.
        self.filt_data = self.data[finite_mask]

        if self.filt_data.size == 0:
            self.f_allnonfinite = True

        # Compute each mask once and reuse it for both the flag and count.
        nan_mask = np.isnan(self.data)
        if np.any(nan_mask):
            self.f_hasnan = True
            self.num_nan = np.sum(nan_mask)

        inf_mask = np.isinf(self.data)
        if np.any(inf_mask):
            self.f_hasinf = True
            self.num_inf = np.sum(inf_mask)

    def check_struct(self):
        """Determining the Structure of the Numpy Array.

        Stores ``dtype``, ``ndim``, ``shape``, ``size``, ``memsize`` and the
        human readable ``human_memsize`` on the instance.
        """
        self.dtype = self.data.dtype
        self.ndim = self.data.ndim
        self.shape = self.data.shape
        self.size = self.data.size
        # NOTE(review): sys.getsizeof is a shallow measure; for arrays that
        # are views of another array it may not include the data buffer --
        # confirm whether ``self.data.nbytes`` would be the better figure.
        self.memsize = sys.getsizeof(self.data)
        self.human_memsize = sizeof_fmt(self.memsize)

    def get_basic_stats(self):
        """Get basic statistics about array.

        Computes min, max, range, mean, standard deviation, median,
        quartiles and the 68/95/99 percent percentile intervals of
        ``filt_data``.  When there is no finite data to summarise (all
        elements non-finite, or the array is empty) every statistic is set
        to NaN instead, and each interval to its own ``[nan, nan]`` array.
        """
        if self.f_allnonfinite or self.filt_data.size == 0:
            self.min = self.max = self.range = np.nan
            self.mean = self.std = self.median = np.nan
            self.firstquartile = self.thirdquartile = np.nan
            # Separate arrays so that the intervals do not alias each other.
            self.ci_68 = np.array([np.nan, np.nan])
            self.ci_95 = np.array([np.nan, np.nan])
            self.ci_99 = np.array([np.nan, np.nan])
            return

        # np.float64 is used rather than the np.float_ alias, which is no
        # longer available in NumPy 2.0.
        self.min = np.float64(np.min(self.filt_data))
        self.max = np.float64(np.max(self.filt_data))
        self.range = self.max - self.min
        self.mean = np.mean(self.filt_data)
        self.std = np.std(self.filt_data)
        self.median = np.float64(np.median(self.filt_data))
        self.firstquartile = np.float64(np.percentile(self.filt_data, 25))
        self.thirdquartile = np.float64(np.percentile(self.filt_data, 75))
        self.ci_99 = np.float64(
            np.percentile(self.filt_data, np.array([0.5, 99.5])))
        self.ci_95 = np.float64(
            np.percentile(self.filt_data, np.array([2.5, 97.5])))
        self.ci_68 = np.float64(
            np.percentile(self.filt_data, np.array([16.0, 84.0])))

    def _format_stat(self, value, sign=" "):
        """Format one statistic with ``self.precision`` and a sign flag."""
        return "{0:{1}.{2}}".format(value, sign, self.precision)

    def display_basic_stats_new(self):
        """Display Basic Statistics.

        Returns a list of strings: two header lines followed by one line
        per table row.  The table has five columns (mean/std dev, quantile
        labels, quantile values, interval labels, interval values); shorter
        columns are padded with empty cells.
        """
        pstr_list = ["Basic Statistics  ", '']

        template_str = (
            " {0:^10} "
            " {1:>8} "
            " {2:<10} "
            " {3:>8} "
            " {4:<10} "
        )

        fmt = self._format_stat
        columns = [
            [
                "Mean:", fmt(self.mean, ""),
                "",
                "Std Dev:", fmt(self.std, ""),
            ],
            ["Min:", "1Q:", "Median:", "3Q:", "Max:"],
            [
                fmt(self.min),
                fmt(self.firstquartile),
                fmt(self.median),
                fmt(self.thirdquartile),
                fmt(self.max),
            ],
            ['-99 CI:', '-95 CI:', '-68 CI:', '+68 CI:', '+95 CI:', '+99 CI:'],
            [
                fmt(self.ci_99[0]),
                fmt(self.ci_95[0]),
                fmt(self.ci_68[0]),
                fmt(self.ci_68[1]),
                fmt(self.ci_95[1]),
                fmt(self.ci_99[1]),
            ],
        ]

        # Pad every column to the length of the longest one so that the
        # columns can be walked row by row.
        num_rows = max(len(column) for column in columns)
        for column in columns:
            column.extend([""] * (num_rows - len(column)))

        for row in zip(*columns):
            pstr_list.append(template_str.format(*row))

        return pstr_list

    def display_struct(self):
        """Display information about array structure.

        Returns a list of strings: two header lines, then dimensions,
        shape, dtype and memory size, a blank spacer, and the NaN / Inf
        counts.  Labels and values are separated by tab characters.
        """
        pstr_list = [
            "Array Structure  ",
            "                 ",
        ]

        line_templates = [
            "Number of Dimensions:\t{self.ndim}",
            "Shape of Dimensions:\t{self.shape}",
            "Array Data Type:\t{self.dtype}",
            "Memory Size:\t\t{self.human_memsize}",
            "",
            "Number of NaN:\t{self.num_nan}",
            "Number of Inf:\t{self.num_inf}",
        ]
        pstr_list.extend(
            template.format(self=self) for template in line_templates)

        return pstr_list

    def display(self, short=False):
        """Displaying all relevant statistics.

        Prints the basic statistics and the array structure side by side,
        surrounded by blank lines.

        short -- accepted for interface compatibility; it currently has no
                 effect on the output
        """
        print("")
        pstr_basic = self.display_basic_stats_new()
        # Expand tabs before measuring so that the padding width is computed
        # on the same text that is padded; otherwise rows whose expanded
        # length exceeds the width end at different columns.
        pstr_struct = [line.expandtabs() for line in self.display_struct()]
        n_basic = len(pstr_basic)
        n_struct = len(pstr_struct)

        l_colwidth = max(len(line) for line in pstr_basic) + 1
        r_colwidth = max(len(line) for line in pstr_struct) + 2

        for i in range(max(n_basic, n_struct)):
            left_cell = pstr_basic[i] if i < n_basic else ''
            right_cell = pstr_struct[i] if i < n_struct else ''
            print(
                '| ' + left_cell.ljust(l_colwidth) +
                ' | ' + right_cell.ljust(r_colwidth) + '\t|'
            )

        print("")

    def clear_memory(self):
        """Ensuring the Numpy Array does not exist in memory.

        Removes the instance's ``data`` and ``filt_data`` attributes.
        Afterwards ``self.data`` resolves to the class-level default.
        """
        del self.data
        del self.filt_data
274
275
276
def sizeof_fmt(num, suffix='B'):
    """Render a byte count as a short string with a binary unit prefix.

    The value is divided by 1024 until its magnitude drops below 1024 and
    is then printed with one decimal place, the matching prefix ('', 'Ki',
    'Mi', ... 'Zi') and ``suffix``.  Values too large for 'Zi' are reported
    in 'Yi'.

    Code from Fred Cirera from Stack Overflow:
    https://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    for prefix in prefixes:
        if not abs(num) < 1024.0:
            # Still too large for this prefix: scale down and try the next.
            num /= 1024.0
            continue
        return "%3.1f%s%s" % (num, prefix, suffix)
    return "%.1f%s%s" % (num, 'Yi', suffix)
286
287
288
def kyd(data, full_statistics=False):
    """Print a statistical summary of ``data`` and return its KYD object.

    data -- Numpy Array of Data

    Keyword arguments:
    full_statistics -- request all detailed statistics; when false the
    display is asked for its short form
    (Currently Not Implemented)

    """
    summary = KYD(data)
    # The short form is requested exactly when full statistics are not.
    summary.display(short=not full_statistics)
    return summary
306