Completed
Push — master ( 15cfe4...5ce4a0 )
by Mubdi
03:12
created

knowyourdata.kyd.sizeof_fmt()   A

Complexity

Conditions 3

Size

Total Lines 10
Code Lines 6

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 3
eloc 6
nop 2
dl 0
loc 10
rs 9.4285
c 0
b 0
f 0
1
"""
2
KnowYourData
3
============
4
5
A rapid and lightweight module to describe the statistics and structure of
6
data arrays for interactive use.
7
8
The simplest use case is displaying a summary of a numpy array 'x':
9
10
    >>> from knowyourdata import kyd
11
    >>> kyd(x)
12
13
"""
14
15
import sys
16
import numpy as np
17
18
19
class KYD(object):
    """The Central Class for KYD.

    On construction the input is converted to a numpy array if needed,
    finiteness flags, structural information and basic statistics are
    computed, and the references to the array itself are then dropped so
    that only the summary values remain on the instance.
    """

    # Variable for Data Vector (the instance value is removed again by
    # clear_memory, after which lookups fall back to this class default)
    data = None

    # Initial Flags
    f_allfinite = False
    f_allnonfinite = False
    f_hasnan = False
    f_hasinf = False

    # Display Settings
    col_width = 10
    precision = 4

    def __init__(self, data):
        """Summarise `data`; non-ndarray input is passed through np.array."""
        super(KYD, self).__init__()

        # Ensuring that the array is a numpy array
        if not isinstance(data, np.ndarray):
            data = np.array(data)

        self.data = data

        self.check_finite()
        self.check_struct()
        self.get_basic_stats()
        self.clear_memory()

    def check_finite(self):
        """Set the finiteness flags and store the finite elements.

        `filt_data` is `data` itself when every element is finite, otherwise
        a one-dimensional array holding only the finite elements.
        """
        finite_mask = np.isfinite(self.data)

        if np.all(finite_mask):
            self.filt_data = self.data
            self.f_allfinite = True
            return

        # Boolean-mask indexing selects the same elements as
        # data[np.where(mask)] but avoids calling nonzero on a
        # zero-dimensional mask, which NumPy has deprecated.
        self.filt_data = self.data[finite_mask]

        if self.filt_data.size == 0:
            self.f_allnonfinite = True

        if np.any(np.isnan(self.data)):
            self.f_hasnan = True
        if np.any(np.isinf(self.data)):
            self.f_hasinf = True

    def check_struct(self):
        """Determining the Structure of the Numpy Array"""
        self.dtype = self.data.dtype
        self.ndim = self.data.ndim
        self.shape = self.data.shape
        self.size = self.data.size
        # sys.getsizeof only includes the element buffer for arrays that own
        # their data; for a view it is just the object header. Taking the
        # larger of the two keeps the value for owning arrays and stops views
        # from being reported as a few dozen bytes.
        self.memsize = max(sys.getsizeof(self.data), self.data.nbytes)
        self.human_memsize = sizeof_fmt(self.memsize)

    def get_basic_stats(self):
        """Compute min/max/range, mean/std, quartiles and confidence levels.

        All statistics are taken over `filt_data` (the finite elements). When
        there are no finite elements -- including an empty input array --
        every statistic is set to NaN instead of letting numpy raise.
        """
        if self.f_allnonfinite or self.filt_data.size == 0:
            self.min = self.max = self.range = np.nan
            self.mean = self.std = self.median = np.nan
            self.firstquartile = self.thirdquartile = np.nan
            # Separate arrays so that modifying one level cannot silently
            # change the other two.
            self.cl_68 = np.array([np.nan, np.nan])
            self.cl_95 = np.array([np.nan, np.nan])
            self.cl_99 = np.array([np.nan, np.nan])
            return

        finite = self.filt_data

        # np.float64 is used instead of the np.float_ alias, which was
        # removed in NumPy 2.0.
        self.min = np.float64(np.min(finite))
        self.max = np.float64(np.max(finite))
        self.range = self.max - self.min
        self.mean = np.mean(finite)
        self.std = np.std(finite)
        self.median = np.float64(np.median(finite))
        self.firstquartile = np.float64(np.percentile(finite, 25))
        self.thirdquartile = np.float64(np.percentile(finite, 75))
        # Each confidence level is a two-element array: [lower, upper]
        self.cl_99 = np.float64(
            np.percentile(finite, np.array([0.5, 99.5])))
        self.cl_95 = np.float64(
            np.percentile(finite, np.array([2.5, 97.5])))
        self.cl_68 = np.float64(
            np.percentile(finite, np.array([16.0, 84.0])))

    def _center(self, text):
        """Centre `text` in `col_width` characters (no-op if already wider)."""
        return "{0:^{1}}".format(text, self.col_width)

    def _label_row(self, labels, width):
        """Join `labels`, each centred in a cell of `width` characters."""
        cell_spec = "^{0}".format(width)
        return self._center(
            "".join(format(label, cell_spec) for label in labels))

    def _value_row(self, values, width):
        """Join `values`, each centred in `width` chars at `precision`."""
        cell_spec = "^{0}.{1}".format(width, self.precision)
        return self._center(
            "".join(format(value, cell_spec) for value in values))

    def display_basic_stats(self):
        """Return the lines of the "Basic Statistics" section as a list."""
        return [
            # Heading for Section (bold via ANSI escape codes)
            '\033[1m' + "Basic Statistics  " + '\033[0m',
            '',

            # Mean and Standard Deviation
            self._label_row(("Mean", "Std Dev"), 15),
            self._value_row((self.mean, self.std), 15),
            "",

            # Three point statistics
            self._label_row(('Min', '1Q', 'Median', '3Q', 'Max'), 10),
            self._value_row(
                (self.min, self.firstquartile, self.median,
                 self.thirdquartile, self.max), 10),
            "",

            # Confidence Levels
            self._label_row(
                ('-99 CL', '-95 CL', '-68 CL', '+68 CL', '+95 CL', '+99 CL'),
                10),
            self._value_row(
                (self.cl_99[0], self.cl_95[0], self.cl_68[0],
                 self.cl_68[1], self.cl_95[1], self.cl_99[1]), 10),
        ]

    def display_struct(self):
        """Return the lines of the "Array Structure" section as a list."""
        return [
            '\033[1m' + "Array Structure  " + '\033[0m',
            "                 ",
            "Number of Dimensions:\t{0}".format(self.ndim),
            "Shape of Dimensions:\t{0}".format(self.shape),
            "Array Data Type:\t{0}".format(self.dtype),
            "Memory Size:\t\t{0}".format(self.human_memsize),
        ]

    def display(self, short=False):
        """Print the statistics and structure sections side by side.

        `short` is accepted for interface compatibility; it does not change
        the output.
        """
        print("")
        left_rows = self.display_basic_stats()
        right_rows = self.display_struct()

        l_colwidth = max(len(row) for row in left_rows) + 1
        r_colwidth = max(len(row) for row in right_rows) + 2

        # Pad the shorter section with blank rows so both line up
        n_rows = max(len(left_rows), len(right_rows))
        left_rows += [''] * (n_rows - len(left_rows))
        right_rows += [''] * (n_rows - len(right_rows))

        for left, right in zip(left_rows, right_rows):
            print(
                '| ' + left.ljust(l_colwidth) +
                '\t| ' + right.expandtabs().ljust(r_colwidth) +
                '\t|')

        print("")

    def clear_memory(self):
        """Ensuring the Numpy Array does not exist in memory"""
        # Removing the entries from the instance dict (instead of `del`)
        # makes a repeated call harmless; __init__ already calls this once.
        self.__dict__.pop('data', None)
        self.__dict__.pop('filt_data', None)
276
277
278
def sizeof_fmt(num, suffix='B'):
    """Format a byte count with a binary (1024-based) unit prefix.

    The value is divided by 1024 until its magnitude drops below 1024 or the
    prefixes up to 'Zi' are exhausted, in which case 'Yi' is used.

    Code from Fred Cirera from Stack Overflow:
    https://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    scaled = num
    step = 0
    while step < len(prefixes):
        if abs(scaled) < 1024.0:
            return "%3.1f%s%s" % (scaled, prefixes[step], suffix)
        scaled /= 1024.0
        step += 1
    # Anything still >= 1024 Zi is reported in Yi, however large
    return "%.1f%s%s" % (scaled, 'Yi', suffix)
288
289
290
def kyd(data, full_statistics=False):
    """Build a KYD summary of `data`, print it, and return the KYD object.

    data -- Numpy Array of Data

    Keyword arguments:
    full_statistics -- printing all detailed statistics of the sources
    (Currently Not Implemented)

    """
    summary = KYD(data)
    # The short form is requested whenever full statistics are not
    summary.display(short=not full_statistics)
    return summary
308