Completed
Push — master ( 8492fe...523f19 )
by Mubdi
01:38
created

knowyourdata.kyd.KYD.__init__()   A

Complexity

Conditions 2

Size

Total Lines 13
Code Lines 9

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
cc 2
eloc 9
nop 2
dl 0
loc 13
rs 9.4285
c 0
b 0
f 0
1
"""
2
KnowYourData
3
============
4
5
A rapid and lightweight module to describe the statistics and structure of
6
data arrays for interactive use.
7
8
The simplest use case is displaying the statistics of a numpy array 'x':
9
10
    >>> from knowyourdata import kyd
11
    >>> kyd(x)
12
13
"""
14
15
import sys
16
import numpy as np
17
18
19
class KYD(object):
    """Summarise the statistics and structure of a data array.

    On construction the input is converted to a numpy array when it is not
    one already, non-finite entries are flagged and filtered out, the array
    structure and basic statistics are recorded as instance attributes, and
    finally the references to the array itself are dropped so that only the
    summary values are kept on the instance.
    """

    # Variable for Data Vector
    data = None

    # Initial Flags
    f_allfinite = False
    f_hasnan = False
    f_hasinf = False

    # Display Settings
    col_width = 10
    precision = 4

    def __init__(self, data):
        """Analyse ``data`` and store its summary on the instance.

        data -- a numpy array, or anything ``np.array`` can convert
        """
        super(KYD, self).__init__()

        # Ensuring that the array is a numpy array
        if not isinstance(data, np.ndarray):
            data = np.array(data)

        self.data = data

        self.check_finite()
        self.check_struct()
        self.get_basic_stats()
        self.clear_memory()

    def check_finite(self):
        """Set the finiteness flags and build ``filt_data``.

        ``filt_data`` is ``data`` itself when every element is finite;
        otherwise it holds only the finite elements, selected with a
        boolean mask.
        """
        finite_mask = np.isfinite(self.data)
        if np.all(finite_mask):
            self.filt_data = self.data
            self.f_allfinite = True
            return

        self.filt_data = self.data[finite_mask]

        if np.any(np.isnan(self.data)):
            self.f_hasnan = True
        if np.any(np.isinf(self.data)):
            self.f_hasinf = True

    def check_struct(self):
        """Record dtype, dimensionality, shape, size and memory footprint."""
        self.dtype = self.data.dtype
        self.ndim = self.data.ndim
        self.shape = self.data.shape
        self.size = self.data.size
        # NOTE(review): sys.getsizeof may not account for the data buffer
        # when the array does not own its memory (e.g. a view) -- confirm
        # whether ``self.data.nbytes`` would be the better measure here.
        self.memsize = sys.getsizeof(self.data)
        self.human_memsize = sizeof_fmt(self.memsize)

    def get_basic_stats(self):
        """Compute summary statistics of the finite data.

        Sets ``min``, ``max``, ``range``, ``mean``, ``std``, ``median``,
        ``firstquartile``, ``thirdquartile`` and the two-element percentile
        pairs ``cl_99``, ``cl_95`` and ``cl_68``. When no finite values are
        available every statistic is set to NaN instead of raising.
        """
        finite = self.filt_data

        if finite.size == 0:
            # min/max of an empty array raise ValueError; report NaN instead
            # so the summary can still be built and displayed.
            nan = np.float64('nan')
            self.min = nan
            self.max = nan
            self.range = nan
            self.mean = nan
            self.std = nan
            self.median = nan
            self.firstquartile = nan
            self.thirdquartile = nan
            self.cl_99 = np.array([nan, nan])
            self.cl_95 = np.array([nan, nan])
            self.cl_68 = np.array([nan, nan])
            return

        # np.float64 is used rather than the np.float_ alias, which is no
        # longer available in NumPy 2.0.
        self.min = np.float64(np.min(finite))
        self.max = np.float64(np.max(finite))
        self.range = self.max - self.min
        self.mean = np.mean(finite)
        self.std = np.std(finite)
        self.median = np.float64(np.median(finite))
        self.firstquartile = np.float64(np.percentile(finite, 25))
        self.thirdquartile = np.float64(np.percentile(finite, 75))
        self.cl_99 = np.float64(
            np.percentile(finite, np.array([0.5, 99.5])))
        self.cl_95 = np.float64(
            np.percentile(finite, np.array([2.5, 97.5])))
        self.cl_68 = np.float64(
            np.percentile(finite, np.array([16.0, 84.0])))

    def _fit_column(self, text):
        """Centre ``text`` within ``col_width`` characters."""
        return "{0:^{1}}".format(text, self.col_width)

    def _label_row(self, labels, width):
        """Build one row of labels, each centred in ``width`` characters."""
        cells = ["{0:^{1}}".format(label, width) for label in labels]
        return self._fit_column("".join(cells))

    def _value_row(self, values, width):
        """Build one row of numbers centred in ``width`` at ``precision``."""
        cells = [
            "{0:^{1}.{2}}".format(value, width, self.precision)
            for value in values
        ]
        return self._fit_column("".join(cells))

    def display_basic_stats(self):
        """Return the lines of the "Basic Statistics" section.

        The result is a list of strings: a bold heading, a blank line, then
        label/value row pairs for mean and standard deviation, for the
        min/quartiles/median/max summary and for the confidence levels,
        with an empty string separating the groups.
        """
        pstr_list = []

        # Heading for Section
        pstr_list.append('\033[1m' + "Basic Statistics  " + '\033[0m')
        pstr_list.append('')

        # Mean and Standard Deviation
        pstr_list.append(self._label_row(("Mean", "Std Dev"), 15))
        pstr_list.append(self._value_row((self.mean, self.std), 15))

        pstr_list.append("")

        # Minimum, quartiles, median and maximum
        pstr_list.append(
            self._label_row(('Min', '1Q', 'Median', '3Q', 'Max'), 10))
        pstr_list.append(self._value_row(
            (self.min, self.firstquartile, self.median,
             self.thirdquartile, self.max),
            10))

        pstr_list.append("")

        # Confidence Levels
        pstr_list.append(self._label_row(
            ('-99 CL', '-95 CL', '-68 CL', '+68 CL', '+95 CL', '+99 CL'),
            10))
        pstr_list.append(self._value_row(
            (self.cl_99[0], self.cl_95[0], self.cl_68[0],
             self.cl_68[1], self.cl_95[1], self.cl_99[1]),
            10))

        return pstr_list

    def display_struct(self):
        """Return the lines of the "Array Structure" section.

        The result is a list of strings: a bold heading, a spacer line, and
        one tab-separated "label: value" line each for the number of
        dimensions, the shape, the dtype and the human-readable memory size.
        """
        return [
            '\033[1m' + "Array Structure  " + '\033[0m',
            "                 ",
            "Number of Dimensions:\t{0}".format(self.ndim),
            "Shape of Dimensions:\t{0}".format(self.shape),
            "Array Data Type:\t{0}".format(self.dtype),
            "Memory Size:\t\t{0}".format(self.human_memsize),
        ]

    def display(self, short=False):
        """Print the statistics and structure sections side by side.

        short -- accepted for interface compatibility; it currently has no
                 effect on the output.
        """
        print()
        pstr_basic = self.display_basic_stats()
        pstr_struct = self.display_struct()
        n_basic = len(pstr_basic)
        n_struct = len(pstr_struct)

        # NOTE: widths come from len(), which also counts the ANSI escape
        # characters embedded in the bold heading lines.
        l_colwidth = max(len(line) for line in pstr_basic) + 1
        r_colwidth = max(len(line) for line in pstr_struct) + 2

        # The shorter section is padded with blank cells so that both
        # columns have the same number of rows.
        for i in range(max(n_basic, n_struct)):
            left = pstr_basic[i] if i < n_basic else ''
            right = pstr_struct[i].expandtabs() if i < n_struct else ''
            print(
                '| ' + left.ljust(l_colwidth) +
                '\t| ' + right.ljust(r_colwidth) + '\t|')

        print()

    def clear_memory(self):
        """Drop the instance's references to the raw and filtered arrays."""
        del self.data
        del self.filt_data
262
263
264
def sizeof_fmt(num, suffix='B'):
    """Format a size as a human readable string with binary prefixes.

    The value is divided by 1024 until its magnitude drops below 1024 and
    is then rendered with one decimal place, followed by the matching
    prefix ('', 'Ki', 'Mi', ...) and ``suffix``. Values too large for 'Zi'
    are reported in 'Yi'.

    Code from Fred Cirera from Stack Overflow:
    https://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    value = num
    index = 0
    while index < len(prefixes):
        if abs(value) < 1024.0:
            return "%3.1f%s%s" % (value, prefixes[index], suffix)
        value = value / 1024.0
        index += 1
    # Ran out of prefixes: whatever is left is expressed in 'Yi'.
    return "%.1f%s%s" % (value, 'Yi', suffix)
274
275
276
def kyd(data, full_statistics=False):
    """Print the statistics of an array and return its summary object

    data -- Numpy Array of Data

    Keyword arguments:
    full_statistics -- printing all detailed statistics of the sources
    (Currently Not Implemented)

    Returns the KYD instance built from ``data``.
    """
    summary = KYD(data)

    # The short layout is requested unless full statistics were asked for.
    if not full_statistics:
        summary.display(short=True)
    else:
        summary.display()

    return summary
294