knowyourdata.kyd   B
last analyzed

Complexity

Total Complexity 36

Size/Duplication

Total Lines 375
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
eloc 228
dl 0
loc 375
rs 8.8
c 0
b 0
f 0
wmc 36

14 Methods

Rating   Name   Duplication   Size   Complexity  
A KYD_datasummary.__repr__() 0 5 1
B KYD.get_basic_stats() 0 25 2
A KYD.clear_memory() 0 4 1
A KYD.check_struct() 0 8 1
A KYD_datasummary._repr_html_() 0 5 1
A KYD_datasummary.__init__() 0 5 1
A KYD.__init__() 0 14 2
A KYD_datasummary.make_txt_struct() 0 55 1
A KYD_datasummary.make_html_repr() 0 3 1
B KYD.check_finite() 0 20 5
A KYD.display() 0 10 3
A KYD.make_summary() 0 3 1
B KYD_datasummary.make_text_repr() 0 40 6
B KYD_datasummary.make_txt_basic_stats() 0 78 5

2 Functions

Rating   Name   Duplication   Size   Complexity  
A kyd() 0 18 2
A sizeof_fmt() 0 10 3
1
"""
2
KnowYourData
3
============
4
5
A rapid and lightweight module to describe the statistics and structure of
6
data arrays for interactive use.
7
8
The simplest use case is to display a summary of a numpy array 'x':
9
10
    >>> from knowyourdata import kyd
11
    >>> kyd(x)
12
13
"""
14
15
import sys
16
import numpy as np
17
from IPython.display import display
18
19
# Getting HTML Template
20
from . import kyd_html_display_template
21
kyd_htmltemplate = kyd_html_display_template.kyd_htmltemplate
0 ignored issues
show
Coding Style Naming introduced by
The name kyd_htmltemplate does not conform to the constant naming conventions ((([A-Z_][A-Z0-9_]*)|(__.*__))$).

This check looks for invalid names for a range of different identifiers.

You can set regular expressions to which the identifiers must conform if the defaults do not match your requirements.

If your project includes a Pylint configuration file, the settings contained in that file take precedence.

To find out more about Pylint, please refer to their site.

Loading history...
22
23
24
class KYD_datasummary(object):
    """Store and display the summary information of a ``KYD`` object.

    On construction the plain-text and HTML representations are built from
    the attributes of ``kyd_class`` and cached in ``text_repr`` and
    ``html_repr``; ``__repr__`` and ``_repr_html_`` simply return them.
    """

    text_repr = ""
    html_repr = ""

    # Display Settings
    col_width = 10
    precision = 4

    def __init__(self, kyd_class):
        """Build both representations for ``kyd_class``.

        kyd_class -- object exposing the statistics and structure attributes
        read by the ``make_*`` methods (mean, std, min, ..., ndim, shape,
        dtype, human_memsize, num_nan, num_inf, precision).
        """
        super(KYD_datasummary, self).__init__()
        self.kyd_class = kyd_class
        self.make_text_repr()
        self.make_html_repr()

    def __repr__(self):
        """
        The Plain String Representation of the Data Summary
        """
        return self.text_repr

    def _repr_html_(self):
        """
        The HTML Representation of the Data Summary
        """
        return self.html_repr

    def make_html_repr(self):
        """Make HTML Representation of Data Summary"""
        self.html_repr = kyd_htmltemplate.format(kyd_class=self.kyd_class)

    def make_txt_basic_stats(self):
        """Make Text Representation of Basic Statistics

        Returns a list of strings: two header lines followed by one
        fixed-width line per table row.
        """

        def fmt(field, sign=""):
            # Every value uses the same spec: the attribute (or indexed
            # attribute) of kyd_class, rendered with kyd_class.precision.
            # ``sign`` is " " for the columns that reserve a leading space.
            spec = "{kyd_class." + field + ":" + sign + ".{kyd_class.precision}}"
            return spec.format(kyd_class=self.kyd_class)

        template_str = (
            " {0:^10} "
            " {1:>8} "
            " {2:<10} "
            " {3:>8} "
            " {4:<10} "
        )

        # The table is described column by column, then transposed into rows.
        columns = [
            ["Mean:", fmt("mean"), "", "Std Dev:", fmt("std")],
            ["Min:", "1Q:", "Median:", "3Q:", "Max:"],
            [
                fmt(name, " ") for name in (
                    "min", "firstquartile", "median", "thirdquartile", "max")
            ],
            ['-99 CI:', '-95 CI:', '-68 CI:', '+68 CI:', '+95 CI:', '+99 CI:'],
            [
                fmt(name, " ") for name in (
                    "ci_99[0]", "ci_95[0]", "ci_68[0]",
                    "ci_68[1]", "ci_95[1]", "ci_99[1]")
            ],
        ]

        # Pad the shorter columns with blanks so all have the same length.
        num_rows = max(len(column) for column in columns)
        for column in columns:
            column.extend([""] * (num_rows - len(column)))

        pstr_list = ["Basic Statistics  ", '']
        pstr_list.extend(template_str.format(*cells) for cells in zip(*columns))
        return pstr_list

    def make_txt_struct(self):
        """Make Text Representation of Array

        Returns a list of strings describing dimensions, shape, dtype,
        memory size and the NaN/Inf counts of ``kyd_class``. Labels and
        values are separated by tab characters.
        """
        templates = [
            "Number of Dimensions:\t{kyd_class.ndim}",
            "Shape of Dimensions:\t{kyd_class.shape}",
            "Array Data Type:\t{kyd_class.dtype}",
            "Memory Size:\t\t{kyd_class.human_memsize}",
            "",  # spacer line between structure and NaN/Inf counts
            "Number of NaN:\t{kyd_class.num_nan}",
            "Number of Inf:\t{kyd_class.num_inf}",
        ]

        pstr_list = ["Array Structure  ", "                 "]
        pstr_list.extend(
            template.format(kyd_class=self.kyd_class) for template in templates)
        return pstr_list

    def make_text_repr(self):
        """Making final text string for plain text representation

        Places the basic-statistics lines (left) next to the array-structure
        lines (right) and stores the result in ``self.text_repr``.
        """
        pstr_basic = self.make_txt_basic_stats()
        pstr_struct = self.make_txt_struct()

        n_basic = len(pstr_basic)
        n_struct = len(pstr_struct)

        l_colwidth = max(len(x) for x in pstr_basic) + 1
        # Measured before tabs are expanded, so an expanded line may be wider
        # than this; such lines are simply not padded.
        r_colwidth = max(len(x) for x in pstr_struct) + 2

        lines = []
        for i in range(max(n_basic, n_struct)):
            left = pstr_basic[i] if i < n_basic else ''
            right = pstr_struct[i].expandtabs() if i < n_struct else ''
            lines.append(
                '| ' + left.ljust(l_colwidth)
                + ' | ' + right.ljust(r_colwidth)
                + '\t|'
            )

        # Leading and trailing blank lines frame the table.
        self.text_repr = "\n" + "".join(line + "\n" for line in lines) + "\n"
231
232
233
class KYD(object):
    """The Central Class for KYD

    Construction converts the input to a numpy array, records finiteness
    flags, array structure and basic statistics, drops the references to
    the array itself and finally builds a ``KYD_datasummary`` in
    ``self.data_summary``.
    """

    # Variable for Data Vector
    data = None

    # Initial Flags
    f_allfinite = False
    f_allnonfinite = False
    f_hasnan = False
    f_hasinf = False

    # Initialized Numbers
    num_nan = 0
    num_inf = 0

    # Display Settings
    col_width = 10
    precision = 4

    def __init__(self, data):
        """Analyse ``data`` and build its summary.

        data -- a numpy array, or anything ``np.array`` can convert.
        """
        super(KYD, self).__init__()

        # Ensuring that the array is a numpy array
        if not isinstance(data, np.ndarray):
            data = np.array(data)

        self.data = data

        self.check_finite()
        self.check_struct()
        self.get_basic_stats()
        self.clear_memory()
        self.make_summary()

    def check_finite(self):
        """Checking to see if all elements are finite and setting flags

        Sets ``filt_data`` to the finite elements of ``data`` and updates
        the ``f_*`` flags and the ``num_nan`` / ``num_inf`` counters.
        """
        finite_mask = np.isfinite(self.data)

        if np.all(finite_mask):
            self.filt_data = self.data
            self.f_allfinite = True
            return

        # Boolean-mask indexing yields a flat array of the finite elements.
        self.filt_data = self.data[finite_mask]

        if self.filt_data.size == 0:
            self.f_allnonfinite = True

        num_nan = int(np.count_nonzero(np.isnan(self.data)))
        if num_nan:
            self.f_hasnan = True
            self.num_nan = num_nan

        num_inf = int(np.count_nonzero(np.isinf(self.data)))
        if num_inf:
            self.f_hasinf = True
            self.num_inf = num_inf

    def check_struct(self):
        """Determining the Structure of the Numpy Array"""
        self.dtype = self.data.dtype
        self.ndim = self.data.ndim
        self.shape = self.data.shape
        self.size = self.data.size
        # NOTE(review): sys.getsizeof is a shallow measure; it may not
        # include the data buffer for arrays that are views -- confirm
        # whether ``self.data.nbytes`` would be the better figure here.
        self.memsize = sys.getsizeof(self.data)
        self.human_memsize = sizeof_fmt(self.memsize)

    def get_basic_stats(self):
        """Get basic statistics about array

        Statistics are computed on ``filt_data`` (the finite elements).
        When there are no finite elements -- all values are NaN/Inf, or the
        array is empty -- every statistic is set to NaN instead.
        """
        if self.f_allnonfinite or self.filt_data.size == 0:
            self.min = self.max = self.range = np.nan
            self.mean = self.std = self.median = np.nan
            self.firstquartile = self.thirdquartile = np.nan
            # Separate arrays so the three intervals never alias each other.
            self.ci_68 = np.array([np.nan, np.nan])
            self.ci_95 = np.array([np.nan, np.nan])
            self.ci_99 = np.array([np.nan, np.nan])
            return

        values = self.filt_data

        # np.float64 is used instead of the np.float_ alias, which no longer
        # exists in NumPy 2.0.
        self.min = np.float64(np.min(values))
        self.max = np.float64(np.max(values))
        self.range = self.max - self.min
        self.mean = np.mean(values)
        self.std = np.std(values)
        self.median = np.float64(np.median(values))
        self.firstquartile = np.float64(np.percentile(values, 25))
        self.thirdquartile = np.float64(np.percentile(values, 75))
        # Each interval is a two-element array: [lower, upper] percentile.
        self.ci_99 = np.asarray(
            np.percentile(values, np.array([0.5, 99.5])), dtype=np.float64)
        self.ci_95 = np.asarray(
            np.percentile(values, np.array([2.5, 97.5])), dtype=np.float64)
        self.ci_68 = np.asarray(
            np.percentile(values, np.array([16.0, 84.0])), dtype=np.float64)

    def make_summary(self):
        """Making Data Summary"""
        self.data_summary = KYD_datasummary(self)

    def clear_memory(self):
        """Ensuring the Numpy Array does not exist in memory

        Removes the instance attributes ``data`` and ``filt_data``; after
        this ``self.data`` falls back to the class default ``None``. Safe to
        call more than once.
        """
        self.__dict__.pop('data', None)
        self.__dict__.pop('filt_data', None)

    def display(self, short=False):
        """Displaying all relevant statistics

        short -- accepted for API compatibility; currently has no effect.
        """
        # get_ipython only exists as a name inside an IPython session; the
        # NameError probe is kept narrow so errors raised while displaying
        # are not mistaken for "not running under IPython".
        try:
            get_ipython
        except NameError:
            print(self.data_summary)
        else:
            display(self.data_summary)
343
344
345
def sizeof_fmt(num, suffix='B'):
    """Return human readable version of in-memory size.

    num -- size to format (a number of bytes when the default suffix is used)
    suffix -- unit suffix appended after the binary prefix

    The value is divided by 1024 until its magnitude drops below 1024 and is
    then printed with one decimal place and the matching binary prefix
    ('', 'Ki', 'Mi', ... 'Zi'); anything larger is reported in 'Yi'.

    Code from Fred Cirera from Stack Overflow:
    https://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size
    """
    for unit in ['', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    # Fell off the end of the prefix list: report in the largest unit.
    return "%.1f%s%s" % (num, 'Yi', suffix)
355
356
357
def kyd(data, full_statistics=False):
    """Print statistics of any numpy array

    data -- Numpy Array of Data

    Keyword arguments:
    full_statistics -- printing all detailed statistics of the sources
    (Currently Not Implemented)

    Returns the ``KYD`` object built from ``data``.
    """
    data_kyd = KYD(data)
    # The short display is requested unless full statistics were asked for.
    data_kyd.display(short=not full_statistics)
    return data_kyd
375