1
|
|
|
"""Dataset functions. |
2
|
|
|
|
3
|
|
|
Includes functions for loading common datasets as pandas DataFrames. |
4
|
|
|
""" |
5
|
|
|
|
6
|
|
|
import io |
7
|
|
|
import gzip |
8
|
|
|
import pkgutil |
9
|
|
|
|
10
|
|
|
import pandas as pd |
11
|
|
|
|
12
|
|
|
# Names of the bundled datasets; each one has a matching load_<name> function.
_datasets = ["iris", "boston"]
16
|
|
|
|
17
|
|
|
def _decompress(bstr: bytes):
    """Turn a gzip-compressed CSV payload into a DataFrame.

    The payload is gunzipped, decoded to text (UTF-8, the default of
    ``bytes.decode``) and then parsed by ``pandas.read_csv``.

    Args:
        bstr:
            Binary string of a CSV with gzip compression.

    Returns:
        Pandas DataFrame parsed from the decompressed CSV text.
    """
    csv_text = gzip.decompress(bstr).decode()
    # Wrap the decoded text in a file-like object so read_csv can consume it.
    return pd.read_csv(io.StringIO(csv_text), encoding="utf-8")
|
|
|
|
33
|
|
|
|
34
|
|
|
def list_datasets():
    """List the names of the available datasets.

    Every name in the result has a corresponding load_<name>
    function in this module that loads that dataset.

    Returns:
        A new list with the available dataset names. It is a copy,
        so mutating it does not affect the module-level registry.
    """
    return list(_datasets)
45
|
|
|
|
46
|
|
|
def load_iris():
    """Load iris dataset.

    Reads the gzip-compressed iris CSV bundled with the package and
    returns it as a Pandas DataFrame, with the ``target`` column
    converted to a categorical dtype.

    Iris dataset: https://archive.ics.uci.edu/ml/datasets/iris

    Returns:
        Iris dataset as a Pandas DataFrame.
    """
    frame = _decompress(pkgutil.get_data('apoor.data', '_data/iris.csv.gz'))
    # Store the class label as a pandas categorical column.
    frame["target"] = frame["target"].astype("category")
    return frame
61
|
|
|
|
62
|
|
|
def load_boston():
    """Load boston housing dataset.

    Reads the gzip-compressed boston housing CSV bundled with the
    package and returns it as a Pandas DataFrame, with the ``CHAS``
    column cast to ``int8`` and the ``MEDV`` column cast to ``int32``.

    Boston Housing dataset: https://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html

    Returns:
        Boston Housing dataset as a Pandas DataFrame.
    """
    compressed = pkgutil.get_data('apoor.data', '_data/boston.csv.gz')
    df = _decompress(compressed)
    # Bracket indexing (rather than attribute access) matches load_iris and
    # cannot be shadowed by DataFrame attributes or methods of the same name.
    df["CHAS"] = df["CHAS"].astype("int8")
    # NOTE(review): an integer cast drops any fractional part -- confirm the
    # packaged MEDV values are whole numbers.
    df["MEDV"] = df["MEDV"].astype("int32")
    return df
78
|
|
|
|
79
|
|
|
|
|
|
|
|
80
|
|
|
|