1
|
|
|
import pandas as pd |
2
|
|
|
from numpy import array |
3
|
|
|
from sklearn.decomposition import PCA as sklearnPCA |
4
|
|
|
from sklearn.preprocessing import StandardScaler |
5
|
|
|
from sklearn.preprocessing import LabelEncoder |
6
|
|
|
from sklearn.preprocessing import OneHotEncoder |
7
|
|
|
from tabpy.models.utils import setup_utils |
8
|
|
|
|
9
|
|
|
|
10
|
|
|
def PCA(component, _arg1, _arg2, *_argN):
    """
    Principal Component Analysis is a technique that extracts the key
    distinct components from a high dimensional space while attempting
    to capture as much of the variance as possible. For more information
    on the function and how to use it please refer to tabpy-tools.md

    Every argument column is first turned into numeric feature columns:
    numeric columns are used as they are, boolean columns become 0/1 and
    any other column is one-hot encoded. The feature columns are then
    standardized and decomposed, and the requested component is returned.

    Parameters
    ----------
    component : int
        1-based index of the principal component to return. It must not
        exceed the number of feature columns left after encoding.
    _arg1, _arg2, *_argN : sequence
        Input columns. The kind of a column is decided from its first
        element only. Non-numeric columns may hold at most 25 distinct
        values.

    Returns
    -------
    list
        The selected column of the fitted PCA output, converted to a
        plain list.

    Raises
    ------
    ValueError
        If a column is empty, if a non-numeric column has more than 25
        distinct values, or if ``component`` is out of range.
    """
    cols = [_arg1, _arg2, *_argN]
    encoded_cols = []

    for col in cols:
        if len(col) == 0:
            # Fail with a clear message instead of an IndexError on col[0].
            message = "ERROR: Arguments must contain at least one value"
            print(message)
            raise ValueError(message)

        first = col[0]
        if isinstance(first, bool):
            # bool is a subclass of int, so this test has to run before the
            # numeric one or the branch can never be taken.
            encoded_cols.append(array(col).astype(int))
        elif isinstance(first, (int, float)):
            encoded_cols.append(col)
        else:
            if len(set(col)) > 25:
                message = (
                    "ERROR: Non-numeric arguments cannot have more than "
                    "25 unique values"
                )
                print(message)
                raise ValueError(message)

            # The encoders are only built for categorical columns, so purely
            # numeric input never depends on them.
            label_encoder = LabelEncoder()
            try:
                # Newer scikit-learn releases name the dense-output switch
                # ``sparse_output``.
                one_hot_encoder = OneHotEncoder(
                    categories="auto", sparse_output=False
                )
            except TypeError:
                # Older releases only accept the original ``sparse`` keyword.
                one_hot_encoder = OneHotEncoder(categories="auto", sparse=False)

            integer_encoded = label_encoder.fit_transform(array(col))
            integer_encoded = integer_encoded.reshape(len(col), 1)
            one_hot_encoded = one_hot_encoder.fit_transform(integer_encoded)
            # Transpose so that each indicator column becomes one entry.
            encoded_cols.extend(one_hot_encoded.transpose())

    # The upper bound is the number of encoded feature columns, which can be
    # larger than the number of arguments when one-hot encoding expanded some.
    if component <= 0 or component > len(encoded_cols):
        message = (
            "ERROR: Component specified must be > 0 and <= number of arguments"
        )
        print(message)
        raise ValueError(message)

    data = {
        f"col{index}": list(values)
        for index, values in enumerate(encoded_cols, start=1)
    }
    df = pd.DataFrame(data=data, dtype=float)

    # Standardize the features before fitting the PCA.
    scaled_data = StandardScaler().fit_transform(df)
    pca_components = sklearnPCA().fit_transform(scaled_data)

    # ``component`` is 1-based, the column index is 0-based.
    return pca_components[:, component - 1].tolist()
57
|
|
|
|
58
|
|
|
|
59
|
|
|
if __name__ == "__main__":
    # When run as a script, hand the PCA function to
    # setup_utils.deploy_model under the name "PCA".
    setup_utils.deploy_model(
        "PCA",
        PCA,
        "Returns the specified principal component",
    )
61
|
|
|
|