tabpy.models.scripts.PCA.PCA()   B
last analyzed

Complexity

Conditions 8

Size

Total Lines 47
Code Lines 32

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 32
dl 0
loc 47
rs 7.2453
c 0
b 0
f 0
cc 8
nop 4
1
import pandas as pd
2
from numpy import array
3
from sklearn.decomposition import PCA as sklearnPCA
4
from sklearn.preprocessing import StandardScaler
5
from sklearn.preprocessing import LabelEncoder
6
from sklearn.preprocessing import OneHotEncoder
7
from tabpy.models.utils import setup_utils
8
9
10
def PCA(component, _arg1, _arg2, *_argN):
    """
    Return one principal component of the supplied columns.

    Principal Component Analysis is a technique that extracts the key
    distinct components from a high dimensional space while attempting
    to capture as much of the variance as possible. For more information
    on the function and how to use it please refer to tabpy-tools.md

    Each input column is classified by the type of its first element:
    boolean columns are converted to 0/1, numeric columns are used as-is,
    and any other column is label-encoded and then one-hot encoded (one
    derived column per distinct value). The resulting columns are
    standardized with ``StandardScaler`` before the PCA is fitted.

    Parameters
    ----------
    component : int
        1-based index of the principal component to return.
    _arg1, _arg2, *_argN : sequence
        The input columns; at least two are required.

    Returns
    -------
    list
        The requested component, one value per input row.

    Raises
    ------
    ValueError
        If a non-numeric column has more than 25 unique values, or if
        ``component`` is outside the range of available components.
    """
    cols = [_arg1, _arg2, *_argN]
    encodedCols = []

    for col in cols:
        first = col[0]
        # bool is a subclass of int, so it must be tested before the
        # generic numeric check or this branch could never be reached.
        if isinstance(first, bool):
            encodedCols.append(array(col).astype(int))
        elif isinstance(first, (int, float)):
            encodedCols.append(col)
        else:
            if len(set(col)) > 25:
                message = (
                    "ERROR: Non-numeric arguments cannot have more than "
                    "25 unique values"
                )
                print(message)
                raise ValueError(message)
            integerEncoded = LabelEncoder().fit_transform(array(col))
            integerEncoded = integerEncoded.reshape(len(col), 1)
            # Use the encoder's default (sparse) output and densify it here:
            # the ``sparse=False`` keyword is not accepted by newer
            # scikit-learn releases, while ``toarray()`` works on all of them.
            oneHotEncoded = (
                OneHotEncoder(categories="auto")
                .fit_transform(integerEncoded)
                .toarray()
            )
            # One row of the transposed matrix per distinct category.
            encodedCols += list(oneHotEncoded.transpose())

    dataDict = {
        f"col{index}": list(values)
        for index, values in enumerate(encodedCols, start=1)
    }

    if component <= 0 or component > len(dataDict):
        message = (
            "ERROR: Component specified must be >= 1 and "
            "<= number of columns after encoding"
        )
        print(message)
        raise ValueError(message)

    df = pd.DataFrame(data=dataDict, dtype=float)
    scaledData = StandardScaler().fit_transform(df)
    pcaComponents = sklearnPCA().fit_transform(scaledData)

    # PCA keeps at most min(rows, columns) components, so a request that
    # passed the check above can still be out of range for short inputs.
    if component > pcaComponents.shape[1]:
        message = (
            "ERROR: Component specified exceeds the number of principal "
            "components available for this data"
        )
        print(message)
        raise ValueError(message)

    return pcaComponents[:, component - 1].tolist()
57
58
59
if __name__ == "__main__":
    # Runs only when this file is executed as a script: hands the PCA
    # function to setup_utils.deploy_model under the name "PCA".
    setup_utils.deploy_model(
        "PCA",
        PCA,
        "Returns the specified principal component",
    )
61