tabpy.models.scripts.PCA   A
last analyzed

Complexity

Total Complexity 8

Size/Duplication

Total Lines 61
Duplicated Lines 0 %

Importance

Changes 0
Metric Value
wmc 8
eloc 42
dl 0
loc 61
rs 10
c 0
b 0
f 0

1 Function

Rating   Name   Duplication   Size   Complexity  
B PCA() 0 47 8
1
import pandas as pd
2
from numpy import array
3
from sklearn.decomposition import PCA as sklearnPCA
4
from sklearn.preprocessing import StandardScaler
5
from sklearn.preprocessing import LabelEncoder
6
from sklearn.preprocessing import OneHotEncoder
7
from tabpy.models.utils import setup_utils
8
9
10
def PCA(component, _arg1, _arg2, *_argN):
    """
    Return one principal component of the supplied columns.

    Principal Component Analysis is a technique that extracts the key
    distinct components from a high dimensional space while attempting
    to capture as much of the variance as possible. For more information
    on the function and how to use it please refer to tabpy-tools.md

    Each column is classified by its first element. Columns whose first
    element is an int, float or bool are used as they are. Any other
    column is treated as categorical and expanded into one indicator
    column per distinct value (label encoding followed by one-hot
    encoding). All resulting columns are standardised with
    StandardScaler before scikit-learn's PCA is fitted.

    Args:
        component: 1-based index of the principal component to return.
        _arg1: First input column (a sequence of values).
        _arg2: Second input column.
        *_argN: Any further input columns.

    Returns:
        A list holding the value of the selected principal component
        for every row of the input.

    Raises:
        ValueError: If a column is empty, if a non-numeric column has
            more than 25 unique values, or if ``component`` is not
            between 1 and the number of available components.
    """
    cols = [_arg1, _arg2, *_argN]
    encodedCols = []
    # The encoders are only needed for categorical columns, so they are
    # created on first use.
    labelEncoder = None
    oneHotEncoder = None

    for col in cols:
        if len(col) == 0:
            message = "ERROR: Arguments cannot be empty"
            print(message)
            raise ValueError(message)

        # bool is a subclass of int, so boolean columns are numeric here;
        # they are converted together with the other numeric columns when
        # the float DataFrame is built below.
        if isinstance(col[0], (bool, int, float)):
            encodedCols.append(col)
            continue

        if len(set(col)) > 25:
            message = (
                "ERROR: Non-numeric arguments cannot have more than "
                "25 unique values"
            )
            print(message)
            raise ValueError(message)

        if labelEncoder is None:
            labelEncoder = LabelEncoder()
            # Keep the encoder's default (sparse) output and densify it
            # below: the keyword that selects dense output was renamed
            # between scikit-learn releases, ``toarray()`` works on all.
            oneHotEncoder = OneHotEncoder(categories="auto")

        integerEncoded = labelEncoder.fit_transform(array(col))
        integerEncoded = integerEncoded.reshape(len(col), 1)
        oneHotEncoded = oneHotEncoder.fit_transform(integerEncoded).toarray()
        # One row of the transposed matrix per category, i.e. one new
        # indicator column per distinct value.
        encodedCols.extend(oneHotEncoded.transpose())

    dataDict = {
        f"col{position}": list(values)
        for position, values in enumerate(encodedCols, start=1)
    }

    if component <= 0 or component > len(dataDict):
        message = (
            "ERROR: Component specified must be > 0 and "
            "<= number of columns after encoding"
        )
        print(message)
        raise ValueError(message)

    df = pd.DataFrame(data=dataDict, dtype=float)
    scale = StandardScaler()
    scaledData = scale.fit_transform(df)

    pca = sklearnPCA()
    pcaComponents = pca.fit_transform(scaledData)

    # PCA keeps at most min(rows, columns) components, so a request that
    # passed the column check above can still be out of range when there
    # are fewer rows than columns.
    if component > pcaComponents.shape[1]:
        message = (
            "ERROR: Component specified exceeds the number of "
            "principal components available for this data"
        )
        print(message)
        raise ValueError(message)

    return pcaComponents[:, component - 1].tolist()
57
58
59
if __name__ == "__main__":
    # When executed as a script, pass the PCA function to
    # setup_utils.deploy_model under the name "PCA" with its description.
    setup_utils.deploy_model(
        "PCA",
        PCA,
        "Returns the specified principal component",
    )
61