Passed
Pull Request — master (#156)
by Tomáš
03:56
created

PCA::calculateMeans()   A

Complexity

Conditions 2
Paths 2

Size

Total Lines 9
Code Lines 5

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 9
rs 9.6666
c 0
b 0
f 0
cc 2
eloc 5
nc 2
nop 2
1
<?php
2
3
declare(strict_types=1);
4
5
namespace Phpml\DimensionReduction;
6
7
use Exception;
8
use Phpml\Math\Statistic\Covariance;
9
use Phpml\Math\Statistic\Mean;
10
11
class PCA extends EigenTransformerBase
{
    /**
     * Mean value of each dimension, computed from the dataset given to the
     * most recent fit() call. transform() reuses these values so that new
     * samples are centered exactly like the training data.
     *
     * @var array
     */
    protected $means = [];

    /**
     * Set to true once fit() has completed; transform() refuses to run before that.
     *
     * @var bool
     */
    protected $fit = false;

    /**
     * PCA (Principal Component Analysis) is used to explain given data with a
     * lower number of dimensions. The analysis transforms the data to a lower
     * dimensional version of it while conserving a proportion of the total
     * variance within the data. It is a lossy data compression technique.
     *
     * At most one of the two arguments may be given. Passing neither is accepted
     * by this constructor; the stored values are then left untouched
     * (presumably at the defaults of the parent class - confirm there).
     *
     * @param float $totalVariance Total explained variance to be preserved (0.1 to 0.99)
     * @param int   $numFeatures   Number of features to be preserved (greater than 0)
     *
     * @throws \Exception when an argument is out of range or both arguments are given
     */
    public function __construct(?float $totalVariance = null, ?int $numFeatures = null)
    {
        if ($totalVariance !== null && ($totalVariance < 0.1 || $totalVariance > 0.99)) {
            throw new Exception('Total variance can be a value between 0.1 and 0.99');
        }

        if ($numFeatures !== null && $numFeatures <= 0) {
            throw new Exception('Number of features to be preserved should be greater than 0');
        }

        // The two stopping criteria are mutually exclusive.
        if ($totalVariance !== null && $numFeatures !== null) {
            throw new Exception('Either totalVariance or numFeatures should be specified in order to run the algorithm');
        }

        if ($numFeatures !== null) {
            $this->numFeatures = $numFeatures;
        }

        if ($totalVariance !== null) {
            $this->totalVariance = $totalVariance;
        }
    }

    /**
     * Takes a dataset and returns a lower dimensional version of it while
     * preserving $totalVariance or $numFeatures.
     *
     * $data is an n-by-m matrix (n samples, m features) and the returned array
     * is an n-by-k matrix where k <= m.
     *
     * Every call re-estimates the per-dimension means from $data, so the model
     * can safely be fitted again on a different dataset.
     *
     * @throws \Exception when $data has no first row to read the feature count from
     */
    public function fit(array $data): array
    {
        if (!isset($data[0]) || !is_array($data[0])) {
            throw new Exception('PCA::fit() requires a non-empty two-dimensional dataset');
        }

        // Number of features (columns), taken from the first sample.
        $n = count($data[0]);

        // Always recompute the means for the dataset at hand. normalize() only
        // computes them when none are stored, which would otherwise make a
        // second fit() center the new data with the previous dataset's means.
        $this->calculateMeans($data, $n);

        $data = $this->normalize($data, $n);

        // The data is already centered, so zeros are passed as the column means.
        $covMatrix = Covariance::covarianceMatrix($data, array_fill(0, $n, 0));

        $this->eigenDecomposition($covMatrix);

        $this->fit = true;

        return $this->reduce($data);
    }

    /**
     * Transforms the given sample (or list of samples) to a lower dimensional
     * vector by using the eigenvectors obtained in the last run of fit().
     *
     * A single sample given as a flat array is wrapped into a one-row matrix
     * before being processed.
     *
     * @throws \Exception when fit() has not been called yet
     */
    public function transform(array $sample): array
    {
        if (!$this->fit) {
            throw new Exception('PCA has not been fitted with respect to original dataset, please run PCA::fit() first');
        }

        // Accept both a single sample and a matrix of samples.
        if (!is_array($sample[0])) {
            $sample = [$sample];
        }

        // Centers the sample with the means stored by fit().
        $sample = $this->normalize($sample, count($sample[0]));

        return $this->reduce($sample);
    }

    /**
     * Computes the arithmetic mean of each of the first $n columns of $data
     * and stores the result in $this->means, replacing any previous values.
     *
     * @param array $data Matrix of samples (one row per sample)
     * @param int   $n    Number of columns to process
     */
    protected function calculateMeans(array $data, int $n): void
    {
        $this->means = [];
        for ($i = 0; $i < $n; ++$i) {
            $column = array_column($data, $i);
            $this->means[] = Mean::arithmetic($column);
        }
    }

    /**
     * Normalization of the data consists of subtracting the stored mean from
     * each dimension, so that the dimensions are centered around zero.
     *
     * When no means are stored yet, they are computed from $data first.
     *
     * @param array $data Matrix of samples (one row per sample)
     * @param int   $n    Number of columns to center in every row
     *
     * @return array The centered copy of $data
     */
    protected function normalize(array $data, int $n): array
    {
        if (empty($this->means)) {
            $this->calculateMeans($data, $n);
        }

        // Subtract the per-column mean from every value.
        foreach ($data as $i => $row) {
            for ($k = 0; $k < $n; ++$k) {
                $data[$i][$k] -= $this->means[$k];
            }
        }

        return $data;
    }
}
131