Test Failed
Pull Request — master (#81)
by
unknown
04:16
created

PCA::eigenDecomposition()   B

Complexity

Conditions 5
Paths 5

Size

Total Lines 28
Code Lines 19

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 28
rs 8.439
c 0
b 0
f 0
cc 5
eloc 19
nc 5
nop 2
1
<?php
2
3
declare(strict_types=1);
4
5
namespace Phpml\DimensionReduction;
6
7
use Phpml\Math\LinAlg\EigenvalueDecomposition;
8
use Phpml\Math\Statistic\Covariance;
9
use Phpml\Math\Statistic\Mean;
10
use Phpml\Math\Matrix;
11
12
class PCA
{
    /**
     * Proportion of the total variance to be conserved after the reduction.
     * Only consulted while $numFeatures is null.
     *
     * @var float
     */
    public $totalVariance = 0.9;

    /**
     * Number of features to be preserved after the reduction. When null,
     * the number of kept components is decided by $totalVariance instead.
     *
     * @var int|null
     */
    public $numFeatures = null;

    /**
     * Mean value of each dimension of the data given to the last fit() call.
     * Kept so that transform() can centre new samples the same way.
     *
     * @var array
     */
    protected $means = [];

    /**
     * Eigenvectors of the covariance matrix that were selected by the last
     * fit() call, ordered from the largest eigenvalue to the smallest.
     *
     * @var array
     */
    protected $eigVectors = [];

    /**
     * Eigenvalues belonging to $eigVectors, in the same order.
     *
     * @var array
     */
    protected $eigValues = [];

    /**
     * Whether fit() has completed at least once.
     *
     * @var bool
     */
    protected $fit = false;

    /**
     * PCA (Principal Component Analysis) is used to explain given data with a
     * lower number of dimensions. The analysis transforms the data to a lower
     * dimensional version of it by conserving a proportion of the total
     * variance within the data. It is a lossy data compression technique.
     *
     * At most one of the two arguments may be given; when both are omitted
     * the default $totalVariance of 0.9 is used.
     *
     * @param float|null $totalVariance Total explained variance to be preserved (0.1 .. 0.99)
     * @param int|null   $numFeatures   Number of features to be preserved (> 0)
     *
     * @throws \Exception When an argument is out of range or both arguments are given
     */
    public function __construct($totalVariance = null, $numFeatures = null)
    {
        if ($totalVariance !== null && ($totalVariance < 0.1 || $totalVariance > 0.99)) {
            throw new \Exception("Total variance can be a value between 0.1 and 0.99");
        }
        if ($numFeatures !== null && $numFeatures <= 0) {
            throw new \Exception("Number of features to be preserved should be greater than 0");
        }
        if ($totalVariance !== null && $numFeatures !== null) {
            throw new \Exception("Either totalVariance or numFeatures should be specified in order to run the algorithm");
        }

        if ($numFeatures !== null) {
            $this->numFeatures = $numFeatures;
        }
        if ($totalVariance !== null) {
            $this->totalVariance = $totalVariance;
        }
    }

    /**
     * Takes a data set and returns a lower dimensional version of it while
     * preserving $totalVariance or $numFeatures.
     * $data is an n-by-m matrix and the returned array is an n-by-k matrix
     * where k <= m.
     *
     * @param array $data List of samples; every sample is a list of m numeric features
     *
     * @return array
     *
     * @throws \Exception When $data contains no samples or no features
     */
    public function fit(array $data)
    {
        if (!isset($data[0]) || !is_array($data[0]) || count($data[0]) === 0) {
            throw new \Exception("PCA::fit() expects a non-empty array of samples with at least one feature");
        }

        $n = count($data[0]);

        // normalize() only computes the means while none are stored, so means
        // left over from an earlier fit() must be dropped; otherwise the new
        // data set would be centred with the statistics of the old one.
        $this->means = [];
        $data = $this->normalize($data, $n);

        // The data is centred at this point, hence the zero-filled second
        // argument (presumably the per-dimension means; confirm against
        // Covariance::covarianceMatrix).
        $covMatrix = Covariance::covarianceMatrix($data, array_fill(0, $n, 0));

        list($this->eigValues, $this->eigVectors) = $this->eigenDecomposition($covMatrix, $n);

        $this->fit = true;

        return $this->reduce($data);
    }

    /**
     * Computes the arithmetic mean of each of the first $n columns of $data
     * and stores the result in $this->means.
     *
     * @param array $data
     * @param int $n Number of columns (features) to process
     */
    protected function calculateMeans(array $data, int $n)
    {
        // Calculate means for each dimension
        $this->means = [];
        for ($i = 0; $i < $n; $i++) {
            $column = array_column($data, $i);
            $this->means[] = Mean::arithmetic($column);
        }
    }

    /**
     * Normalization of the data includes subtracting the mean from
     * each dimension, therefore dimensions will be centered to zero.
     *
     * The means are calculated from $data only while none are stored yet;
     * otherwise the stored means are reused, which is what lets transform()
     * centre new samples with the statistics gathered by fit().
     *
     * @param array $data
     * @param int $n Number of columns (features) to centre
     *
     * @return array
     */
    protected function normalize(array $data, int $n)
    {
        if (empty($this->means)) {
            $this->calculateMeans($data, $n);
        }

        // Normalize data
        foreach ($data as $i => $row) {
            for ($k = 0; $k < $n; $k++) {
                $data[$i][$k] -= $this->means[$k];
            }
        }

        return $data;
    }

    /**
     * Calculates eigenvalues and eigenvectors of the given matrix. Returns
     * the top eigenvectors along with the largest eigenvalues. When
     * $numFeatures is set, exactly that many components are returned (or all
     * of them if fewer exist); otherwise components are added until their
     * total explained variance is no less than $totalVariance.
     *
     * @param array $matrix
     * @param int $n Dimension of $matrix. No longer needed for the selection
     *               itself; retained so the signature stays compatible
     *
     * @return array Two-element list: [eigenvalues, eigenvectors], both
     *               ordered from the largest eigenvalue to the smallest
     */
    protected function eigenDecomposition(array $matrix, int $n)
    {
        $eig = new EigenvalueDecomposition($matrix);
        $eigVals = $eig->getRealEigenvalues();
        $eigVects = $eig->getEigenvectors();
        $totalEigVal = array_sum($eigVals);

        // Do not rely on the decomposition returning its eigenvalues in any
        // particular order: visit the indices by descending eigenvalue. Ties
        // are broken by descending index, so for an ascending input the order
        // is the same as walking the indices backwards.
        $order = array_keys($eigVals);
        usort($order, function ($a, $b) use ($eigVals) {
            return [$eigVals[$b], $b] <=> [$eigVals[$a], $a];
        });

        $explainedVar = 0.0;
        $vectors = [];
        $values = [];
        foreach ($order as $i) {
            // With zero total variance (e.g. constant data) the ratio is
            // undefined; skip it instead of dividing by zero, which means
            // every component is kept in $totalVariance mode.
            if ($totalEigVal != 0) {
                $explainedVar += $eigVals[$i] / $totalEigVal;
            }
            $vectors[] = $eigVects[$i];
            $values[] = $eigVals[$i];

            if ($this->numFeatures !== null) {
                if (count($vectors) >= $this->numFeatures) {
                    break;
                }
            } else {
                if ($explainedVar >= $this->totalVariance) {
                    break;
                }
            }
        }

        return [$values, $vectors];
    }

    /**
     * Returns the reduced data: the given (already centred) data multiplied
     * by the transpose of the selected eigenvector matrix.
     *
     * @param array $data
     *
     * @return array
     */
    protected function reduce(array $data)
    {
        $m1 = new Matrix($data);
        $m2 = new Matrix($this->eigVectors);

        return $m1->multiply($m2->transpose())->toArray();
    }

    /**
     * Transforms the given sample to a lower dimensional vector by using
     * the eigenvectors obtained in the last run of fit().
     *
     * A single sample (a flat list of features) is accepted as well as a
     * list of samples; a single sample is wrapped so that the result is
     * always a list of reduced samples.
     *
     * @param array $sample
     *
     * @return array
     *
     * @throws \Exception When fit() has not been run yet
     */
    public function transform(array $sample)
    {
        if (!$this->fit) {
            throw new \Exception("PCA has not been fitted with respect to original dataset, please run PCA::fit() first");
        }

        if (! is_array($sample[0])) {
            $sample = [$sample];
        }

        $sample = $this->normalize($sample, count($sample[0]));

        return $this->reduce($sample);
    }
}
226