1
|
|
|
<?php |
2
|
|
|
|
3
|
|
|
declare(strict_types=1); |
4
|
|
|
|
5
|
|
|
namespace Phpml\DimensionReduction; |
6
|
|
|
|
7
|
|
|
use Phpml\Math\LinearAlgebra\EigenvalueDecomposition; |
8
|
|
|
use Phpml\Math\Statistic\Covariance; |
9
|
|
|
use Phpml\Math\Statistic\Mean; |
10
|
|
|
use Phpml\Math\Matrix; |
11
|
|
|
|
12
|
|
|
class PCA
{
    /**
     * Total variance to be conserved after the reduction
     *
     * @var float
     */
    public $totalVariance = 0.9;

    /**
     * Number of features to be preserved after the reduction.
     * Stays null when the reduction is driven by $totalVariance instead.
     *
     * @var int|null
     */
    public $numFeatures = null;

    /**
     * Mean value of each dimension (column) of the data given to fit().
     * The same means are subtracted from samples passed to transform().
     *
     * @var array
     */
    protected $means = [];

    /**
     * Eigenvectors of the covariance matrix that were kept by the last fit()
     *
     * @var array
     */
    protected $eigVectors = [];

    /**
     * Top eigenValues of the covariance matrix (same order as $eigVectors)
     *
     * @var array
     */
    protected $eigValues = [];

    /**
     * Whether fit() has completed at least once
     *
     * @var bool
     */
    protected $fit = false;

    /**
     * PCA (Principal Component Analysis) used to explain given
     * data with lower number of dimensions. This analysis transforms the
     * data to a lower dimensional version of it by conserving a proportion of total variance
     * within the data. It is a lossy data compression technique.<br>
     *
     * At most one of the two arguments may be given; when both are omitted
     * the default $totalVariance of 0.9 is used.
     *
     * @param float $totalVariance Total explained variance to be preserved (0.1 .. 0.99)
     * @param int $numFeatures Number of features to be preserved (> 0)
     *
     * @throws \Exception when an argument is out of range or both arguments are given
     */
    public function __construct($totalVariance = null, $numFeatures = null)
    {
        if ($totalVariance !== null && ($totalVariance < 0.1 || $totalVariance > 0.99)) {
            throw new \Exception("Total variance can be a value between 0.1 and 0.99");
        }
        if ($numFeatures !== null && $numFeatures <= 0) {
            throw new \Exception("Number of features to be preserved should be greater than 0");
        }
        if ($totalVariance !== null && $numFeatures !== null) {
            throw new \Exception("Either totalVariance or numFeatures should be specified in order to run the algorithm");
        }

        if ($numFeatures !== null) {
            $this->numFeatures = $numFeatures;
        }
        if ($totalVariance !== null) {
            $this->totalVariance = $totalVariance;
        }
    }

    /**
     * Takes a data and returns a lower dimensional version
     * of this data while preserving $totalVariance or $numFeatures. <br>
     * $data is an n-by-m matrix and returned array is
     * n-by-k matrix where k <= m
     *
     * @param array $data Zero-indexed list of samples, each an array of feature values
     *
     * @return array
     *
     * @throws \Exception when $data has no first sample to read the feature count from
     */
    public function fit(array $data)
    {
        if (!isset($data[0]) || !is_array($data[0])) {
            throw new \Exception("PCA::fit() requires a non-empty, zero-indexed array of samples");
        }

        // The feature count is taken from the first sample
        $n = count($data[0]);

        // Always recompute the means for the dataset being fitted. normalize()
        // only computes them when none are stored, so without this a second
        // call to fit() would centre the new data with the previous means.
        $this->calculateMeans($data, $n);

        $data = $this->normalize($data, $n);

        // Data is already centred, hence the zero-filled means handed over here
        $covMatrix = Covariance::covarianceMatrix($data, array_fill(0, $n, 0));

        list($this->eigValues, $this->eigVectors) = $this->eigenDecomposition($covMatrix, $n);

        $this->fit = true;

        return $this->reduce($data);
    }

    /**
     * Computes the arithmetic mean of each of the first $n columns of $data
     * and stores the result in $this->means (replacing any previous values).
     *
     * @param array $data
     * @param int $n Number of columns (features) to process
     */
    protected function calculateMeans(array $data, int $n)
    {
        // Calculate means for each dimension
        $this->means = [];
        for ($i = 0; $i < $n; $i++) {
            $column = array_column($data, $i);
            $this->means[] = Mean::arithmetic($column);
        }
    }

    /**
     * Normalization of the data includes subtracting mean from
     * each dimension therefore dimensions will be centered to zero
     *
     * Means are computed from $data only when none are stored yet;
     * otherwise the stored means are reused.
     *
     * @param array $data
     * @param int $n Number of columns (features) in each row
     *
     * @return array
     */
    protected function normalize(array $data, int $n)
    {
        if (empty($this->means)) {
            $this->calculateMeans($data, $n);
        }

        // Normalize data
        foreach ($data as $i => $row) {
            for ($k = 0; $k < $n; $k++) {
                $data[$i][$k] -= $this->means[$k];
            }
        }

        return $data;
    }

    /**
     * Calculates eigenValues and eigenVectors of the given matrix. Returns
     * top eigenVectors along with the largest eigenValues. The total explained variance
     * of these eigenVectors will be no less than desired $totalVariance value
     * (or exactly $numFeatures vectors are returned when that option is set).
     *
     * @param array $matrix
     * @param int $n Not used by this implementation; kept for interface compatibility
     *
     * @return array Two-element array: [eigenValues, eigenVectors]
     */
    protected function eigenDecomposition(array $matrix, int $n)
    {
        $eig = new EigenvalueDecomposition($matrix);
        $eigVals = $eig->getRealEigenvalues();
        $eigVects = $eig->getEigenvectors();

        $totalEigVal = array_sum($eigVals);
        // A zero total would make the ratio below a division by zero
        // (DivisionByZeroError on PHP 8); in that case no variance is
        // accumulated and the variance threshold is simply never reached.
        $hasVariance = ((float) $totalEigVal) !== 0.0;

        // Sort eigenvalues in descending order; keys are preserved so that
        // each value can still be matched with its eigenvector
        arsort($eigVals);

        $explainedVar = 0.0;
        $vectors = [];
        $values = [];
        foreach ($eigVals as $i => $eigVal) {
            if ($hasVariance) {
                $explainedVar += $eigVal / $totalEigVal;
            }
            $vectors[] = $eigVects[$i];
            $values[] = $eigVal;

            if ($this->numFeatures !== null) {
                // Loose comparison on purpose: the constructor does not
                // enforce an integer type for $numFeatures
                if (count($vectors) == $this->numFeatures) {
                    break;
                }
            } else {
                if ($explainedVar >= $this->totalVariance) {
                    break;
                }
            }
        }

        return [$values, $vectors];
    }

    /**
     * Returns the reduced data: $data multiplied by the transpose of the
     * matrix built from the stored eigenvectors.
     *
     * @param array $data
     *
     * @return array
     */
    protected function reduce(array $data)
    {
        $m1 = new Matrix($data);
        $m2 = new Matrix($this->eigVectors);

        return $m1->multiply($m2->transpose())->toArray();
    }

    /**
     * Transforms the given sample to a lower dimensional vector by using
     * the eigenVectors obtained in the last run of <code>fit</code>.
     *
     * A single sample (flat array) is wrapped into a one-row matrix, so the
     * result is always a list of rows.
     *
     * @param array $sample A single sample or a list of samples
     *
     * @return array
     *
     * @throws \Exception when called before fit()
     */
    public function transform(array $sample)
    {
        if (!$this->fit) {
            throw new \Exception("PCA has not been fitted with respect to original dataset, please run PCA::fit() first");
        }

        if (! is_array($sample[0])) {
            $sample = [$sample];
        }

        $sample = $this->normalize($sample, count($sample[0]));

        return $this->reduce($sample);
    }
}
229
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.