|
1
|
|
|
<?php |
|
2
|
|
|
|
|
3
|
|
|
declare(strict_types=1); |
|
4
|
|
|
|
|
5
|
|
|
namespace Phpml\DimensionReduction; |
|
6
|
|
|
|
|
7
|
|
|
use Phpml\Math\Distance\Euclidean; |
|
8
|
|
|
use Phpml\Math\Distance\Manhattan; |
|
9
|
|
|
use Phpml\Math\Matrix; |
|
10
|
|
|
|
|
11
|
|
|
class KernelPCA extends PCA |
|
12
|
|
|
{ |
|
13
|
|
|
const KERNEL_RBF = 1; |
|
14
|
|
|
const KERNEL_SIGMOID = 2; |
|
15
|
|
|
const KERNEL_LAPLACIAN = 3; |
|
16
|
|
|
|
|
17
|
|
|
/** |
|
18
|
|
|
* Selected kernel function |
|
19
|
|
|
* |
|
20
|
|
|
* @var int |
|
21
|
|
|
*/ |
|
22
|
|
|
protected $kernel; |
|
23
|
|
|
|
|
24
|
|
|
/** |
|
25
|
|
|
* Gamma value used by the kernel |
|
26
|
|
|
* |
|
27
|
|
|
* @var float |
|
28
|
|
|
*/ |
|
29
|
|
|
protected $gamma; |
|
30
|
|
|
|
|
31
|
|
|
/** |
|
32
|
|
|
* Original dataset used to fit KernelPCA |
|
33
|
|
|
* |
|
34
|
|
|
* @var array |
|
35
|
|
|
*/ |
|
36
|
|
|
protected $data; |
|
37
|
|
|
|
|
38
|
|
|
/** |
|
39
|
|
|
* |
|
40
|
|
|
* @param int $kernel |
|
41
|
|
|
* @param float $totalVariance |
|
42
|
|
|
* @param int $numFeatures |
|
43
|
|
|
* @param float $gamma |
|
44
|
|
|
* |
|
45
|
|
|
* @throws \Exception |
|
46
|
|
|
*/ |
|
47
|
|
|
public function __construct(int $kernel = self::KERNEL_RBF, $totalVariance = null, $numFeatures = null, $gamma = null) |
|
48
|
|
|
{ |
|
49
|
|
|
if (! in_array($kernel, [self::KERNEL_RBF, self::KERNEL_SIGMOID, self::KERNEL_LAPLACIAN])) { |
|
50
|
|
|
throw new \Exception("KernelPCA can be initialized with the following kernels only: RBF, Sigmoid and Laplacian"); |
|
51
|
|
|
} |
|
52
|
|
|
|
|
53
|
|
|
parent::__construct($totalVariance, $numFeatures); |
|
54
|
|
|
|
|
55
|
|
|
$this->kernel = $kernel; |
|
56
|
|
|
$this->gamma = $gamma; |
|
57
|
|
|
} |
|
58
|
|
|
|
|
59
|
|
|
/** |
|
60
|
|
|
* Takes a data and returns a lower dimensional version |
|
61
|
|
|
* of this data while preserving $totalVariance or $numFeatures. <br> |
|
62
|
|
|
* $data is an n-by-m matrix and returned array is |
|
63
|
|
|
* n-by-k matrix where k <= m |
|
64
|
|
|
* |
|
65
|
|
|
* @param array $data |
|
66
|
|
|
* |
|
67
|
|
|
* @return array |
|
68
|
|
|
*/ |
|
69
|
|
|
public function fit(array $data) |
|
70
|
|
|
{ |
|
71
|
|
|
$numRows = count($data); |
|
72
|
|
|
$numCols = count($data[0]); |
|
73
|
|
|
|
|
74
|
|
|
if ($this->gamma === null) { |
|
75
|
|
|
$this->gamma = 1.0 / $numRows; |
|
76
|
|
|
} |
|
77
|
|
|
|
|
78
|
|
|
$this->data = $this->normalize($data, $numCols); |
|
79
|
|
|
$matrix = $this->calculateKernelMatrix($this->data, $numRows); |
|
80
|
|
|
$matrix = $this->centerMatrix($matrix, $numRows); |
|
81
|
|
|
|
|
82
|
|
|
list($this->eigValues, $this->eigVectors) = $this->eigenDecomposition($matrix, $numRows); |
|
83
|
|
|
|
|
84
|
|
|
$this->fit = true; |
|
85
|
|
|
|
|
86
|
|
|
return Matrix::transposeArray($this->eigVectors); |
|
87
|
|
|
} |
|
88
|
|
|
|
|
89
|
|
|
/** |
|
90
|
|
|
* Calculates similarity matrix by use of selected kernel function<br> |
|
91
|
|
|
* An n-by-m matrix is given and an n-by-n matrix is returned |
|
92
|
|
|
* |
|
93
|
|
|
* @param array $data |
|
94
|
|
|
* @param int $numRows |
|
95
|
|
|
* |
|
96
|
|
|
* @return array |
|
97
|
|
|
*/ |
|
98
|
|
|
protected function calculateKernelMatrix(array $data, int $numRows) |
|
99
|
|
|
{ |
|
100
|
|
|
$kernelFunc = $this->getKernel(); |
|
101
|
|
|
|
|
102
|
|
|
$matrix = []; |
|
103
|
|
|
for ($i=0; $i < $numRows; $i++) { |
|
104
|
|
|
for ($k=0; $k < $numRows; $k++) { |
|
105
|
|
|
if ($i <= $k) { |
|
106
|
|
|
$matrix[$i][$k] = $kernelFunc($data[$i], $data[$k]); |
|
107
|
|
|
} else { |
|
108
|
|
|
$matrix[$i][$k] = $matrix[$k][$i]; |
|
109
|
|
|
} |
|
110
|
|
|
} |
|
111
|
|
|
} |
|
112
|
|
|
|
|
113
|
|
|
return $matrix; |
|
114
|
|
|
} |
|
115
|
|
|
|
|
116
|
|
|
/** |
|
117
|
|
|
* Kernel matrix is centered in its original space by using the following |
|
118
|
|
|
* conversion: |
|
119
|
|
|
* |
|
120
|
|
|
* K′ = K− N . K − K.N + N.K.N where N is n-by-n matrix filled with 1/n |
|
121
|
|
|
* |
|
122
|
|
|
* @param array $matrix |
|
123
|
|
|
* @param int $n |
|
124
|
|
|
*/ |
|
125
|
|
|
protected function centerMatrix(array $matrix, int $n) |
|
126
|
|
|
{ |
|
127
|
|
|
$N = array_fill(0, $n, array_fill(0, $n, 1.0/$n)); |
|
128
|
|
|
$N = new Matrix($N, false); |
|
129
|
|
|
$K = new Matrix($matrix, false); |
|
130
|
|
|
|
|
131
|
|
|
// K.N (This term is repeated so we cache it once) |
|
132
|
|
|
$K_N = $K->multiply($N); |
|
133
|
|
|
// N.K |
|
134
|
|
|
$N_K = $N->multiply($K); |
|
135
|
|
|
// N.K.N |
|
136
|
|
|
$N_K_N = $N->multiply($K_N); |
|
137
|
|
|
|
|
138
|
|
|
return $K->subtract($N_K) |
|
139
|
|
|
->subtract($K_N) |
|
140
|
|
|
->add($N_K_N) |
|
141
|
|
|
->toArray(); |
|
142
|
|
|
} |
|
143
|
|
|
|
|
144
|
|
|
/** |
|
145
|
|
|
* Returns the callable kernel function |
|
146
|
|
|
* |
|
147
|
|
|
* @return \Closure |
|
148
|
|
|
*/ |
|
149
|
|
|
protected function getKernel() |
|
150
|
|
|
{ |
|
151
|
|
|
/** |
|
152
|
|
|
* Each kernel function operates in two modes: |
|
153
|
|
|
* 1) k(x, y) mode: x and y is given |
|
154
|
|
|
* 2) k(x) mode: only x is given |
|
155
|
|
|
*/ |
|
156
|
|
|
switch ($this->kernel) { |
|
157
|
|
View Code Duplication |
case self::KERNEL_RBF: |
|
|
|
|
|
|
158
|
|
|
// k(x,y)=exp(-γ.|x-y|) where |..| is Euclidean distance |
|
159
|
|
|
$dist = new Euclidean(); |
|
160
|
|
|
return function ($x, $y = null) use ($dist) { |
|
161
|
|
|
if ($y === null) { |
|
162
|
|
|
foreach ($x as $i => $element) { |
|
163
|
|
|
$x[$i] = exp(-$this->gamma * $element); |
|
164
|
|
|
} |
|
165
|
|
|
|
|
166
|
|
|
return $x; |
|
167
|
|
|
} |
|
168
|
|
|
|
|
169
|
|
|
return exp(-$this->gamma * $dist->distance($x, $y)); |
|
170
|
|
|
}; |
|
171
|
|
|
|
|
172
|
|
|
case self::KERNEL_SIGMOID: |
|
173
|
|
|
// k(x,y)=tanh(γ.xT.y+c0) where c0=0 |
|
174
|
|
|
return function ($x, $y = null) { |
|
175
|
|
|
if ($y === null) { |
|
176
|
|
|
foreach ($x as $i => $element) { |
|
177
|
|
|
$x[$i] = tanh($this->gamma * $element); |
|
178
|
|
|
} |
|
179
|
|
|
|
|
180
|
|
|
return $x; |
|
181
|
|
|
} |
|
182
|
|
|
|
|
183
|
|
|
$x = new Matrix($x); |
|
184
|
|
|
$y = new Matrix($y); |
|
185
|
|
|
$res = $x->multiply($y->transpose()); |
|
186
|
|
|
|
|
187
|
|
|
return tanh($this->gamma * $res->toScalar()); |
|
188
|
|
|
}; |
|
189
|
|
|
|
|
190
|
|
View Code Duplication |
case self::KERNEL_LAPLACIAN: |
|
|
|
|
|
|
191
|
|
|
// k(x,y)=exp(-γ.|x-y|) where |..| is Manhattan distance |
|
192
|
|
|
$dist = new Manhattan(); |
|
193
|
|
|
return function ($x, $y = null) use ($dist) { |
|
194
|
|
|
if ($y === null) { |
|
195
|
|
|
foreach ($x as $i => $element) { |
|
196
|
|
|
$x[$i] = exp(-$this->gamma * $element); |
|
197
|
|
|
} |
|
198
|
|
|
|
|
199
|
|
|
return $x; |
|
200
|
|
|
} |
|
201
|
|
|
|
|
202
|
|
|
return exp(-$this->gamma * $dist->distance($x, $y)); |
|
203
|
|
|
}; |
|
204
|
|
|
} |
|
205
|
|
|
} |
|
206
|
|
|
|
|
207
|
|
|
/** |
|
208
|
|
|
* @param array $sample |
|
209
|
|
|
* |
|
210
|
|
|
* @return array |
|
211
|
|
|
*/ |
|
212
|
|
|
protected function getDistancePairs(array $sample) |
|
213
|
|
|
{ |
|
214
|
|
|
$kernel = $this->getKernel(); |
|
215
|
|
|
$pairs = []; |
|
216
|
|
|
foreach ($this->data as $row) { |
|
217
|
|
|
$pairs[] = $kernel($row, $sample); |
|
218
|
|
|
} |
|
219
|
|
|
|
|
220
|
|
|
return $pairs; |
|
221
|
|
|
} |
|
222
|
|
|
|
|
223
|
|
|
/** |
|
224
|
|
|
* @param array $pairs |
|
225
|
|
|
* |
|
226
|
|
|
* @return array |
|
227
|
|
|
*/ |
|
228
|
|
|
protected function projectSample(array $pairs) |
|
229
|
|
|
{ |
|
230
|
|
|
// Normalize eigenvectors by eig = eigVectors / eigValues |
|
231
|
|
|
$func = function ($eigVal, $eigVect) { |
|
232
|
|
|
$m = new Matrix($eigVect, false); |
|
233
|
|
|
$a = $m->divideByScalar(sqrt($eigVal))->toArray(); |
|
234
|
|
|
|
|
235
|
|
|
return $a[0]; |
|
236
|
|
|
}; |
|
237
|
|
|
$eig = array_map($func, $this->eigValues, $this->eigVectors); |
|
238
|
|
|
|
|
239
|
|
|
// return k.dot(eig) |
|
240
|
|
|
return Matrix::dot($pairs, $eig); |
|
241
|
|
|
} |
|
242
|
|
|
|
|
243
|
|
|
/** |
|
244
|
|
|
* Transforms the given sample to a lower dimensional vector by using |
|
245
|
|
|
* the variables obtained during the last run of <code>fit</code>. |
|
246
|
|
|
* |
|
247
|
|
|
* @param array $sample |
|
248
|
|
|
* |
|
249
|
|
|
* @return array |
|
250
|
|
|
*/ |
|
251
|
|
|
public function transform(array $sample) |
|
252
|
|
|
{ |
|
253
|
|
|
if (!$this->fit) { |
|
254
|
|
|
throw new \Exception("KernelPCA has not been fitted with respect to original dataset, please run KernelPCA::fit() first"); |
|
255
|
|
|
} |
|
256
|
|
|
|
|
257
|
|
|
if (is_array($sample[0])) { |
|
258
|
|
|
throw new \Exception("KernelPCA::transform() accepts only one-dimensional arrays"); |
|
259
|
|
|
} |
|
260
|
|
|
|
|
261
|
|
|
$sample = $this->normalize([$sample], count($sample)); |
|
262
|
|
|
$sample = $sample[0]; |
|
263
|
|
|
|
|
264
|
|
|
$pairs = $this->getDistancePairs($sample); |
|
265
|
|
|
|
|
266
|
|
|
return $this->projectSample($pairs); |
|
267
|
|
|
} |
|
268
|
|
|
} |
|
269
|
|
|
|
Duplicated code is one of the most pungent code smells. If you need to duplicate the same code in three or more different places, we strongly encourage you to look into extracting the code into a single class or operation.
You can also find more detailed suggestions in the “Code” section of your repository.