These results are based on our legacy PHP analysis; consider migrating to our new PHP analysis engine instead. Learn more
1 | <?php |
||
2 | |||
3 | declare(strict_types=1); |
||
4 | |||
5 | namespace Phpml\Clustering; |
||
6 | |||
7 | use Phpml\Clustering\KMeans\Cluster; |
||
8 | use Phpml\Clustering\KMeans\Point; |
||
9 | use Phpml\Clustering\KMeans\Space; |
||
10 | use Phpml\Exception\InvalidArgumentException; |
||
11 | use Phpml\Math\Distance\Euclidean; |
||
12 | |||
/**
 * Fuzzy C-Means clusterer.
 *
 * Every sample receives a membership degree for each cluster. The membership
 * matrix and the cluster centers are refined alternately until the objective
 * value stabilises or the iteration cap is reached; cluster() then attaches
 * each sample to the cluster in which its membership is highest.
 */
class FuzzyCMeans implements Clusterer
{
    /**
     * @var int Number of clusters to build
     */
    private $clustersNumber;

    /**
     * @var Cluster[]
     */
    private $clusters = [];

    /**
     * @var Space
     */
    private $space;

    /**
     * @var float[][] Membership degrees indexed as [cluster][sample]
     */
    private $membership = [];

    /**
     * @var float Fuzziness exponent (greater than 1)
     */
    private $fuzziness;

    /**
     * @var float Minimum change of the objective value required to keep iterating
     */
    private $epsilon;

    /**
     * @var int Upper bound on the number of refinement passes
     */
    private $maxIterations;

    /**
     * @var int
     */
    private $sampleCount;

    /**
     * @var array Samples of the current run, re-indexed from 0
     */
    private $samples = [];

    /**
     * @var Euclidean Distance metric shared by all distance computations
     */
    private $distanceMetric;

    /**
     * @param int   $clustersNumber Number of clusters, must be positive
     * @param float $fuzziness      Fuzziness exponent, must be greater than 1
     * @param float $epsilon        Convergence threshold on the objective value
     * @param int   $maxIterations  Maximum number of refinement passes
     *
     * @throws InvalidArgumentException
     */
    public function __construct(int $clustersNumber, float $fuzziness = 2.0, float $epsilon = 1e-2, int $maxIterations = 100)
    {
        if ($clustersNumber <= 0) {
            throw new InvalidArgumentException('Invalid clusters number');
        }

        // The membership update raises distance ratios to 2 / (fuzziness - 1),
        // which is undefined for 1 and meaningless below it
        if ($fuzziness <= 1.0) {
            throw new InvalidArgumentException('Fuzziness must be greater than 1');
        }

        $this->clustersNumber = $clustersNumber;
        $this->fuzziness = $fuzziness;
        $this->epsilon = $epsilon;
        $this->maxIterations = $maxIterations;
        $this->distanceMetric = new Euclidean();
    }

    /**
     * Returns the membership matrix of the last run, indexed as [cluster][sample].
     */
    public function getMembershipMatrix(): array
    {
        return $this->membership;
    }

    /**
     * Clusters the samples and returns the points grouped per cluster.
     *
     * NOTE(review): each sample is handed to Euclidean::distance() and to
     * new Point(); both look like they expect plain coordinate arrays, so
     * confirm whether Point instances are really accepted here.
     *
     * @param Point[]|int[][] $samples
     *
     * @throws InvalidArgumentException when no samples are given
     */
    public function cluster(array $samples): array
    {
        if (count($samples) === 0) {
            throw new InvalidArgumentException('Samples array must not be empty');
        }

        // Re-index so that samples can be addressed by position whatever keys the caller used
        $this->samples = array_values($samples);
        $this->sampleCount = count($this->samples);
        $this->space = new Space(count($this->samples[0]));

        // Drop clusters (and their attached points) left over from a previous run
        $this->clusters = [];
        $this->initClusters();

        // Our goal is minimizing the objective value while
        // executing the clustering steps at most maxIterations times
        $lastObjective = 0.0;
        $iterations = 0;
        do {
            // Update the membership matrix and cluster centers, respectively
            $this->updateMembershipMatrix();
            $this->updateClusters();

            // Calculate the new value of the objective function
            $objectiveVal = $this->getObjective();
            $difference = abs($lastObjective - $objectiveVal);
            $lastObjective = $objectiveVal;
        } while ($difference > $this->epsilon && ++$iterations < $this->maxIterations);

        // Attach (hard cluster) each data point to the cluster with the highest
        // membership; on ties the cluster with the lowest index wins
        for ($k = 0; $k < $this->sampleCount; ++$k) {
            $best = 0;
            for ($i = 1; $i < $this->clustersNumber; ++$i) {
                if ($this->membership[$i][$k] > $this->membership[$best][$k]) {
                    $best = $i;
                }
            }

            $this->clusters[$best]->attach(new Point($this->samples[$k]));
        }

        // Return grouped samples
        $grouped = [];
        foreach ($this->clusters as $cluster) {
            $grouped[] = $cluster->getPoints();
        }

        return $grouped;
    }

    /**
     * Seeds the membership matrix with random values and derives the initial
     * cluster centers from it.
     */
    protected function initClusters(): void
    {
        // Membership array is a matrix of cluster number by sample count,
        // so it needs one row per cluster (not one per space dimension)
        $this->generateRandomMembership($this->clustersNumber, $this->sampleCount);
        $this->updateClusters();
    }

    /**
     * Fills the membership matrix with $rows x $cols random values, each row
     * normalised so that its entries sum to 1.
     */
    protected function generateRandomMembership(int $rows, int $cols): void
    {
        $this->membership = [];
        for ($i = 0; $i < $rows; ++$i) {
            $row = [];
            $total = 0.0;
            for ($k = 0; $k < $cols; ++$k) {
                $val = random_int(1, 5) / 10.0;
                $row[] = $val;
                $total += $val;
            }

            $this->membership[] = array_map(static function ($val) use ($total) {
                return $val / $total;
            }, $row);
        }
    }

    /**
     * Moves every cluster center to the membership-weighted mean of the samples.
     * The cluster objects are created on first use.
     */
    protected function updateClusters(): void
    {
        $dim = $this->space->getDimension();
        if (count($this->clusters) === 0) {
            for ($i = 0; $i < $this->clustersNumber; ++$i) {
                $this->clusters[] = new Cluster($this->space, array_fill(0, $dim, 0.0));
            }
        }

        for ($i = 0; $i < $this->clustersNumber; ++$i) {
            // The denominator does not depend on the coordinate, so compute it once
            $weightTotal = $this->getMembershipRowTotal($i, 0, false);
            if ($weightTotal === 0.0) {
                // No sample belongs to this cluster at all: keep its current
                // center rather than dividing by zero
                continue;
            }

            $cluster = $this->clusters[$i];
            $center = $cluster->getCoordinates();
            for ($k = 0; $k < $dim; ++$k) {
                $center[$k] = $this->getMembershipRowTotal($i, $k, true) / $weightTotal;
            }

            $cluster->setCoordinates($center);
        }
    }

    /**
     * Sums membership[$row][k] ** fuzziness over all samples k.
     *
     * @param int  $row      Cluster index
     * @param int  $col      Coordinate index; only read when $multiply is true
     * @param bool $multiply Whether each term is multiplied by the sample's $col coordinate
     */
    protected function getMembershipRowTotal(int $row, int $col, bool $multiply): float
    {
        $sum = 0.0;
        for ($k = 0; $k < $this->sampleCount; ++$k) {
            $val = $this->membership[$row][$k] ** $this->fuzziness;
            if ($multiply) {
                $val *= $this->samples[$k][$col];
            }

            $sum += $val;
        }

        return $sum;
    }

    /**
     * Recomputes every membership degree from the current cluster centers.
     */
    protected function updateMembershipMatrix(): void
    {
        for ($i = 0; $i < $this->clustersNumber; ++$i) {
            for ($k = 0; $k < $this->sampleCount; ++$k) {
                $distCalc = $this->getDistanceCalc($i, $k);
                // getDistanceCalc() never returns 0; INF yields a membership of 0.0
                $this->membership[$i][$k] = 1.0 / $distCalc;
            }
        }
    }

    /**
     * Returns the reciprocal of the membership of sample $col in cluster $row:
     * the sum over all clusters j of (d(row, col) / d(j, col)) ** (2 / (fuzziness - 1)).
     *
     * When the sample lies exactly on one or more cluster centers the ratios
     * are undefined. In that case the membership is shared equally between
     * those clusters (the count of such clusters is returned for them) and is
     * zero for all other clusters (INF is returned for them).
     *
     * @param int $row Cluster index
     * @param int $col Sample index
     */
    protected function getDistanceCalc(int $row, int $col): float
    {
        $dist1 = $this->distanceToCluster($row, $col);
        // Parenthesised on purpose: "**" binds tighter than "/"
        $exponent = 2.0 / ($this->fuzziness - 1);

        $sum = 0.0;
        $coincident = 0;
        for ($j = 0; $j < $this->clustersNumber; ++$j) {
            $dist2 = $this->distanceToCluster($j, $col);
            if ($dist2 === 0.0) {
                ++$coincident;

                continue;
            }

            $sum += ($dist1 / $dist2) ** $exponent;
        }

        if ($coincident > 0) {
            return $dist1 === 0.0 ? (float) $coincident : INF;
        }

        return $sum;
    }

    /**
     * This method returns the summation of the distances between all data
     * points and all cluster centers; cluster() stops iterating once this
     * value changes by no more than epsilon between two passes.
     *
     * NOTE(review): the textbook fuzzy c-means objective weights squared
     * distances by membership ** fuzziness; this implementation uses plain
     * distances - confirm that this is intended.
     */
    protected function getObjective(): float
    {
        $sum = 0.0;
        for ($i = 0; $i < $this->clustersNumber; ++$i) {
            for ($k = 0; $k < $this->sampleCount; ++$k) {
                $sum += $this->distanceToCluster($i, $k);
            }
        }

        return $sum;
    }

    /**
     * Distance between the center of cluster $cluster and sample $sample.
     */
    private function distanceToCluster(int $cluster, int $sample): float
    {
        return $this->distanceMetric->distance(
            $this->clusters[$cluster]->getCoordinates(),
            $this->samples[$sample]
        );
    }
}
||
240 |
If a method or function can return multiple different values, then unless you are sure that you can only receive a single value in this context, we recommend adding an additional type check:
If this is a common case that PHP Analyzer should handle natively, please let us know by opening an issue.