These results are based on our legacy PHP analysis, consider migrating to our new PHP analysis engine instead. Learn more
1 | <?php |
||
2 | |||
3 | declare(strict_types=1); |
||
4 | |||
5 | namespace Phpml\Clustering; |
||
6 | |||
7 | use Phpml\Clustering\KMeans\Cluster; |
||
8 | use Phpml\Clustering\KMeans\Point; |
||
9 | use Phpml\Clustering\KMeans\Space; |
||
10 | use Phpml\Exception\InvalidArgumentException; |
||
11 | use Phpml\Math\Distance\Euclidean; |
||
12 | |||
13 | class FuzzyCMeans implements Clusterer |
||
14 | { |
||
15 | /** |
||
16 | * @var int |
||
17 | */ |
||
18 | private $clustersNumber; |
||
19 | |||
20 | /** |
||
21 | * @var array|Cluster[] |
||
22 | */ |
||
23 | private $clusters = null; |
||
24 | |||
25 | /** |
||
26 | * @var Space |
||
27 | */ |
||
28 | private $space; |
||
29 | |||
30 | /** |
||
31 | * @var array|float[][] |
||
32 | */ |
||
33 | private $membership = []; |
||
34 | |||
35 | /** |
||
36 | * @var float |
||
37 | */ |
||
38 | private $fuzziness; |
||
39 | |||
40 | /** |
||
41 | * @var float |
||
42 | */ |
||
43 | private $epsilon; |
||
44 | |||
45 | /** |
||
46 | * @var int |
||
47 | */ |
||
48 | private $maxIterations; |
||
49 | |||
50 | /** |
||
51 | * @var int |
||
52 | */ |
||
53 | private $sampleCount; |
||
54 | |||
55 | /** |
||
56 | * @var array |
||
57 | */ |
||
58 | private $samples = []; |
||
59 | |||
60 | /** |
||
61 | * @throws InvalidArgumentException |
||
62 | */ |
||
63 | public function __construct(int $clustersNumber, float $fuzziness = 2.0, float $epsilon = 1e-2, int $maxIterations = 100) |
||
64 | { |
||
65 | if ($clustersNumber <= 0) { |
||
66 | throw InvalidArgumentException::invalidClustersNumber(); |
||
67 | } |
||
68 | |||
69 | $this->clustersNumber = $clustersNumber; |
||
70 | $this->fuzziness = $fuzziness; |
||
71 | $this->epsilon = $epsilon; |
||
72 | $this->maxIterations = $maxIterations; |
||
73 | } |
||
74 | |||
75 | public function getMembershipMatrix(): array |
||
76 | { |
||
77 | return $this->membership; |
||
78 | } |
||
79 | |||
80 | /** |
||
81 | * @param Point[]|int[][] $samples |
||
82 | */ |
||
83 | public function cluster(array $samples): array |
||
84 | { |
||
85 | // Initialize variables, clusters and membership matrix |
||
86 | $this->sampleCount = count($samples); |
||
87 | $this->samples = &$samples; |
||
88 | $this->space = new Space(count($samples[0])); |
||
89 | $this->initClusters(); |
||
90 | |||
91 | // Our goal is minimizing the objective value while |
||
92 | // executing the clustering steps at a maximum number of iterations |
||
93 | $lastObjective = 0.0; |
||
94 | $iterations = 0; |
||
95 | do { |
||
96 | // Update the membership matrix and cluster centers, respectively |
||
97 | $this->updateMembershipMatrix(); |
||
98 | $this->updateClusters(); |
||
99 | |||
100 | // Calculate the new value of the objective function |
||
101 | $objectiveVal = $this->getObjective(); |
||
102 | $difference = abs($lastObjective - $objectiveVal); |
||
103 | $lastObjective = $objectiveVal; |
||
104 | } while ($difference > $this->epsilon && $iterations++ <= $this->maxIterations); |
||
105 | |||
106 | // Attach (hard cluster) each data point to the nearest cluster |
||
107 | for ($k = 0; $k < $this->sampleCount; ++$k) { |
||
108 | $column = array_column($this->membership, $k); |
||
109 | arsort($column); |
||
110 | reset($column); |
||
111 | $i = key($column); |
||
112 | $cluster = $this->clusters[$i]; |
||
113 | $cluster->attach(new Point($this->samples[$k])); |
||
0 ignored issues
–
show
|
|||
114 | } |
||
115 | |||
116 | // Return grouped samples |
||
117 | $grouped = []; |
||
118 | foreach ($this->clusters as $cluster) { |
||
119 | $grouped[] = $cluster->getPoints(); |
||
120 | } |
||
121 | |||
122 | return $grouped; |
||
123 | } |
||
124 | |||
125 | protected function initClusters(): void |
||
126 | { |
||
127 | // Membership array is a matrix of cluster number by sample counts |
||
128 | // We initilize the membership array with random values |
||
129 | $dim = $this->space->getDimension(); |
||
130 | $this->generateRandomMembership($dim, $this->sampleCount); |
||
131 | $this->updateClusters(); |
||
132 | } |
||
133 | |||
134 | protected function generateRandomMembership(int $rows, int $cols): void |
||
135 | { |
||
136 | $this->membership = []; |
||
137 | for ($i = 0; $i < $rows; ++$i) { |
||
138 | $row = []; |
||
139 | $total = 0.0; |
||
140 | for ($k = 0; $k < $cols; ++$k) { |
||
141 | $val = random_int(1, 5) / 10.0; |
||
142 | $row[] = $val; |
||
143 | $total += $val; |
||
144 | } |
||
145 | |||
146 | $this->membership[] = array_map(function ($val) use ($total) { |
||
147 | return $val / $total; |
||
148 | }, $row); |
||
149 | } |
||
150 | } |
||
151 | |||
152 | protected function updateClusters(): void |
||
153 | { |
||
154 | $dim = $this->space->getDimension(); |
||
155 | if (!$this->clusters) { |
||
156 | $this->clusters = []; |
||
157 | for ($i = 0; $i < $this->clustersNumber; ++$i) { |
||
158 | $this->clusters[] = new Cluster($this->space, array_fill(0, $dim, 0.0)); |
||
159 | } |
||
160 | } |
||
161 | |||
162 | for ($i = 0; $i < $this->clustersNumber; ++$i) { |
||
163 | $cluster = $this->clusters[$i]; |
||
164 | $center = $cluster->getCoordinates(); |
||
165 | for ($k = 0; $k < $dim; ++$k) { |
||
166 | $a = $this->getMembershipRowTotal($i, $k, true); |
||
167 | $b = $this->getMembershipRowTotal($i, $k, false); |
||
168 | $center[$k] = $a / $b; |
||
169 | } |
||
170 | |||
171 | $cluster->setCoordinates($center); |
||
172 | } |
||
173 | } |
||
174 | |||
175 | protected function getMembershipRowTotal(int $row, int $col, bool $multiply) |
||
176 | { |
||
177 | $sum = 0.0; |
||
178 | for ($k = 0; $k < $this->sampleCount; ++$k) { |
||
179 | $val = pow($this->membership[$row][$k], $this->fuzziness); |
||
180 | if ($multiply) { |
||
181 | $val *= $this->samples[$k][$col]; |
||
182 | } |
||
183 | |||
184 | $sum += $val; |
||
185 | } |
||
186 | |||
187 | return $sum; |
||
188 | } |
||
189 | |||
190 | protected function updateMembershipMatrix(): void |
||
191 | { |
||
192 | for ($i = 0; $i < $this->clustersNumber; ++$i) { |
||
193 | for ($k = 0; $k < $this->sampleCount; ++$k) { |
||
194 | $distCalc = $this->getDistanceCalc($i, $k); |
||
195 | $this->membership[$i][$k] = 1.0 / $distCalc; |
||
196 | } |
||
197 | } |
||
198 | } |
||
199 | |||
200 | protected function getDistanceCalc(int $row, int $col): float |
||
201 | { |
||
202 | $sum = 0.0; |
||
203 | $distance = new Euclidean(); |
||
204 | $dist1 = $distance->distance( |
||
205 | $this->clusters[$row]->getCoordinates(), |
||
206 | $this->samples[$col] |
||
207 | ); |
||
208 | |||
209 | for ($j = 0; $j < $this->clustersNumber; ++$j) { |
||
210 | $dist2 = $distance->distance( |
||
211 | $this->clusters[$j]->getCoordinates(), |
||
212 | $this->samples[$col] |
||
213 | ); |
||
214 | |||
215 | $val = pow($dist1 / $dist2, 2.0 / ($this->fuzziness - 1)); |
||
216 | $sum += $val; |
||
217 | } |
||
218 | |||
219 | return $sum; |
||
220 | } |
||
221 | |||
222 | /** |
||
223 | * The objective is to minimize the distance between all data points |
||
224 | * and all cluster centers. This method returns the summation of all |
||
225 | * these distances |
||
226 | */ |
||
227 | protected function getObjective() |
||
228 | { |
||
229 | $sum = 0.0; |
||
230 | $distance = new Euclidean(); |
||
231 | for ($i = 0; $i < $this->clustersNumber; ++$i) { |
||
232 | $clust = $this->clusters[$i]->getCoordinates(); |
||
233 | for ($k = 0; $k < $this->sampleCount; ++$k) { |
||
234 | $point = $this->samples[$k]; |
||
235 | $sum += $distance->distance($clust, $point); |
||
236 | } |
||
237 | } |
||
238 | |||
239 | return $sum; |
||
240 | } |
||
241 | } |
||
242 |
If a method or function can return multiple different values and unless you are sure that you only can receive a single value in this context, we recommend to add an additional type check:
If this a common case that PHP Analyzer should handle natively, please let us know by opening an issue.