Passed
Push — master ( e83f7b...d953ef )
by Arkadiusz
03:28
created

src/Phpml/Clustering/FuzzyCMeans.php (1 issue)

Upgrade to new PHP Analysis Engine

These results are based on our legacy PHP analysis, consider migrating to our new PHP analysis engine instead. Learn more

1
<?php
2
3
declare(strict_types=1);
4
5
namespace Phpml\Clustering;
6
7
use Phpml\Clustering\KMeans\Cluster;
8
use Phpml\Clustering\KMeans\Point;
9
use Phpml\Clustering\KMeans\Space;
10
use Phpml\Exception\InvalidArgumentException;
11
use Phpml\Math\Distance\Euclidean;
12
13
class FuzzyCMeans implements Clusterer
14
{
15
    /**
16
     * @var int
17
     */
18
    private $clustersNumber;
19
20
    /**
21
     * @var array|Cluster[]
22
     */
23
    private $clusters = null;
24
25
    /**
26
     * @var Space
27
     */
28
    private $space;
29
30
    /**
31
     * @var array|float[][]
32
     */
33
    private $membership = [];
34
35
    /**
36
     * @var float
37
     */
38
    private $fuzziness;
39
40
    /**
41
     * @var float
42
     */
43
    private $epsilon;
44
45
    /**
46
     * @var int
47
     */
48
    private $maxIterations;
49
50
    /**
51
     * @var int
52
     */
53
    private $sampleCount;
54
55
    /**
56
     * @var array
57
     */
58
    private $samples = [];
59
60
    /**
61
     * @throws InvalidArgumentException
62
     */
63
    public function __construct(int $clustersNumber, float $fuzziness = 2.0, float $epsilon = 1e-2, int $maxIterations = 100)
64
    {
65
        if ($clustersNumber <= 0) {
66
            throw InvalidArgumentException::invalidClustersNumber();
67
        }
68
69
        $this->clustersNumber = $clustersNumber;
70
        $this->fuzziness = $fuzziness;
71
        $this->epsilon = $epsilon;
72
        $this->maxIterations = $maxIterations;
73
    }
74
75
    public function getMembershipMatrix(): array
76
    {
77
        return $this->membership;
78
    }
79
80
    /**
81
     * @param Point[]|int[][] $samples
82
     */
83
    public function cluster(array $samples): array
84
    {
85
        // Initialize variables, clusters and membership matrix
86
        $this->sampleCount = count($samples);
87
        $this->samples = &$samples;
88
        $this->space = new Space(count($samples[0]));
89
        $this->initClusters();
90
91
        // Our goal is minimizing the objective value while
92
        // executing the clustering steps at a maximum number of iterations
93
        $lastObjective = 0.0;
94
        $iterations = 0;
95
        do {
96
            // Update the membership matrix and cluster centers, respectively
97
            $this->updateMembershipMatrix();
98
            $this->updateClusters();
99
100
            // Calculate the new value of the objective function
101
            $objectiveVal = $this->getObjective();
102
            $difference = abs($lastObjective - $objectiveVal);
103
            $lastObjective = $objectiveVal;
104
        } while ($difference > $this->epsilon && $iterations++ <= $this->maxIterations);
105
106
        // Attach (hard cluster) each data point to the nearest cluster
107
        for ($k = 0; $k < $this->sampleCount; ++$k) {
108
            $column = array_column($this->membership, $k);
109
            arsort($column);
110
            reset($column);
111
            $i = key($column);
112
            $cluster = $this->clusters[$i];
113
            $cluster->attach(new Point($this->samples[$k]));
0 ignored issues
show
It seems like $this->samples[$k] can also be of type object<Phpml\Clustering\KMeans\Point>; however, Phpml\Clustering\KMeans\Point::__construct() does only seem to accept array, maybe add an additional type check?

If a method or function can return multiple different values and unless you are sure that you only can receive a single value in this context, we recommend to add an additional type check:

/**
 * @return array|string
 */
function returnsDifferentValues($x) {
    if ($x) {
        return 'foo';
    }

    return array();
}

$x = returnsDifferentValues($y);
if (is_array($x)) {
    // $x is an array.
}

If this a common case that PHP Analyzer should handle natively, please let us know by opening an issue.

Loading history...
114
        }
115
116
        // Return grouped samples
117
        $grouped = [];
118
        foreach ($this->clusters as $cluster) {
119
            $grouped[] = $cluster->getPoints();
120
        }
121
122
        return $grouped;
123
    }
124
125
    protected function initClusters(): void
126
    {
127
        // Membership array is a matrix of cluster number by sample counts
128
        // We initilize the membership array with random values
129
        $dim = $this->space->getDimension();
130
        $this->generateRandomMembership($dim, $this->sampleCount);
131
        $this->updateClusters();
132
    }
133
134
    protected function generateRandomMembership(int $rows, int $cols): void
135
    {
136
        $this->membership = [];
137
        for ($i = 0; $i < $rows; ++$i) {
138
            $row = [];
139
            $total = 0.0;
140
            for ($k = 0; $k < $cols; ++$k) {
141
                $val = random_int(1, 5) / 10.0;
142
                $row[] = $val;
143
                $total += $val;
144
            }
145
146
            $this->membership[] = array_map(function ($val) use ($total) {
147
                return $val / $total;
148
            }, $row);
149
        }
150
    }
151
152
    protected function updateClusters(): void
153
    {
154
        $dim = $this->space->getDimension();
155
        if (!$this->clusters) {
156
            $this->clusters = [];
157
            for ($i = 0; $i < $this->clustersNumber; ++$i) {
158
                $this->clusters[] = new Cluster($this->space, array_fill(0, $dim, 0.0));
159
            }
160
        }
161
162
        for ($i = 0; $i < $this->clustersNumber; ++$i) {
163
            $cluster = $this->clusters[$i];
164
            $center = $cluster->getCoordinates();
165
            for ($k = 0; $k < $dim; ++$k) {
166
                $a = $this->getMembershipRowTotal($i, $k, true);
167
                $b = $this->getMembershipRowTotal($i, $k, false);
168
                $center[$k] = $a / $b;
169
            }
170
171
            $cluster->setCoordinates($center);
172
        }
173
    }
174
175
    protected function getMembershipRowTotal(int $row, int $col, bool $multiply)
176
    {
177
        $sum = 0.0;
178
        for ($k = 0; $k < $this->sampleCount; ++$k) {
179
            $val = pow($this->membership[$row][$k], $this->fuzziness);
180
            if ($multiply) {
181
                $val *= $this->samples[$k][$col];
182
            }
183
184
            $sum += $val;
185
        }
186
187
        return $sum;
188
    }
189
190
    protected function updateMembershipMatrix(): void
191
    {
192
        for ($i = 0; $i < $this->clustersNumber; ++$i) {
193
            for ($k = 0; $k < $this->sampleCount; ++$k) {
194
                $distCalc = $this->getDistanceCalc($i, $k);
195
                $this->membership[$i][$k] = 1.0 / $distCalc;
196
            }
197
        }
198
    }
199
200
    protected function getDistanceCalc(int $row, int $col): float
201
    {
202
        $sum = 0.0;
203
        $distance = new Euclidean();
204
        $dist1 = $distance->distance(
205
            $this->clusters[$row]->getCoordinates(),
206
            $this->samples[$col]
207
        );
208
209
        for ($j = 0; $j < $this->clustersNumber; ++$j) {
210
            $dist2 = $distance->distance(
211
                $this->clusters[$j]->getCoordinates(),
212
                $this->samples[$col]
213
            );
214
215
            $val = pow($dist1 / $dist2, 2.0 / ($this->fuzziness - 1));
216
            $sum += $val;
217
        }
218
219
        return $sum;
220
    }
221
222
    /**
223
     * The objective is to minimize the distance between all data points
224
     * and all cluster centers. This method returns the summation of all
225
     * these distances
226
     */
227
    protected function getObjective()
228
    {
229
        $sum = 0.0;
230
        $distance = new Euclidean();
231
        for ($i = 0; $i < $this->clustersNumber; ++$i) {
232
            $clust = $this->clusters[$i]->getCoordinates();
233
            for ($k = 0; $k < $this->sampleCount; ++$k) {
234
                $point = $this->samples[$k];
235
                $sum += $distance->distance($clust, $point);
236
            }
237
        }
238
239
        return $sum;
240
    }
241
}
242