Passed
Pull Request — master (#287)
by Marcin
02:51
created

FuzzyCMeans::generateRandomMembership()   A

Complexity

Conditions 3
Paths 3

Size

Total Lines 17

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
dl 0
loc 17
rs 9.7
c 0
b 0
f 0
cc 3
nc 3
nop 2
1
<?php
2
3
declare(strict_types=1);
4
5
namespace Phpml\Clustering;
6
7
use Phpml\Clustering\KMeans\Cluster;
8
use Phpml\Clustering\KMeans\Point;
9
use Phpml\Clustering\KMeans\Space;
10
use Phpml\Exception\InvalidArgumentException;
11
use Phpml\Math\Distance\Euclidean;
12
13
class FuzzyCMeans implements Clusterer
14
{
15
    /**
16
     * @var int
17
     */
18
    private $clustersNumber;
19
20
    /**
21
     * @var array|Cluster[]
22
     */
23
    private $clusters = [];
24
25
    /**
26
     * @var Space
27
     */
28
    private $space;
29
30
    /**
31
     * @var array|float[][]
32
     */
33
    private $membership = [];
34
35
    /**
36
     * @var float
37
     */
38
    private $fuzziness;
39
40
    /**
41
     * @var float
42
     */
43
    private $epsilon;
44
45
    /**
46
     * @var int
47
     */
48
    private $maxIterations;
49
50
    /**
51
     * @var int
52
     */
53
    private $sampleCount;
54
55
    /**
56
     * @var array
57
     */
58
    private $samples = [];
59
60
    /**
61
     * @throws InvalidArgumentException
62
     */
63
    public function __construct(int $clustersNumber, float $fuzziness = 2.0, float $epsilon = 1e-2, int $maxIterations = 100)
64
    {
65
        if ($clustersNumber <= 0) {
66
            throw new InvalidArgumentException('Invalid clusters number');
67
        }
68
69
        $this->clustersNumber = $clustersNumber;
70
        $this->fuzziness = $fuzziness;
71
        $this->epsilon = $epsilon;
72
        $this->maxIterations = $maxIterations;
73
    }
74
75
    public function getMembershipMatrix(): array
76
    {
77
        return $this->membership;
78
    }
79
80
    /**
81
     * @param Point[]|int[][] $samples
82
     */
83
    public function cluster(array $samples): array
84
    {
85
        // Initialize variables, clusters and membership matrix
86
        $this->sampleCount = count($samples);
87
        $this->samples = &$samples;
88
        $this->space = new Space(count($samples[0]));
89
        $this->initClusters();
90
91
        // Our goal is minimizing the objective value while
92
        // executing the clustering steps at a maximum number of iterations
93
        $lastObjective = 0.0;
94
        $iterations = 0;
95
        do {
96
            // Update the membership matrix and cluster centers, respectively
97
            $this->updateMembershipMatrix();
98
            $this->updateClusters();
99
100
            // Calculate the new value of the objective function
101
            $objectiveVal = $this->getObjective();
102
            $difference = abs($lastObjective - $objectiveVal);
103
            $lastObjective = $objectiveVal;
104
        } while ($difference > $this->epsilon && $iterations++ <= $this->maxIterations);
105
106
        // Attach (hard cluster) each data point to the nearest cluster
107
        for ($k = 0; $k < $this->sampleCount; ++$k) {
108
            $column = array_column($this->membership, $k);
109
            arsort($column);
110
            reset($column);
111
            $i = key($column);
112
            $cluster = $this->clusters[$i];
113
            $cluster->attach(new Point($this->samples[$k]));
0 ignored issues
show
Bug introduced by
It seems like $this->samples[$k] can also be of type object<Phpml\Clustering\KMeans\Point>; however, Phpml\Clustering\KMeans\Point::__construct() does only seem to accept array, maybe add an additional type check?

If a method or function can return multiple different values and unless you are sure that you only can receive a single value in this context, we recommend to add an additional type check:

/**
 * @return array|string
 */
function returnsDifferentValues($x) {
    if ($x) {
        return 'foo';
    }

    return array();
}

$x = returnsDifferentValues($y);
if (is_array($x)) {
    // $x is an array.
}

If this a common case that PHP Analyzer should handle natively, please let us know by opening an issue.

Loading history...
114
        }
115
116
        // Return grouped samples
117
        $grouped = [];
118
        foreach ($this->clusters as $cluster) {
119
            $grouped[] = $cluster->getPoints();
120
        }
121
122
        return $grouped;
123
    }
124
125
    protected function initClusters(): void
126
    {
127
        // Membership array is a matrix of cluster number by sample counts
128
        // We initilize the membership array with random values
129
        $dim = $this->space->getDimension();
130
        $this->generateRandomMembership($dim, $this->sampleCount);
131
        $this->updateClusters();
132
    }
133
134
    protected function generateRandomMembership(int $rows, int $cols): void
135
    {
136
        $this->membership = [];
137
        for ($i = 0; $i < $rows; ++$i) {
138
            $row = [];
139
            $total = 0.0;
140
            for ($k = 0; $k < $cols; ++$k) {
141
                $val = random_int(1, 5) / 10.0;
142
                $row[] = $val;
143
                $total += $val;
144
            }
145
146
            $this->membership[] = array_map(function ($val) use ($total) {
147
                return $val / $total;
148
            }, $row);
149
        }
150
    }
151
152
    protected function updateClusters(): void
153
    {
154
        $dim = $this->space->getDimension();
155
        if (empty($this->clusters)) {
156
            for ($i = 0; $i < $this->clustersNumber; ++$i) {
157
                $this->clusters[] = new Cluster($this->space, array_fill(0, $dim, 0.0));
158
            }
159
        }
160
161
        for ($i = 0; $i < $this->clustersNumber; ++$i) {
162
            $cluster = $this->clusters[$i];
163
            $center = $cluster->getCoordinates();
164
            for ($k = 0; $k < $dim; ++$k) {
165
                $a = $this->getMembershipRowTotal($i, $k, true);
166
                $b = $this->getMembershipRowTotal($i, $k, false);
167
                $center[$k] = $a / $b;
168
            }
169
170
            $cluster->setCoordinates($center);
171
        }
172
    }
173
174
    protected function getMembershipRowTotal(int $row, int $col, bool $multiply)
175
    {
176
        $sum = 0.0;
177
        for ($k = 0; $k < $this->sampleCount; ++$k) {
178
            $val = pow($this->membership[$row][$k], $this->fuzziness);
179
            if ($multiply) {
180
                $val *= $this->samples[$k][$col];
181
            }
182
183
            $sum += $val;
184
        }
185
186
        return $sum;
187
    }
188
189
    protected function updateMembershipMatrix(): void
190
    {
191
        for ($i = 0; $i < $this->clustersNumber; ++$i) {
192
            for ($k = 0; $k < $this->sampleCount; ++$k) {
193
                $distCalc = $this->getDistanceCalc($i, $k);
194
                $this->membership[$i][$k] = 1.0 / $distCalc;
195
            }
196
        }
197
    }
198
199
    protected function getDistanceCalc(int $row, int $col): float
200
    {
201
        $sum = 0.0;
202
        $distance = new Euclidean();
203
        $dist1 = $distance->distance(
204
            $this->clusters[$row]->getCoordinates(),
205
            $this->samples[$col]
206
        );
207
208
        for ($j = 0; $j < $this->clustersNumber; ++$j) {
209
            $dist2 = $distance->distance(
210
                $this->clusters[$j]->getCoordinates(),
211
                $this->samples[$col]
212
            );
213
214
            $val = pow($dist1 / $dist2, 2.0 / ($this->fuzziness - 1));
215
            $sum += $val;
216
        }
217
218
        return $sum;
219
    }
220
221
    /**
222
     * The objective is to minimize the distance between all data points
223
     * and all cluster centers. This method returns the summation of all
224
     * these distances
225
     */
226
    protected function getObjective()
227
    {
228
        $sum = 0.0;
229
        $distance = new Euclidean();
230
        for ($i = 0; $i < $this->clustersNumber; ++$i) {
231
            $clust = $this->clusters[$i]->getCoordinates();
232
            for ($k = 0; $k < $this->sampleCount; ++$k) {
233
                $point = $this->samples[$k];
234
                $sum += $distance->distance($clust, $point);
235
            }
236
        }
237
238
        return $sum;
239
    }
240
}
241