ML::select_disjoint()   A
last analyzed

Complexity

Conditions 4
Paths 5

Size

Total Lines 12
Code Lines 7

Duplication

Lines 0
Ratio 0 %

Importance

Changes 0
Metric Value
eloc 7
dl 0
loc 12
rs 10
c 0
b 0
f 0
cc 4
nc 5
nop 2
1
<?php
2
/**
3
 *
4
 * (c) Ruben Dorado <[email protected]>
5
 *
6
 * For the full copyright and license information, please view the LICENSE
7
 * file that was distributed with this source code.
8
 */
9
namespace SiteAnalyzer;
10
11
use Exception;
12
13
/**
14
 * class ML
15
 *
16
 * @package   SiteAnalyzer
17
 * @author    Ruben Dorado <[email protected]>
18
 * @copyright 2018 Ruben Dorado
19
 * @license   http://www.opensource.org/licenses/MIT The MIT License
20
 */
21
class ML
22
{
23
    
24
    /*
25
     * @param
26
     */
27
    public static function kmeans($data, $nclusters)
28
    {
29
        $resp = [];
30
        $finished = false;
31
        $niter = 0;
32
        $maxiter = 100;
33
        $npoints = count($data);
34
        if ($npoints <= 0) throw new \Exception("Not enough data. ");    
35
        $ndimensions = count($data[0]);
36
        $centroids = self::select_disjoint($data, $nclusters);
37
        
38
        while (!$finished && $niter < $maxiter) {
39
            // Assign each one of the points to one centroid   
40
            $niter++;
41
            $nresp = [];
42
            for ($j = 0; $j < $npoints; $j++) {        
43
                $best = -1;
44
                $bdist = INF;
45
                for ($i = 0; $i < $nclusters; $i++) {
46
                    $ndist = self::eclideanDistance($data[$j], $centroids[$i]);
47
                    if($bdist > $ndist) {
48
                        $bdist = $ndist;
49
                        $best = $i;
50
                    }            
51
                }
52
                $nresp[] = $best;
53
                
54
            }
55
            
56
            // Check change 
57
            $finished = true;
58
            if (count($resp) > 0) {
59
                for ($j=0; $j < $npoints; $j++) {        
60
                    if ($resp[$j]!==$nresp[$j]) {
61
                        $finished = false;
62
                        break;
63
                    }
64
                }
65
            } else {
66
                $finished = false;
67
            }
68
            $resp = $nresp;
69
            // Recalculate the centroids
70
            $centroids = self::initCentroids($nclusters, $ndimensions, function(){return 0;});
71
            $counts = array_fill(0, $nclusters, 0);
72
            for ($j = 0; $j < $npoints; $j++) {    
73
                $centroids[$resp[$j]] = Matrix::sumArray($centroids[$resp[$j]], $data[$j]);
74
                $counts[$resp[$j]]++;            
75
            }
76
            $centroids = self::normalizeCentroids($centroids, $counts);
77
        }
78
        return ["clusters"=>$resp, "centroids"=>$centroids];
79
    }
80
81
    
82
    /*
83
     * @param
84
     */
85
    private static function select_disjoint($data, $n)
86
    {
87
        $resp = [];
88
        foreach ($data as $row) {
89
            if ( !self::contains_point($resp, $row) ) {
90
                $resp[] = $row;
91
            }
92
            if (count($resp) == $n) {
93
                return $resp;
94
            }
95
        }
96
        throw new \Exception("Not enough unique points.");
97
    }
98
    
99
    /*
100
     * @param
101
     */
102
    private static function contains_point($matrix, $array) 
103
    {
104
        foreach ($matrix as $row){
105
            if (self::isEqual($row, $array)) return true;
106
        }
107
        return false;
108
    }
109
    
110
    /*
111
     * @param
112
     */
113
    private static function isEqual($array1, $array2)
114
    {
115
        $len = count($array1);
116
        if($len != count($array2) ) return false;
117
        for ($i=0; $i<$len; $i++) {
118
            if ($array1[$i] != $array2[$i]) return false;
119
        }
120
        return true;
121
    }
122
    
123
    /*
124
     * @param
125
     */
126
    private static function normalizeCentroids($centroids, $counts)
127
    {
128
        $resp = [];
129
        $n = count($centroids);
130
        $d = count($centroids[0]);
131
        for ($i=0;$i<$n;$i++) {
132
            $tmp = [];
133
            for ($j=0;$j<$d;$j++){
134
                $tmp[] = $centroids[$i][$j]/$counts[$i];
135
            }
136
            $resp[] = $tmp;
137
        }
138
        return $resp;
139
    }
140
    
141
    /*
142
     * @param
143
     */
144
    public static function initCentroids($nclusters, $ndimensions, $fvalue) 
145
    {
146
        $resp = [];
147
        for ($i = 0; $i < $nclusters; $i++) {
148
            $centroid = [];
149
            for ($d = 0; $d < $ndimensions; $d++) {
150
                $centroid[] = $fvalue();
151
            }
152
            $resp[] = $centroid;
153
        }
154
        return $resp;
155
    }
156
157
    /*
158
     * @param
159
     */
160
    public static function eclideanDistance($p1, $p2) {
161
       $len = count($p1);
162
       $acum = 0;
163
       for($i=0; $i<$len; $i++) {
164
           $acum += ($p1[$i] - $p2[$i])**2;
165
       }
166
       return sqrt($acum);
167
    }
168
    
169
}
170