Completed: Push — master ( 6ed467...f3f897 ) by Andrei, created 01:29

xmeans.get_centers()   rating: A

Complexity:   Conditions 1
Size:         Total Lines 12
Duplication:  Lines 0, Ratio 0 %
Importance:   Changes 0

Metric   Value
cc       1
dl       0
loc      12
rs       9.4285
c        0
b        0
f        0
"""!

@brief Cluster analysis algorithm: X-Means
@details Based on article description:
         - D.Pelleg, A.Moore. X-means: Extending K-means with Efficient Estimation of the Number of Clusters. 2000.

@authors Andrei Novikov ([email protected])
@date 2014-2016
@copyright GNU Public License

@cond GNU_PUBLIC_LICENSE
    PyClustering is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    
    PyClustering is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
@endcond

"""


import numpy;
import math;
import random;

from enum import IntEnum;

from pyclustering.cluster.encoder import type_encoding;

import pyclustering.core.wrapper as wrapper;

from pyclustering.utils import euclidean_distance, euclidean_distance_sqrt;
from pyclustering.utils import list_math_addition_number, list_math_addition, list_math_multiplication, list_math_division_number, list_math_subtraction;


class splitting_type(IntEnum):
    """!
    @brief Enumeration of splitting types that can be used for cluster splitting in the X-Means algorithm.
    
    """
    
    ## Bayesian information criterion to approximate the correct number of clusters.
    BAYESIAN_INFORMATION_CRITERION = 0;
    
    ## Minimum noiseless description length to approximate the correct number of clusters.
    MINIMUM_NOISELESS_DESCRIPTION_LENGTH = 1;

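
# The chosen criterion is passed to the xmeans constructor and interpreted by
# __improve_structure(): with BAYESIAN_INFORMATION_CRITERION a higher score is
# better (a cluster is split when its two children score higher than the parent),
# with MINIMUM_NOISELESS_DESCRIPTION_LENGTH a lower score is better. A purely
# illustrative selection:
#
#   xmeans_instance = xmeans(sample, criterion = splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH);
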
class xmeans:
    """!
    @brief Class represents the X-Means clustering algorithm.
    
    Example:
    @code
        # sample for cluster analysis (represented by list)
        sample = read_sample(path_to_sample);
        
        # create object of X-Means algorithm that uses CCORE for processing
        # initial centers - optional parameter, if it is None, then a random center will be used by the algorithm
        initial_centers = [ [0.0, 0.5] ];
        xmeans_instance = xmeans(sample, initial_centers, ccore = True);
        
        # run cluster analysis
        xmeans_instance.process();
        
        # obtain results of clustering
        clusters = xmeans_instance.get_clusters();
        
        # display allocated clusters
        draw_clusters(sample, clusters);
    @endcode
    
    """
    
    def __init__(self, data, initial_centers = None, kmax = 20, tolerance = 0.025, criterion = splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore = False):
        """!
        @brief Constructor of the X-Means clustering algorithm.
        
        @param[in] data (list): Input data that is presented as a list of points (objects), each point should be represented by a list or tuple.
        @param[in] initial_centers (list): Initial coordinates of cluster centers that are represented by a list: [center1, center2, ...],
                    if it is not specified then X-Means starts from a single random center.
        @param[in] kmax (uint): Maximum number of clusters that can be allocated.
        @param[in] tolerance (double): Stop condition for each iteration: if the maximum change of cluster centers is less than the tolerance, then the algorithm stops processing.
        @param[in] criterion (splitting_type): Criterion that is used for cluster splitting.
        @param[in] ccore (bool): Defines whether CCORE (the C++ pyclustering library) should be used instead of the Python implementation.
        
        """
           
        self.__pointer_data = data;
        self.__clusters = [];
        
        if (initial_centers is not None):
            self.__centers = initial_centers[:];
        else:
            # fall back to a single random center whose dimension matches the data
            self.__centers = [ [random.random() for _ in range(len(data[0])) ] ];
        
        self.__kmax = kmax;
        self.__tolerance = tolerance;
        self.__criterion = criterion;
         
        self.__ccore = ccore;
         
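    # A constructor sketch with every optional parameter spelled out (the values
    # are only an illustration, not recommended settings):
    #
    #   xmeans_instance = xmeans(sample, [ [0.0, 0.5], [5.0, 5.0] ], kmax = 10,
    #                            tolerance = 0.025,
    #                            criterion = splitting_type.BAYESIAN_INFORMATION_CRITERION,
    #                            ccore = False);
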
    def process(self):
        """!
        @brief Performs cluster analysis in line with the rules of the X-Means algorithm.
        
        @remark Results of clustering can be obtained using the corresponding getter methods.
        
        @see get_clusters()
        @see get_centers()
        
        """
        
        if (self.__ccore is True):
            self.__clusters = wrapper.xmeans(self.__pointer_data, self.__centers, self.__kmax, self.__tolerance);
            self.__clusters = [ cluster for cluster in self.__clusters if len(cluster) > 0 ];
            
            self.__centers = self.__update_centers(self.__clusters);
        else:
            self.__clusters = [];
            while (len(self.__centers) < self.__kmax):
                current_cluster_number = len(self.__centers);
                 
                (self.__clusters, self.__centers) = self.__improve_parameters(self.__centers);
                allocated_centers = self.__improve_structure(self.__clusters, self.__centers);
                
                if (current_cluster_number == len(allocated_centers)):
                    break;
                else:
                    self.__centers = allocated_centers;
                    
     
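    # After process() the results are index based; for a six-point sample the
    # output could look like this (the values are purely illustrative):
    #
    #   xmeans_instance.get_clusters();   # [ [0, 1, 2], [3, 4, 5] ]
    #   xmeans_instance.get_centers();    # [ [0.1, 0.4], [5.2, 5.1] ]
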
    def get_clusters(self):
        """!
        @brief Returns list of allocated clusters, each cluster contains indexes of objects from the input data.
        
        @return (list) List of allocated clusters.
        
        @see process()
        @see get_centers()
        
        """
         
        return self.__clusters;
     
     
    def get_centers(self):
        """!
        @brief Returns list of centers of the allocated clusters.
        
        @return (list) List of centers of the allocated clusters.
        
        @see process()
        @see get_clusters()
        
        """
         
        return self.__centers;


    def get_cluster_encoding(self):
        """!
        @brief Returns clustering result representation type that indicates how clusters are encoded.
        
        @return (type_encoding) Clustering result representation.
        
        @see get_clusters()
        
        """
        
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;


    def __improve_parameters(self, centers, available_indexes = None):
        """!
        @brief Performs k-means clustering in the specified region.
        
        @param[in] centers (list): Centers of clusters.
        @param[in] available_indexes (list): Indexes that define which points can be used for k-means clustering, if None - then all points are used.
        
        @return (tuple) Tuple (clusters, centers) where each cluster contains indexes of objects from the input data.
        
        """
        
        changes = numpy.Inf;
        
        stop_condition = self.__tolerance * self.__tolerance;   # squared distances are compared, so the tolerance is squared as well
          
        clusters = [];
          
        while (changes > stop_condition):
            clusters = self.__update_clusters(centers, available_indexes);
            clusters = [ cluster for cluster in clusters if len(cluster) > 0 ];
            
            updated_centers = self.__update_centers(clusters);
          
            # maximum squared displacement of the centers on this iteration
            changes = max([euclidean_distance_sqrt(centers[index], updated_centers[index]) for index in range(len(updated_centers))]);
              
            centers = updated_centers;
          
        return (clusters, centers);
     
     
    def __improve_structure(self, clusters, centers):
        """!
        @brief Checks for the best structure: divides each cluster into two and keeps the split if the splitting criterion prefers it.
        
        @param[in] clusters (list): Clusters that have been allocated (each cluster contains indexes of points from data).
        @param[in] centers (list): Centers of clusters.
        
        @return (list) Allocated centers for clustering.
        
        """
         
        difference = 0.001;
          
        allocated_centers = [];
          
        for index_cluster in range(len(clusters)):
            # split cluster into two child clusters whose candidate centers are the parent center shifted by +/- difference
            parent_child_centers = [];
            parent_child_centers.append(list_math_addition_number(centers[index_cluster], -difference));
            parent_child_centers.append(list_math_addition_number(centers[index_cluster], difference));
          
            # solve k-means problem for the children where only the data of the parent cluster are used
            (parent_child_clusters, parent_child_centers) = self.__improve_parameters(parent_child_centers, clusters[index_cluster]);
              
            # if it is possible to split the current data
            if (len(parent_child_clusters) > 1):
                # calculate splitting criterion for the parent and for the pair of children
                parent_scores = self.__splitting_criterion([ clusters[index_cluster] ], [ centers[index_cluster] ]);
                child_scores = self.__splitting_criterion([ parent_child_clusters[0], parent_child_clusters[1] ], parent_child_centers);
              
                split_require = False;
                
                # reallocate number of centers (clusters) in line with the scores
                if (self.__criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION):
                    # BIC: a higher score is better
                    if (parent_scores < child_scores): split_require = True;
                    
                elif (self.__criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH):
                    # MNDL: a lower score is better
                    if (parent_scores > child_scores): split_require = True;
                    
                if (split_require is True):
                    allocated_centers.append(parent_child_centers[0]);
                    allocated_centers.append(parent_child_centers[1]);
                else:
                    allocated_centers.append(centers[index_cluster]);
                    
            else:
                allocated_centers.append(centers[index_cluster]);
          
        return allocated_centers;
     
     
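    # Worked example of the split decision (the numbers are invented, for
    # illustration only): with BAYESIAN_INFORMATION_CRITERION, parent_scores = -120.0
    # and child_scores = -110.0 satisfy parent_scores < child_scores, so the cluster
    # is split; with MINIMUM_NOISELESS_DESCRIPTION_LENGTH the comparison is reversed
    # and the same numbers would keep the parent cluster.
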
    def __splitting_criterion(self, clusters, centers):
        """!
        @brief Calculates splitting criterion for input clusters.
        
        @param[in] clusters (list): Clusters for which splitting criterion should be calculated.
        @param[in] centers (list): Centers of the clusters.
        
        @return (double) Splitting criterion; its interpretation depends on the chosen criterion: a higher value is better for the Bayesian information criterion, a lower value is better for the minimum noiseless description length.
        
        @see __bayesian_information_criterion(clusters, centers)
        @see __minimum_noiseless_description_length(clusters, centers)
        
        """
        
        if (self.__criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION):
            return self.__bayesian_information_criterion(clusters, centers);
        
        elif (self.__criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH):
            return self.__minimum_noiseless_description_length(clusters, centers);
        
        else:
            assert 0;
 
    
    def __minimum_noiseless_description_length(self, clusters, centers):
        """!
        @brief Calculates splitting criterion for input clusters using the minimum noiseless description length criterion.
        
        @param[in] clusters (list): Clusters for which splitting criterion should be calculated.
        @param[in] centers (list): Centers of the clusters.
        
        @return (double) Splitting criterion in line with the minimum noiseless description length criterion.
                A low value of the splitting criterion means that the current structure is better.
        
        @see __bayesian_information_criterion(clusters, centers)
        
        """
                
        scores = [0.0] * len(clusters);
        
        W = 0.0;
        K = len(clusters);
        N = 0.0;

        sigma_sqrt = 0.0;
        
        alpha = 0.9;
        betta = 0.9;
                
        for index_cluster in range(0, len(clusters), 1):
            for index_object in clusters[index_cluster]:
                delta_vector = list_math_subtraction(self.__pointer_data[index_object], centers[index_cluster]);
                delta_sqrt = sum(list_math_multiplication(delta_vector, delta_vector));
                
                W += delta_sqrt;
                sigma_sqrt += delta_sqrt;
            
            N += len(clusters[index_cluster]);
        
        if (N - K != 0):
            W /= N;
            
            sigma_sqrt /= (N - K);
            sigma = sigma_sqrt ** 0.5;
            
            for index_cluster in range(0, len(clusters), 1):
                Kw = (1.0 - K / N) * sigma_sqrt;
                Ks = ( 2.0 * alpha * sigma / (N ** 0.5) ) + ( (alpha ** 2.0) * sigma_sqrt / N + W - Kw / 2.0 ) ** 0.5;
                U = W - Kw + 2.0 * (alpha ** 2.0) * sigma_sqrt / N + Ks;
                
                Z = K * sigma_sqrt / N + U + betta * ( (2.0 * K) ** 0.5 ) * sigma_sqrt / N;
                
                if (Z == 0.0):
                    scores[index_cluster] = float("inf");
                else:
                    scores[index_cluster] = Z;
                
        else:
            scores = [float("inf")] * len(clusters);
        
        return sum(scores);
 
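    # For reference, the per-cluster score computed above is
    #   Z = K * sigma^2 / N + W - Kw + 2 * alpha^2 * sigma^2 / N + Ks + betta * sqrt(2 * K) * sigma^2 / N
    # with W the average within-cluster scatter, sigma^2 the pooled variance
    # estimate and Kw, Ks the intermediate terms defined in the loop; a lower
    # total is treated as better by __improve_structure().
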
    def __bayesian_information_criterion(self, clusters, centers):
        """!
        @brief Calculates splitting criterion for input clusters using the Bayesian information criterion.
        
        @param[in] clusters (list): Clusters for which splitting criterion should be calculated.
        @param[in] centers (list): Centers of the clusters.
        
        @return (double) Splitting criterion in line with the Bayesian information criterion.
                A high value of the splitting criterion means that the current structure is better.
                
        @see __minimum_noiseless_description_length(clusters, centers)
        
        """

        scores = [0.0] * len(clusters);     # splitting criterion
        dimension = len(self.__pointer_data[0]);
          
        # estimation of the noise variance in the data set
        sigma = 0.0;
        K = len(clusters);
        N = 0.0;
          
        for index_cluster in range(0, len(clusters), 1):
            for index_object in clusters[index_cluster]:
                sigma += euclidean_distance(self.__pointer_data[index_object], centers[index_cluster]);

            N += len(clusters[index_cluster]);
      
        if (N - K != 0):
            sigma /= (N - K);
        
            # splitting criterion
            for index_cluster in range(0, len(clusters), 1):
                n = len(clusters[index_cluster]);
                
                if (sigma > 0.0):
                    scores[index_cluster] = n * math.log(n) - n * math.log(N) - n * math.log(2.0 * numpy.pi) / 2.0 - n * dimension * math.log(sigma) / 2.0 - (n - K) / 2.0;
                  
        return sum(scores);
 
 
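    # For reference, the per-cluster score computed above is
    #   n * ln(n) - n * ln(N) - n * ln(2 * pi) / 2 - n * d * ln(sigma) / 2 - (n - K) / 2
    # where n is the cluster size, N the total number of points, d the dimension,
    # K the number of clusters and sigma the pooled distance-based variance
    # estimate; a higher total is treated as better by __improve_structure().
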
    def __update_clusters(self, centers, available_indexes = None):
        """!
        @brief Calculates the Euclidean distance from each point to each cluster center.
               Each point is assigned to its nearest center and, as a result, the clusters are updated.
               
        @param[in] centers (list): Coordinates of centers of clusters that are represented by list: [center1, center2, ...].
        @param[in] available_indexes (list): Indexes that define which points of the input data can be used, if None - then all points are used.
        
        @return (list) Updated clusters.
        
        """
            
        bypass = None;
        if (available_indexes is None):
            bypass = range(len(self.__pointer_data));
        else:
            bypass = available_indexes;
          
        clusters = [[] for i in range(len(centers))];
        for index_point in bypass:
            index_optim = -1;
            dist_optim = 0.0;
              
            for index in range(len(centers)):
                # squared Euclidean distance is enough for the comparison and avoids the square root
                dist = euclidean_distance_sqrt(self.__pointer_data[index_point], centers[index]);
                  
                if ((dist < dist_optim) or (index == 0)):
                    index_optim = index;
                    dist_optim = dist;
              
            clusters[index_optim].append(index_point);
              
        return clusters;
             
     
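    # A vectorized sketch of the same nearest-center assignment, assuming the
    # points and the centers are first converted to numpy arrays (the method
    # above is list based):
    #
    #   points = numpy.array([ self.__pointer_data[i] for i in bypass ]);
    #   dists = ((points[:, None, :] - numpy.array(centers)[None, :, :]) ** 2).sum(axis = 2);
    #   nearest = numpy.argmin(dists, axis = 1);
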
    def __update_centers(self, clusters):
        """!
        @brief Updates centers of clusters in line with contained objects.
        
        @param[in] clusters (list): Clusters that contain indexes of objects from data.
        
        @return (list) Updated centers.
        
        """
         
        centers = [[] for i in range(len(clusters))];
        dimension = len(self.__pointer_data[0]);
          
        for index in range(len(clusters)):
            point_sum = [0.0] * dimension;
              
            for index_point in clusters[index]:
                point_sum = list_math_addition(point_sum, self.__pointer_data[index_point]);
            
            # the new center is the mean of the points that belong to the cluster
            centers[index] = list_math_division_number(point_sum, len(clusters[index]));
              
        return centers;
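

# A minimal, self-contained usage sketch with synthetic 2-D data; the sample,
# the variable names and the parameter values are illustrative assumptions only:
if __name__ == "__main__":
    # two well-separated blobs around (0, 0) and (5, 5)
    demo_sample = [ [random.gauss(0.0, 0.3), random.gauss(0.0, 0.3)] for _ in range(30) ];
    demo_sample += [ [random.gauss(5.0, 0.3), random.gauss(5.0, 0.3)] for _ in range(30) ];

    demo_instance = xmeans(demo_sample, [ [0.0, 0.0] ], kmax = 10);
    demo_instance.process();

    print(demo_instance.get_clusters());   # indexes of the points in each allocated cluster
    print(demo_instance.get_centers());    # coordinates of the allocated centers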