Completed
Push — master ( 2f25f6...d3c425 )
by Andrei
02:18
created

xmeans.__minimum_noiseless_description_length()   B

Complexity

Conditions 5

Size

Total Lines 50

Duplication

Lines 1
Ratio 2 %

Importance

Changes 0
Metric Value
cc 5
dl 1
loc 50
rs 8.295
c 0
b 0
f 0
1
"""!
2
3
@brief Cluster analysis algorithm: X-Means
4
@details Based on article description:
5
         - D.Pelleg, A.Moore. X-means: Extending K-means with Efficient Estimation of the Number of Clusters. 2000.
6
7
@authors Andrei Novikov ([email protected])
8
@date 2014-2017
9
@copyright GNU Public License
10
11
@cond GNU_PUBLIC_LICENSE
12
    PyClustering is free software: you can redistribute it and/or modify
13
    it under the terms of the GNU General Public License as published by
14
    the Free Software Foundation, either version 3 of the License, or
15
    (at your option) any later version.
16
    
17
    PyClustering is distributed in the hope that it will be useful,
18
    but WITHOUT ANY WARRANTY; without even the implied warranty of
19
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
    GNU General Public License for more details.
21
    
22
    You should have received a copy of the GNU General Public License
23
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
24
@endcond
25
26
"""
27
28
29
import numpy;
0 ignored issues
show
Configuration introduced by
The import numpy could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
30
import random;
31
32
from enum import IntEnum;
33
34
from math import log;
35
36
from pyclustering.cluster.encoder import type_encoding;
37
38
import pyclustering.core.xmeans_wrapper as wrapper;
39
40
from pyclustering.utils import euclidean_distance_sqrt, euclidean_distance;
41
from pyclustering.utils import list_math_addition_number, list_math_addition, list_math_division_number;
42
43
44
class splitting_type(IntEnum):
    """!
    @brief Enumeration of splitting types that can be used as splitting creation of cluster in X-Means algorithm.

    """

    ## Bayesian information criterion (BIC) to approximate the correct number of clusters.
    ## Kass's formula is used to calculate BIC:
    ## \f[BIC(\theta) = L(D) - \frac{1}{2}pln(N)\f]
    ##
    ## The number of free parameters \f$p\f$ is simply the sum of \f$K - 1\f$ class probabilities, \f$MK\f$ centroid coordinates, and one variance estimate:
    ## \f[p = (K - 1) + MK + 1\f]
    ##
    ## The log-likelihood of the data:
    ## \f[L(D) = n_jln(n_j) - n_jln(N) - \frac{n_j}{2}ln(2\pi) - \frac{n_jd}{2}ln(\hat{\sigma}^2) - \frac{n_j - K}{2}\f]
    ##
    ## The maximum likelihood estimate (MLE) for the variance:
    ## \f[\hat{\sigma}^2 = \frac{1}{N - K}\sum\limits_{j}\sum\limits_{i}||x_{ij} - \hat{C}_j||^2\f]
    BAYESIAN_INFORMATION_CRITERION = 0

    ## Minimum noiseless description length (MNDL) to approximate the correct number of clusters.
    ## Beheshti's formula is used to calculate upper bound:
    ## \f[Z = \frac{\sigma^2 \sqrt{2K} }{N}(\sqrt{2K} + \beta) + W - \sigma^2 + \frac{2\alpha\sigma}{\sqrt{N}}\sqrt{\frac{\alpha^2\sigma^2}{N} + W - \left(1 - \frac{K}{N}\right)\frac{\sigma^2}{2}} + \frac{2\alpha^2\sigma^2}{N}\f]
    ##
    ## where \f$\alpha\f$ and \f$\beta\f$ represent the parameters for validation probability and confidence probability.
    ##
    ## To improve clustering results some contradiction is introduced:
    ## \f[W = \frac{1}{n_j}\sum\limits_{i}||x_{ij} - \hat{C}_j||\f]
    ## \f[\hat{\sigma}^2 = \frac{1}{N - K}\sum\limits_{j}\sum\limits_{i}||x_{ij} - \hat{C}_j||\f]
    MINIMUM_NOISELESS_DESCRIPTION_LENGTH = 1
class xmeans:
77
    """!
78
    @brief Class represents clustering algorithm X-Means.
79
    @details X-means clustering method starts with the assumption of having a minimum number of clusters, 
80
             and then dynamically increases them. X-means uses specified splitting criterion to control 
81
             the process of splitting clusters.
82
    
83
    Example:
84
    @code
85
        # sample for cluster analysis (represented by list)
86
        sample = read_sample(path_to_sample);
87
        
88
        # create object of X-Means algorithm that uses CCORE for processing
89
        # initial centers - optional parameter, if it is None, then random center will be used by the algorithm
90
        initial_centers = [ [0.0, 0.5] ];
91
        xmeans_instance = xmeans(sample, initial_centers, ccore = True);
92
        
93
        # run cluster analysis
94
        xmeans_instance.process();
95
        
96
        # obtain results of clustering
97
        clusters = xmeans_instance.get_clusters();
98
        
99
        # display allocated clusters
100
        draw_clusters(sample, clusters);
101
    @endcode
102
    
103
    """
104
    
105
    def __init__(self, data, initial_centers = None, kmax = 20, tolerance = 0.025, criterion = splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore = False):
        """!
        @brief Constructor of clustering algorithm X-Means.
        
        @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
        @param[in] initial_centers (list): Initial coordinates of centers of clusters that are represented by list: [center1, center2, ...],
                    if it is not specified then X-Means starts from the random center.
        @param[in] kmax (uint): Maximum number of clusters that can be allocated.
        @param[in] tolerance (double): Stop condition for each iteration: if maximum value of change of centers of clusters is less than tolerance then algorithm will stop processing.
        @param[in] criterion (splitting_type): Type of splitting creation.
        @param[in] ccore (bool): Defines should be CCORE (C++ pyclustering library) used instead of Python code or not.
        
        """
        
        self.__pointer_data = data
        self.__clusters = []
        
        if initial_centers is not None:
            # Copy so that processing never mutates the caller's list in place.
            self.__centers = initial_centers[:]
        else:
            # No centers provided: start from one random center with the same
            # dimension as the input points (assumes data is non-empty).
            self.__centers = [[random.random() for _ in range(len(data[0]))]]
        
        self.__kmax = kmax
        self.__tolerance = tolerance
        self.__criterion = criterion
        
        self.__ccore = ccore
    def process(self):
        """!
        @brief Performs cluster analysis in line with rules of X-Means algorithm.
        
        @remark Results of clustering can be obtained using corresponding gets methods.
        
        @see get_clusters()
        @see get_centers()
        
        """
        
        if self.__ccore is True:
            # C++ core path: the wrapper performs the whole analysis at once,
            # empty clusters are filtered out afterwards.
            found_clusters = wrapper.xmeans(self.__pointer_data, self.__centers, self.__kmax, self.__tolerance, self.__criterion)
            self.__clusters = [cluster for cluster in found_clusters if len(cluster) > 0]
            
            self.__centers = self.__update_centers(self.__clusters)
        else:
            # Pure Python path: grow the amount of centers until kmax is reached
            # or structure improvement stops producing new clusters.
            self.__clusters = []
            while len(self.__centers) < self.__kmax:
                amount_before = len(self.__centers)
                
                (self.__clusters, self.__centers) = self.__improve_parameters(self.__centers)
                candidate_centers = self.__improve_structure(self.__clusters, self.__centers)
                
                if amount_before == len(candidate_centers):
                    break
                
                self.__centers = candidate_centers
    def get_clusters(self):
164
        """!
165
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
166
        
167
        @return (list) List of allocated clusters.
168
        
169
        @see process()
170
        @see get_centers()
171
        
172
        """
173
         
174
        return self.__clusters;
175
     
176
     
177
    def get_centers(self):
178
        """!
179
        @brief Returns list of centers for allocated clusters.
180
        
181
        @return (list) List of centers for allocated clusters.
182
        
183
        @see process()
184
        @see get_clusters()
185
        
186
        """
187
         
188
        return self.__centers;
189
190
191
    def get_cluster_encoding(self):
        """!
        @brief Returns clustering result representation type that indicate how clusters are encoded.
        
        @return (type_encoding) Clustering result representation.
        
        @see get_clusters()
        
        """
        
        # X-Means always reports clusters as lists of point indexes.
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
    def __improve_parameters(self, centers, available_indexes = None):
        """!
        @brief Performs k-means clustering in the specified region.
        
        @param[in] centers (list): Centers of clusters.
        @param[in] available_indexes (list): Indexes that defines which points can be used for k-means clustering, if None - then all points are used.
        
        @return (tuple) Allocated clusters and updated centers: (clusters, centers),
                where each cluster contains indexes of objects in list of data.
        
        """
        
        # numpy.Inf was removed in NumPy 2.0; float('inf') is the portable spelling.
        changes = float('inf')
        
        # Compare squared center displacement against squared tolerance so the
        # loop can work with squared distances and avoid sqrt ("fast solution").
        stop_condition = self.__tolerance * self.__tolerance
        
        clusters = []
        
        while changes > stop_condition:
            clusters = self.__update_clusters(centers, available_indexes)
            # Drop empty clusters so that center recomputation never divides by zero.
            clusters = [cluster for cluster in clusters if len(cluster) > 0]
            
            updated_centers = self.__update_centers(clusters)
            
            # Maximum squared displacement of any center on this iteration.
            changes = max(euclidean_distance_sqrt(centers[index], updated_centers[index]) for index in range(len(updated_centers)))
            
            centers = updated_centers
        
        return (clusters, centers)
    def __improve_structure(self, clusters, centers):
        """!
        @brief Check for best structure: divides each cluster into two and checks for best results using splitting criterion.
        
        @param[in] clusters (list): Clusters that have been allocated (each cluster contains indexes of points from data).
        @param[in] centers (list): Centers of clusters.
        
        @return (list) Allocated centers for clustering.
        
        """
        
        difference = 0.001
        
        allocated_centers = []
        
        for index_cluster in range(len(clusters)):
            # Seed two child centers by nudging the parent center in opposite directions.
            child_centers = [
                list_math_addition_number(centers[index_cluster], -difference),
                list_math_addition_number(centers[index_cluster], difference),
            ]
            
            # Solve the local k-means problem for the children using only the parent's points.
            (child_clusters, child_centers) = self.__improve_parameters(child_centers, clusters[index_cluster])
            
            if len(child_clusters) <= 1:
                # Splitting the current data is not possible - keep the parent center.
                allocated_centers.append(centers[index_cluster])
                continue
            
            # Score the single-parent structure against the two-child structure.
            parent_scores = self.__splitting_criterion([clusters[index_cluster]], [centers[index_cluster]])
            child_scores = self.__splitting_criterion([child_clusters[0], child_clusters[1]], child_centers)
            
            split_require = False
            
            if self.__criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION:
                # BIC: higher score means better structure.
                split_require = parent_scores < child_scores
            elif self.__criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH:
                # MNDL: lower score means better structure - split when two children
                # represent the data samples more accurately than the single parent.
                split_require = parent_scores > child_scores
            
            if split_require is True:
                allocated_centers.append(child_centers[0])
                allocated_centers.append(child_centers[1])
            else:
                allocated_centers.append(centers[index_cluster])
        
        return allocated_centers
    def __splitting_criterion(self, clusters, centers):
        """!
        @brief Calculates splitting criterion for input clusters.
        
        @param[in] clusters (list): Clusters for which splitting criterion should be calculated.
        @param[in] centers (list): Centers of the clusters.
        
        @return (double) Returns splitting criterion. High value of splitting criterion means that current structure is much better.
        
        @see __bayesian_information_criterion(clusters, centers)
        @see __minimum_noiseless_description_length(clusters, centers)
        
        """
        
        if self.__criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION:
            return self.__bayesian_information_criterion(clusters, centers)
        
        elif self.__criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH:
            return self.__minimum_noiseless_description_length(clusters, centers)
        
        # The original 'assert 0' is silently stripped under 'python -O';
        # raise an explicit error for an unsupported criterion instead.
        raise ValueError("Unsupported splitting criterion: " + str(self.__criterion))
    def __minimum_noiseless_description_length(self, clusters, centers):
        """!
        @brief Calculates splitting criterion for input clusters using minimum noiseless description length criterion.
        
        @param[in] clusters (list): Clusters for which splitting criterion should be calculated.
        @param[in] centers (list): Centers of the clusters.
        
        @return (double) Returns splitting criterion in line with minimum noiseless description length criterion.
                Low value of splitting criterion means that current structure is much better.
                Returns float('inf') when any cluster is empty or when the total amount
                of points does not exceed the amount of clusters (degenerate structure).
        
        @see __bayesian_information_criterion(clusters, centers)
        
        """
        
        # Worst possible score by default (lower is better for MNDL).
        scores = float('inf');
        
        W = 0.0;            # sum over clusters of the average within-cluster distance
        K = len(clusters);  # amount of clusters
        N = 0.0;            # total amount of points over all clusters
        
        sigma_sqrt = 0.0;   # accumulates distances, later normalized into the variance estimate
        
        # Validation and confidence probability parameters (see Beheshti's formula
        # in the splitting_type documentation).
        alpha = 0.9;
        betta = 0.9;
        
        for index_cluster in range(0, len(clusters), 1):
            Ni = len(clusters[index_cluster]);
            if (Ni == 0): 
                # An empty cluster makes the structure invalid - report the worst score.
                return float('inf');
            
            Wi = 0.0;
            for index_object in clusters[index_cluster]:
                # euclidean_distance_sqrt should be used in line with paper, but in this case results are
                # very poor, therefore square root is used to improved.
                Wi += euclidean_distance(self.__pointer_data[index_object], centers[index_cluster]);
            
            sigma_sqrt += Wi;
            W += Wi / Ni;
            N += Ni;
        
        if (N - K > 0):
            # MLE-style estimate of sigma^2 (built from plain distances, see the note above).
            sigma_sqrt /= (N - K);
            sigma = sigma_sqrt ** 0.5;
            
            Kw = (1.0 - K / N) * sigma_sqrt;
            Ks = ( 2.0 * alpha * sigma / (N ** 0.5) ) * ( (alpha ** 2.0) * sigma_sqrt / N + W - Kw / 2.0 ) ** 0.5;
            
            # NOTE(review): the last term uses alpha**0.5 while the documented upper-bound
            # formula has alpha^2; this may be part of the deliberate "contradiction"
            # mentioned in the splitting_type docs - confirm before changing.
            scores = sigma_sqrt * (2 * K)**0.5 * ((2 * K)**0.5 + betta) / N + W - sigma_sqrt + Ks + 2 * alpha**0.5 * sigma_sqrt / N
        
        return scores;
    def __bayesian_information_criterion(self, clusters, centers):
        """!
        @brief Calculates splitting criterion for input clusters using bayesian information criterion.
        
        @param[in] clusters (list): Clusters for which splitting criterion should be calculated.
        @param[in] centers (list): Centers of the clusters.
        
        @return (double) Splitting criterion in line with bayesian information criterion.
                High value of splitting criterion means that current structure is much better.
        
        @see __minimum_noiseless_description_length(clusters, centers)
        
        """
        
        scores = [float('inf')] * len(clusters)     # BIC score per cluster
        dimension = len(self.__pointer_data[0])
        
        # Estimation of the noise variance in the data set.
        sigma_sqrt = 0.0
        K = len(clusters)
        N = 0.0
        
        for index_cluster, cluster in enumerate(clusters):
            for index_object in cluster:
                sigma_sqrt += euclidean_distance_sqrt(self.__pointer_data[index_object], centers[index_cluster])
            
            N += len(cluster)
        
        if N - K > 0:
            sigma_sqrt /= (N - K)
            
            # Amount of free parameters: (K - 1) class probabilities, dimension * K
            # centroid coordinates and one variance estimate.
            p = (K - 1) + dimension * K + 1
            
            for index_cluster, cluster in enumerate(clusters):
                n = len(cluster)
                
                # Log-likelihood of the cluster (Kass's formula).
                L = n * log(n) - n * log(N) - n * 0.5 * log(2.0 * numpy.pi) - n * dimension * 0.5 * log(sigma_sqrt) - (n - K) * 0.5
                
                # BIC calculation.
                scores[index_cluster] = L - p * 0.5 * log(N)
        
        return sum(scores)
    def __update_clusters(self, centers, available_indexes = None):
        """!
        @brief Calculates Euclidean distance to each point from the each cluster.
               Nearest points are captured by according clusters and as a result clusters are updated.
        
        @param[in] centers (list): Coordinates of centers of clusters that are represented by list: [center1, center2, ...].
        @param[in] available_indexes (list): Indexes that defines which points can be used from input data, if None - then all points are used.
        
        @return (list) Updated clusters.
        
        """
        
        if available_indexes is None:
            bypass = range(len(self.__pointer_data))
        else:
            bypass = available_indexes
        
        clusters = [[] for _ in range(len(centers))]
        for index_point in bypass:
            index_optim = -1
            dist_optim = 0.0
            
            for index in range(len(centers)):
                # Squared distance is sufficient for the nearest-center comparison
                # and avoids the sqrt of a full Euclidean distance (fast solution).
                dist = euclidean_distance_sqrt(self.__pointer_data[index_point], centers[index])
                
                # BUGFIX: original used 'index is 0' - identity comparison of ints is
                # implementation-dependent and raises SyntaxWarning on modern Python.
                if (dist < dist_optim) or (index == 0):
                    index_optim = index
                    dist_optim = dist
            
            clusters[index_optim].append(index_point)
        
        return clusters
    def __update_centers(self, clusters):
        """!
        @brief Updates centers of clusters in line with contained objects.
        
        @param[in] clusters (list): Clusters that contain indexes of objects from data.
        
        @return (list) Updated centers.
        
        """
        
        dimension = len(self.__pointer_data[0])
        centers = [[] for _ in range(len(clusters))]
        
        for index, cluster in enumerate(clusters):
            # Each center is the arithmetic mean of the points that belong to the
            # cluster (callers are expected to pass non-empty clusters).
            point_sum = [0.0] * dimension
            
            for index_point in cluster:
                point_sum = list_math_addition(point_sum, self.__pointer_data[index_point])
            
            centers[index] = list_math_division_number(point_sum, len(cluster))
        
        return centers