Push - 0.8.dev ( 0b64b0...837ab1 ) by Andrei (completed, created 01:49)

xmeans.__update_clusters()   rating: C

Complexity:   Conditions 7
Size:         Total Lines 34
Duplication:  Lines 0, Ratio 0 %
Importance:   Changes 0

Metric   Value
cc       7
dl       0
loc      34
rs       5.5
c        0
b        0
f        0
"""!

@brief Cluster analysis algorithm: X-Means
@details Based on article description:
         - D.Pelleg, A.Moore. X-means: Extending K-means with Efficient Estimation of the Number of Clusters. 2000.

@authors Andrei Novikov ([email protected])
@date 2014-2018
@copyright GNU Public License

@cond GNU_PUBLIC_LICENSE
    PyClustering is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    
    PyClustering is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
@endcond

"""


import numpy;
import random;

from enum import IntEnum;

from math import log;

from pyclustering.cluster.encoder import type_encoding;
from pyclustering.cluster.kmeans import kmeans;

from pyclustering.core.wrapper import ccore_library;

import pyclustering.core.xmeans_wrapper as wrapper;

from pyclustering.utils import euclidean_distance_sqrt, euclidean_distance;
from pyclustering.utils import list_math_addition_number;


class splitting_type(IntEnum):
    """!
    @brief Enumeration of splitting types that can be used for cluster splitting in the X-Means algorithm.
    
    """
    
    ## Bayesian information criterion (BIC) to approximate the correct number of clusters.
    ## Kass's formula is used to calculate BIC:
    ## \f[BIC(\theta) = L(D) - \frac{1}{2} p \ln(N)\f]
    ##
    ## The number of free parameters \f$p\f$ is simply the sum of \f$K - 1\f$ class probabilities, \f$MK\f$ centroid coordinates, and one variance estimate:
    ## \f[p = (K - 1) + MK + 1\f]
    ##
    ## The log-likelihood of the data:
    ## \f[L(D) = n_j \ln(n_j) - n_j \ln(N) - \frac{n_j}{2} \ln(2\pi) - \frac{n_j d}{2} \ln(\hat{\sigma}^2) - \frac{n_j - K}{2}\f]
    ##
    ## The maximum likelihood estimate (MLE) for the variance:
    ## \f[\hat{\sigma}^2 = \frac{1}{N - K}\sum\limits_{j}\sum\limits_{i} \left \| x_{ij} - \hat{C}_j \right \|^2\f]
    BAYESIAN_INFORMATION_CRITERION = 0;
    
    ## Minimum noiseless description length (MNDL) to approximate the correct number of clusters.
    ## Beheshti's formula is used to calculate the upper bound:
    ## \f[Z = \frac{\sigma^2 \sqrt{2K}}{N}(\sqrt{2K} + \beta) + W - \sigma^2 + \frac{2\alpha\sigma}{\sqrt{N}}\sqrt{\frac{\alpha^2\sigma^2}{N} + W - \left(1 - \frac{K}{N}\right)\frac{\sigma^2}{2}} + \frac{2\alpha^2\sigma^2}{N}\f]
    ##
    ## where \f$\alpha\f$ and \f$\beta\f$ represent the parameters for the validation probability and the confidence probability.
    ##
    ## To improve clustering results, the implementation deliberately deviates from the paper and uses non-squared distances:
    ## \f[W = \frac{1}{n_j}\sum\limits_{i} \left \| x_{ij} - \hat{C}_j \right \|\f]
    ## \f[\hat{\sigma}^2 = \frac{1}{N - K}\sum\limits_{j}\sum\limits_{i} \left \| x_{ij} - \hat{C}_j \right \|\f]
    MINIMUM_NOISELESS_DESCRIPTION_LENGTH = 1;


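To make the BIC option concrete, the following standalone sketch (illustration only, not pyclustering code; the function name and toy data are invented) scores a clustering with Kass's formula exactly as documented above. On two well-separated blobs the split structure receives the higher score:

from math import log, pi

def bic_score(data, clusters, centers):
    # illustrative implementation of the documented formulas (assumes N > K and non-zero variance)
    dimension = len(data[0])
    K = len(clusters)
    N = sum(len(cluster) for cluster in clusters)

    # pooled MLE of the variance: squared distances to centers over (N - K)
    sigma_sqrt = 0.0
    for cluster, center in zip(clusters, centers):
        for index_point in cluster:
            sigma_sqrt += sum((a - b) ** 2 for a, b in zip(data[index_point], center))
    sigma_sqrt /= (N - K)

    p = (K - 1) + dimension * K + 1    # number of free parameters

    score = 0.0
    for cluster in clusters:
        n = len(cluster)
        # per-cluster log-likelihood L(D) from the docstring above
        score += (n * log(n) - n * log(N) - n * 0.5 * log(2.0 * pi)
                  - n * dimension * 0.5 * log(sigma_sqrt) - (n - K) * 0.5)
    return score - p * 0.5 * log(N)

data = [[1.0], [1.2], [0.8], [9.0], [9.2], [8.8]]
print(bic_score(data, [[0, 1, 2, 3, 4, 5]], [[5.0]]))           # one cluster: lower
print(bic_score(data, [[0, 1, 2], [3, 4, 5]], [[1.0], [9.0]]))  # two clusters: higher
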
class xmeans:
    """!
    @brief Class represents the clustering algorithm X-Means.
    @details The X-Means clustering method starts with the assumption of having a minimum number of clusters,
             and then dynamically increases that number. X-Means uses the specified splitting criterion to control
             the process of splitting clusters. The K-Means++ method can be used to calculate initial centers.
             
             The CCORE option can be used to employ the pyclustering core - a C/C++ shared library for processing that significantly increases performance.
             
             The CCORE implementation of the algorithm uses a thread pool to parallelize the clustering process.
    
    Example:
    @code
        # sample for cluster analysis (represented by list)
        sample = read_sample(path_to_sample);
        
        # create object of X-Means algorithm that uses CCORE for processing
        # initial centers - optional parameter, if it is None, then random centers will be used by the algorithm.
        # let's avoid random initial centers and initialize them using the K-Means++ method:
        initial_centers = kmeans_plusplus_initializer(sample, 2).initialize();
        xmeans_instance = xmeans(sample, initial_centers, ccore = True);
        
        # run cluster analysis
        xmeans_instance.process();
        
        # obtain results of clustering
        clusters = xmeans_instance.get_clusters();
        
        # display allocated clusters
        draw_clusters(sample, clusters);
    @endcode
    
    @see center_initializer
    
    """
    
    def __init__(self, data, initial_centers = None, kmax = 20, tolerance = 0.025, criterion = splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore = True):
        """!
        @brief Constructor of the X-Means clustering algorithm.
        
        @param[in] data (list): Input data that is presented as a list of points (objects); each point should be represented by a list or tuple.
        @param[in] initial_centers (list): Initial coordinates of cluster centers that are represented by a list: [center1, center2, ...];
                    if not specified, then X-Means starts from a random center.
        @param[in] kmax (uint): Maximum number of clusters that can be allocated.
        @param[in] tolerance (double): Stop condition for each iteration: if the maximum change of cluster centers is less than the tolerance, then the algorithm stops processing.
        @param[in] criterion (splitting_type): Type of splitting criterion.
        @param[in] ccore (bool): Defines whether CCORE (the C++ pyclustering library) should be used instead of Python code.
        
        """
        
        self.__pointer_data = data;
        self.__clusters = [];
        
        if (initial_centers is not None):
            self.__centers = initial_centers[:];
        else:
            self.__centers = [ [random.random() for _ in range(len(data[0])) ] ];
        
        self.__kmax = kmax;
        self.__tolerance = tolerance;
        self.__criterion = criterion;
         
        self.__ccore = ccore;
        if (self.__ccore):
            self.__ccore = ccore_library.workable();


    def process(self):
        """!
        @brief Performs cluster analysis in line with the rules of the X-Means algorithm.
        
        @remark Results of clustering can be obtained using the corresponding get methods.
        
        @see get_clusters()
        @see get_centers()
        
        """
        
        if (self.__ccore is True):
            self.__clusters, self.__centers = wrapper.xmeans(self.__pointer_data, self.__centers, self.__kmax, self.__tolerance, self.__criterion);

        else:
            self.__clusters = [];
            while ( len(self.__centers) <= self.__kmax ):
                current_cluster_number = len(self.__centers);
                
                self.__clusters, self.__centers = self.__improve_parameters(self.__centers);
                allocated_centers = self.__improve_structure(self.__clusters, self.__centers);
                
                if (current_cluster_number == len(allocated_centers)):
                #if ( (current_cluster_number == len(allocated_centers)) or (len(allocated_centers) > self.__kmax) ):
                    break;
                else:
                    self.__centers = allocated_centers;
            
            self.__clusters, self.__centers = self.__improve_parameters(self.__centers);


    def get_clusters(self):
        """!
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
        
        @return (list) List of allocated clusters.
        
        @see process()
        @see get_centers()
        
        """

        return self.__clusters;


    def get_centers(self):
        """!
        @brief Returns list of centers for allocated clusters.
        
        @return (list) List of centers for allocated clusters.
        
        @see process()
        @see get_clusters()
        
        """
         
        return self.__centers;


    def get_cluster_encoding(self):
        """!
        @brief Returns clustering result representation type that indicates how clusters are encoded.
        
        @return (type_encoding) Clustering result representation.
        
        @see get_clusters()
        
        """
        
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;


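For reference, CLUSTER_INDEX_LIST_SEPARATION means that get_clusters() returns one list of point indexes per cluster. A small illustrative sketch (not part of the class) converting that representation into a flat list of per-point labels:

def index_lists_to_labels(clusters, data_length):
    # clusters: [[indexes of points in cluster 0], [indexes in cluster 1], ...]
    labels = [None] * data_length
    for cluster_index, cluster in enumerate(clusters):
        for point_index in cluster:
            labels[point_index] = cluster_index
    return labels

print(index_lists_to_labels([[0, 2], [1, 3]], 4))   # [0, 1, 0, 1]
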
    def __improve_parameters(self, centers, available_indexes = None):
        """!
        @brief Performs k-means clustering in the specified region.
        
        @param[in] centers (list): Centers of clusters.
        @param[in] available_indexes (list): Indexes that define which points can be used for k-means clustering; if None, then all points are used.
        
        @return (tuple) List of allocated clusters (each cluster contains indexes of objects from the data) and list of updated centers.
        
        """

        local_data = self.__pointer_data;
        if (available_indexes):
            local_data = [ self.__pointer_data[i] for i in available_indexes ];
        
        kmeans_instance = kmeans(local_data, centers, tolerance=self.__tolerance, ccore=False);
        kmeans_instance.process();
        
        local_clusters = kmeans_instance.get_clusters();
        local_centers = kmeans_instance.get_centers();
        
        # when clustering was performed on a subset, map local indexes back to indexes in the full data set
        clusters = [];
        if (available_indexes):
            for local_cluster in local_clusters:
                current_cluster = [];
                for index_point in local_cluster:
                    current_cluster.append(available_indexes[index_point]);
                    
                clusters.append(current_cluster);
        else:
            clusters = local_clusters;
        
        return (clusters, local_centers);
    
    
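The subtle part of __improve_parameters() is the remapping step: k-means runs on a compacted copy of the region, so cluster members come back as positions in local_data and must be translated back through available_indexes. A standalone sketch of just that translation (toy data for illustration):

data = [[0.0], [0.1], [5.0], [5.1], [9.0]]
available_indexes = [1, 2, 4]            # the region handed to k-means
local_data = [data[i] for i in available_indexes]

# suppose k-means on local_data produced these local clusters:
local_clusters = [[0], [1, 2]]           # positions within local_data

# translate local positions back to indexes in the full data set
clusters = [[available_indexes[i] for i in local_cluster]
            for local_cluster in local_clusters]
print(clusters)                          # [[1], [2, 4]]
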
    def __improve_structure(self, clusters, centers):
        """!
        @brief Checks for the best structure: divides each cluster into two and compares the results using the splitting criterion.
        
        @param[in] clusters (list): Clusters that have been allocated (each cluster contains indexes of points from the data).
        @param[in] centers (list): Centers of clusters.
        
        @return (list) Allocated centers for clustering.
        
        """
         
        difference = 0.001;
          
        allocated_centers = [];
        amount_free_centers = self.__kmax - len(centers);

        for index_cluster in range(len(clusters)):
            # split cluster into two child clusters
            parent_child_centers = [];
            parent_child_centers.append(list_math_addition_number(centers[index_cluster], -difference));
            parent_child_centers.append(list_math_addition_number(centers[index_cluster], difference));
          
            # solve k-means problem for children where data of parent are used.
            (parent_child_clusters, parent_child_centers) = self.__improve_parameters(parent_child_centers, clusters[index_cluster]);
              
            # if it's possible to split current data
            if (len(parent_child_clusters) > 1):
                # calculate splitting criterion
                parent_scores = self.__splitting_criterion([ clusters[index_cluster] ], [ centers[index_cluster] ]);
                child_scores = self.__splitting_criterion([ parent_child_clusters[0], parent_child_clusters[1] ], parent_child_centers);
              
                split_require = False;
                
                # reallocate number of centers (clusters) in line with scores
                if (self.__criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION):
                    if (parent_scores < child_scores): split_require = True;
                    
                elif (self.__criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH):
                    # if the score for the split structure with two children is smaller than that for the parent structure,
                    # then representing the data samples with two clusters is more accurate than using a single parent cluster.
                    if (parent_scores > child_scores): split_require = True;
                
                if ( (split_require is True) and (amount_free_centers > 0) ):
                    allocated_centers.append(parent_child_centers[0]);
                    allocated_centers.append(parent_child_centers[1]);
                    
                    amount_free_centers -= 1;
                else:
                    allocated_centers.append(centers[index_cluster]);
                    
            else:
                allocated_centers.append(centers[index_cluster]);
          
        return allocated_centers;
     
     
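The ±difference perturbation above is how one parent center becomes two child seeds before the local k-means run; list_math_addition_number simply adds a scalar to every coordinate. A minimal sketch of that step with plain lists (illustration only):

def add_number(vector, value):
    # same idea as pyclustering.utils.list_math_addition_number
    return [coordinate + value for coordinate in vector]

difference = 0.001
parent_center = [2.0, 3.0]
child_seeds = [add_number(parent_center, -difference),
               add_number(parent_center, difference)]
print(child_seeds)   # [[1.999, 2.999], [2.001, 3.001]]
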
    def __splitting_criterion(self, clusters, centers):
        """!
        @brief Calculates the splitting criterion for input clusters.
        
        @param[in] clusters (list): Clusters for which the splitting criterion should be calculated.
        @param[in] centers (list): Centers of the clusters.
        
        @return (double) Returns the splitting criterion. For BIC a higher value means the current structure is better; for MNDL a lower value does.
        
        @see __bayesian_information_criterion(clusters, centers)
        @see __minimum_noiseless_description_length(clusters, centers)
        
        """
        
        if (self.__criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION):
            return self.__bayesian_information_criterion(clusters, centers);
        
        elif (self.__criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH):
            return self.__minimum_noiseless_description_length(clusters, centers);
        
        else:
            assert 0;


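The branch taken here is fixed once by the criterion argument of the constructor. A usage sketch (the sample values are placeholders for illustration):

from pyclustering.cluster.xmeans import xmeans, splitting_type

sample = [[0.0, 0.0], [0.1, 0.1], [5.0, 5.0], [5.1, 5.1]]
instance = xmeans(sample, [[0.0, 0.0]], kmax=5,
                  criterion=splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH,
                  ccore=False)
instance.process()
print(instance.get_clusters())
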
    def __minimum_noiseless_description_length(self, clusters, centers):
        """!
        @brief Calculates the splitting criterion for input clusters using the minimum noiseless description length criterion.
        
        @param[in] clusters (list): Clusters for which the splitting criterion should be calculated.
        @param[in] centers (list): Centers of the clusters.
        
        @return (double) Returns the splitting criterion in line with the minimum noiseless description length criterion.
                A low value of the splitting criterion means that the current structure is better.
        
        @see __bayesian_information_criterion(clusters, centers)
        
        """
        
        scores = float('inf');
        
        W = 0.0;
        K = len(clusters);
        N = 0.0;

        sigma_sqrt = 0.0;
        
        alpha = 0.9;
        betta = 0.9;
        
        for index_cluster in range(0, len(clusters), 1):
            Ni = len(clusters[index_cluster]);
            if (Ni == 0): 
                return float('inf');
            
            Wi = 0.0;
            for index_object in clusters[index_cluster]:
                # euclidean_distance_sqrt (squared distance) should be used in line with the paper, but in that case
                # results are very poor; therefore the plain euclidean distance is used to improve them.
                Wi += euclidean_distance(self.__pointer_data[index_object], centers[index_cluster]);
            
            sigma_sqrt += Wi;
            W += Wi / Ni;
            N += Ni;
        
        if (N - K > 0):
            sigma_sqrt /= (N - K);
            sigma = sigma_sqrt ** 0.5;
            
            Kw = (1.0 - K / N) * sigma_sqrt;
            Ks = ( 2.0 * alpha * sigma / (N ** 0.5) ) * ( (alpha ** 2.0) * sigma_sqrt / N + W - Kw / 2.0 ) ** 0.5;
            
            scores = sigma_sqrt * (2 * K)**0.5 * ((2 * K)**0.5 + betta) / N + W - sigma_sqrt + Ks + 2 * alpha**0.5 * sigma_sqrt / N;
        
        return scores;


    def __bayesian_information_criterion(self, clusters, centers):
        """!
        @brief Calculates the splitting criterion for input clusters using the Bayesian information criterion.
        
        @param[in] clusters (list): Clusters for which the splitting criterion should be calculated.
        @param[in] centers (list): Centers of the clusters.
        
        @return (double) Splitting criterion in line with the Bayesian information criterion.
                A high value of the splitting criterion means that the current structure is better.
                
        @see __minimum_noiseless_description_length(clusters, centers)
        
        """

        scores = [float('inf')] * len(clusters);    # splitting criterion
        dimension = len(self.__pointer_data[0]);
          
        # estimation of the noise variance in the data set
        sigma_sqrt = 0.0;
        K = len(clusters);
        N = 0.0;
          
        for index_cluster in range(0, len(clusters), 1):
            for index_object in clusters[index_cluster]:
                sigma_sqrt += euclidean_distance_sqrt(self.__pointer_data[index_object], centers[index_cluster]);

            N += len(clusters[index_cluster]);
      
        if (N - K > 0):
            sigma_sqrt /= (N - K);
            p = (K - 1) + dimension * K + 1;

            # in case of the same points, sigma_sqrt can be zero (issue: #407)
            sigma_multiplier = 0.0;
            if (sigma_sqrt <= 0.0):
                sigma_multiplier = float('-inf');
            else:
                sigma_multiplier = dimension * 0.5 * log(sigma_sqrt);
            
            # splitting criterion    
            for index_cluster in range(0, len(clusters), 1):
                n = len(clusters[index_cluster]);

                L = n * log(n) - n * log(N) - n * 0.5 * log(2.0 * numpy.pi) - n * sigma_multiplier - (n - K) * 0.5;
                
                # BIC calculation
                scores[index_cluster] = L - p * 0.5 * log(N);
                
        return sum(scores);
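Finally, a self-contained usage sketch complementing the docstring example at the top of the class; it uses inline toy data instead of read_sample so it runs without sample files (kmeans_plusplus_initializer is assumed to be available from pyclustering.cluster.center_initializer, as in pyclustering 0.8):

from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.cluster.xmeans import xmeans

# two obvious blobs; X-Means should grow from one initial center to two clusters
sample = [[1.0, 1.0], [1.2, 0.9], [0.8, 1.1],
          [9.0, 9.0], [9.2, 8.9], [8.8, 9.1]]

initial_centers = kmeans_plusplus_initializer(sample, 1).initialize()
xmeans_instance = xmeans(sample, initial_centers, kmax=10, ccore=False)
xmeans_instance.process()

print(xmeans_instance.get_clusters())   # e.g. [[0, 1, 2], [3, 4, 5]]
print(xmeans_instance.get_centers())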