Completed
Push — master ( 2f25f6...d3c425 )
by Andrei
02:18
created

xmeans.__minimum_noiseless_description_length()   B

Complexity

Conditions 5

Size

Total Lines 50

Duplication

Lines 1
Ratio 2 %

Importance

Changes 0
Metric Value
cc 5
dl 1
loc 50
rs 8.295
c 0
b 0
f 0
1
"""!
2
3
@brief Cluster analysis algorithm: X-Means
4
@details Based on article description:
5
         - D.Pelleg, A.Moore. X-means: Extending K-means with Efficient Estimation of the Number of Clusters. 2000.
6
7
@authors Andrei Novikov ([email protected])
8
@date 2014-2017
9
@copyright GNU Public License
10
11
@cond GNU_PUBLIC_LICENSE
12
    PyClustering is free software: you can redistribute it and/or modify
13
    it under the terms of the GNU General Public License as published by
14
    the Free Software Foundation, either version 3 of the License, or
15
    (at your option) any later version.
16
    
17
    PyClustering is distributed in the hope that it will be useful,
18
    but WITHOUT ANY WARRANTY; without even the implied warranty of
19
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20
    GNU General Public License for more details.
21
    
22
    You should have received a copy of the GNU General Public License
23
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
24
@endcond
25
26
"""
27
28
29
import numpy;
0 ignored issues
show
Configuration introduced by
The import numpy could not be resolved.

This can be caused by one of the following:

1. Missing Dependencies

This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands.

# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version.

2. Missing __init__.py files

This error could also result from missing __init__.py files in your module folders. Make sure that you place one file in each sub-folder.

Loading history...
30
import random;
31
32
from enum import IntEnum;
33
34
from math import log;
35
36
from pyclustering.cluster.encoder import type_encoding;
37
38
import pyclustering.core.xmeans_wrapper as wrapper;
39
40
from pyclustering.utils import euclidean_distance_sqrt, euclidean_distance;
41
from pyclustering.utils import list_math_addition_number, list_math_addition, list_math_division_number;
42
43
44
class splitting_type(IntEnum):
    """!
    @brief Enumeration of splitting types that can be used as splitting creation of cluster in X-Means algorithm.

    """

    ## Bayesian information criterion (BIC) to approximate the correct number of clusters.
    ## Kass's formula is used to calculate BIC:
    ## \f[BIC(\theta) = L(D) - \frac{1}{2}pln(N)\f]
    ##
    ## The number of free parameters \f$p\f$ is simply the sum of \f$K - 1\f$ class probabilities, \f$MK\f$ centroid coordinates, and one variance estimate:
    ## \f[p = (K - 1) + MK + 1\f]
    ##
    ## The log-likelihood of the data:
    ## \f[L(D) = n_jln(n_j) - n_jln(N) - \frac{n_j}{2}ln(2\pi) - \frac{n_jd}{2}ln(\hat{\sigma}^2) - \frac{n_j - K}{2}\f]
    ##
    ## The maximum likelihood estimate (MLE) for the variance:
    ## \f[\hat{\sigma}^2 = \frac{1}{N - K}\sum\limits_{j}\sum\limits_{i}||x_{ij} - \hat{C}_j||^2\f]
    BAYESIAN_INFORMATION_CRITERION = 0

    ## Minimum noiseless description length (MNDL) to approximate the correct number of clusters.
    ## Beheshti's formula is used to calculate upper bound:
    ## \f[Z = \frac{\sigma^2 \sqrt{2K} }{N}(\sqrt{2K} + \beta) + W - \sigma^2 + \frac{2\alpha\sigma}{\sqrt{N}}\sqrt{\frac{\alpha^2\sigma^2}{N} + W - \left(1 - \frac{K}{N}\right)\frac{\sigma^2}{2}} + \frac{2\alpha^2\sigma^2}{N}\f]
    ##
    ## where \f$\alpha\f$ and \f$\beta\f$ represent the parameters for validation probability and confidence probability.
    ##
    ## To improve clustering results some contradiction is introduced:
    ## \f[W = \frac{1}{n_j}\sum\limits_{i}||x_{ij} - \hat{C}_j||\f]
    ## \f[\hat{\sigma}^2 = \frac{1}{N - K}\sum\limits_{j}\sum\limits_{i}||x_{ij} - \hat{C}_j||\f]
    MINIMUM_NOISELESS_DESCRIPTION_LENGTH = 1
class xmeans:
77
    """!
78
    @brief Class represents clustering algorithm X-Means.
79
    @details X-means clustering method starts with the assumption of having a minimum number of clusters, 
80
             and then dynamically increases them. X-means uses specified splitting criterion to control 
81
             the process of splitting clusters.
82
    
83
    Example:
84
    @code
85
        # sample for cluster analysis (represented by list)
86
        sample = read_sample(path_to_sample);
87
        
88
        # create object of X-Means algorithm that uses CCORE for processing
89
        # initial centers - optional parameter, if it is None, then random center will be used by the algorithm
90
        initial_centers = [ [0.0, 0.5] ];
91
        xmeans_instance = xmeans(sample, initial_centers, ccore = True);
92
        
93
        # run cluster analysis
94
        xmeans_instance.process();
95
        
96
        # obtain results of clustering
97
        clusters = xmeans_instance.get_clusters();
98
        
99
        # display allocated clusters
100
        draw_clusters(sample, clusters);
101
    @endcode
102
    
103
    """
104
    
105
    def __init__(self, data, initial_centers = None, kmax = 20, tolerance = 0.025, criterion = splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore = False):
        """!
        @brief Constructor of clustering algorithm X-Means.
        
        @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
        @param[in] initial_centers (list): Initial coordinates of centers of clusters that are represented by list: [center1, center2, ...],
                    if it is not specified then X-Means starts from the random center.
        @param[in] kmax (uint): Maximum number of clusters that can be allocated.
        @param[in] tolerance (double): Stop condition for each iteration: if maximum value of change of centers of clusters is less than tolerance then algorithm will stop processing.
        @param[in] criterion (splitting_type): Type of splitting creation.
        @param[in] ccore (bool): Defines should be CCORE (C++ pyclustering library) used instead of Python code or not.
        
        """
        
        self.__pointer_data = data
        self.__clusters = []
        
        if initial_centers is not None:
            # Copy so that processing never mutates the caller's list in place.
            self.__centers = initial_centers[:]
        else:
            # No centers provided: start from one random center with the same
            # dimension as the input points (assumes data is non-empty).
            self.__centers = [[random.random() for _ in range(len(data[0]))]]
        
        self.__kmax = kmax
        self.__tolerance = tolerance
        self.__criterion = criterion
        
        self.__ccore = ccore
    def process(self):
        """!
        @brief Performs cluster analysis in line with rules of X-Means algorithm.
        
        @remark Results of clustering can be obtained using corresponding gets methods.
        
        @see get_clusters()
        @see get_centers()
        
        """
        
        if self.__ccore is True:
            # C++ core path: the wrapper performs the whole analysis at once,
            # empty clusters are filtered out afterwards.
            found_clusters = wrapper.xmeans(self.__pointer_data, self.__centers, self.__kmax, self.__tolerance, self.__criterion)
            self.__clusters = [cluster for cluster in found_clusters if len(cluster) > 0]
            
            self.__centers = self.__update_centers(self.__clusters)
        else:
            # Pure Python path: grow the amount of centers until kmax is reached
            # or structure improvement stops producing new clusters.
            self.__clusters = []
            while len(self.__centers) < self.__kmax:
                amount_before = len(self.__centers)
                
                (self.__clusters, self.__centers) = self.__improve_parameters(self.__centers)
                candidate_centers = self.__improve_structure(self.__clusters, self.__centers)
                
                if amount_before == len(candidate_centers):
                    break
                
                self.__centers = candidate_centers
    def get_clusters(self):
164
        """!
165
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
166
        
167
        @return (list) List of allocated clusters.
168
        
169
        @see process()
170
        @see get_centers()
171
        
172
        """
173
         
174
        return self.__clusters;
175
     
176
     
177
    def get_centers(self):
178
        """!
179
        @brief Returns list of centers for allocated clusters.
180
        
181
        @return (list) List of centers for allocated clusters.
182
        
183
        @see process()
184
        @see get_clusters()
185
        
186
        """
187
         
188
        return self.__centers;
189
190
191
    def get_cluster_encoding(self):
        """!
        @brief Returns clustering result representation type that indicate how clusters are encoded.
        
        @return (type_encoding) Clustering result representation.
        
        @see get_clusters()
        
        """
        
        # X-Means always reports clusters as lists of point indexes.
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION
    def __improve_parameters(self, centers, available_indexes = None):
        """!
        @brief Performs k-means clustering in the specified region.
        
        @param[in] centers (list): Centers of clusters.
        @param[in] available_indexes (list): Indexes that defines which points can be used for k-means clustering, if None - then all points are used.
        
        @return (tuple) Allocated clusters and updated centers: (clusters, centers),
                where each cluster contains indexes of objects in list of data.
        
        """
        
        # numpy.Inf was removed in NumPy 2.0; float('inf') is the portable spelling.
        changes = float('inf')
        
        # Compare squared center displacement against squared tolerance so the
        # loop can work with squared distances and avoid sqrt ("fast solution").
        stop_condition = self.__tolerance * self.__tolerance
        
        clusters = []
        
        while changes > stop_condition:
            clusters = self.__update_clusters(centers, available_indexes)
            # Drop empty clusters so that center recomputation never divides by zero.
            clusters = [cluster for cluster in clusters if len(cluster) > 0]
            
            updated_centers = self.__update_centers(clusters)
            
            # Maximum squared displacement of any center on this iteration.
            changes = max(euclidean_distance_sqrt(centers[index], updated_centers[index]) for index in range(len(updated_centers)))
            
            centers = updated_centers
        
        return (clusters, centers)
    def __improve_structure(self, clusters, centers):
        """!
        @brief Check for best structure: divides each cluster into two and checks for best results using splitting criterion.
        
        @param[in] clusters (list): Clusters that have been allocated (each cluster contains indexes of points from data).
        @param[in] centers (list): Centers of clusters.
        
        @return (list) Allocated centers for clustering.
        
        """
        
        difference = 0.001
        
        allocated_centers = []
        
        for index_cluster in range(len(clusters)):
            # Seed two child centers by nudging the parent center in opposite directions.
            child_centers = [
                list_math_addition_number(centers[index_cluster], -difference),
                list_math_addition_number(centers[index_cluster], difference),
            ]
            
            # Solve the local k-means problem for the children using only the parent's points.
            (child_clusters, child_centers) = self.__improve_parameters(child_centers, clusters[index_cluster])
            
            if len(child_clusters) <= 1:
                # Splitting the current data is not possible - keep the parent center.
                allocated_centers.append(centers[index_cluster])
                continue
            
            # Score the single-parent structure against the two-child structure.
            parent_scores = self.__splitting_criterion([clusters[index_cluster]], [centers[index_cluster]])
            child_scores = self.__splitting_criterion([child_clusters[0], child_clusters[1]], child_centers)
            
            split_require = False
            
            if self.__criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION:
                # BIC: higher score means better structure.
                split_require = parent_scores < child_scores
            elif self.__criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH:
                # MNDL: lower score means better structure - split when two children
                # represent the data samples more accurately than the single parent.
                split_require = parent_scores > child_scores
            
            if split_require is True:
                allocated_centers.append(child_centers[0])
                allocated_centers.append(child_centers[1])
            else:
                allocated_centers.append(centers[index_cluster])
        
        return allocated_centers
    def __splitting_criterion(self, clusters, centers):
        """!
        @brief Calculates splitting criterion for input clusters.
        
        @param[in] clusters (list): Clusters for which splitting criterion should be calculated.
        @param[in] centers (list): Centers of the clusters.
        
        @return (double) Returns splitting criterion. High value of splitting criterion means that current structure is much better.
        
        @see __bayesian_information_criterion(clusters, centers)
        @see __minimum_noiseless_description_length(clusters, centers)
        
        """
        
        if self.__criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION:
            return self.__bayesian_information_criterion(clusters, centers)
        
        elif self.__criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH:
            return self.__minimum_noiseless_description_length(clusters, centers)
        
        # The original 'assert 0' is silently stripped under 'python -O';
        # raise an explicit error for an unsupported criterion instead.
        raise ValueError("Unsupported splitting criterion: " + str(self.__criterion))
    def __minimum_noiseless_description_length(self, clusters, centers):
        """!
        @brief Calculates splitting criterion for input clusters using minimum noiseless description length criterion.
        
        @param[in] clusters (list): Clusters for which splitting criterion should be calculated.
        @param[in] centers (list): Centers of the clusters.
        
        @return (double) Returns splitting criterion in line with minimum noiseless description length criterion.
                Low value of splitting criterion means that current structure is much better.
                Returns float('inf') when any cluster is empty or when the total amount
                of points does not exceed the amount of clusters (degenerate structure).
        
        @see __bayesian_information_criterion(clusters, centers)
        
        """
        
        # Worst possible score by default (lower is better for MNDL).
        scores = float('inf');
        
        W = 0.0;            # sum over clusters of the average within-cluster distance
        K = len(clusters);  # amount of clusters
        N = 0.0;            # total amount of points over all clusters
        
        sigma_sqrt = 0.0;   # accumulates distances, later normalized into the variance estimate
        
        # Validation and confidence probability parameters (see Beheshti's formula
        # in the splitting_type documentation).
        alpha = 0.9;
        betta = 0.9;
        
        for index_cluster in range(0, len(clusters), 1):
            Ni = len(clusters[index_cluster]);
            if (Ni == 0): 
                # An empty cluster makes the structure invalid - report the worst score.
                return float('inf');
            
            Wi = 0.0;
            for index_object in clusters[index_cluster]:
                # euclidean_distance_sqrt should be used in line with paper, but in this case results are
                # very poor, therefore square root is used to improved.
                Wi += euclidean_distance(self.__pointer_data[index_object], centers[index_cluster]);
            
            sigma_sqrt += Wi;
            W += Wi / Ni;
            N += Ni;
        
        if (N - K > 0):
            # MLE-style estimate of sigma^2 (built from plain distances, see the note above).
            sigma_sqrt /= (N - K);
            sigma = sigma_sqrt ** 0.5;
            
            Kw = (1.0 - K / N) * sigma_sqrt;
            Ks = ( 2.0 * alpha * sigma / (N ** 0.5) ) * ( (alpha ** 2.0) * sigma_sqrt / N + W - Kw / 2.0 ) ** 0.5;
            
            # NOTE(review): the last term uses alpha**0.5 while the documented upper-bound
            # formula has alpha^2; this may be part of the deliberate "contradiction"
            # mentioned in the splitting_type docs - confirm before changing.
            scores = sigma_sqrt * (2 * K)**0.5 * ((2 * K)**0.5 + betta) / N + W - sigma_sqrt + Ks + 2 * alpha**0.5 * sigma_sqrt / N
        
        return scores;
    def __bayesian_information_criterion(self, clusters, centers):
        """!
        @brief Calculates splitting criterion for input clusters using bayesian information criterion.
        
        @param[in] clusters (list): Clusters for which splitting criterion should be calculated.
        @param[in] centers (list): Centers of the clusters.
        
        @return (double) Splitting criterion in line with bayesian information criterion.
                High value of splitting criterion means that current structure is much better.
        
        @see __minimum_noiseless_description_length(clusters, centers)
        
        """
        
        scores = [float('inf')] * len(clusters)     # BIC score per cluster
        dimension = len(self.__pointer_data[0])
        
        # Estimation of the noise variance in the data set.
        sigma_sqrt = 0.0
        K = len(clusters)
        N = 0.0
        
        for index_cluster, cluster in enumerate(clusters):
            for index_object in cluster:
                sigma_sqrt += euclidean_distance_sqrt(self.__pointer_data[index_object], centers[index_cluster])
            
            N += len(cluster)
        
        if N - K > 0:
            sigma_sqrt /= (N - K)
            
            # Amount of free parameters: (K - 1) class probabilities, dimension * K
            # centroid coordinates and one variance estimate.
            p = (K - 1) + dimension * K + 1
            
            for index_cluster, cluster in enumerate(clusters):
                n = len(cluster)
                
                # Log-likelihood of the cluster (Kass's formula).
                L = n * log(n) - n * log(N) - n * 0.5 * log(2.0 * numpy.pi) - n * dimension * 0.5 * log(sigma_sqrt) - (n - K) * 0.5
                
                # BIC calculation.
                scores[index_cluster] = L - p * 0.5 * log(N)
        
        return sum(scores)
    def __update_clusters(self, centers, available_indexes = None):
        """!
        @brief Calculates Euclidean distance to each point from the each cluster.
               Nearest points are captured by according clusters and as a result clusters are updated.
        
        @param[in] centers (list): Coordinates of centers of clusters that are represented by list: [center1, center2, ...].
        @param[in] available_indexes (list): Indexes that defines which points can be used from input data, if None - then all points are used.
        
        @return (list) Updated clusters.
        
        """
        
        if available_indexes is None:
            bypass = range(len(self.__pointer_data))
        else:
            bypass = available_indexes
        
        clusters = [[] for _ in range(len(centers))]
        for index_point in bypass:
            index_optim = -1
            dist_optim = 0.0
            
            for index in range(len(centers)):
                # Squared distance is sufficient for the nearest-center comparison
                # and avoids the sqrt of a full Euclidean distance (fast solution).
                dist = euclidean_distance_sqrt(self.__pointer_data[index_point], centers[index])
                
                # BUGFIX: original used 'index is 0' - identity comparison of ints is
                # implementation-dependent and raises SyntaxWarning on modern Python.
                if (dist < dist_optim) or (index == 0):
                    index_optim = index
                    dist_optim = dist
            
            clusters[index_optim].append(index_point)
        
        return clusters
    def __update_centers(self, clusters):
        """!
        @brief Updates centers of clusters in line with contained objects.
        
        @param[in] clusters (list): Clusters that contain indexes of objects from data.
        
        @return (list) Updated centers.
        
        """
        
        dimension = len(self.__pointer_data[0])
        centers = [[] for _ in range(len(clusters))]
        
        for index, cluster in enumerate(clusters):
            # Each center is the arithmetic mean of the points that belong to the
            # cluster (callers are expected to pass non-empty clusters).
            point_sum = [0.0] * dimension
            
            for index_point in cluster:
                point_sum = list_math_addition(point_sum, self.__pointer_data[index_point])
            
            centers[index] = list_math_division_number(point_sum, len(cluster))
        
        return centers