Completed — Push to master (c8bc69...89bc14) by Andrei, created 17:38

xmeans.__improve_parameters()   rating: C

Metric                   Value
Conditions (cc)          7
Total lines (loc)        33
Duplicated lines (dl)    0
Duplication ratio        0 %
Changes                  0
rs                       5.5
c                        0
b                        0
f                        0
"""!

@brief Cluster analysis algorithm: X-Means
@details Based on article description:
         - D.Pelleg, A.Moore. X-means: Extending K-means with Efficient Estimation of the Number of Clusters. 2000.

@authors Andrei Novikov ([email protected])
@date 2014-2018
@copyright GNU Public License

@cond GNU_PUBLIC_LICENSE
    PyClustering is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    
    PyClustering is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
@endcond

"""


import numpy;
import random;

from enum import IntEnum;

from math import log;

from pyclustering.cluster.encoder import type_encoding;
from pyclustering.cluster.kmeans import kmeans;
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer;

from pyclustering.core.wrapper import ccore_library;

import pyclustering.core.xmeans_wrapper as wrapper;

from pyclustering.utils import euclidean_distance_square, euclidean_distance;


class splitting_type(IntEnum):
    """!
    @brief Enumeration of splitting types that can be used for cluster splitting in the X-Means algorithm.
    
    """
    
    ## Bayesian information criterion (BIC) to approximate the correct number of clusters.
    ## Kass's formula is used to calculate BIC:
    ## \f[BIC(\theta) = L(D) - \frac{1}{2}p\ln(N)\f]
    ##
    ## The number of free parameters \f$p\f$ is simply the sum of \f$K - 1\f$ class probabilities, \f$MK\f$ centroid coordinates, and one variance estimate:
    ## \f[p = (K - 1) + MK + 1\f]
    ##
    ## The log-likelihood of the data:
    ## \f[L(D) = n_j\ln(n_j) - n_j\ln(N) - \frac{n_j}{2}\ln(2\pi) - \frac{n_jM}{2}\ln(\hat{\sigma}^2) - \frac{n_j - K}{2}\f]
    ##
    ## The maximum likelihood estimate (MLE) for the variance:
    ## \f[\hat{\sigma}^2 = \frac{1}{N - K}\sum\limits_{j}\sum\limits_{i}||x_{ij} - \hat{C}_j||^2\f]
    BAYESIAN_INFORMATION_CRITERION = 0;
    
    ## Minimum noiseless description length (MNDL) to approximate the correct number of clusters.
    ## Beheshti's formula is used to calculate the upper bound:
    ## \f[Z = \frac{\sigma^2 \sqrt{2K} }{N}(\sqrt{2K} + \beta) + W - \sigma^2 + \frac{2\alpha\sigma}{\sqrt{N}}\sqrt{\frac{\alpha^2\sigma^2}{N} + W - \left(1 - \frac{K}{N}\right)\frac{\sigma^2}{2}} + \frac{2\alpha^2\sigma^2}{N}\f]
    ##
    ## where \f$\alpha\f$ and \f$\beta\f$ represent the parameters for validation probability and confidence probability.
    ##
    ## To improve clustering results, the implementation deviates from the original paper and uses non-squared Euclidean distances:
    ## \f[W = \frac{1}{n_j}\sum\limits_{i}||x_{ij} - \hat{C}_j||\f]
    ## \f[\hat{\sigma}^2 = \frac{1}{N - K}\sum\limits_{j}\sum\limits_{i}||x_{ij} - \hat{C}_j||\f]
    MINIMUM_NOISELESS_DESCRIPTION_LENGTH = 1;
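
# A minimal sketch of selecting a splitting criterion (assuming 'sample' and 'initial_centers'
# are prepared as in the example of the xmeans class below):
#
#   bic_instance = xmeans(sample, initial_centers, criterion = splitting_type.BAYESIAN_INFORMATION_CRITERION);
#   mndl_instance = xmeans(sample, initial_centers, criterion = splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH);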


class xmeans:
    """!
    @brief Class represents the clustering algorithm X-Means.
    @details The X-Means clustering method starts with the assumption of a minimum number of clusters
             and then dynamically increases it. X-Means uses the specified splitting criterion to control
             the process of splitting clusters. The K-Means++ method can be used to calculate initial centers.
             
             The CCORE option can be used to engage the pyclustering core - a C/C++ shared library for processing that significantly increases performance.
             
             The CCORE implementation of the algorithm uses a thread pool to parallelize the clustering process.
    
    Example:
    @code
        # sample for cluster analysis (represented by list)
        sample = read_sample(path_to_sample);
        
        # create an instance of the X-Means algorithm that uses CCORE for processing;
        # initial centers are an optional parameter: if None, random centers are used by the algorithm;
        # let's avoid random initial centers and initialize them using the K-Means++ method:
        initial_centers = kmeans_plusplus_initializer(sample, 2).initialize();
        xmeans_instance = xmeans(sample, initial_centers, ccore = True);
        
        # run cluster analysis
        xmeans_instance.process();
        
        # obtain results of clustering
        clusters = xmeans_instance.get_clusters();
        
        # display allocated clusters
        draw_clusters(sample, clusters);
    @endcode
    
    @see center_initializer
    
    """
    
    def __init__(self, data, initial_centers = None, kmax = 20, tolerance = 0.025, criterion = splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore = True):
        """!
        @brief Constructor of the clustering algorithm X-Means.
        
        @param[in] data (list): Input data presented as a list of points (objects), where each point is represented by a list or tuple.
        @param[in] initial_centers (list): Initial coordinates of cluster centers represented by a list: [center1, center2, ...];
                    if not specified, X-Means starts from a single random center.
        @param[in] kmax (uint): Maximum number of clusters that can be allocated.
        @param[in] tolerance (double): Stop condition for each iteration: if the maximum change of cluster centers is less than the tolerance, then the algorithm stops processing.
        @param[in] criterion (splitting_type): Type of splitting criterion.
        @param[in] ccore (bool): Defines whether CCORE (the C++ pyclustering library) should be used instead of the Python code.
        
        """
        
        self.__pointer_data = data;
        self.__clusters = [];
        
        if (initial_centers is not None):
            self.__centers = initial_centers[:];
        else:
            # no centers provided - start from a single random center
            self.__centers = [ [random.random() for _ in range(len(data[0])) ] ];
        
        self.__kmax = kmax;
        self.__tolerance = tolerance;
        self.__criterion = criterion;
        
        self.__ccore = ccore;
        if (self.__ccore):
            self.__ccore = ccore_library.workable();


    def process(self):
        """!
        @brief Performs cluster analysis in line with the rules of the X-Means algorithm.
        
        @remark Results of clustering can be obtained using the corresponding getter methods.
        
        @see get_clusters()
        @see get_centers()
        
        """
        
        if (self.__ccore is True):
            self.__clusters, self.__centers = wrapper.xmeans(self.__pointer_data, self.__centers, self.__kmax, self.__tolerance, self.__criterion);

        else:
            self.__clusters = [];
            while ( len(self.__centers) <= self.__kmax ):
                current_cluster_number = len(self.__centers);
                
                # refine centers of the current clusters, then try to split each cluster in two
                self.__clusters, self.__centers = self.__improve_parameters(self.__centers);
                allocated_centers = self.__improve_structure(self.__clusters, self.__centers);
                
                # stop when no cluster was split during the last iteration
                if (current_cluster_number == len(allocated_centers)):
                #if ( (current_cluster_number == len(allocated_centers)) or (len(allocated_centers) > self.__kmax) ):
                    break;
                else:
                    self.__centers = allocated_centers;
            
            self.__clusters, self.__centers = self.__improve_parameters(self.__centers);


    def get_clusters(self):
        """!
        @brief Returns the list of allocated clusters, where each cluster contains indexes of objects in the input data.
        
        @return (list) List of allocated clusters.
        
        @see process()
        @see get_centers()
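
        Each cluster stores point indexes rather than the points themselves; a minimal sketch of
        mapping indexes back to points (assuming 'sample' is the data passed to the constructor):
        @code
            clusters = xmeans_instance.get_clusters();
            first_cluster_points = [ sample[index] for index in clusters[0] ];
        @endcode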
        
        """

        return self.__clusters;


    def get_centers(self):
        """!
        @brief Returns list of centers for allocated clusters.
        
        @return (list) List of centers for allocated clusters.
        
        @see process()
        @see get_clusters()
        
        """
        
        return self.__centers;


    def get_cluster_encoding(self):
        """!
        @brief Returns the clustering result representation type that indicates how clusters are encoded.
        
        @return (type_encoding) Clustering result representation.
        
        @see get_clusters()
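
        The encoding can be converted to other representations; a minimal sketch, assuming the
        cluster_encoder from pyclustering.cluster.encoder is used for the conversion:
        @code
            encoding = xmeans_instance.get_cluster_encoding();
            encoder = cluster_encoder(encoding, xmeans_instance.get_clusters(), sample);
            
            # convert the index-list representation to labeling
            encoder.set_encoding(type_encoding.CLUSTER_INDEX_LABELING);
            labels = encoder.get_clusters();
        @endcode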
        
        """
        
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;


    def __improve_parameters(self, centers, available_indexes = None):
        """!
        @brief Performs k-means clustering in the specified region.
        
        @param[in] centers (list): Centers of clusters.
        @param[in] available_indexes (list): Indexes that define which points can be used for k-means clustering; if None, then all points are used.
        
        @return (tuple) List of allocated clusters, where each cluster contains indexes of objects in the input data, and list of centers: (clusters, centers).
        
        """

        # a single point cannot be split further - it forms its own cluster and center
        if (available_indexes and len(available_indexes) == 1):
            index_center = available_indexes[0];
            return ([ available_indexes ], [ self.__pointer_data[index_center] ]);

        local_data = self.__pointer_data;
        if available_indexes:
            local_data = [ self.__pointer_data[i] for i in available_indexes ];

        local_centers = centers;
        if centers is None:
            local_centers = kmeans_plusplus_initializer(local_data, 2, kmeans_plusplus_initializer.FARTHEST_CENTER_CANDIDATE).initialize();

        kmeans_instance = kmeans(local_data, local_centers, tolerance=self.__tolerance, ccore=False);
        kmeans_instance.process();

        local_centers = kmeans_instance.get_centers();
        
        clusters = kmeans_instance.get_clusters();
        if (available_indexes):
            # clusters contain local indexes - map them back to indexes of the input data
            clusters = self.__local_to_global_clusters(clusters, available_indexes);
        
        return (clusters, local_centers);


    def __local_to_global_clusters(self, local_clusters, available_indexes):
        """!
        @brief Converts clusters in a local region defined by 'available_indexes' to global clusters.

        @param[in] local_clusters (list): Local clusters in the specific region.
        @param[in] available_indexes (list): Mapping between local and global point indexes.

        @return Global clusters.

        """

        clusters = [];
        for local_cluster in local_clusters:
            current_cluster = [];
            for index_point in local_cluster:
                current_cluster.append(available_indexes[index_point]);

            clusters.append(current_cluster);

        return clusters;

    
    def __improve_structure(self, clusters, centers):
        """!
        @brief Checks for the best structure: divides each cluster into two and compares the results using the splitting criterion.
        
        @param[in] clusters (list): Clusters that have been allocated (each cluster contains indexes of points from the data).
        @param[in] centers (list): Centers of clusters.
        
        @return (list) Allocated centers for clustering.
        
        """

        allocated_centers = [];
        amount_free_centers = self.__kmax - len(centers);

        for index_cluster in range(len(clusters)):
            # solve the k-means problem for two children using the data of the parent cluster
            (parent_child_clusters, parent_child_centers) = self.__improve_parameters(None, clusters[index_cluster]);
              
            # if it is possible to split the current data
            if (len(parent_child_clusters) > 1):
                # calculate the splitting criterion for the parent and for its two children
                parent_scores = self.__splitting_criterion([ clusters[index_cluster] ], [ centers[index_cluster] ]);
                child_scores = self.__splitting_criterion([ parent_child_clusters[0], parent_child_clusters[1] ], parent_child_centers);
              
                split_require = False;
                
                # reallocate the number of centers (clusters) in line with the scores
                if (self.__criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION):
                    # for BIC, a higher score means a better structure
                    if (parent_scores < child_scores): split_require = True;
                    
                elif (self.__criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH):
                    # if the score of the split structure with two children is smaller than that of the parent structure,
                    # then representing the data samples with two clusters is more accurate than with a single parent cluster
                    if (parent_scores > child_scores): split_require = True;
                
                if ( (split_require is True) and (amount_free_centers > 0) ):
                    allocated_centers.append(parent_child_centers[0]);
                    allocated_centers.append(parent_child_centers[1]);
                    
                    # splitting replaces one center with two, consuming one free center
                    amount_free_centers -= 1;
                else:
                    allocated_centers.append(centers[index_cluster]);

                    
            else:
                allocated_centers.append(centers[index_cluster]);
          
        return allocated_centers;
     
     
    def __splitting_criterion(self, clusters, centers):
        """!
        @brief Calculates the splitting criterion for input clusters.
        
        @param[in] clusters (list): Clusters for which the splitting criterion should be calculated.
        @param[in] centers (list): Centers of the clusters.
        
        @return (double) Returns the splitting criterion. Interpretation depends on the criterion:
                a higher value means a better structure for BIC, while a lower value is better for MNDL.
        
        @see __bayesian_information_criterion(clusters, centers)
        @see __minimum_noiseless_description_length(clusters, centers)
        
        """
        
        if (self.__criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION):
            return self.__bayesian_information_criterion(clusters, centers);
        
        elif (self.__criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH):
            return self.__minimum_noiseless_description_length(clusters, centers);
        
        else:
            assert 0, "Unknown type of splitting criterion is specified.";


    def __minimum_noiseless_description_length(self, clusters, centers):
        """!
        @brief Calculates the splitting criterion for input clusters using the minimum noiseless description length criterion.
        
        @param[in] clusters (list): Clusters for which the splitting criterion should be calculated.
        @param[in] centers (list): Centers of the clusters.
        
        @return (double) Returns the splitting criterion in line with the minimum noiseless description length criterion.
                A low value of the splitting criterion means that the current structure is better.
        
        @see __bayesian_information_criterion(clusters, centers)
        
        """
        
        scores = float('inf');
        
        W = 0.0;
        K = len(clusters);
        N = 0.0;

        sigma_sqrt = 0.0;
        
        # parameters for validation probability and confidence probability
        alpha = 0.9;
        beta = 0.9;
        
        for index_cluster in range(0, len(clusters), 1):
            Ni = len(clusters[index_cluster]);
            if (Ni == 0): 
                return float('inf');
            
            Wi = 0.0;
            for index_object in clusters[index_cluster]:
                # euclidean_distance_square should be used in line with the paper, but in this case results are
                # very poor, therefore the non-squared distance is used to improve them.
                Wi += euclidean_distance(self.__pointer_data[index_object], centers[index_cluster]);
            
            sigma_sqrt += Wi;
            W += Wi / Ni;
            N += Ni;
        
        if (N - K > 0):
            sigma_sqrt /= (N - K);
            sigma = sigma_sqrt ** 0.5;
            
            Kw = (1.0 - K / N) * sigma_sqrt;
            Ks = ( 2.0 * alpha * sigma / (N ** 0.5) ) * ( (alpha ** 2.0) * sigma_sqrt / N + W - Kw / 2.0 ) ** 0.5;
            
            scores = sigma_sqrt * (2 * K)**0.5 * ((2 * K)**0.5 + beta) / N + W - sigma_sqrt + Ks + 2 * alpha**0.5 * sigma_sqrt / N;
        
        return scores;


    def __bayesian_information_criterion(self, clusters, centers):
        """!
        @brief Calculates the splitting criterion for input clusters using the bayesian information criterion.
        
        @param[in] clusters (list): Clusters for which the splitting criterion should be calculated.
        @param[in] centers (list): Centers of the clusters.
        
        @return (double) Splitting criterion in line with the bayesian information criterion.
                A high value of the splitting criterion means that the current structure is better.
                
        @see __minimum_noiseless_description_length(clusters, centers)
        
        """

        scores = [float('inf')] * len(clusters);    # splitting criterion
        dimension = len(self.__pointer_data[0]);
          
        # estimation of the noise variance in the data set
        sigma_sqrt = 0.0;
        K = len(clusters);
        N = 0.0;
          
        for index_cluster in range(0, len(clusters), 1):
            for index_object in clusters[index_cluster]:
                sigma_sqrt += euclidean_distance_square(self.__pointer_data[index_object], centers[index_cluster]);

            N += len(clusters[index_cluster]);
      
        if (N - K > 0):
            sigma_sqrt /= (N - K);
            
            # number of free parameters: (K - 1) class probabilities, dimension * K centroid coordinates and one variance estimate
            p = (K - 1) + dimension * K + 1;

            # in case of the same points, sigma_sqrt can be zero (issue: #407)
            sigma_multiplier = 0.0;
            if (sigma_sqrt <= 0.0):
                sigma_multiplier = float('-inf');
            else:
                sigma_multiplier = dimension * 0.5 * log(sigma_sqrt);
            
            # splitting criterion    
            for index_cluster in range(0, len(clusters), 1):
                n = len(clusters[index_cluster]);

                # log-likelihood of the cluster in line with Kass's formula
                L = n * log(n) - n * log(N) - n * 0.5 * log(2.0 * numpy.pi) - n * sigma_multiplier - (n - K) * 0.5;
                
                # BIC calculation
                scores[index_cluster] = L - p * 0.5 * log(N);
                
        return sum(scores);
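

# A minimal end-to-end sketch of using this module (the two-blob sample below is synthetic and
# chosen only for illustration; ccore is disabled so the pure Python path above is exercised):
#
#   from pyclustering.cluster.xmeans import xmeans, splitting_type
#   from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
#
#   sample = [[0.1, 0.2], [0.2, 0.1], [0.15, 0.15], [5.0, 5.1], [5.2, 4.9], [5.1, 5.0]];
#   initial_centers = kmeans_plusplus_initializer(sample, 2).initialize();
#   xmeans_instance = xmeans(sample, initial_centers, kmax = 10, ccore = False);
#   xmeans_instance.process();
#
#   print(xmeans_instance.get_clusters());   # expected: two clusters, e.g. [[0, 1, 2], [3, 4, 5]]
#   print(xmeans_instance.get_centers());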