xmeans - Code Metrics - Inspection of "[pyclustering.cluster] Clustering result encoder,..." - annoviko/pyclustering - Measure and Improve Code Quality continuously with Scrutinizer

Completed

Push — master ( 6ed467...f3f897 )

by Andrei

created 2016-12-23 11:53 UTC

xmeans B

↳ Parent: Project

Complexity

Total Complexity

Size/Duplication

Total Lines	388
Duplicated Lines	14.43 %

Importance

Changes

Metric	Value
dl	56
loc	388
rs	7.9487
c	0
b	0
f	0
wmc	52

12 Methods

Rating	Name	Duplication	Size	Complexity
A	get_cluster_encoding()	0	11	1
B	process()	0	28	6
A	get_clusters()	0	12	1
B	__minimum_noiseless_description_length()	0	57	6
B	__init__()	0	27	3
A	get_centers()	0	12	1
B	__bayesian_information_criterion()	23	39	6
C	__update_clusters()	31	34	7
B	__update_centers()	0	22	4
B	__improve_parameters()	0	28	5
C	__improve_structure()	0	50	8
B	__splitting_criterion()	0	22	4

How to fix Duplicated Code Complexity

"""!

@brief Cluster analysis algorithm: X-Means
@details Based on article description:
         - D.Pelleg, A.Moore. X-means: Extending K-means with Efficient Estimation of the Number of Clusters. 2000.

@authors Andrei Novikov ([email protected])
@date 2014-2016
@copyright GNU Public License

@cond GNU_PUBLIC_LICENSE
    PyClustering is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    
    PyClustering is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.
    
    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
@endcond

"""


import numpy;
# .scrutinizer.yml
before_commands:
    - sudo pip install abc # Python2
    - sudo pip3 install abc # Python3
import math;
import random;

from enum import IntEnum;

from pyclustering.cluster.encoder import type_encoding;

import pyclustering.core.wrapper as wrapper;

from pyclustering.utils import euclidean_distance, euclidean_distance_sqrt;
from pyclustering.utils import list_math_addition_number, list_math_addition, list_math_multiplication, list_math_division_number, list_math_subtraction;


class splitting_type(IntEnum):
    """!
    @brief Enumeration of splitting types that can be used as splitting creation of cluster in X-Means algorithm.
    
    """
    
    ## Bayesian information criterion to approximate the correct number of clusters.
    BAYESIAN_INFORMATION_CRITERION = 0;
    
    ## Minimum noiseless description length to approximate the correct number of clusters.
    MINIMUM_NOISELESS_DESCRIPTION_LENGTH = 1;


class xmeans:
    """!
    @brief Class represents clustering algorithm X-Means.
    
    Example:
    @code
        # sample for cluster analysis (represented by list)
        sample = read_sample(path_to_sample);
        
        # create object of X-Means algorithm that uses CCORE for processing
        # initial centers - optional parameter, if it is None, then random center will be used by the algorithm
        initial_centers = [ [0.0, 0.5] ];
        xmeans_instance = xmeans(sample, initial_centers, ccore = True);
        
        # run cluster analysis
        xmeans_instance.process();
        
        # obtain results of clustering
        clusters = xmeans_instance.get_clusters();
        
        # display allocated clusters
        draw_clusters(sample, clusters);
    @endcode
    
    """
    
    def __init__(self, data, initial_centers = None, kmax = 20, tolerance = 0.025, criterion = splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore = False):
        """!
        @brief Constructor of clustering algorithm X-Means.
        
        @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
        @param[in] initial_centers (list): Initial coordinates of centers of clusters that are represented by list: [center1, center2, ...], 
                    if it is not specified then X-Means starts from the random center.
        @param[in] kmax (uint): Maximum number of clusters that can be allocated.
        @param[in] tolerance (double): Stop condition for each iteration: if maximum value of change of centers of clusters is less than tolerance than algorithm will stop processing.
        @param[in] criterion (splitting_type): Type of splitting creation.
        @param[in] ccore (bool): Defines should be CCORE (C++ pyclustering library) used instead of Python code or not.
        
        """
           
        self.__pointer_data = data;
        self.__clusters = [];
        
        if (initial_centers is not None):
            self.__centers = initial_centers[:];
        else:
            self.__centers = [ [random.random() for _ in range(len(data[0])) ] ];
        
        self.__kmax = kmax;
        self.__tolerance = tolerance;
        self.__criterion = criterion;
         
        self.__ccore = ccore;
         
    def process(self):
        """!
        @brief Performs cluster analysis in line with rules of X-Means algorithm.
        
        @remark Results of clustering can be obtained using corresponding gets methods.
        
        @see get_clusters()
        @see get_centers()
        
        """
        
        if (self.__ccore is True):
            self.__clusters = wrapper.xmeans(self.__pointer_data, self.__centers, self.__kmax, self.__tolerance);
            self.__clusters = [ cluster for cluster in self.__clusters if len(cluster) > 0 ]; 
            
            self.__centers = self.__update_centers(self.__clusters);
        else:
            self.__clusters = [];
            while ( len(self.__centers) < self.__kmax ):
                current_cluster_number = len(self.__centers);
                 
                (self.__clusters, self.__centers) = self.__improve_parameters(self.__centers);
                allocated_centers = self.__improve_structure(self.__clusters, self.__centers);
                
                if ( (current_cluster_number == len(allocated_centers)) ):
                    break;
                else:
                    self.__centers = allocated_centers;
                    
     
    def get_clusters(self):
        """!
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
        
        @return (list) List of allocated clusters.
        
        @see process()
        @see get_centers()
        
        """
         
        return self.__clusters;
     
     
    def get_centers(self):
        """!
        @brief Returns list of centers for allocated clusters.
        
        @return (list) List of centers for allocated clusters.
        
        @see process()
        @see get_clusters()
        
        """
         
        return self.__centers;


    def get_cluster_encoding(self):
        """!
        @brief Returns clustering result representation type that indicate how clusters are encoded.
        
        @return (type_encoding) Clustering result representation.
        
        @see get_clusters()
        
        """
        
        return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;


    def __improve_parameters(self, centers, available_indexes = None):
        """!
        @brief Performs k-means clustering in the specified region.
        
        @param[in] centers (list): Centers of clusters.
        @param[in] available_indexes (list): Indexes that defines which points can be used for k-means clustering, if None - then all points are used.
        
        @return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
        
        """
        
        changes = numpy.Inf;
        
        stop_condition = self.__tolerance * self.__tolerance; # Fast solution
          
        clusters = [];
          
        while (changes > stop_condition):
            clusters = self.__update_clusters(centers, available_indexes);
            clusters = [ cluster for cluster in clusters if len(cluster) > 0 ]; 
            
            updated_centers = self.__update_centers(clusters);
          
            changes = max([euclidean_distance_sqrt(centers[index], updated_centers[index]) for index in range(len(updated_centers))]);    # Fast solution
              
            centers = updated_centers;
          
        return (clusters, centers);
     
     
    def __improve_structure(self, clusters, centers):
        """!
        @brief Check for best structure: divides each cluster into two and checks for best results using splitting criterion.
        
        @param[in] clusters (list): Clusters that have been allocated (each cluster contains indexes of points from data).
        @param[in] centers (list): Centers of clusters.
        
        @return (list) Allocated centers for clustering.
        
        """
         
        difference = 0.001;
          
        allocated_centers = [];
          
        for index_cluster in range(len(clusters)):
            # split cluster into two child clusters
            parent_child_centers = [];
            parent_child_centers.append(list_math_addition_number(centers[index_cluster], -difference));
            parent_child_centers.append(list_math_addition_number(centers[index_cluster], difference));
          
            # solve k-means problem for children where data of parent are used.
            (parent_child_clusters, parent_child_centers) = self.__improve_parameters(parent_child_centers, clusters[index_cluster]);
              
            # If it's possible to split current data
            if (len(parent_child_clusters) > 1):
                # Calculate splitting criterion
                parent_scores = self.__splitting_criterion([ clusters[index_cluster] ], [ centers[index_cluster] ]);
                child_scores = self.__splitting_criterion([ parent_child_clusters[0], parent_child_clusters[1] ], parent_child_centers);
              
                split_require = False;
                
                # Reallocate number of centers (clusters) in line with scores        
                if (self.__criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION):
                    if (parent_scores < child_scores): split_require = True;
                    
                elif (self.__criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH):
                    if (parent_scores > child_scores): split_require = True;
                    
                if (split_require is True):
                    allocated_centers.append(parent_child_centers[0]);
                    allocated_centers.append(parent_child_centers[1]);
                else:
                    allocated_centers.append(centers[index_cluster]);

                    
            else:
                allocated_centers.append(centers[index_cluster]);
          
        return allocated_centers;
     
     
    def __splitting_criterion(self, clusters, centers):
        """!
        @brief Calculates splitting criterion for input clusters.
        
        @param[in] clusters (list): Clusters for which splitting criterion should be calculated.
        @param[in] centers (list): Centers of the clusters.
        
        @return (double) Returns splitting criterion. High value of splitting cretion means that current structure is much better.
        
        @see __bayesian_information_criterion(clusters, centers)
        @see __minimum_noiseless_description_length(clusters, centers)
        
        """
        
        if (self.__criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION):
            return self.__bayesian_information_criterion(clusters, centers);
        
        elif (self.__criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH):
            return self.__minimum_noiseless_description_length(clusters, centers);
        
        else:
            assert 0;
 
    
    def __minimum_noiseless_description_length(self, clusters, centers):
        """!
        @brief Calculates splitting criterion for input clusters using minimum noiseless description length criterion.
        
        @param[in] clusters (list): Clusters for which splitting criterion should be calculated.
        @param[in] centers (list): Centers of the clusters.
        
        @return (double) Returns splitting criterion in line with bayesian information criterion. 
                Low value of splitting cretion means that current structure is much better.
        
        @see __bayesian_information_criterion(clusters, centers)
        
        """
                
        scores = [0.0] * len(clusters);
        
        W = 0.0;
        K = len(clusters);
        N = 0.0;

        sigma_sqrt = 0.0;
        
        alpha = 0.9;
        betta = 0.9;
                
        for index_cluster in range(0, len(clusters), 1):
            for index_object in clusters[index_cluster]:
                delta_vector = list_math_subtraction(self.__pointer_data[index_object], centers[index_cluster]);
                delta_sqrt = sum(list_math_multiplication(delta_vector, delta_vector));
                
                W += delta_sqrt;
                sigma_sqrt += delta_sqrt;
            
            N += len(clusters[index_cluster]);     
        
        if (N - K != 0):
            W /= N;
            
            sigma_sqrt /= (N - K);
            sigma = sigma_sqrt ** 0.5;
            
            for index_cluster in range(0, len(clusters), 1):
                Kw = (1.0 - K / N) * sigma_sqrt;
                Ks = ( 2.0 * alpha * sigma / (N ** 0.5) ) + ( (alpha ** 2.0) * sigma_sqrt / N + W - Kw / 2.0 ) ** 0.5;
                U = W - Kw + 2.0 * (alpha ** 2.0) * sigma_sqrt / N + Ks;
                
                Z = K * sigma_sqrt / N + U + betta * ( (2.0 * K) ** 0.5 ) * sigma_sqrt / N;
                
                if (Z == 0.0):
                    scores[index_cluster] = float("inf");
                else:
                    scores[index_cluster] = Z;
                
        else:
            scores = [float("inf")] * len(clusters);
        
        return sum(scores);
 
    def __bayesian_information_criterion(self, clusters, centers):
        """!
        @brief Calculates splitting criterion for input clusters using bayesian information criterion.
        
        @param[in] clusters (list): Clusters for which splitting criterion should be calculated.
        @param[in] centers (list): Centers of the clusters.
        
        @return (double) Splitting criterion in line with bayesian information criterion.
                High value of splitting cretion means that current structure is much better.
                
        @see __minimum_noiseless_description_length(clusters, centers)
        
        """

        scores = [0.0] * len(clusters)     # splitting criterion
        dimension = len(self.__pointer_data[0]);
          

        # estimation of the noise variance in the data set
        sigma = 0.0;
        K = len(clusters);
        N = 0.0;
          
        for index_cluster in range(0, len(clusters), 1):
            for index_object in clusters[index_cluster]:
                sigma += (euclidean_distance(self.__pointer_data[index_object], centers[index_cluster]));  # It works

            N += len(clusters[index_cluster]);
      
        if (N - K != 0):
            sigma /= (N - K);
        
            # splitting criterion    
            for index_cluster in range(0, len(clusters), 1):
                n = len(clusters[index_cluster]);
                
                if (sigma > 0.0):
                    scores[index_cluster] = n * math.log(n) - n * math.log(N) - n * math.log(2.0 * numpy.pi) / 2.0 - n * dimension * math.log(sigma) / 2.0 - (n - K) / 2.0;
                  
        return sum(scores);
 
 
    def __update_clusters(self, centers, available_indexes = None):
        """!
        @brief Calculates Euclidean distance to each point from the each cluster.
               Nearest points are captured by according clusters and as a result clusters are updated.
               
        @param[in] centers (list): Coordinates of centers of clusters that are represented by list: [center1, center2, ...].
        @param[in] available_indexes (list): Indexes that defines which points can be used from imput data, if None - then all points are used.
        
        @return (list) Updated clusters.
        
        """
            

        bypass = None;
        if (available_indexes is None):
            bypass = range(len(self.__pointer_data));
        else:
            bypass = available_indexes;
          
        clusters = [[] for i in range(len(centers))];
        for index_point in bypass:
            index_optim = -1;
            dist_optim = 0.0;
              
            for index in range(len(centers)):
                # dist = euclidean_distance(data[index_point], centers[index]);         # Slow solution
                dist = euclidean_distance_sqrt(self.__pointer_data[index_point], centers[index]);      # Fast solution
                  
                if ( (dist < dist_optim) or (index is 0)):
                    index_optim = index;
                    dist_optim = dist;
              
            clusters[index_optim].append(index_point);
              
        return clusters;
             
     
    def __update_centers(self, clusters):
        """!
        @brief Updates centers of clusters in line with contained objects.
        
        @param[in] clusters (list): Clusters that contain indexes of objects from data.
        
        @return (list) Updated centers.
        
        """
         
        centers = [[] for i in range(len(clusters))];
        dimension = len(self.__pointer_data[0])
          
        for index in range(len(clusters)):
            point_sum = [0.0] * dimension;
              
            for index_point in clusters[index]:
                point_sum = list_math_addition(point_sum, self.__pointer_data[index_point]);
            
            centers[index] = list_math_division_number(point_sum, len(clusters[index]));
              
        return centers;


Push — master ( 6ed467...f3f897 )

xmeans B

Complexity

Size/Duplication

Importance

12 Methods

How to fix Duplicated Code Complexity

Duplicated Code

Complex Class

1. Missing Dependencies

2. Missing init.py files

1		"""!
2
3		@brief Cluster analysis algorithm: X-Means
4		@details Based on article description:
5		- D.Pelleg, A.Moore. X-means: Extending K-means with Efficient Estimation of the Number of Clusters. 2000.
6
7		@authors Andrei Novikov ([email protected])
8		@date 2014-2016
9		@copyright GNU Public License
10
11		@cond GNU_PUBLIC_LICENSE
12		PyClustering is free software: you can redistribute it and/or modify
13		it under the terms of the GNU General Public License as published by
14		the Free Software Foundation, either version 3 of the License, or
15		(at your option) any later version.
16
17		PyClustering is distributed in the hope that it will be useful,
18		but WITHOUT ANY WARRANTY; without even the implied warranty of
19		MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20		GNU General Public License for more details.
21
22		You should have received a copy of the GNU General Public License
23		along with this program. If not, see <http://www.gnu.org/licenses/>.
24		@endcond
25
26		"""
27
28
29		import numpy;
		0 ignored issues – show Configuration introduced 2016-03-31 09:26 UTC by Report Bug Copy Issue Report The import `numpy` could not be resolved. This can be caused by one of the following: 1. Missing Dependencies This error could indicate a configuration issue of Pylint. Make sure that your libraries are available by adding the necessary commands. # .scrutinizer.yml before_commands: - sudo pip install abc # Python2 - sudo pip3 install abc # Python3 Tip: We are currently not using virtualenv to run pylint, when installing your modules make sure to use the command for the correct version. 2. Missing __init__.py files This error could also result from missing `__init__.py` files in your module folders. Make sure that you place one file in each sub-folder. Loading history...
30		import math;
31		import random;
32
33		from enum import IntEnum;
34
35		from pyclustering.cluster.encoder import type_encoding;
36
37		import pyclustering.core.wrapper as wrapper;
38
39		from pyclustering.utils import euclidean_distance, euclidean_distance_sqrt;
40		from pyclustering.utils import list_math_addition_number, list_math_addition, list_math_multiplication, list_math_division_number, list_math_subtraction;
41
42
43		class splitting_type(IntEnum):
44		"""!
45		@brief Enumeration of splitting types that can be used as splitting creation of cluster in X-Means algorithm.
46
47		"""
48
49		## Bayesian information criterion to approximate the correct number of clusters.
50		BAYESIAN_INFORMATION_CRITERION = 0;
51
52		## Minimum noiseless description length to approximate the correct number of clusters.
53		MINIMUM_NOISELESS_DESCRIPTION_LENGTH = 1;
54
55
56		class xmeans:
57		"""!
58		@brief Class represents clustering algorithm X-Means.
59
60		Example:
61		@code
62		# sample for cluster analysis (represented by list)
63		sample = read_sample(path_to_sample);
64
65		# create object of X-Means algorithm that uses CCORE for processing
66		# initial centers - optional parameter, if it is None, then random center will be used by the algorithm
67		initial_centers = [ [0.0, 0.5] ];
68		xmeans_instance = xmeans(sample, initial_centers, ccore = True);
69
70		# run cluster analysis
71		xmeans_instance.process();
72
73		# obtain results of clustering
74		clusters = xmeans_instance.get_clusters();
75
76		# display allocated clusters
77		draw_clusters(sample, clusters);
78		@endcode
79
80		"""
81
82		def __init__(self, data, initial_centers = None, kmax = 20, tolerance = 0.025, criterion = splitting_type.BAYESIAN_INFORMATION_CRITERION, ccore = False):
83		"""!
84		@brief Constructor of clustering algorithm X-Means.
85
86		@param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
87		@param[in] initial_centers (list): Initial coordinates of centers of clusters that are represented by list: [center1, center2, ...],
88		if it is not specified then X-Means starts from the random center.
89		@param[in] kmax (uint): Maximum number of clusters that can be allocated.
90		@param[in] tolerance (double): Stop condition for each iteration: if maximum value of change of centers of clusters is less than tolerance than algorithm will stop processing.
91		@param[in] criterion (splitting_type): Type of splitting creation.
92		@param[in] ccore (bool): Defines should be CCORE (C++ pyclustering library) used instead of Python code or not.
93
94		"""
95
96		self.__pointer_data = data;
97		self.__clusters = [];
98
99		if (initial_centers is not None):
100		self.__centers = initial_centers[:];
101		else:
102		self.__centers = [ [random.random() for _ in range(len(data[0])) ] ];
103
104		self.__kmax = kmax;
105		self.__tolerance = tolerance;
106		self.__criterion = criterion;
107
108		self.__ccore = ccore;
109
110		def process(self):
111		"""!
112		@brief Performs cluster analysis in line with rules of X-Means algorithm.
113
114		@remark Results of clustering can be obtained using corresponding gets methods.
115
116		@see get_clusters()
117		@see get_centers()
118
119		"""
120
121		if (self.__ccore is True):
122		self.__clusters = wrapper.xmeans(self.__pointer_data, self.__centers, self.__kmax, self.__tolerance);
123		self.__clusters = [ cluster for cluster in self.__clusters if len(cluster) > 0 ];
124
125		self.__centers = self.__update_centers(self.__clusters);
126		else:
127		self.__clusters = [];
128		while ( len(self.__centers) < self.__kmax ):
129		current_cluster_number = len(self.__centers);
130
131		(self.__clusters, self.__centers) = self.__improve_parameters(self.__centers);
132		allocated_centers = self.__improve_structure(self.__clusters, self.__centers);
133
134		if ( (current_cluster_number == len(allocated_centers)) ):
135		break;
136		else:
137		self.__centers = allocated_centers;
138
139
140		def get_clusters(self):
141		"""!
142		@brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.
143
144		@return (list) List of allocated clusters.
145
146		@see process()
147		@see get_centers()
148
149		"""
150
151		return self.__clusters;
152
153
154		def get_centers(self):
155		"""!
156		@brief Returns list of centers for allocated clusters.
157
158		@return (list) List of centers for allocated clusters.
159
160		@see process()
161		@see get_clusters()
162
163		"""
164
165		return self.__centers;
166
167
168		def get_cluster_encoding(self):
169		"""!
170		@brief Returns clustering result representation type that indicate how clusters are encoded.
171
172		@return (type_encoding) Clustering result representation.
173
174		@see get_clusters()
175
176		"""
177
178		return type_encoding.CLUSTER_INDEX_LIST_SEPARATION;
179
180
181		def __improve_parameters(self, centers, available_indexes = None):
182		"""!
183		@brief Performs k-means clustering in the specified region.
184
185		@param[in] centers (list): Centers of clusters.
186		@param[in] available_indexes (list): Indexes that defines which points can be used for k-means clustering, if None - then all points are used.
187
188		@return (list) List of allocated clusters, each cluster contains indexes of objects in list of data.
189
190		"""
191
192		changes = numpy.Inf;
193
194		stop_condition = self.__tolerance * self.__tolerance; # Fast solution
195
196		clusters = [];
197
198		while (changes > stop_condition):
199		clusters = self.__update_clusters(centers, available_indexes);
200		clusters = [ cluster for cluster in clusters if len(cluster) > 0 ];
201
202		updated_centers = self.__update_centers(clusters);
203
204		changes = max([euclidean_distance_sqrt(centers[index], updated_centers[index]) for index in range(len(updated_centers))]); # Fast solution
205
206		centers = updated_centers;
207
208		return (clusters, centers);
209
210
211		def __improve_structure(self, clusters, centers):
212		"""!
213		@brief Check for best structure: divides each cluster into two and checks for best results using splitting criterion.
214
215		@param[in] clusters (list): Clusters that have been allocated (each cluster contains indexes of points from data).
216		@param[in] centers (list): Centers of clusters.
217
218		@return (list) Allocated centers for clustering.
219
220		"""
221
222		difference = 0.001;
223
224		allocated_centers = [];
225
226		for index_cluster in range(len(clusters)):
227		# split cluster into two child clusters
228		parent_child_centers = [];
229		parent_child_centers.append(list_math_addition_number(centers[index_cluster], -difference));
230		parent_child_centers.append(list_math_addition_number(centers[index_cluster], difference));
231
232		# solve k-means problem for children where data of parent are used.
233		(parent_child_clusters, parent_child_centers) = self.__improve_parameters(parent_child_centers, clusters[index_cluster]);
234
235		# If it's possible to split current data
236		if (len(parent_child_clusters) > 1):
237		# Calculate splitting criterion
238		parent_scores = self.__splitting_criterion([ clusters[index_cluster] ], [ centers[index_cluster] ]);
239		child_scores = self.__splitting_criterion([ parent_child_clusters[0], parent_child_clusters[1] ], parent_child_centers);
240
241		split_require = False;
242
243		# Reallocate number of centers (clusters) in line with scores
244		if (self.__criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION):
245		if (parent_scores < child_scores): split_require = True;
246
247		elif (self.__criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH):
248		if (parent_scores > child_scores): split_require = True;
249
250		if (split_require is True):
251		allocated_centers.append(parent_child_centers[0]);
252		allocated_centers.append(parent_child_centers[1]);
253		else:
254		allocated_centers.append(centers[index_cluster]);
255
256
257		else:
258		allocated_centers.append(centers[index_cluster]);
259
260		return allocated_centers;
261
262
263		def __splitting_criterion(self, clusters, centers):
264		"""!
265		@brief Calculates splitting criterion for input clusters.
266
267		@param[in] clusters (list): Clusters for which splitting criterion should be calculated.
268		@param[in] centers (list): Centers of the clusters.
269
270		@return (double) Returns splitting criterion. High value of splitting cretion means that current structure is much better.
271
272		@see __bayesian_information_criterion(clusters, centers)
273		@see __minimum_noiseless_description_length(clusters, centers)
274
275		"""
276
277		if (self.__criterion == splitting_type.BAYESIAN_INFORMATION_CRITERION):
278		return self.__bayesian_information_criterion(clusters, centers);
279
280		elif (self.__criterion == splitting_type.MINIMUM_NOISELESS_DESCRIPTION_LENGTH):
281		return self.__minimum_noiseless_description_length(clusters, centers);
282
283		else:
284		assert 0;
285
286
287		def __minimum_noiseless_description_length(self, clusters, centers):
288		"""!
289		@brief Calculates splitting criterion for input clusters using minimum noiseless description length criterion.
290
291		@param[in] clusters (list): Clusters for which splitting criterion should be calculated.
292		@param[in] centers (list): Centers of the clusters.
293
294		@return (double) Returns splitting criterion in line with bayesian information criterion.
295		Low value of splitting cretion means that current structure is much better.
296
297		@see __bayesian_information_criterion(clusters, centers)
298
299		"""
300
301		scores = [0.0] * len(clusters);
302
303		W = 0.0;
304		K = len(clusters);
305		N = 0.0;
306
307		sigma_sqrt = 0.0;
308
309		alpha = 0.9;
310		betta = 0.9;
311
312		for index_cluster in range(0, len(clusters), 1):
313		for index_object in clusters[index_cluster]:
314		delta_vector = list_math_subtraction(self.__pointer_data[index_object], centers[index_cluster]);
315		delta_sqrt = sum(list_math_multiplication(delta_vector, delta_vector));
316
317		W += delta_sqrt;
318		sigma_sqrt += delta_sqrt;
319
320		N += len(clusters[index_cluster]);
321
322		if (N - K != 0):
323		W /= N;
324
325		sigma_sqrt /= (N - K);
326		sigma = sigma_sqrt ** 0.5;
327
328		for index_cluster in range(0, len(clusters), 1):
329		Kw = (1.0 - K / N) * sigma_sqrt;
330		Ks = ( 2.0 * alpha * sigma / (N 0.5) ) + ( (alpha 2.0) * sigma_sqrt / N + W - Kw / 2.0 ) ** 0.5;
331		U = W - Kw + 2.0 * (alpha ** 2.0) * sigma_sqrt / N + Ks;
332
333		Z = K * sigma_sqrt / N + U + betta * ( (2.0 * K) ** 0.5 ) * sigma_sqrt / N;
334
335		if (Z == 0.0):
336		scores[index_cluster] = float("inf");
337		else:
338		scores[index_cluster] = Z;
339
340		else:
341		scores = [float("inf")] * len(clusters);
342
343		return sum(scores);
344
345		def __bayesian_information_criterion(self, clusters, centers):
346		"""!
347		@brief Calculates splitting criterion for input clusters using bayesian information criterion.
348
349		@param[in] clusters (list): Clusters for which splitting criterion should be calculated.
350		@param[in] centers (list): Centers of the clusters.
351
352		@return (double) Splitting criterion in line with bayesian information criterion.
353		High value of splitting cretion means that current structure is much better.
354
355		@see __minimum_noiseless_description_length(clusters, centers)
356
357		"""
358
359		scores = [0.0] * len(clusters) # splitting criterion
360		dimension = len(self.__pointer_data[0]);
361	View Code Duplication
		0 ignored issues – show Duplication introduced 2016-04-28 20:11 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
362		# estimation of the noise variance in the data set
363		sigma = 0.0;
364		K = len(clusters);
365		N = 0.0;
366
367		for index_cluster in range(0, len(clusters), 1):
368		for index_object in clusters[index_cluster]:
369		sigma += (euclidean_distance(self.__pointer_data[index_object], centers[index_cluster])); # It works
370
371		N += len(clusters[index_cluster]);
372
373		if (N - K != 0):
374		sigma /= (N - K);
375
376		# splitting criterion
377		for index_cluster in range(0, len(clusters), 1):
378		n = len(clusters[index_cluster]);
379
380		if (sigma > 0.0):
381		scores[index_cluster] = n * math.log(n) - n * math.log(N) - n * math.log(2.0 * numpy.pi) / 2.0 - n * dimension * math.log(sigma) / 2.0 - (n - K) / 2.0;
382
383		return sum(scores);
384
385
386		def __update_clusters(self, centers, available_indexes = None):
387		"""!
388		@brief Calculates Euclidean distance to each point from the each cluster.
389		Nearest points are captured by according clusters and as a result clusters are updated.
390
391		@param[in] centers (list): Coordinates of centers of clusters that are represented by list: [center1, center2, ...].
392		@param[in] available_indexes (list): Indexes that defines which points can be used from imput data, if None - then all points are used.
393
394		@return (list) Updated clusters.
395
396		"""
397	View Code Duplication
		0 ignored issues – show Duplication introduced 2016-04-28 20:11 UTC by Report Bug Copy Issue Report This code seems to be duplicated in your project. Loading history...
398		bypass = None;
399		if (available_indexes is None):
400		bypass = range(len(self.__pointer_data));
401		else:
402		bypass = available_indexes;
403
404		clusters = [[] for i in range(len(centers))];
405		for index_point in bypass:
406		index_optim = -1;
407		dist_optim = 0.0;
408
409		for index in range(len(centers)):
410		# dist = euclidean_distance(data[index_point], centers[index]); # Slow solution
411		dist = euclidean_distance_sqrt(self.__pointer_data[index_point], centers[index]); # Fast solution
412
413		if ( (dist < dist_optim) or (index is 0)):
414		index_optim = index;
415		dist_optim = dist;
416
417		clusters[index_optim].append(index_point);
418
419		return clusters;
420
421
422		def __update_centers(self, clusters):
423		"""!
424		@brief Updates centers of clusters in line with contained objects.
425
426		@param[in] clusters (list): Clusters that contain indexes of objects from data.
427
428		@return (list) Updated centers.
429
430		"""
431
432		centers = [[] for i in range(len(clusters))];
433		dimension = len(self.__pointer_data[0])
434
435		for index in range(len(clusters)):
436		point_sum = [0.0] * dimension;
437
438		for index_point in clusters[index]:
439		point_sum = list_math_addition(point_sum, self.__pointer_data[index_point]);
440
441		centers[index] = list_math_division_number(point_sum, len(clusters[index]));
442
443		return centers;
444

annoviko / pyclustering

Push — master ( 6ed467...f3f897 )

xmeans B

Complexity

Size/Duplication

Importance

12 Methods

How to fix Duplicated Code Complexity

Duplicated Code

Complex Class

1. Missing Dependencies

2. Missing __init__.py files

Duplication Side-by-Side

Filter issues like

2. Missing init.py files