| Metric | Value |
|---|---|
| Total Complexity | 58 |
| Total Lines | 383 |
| Duplicated Lines | 0 % |
Complex classes like `cure` often do a lot of different things. To break such a class down, we need to identify a cohesive component within that class. A common approach to finding such a component is to look for fields/methods that share the same prefixes or suffixes.

Once you have determined the fields that belong together, you can apply the Extract Class refactoring. If the component makes sense as a sub-class, Extract Subclass is also a candidate, and is often faster.
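In `cure`, the queue and k-d tree bookkeeping is exactly such a component: `__create_queue`, `__create_kdtree`, `__insert_cluster`, `__relocate_cluster`, `__closest_cluster`, `__insert_represented_points` and `__delete_represented_points` all center on `self.__queue` and `self.__tree`. A minimal sketch of what an extracted component could look like follows; the `cure_cluster_repository` name and its interface are hypothetical, not part of pyclustering, and the k-d tree side is omitted:

```python
class cure_cluster_repository:
    """Hypothetical helper extracted from 'cure': owns the queue of clusters
    kept sorted by the distance to their closest neighbor."""

    def __init__(self):
        self.__queue = []

    def insert(self, cluster):
        """Insert a cluster, keeping the queue ordered by its 'distance' attribute."""
        for index in range(len(self.__queue)):
            if cluster.distance < self.__queue[index].distance:
                self.__queue.insert(index, cluster)
                return
        self.__queue.append(cluster)

    def relocate(self, cluster):
        """Re-insert a cluster after its nearest-neighbor distance has changed."""
        self.__queue.remove(cluster)
        self.insert(cluster)

    def closest_pair(self):
        """Return the cluster with the nearest neighbor and that neighbor."""
        head = self.__queue[0]
        return head, head.closest

    def __len__(self):
        return len(self.__queue)
```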
| 1 | """! |
||
```python
class cure:
    """!
    @brief Class represents clustering algorithm CURE.

    Example:
    @code
        # read data for clustering from some file
        sample = read_sample(path_to_data);

        # create instance of cure algorithm for cluster analysis
        # request for allocation of two clusters.
        cure_instance = cure(sample, 2, 5, 0.5, True);

        # run cluster analysis
        cure_instance.process();

        # get results of clustering
        clusters = cure_instance.get_clusters();
    @endcode

    """
```
```python
    def __init__(self, data, number_cluster, number_represent_points = 5, compression = 0.5, ccore = False):
        """!
        @brief Constructor of clustering algorithm CURE.

        @param[in] data (list): Input data that is presented as list of points (objects), each point should be represented by list or tuple.
        @param[in] number_cluster (uint): Number of clusters that should be allocated.
        @param[in] number_represent_points (uint): Number of representative points for each cluster.
        @param[in] compression (double): Coefficient that defines the level of shrinking of representation points toward the mean of the newly created cluster after each merge step. It is usually in the range from 0 to 1.
        @param[in] ccore (bool): If True then the CCORE DLL (C++ implementation) will be used for processing.

        """

        self.__pointer_data = data;

        self.__clusters = None;
        self.__representors = None;
        self.__means = None;

        self.__number_cluster = number_cluster;
        self.__number_represent_points = number_represent_points;
        self.__compression = compression;

        self.__ccore = ccore;

        if (ccore is False):
            self.__create_queue();      # queue
            self.__create_kdtree();     # create k-d tree
```
```python
    def process(self):
        """!
        @brief Performs cluster analysis in line with rules of CURE algorithm.

        @remark Results of clustering can be obtained using corresponding get methods.

        @see get_clusters()

        """

        if (self.__ccore is True):
            cure_data_pointer = wrapper.cure_algorithm(self.__pointer_data, self.__number_cluster, self.__number_represent_points, self.__compression);

            self.__clusters = wrapper.cure_get_clusters(cure_data_pointer);
            self.__representors = wrapper.cure_get_representors(cure_data_pointer);
            self.__means = wrapper.cure_get_means(cure_data_pointer);

            wrapper.cure_data_destroy(cure_data_pointer);

        else:
            while (len(self.__queue) > self.__number_cluster):
                cluster1 = self.__queue[0];     # cluster that has the nearest neighbor.
                cluster2 = cluster1.closest;    # closest cluster.

                #print("Merge decision: \n\t", cluster1, "\n\t", cluster2);

                self.__queue.remove(cluster1);
                self.__queue.remove(cluster2);

                self.__delete_represented_points(cluster1);
                self.__delete_represented_points(cluster2);

                merged_cluster = self.__merge_clusters(cluster1, cluster2);

                self.__insert_represented_points(merged_cluster);

                # Pointers to clusters that should be relocated are stored here.
                cluster_relocation_requests = [];

                # Check for the last cluster
                if (len(self.__queue) > 0):
                    merged_cluster.closest = self.__queue[0];   # arbitrary cluster from queue
                    merged_cluster.distance = self.__cluster_distance(merged_cluster, merged_cluster.closest);

                    for item in self.__queue:
                        distance = self.__cluster_distance(merged_cluster, item);
                        # Check whether the distance between the new cluster and the current one is better than the current best.
                        if (distance < merged_cluster.distance):
                            merged_cluster.closest = item;
                            merged_cluster.distance = distance;

                        # Check if the current cluster has a removed neighbor.
                        if ( (item.closest is cluster1) or (item.closest is cluster2) ):
                            # If the previous distance was less than the distance to the new cluster, then the nearest cluster should be found in the tree.
                            #print("Update: ", item);
                            if (item.distance < distance):
                                (item.closest, item.distance) = self.__closest_cluster(item, distance);

                                # TODO: investigation of the root cause is required.
                                # The cluster itself and the merged cluster should always be in the list of neighbors in line with the specified radius.
                                # But the merged cluster may be missing from that list due to calculation error, therefore it should be added manually.
                                if (item.closest is None):
                                    item.closest = merged_cluster;
                                    item.distance = distance;

                            # Otherwise the new cluster is the nearest.
                            else:
                                item.closest = merged_cluster;
                                item.distance = distance;

                            cluster_relocation_requests.append(item);
                        elif (item.distance > distance):
                            item.closest = merged_cluster;
                            item.distance = distance;

                            cluster_relocation_requests.append(item);

                # The new cluster and the updated clusters should be relocated in the queue.
                self.__insert_cluster(merged_cluster);
                for item in cluster_relocation_requests:
                    self.__relocate_cluster(item);

            # Change cluster representation
            self.__clusters = [ cure_cluster_unit.points for cure_cluster_unit in self.__queue ];
            self.__representors = [ cure_cluster_unit.rep for cure_cluster_unit in self.__queue ];
            self.__means = [ cure_cluster_unit.mean for cure_cluster_unit in self.__queue ];
```
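In the pure-Python branch above, the loop keeps merging the cluster at the head of the queue with its closest neighbor until only `number_cluster` clusters remain; results are then exposed through the getters that follow. A small usage sketch (the toy `sample` values are made up for illustration, and `cure` is assumed to be importable as in the class docstring example):

```python
# Toy two-blob data set; the values are illustrative only.
sample = [[0.0, 0.1], [0.2, 0.0], [0.1, 0.2],
          [5.0, 5.1], [5.2, 5.0], [5.1, 5.2]]

cure_instance = cure(sample, 2)     # ccore defaults to False: pure-Python path
cure_instance.process()

clusters = cure_instance.get_clusters()           # allocated clusters
representors = cure_instance.get_representors()   # shrunk representative points per cluster
means = cure_instance.get_means()                 # mean point of each cluster
```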
```python
    def get_clusters(self):
        """!
        @brief Returns list of allocated clusters, each cluster contains indexes of objects in list of data.

        @return (list) List of allocated clusters.

        @see process()
        @see get_representors()
        @see get_means()

        """

        return self.__clusters;


    def get_representors(self):
        """!
        @brief Returns list of representors of each cluster.
        @details Cluster index should be used for navigation between lists of representors.

        @return (list) List of representors of each cluster.

        @see get_clusters()
        @see get_means()

        """

        return self.__representors;


    def get_means(self):
        """!
        @brief Returns list of mean values of each cluster.
        @details Cluster index should be used for navigation between mean values.

        @return (list) List of mean values of each cluster.

        @see get_clusters()
        @see get_representors()

        """

        return self.__means;
```
```python
    def __insert_cluster(self, cluster):
        """!
        @brief Insert cluster to the list (sorted queue) in line with sequence order (distance).

        @param[in] cluster (cure_cluster): Cluster that should be inserted.

        """

        for index in range(len(self.__queue)):
            if (cluster.distance < self.__queue[index].distance):
                self.__queue.insert(index, cluster);
                return;

        self.__queue.append(cluster);


    def __relocate_cluster(self, cluster):
        """!
        @brief Relocate cluster in list in line with distance order.

        @param[in] cluster (cure_cluster): Cluster that should be relocated in line with order.

        """

        self.__queue.remove(cluster);
        self.__insert_cluster(cluster);
```
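Both helpers above keep `self.__queue` sorted by `distance` with a linear scan, so each insertion is O(n). If this bookkeeping were reworked, the standard-library `bisect` module could perform the ordered insertion with a binary search instead; the sketch below is an alternative, not how pyclustering implements it, and the `key=` argument of `bisect.insort` requires Python 3.10+:

```python
import bisect

def insert_cluster_sorted(queue, cluster):
    """Ordered insertion by the 'distance' attribute via binary search."""
    bisect.insort(queue, cluster, key=lambda c: c.distance)

def relocate_cluster_sorted(queue, cluster):
    """Re-insert a cluster whose distance has changed.
    Note: list.remove() is still a linear scan, so this is not asymptotically better."""
    queue.remove(cluster)
    insert_cluster_sorted(queue, cluster)
```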
```python
    def __closest_cluster(self, cluster, distance):
        """!
        @brief Find the closest cluster to the specified cluster in line with distance.

        @param[in] cluster (cure_cluster): Cluster for which the nearest cluster should be found.
        @param[in] distance (double): Closest distance to the previous cluster.

        @return (tuple) Pair (nearest CURE cluster, nearest distance); if no nearest cluster is found, the pair contains None and an infinite distance.

        """

        nearest_cluster = None;
        nearest_distance = float('inf');

        for point in cluster.rep:
            # Nearest nodes should be returned (at least it will return itself).
            nearest_nodes = self.__tree.find_nearest_dist_nodes(point, distance);
            for (candidate_distance, kdtree_node) in nearest_nodes:
                if ( (candidate_distance < nearest_distance) and (kdtree_node is not None) and (kdtree_node.payload is not cluster) ):
                    nearest_distance = candidate_distance;
                    nearest_cluster = kdtree_node.payload;

        return (nearest_cluster, nearest_distance);
```
```python
    def __insert_represented_points(self, cluster):
        """!
        @brief Insert representation points to the k-d tree.

        @param[in] cluster (cure_cluster): Cluster whose representation points should be inserted.

        """

        for point in cluster.rep:
            self.__tree.insert(point, cluster);


    def __delete_represented_points(self, cluster):
        """!
        @brief Remove representation points of clusters from the k-d tree.

        @param[in] cluster (cure_cluster): Cluster whose representation points should be removed.

        """

        for point in cluster.rep:
            self.__tree.remove(point);
```
```python
    def __merge_clusters(self, cluster1, cluster2):
        """!
        @brief Merges two clusters and returns the new merged cluster. Representation points and mean points are calculated for the new cluster.

        @param[in] cluster1 (cure_cluster): Cluster that should be merged.
        @param[in] cluster2 (cure_cluster): Cluster that should be merged.

        @return (cure_cluster) New merged CURE cluster.

        """

        merged_cluster = cure_cluster();

        merged_cluster.points = cluster1.points + cluster2.points;

        # merged_cluster.mean = ( len(cluster1.points) * cluster1.mean + len(cluster2.points) * cluster2.mean ) / ( len(cluster1.points) + len(cluster2.points) );
        dimension = len(cluster1.mean);
        merged_cluster.mean = [0] * dimension;
        if merged_cluster.points[1:] == merged_cluster.points[:-1]:
            merged_cluster.mean = merged_cluster.points[0]
        else:
            for index in range(dimension):
                merged_cluster.mean[index] = ( len(cluster1.points) * cluster1.mean[index] + len(cluster2.points) * cluster2.mean[index] ) / ( len(cluster1.points) + len(cluster2.points) );

        temporary = list();     # TODO: A set should be used in line with the specification (article), but a list is not a hashable object, therefore it cannot be stored in a set.

        for index in range(self.__number_represent_points):
            maximal_distance = 0;
            maximal_point = None;

            for point in merged_cluster.points:
                minimal_distance = 0;
                if (index == 0):
                    minimal_distance = euclidean_distance(point, merged_cluster.mean);
                    #minimal_distance = euclidean_distance_sqrt(point, merged_cluster.mean);
                else:
                    minimal_distance = euclidean_distance(point, temporary[0]);
                    #minimal_distance = cluster_distance(cure_cluster(point), cure_cluster(temporary[0]));

                if (minimal_distance >= maximal_distance):
                    maximal_distance = minimal_distance;
                    maximal_point = point;

            if (maximal_point not in temporary):
                temporary.append(maximal_point);

        for point in temporary:
            representative_point = [0] * dimension;
            for index in range(dimension):
                representative_point[index] = point[index] + self.__compression * (merged_cluster.mean[index] - point[index]);

            merged_cluster.rep.append(representative_point);

        return merged_cluster;
```
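Two details of `__merge_clusters` are worth spelling out. The `points[1:] == points[:-1]` check detects the degenerate case where every merged point is identical, so the mean is taken directly from the first point. The representative points are then picked by farthest-point selection and shrunk toward the merged mean by the `compression` factor, i.e. `rep[i] = point[i] + compression * (mean[i] - point[i])`. A tiny worked example of the shrinking step (the numbers are illustrative):

```python
compression = 0.5
mean = [2.0, 2.0]
point = [4.0, 0.0]

# Each coordinate moves a 'compression' fraction of the way toward the mean.
representative = [point[i] + compression * (mean[i] - point[i]) for i in range(len(point))]
print(representative)   # [3.0, 1.0] -> halfway between the point and the cluster mean
```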
```python
    def __create_queue(self):
        """!
        @brief Creates a queue of clusters sorted by the distance between them, where the first cluster has the nearest neighbor. At the first iteration each cluster contains only one point.

        @return (list) Queue of clusters sorted by the distance between them.

        """

        self.__queue = [cure_cluster(point) for point in self.__pointer_data];

        # set closest clusters
        for i in range(0, len(self.__queue)):
            minimal_distance = float('inf');
            closest_index_cluster = -1;

            for k in range(0, len(self.__queue)):
                if (i != k):
                    dist = self.__cluster_distance(self.__queue[i], self.__queue[k]);
                    if (dist < minimal_distance):
                        minimal_distance = dist;
                        closest_index_cluster = k;

            self.__queue[i].closest = self.__queue[closest_index_cluster];
            self.__queue[i].distance = minimal_distance;

        # sort clusters
        self.__queue.sort(key = lambda x: x.distance, reverse = False);
```
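The initialization above is quadratic: every singleton cluster is compared with every other one to find its nearest neighbor, and the queue is then sorted by that distance. The same brute-force step can be expressed compactly outside the class; the sketch below is illustrative only (it assumes plain Euclidean distance and is not the pyclustering implementation):

```python
import math

def initial_nearest_neighbors(points):
    """For each point return (index, index of nearest neighbor, distance),
    sorted by distance -- mirroring how the queue orders singleton clusters."""
    def dist(a, b):
        return math.sqrt(sum((ai - bi) ** 2 for ai, bi in zip(a, b)))

    pairs = []
    for i, p in enumerate(points):
        j, d = min(((k, dist(p, q)) for k, q in enumerate(points) if k != i),
                   key=lambda item: item[1])
        pairs.append((i, j, d))
    return sorted(pairs, key=lambda item: item[2])
```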
```python
    def __create_kdtree(self):
        """!
        @brief Create k-d tree in line with created clusters. At the first iteration it contains all points from the input data set.

        @return (kdtree) k-d tree that consists of representative points of CURE clusters.

        """

        self.__tree = kdtree();
        for current_cluster in self.__queue:
            for representative_point in current_cluster.rep:
                self.__tree.insert(representative_point, current_cluster);
```
```python
    def __cluster_distance(self, cluster1, cluster2):
        """!
        @brief Calculate minimal distance between clusters using representative points.

        @param[in] cluster1 (cure_cluster): The first cluster.
        @param[in] cluster2 (cure_cluster): The second cluster.

        @return (double) Euclidean distance between two clusters that is defined by the minimum distance between representation points of two clusters.

        """

        distance = float('inf');
        for i in range(0, len(cluster1.rep)):
            for k in range(0, len(cluster2.rep)):
                #dist = euclidean_distance_sqrt(cluster1.rep[i], cluster2.rep[k]);  # Fast mode
                dist = euclidean_distance(cluster1.rep[i], cluster2.rep[k]);        # Slow mode
                if (dist < distance):
                    distance = dist;

        return distance;
```
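In other words, the distance between two clusters is the minimum Euclidean distance over all pairs of their representative points. The same definition written as a standalone function for clarity (a sketch with the same semantics, not the pyclustering code):

```python
import math
from itertools import product

def representative_distance(rep1, rep2):
    """Minimum Euclidean distance over all pairs of representative points."""
    def dist(a, b):
        return math.sqrt(sum((ai - bi) ** 2 for ai, bi in zip(a, b)))
    return min(dist(p, q) for p, q in product(rep1, rep2))
```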